xlin-0.1.18-py2.py3-none-any.whl → xlin-0.1.20-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xlin/jsonl.py CHANGED
@@ -221,3 +221,39 @@ def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dic
        jsonlist: List[Dict[str, Any]] = load_data(path)
        for row in jsonlist:
            yield path, row
+
+
+ def append_to_json_list(data: list[dict], file_path: str):
+     """Append a list of dictionaries to a JSON file."""
+     with open(file_path, "a") as f:
+         for item in data:
+             f.write(json.dumps(item) + "\n")
+
+
+ def row_to_json(row: dict) -> dict:
+     """Convert a row to a JSON object."""
+     new_row = {}
+     for k, v in row.items():
+         if isinstance(v, dict):
+             new_row[k] = row_to_json(v)
+         elif isinstance(v, list):
+             new_row[k] = [row_to_json(item) for item in v]
+         elif isinstance(v, pd.DataFrame):
+             new_row[k] = [row_to_json(item) for item in v.to_dict(orient="records")]
+         else:
+             new_row[k] = v
+
+     return new_row
+
+
+ def generator_from_json(path):
+     jsonlist = load_json(path)
+     for line in jsonlist:
+         yield line
+
+
+ def generator_from_jsonl(path):
+     jsonlist = load_json_list(path)
+     for line in jsonlist:
+         yield line
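
The four helpers above are additions in 0.1.20. A minimal usage sketch (the file name and sample rows are made up for illustration; the imports assume the functions are exported from xlin.jsonl, as the import change in multiprocess_mapping.py below suggests):

    from xlin.jsonl import append_to_json_list, row_to_json, generator_from_jsonl

    rows = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]

    # append_to_json_list opens the file in append mode and writes one JSON object per line
    append_to_json_list(rows, "output.jsonl")

    # row_to_json recursively converts nested dicts, lists, and DataFrames into plain dicts
    clean = row_to_json({"id": 3, "meta": {"source": "demo"}})

    # generator_from_jsonl yields rows one at a time instead of returning the whole list
    for row in generator_from_jsonl("output.jsonl"):
        print(row)
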
xlin/multiprocess_mapping.py CHANGED
@@ -9,7 +9,8 @@ from pathlib import Path
  from tqdm import tqdm
  from loguru import logger

- from xlin.jsonl import load_json_list, save_json_list, load_json, save_json
+ from xlin.jsonl import append_to_json_list, dataframe_to_json_list, load_json_list, row_to_json, save_json_list, load_json, save_json
+ from xlin.read_as_dataframe import read_as_dataframe
  from xlin.util import ls

@@ -212,3 +213,145 @@ def continue_run(
        save_func(new_jsonlist, output_filepath)
        new_jsonfiles.append(output_filepath)
    return new_jsonfiles
+
+
+ def dataframe_mapping(
+     df: pd.DataFrame,
+     row_func: Callable[[dict], dict],
+     output_path: Optional[Union[str, Path]] = None,
+     force_overwrite: bool = False,
+     batch_size=multiprocessing.cpu_count(),
+     cache_batch_num=1,
+     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+ ):
+     """Map each row of a DataFrame to a new row with row_func.
+
+     Args:
+         df (DataFrame): [description]
+         row_func (function): (Dict[str, str]) -> Dict[str, str]
+         output_path (Path): cache path for large datasets; None means intermediate results are not cached
+         force_overwrite (bool): whether to force-overwrite output_path
+         batch_size (int): batch size
+         cache_batch_num (int): cache batch num
+         thread_pool_size (int): thread pool size
+     """
+     need_caching = output_path is not None
+     tmp_list, output_list = list(), list()
+     start_idx = 0
+     if need_caching:
+         output_path = Path(output_path)
+         if output_path.exists() and not force_overwrite:
+             existed_df = read_as_dataframe(output_path)
+             start_idx = len(existed_df)
+             output_list = dataframe_to_json_list(existed_df)
+             logger.warning(f"Cache found that {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
+         else:
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+     pool = ThreadPool(thread_pool_size)
+     logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
+     start_time = time.time()
+     last_save_time = start_time
+     with tqdm(total=len(df), desc="Processing", unit="rows") as pbar:
+         for i, line in df.iterrows():
+             pbar.update(1)
+             if i < start_idx:
+                 continue
+             line_info: dict = line.to_dict()
+             tmp_list.append(line_info)
+             if len(tmp_list) == batch_size:
+                 results = pool.map(row_func, tmp_list)
+                 output_list.extend([row_to_json(x) for x in results])
+                 tmp_list = list()
+                 if need_caching and (i // batch_size) % cache_batch_num == 0:
+                     current_time = time.time()
+                     if current_time - last_save_time < 3:
+                         # If batches finish in under 3 seconds, skip this flush so IO does not become
+                         # the bottleneck; these intermediate results are not cached.
+                         last_save_time = current_time
+                         continue
+                     rows_to_cache = output_list[start_idx:]
+                     append_to_json_list(rows_to_cache, output_path)
+                     start_idx = len(output_list)
+                     last_save_time = time.time()
+             if need_caching:
+                 pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+         if len(tmp_list) > 0:
+             results = pool.map(row_func, tmp_list)
+             output_list.extend([row_to_json(x) for x in results])
+         pool.close()
+         if need_caching:
+             rows_to_cache = output_list[start_idx:]
+             append_to_json_list(rows_to_cache, output_path)
+             start_idx = len(output_list)
+             pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+     output_df = pd.DataFrame(output_list)
+     return output_df
+
+
+ def dataframe_batch_mapping(
+     df: pd.DataFrame,
+     batch_row_func: Callable[[list[dict]], list[dict]],
+     output_path: Optional[Union[str, Path]] = None,
+     force_overwrite: bool = False,
+     batch_size=multiprocessing.cpu_count(),
+     cache_batch_num=1,
+ ):
+     """Map batches of DataFrame rows to new rows with batch_row_func.
+
+     Args:
+         df (DataFrame): [description]
+         batch_row_func (function): (List[Dict[str, str]]) -> List[Dict[str, str]]
+         output_path (Path): cache path for large datasets; None means intermediate results are not cached
+         force_overwrite (bool): whether to force-overwrite output_path
+         batch_size (int): batch size
+         cache_batch_num (int): cache batch num
+     """
+     need_caching = output_path is not None
+     tmp_list, output_list = list(), list()
+     start_idx = 0
+     if need_caching:
+         output_path = Path(output_path)
+         if output_path.exists() and not force_overwrite:
+             existed_df = read_as_dataframe(output_path)
+             start_idx = len(existed_df)
+             output_list = dataframe_to_json_list(existed_df)
+             logger.warning(f"Cache found that {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
+         else:
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+     start_time = time.time()
+     last_save_time = start_time
+     with tqdm(total=len(df), desc="Processing", unit="rows") as pbar:
+         for i, line in df.iterrows():
+             pbar.update(1)
+             if i < start_idx:
+                 continue
+             line_info: dict = line.to_dict()
+             tmp_list.append(line_info)
+             if len(tmp_list) == batch_size:
+                 results = batch_row_func(tmp_list)
+                 output_list.extend([row_to_json(x) for x in results])
+                 tmp_list = list()
+                 if need_caching and (i // batch_size) % cache_batch_num == 0:
+                     current_time = time.time()
+                     if current_time - last_save_time < 3:
+                         # If batches finish in under 3 seconds, skip this flush so IO does not become
+                         # the bottleneck; these intermediate results are not cached.
+                         last_save_time = current_time
+                         continue
+                     rows_to_cache = output_list[start_idx:]
+                     append_to_json_list(rows_to_cache, output_path)
+                     start_idx = len(output_list)
+                     last_save_time = time.time()
+             if need_caching:
+                 pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+         if len(tmp_list) > 0:
+             results = batch_row_func(tmp_list)
+             output_list.extend([row_to_json(x) for x in results])
+         if need_caching:
+             rows_to_cache = output_list[start_idx:]
+             append_to_json_list(rows_to_cache, output_path)
+             start_idx = len(output_list)
+             pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+     output_df = pd.DataFrame(output_list)
+     return output_df
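
Both new functions process the DataFrame in fixed-size batches, optionally cache partial results to output_path as JSON lines (flushing at most once every ~3 seconds), and resume from an existing cache on re-run unless force_overwrite is set. A rough usage sketch for the single-row variant, assuming the functions live in xlin.multiprocess_mapping (the module whose RECORD entry changed) and using a made-up cache path and row function:

    import pandas as pd

    from xlin.multiprocess_mapping import dataframe_mapping

    def add_length(row: dict) -> dict:
        # hypothetical per-row transform: receives the row as a dict of column -> value
        row["text_len"] = len(row["text"])
        return row

    df = pd.DataFrame({"text": ["a", "bb", "ccc"]})
    out_df = dataframe_mapping(
        df,
        add_length,
        output_path="cache.jsonl",  # re-running resumes from this file unless force_overwrite=True
        batch_size=2,
    )
    print(out_df)

dataframe_batch_mapping works the same way, except that batch_row_func receives the whole batch (a list of dicts) at once and runs in the calling thread rather than on a thread pool.
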
xlin-0.1.18.dist-info/METADATA → xlin-0.1.20.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: xlin
- Version: 0.1.18
+ Version: 0.1.20
  Summary: toolbox for LinXueyuan
  License: MIT
  Author: LinXueyuanStdio
xlin-0.1.18.dist-info/RECORD → xlin-0.1.20.dist-info/RECORD RENAMED
@@ -1,15 +1,15 @@
  xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
  xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
- xlin/jsonl.py,sha256=DvVM241a9VgQlp5WIMPRv-JIolT0RdSxw47IG_fc7xE,6690
+ xlin/jsonl.py,sha256=Ogn_9eIx1NPmI_hMvBVwuDTooJYDEJ8FTtViQ8zTVlQ,7618
  xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
- xlin/multiprocess_mapping.py,sha256=pmzyEUYpbpIZ_ezyvWWWRpr7D7n4t3E3jW1nGXBbVck,7652
+ xlin/multiprocess_mapping.py,sha256=dRXQoLaG1dK_qZ8B3bJblV0RKM2gqIeSW1EaOZbIdD0,14251
  xlin/read_as_dataframe.py,sha256=T8A4qk4Grof_WC_mNz4QVaWDQgJ103rUAQ8tsamm8SQ,8898
  xlin/statistic.py,sha256=i0Z1gbW2IYHCA0lb16w1Ncrk0Q7Q1Ttm0n4we-ki6II,9301
  xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
  xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
  xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
  xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
- xlin-0.1.18.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
- xlin-0.1.18.dist-info/METADATA,sha256=BWrBEOgAePxk0-vrjAgh2da_YA3HORi3awuFbZZbBUY,1098
- xlin-0.1.18.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
- xlin-0.1.18.dist-info/RECORD,,
+ xlin-0.1.20.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+ xlin-0.1.20.dist-info/METADATA,sha256=DW9S85CerwgeiPFFETvVEai0OmxdIcoKSt9UXvIg71s,1098
+ xlin-0.1.20.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+ xlin-0.1.20.dist-info/RECORD,,