xlin-0.1.18-py2.py3-none-any.whl → xlin-0.1.19-py2.py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- xlin/jsonl.py +25 -0
- xlin/multiprocess_mapping.py +144 -1
- {xlin-0.1.18.dist-info → xlin-0.1.19.dist-info}/METADATA +1 -1
- {xlin-0.1.18.dist-info → xlin-0.1.19.dist-info}/RECORD +6 -6
- {xlin-0.1.18.dist-info → xlin-0.1.19.dist-info}/LICENSE +0 -0
- {xlin-0.1.18.dist-info → xlin-0.1.19.dist-info}/WHEEL +0 -0
xlin/jsonl.py
CHANGED
@@ -221,3 +221,28 @@ def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dic
         jsonlist: List[Dict[str, Any]] = load_data(path)
         for row in jsonlist:
             yield path, row
+
+
+
+def append_to_json_list(data: list[dict], file_path: str):
+    """Append a list of dictionaries to a JSON file."""
+    with open(file_path, "a") as f:
+        for item in data:
+            f.write(json.dumps(item) + "\n")
+
+
+def row_to_json(row: dict) -> dict:
+    """Convert a row to a JSON object."""
+    new_row = {}
+    for k, v in row.items():
+        if isinstance(v, dict):
+            new_row[k] = row_to_json(v)
+        elif isinstance(v, list):
+            new_row[k] = [row_to_json(item) for item in v]
+        elif isinstance(v, pd.DataFrame):
+            new_row[k] = [row_to_json(item) for item in v.to_dict(orient="records")]
+        else:
+            new_row[k] = v
+
+    return new_row
+
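A minimal usage sketch of the two new helpers (illustrative only; the sample data and file name are made up). append_to_json_list writes each dict as one JSON line in append mode, and row_to_json recursively converts nested dicts, lists of dicts, and DataFrame values into plain JSON-serializable structures:

    import pandas as pd
    from xlin.jsonl import append_to_json_list, row_to_json

    rows = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]
    append_to_json_list(rows, "output.jsonl")  # appends one JSON object per line

    # Nested dicts are converted recursively; DataFrame values become lists of record dicts.
    row = {"name": "batch-1", "detail": {"ok": True}, "table": pd.DataFrame({"a": [1, 2]})}
    print(row_to_json(row))
    # {'name': 'batch-1', 'detail': {'ok': True}, 'table': [{'a': 1}, {'a': 2}]}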
xlin/multiprocess_mapping.py
CHANGED
@@ -9,7 +9,8 @@ from pathlib import Path
 from tqdm import tqdm
 from loguru import logger

-from xlin.jsonl import load_json_list, save_json_list, load_json, save_json
+from xlin.jsonl import append_to_json_list, dataframe_to_json_list, load_json_list, row_to_json, save_json_list, load_json, save_json
+from xlin.read_as_dataframe import read_as_dataframe
 from xlin.util import ls


@@ -212,3 +213,145 @@ def continue_run(
         save_func(new_jsonlist, output_filepath)
         new_jsonfiles.append(output_filepath)
     return new_jsonfiles
+
+
+def dataframe_mapping(
+    df: pd.DataFrame,
+    row_func: Callable[[dict], dict],
+    output_path: Optional[Union[str, Path]] = None,
+    force_overwrite: bool = False,
+    batch_size=multiprocessing.cpu_count(),
+    cache_batch_num=1,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+):
+    """mapping a column to another column
+
+    Args:
+        df (DataFrame): [description]
+        row_func (function): (Dict[str, str]) -> Dict[str, str]
+        output_path (Path): cache intermediate results when the dataset is large. None means no intermediate caching
+        force_overwrite (bool): whether to force-overwrite output_path
+        batch_size (int): batch size
+        cache_batch_num (int): cache batch num
+        thread_pool_size (int): thread pool size
+    """
+    need_caching = output_path is not None
+    tmp_list, output_list = list(), list()
+    start_idx = 0
+    if need_caching:
+        output_path = Path(output_path)
+        if output_path.exists() and not force_overwrite:
+            existed_df = read_as_dataframe(output_path)
+            start_idx = len(existed_df)
+            output_list = dataframe_to_json_list(existed_df)
+            logger.warning(f"Cache found that {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
+            logger.warning(f"缓存 {output_path} 存在 {start_idx} 行. 本次处理将从第 {start_idx} 行开始.")
+        else:
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+    pool = ThreadPool(thread_pool_size)
+    logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
+    start_time = time.time()
+    last_save_time = start_time
+    with tqdm(total=len(df), desc="Processing", unit="rows") as pbar:
+        for i, line in df.iterrows():
+            pbar.update(1)
+            if i < start_idx:
+                continue
+            line_info: dict = line.to_dict()
+            tmp_list.append(line_info)
+            if len(tmp_list) == batch_size:
+                results = pool.map(row_func, tmp_list)
+                output_list.extend([row_to_json(x) for x in results])
+                tmp_list = list()
+                if need_caching and (i // batch_size) % cache_batch_num == 0:
+                    current_time = time.time()
+                    if current_time - last_save_time < 3:
+                        # If batches finish in under 3 seconds, skip this cache write so that I/O does not become the bottleneck and slow down progress
+                        last_save_time = current_time
+                        continue
+                    rows_to_cache = output_list[start_idx:]
+                    append_to_json_list(rows_to_cache, output_path)
+                    start_idx = len(output_list)
+                    last_save_time = time.time()
+            if need_caching:
+                pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+        if len(tmp_list) > 0:
+            results = pool.map(row_func, tmp_list)
+            output_list.extend([row_to_json(x) for x in results])
+        pool.close()
+        if need_caching:
+            rows_to_cache = output_list[start_idx:]
+            append_to_json_list(rows_to_cache, output_path)
+            start_idx = len(output_list)
+            pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+    output_df = pd.DataFrame(output_list)
+    return output_df
+
+
+def dataframe_batch_mapping(
+    df: pd.DataFrame,
+    batch_row_func: Callable[[list[dict]], dict],
+    output_path: Optional[Union[str, Path]] = None,
+    force_overwrite: bool = False,
+    batch_size=multiprocessing.cpu_count(),
+    cache_batch_num=1,
+):
+    """mapping a column to another column
+
+    Args:
+        df (DataFrame): [description]
+        row_func (function): (Dict[str, str]) -> Dict[str, str]
+        output_path (Path): cache intermediate results when the dataset is large. None means no intermediate caching
+        force_overwrite (bool): whether to force-overwrite output_path
+        batch_size (int): batch size
+        cache_batch_num (int): cache batch num
+        thread_pool_size (int): thread pool size
+    """
+    need_caching = output_path is not None
+    tmp_list, output_list = list(), list()
+    start_idx = 0
+    if need_caching:
+        output_path = Path(output_path)
+        if output_path.exists() and not force_overwrite:
+            existed_df = read_as_dataframe(output_path)
+            start_idx = len(existed_df)
+            output_list = dataframe_to_json_list(existed_df)
+            logger.warning(f"Cache found that {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
+            logger.warning(f"缓存 {output_path} 存在 {start_idx} 行. 本次处理将从第 {start_idx} 行开始.")
+        else:
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+    start_time = time.time()
+    last_save_time = start_time
+    with tqdm(total=len(df), desc="Processing", unit="rows") as pbar:
+        for i, line in df.iterrows():
+            pbar.update(1)
+            if i < start_idx:
+                continue
+            line_info: dict = line.to_dict()
+            tmp_list.append(line_info)
+            if len(tmp_list) == batch_size:
+                results = batch_row_func(tmp_list)
+                output_list.extend([row_to_json(x) for x in results])
+                tmp_list = list()
+                if need_caching and (i // batch_size) % cache_batch_num == 0:
+                    current_time = time.time()
+                    if current_time - last_save_time < 3:
+                        # If batches finish in under 3 seconds, skip this cache write so that I/O does not become the bottleneck and slow down progress
+                        last_save_time = current_time
+                        continue
+                    rows_to_cache = output_list[start_idx:]
+                    append_to_json_list(rows_to_cache, output_path)
+                    start_idx = len(output_list)
+                    last_save_time = time.time()
+            if need_caching:
+                pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+        if len(tmp_list) > 0:
+            results = batch_row_func(tmp_list)
+            output_list.extend([row_to_json(x) for x in results])
+        if need_caching:
+            rows_to_cache = output_list[start_idx:]
+            append_to_json_list(rows_to_cache, output_path)
+            start_idx = len(output_list)
+            pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+    output_df = pd.DataFrame(output_list)
+    return output_df
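The new dataframe_mapping applies row_func to every row through a thread pool in batches, and, when output_path is given, appends finished rows to a JSONL cache so an interrupted run can resume from the cached row count; dataframe_batch_mapping follows the same loop but passes each whole batch (a list of dicts) to batch_row_func instead of mapping over a pool. A minimal usage sketch (illustrative only; the callback, sample data, and cache path below are made up):

    import pandas as pd
    from xlin.multiprocess_mapping import dataframe_mapping

    def add_length(row: dict) -> dict:
        # row_func receives one row as a dict and returns the transformed dict
        row["text_len"] = len(row["text"])
        return row

    df = pd.DataFrame({"text": ["hello", "world", "resume me"]})
    result_df = dataframe_mapping(
        df,
        add_length,
        output_path="cache/mapped.jsonl",  # hypothetical path; processed rows are appended here for resuming
        batch_size=2,                      # rows handed to the thread pool per batch
    )
    print(result_df)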
{xlin-0.1.18.dist-info → xlin-0.1.19.dist-info}/RECORD
CHANGED
@@ -1,15 +1,15 @@
 xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
 xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
-xlin/jsonl.py,sha256=
+xlin/jsonl.py,sha256=Ixd3zE_cQVF-2q9OuGj6C06opdlT5BNEuysDPZAVTrg,7395
 xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
-xlin/multiprocess_mapping.py,sha256=
+xlin/multiprocess_mapping.py,sha256=dRXQoLaG1dK_qZ8B3bJblV0RKM2gqIeSW1EaOZbIdD0,14251
 xlin/read_as_dataframe.py,sha256=T8A4qk4Grof_WC_mNz4QVaWDQgJ103rUAQ8tsamm8SQ,8898
 xlin/statistic.py,sha256=i0Z1gbW2IYHCA0lb16w1Ncrk0Q7Q1Ttm0n4we-ki6II,9301
 xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
 xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
 xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
 xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.
-xlin-0.1.
-xlin-0.1.
-xlin-0.1.
+xlin-0.1.19.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+xlin-0.1.19.dist-info/METADATA,sha256=0RWLGvsvfW0jctkFK4VhKY5zSsfTOroEXNDTbjy8cxI,1098
+xlin-0.1.19.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+xlin-0.1.19.dist-info/RECORD,,
{xlin-0.1.18.dist-info → xlin-0.1.19.dist-info}/LICENSE
File without changes

{xlin-0.1.18.dist-info → xlin-0.1.19.dist-info}/WHEEL
File without changes