xlin-0.1.23-py2.py3-none-any.whl → xlin-0.1.25-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xlin/jsonl.py CHANGED
@@ -224,11 +224,16 @@ def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dic
 
 
 
-def append_to_json_list(data: list[dict], file_path: str):
+def append_to_json_list(data: list[dict], file_path: Union[str, Path]):
     """Append a list of dictionaries to a JSON file."""
-    with open(file_path, "a") as f:
+    file_path = Path(file_path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    if file_path.exists() and file_path.is_dir():
+        print(f"{file_path} is a directory, not a file.")
+        return
+    with open(file_path, "a", encoding="utf-8") as f:
         for item in data:
-            f.write(json.dumps(item) + "\n")
+            f.write(json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n")
 
 
 def row_to_json(row: dict) -> dict:
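For reference, a minimal usage sketch of the updated helper; the sample records and output path below are hypothetical:

from pathlib import Path
from xlin.jsonl import append_to_json_list

records = [{"id": 1, "text": "你好"}, {"id": 2, "text": "world"}]
# str and Path are both accepted now; parent directories are created on demand,
# and ensure_ascii=False keeps non-ASCII text readable in the .jsonl output.
append_to_json_list(records, Path("out/data.jsonl"))
append_to_json_list(records, "out/data.jsonl")  # appends two more lines to the same file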
xlin/multiprocess_mapping.py CHANGED
@@ -21,15 +21,19 @@ def element_mapping(
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
 ):
     rows = []
+    # Convert to a list so the length is known for the progress bar
+    items = list(iterator)
+    total = len(items)
+
     if use_multiprocessing:
         pool = ThreadPool(thread_pool_size)
-        results = pool.map(mapping_func, iterator)
-        pool.close()
-        for ok, row in results:
+        # Use imap instead of map, combined with tqdm to show progress
+        for ok, row in tqdm(pool.imap(mapping_func, items), total=total, desc="Processing"):
             if ok:
                 rows.append(row)
+        pool.close()
     else:
-        for row in tqdm(iterator):
+        for row in tqdm(items, desc="Processing"):
             ok, row = mapping_func(row)
             if ok:
                 rows.append(row)
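A sketch of the contract mapping_func must satisfy under this loop: it receives one item and returns an (ok, row) tuple, and rows with a falsy ok are dropped. The helper below is hypothetical:

def keep_non_empty(item: dict):
    # Returns (ok, row); element_mapping keeps the row only when ok is truthy.
    text = item.get("text", "").strip()
    return bool(text), {**item, "text": text}

# Hypothetical call; parameters other than those visible in this hunk
# (use_multiprocessing, thread_pool_size) are assumptions.
# rows = element_mapping(data, keep_non_empty, use_multiprocessing=True)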
@@ -70,57 +74,111 @@ def dataframe_with_row_mapping(
 
 def multiprocessing_mapping_jsonlist(
     jsonlist: List[Any],
-    output_path: Optional[Union[str, Path]],
-    partial_func,
+    partial_func: Callable[[Any], dict],
+    output_path: Optional[Union[str, Path]]=None,  # Output path; None disables caching
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+    use_process_pool=True,  # Set to True for CPU-bound tasks
+    preserve_order=True,  # Whether to preserve result order
+    chunksize=None,  # Automatically compute the best chunk size
+    retry_count=0,  # Number of retries on failure
 ):
-    """mapping a column to another column
+    """Efficiently process a JSON list with multiprocessing/multithreading
 
     Args:
-        df (DataFrame): [description]
-        output_path (Path): caching is needed when the data volume is large
-        partial_func (function): (Dict[str, str]) -> Dict[str, str]
+        jsonlist (List[Any]): list of JSON objects to process
+        output_path (Optional[Union[str, Path]]): output path; None disables caching
+        partial_func (Callable): processing function that takes a dict and returns a dict
+        batch_size (int): batch size
+        cache_batch_num (int): number of batches between cache writes
+        thread_pool_size (int): thread/process pool size
+        use_process_pool (bool): whether to use a process pool (CPU-bound tasks)
+        preserve_order (bool): whether to preserve result order
+        chunksize (Optional[int]): chunk size per task; None means auto-computed
+        retry_count (int): number of retries on task failure
     """
     need_caching = output_path is not None
-    tmp_list, output_list = list(), list()
+    output_list = []
     start_idx = 0
+
+    # Automatically compute the best chunksize
+    if chunksize is None:
+        chunksize = max(1, min(batch_size // thread_pool_size, 100))
+
+    # Handle caching
     if need_caching:
         output_path = Path(output_path)
         if output_path.exists():
             output_list = load_json_list(output_path)
             start_idx = len(output_list)
-            logger.warning(f"Cache found {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
-            logger.warning(f"Cache {output_path} has {start_idx} rows. This run will resume from row {start_idx}.")
+            logger.info(f"Resuming: {start_idx} records done, {len(jsonlist)} total")
         else:
             output_path.parent.mkdir(parents=True, exist_ok=True)
-    pool = ThreadPool(thread_pool_size)
-    logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
-    start_time = time.time()
-    last_save_time = start_time
-    for i, line in tqdm(list(enumerate(jsonlist))):
-        if i < start_idx:
-            continue
-        tmp_list.append(line)
-        if len(tmp_list) == batch_size:
-            results = pool.map(partial_func, tmp_list)
-            output_list.extend([x for x in results])
-            tmp_list = list()
-            if need_caching and (i // batch_size) % cache_batch_num == 0:
-                current_time = time.time()
-                if current_time - last_save_time < 3:
-                    # If multiprocessing finishes batches too quickly, skip batches under 3 seconds and do not cache intermediate results, so IO does not become the bottleneck
-                    last_save_time = current_time
-                    continue
-                save_json_list(output_list, output_path)
-                last_save_time = time.time()
-    if len(tmp_list) > 0:
-        results = pool.map(partial_func, tmp_list)
-        output_list.extend([x for x in results])
-    pool.close()
+
+    # Choose a thread pool or a process pool
+    if use_process_pool:
+        pool_cls = multiprocessing.Pool
+        logger.info(f"Using a process pool (ProcessPool), suited to CPU-bound tasks")
+    else:
+        pool_cls = ThreadPool
+        logger.info(f"Using a thread pool (ThreadPool), suited to IO-bound tasks")
+
+    with pool_cls(thread_pool_size) as pool:
+        logger.info(f"Pool size: {thread_pool_size}, batch size: {batch_size}, chunk size: {chunksize}")
+
+        # Prepare the data to process
+        remaining_items = jsonlist[start_idx:]
+        total_items = len(remaining_items)
+
+        # Batch processing logic
+        def process_batch(items_batch, retry_remaining=retry_count):
+            try:
+                # Pick the appropriate mapping method
+                map_func = pool.imap_unordered if not preserve_order else pool.imap
+                return list(map_func(partial_func, items_batch, chunksize))
+            except Exception as e:
+                if retry_remaining > 0:
+                    logger.warning(f"Batch failed, retrying ({retry_count-retry_remaining+1}/{retry_count}): {e}")
+                    return process_batch(items_batch, retry_remaining - 1)
+                else:
+                    logger.error(f"Batch failed: {e}")
+                    raise
+
+        # Process the data
+        with tqdm(total=total_items, desc="Processing", unit="item") as pbar:
+            # Skip items that were already processed
+            pbar.update(start_idx)
+
+            # Process in batches
+            for i in range(0, total_items, batch_size):
+                batch = remaining_items[i : i + batch_size]
+
+                # Process the current batch
+                batch_start_time = time.time()
+                results = process_batch(batch)
+                batch_time = time.time() - batch_start_time
+
+                # Collect the results
+                output_list.extend(results)
+                pbar.update(len(batch))
+
+                # Performance statistics
+                items_per_second = len(batch) / batch_time if batch_time > 0 else 0
+                pbar.set_postfix_str(f"rate: {items_per_second:.1f} items/s")
+
+                # Caching logic
+                if need_caching and (i // batch_size) % cache_batch_num == 0:
+                    # Only save the cache when processing is slow enough, so IO does not become the bottleneck
+                    if batch_time > 3 or i + batch_size >= total_items:
+                        save_json_list(output_list, output_path)
+                        logger.debug(f"Saved {len(output_list)} records to {output_path}")
+
+    # Final save
     if need_caching:
         save_json_list(output_list, output_path)
+        logger.info(f"Finished processing and saved {len(output_list)} records")
+
     return output_list
 
 
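Note the argument order change: partial_func is now the second positional parameter and output_path has become a keyword argument defaulting to None. A hedged usage sketch, assuming the function lives in xlin/multiprocess_mapping.py as the RECORD entry below suggests; the worker and paths are hypothetical. With use_process_pool=True the worker must be picklable (a module-level function) and the call should sit under an if __name__ == "__main__" guard on spawn-based platforms:

from xlin.multiprocess_mapping import multiprocessing_mapping_jsonlist

def enrich(row: dict) -> dict:
    # Must be a top-level function so a process pool can pickle it.
    row["length"] = len(row.get("text", ""))
    return row

if __name__ == "__main__":
    jsonlist = [{"text": "hello"}, {"text": "world"}]
    results = multiprocessing_mapping_jsonlist(
        jsonlist,
        enrich,                              # partial_func is now the 2nd positional argument
        output_path="cache/enriched.jsonl",  # None disables intermediate caching
        use_process_pool=False,              # ThreadPool is enough for IO-bound work
        preserve_order=True,
    )
    print(len(results))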
@@ -186,35 +244,6 @@ def multiprocessing_mapping(
     return output_df, output_list
 
 
-def continue_run(
-    jsonfiles: List[str],
-    save_dir: str,
-    mapping_func,
-    load_func=load_json,
-    save_func=save_json,
-    batch_size=1024,
-    cache_size=8,
-):
-    save_dir: Path = Path(save_dir)
-    save_dir.mkdir(parents=True, exist_ok=True)
-    new_jsonfiles = []
-    for jsonfile in ls(jsonfiles):
-        jsonlist = load_func(jsonfile)
-        output_filepath = save_dir / jsonfile.name
-        for row in jsonlist:
-            row["来源"] = jsonfile.name
-        new_jsonlist = multiprocessing_mapping_jsonlist(
-            jsonlist,
-            output_filepath,
-            mapping_func,
-            batch_size,
-            cache_size,
-        )
-        save_func(new_jsonlist, output_filepath)
-        new_jsonfiles.append(output_filepath)
-    return new_jsonfiles
-
-
 def dataframe_mapping(
     df: pd.DataFrame,
     row_func: Callable[[dict], dict],
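continue_run is removed in 0.1.25, so callers have to reproduce its per-file loop themselves. A rough migration sketch against the new multiprocessing_mapping_jsonlist signature; the import locations of ls, load_json and save_json are assumptions based on the module names in the RECORD below:

from pathlib import Path
from xlin.util import ls                      # assumed location of ls
from xlin.jsonl import load_json, save_json   # assumed location of the JSON helpers
from xlin.multiprocess_mapping import multiprocessing_mapping_jsonlist

def run_over_files(jsonfiles, save_dir, mapping_func):
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    new_jsonfiles = []
    for jsonfile in ls(jsonfiles):
        jsonlist = load_json(jsonfile)
        output_filepath = save_dir / jsonfile.name
        for row in jsonlist:
            row["来源"] = jsonfile.name
        # partial_func now precedes output_path, unlike the removed positional call above.
        new_jsonlist = multiprocessing_mapping_jsonlist(
            jsonlist, mapping_func, output_path=output_filepath
        )
        save_json(new_jsonlist, output_filepath)
        new_jsonfiles.append(output_filepath)
    return new_jsonfiles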
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xlin
-Version: 0.1.23
+Version: 0.1.25
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: LinXueyuanStdio
@@ -1,15 +1,15 @@
 xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
 xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
-xlin/jsonl.py,sha256=Ogn_9eIx1NPmI_hMvBVwuDTooJYDEJ8FTtViQ8zTVlQ,7618
+xlin/jsonl.py,sha256=IDRydHh2x-8iAGCxt9ScK2wfNLNA40PxNxR5hhr4v6k,7903
 xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
-xlin/multiprocess_mapping.py,sha256=dRXQoLaG1dK_qZ8B3bJblV0RKM2gqIeSW1EaOZbIdD0,14251
+xlin/multiprocess_mapping.py,sha256=ppSNidDLb6pI7_thCcqZBpYtKGTTS4osoPIIbWBu0d4,15893
 xlin/read_as_dataframe.py,sha256=MqY57L7Wp9UoWTRlZLSBKQNaZa-dKw51-ufrKvHKf8s,9041
 xlin/statistic.py,sha256=2DCUgzf7xkMFH4Pk9v82bFDNeSxCTjwPh9Y4IPJBHCE,9300
 xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
 xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
 xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
 xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.23.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
-xlin-0.1.23.dist-info/METADATA,sha256=b0fZmt4pTd0U3NZd0N3P3CfHNIjdsbebm2Km-IvX_-E,1098
-xlin-0.1.23.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
-xlin-0.1.23.dist-info/RECORD,,
+xlin-0.1.25.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+xlin-0.1.25.dist-info/METADATA,sha256=4xqcaW20xkdlge7nsCWw5yRByrTyXsxZAgPca2TVFpY,1098
+xlin-0.1.25.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+xlin-0.1.25.dist-info/RECORD,,