xlin-0.1.18-py2.py3-none-any.whl → xlin-0.1.20-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xlin/jsonl.py CHANGED
@@ -221,3 +221,39 @@ def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dic
        jsonlist: List[Dict[str, Any]] = load_data(path)
        for row in jsonlist:
            yield path, row
+
+
+ def append_to_json_list(data: list[dict], file_path: str):
+     """Append a list of dictionaries to a JSON file."""
+     with open(file_path, "a") as f:
+         for item in data:
+             f.write(json.dumps(item) + "\n")
+
+
+ def row_to_json(row: dict) -> dict:
+     """Convert a row to a JSON object."""
+     new_row = {}
+     for k, v in row.items():
+         if isinstance(v, dict):
+             new_row[k] = row_to_json(v)
+         elif isinstance(v, list):
+             new_row[k] = [row_to_json(item) for item in v]
+         elif isinstance(v, pd.DataFrame):
+             new_row[k] = [row_to_json(item) for item in v.to_dict(orient="records")]
+         else:
+             new_row[k] = v
+
+     return new_row
+
+
+ def generator_from_json(path):
+     jsonlist = load_json(path)
+     for line in jsonlist:
+         yield line
+
+
+ def generator_from_jsonl(path):
+     jsonlist = load_json_list(path)
+     for line in jsonlist:
+         yield line
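
The four helpers above are additions in 0.1.20. A minimal usage sketch (the file name and sample rows are made up for illustration; the imports assume the functions are exported from xlin.jsonl, as the import change in multiprocess_mapping.py below suggests):

    from xlin.jsonl import append_to_json_list, row_to_json, generator_from_jsonl

    rows = [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}]

    # append_to_json_list opens the file in append mode and writes one JSON object per line
    append_to_json_list(rows, "output.jsonl")

    # row_to_json recursively converts nested dicts, lists, and DataFrames into plain dicts
    clean = row_to_json({"id": 3, "meta": {"source": "demo"}})

    # generator_from_jsonl yields rows one at a time instead of returning the whole list
    for row in generator_from_jsonl("output.jsonl"):
        print(row)
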
xlin/multiprocess_mapping.py CHANGED
@@ -9,7 +9,8 @@ from pathlib import Path
  from tqdm import tqdm
  from loguru import logger

- from xlin.jsonl import load_json_list, save_json_list, load_json, save_json
+ from xlin.jsonl import append_to_json_list, dataframe_to_json_list, load_json_list, row_to_json, save_json_list, load_json, save_json
+ from xlin.read_as_dataframe import read_as_dataframe
  from xlin.util import ls

@@ -212,3 +213,145 @@ def continue_run(
        save_func(new_jsonlist, output_filepath)
        new_jsonfiles.append(output_filepath)
    return new_jsonfiles
+
+
+ def dataframe_mapping(
+     df: pd.DataFrame,
+     row_func: Callable[[dict], dict],
+     output_path: Optional[Union[str, Path]] = None,
+     force_overwrite: bool = False,
+     batch_size=multiprocessing.cpu_count(),
+     cache_batch_num=1,
+     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+ ):
+     """Map each row of a DataFrame to a new row with row_func.
+
+     Args:
+         df (DataFrame): [description]
+         row_func (function): (Dict[str, str]) -> Dict[str, str]
+         output_path (Path): cache path for large datasets; None means intermediate results are not cached
+         force_overwrite (bool): whether to force-overwrite output_path
+         batch_size (int): batch size
+         cache_batch_num (int): cache batch num
+         thread_pool_size (int): thread pool size
+     """
+     need_caching = output_path is not None
+     tmp_list, output_list = list(), list()
+     start_idx = 0
+     if need_caching:
+         output_path = Path(output_path)
+         if output_path.exists() and not force_overwrite:
+             existed_df = read_as_dataframe(output_path)
+             start_idx = len(existed_df)
+             output_list = dataframe_to_json_list(existed_df)
+             logger.warning(f"Cache found that {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
+         else:
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+     pool = ThreadPool(thread_pool_size)
+     logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
+     start_time = time.time()
+     last_save_time = start_time
+     with tqdm(total=len(df), desc="Processing", unit="rows") as pbar:
+         for i, line in df.iterrows():
+             pbar.update(1)
+             if i < start_idx:
+                 continue
+             line_info: dict = line.to_dict()
+             tmp_list.append(line_info)
+             if len(tmp_list) == batch_size:
+                 results = pool.map(row_func, tmp_list)
+                 output_list.extend([row_to_json(x) for x in results])
+                 tmp_list = list()
+                 if need_caching and (i // batch_size) % cache_batch_num == 0:
+                     current_time = time.time()
+                     if current_time - last_save_time < 3:
+                         # If batches finish in under 3 seconds, skip this flush so IO does not become
+                         # the bottleneck; these intermediate results are not cached.
+                         last_save_time = current_time
+                         continue
+                     rows_to_cache = output_list[start_idx:]
+                     append_to_json_list(rows_to_cache, output_path)
+                     start_idx = len(output_list)
+                     last_save_time = time.time()
+             if need_caching:
+                 pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+         if len(tmp_list) > 0:
+             results = pool.map(row_func, tmp_list)
+             output_list.extend([row_to_json(x) for x in results])
+         pool.close()
+         if need_caching:
+             rows_to_cache = output_list[start_idx:]
+             append_to_json_list(rows_to_cache, output_path)
+             start_idx = len(output_list)
+             pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+     output_df = pd.DataFrame(output_list)
+     return output_df
+
+
+ def dataframe_batch_mapping(
+     df: pd.DataFrame,
+     batch_row_func: Callable[[list[dict]], list[dict]],
+     output_path: Optional[Union[str, Path]] = None,
+     force_overwrite: bool = False,
+     batch_size=multiprocessing.cpu_count(),
+     cache_batch_num=1,
+ ):
+     """Map batches of DataFrame rows to new rows with batch_row_func.
+
+     Args:
+         df (DataFrame): [description]
+         batch_row_func (function): (List[Dict[str, str]]) -> List[Dict[str, str]]
+         output_path (Path): cache path for large datasets; None means intermediate results are not cached
+         force_overwrite (bool): whether to force-overwrite output_path
+         batch_size (int): batch size
+         cache_batch_num (int): cache batch num
+     """
+     need_caching = output_path is not None
+     tmp_list, output_list = list(), list()
+     start_idx = 0
+     if need_caching:
+         output_path = Path(output_path)
+         if output_path.exists() and not force_overwrite:
+             existed_df = read_as_dataframe(output_path)
+             start_idx = len(existed_df)
+             output_list = dataframe_to_json_list(existed_df)
+             logger.warning(f"Cache found that {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
+         else:
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+     start_time = time.time()
+     last_save_time = start_time
+     with tqdm(total=len(df), desc="Processing", unit="rows") as pbar:
+         for i, line in df.iterrows():
+             pbar.update(1)
+             if i < start_idx:
+                 continue
+             line_info: dict = line.to_dict()
+             tmp_list.append(line_info)
+             if len(tmp_list) == batch_size:
+                 results = batch_row_func(tmp_list)
+                 output_list.extend([row_to_json(x) for x in results])
+                 tmp_list = list()
+                 if need_caching and (i // batch_size) % cache_batch_num == 0:
+                     current_time = time.time()
+                     if current_time - last_save_time < 3:
+                         # If batches finish in under 3 seconds, skip this flush so IO does not become
+                         # the bottleneck; these intermediate results are not cached.
+                         last_save_time = current_time
+                         continue
+                     rows_to_cache = output_list[start_idx:]
+                     append_to_json_list(rows_to_cache, output_path)
+                     start_idx = len(output_list)
+                     last_save_time = time.time()
+             if need_caching:
+                 pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+         if len(tmp_list) > 0:
+             results = batch_row_func(tmp_list)
+             output_list.extend([row_to_json(x) for x in results])
+         if need_caching:
+             rows_to_cache = output_list[start_idx:]
+             append_to_json_list(rows_to_cache, output_path)
+             start_idx = len(output_list)
+             pbar.set_postfix_str(f"Cache: {len(output_list)}/{len(df)}")
+     output_df = pd.DataFrame(output_list)
+     return output_df
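
Both new functions process the DataFrame in fixed-size batches, optionally cache partial results to output_path as JSON lines (flushing at most once every ~3 seconds), and resume from an existing cache on re-run unless force_overwrite is set. A rough usage sketch for the single-row variant, assuming the functions live in xlin.multiprocess_mapping (the module whose RECORD entry changed) and using a made-up cache path and row function:

    import pandas as pd

    from xlin.multiprocess_mapping import dataframe_mapping

    def add_length(row: dict) -> dict:
        # hypothetical per-row transform: receives the row as a dict of column -> value
        row["text_len"] = len(row["text"])
        return row

    df = pd.DataFrame({"text": ["a", "bb", "ccc"]})
    out_df = dataframe_mapping(
        df,
        add_length,
        output_path="cache.jsonl",  # re-running resumes from this file unless force_overwrite=True
        batch_size=2,
    )
    print(out_df)

dataframe_batch_mapping works the same way, except that batch_row_func receives the whole batch (a list of dicts) at once and runs in the calling thread rather than on a thread pool.
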
xlin-0.1.18.dist-info/METADATA → xlin-0.1.20.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: xlin
- Version: 0.1.18
+ Version: 0.1.20
  Summary: toolbox for LinXueyuan
  License: MIT
  Author: LinXueyuanStdio
xlin-0.1.18.dist-info/RECORD → xlin-0.1.20.dist-info/RECORD RENAMED
@@ -1,15 +1,15 @@
  xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
  xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
- xlin/jsonl.py,sha256=DvVM241a9VgQlp5WIMPRv-JIolT0RdSxw47IG_fc7xE,6690
+ xlin/jsonl.py,sha256=Ogn_9eIx1NPmI_hMvBVwuDTooJYDEJ8FTtViQ8zTVlQ,7618
  xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
- xlin/multiprocess_mapping.py,sha256=pmzyEUYpbpIZ_ezyvWWWRpr7D7n4t3E3jW1nGXBbVck,7652
+ xlin/multiprocess_mapping.py,sha256=dRXQoLaG1dK_qZ8B3bJblV0RKM2gqIeSW1EaOZbIdD0,14251
  xlin/read_as_dataframe.py,sha256=T8A4qk4Grof_WC_mNz4QVaWDQgJ103rUAQ8tsamm8SQ,8898
  xlin/statistic.py,sha256=i0Z1gbW2IYHCA0lb16w1Ncrk0Q7Q1Ttm0n4we-ki6II,9301
  xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
  xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
  xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
  xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
- xlin-0.1.18.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
- xlin-0.1.18.dist-info/METADATA,sha256=BWrBEOgAePxk0-vrjAgh2da_YA3HORi3awuFbZZbBUY,1098
- xlin-0.1.18.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
- xlin-0.1.18.dist-info/RECORD,,
+ xlin-0.1.20.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+ xlin-0.1.20.dist-info/METADATA,sha256=DW9S85CerwgeiPFFETvVEai0OmxdIcoKSt9UXvIg71s,1098
+ xlin-0.1.20.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+ xlin-0.1.20.dist-info/RECORD,,