xlin 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xlin
-Version: 0.1.4
+Version: 0.1.6
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: XiChen
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "xlin"
-version = "0.1.4"
+version = "0.1.6"
 description = "toolbox for LinXueyuan"
 authors = ["XiChen <23211526+LinXueyuanStdio@users.noreply.github.com>"]
 license = "MIT"
@@ -9,7 +9,62 @@ from pathlib import Path
 from tqdm import tqdm
 from loguru import logger
 
-from xlin.jsonl import dataframe_to_json_list, load_json_list, save_json_list, load_json, save_json
+from xlin.jsonl import load_json_list, save_json_list, load_json, save_json
+from xlin.util import ls
+
+
+def element_mapping(
+    iterator: List[Any],
+    mapping_func: Callable[[Any], Tuple[bool, Any]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+):
+    rows = []
+    if use_multiprocessing:
+        pool = ThreadPool(thread_pool_size)
+        results = pool.map(mapping_func, iterator)
+        pool.close()
+        for ok, row in results:
+            if ok:
+                rows.append(row)
+    else:
+        for row in tqdm(iterator):
+            ok, row = mapping_func(row)
+            if ok:
+                rows.append(row)
+    return rows
+
+
+def batch_mapping(
+    iterator: List[Any],
+    mapping_func: Callable[[List[Any]], Tuple[bool, List[Any]]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+    batch_size=4,
+):
+    batch_iterator = []
+    batch = []
+    for i, item in enumerate(iterator):
+        batch.append(item)
+        if len(batch) == batch_size:
+            batch_iterator.append(batch)
+            batch = []
+    if len(batch) > 0:
+        batch_iterator.append(batch)
+    rows = element_mapping(batch_iterator, mapping_func, use_multiprocessing, thread_pool_size)
+    rows = [row for batch in rows for row in batch]
+    return rows
+
+
+def dataframe_with_row_mapping(
+    df: pd.DataFrame,
+    mapping_func: Callable[[dict], Tuple[bool, dict]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+):
+    rows = element_mapping(df.iterrows(), lambda x: mapping_func(x[1]), use_multiprocessing, thread_pool_size)
+    df = pd.DataFrame(rows)
+    return df
 
 
 def multiprocessing_mapping_jsonlist(
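The new helpers added above share one contract: `mapping_func` returns an `(ok, result)` tuple, and only results whose `ok` flag is true are kept. A minimal usage sketch of that contract (the data, functions, and top-level import path below are assumptions for illustration, not shown in this diff):

```python
# Sketch of the (ok, value) contract used by the new 0.1.6 helpers.
# NOTE: the import path is an assumption; the diff does not name the module.
from xlin import element_mapping, batch_mapping

def keep_even_squares(x):
    # Return (ok, result); items with ok == False are dropped.
    return x % 2 == 0, x * x

# use_multiprocessing=False takes the sequential tqdm branch.
print(element_mapping(list(range(10)), keep_even_squares, use_multiprocessing=False))
# -> [0, 4, 16, 36, 64]

def sum_batch(batch):
    # batch_mapping groups items into lists of batch_size and flattens the results.
    return True, [sum(batch)]

print(batch_mapping(list(range(10)), sum_batch, use_multiprocessing=False, batch_size=4))
# -> [6, 22, 17]  (sums of [0,1,2,3], [4,5,6,7], [8,9])
```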
@@ -62,6 +117,7 @@ def multiprocessing_mapping_jsonlist(
     if len(tmp_list) > 0:
         results = pool.map(partial_func, tmp_list)
         output_list.extend([x for x in results])
+    pool.close()
    if need_caching:
        save_json_list(output_list, output_path)
    return output_list
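The `pool.close()` added here (and in `multiprocessing_mapping` below) fixes a resource leak: each call previously left its `ThreadPool` worker threads alive after `map` returned. A standalone illustration of the pattern, not package code:

```python
# Standalone illustration of the ThreadPool lifecycle the fix enforces.
from multiprocessing.pool import ThreadPool

pool = ThreadPool(5)
results = pool.map(lambda x: x + 1, [1, 2, 3])
pool.close()  # no further tasks may be submitted; idle workers can exit
pool.join()   # optional: block until all workers have terminated
print(results)  # [2, 3, 4]
```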
@@ -122,57 +178,13 @@ def multiprocessing_mapping(
     if len(tmp_list) > 0:
         results = pool.map(partial_func, tmp_list)
         output_list.extend([x for x in results])
+    pool.close()
     output_df = pd.DataFrame(output_list)
     if need_caching:
         output_df.to_excel(output_path, index=False)
     return output_df, output_list
 
 
-def dataframe_with_row_mapping(
-    df: pd.DataFrame,
-    mapping_func: Callable[[Tuple[int, dict]], Tuple[bool, dict]],
-    use_multiprocessing=True,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
-):
-    rows = []
-    if use_multiprocessing:
-        pool = ThreadPool(thread_pool_size)
-        logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
-        results = pool.map(mapping_func, enumerate(dataframe_to_json_list(df)))
-        for ok, row in results:
-            if ok:
-                rows.append(row)
-    else:
-        for i, row in tqdm(df.iterrows()):
-            ok, row = mapping_func(i, row)
-            if ok:
-                rows.append(row)
-    df = pd.DataFrame(rows)
-    return df
-
-
-def list_with_element_mapping(
-    iterator: List[Any],
-    mapping_func: Callable[[Tuple[int, Any]], Tuple[bool, Any]],
-    use_multiprocessing=True,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
-):
-    rows = []
-    if use_multiprocessing:
-        pool = ThreadPool(thread_pool_size)
-        logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
-        results = pool.map(mapping_func, enumerate(iterator))
-        for ok, row in results:
-            if ok:
-                rows.append(row)
-    else:
-        for i, row in tqdm(enumerate(iterator)):
-            ok, row = mapping_func(i, row)
-            if ok:
-                rows.append(row)
-    return rows
-
-
 def continue_run(
     jsonfiles: List[str],
     save_dir: str,
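Beyond the leak fix, this hunk removes the old index-based variants in favor of the helpers added earlier in the file: `list_with_element_mapping` is superseded by `element_mapping`, and `dataframe_with_row_mapping` is re-implemented on top of it. The visible contract change for callers is that `mapping_func` no longer receives `enumerate`-style `(index, item)` pairs. (The removed code was also internally inconsistent: its threaded branch passed a single tuple, while its sequential branch called `mapping_func(i, row)` with two arguments.) A before/after sketch, with illustrative function names:

```python
# 0.1.4 style: mapping_func was mapped over enumerate(...), so it received
# an (index, item) tuple.
def old_style(pair):
    i, item = pair
    return True, item

# 0.1.6 style: mapping_func receives the item (or row dict) directly.
def new_style(item):
    return True, item
```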
@@ -185,8 +197,7 @@ def continue_run(
     save_dir: Path = Path(save_dir)
     save_dir.mkdir(parents=True, exist_ok=True)
     new_jsonfiles = []
-    for jsonfile in jsonfiles:
-        jsonfile = Path(jsonfile)
+    for jsonfile in ls(jsonfiles):
         jsonlist = load_func(jsonfile)
         output_filepath = save_dir / jsonfile.name
         for row in jsonlist:
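`continue_run` now delegates input normalization to `ls` from `xlin.util` (imported in the first hunk of this file). Its implementation is not shown in this diff; from the call site it must at least turn each entry into something with a `.name` attribute, i.e. roughly the `Path`-coercion loop it replaces. A hypothetical stand-in:

```python
# Hypothetical stand-in for xlin.util.ls, inferred only from its call site;
# the real function may also expand directories or glob patterns.
from pathlib import Path
from typing import List, Union

def ls_sketch(paths: List[Union[str, Path]]) -> List[Path]:
    return [Path(p) for p in paths]
```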
@@ -1,5 +1,5 @@
-from collections import defaultdict
 from typing import *
+from collections import defaultdict
 from pathlib import Path
 import pandas as pd
 import os