xlin 0.1.37.tar.gz → 0.1.38.tar.gz
This diff shows the changes between package versions as they were published to a supported public registry. It is provided for informational purposes only.
- {xlin-0.1.37 → xlin-0.1.38}/PKG-INFO +1 -1
- {xlin-0.1.37 → xlin-0.1.38}/pyproject.toml +1 -1
- {xlin-0.1.37 → xlin-0.1.38}/xlin/jsonl.py +99 -24
- {xlin-0.1.37 → xlin-0.1.38}/xlin/multiprocess_mapping.py +14 -1
- {xlin-0.1.37 → xlin-0.1.38}/xlin/read_as_dataframe.py +3 -1
- {xlin-0.1.37 → xlin-0.1.38}/xlin/util.py +3 -2
- {xlin-0.1.37 → xlin-0.1.38}/LICENSE +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/README.md +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/__init__.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/ischinese.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/metric.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/statistic.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/timing.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/xls2xlsx.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/yaml.py +0 -0
{xlin-0.1.37 → xlin-0.1.38}/xlin/jsonl.py
@@ -4,6 +4,10 @@ from typing import *
 from pathlib import Path
 from loguru import logger
 import pandas as pd
+import pyexcel
+
+from xlin.util import ls
+from xlin.xls2xlsx import is_xslx
 
 
 def dataframe_to_json_list(df: pd.DataFrame):
@@ -68,11 +72,91 @@ def load_text(filename):
 
 
 def load_json_or_jsonl(filepath: str):
+    """
+    read_as_json_list is more convenient; you can switch to it seamlessly: read_as_json_list(filepath)
+    """
     if is_jsonl(filepath):
         return load_json_list(filepath)
     return load_json(filepath)
 
 
+def read_as_json_list(
+    filepath: Union[str, Path, List[str], List[Path]],
+    sheet_name: Optional[str] = None,
+    skip_None: bool = True,
+    skip_blank: bool = True,
+    filter: Callable[[Path], bool] = lambda x: True,
+) -> List[Dict]:
+    """
+    Read a file, or recursively read the files under a directory, into a JSON list (List[Dict]).
+    Supported formats: json, jsonl, xlsx, xls, csv, parquet, feather, pkl, h5, txt, tsv, xml, html, db
+    """
+    if isinstance(filepath, list):
+        json_list = []
+        for path in filepath:
+            try:
+                sub_list = read_as_json_list(path, sheet_name, skip_None, skip_blank, filter)
+                for obj in sub_list:
+                    if isinstance(obj, dict):
+                        obj["数据来源"] = Path(path).name  # key means "data source"
+                json_list.extend(sub_list)
+            except Exception as e:
+                print(f"Failed to read {path}: {e}")
+        return json_list
+
+    filepath = Path(filepath)
+    if filepath.is_dir():
+        paths = ls(filepath, filter=filter, expand_all_subdir=True)
+        return read_as_json_list(paths, sheet_name, skip_None, skip_blank, filter)
+
+    filename = filepath.name
+    if filename.endswith(".json") or filename.endswith(".jsonl"):
+        if is_jsonl(filepath):
+            return load_json_list(filepath)
+        else:
+            return [load_json(filepath)]
+
+    elif filename.endswith(".xlsx"):
+        if sheet_name is None:
+            df = pd.read_excel(filepath)
+        else:
+            df = pd.read_excel(filepath, sheet_name)
+    elif filename.endswith(".xls"):
+        if is_xslx(filepath):
+            if sheet_name is None:
+                df = pd.read_excel(filepath)
+            else:
+                df = pd.read_excel(filepath, sheet_name)
+        else:
+            df = pyexcel.get_sheet(file_name=filepath)
+    elif filename.endswith(".csv"):
+        df = pd.read_csv(filepath)
+    elif filename.endswith(".parquet"):
+        df = pd.read_parquet(filepath)
+    elif filename.endswith(".feather"):
+        df = pd.read_feather(filepath)
+    elif filename.endswith(".pkl"):
+        df = pd.read_pickle(filepath)
+    elif filename.endswith(".h5"):
+        df = pd.read_hdf(filepath)
+    elif filename.endswith(".txt"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".tsv"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".xml"):
+        df = pd.read_xml(filepath)
+    elif filename.endswith(".html"):
+        df = pd.read_html(filepath)[0]
+    elif filename.endswith(".db"):
+        if sheet_name is None:
+            raise ValueError("Reading a .db file requires sheet_name as the table name")
+        df = pd.read_sql_table(sheet_name, f"sqlite:///{filepath}")
+    else:
+        raise ValueError(f"Unsupported file type: {filepath}")
+
+    return df.to_dict(orient="records")
+
+
 def load_json(filename: str):
     with open(filename, "r", encoding="utf-8") as f:
         return json.load(f)
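The new `read_as_json_list` accepts a single file, a list of files, or a directory (expanded recursively through `ls`). A minimal usage sketch; the `data/` paths are placeholders, and the exact semantics of the `filter` callback are taken from the signature above, not verified against `ls`:

```python
from xlin.jsonl import read_as_json_list

# One file: a .jsonl yields one dict per line, a tabular file one dict per row.
rows = read_as_json_list("data/records.jsonl")

# A list of files: each dict gains a "数据来源" ("data source") key holding
# the name of the file it came from, per the diff above.
merged = read_as_json_list(["data/a.jsonl", "data/b.xlsx"])

# A directory: expanded recursively, keeping only files that pass the filter.
csvs = read_as_json_list("data", filter=lambda p: p.name.endswith(".csv"))
```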
@@ -84,16 +168,24 @@ def save_json(json_list: Union[Dict[str, str], List[Dict[str, str]]], filename:
         return json.dump(json_list, f, ensure_ascii=False, separators=(",", ":"), indent=2)
 
 
-def load_json_list(filename: str):
+def load_json_list(filename: str, skip_None=True, skip_blank=True) -> List[Dict[str, str]]:
     with open(filename, "r", encoding="utf-8") as f:
         lines = f.readlines()
     json_list = []
-    for i in lines:
+    for i, line in enumerate(lines):
+        line = line.strip()
+        if line == "":
+            if not skip_blank:
+                json_list.append("")
+            continue
+        if line == "None":
+            if not skip_None:
+                json_list.append(None)
+            continue
         try:
-            obj = json.loads(i)
+            obj = json.loads(line)
         except:
-            print("
-            print(i)
+            print(f"Malformed JSON, skipping line {i}: {repr(line)}")
             continue
         json_list.append(obj)
     return json_list
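The new `skip_None` / `skip_blank` flags control whether blank lines and literal `None` lines are dropped (the default) or kept as placeholders. A self-contained sketch of the behavior the diff implies, using a throwaway temp file rather than anything shipped with the package:

```python
import tempfile
from xlin.jsonl import load_json_list

# Hypothetical scratch file with one blank line and one literal "None" line.
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write('{"a": 1}\n\nNone\n{"b": 2}\n')
    path = f.name

# Defaults drop the blank line and the "None" line.
print(load_json_list(path))
# [{'a': 1}, {'b': 2}]

# Keeping them inserts "" and None placeholders at their original positions.
print(load_json_list(path, skip_None=False, skip_blank=False))
# [{'a': 1}, '', None, {'b': 2}]
```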
@@ -176,7 +268,7 @@ def apply_changes_to_paths(
 ):
     total_updated = 0
     total_deleted = 0
-    for path in paths:
+    for path in ls(paths):
         if verbose:
             print("checking", path)
         jsonlist = load_json(path)
@@ -199,25 +291,8 @@ def apply_changes_to_paths(
     print(f"total: updated {total_updated}, deleted {total_deleted}")
 
 
-def backup_current_output(row: Dict[str, str], output_key="output"):
-    if "old_output" in row:
-        for i in range(1, 10):
-            if f"old_output{i}" not in row:
-                row[f"old_output{i}"] = row[output_key]
-                break
-    else:
-        row["old_output"] = row[output_key]
-    return row
-
-
-def backup_and_set_output(row: Dict[str, str], output: str):
-    backup_current_output(row)
-    row["output"] = output
-    return row
-
-
 def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dict[str, Any]]] = load_json):
-    for path in paths:
+    for path in ls(paths):
         jsonlist: List[Dict[str, Any]] = load_data(path)
         for row in jsonlist:
             yield path, row
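Both `apply_changes_to_paths` and `generator_from_paths` now route their path argument through `xlin.util.ls`, so a directory (or a mix of files and directories) should work where only a pre-expanded file list did before. A hedged sketch; `data/` is a placeholder, and the assumption that `ls` accepts a plain string comes from its use elsewhere in this diff, not from its own source:

```python
from xlin.jsonl import generator_from_paths

# With the default load_data=load_json, each file is expected to hold a
# JSON array of objects; ls() expands "data" into the files inside it.
for path, row in generator_from_paths("data"):
    print(path, row.get("id"))
```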
{xlin-0.1.37 → xlin-0.1.38}/xlin/multiprocess_mapping.py
@@ -100,6 +100,16 @@ def xmap(
         preserve_order (bool): whether to preserve the order of results
         chunksize (Optional[int]): chunk size per task; None means it is computed automatically
         retry_count (int): number of retries for failed tasks
+
+    Example:
+        >>> from xlin.multiprocess_mapping import xmap
+        >>> jsonlist = [{"id": 1, "text": "Hello"}, {"id": 2, "text": "World"}]
+        >>> def work_func(item):
+        ...     item["text"] = item["text"].upper()
+        ...     return item
+        >>> results = xmap(jsonlist, work_func, output_path="output.jsonl", batch_size=2)
+        >>> print(results)
+        [{'id': 1, 'text': 'HELLO'}, {'id': 2, 'text': 'WORLD'}]
     """
     need_caching = output_path is not None
     output_list = []
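The doctest above shows the basic call; the documented knobs compose with it. A hedged sketch — parameter names come from the docstring in this diff, and their exact semantics are not verified against the implementation:

```python
from xlin.multiprocess_mapping import xmap

jsonlist = [{"id": i} for i in range(100)]

def work_func(item):
    item["square"] = item["id"] ** 2
    return item

# Per the docstring: output_path enables caching of finished results,
# preserve_order keeps results aligned with the input order, and
# retry_count re-runs failed items.
results = xmap(
    jsonlist,
    work_func,
    output_path="squares.jsonl",
    preserve_order=True,
    retry_count=2,
)
```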
@@ -204,7 +214,7 @@ def xmap(
 def multiprocessing_mapping(
     df: pd.DataFrame,
     output_path: Optional[Union[str, Path]],
-    partial_func,
+    partial_func: Callable[[Dict[str, str]], Dict[str, str]],
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
@@ -215,6 +225,9 @@ def multiprocessing_mapping(
         df (DataFrame): [description]
         output_path (Path): caching is needed when the data volume is large
         partial_func (function): (Dict[str, str]) -> Dict[str, str]
+        batch_size (int): batch size
+        cache_batch_num (int): cache batch num
+        thread_pool_size (int): thread pool size
     """
     need_caching = output_path is not None
     tmp_list, output_list = list(), list()
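The new annotation makes explicit that `multiprocessing_mapping` maps a dict-in, dict-out callable over DataFrame rows. A sketch of a conforming callable, with a hypothetical column name; passing `output_path=None` skips the caching path per the `need_caching` check above:

```python
import pandas as pd
from xlin.multiprocess_mapping import multiprocessing_mapping

df = pd.DataFrame({"text": ["hello", "world"]})  # hypothetical input

def to_upper(row: dict) -> dict:
    # Matches the annotated Callable[[Dict[str, str]], Dict[str, str]].
    row["upper"] = row["text"].upper()
    return row

multiprocessing_mapping(df, None, to_upper, batch_size=2)
```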
{xlin-0.1.37 → xlin-0.1.38}/xlin/read_as_dataframe.py
@@ -77,7 +77,9 @@ def read_as_dataframe(
     elif filename.endswith(".html"):
         df = pd.read_html(filepath)[0]
     elif filename.endswith(".db"):
-
+        if sheet_name is None:
+            raise ValueError("Reading a .db file requires sheet_name as the table name")
+        df = pd.read_sql_table(sheet_name, f"sqlite:///{filepath}")
     else:
         raise ValueError(
             (
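`.db` files are read with pandas' SQL reader: `sheet_name` doubles as the table name and the path is wrapped into an `sqlite:///` URL. A standalone sketch of that underlying pandas call outside of xlin (requires SQLAlchemy; `demo.db` and the `users` table are hypothetical):

```python
import sqlite3
import pandas as pd

# Build a throwaway SQLite database with one table.
con = sqlite3.connect("demo.db")
con.execute("CREATE TABLE IF NOT EXISTS users (id INTEGER, name TEXT)")
con.execute("INSERT INTO users VALUES (1, 'alice'), (2, 'bob')")
con.commit()
con.close()

# The call the diff adds: table name plus an sqlite:/// SQLAlchemy URL.
df = pd.read_sql_table("users", "sqlite:///demo.db")
print(df.to_dict(orient="records"))
# [{'id': 1, 'name': 'alice'}, {'id': 2, 'name': 'bob'}]
```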
{xlin-0.1.37 → xlin-0.1.38}/xlin/util.py
@@ -1,14 +1,15 @@
 from typing import *
 from collections import defaultdict
 from pathlib import Path
-import pandas as pd
 import os
 import asyncio
 import datetime
-from loguru import logger
 import shutil
 import random
 
+import pandas as pd
+from loguru import logger
+
 
 date_str = datetime.datetime.now().strftime("%Y%m%d")
 datetime_str = datetime.datetime.now().strftime("%Y%m%d_%Hh%Mm%Ss")