xlin 0.1.11__tar.gz → 0.1.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xlin-0.1.11 → xlin-0.1.13}/PKG-INFO +4 -2
- {xlin-0.1.11 → xlin-0.1.13}/pyproject.toml +2 -1
- {xlin-0.1.11 → xlin-0.1.13}/xlin/read_as_dataframe.py +88 -57
- xlin-0.1.13/xlin/statistic.py +116 -0
- xlin-0.1.11/xlin/statistic.py +0 -33
- {xlin-0.1.11 → xlin-0.1.13}/LICENSE +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/README.md +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/__init__.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/ischinese.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/jsonl.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/metric.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/multiprocess_mapping.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/util.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/xls2xlsx.py +0 -0
- {xlin-0.1.11 → xlin-0.1.13}/xlin/yaml.py +0 -0
{xlin-0.1.11 → xlin-0.1.13}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: xlin
-Version: 0.1.11
+Version: 0.1.13
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: XiChen
@@ -18,11 +18,13 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: loguru
 Requires-Dist: pandas
 Requires-Dist: pyexcel
 Requires-Dist: pyexcel-xls
 Requires-Dist: pyexcel-xlsx
+Requires-Dist: pyyaml
 Requires-Dist: tqdm
 Requires-Dist: xlsxwriter
 Description-Content-Type: text/markdown
{xlin-0.1.11 → xlin-0.1.13}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "xlin"
-version = "0.1.11"
+version = "0.1.13"
 description = "toolbox for LinXueyuan"
 authors = ["XiChen <23211526+LinXueyuanStdio@users.noreply.github.com>"]
 license = "MIT"
@@ -14,6 +14,7 @@ pyexcel-xls = "*"
 pyexcel-xlsx = "*"
 xlsxwriter = "*"
 tqdm = "*"
+pyyaml = "*"
 
 [build-system]
 requires = ["poetry-core"]
{xlin-0.1.11 → xlin-0.1.13}/xlin/read_as_dataframe.py

@@ -12,23 +12,23 @@ from xlin.jsonl import dataframe_to_json_list, load_json, load_json_list, save_j
 from xlin.xls2xlsx import is_xslx
 
 
-def
-
-
-
-
-
+def read_as_dataframe(
+    filepath: Union[str, Path],
+    sheet_name: Optional[str] = None,
+    fill_empty_str_to_na=True,
+    filter=lambda x: True,
+) -> pd.DataFrame:
     """
-
+    读取文件为表格。如果是文件夹,则读取文件夹下的所有文件为表格并拼接
     """
     filepath = Path(filepath)
     if filepath.is_dir():
-        paths = ls(filepath, expand_all_subdir=True)
+        paths = ls(filepath, filter=filter, expand_all_subdir=True)
         df_list = []
         for path in paths:
             try:
-                df = read_as_dataframe(path, sheet_name, fill_empty_str_to_na)
-                df[
+                df = read_as_dataframe(path, sheet_name, fill_empty_str_to_na, filter)
+                df["数据来源"] = path.name
             except:
                 df = pd.DataFrame()
             df_list.append(df)
@@ -44,56 +44,64 @@ def read_as_dataframe(filepath: Union[str, Path], sheet_name: Optional[str] = No
         json_list = load_json_list(filepath)
         df = pd.DataFrame(json_list)
     elif filename.endswith(".xlsx"):
-
+        if sheet_name is None:
+            df = pd.read_excel(filepath)
+        else:
+            df = pd.read_excel(filepath, sheet_name)
     elif filename.endswith(".xls"):
         if is_xslx(filepath):
-
+            if sheet_name is None:
+                df = pd.read_excel(filepath)
+            else:
+                df = pd.read_excel(filepath, sheet_name)
         else:
             df = pyexcel.get_sheet(file_name=filepath)
     elif filename.endswith(".csv"):
         df = pd.read_csv(filepath)
+    elif filename.endswith(".parquet"):
+        df = pd.read_parquet(filepath)
+    elif filename.endswith(".feather"):
+        df = pd.read_feather(filepath)
+    elif filename.endswith(".pkl"):
+        df = pd.read_pickle(filepath)
+    elif filename.endswith(".h5"):
+        df = pd.read_hdf(filepath)
+    elif filename.endswith(".txt"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".tsv"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".xml"):
+        df = pd.read_xml(filepath)
+    elif filename.endswith(".html"):
+        df = pd.read_html(filepath)[0]
+    elif filename.endswith(".db"):
+        df = pd.read_sql_table(sheet_name, filepath)
     else:
-        raise ValueError(
+        raise ValueError(
+            (
+                f"Unsupported filetype {filepath}. filetype not in \n"
+                "[json, jsonl, xlsx, xls, csv, "
+                "parquet, feather, pkl, h5, txt, "
+                "tsv, xml, html, db]"
+            )
+        )
     if fill_empty_str_to_na:
         df.fillna("", inplace=True)
     return df
 
 
-def
-
-
-
-
-    out_list = list()
-    if not isinstance(filepath, Path):
-        filepath = Path(filepath)
-    if not filepath.exists():
-        raise ValueError(f"Path Not Exist: {filepath}")
-    if not filepath.is_dir():
-        return read_as_dataframe(filepath, sheet_name)
-
-    files = os.listdir(filepath)
-    for file_name in files:
-        if not valid_to_read_as_dataframe(file_name):
-            continue
-        input_file = filepath / file_name
-        df = read_as_dataframe(input_file, sheet_name)
-        df.fillna("", inplace=True)
-        for _, line in df.iterrows():
-            line = line.to_dict()
-            out_list.append(line)
-    df = pd.DataFrame(out_list)
-    return df
-
-
-def read_as_dataframe_dict(filepath: Union[str, Path], fill_empty_str_to_na=True):
+def read_as_dataframe_dict(
+    filepath: Union[str, Path],
+    fill_empty_str_to_na=True,
+    filter=lambda x: True,
+):
     filepath = Path(filepath)
     if filepath.is_dir():
-        paths = ls(filepath, expand_all_subdir=True)
+        paths = ls(filepath, filter=filter, expand_all_subdir=True)
         df_dict_list = []
         for path in paths:
             try:
-                df_dict = read_as_dataframe_dict(path, fill_empty_str_to_na)
+                df_dict = read_as_dataframe_dict(path, fill_empty_str_to_na, filter)
             except:
                 df_dict = {}
             df_dict_list.append(df_dict)
@@ -104,11 +112,11 @@ def read_as_dataframe_dict(filepath: Union[str, Path], fill_empty_str_to_na=True
         for name, df in df_dict.items():
             if fill_empty_str_to_na:
                 df.fillna("", inplace=True)
-            df[
+            df["数据来源"] = filepath.name
     elif isinstance(df_dict, pd.DataFrame):
         if fill_empty_str_to_na:
             df_dict.fillna("", inplace=True)
-        df_dict[
+        df_dict["数据来源"] = filepath.name
     return df_dict
 
 
@@ -137,7 +145,10 @@ def save_df_dict(df_dict: Dict[str, pd.DataFrame], output_filepath: Union[str, P
     return output_filepath
 
 
-def save_df_from_jsonlist(
+def save_df_from_jsonlist(
+    jsonlist: List[Dict[str, str]],
+    output_filepath: Union[str, Path],
+):
     df = pd.DataFrame(jsonlist)
     return save_df(df, output_filepath)
 
@@ -150,7 +161,9 @@ def save_df(df: pd.DataFrame, output_filepath: Union[str, Path]):
     return output_filepath
 
 
-def lazy_build_dataframe(
+def lazy_build_dataframe(
+    name: str, output_filepath: Path, func, filetype: str = "xlsx"
+):
     logger.info(name)
     output_filepath.parent.mkdir(parents=True, exist_ok=True)
     if output_filepath.exists():
@@ -161,9 +174,13 @@ def lazy_build_dataframe(name: str, output_filepath: Path, func, filetype: str =
     if filetype == "xlsx":
         df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
     elif filetype == "json":
-        save_json_list(
+        save_json_list(
+            dataframe_to_json_list(df), output_filepath.parent / f"{filename}.json"
+        )
     elif filetype == "jsonl":
-        save_json_list(
+        save_json_list(
+            dataframe_to_json_list(df), output_filepath.parent / f"{filename}.jsonl"
+        )
     else:
         logger.warning(f"不认识的 {filetype},默认保存为 xlsx")
         df.to_excel(output_filepath.parent / f"{filename}.xlsx", index=False)
@@ -171,7 +188,13 @@ def lazy_build_dataframe(name: str, output_filepath: Path, func, filetype: str =
     return df
 
 
-def lazy_build_dataframe_dict(
+def lazy_build_dataframe_dict(
+    name: str,
+    output_filepath: Path,
+    df_dict: Dict[str, pd.DataFrame],
+    func: Callable[[str, pd.DataFrame], pd.DataFrame],
+    skip_sheets: List[str] = list(),
+):
     logger.info(name)
     output_filepath.parent.mkdir(parents=True, exist_ok=True)
     if output_filepath.exists():
@@ -193,13 +216,17 @@ def merge_multiple_df_dict(list_of_df_dict: List[Dict[str, pd.DataFrame]], sort=
     for df_dict in list_of_df_dict:
         for k, df in df_dict.items():
             df_dict_merged[k].append(df)
-    df_dict_merged: Dict[str, pd.DataFrame] = {
+    df_dict_merged: Dict[str, pd.DataFrame] = {
+        k: pd.concat(v) for k, v in df_dict_merged.items()
+    }
     if sort:
-        df_dict_merged: Dict[str, pd.DataFrame] = {
+        df_dict_merged: Dict[str, pd.DataFrame] = {
+            k: df_dict_merged[k] for k in sorted(df_dict_merged)
+        }
     return df_dict_merged
 
 
-def remove_duplicate_and_sort(df: pd.DataFrame, key_col="query", sort_by=
+def remove_duplicate_and_sort(df: pd.DataFrame, key_col="query", sort_by="label"):
     query_to_rows = {}
     for i, row in df.iterrows():
         query_to_rows[row[key_col]] = row
@@ -218,16 +245,20 @@ def highlight_max(x):
     return [("background-color: yellow" if m else "") for m in is_max]
 
 
-def split_dataframe(
+def split_dataframe(
+    df: pd.DataFrame,
+    output_dir: Union[str, Path],
+    output_filename_prefix: str,
+    split_count=6,
+):
     output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
     rows = dataframe_to_json_list(df)
     split_step = len(rows) // split_count + 1
    df_list = []
     for i in range(0, len(rows), split_step):
-        filepath = output_dir / f"{
-        df_i = pd.DataFrame(rows[i:i+split_step])
+        filepath = output_dir / f"{output_filename_prefix}_{i // split_step}.xlsx"
+        df_i = pd.DataFrame(rows[i : i + split_step])
         df_i.to_excel(filepath, index=False)
         df_list.append(df_i)
     return df_list
-
|
@@ -0,0 +1,116 @@
|
|
1
|
+
from typing import List
|
2
|
+
|
3
|
+
import pandas as pd
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
def bucket_count(length: List[int], step=50, skip_zero_count=False):
|
8
|
+
grouped_count = []
|
9
|
+
j = 0
|
10
|
+
for i in range(0, max(length) + step, step):
|
11
|
+
grouped_count.append(0)
|
12
|
+
while j < len(length) and length[j] < i:
|
13
|
+
grouped_count[i // step] += 1
|
14
|
+
j += 1
|
15
|
+
x, y = [], []
|
16
|
+
for i, j in enumerate(grouped_count):
|
17
|
+
if i == 0:
|
18
|
+
continue
|
19
|
+
if skip_zero_count and j == 0:
|
20
|
+
continue
|
21
|
+
print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
|
22
|
+
x.append((i - 1) * step)
|
23
|
+
y.append(j)
|
24
|
+
return x, y
|
25
|
+
|
26
|
+
|
27
|
+
def statistic_char_length(df: pd.DataFrame, instruction_key="instruction"):
|
28
|
+
length = []
|
29
|
+
for i, row in df.iterrows():
|
30
|
+
length.append(len(row[instruction_key]))
|
31
|
+
length.sort()
|
32
|
+
return length
|
33
|
+
|
34
|
+
|
35
|
+
def statistic_token_length(df: pd.DataFrame, model_path: str, row_to_prompt: lambda row: row["prompt"]):
|
36
|
+
from transformers import AutoTokenizer
|
37
|
+
|
38
|
+
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
39
|
+
lengths = []
|
40
|
+
for i, row in df.iterrows():
|
41
|
+
prompt = row_to_prompt(row)
|
42
|
+
inputs = tokenizer(prompt, return_tensors="pt")
|
43
|
+
length = inputs["input_ids"].shape[1]
|
44
|
+
lengths.append(length)
|
45
|
+
lengths.sort()
|
46
|
+
return lengths
|
47
|
+
|
48
|
+
|
49
|
+
def draw_histogram(data: list[int], bins=30, title="Data Distribution Analysis"):
|
50
|
+
import numpy as np
|
51
|
+
import matplotlib.pyplot as plt
|
52
|
+
from scipy.stats import gaussian_kde
|
53
|
+
|
54
|
+
data = np.array(data)
|
55
|
+
|
56
|
+
# 计算统计指标
|
57
|
+
mean = np.mean(data)
|
58
|
+
median = np.median(data)
|
59
|
+
std = np.std(data)
|
60
|
+
q25, q75 = np.percentile(data, [25, 75])
|
61
|
+
data_range = (np.min(data), np.max(data))
|
62
|
+
|
63
|
+
# 创建图形和坐标轴
|
64
|
+
plt.figure(figsize=(12, 7), dpi=100)
|
65
|
+
|
66
|
+
# 绘制直方图
|
67
|
+
plt.hist(data, bins=bins, density=True, alpha=0.5, color="skyblue", edgecolor="white", label="Distribution")
|
68
|
+
|
69
|
+
# 绘制核密度估计(KDE)
|
70
|
+
kde = gaussian_kde(data)
|
71
|
+
x_vals = np.linspace(data_range[0] - 1, data_range[1] + 1, 1000)
|
72
|
+
plt.plot(x_vals, kde(x_vals), color="navy", linewidth=2, label="KDE Curve")
|
73
|
+
|
74
|
+
# 添加统计线
|
75
|
+
plt.axvline(mean, color="red", linestyle="--", linewidth=2, label=f"Mean ({mean:.2f})")
|
76
|
+
plt.axvline(median, color="green", linestyle="-.", linewidth=2, label=f"Median ({median:.2f})")
|
77
|
+
plt.axvspan(mean - std, mean + std, color="orange", alpha=0.1, label=f"±1 Std.Dev ({std:.2f})")
|
78
|
+
|
79
|
+
# 添加四分位线
|
80
|
+
plt.axvline(q25, color="purple", linestyle=":", alpha=0.8, label=f"25th Percentile ({q25:.2f})")
|
81
|
+
plt.axvline(q75, color="purple", linestyle=":", alpha=0.8, label=f"75th Percentile ({q75:.2f})")
|
82
|
+
|
83
|
+
# 添加统计摘要
|
84
|
+
stats_text = f"""\
|
85
|
+
Data Range: [{data_range[0]:.2f}, {data_range[1]:.2f}]
|
86
|
+
Observations: {len(data):,}
|
87
|
+
Standard Deviation: {std:.2f}
|
88
|
+
IQR: {q75 - q25:.2f}
|
89
|
+
Skewness: {float((data - mean).mean()**3 / std**3):.4f}
|
90
|
+
Kurtosis: {float((data - mean).mean()**4 / std**4):.4f}\
|
91
|
+
"""
|
92
|
+
# 文字左对齐 align
|
93
|
+
plt.annotate(stats_text, xy=(0.99, 0.98), xycoords="axes fraction", ha="right", va="top", fontfamily="monospace", bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),)
|
94
|
+
|
95
|
+
# 设置图形属性
|
96
|
+
plt.title(title, fontsize=14, pad=20)
|
97
|
+
plt.xlabel("Value", fontsize=12)
|
98
|
+
plt.ylabel("Density", fontsize=12)
|
99
|
+
plt.grid(True, linestyle="--", alpha=0.4)
|
100
|
+
plt.legend(loc="upper left", frameon=True, framealpha=0.9, shadow=True)
|
101
|
+
|
102
|
+
# 调整坐标轴范围
|
103
|
+
buffer = (data_range[1] - data_range[0]) * 0.1
|
104
|
+
plt.xlim(data_range[0] - buffer, data_range[1] + buffer)
|
105
|
+
|
106
|
+
# 显示图形
|
107
|
+
plt.tight_layout()
|
108
|
+
plt.show()
|
109
|
+
|
110
|
+
|
111
|
+
def draw_pie(numbers: List[int], title="Pie Chart of Numbers"):
|
112
|
+
import matplotlib.pyplot as plt
|
113
|
+
|
114
|
+
plt.pie(numbers, labels=[str(i) for i in range(len(numbers))], autopct='%1.1f%%')
|
115
|
+
plt.title(title)
|
116
|
+
plt.show()
|
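The new statistic module keeps the 0.1.11 helpers (`bucket_count`, `statistic_char_length`) and adds token-length statistics plus matplotlib plotting. A small sketch of how the pieces compose; the DataFrame is invented for illustration, and matplotlib/scipy (and transformers for `statistic_token_length`) are not declared as xlin dependencies, so they must already be installed:

```python
import pandas as pd

from xlin.statistic import bucket_count, draw_histogram, statistic_char_length

# Made-up instruction data; "instruction" is the default column name
# that statistic_char_length reads.
df = pd.DataFrame({"instruction": [
    "write a haiku about autumn",
    "summarize the attached report in three bullet points",
    "translate 'good morning' into French",
]})

# Sorted character lengths; bucket_count assumes an ascending list,
# which statistic_char_length already returns.
lengths = statistic_char_length(df)

# Prints one "[lo, hi) count cumulative%" line per bucket and returns
# the bucket starts and their counts.
x, y = bucket_count(lengths, step=10)

# Histogram plus KDE curve with mean/median/quartile markers.
draw_histogram(lengths, bins=10, title="Instruction length distribution")
```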
xlin-0.1.11/xlin/statistic.py
DELETED

@@ -1,33 +0,0 @@
-from typing import List
-
-import pandas as pd
-
-
-
-def bucket_count(length: List[int], step=50, skip_zero_count=False):
-    grouped_count = []
-    j = 0
-    for i in range(0, max(length) + step, step):
-        grouped_count.append(0)
-        while j < len(length) and length[j] < i:
-            grouped_count[i // step] += 1
-            j += 1
-    x, y = [], []
-    for i, j in enumerate(grouped_count):
-        if i == 0:
-            continue
-        if skip_zero_count and j == 0:
-            continue
-        print(f"[{(i-1)*step}, {i*step}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
-        x.append((i - 1) * step)
-        y.append(j)
-    return x, y
-
-
-def statistic_char_length(df: pd.DataFrame, instruction_key="instruction"):
-    length = []
-    for i, row in df.iterrows():
-        length.append(len(row[instruction_key]))
-    length.sort()
-    return length
-