dtflow 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/storage/io.py CHANGED
@@ -1,15 +1,19 @@
  """
  Input/Output utilities for saving and loading data.
+
+ Uses Polars as the primary I/O engine, 3-5x faster than Pandas.
+ Uses orjson as the JSON parsing engine, 10x faster than the standard json module.
  """
- from typing import List, Dict, Any, Optional
- import json
- import os
+ import orjson
  from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import polars as pl
 
 
- def save_data(data: List[Dict[str, Any]],
-               filepath: str,
-               file_format: Optional[str] = None) -> None:
+ def save_data(
+     data: List[Dict[str, Any]], filepath: str, file_format: Optional[str] = None
+ ) -> None:
      """
      Save data to file.
 
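The import swap above propagates through the whole file: orjson produces and consumes bytes rather than str, which is why the rewritten helpers below open files in "wb"/"rb" and append b"\n" instead of "\n". A minimal sketch of the equivalence, outside the package:

    import json

    import orjson

    record = {"text": "héllo", "tags": ["a", "b"]}

    # stdlib json, text mode (0.2.0 style)
    line_str = json.dumps(record, ensure_ascii=False)   # -> str

    # orjson, binary mode (0.3.1 style)
    line_bytes = orjson.dumps(record)                   # -> bytes, always UTF-8

    assert orjson.loads(line_bytes) == json.loads(line_str) == record

There is no orjson counterpart to ensure_ascii=False because orjson never escapes non-ASCII characters.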
@@ -21,23 +25,22 @@ def save_data(data: List[Dict[str, Any]],
      filepath = Path(filepath)
      filepath.parent.mkdir(parents=True, exist_ok=True)
 
-     # Auto-detect format from extension
      if file_format is None:
          file_format = _detect_format(filepath)
 
-     if file_format == 'jsonl':
+     if file_format == "jsonl":
          _save_jsonl(data, filepath)
-     elif file_format == 'json':
+     elif file_format == "json":
          _save_json(data, filepath)
-     elif file_format == 'csv':
+     elif file_format == "csv":
          _save_csv(data, filepath)
-     elif file_format == 'parquet':
+     elif file_format == "parquet":
          _save_parquet(data, filepath)
-     elif file_format == 'arrow':
+     elif file_format == "arrow":
          _save_arrow(data, filepath)
-     elif file_format == 'excel':
+     elif file_format == "excel":
          _save_excel(data, filepath)
-     elif file_format == 'flaxkv':
+     elif file_format == "flaxkv":
          _save_flaxkv(data, filepath)
      else:
          raise ValueError(f"Unknown file format: {file_format}")
@@ -59,23 +62,22 @@ def load_data(filepath: str, file_format: Optional[str] = None) -> List[Dict[str
      if not filepath.exists():
          raise FileNotFoundError(f"File not found: {filepath}")
 
-     # Auto-detect format from extension
      if file_format is None:
          file_format = _detect_format(filepath)
 
-     if file_format == 'jsonl':
+     if file_format == "jsonl":
          return _load_jsonl(filepath)
-     elif file_format == 'json':
+     elif file_format == "json":
          return _load_json(filepath)
-     elif file_format == 'csv':
+     elif file_format == "csv":
          return _load_csv(filepath)
-     elif file_format == 'parquet':
+     elif file_format == "parquet":
          return _load_parquet(filepath)
-     elif file_format == 'arrow':
+     elif file_format == "arrow":
          return _load_arrow(filepath)
-     elif file_format == 'excel':
+     elif file_format == "excel":
          return _load_excel(filepath)
-     elif file_format == 'flaxkv':
+     elif file_format == "flaxkv":
          return _load_flaxkv(filepath)
      else:
          raise ValueError(f"Unknown file format: {file_format}")
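Both dispatchers keep the extension-based auto-detection from 0.2.0, so the public API is unchanged; only the engines behind the format branches moved. A small usage sketch (paths are illustrative):

    from dtflow.storage.io import load_data, save_data

    rows = [{"id": 1, "meta": {"lang": "en"}}, {"id": 2, "meta": {"lang": "zh"}}]

    save_data(rows, "out/data.parquet")   # format inferred from the .parquet suffix
    back = load_data("out/data.parquet")  # nested "meta" dicts are restored on load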
@@ -84,200 +86,204 @@ def load_data(filepath: str, file_format: Optional[str] = None) -> List[Dict[str
  def _detect_format(filepath: Path) -> str:
      """Detect file format from extension."""
      ext = filepath.suffix.lower()
-     if ext == '.jsonl':
-         return 'jsonl'
-     elif ext == '.json':
-         return 'json'
-     elif ext == '.csv':
-         return 'csv'
-     elif ext == '.parquet':
-         return 'parquet'
-     elif ext in ('.arrow', '.feather'):
-         return 'arrow'
-     elif ext in ('.xlsx', '.xls'):
-         return 'excel'
-     elif ext == '.flaxkv' or ext == '':
-         # For FlaxKV, filepath is typically a directory
-         return 'flaxkv'
+     if ext == ".jsonl":
+         return "jsonl"
+     elif ext == ".json":
+         return "json"
+     elif ext == ".csv":
+         return "csv"
+     elif ext == ".parquet":
+         return "parquet"
+     elif ext in (".arrow", ".feather"):
+         return "arrow"
+     elif ext in (".xlsx", ".xls"):
+         return "excel"
+     elif ext == ".flaxkv" or ext == "":
+         return "flaxkv"
      else:
-         # Default to JSONL
-         return 'jsonl'
+         return "jsonl"
 
 
  # ============ JSONL Format ============
+ # JSONL stays in native Python because it must handle complex nested structures
+
 
  def _save_jsonl(data: List[Dict[str, Any]], filepath: Path) -> None:
      """Save data in JSONL format."""
-     with open(filepath, 'w', encoding='utf-8') as f:
+     with open(filepath, "wb") as f:
          for item in data:
-             json_line = json.dumps(item, ensure_ascii=False)
-             f.write(json_line + '\n')
+             f.write(orjson.dumps(item) + b"\n")
 
 
  def _load_jsonl(filepath: Path) -> List[Dict[str, Any]]:
      """Load data from JSONL format."""
      data = []
-     with open(filepath, 'r', encoding='utf-8') as f:
+     with open(filepath, "rb") as f:
          for line in f:
              line = line.strip()
              if line:
-                 data.append(json.loads(line))
+                 data.append(orjson.loads(line))
      return data
 
 
  # ============ JSON Format ============
 
+
  def _save_json(data: List[Dict[str, Any]], filepath: Path) -> None:
      """Save data in JSON format."""
-     with open(filepath, 'w', encoding='utf-8') as f:
-         json.dump(data, f, ensure_ascii=False, indent=2)
+     with open(filepath, "wb") as f:
+         f.write(orjson.dumps(data, option=orjson.OPT_INDENT_2))
 
 
  def _load_json(filepath: Path) -> List[Dict[str, Any]]:
      """Load data from JSON format."""
-     with open(filepath, 'r', encoding='utf-8') as f:
-         data = json.load(f)
+     with open(filepath, "rb") as f:
+         data = orjson.loads(f.read())
 
-     # Ensure data is a list
      if not isinstance(data, list):
          data = [data]
 
      return data
 
 
- # ============ CSV Format ============
+ # ============ CSV Format (Polars) ============
+
 
  def _save_csv(data: List[Dict[str, Any]], filepath: Path) -> None:
-     """Save data in CSV format."""
-     try:
-         import pandas as pd
-     except ImportError:
-         raise ImportError("pandas is required for CSV support. Install with: pip install pandas")
+     """Save data in CSV format using Polars."""
+     if not data:
+         # Empty data: create an empty file
+         filepath.touch()
+         return
 
-     df = pd.DataFrame(data)
-     df.to_csv(filepath, index=False, encoding='utf-8')
+     # Serialize complex fields to JSON strings
+     serialized = _serialize_complex_fields(data)
+     df = pl.DataFrame(serialized)
+     df.write_csv(filepath)
 
 
  def _load_csv(filepath: Path) -> List[Dict[str, Any]]:
-     """Load data from CSV format."""
-     try:
-         import pandas as pd
-     except ImportError:
-         raise ImportError("pandas is required for CSV support. Install with: pip install pandas")
+     """Load data from CSV format using Polars."""
+     df = pl.read_csv(filepath)
+     data = df.to_dicts()
+     # Deserialize JSON strings
+     return _deserialize_complex_fields(data)
 
-     df = pd.read_csv(filepath, encoding='utf-8')
-     return df.to_dict('records')
 
+ # ============ Parquet Format (Polars) ============
 
- # ============ Excel Format ============
 
- def _save_excel(data: List[Dict[str, Any]], filepath: Path) -> None:
-     """Save data in Excel format."""
-     try:
-         import pandas as pd
-     except ImportError:
-         raise ImportError("pandas and openpyxl are required for Excel support. Install with: pip install pandas openpyxl")
+ def _save_parquet(data: List[Dict[str, Any]], filepath: Path) -> None:
+     """Save data in Parquet format using Polars."""
+     if not data:
+         # Empty data: write an empty parquet
+         pl.DataFrame().write_parquet(filepath)
+         return
 
-     df = pd.DataFrame(data)
-     df.to_excel(filepath, index=False)
+     serialized = _serialize_complex_fields(data)
+     df = pl.DataFrame(serialized)
+     df.write_parquet(filepath)
 
 
- def _load_excel(filepath: Path) -> List[Dict[str, Any]]:
-     """Load data from Excel format."""
-     try:
-         import pandas as pd
-     except ImportError:
-         raise ImportError("pandas and openpyxl are required for Excel support. Install with: pip install pandas openpyxl")
+ def _load_parquet(filepath: Path) -> List[Dict[str, Any]]:
+     """Load data from Parquet format using Polars."""
+     df = pl.read_parquet(filepath)
+     data = df.to_dicts()
+     return _deserialize_complex_fields(data)
 
-     df = pd.read_excel(filepath)
-     return df.to_dict('records')
 
+ # ============ Arrow Format (Polars) ============
 
- # ============ Parquet Format ============
 
- def _save_parquet(data: List[Dict[str, Any]], filepath: Path) -> None:
-     """Save data in Parquet format."""
-     try:
-         import pandas as pd
-     except ImportError:
-         raise ImportError("pandas is required for Parquet support. Install with: pip install pandas pyarrow")
+ def _save_arrow(data: List[Dict[str, Any]], filepath: Path) -> None:
+     """Save data in Arrow IPC format using Polars."""
+     if not data:
+         pl.DataFrame().write_ipc(filepath)
+         return
 
-     df = pd.DataFrame(data)
-     df.to_parquet(filepath, index=False, engine='pyarrow')
+     serialized = _serialize_complex_fields(data)
+     df = pl.DataFrame(serialized)
+     df.write_ipc(filepath)
 
 
- def _load_parquet(filepath: Path) -> List[Dict[str, Any]]:
-     """Load data from Parquet format."""
-     try:
-         import pandas as pd
-     except ImportError:
-         raise ImportError("pandas is required for Parquet support. Install with: pip install pandas pyarrow")
+ def _load_arrow(filepath: Path) -> List[Dict[str, Any]]:
+     """Load data from Arrow IPC format using Polars."""
+     df = pl.read_ipc(filepath)
+     data = df.to_dicts()
+     return _deserialize_complex_fields(data)
+
 
-     df = pd.read_parquet(filepath, engine='pyarrow')
-     return df.to_dict('records')
+ # ============ Excel Format ============
+ # Excel needs an extra dependency; keep it optional
 
 
- # ============ Arrow Format ============
+ def _save_excel(data: List[Dict[str, Any]], filepath: Path) -> None:
+     """Save data in Excel format."""
+     if not data:
+         # Empty data
+         try:
+             import xlsxwriter
+             workbook = xlsxwriter.Workbook(str(filepath))
+             workbook.close()
+         except ImportError:
+             raise ImportError(
+                 "xlsxwriter is required for Excel write. Install with: pip install xlsxwriter"
+             )
+         return
+
+     serialized = _serialize_complex_fields(data)
+     df = pl.DataFrame(serialized)
+     df.write_excel(filepath)
 
- def _save_arrow(data: List[Dict[str, Any]], filepath: Path) -> None:
-     """Save data in Arrow IPC format (also known as Feather v2).
 
-     Note: Complex nested structures (like list of dicts) are serialized as JSON strings.
-     """
-     try:
-         import pyarrow as pa
-         import pyarrow.feather as feather
-     except ImportError:
-         raise ImportError("pyarrow is required for Arrow support. Install with: pip install pyarrow")
+ def _load_excel(filepath: Path) -> List[Dict[str, Any]]:
+     """Load data from Excel format."""
+     df = pl.read_excel(filepath)
+     data = df.to_dicts()
+     return _deserialize_complex_fields(data)
+
 
-     # Serialize complex fields to JSON strings for Arrow compatibility
-     serialized_data = []
+ # ============ Complex Field Serialization ============
+
+
+ def _serialize_complex_fields(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     """Serialize complex fields (list, dict) to JSON strings."""
+     result = []
      for item in data:
          new_item = {}
          for k, v in item.items():
              if isinstance(v, (list, dict)):
-                 new_item[k] = json.dumps(v, ensure_ascii=False)
+                 new_item[k] = orjson.dumps(v).decode("utf-8")
              else:
                  new_item[k] = v
-         serialized_data.append(new_item)
-
-     table = pa.Table.from_pylist(serialized_data)
-
-     # Use Feather format (simpler and more portable)
-     feather.write_feather(table, filepath)
-
-
- def _load_arrow(filepath: Path) -> List[Dict[str, Any]]:
-     """Load data from Arrow IPC format (also known as Feather v2).
-
-     Note: JSON-serialized fields are automatically deserialized.
-     """
-     try:
-         import pyarrow.feather as feather
-     except ImportError:
-         raise ImportError("pyarrow is required for Arrow support. Install with: pip install pyarrow")
+         result.append(new_item)
+     return result
 
-     table = feather.read_table(filepath)
-     data = table.to_pylist()
 
-     # Deserialize JSON strings back to complex objects
+ def _deserialize_complex_fields(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     """Deserialize JSON strings back into complex fields."""
      result = []
      for item in data:
          new_item = {}
          for k, v in item.items():
-             if isinstance(v, str) and v.startswith(('[', '{')):
+             if isinstance(v, str) and v.startswith(("[", "{")):
                  try:
-                     new_item[k] = json.loads(v)
-                 except json.JSONDecodeError:
+                     new_item[k] = orjson.loads(v)
+                 except orjson.JSONDecodeError:
                      new_item[k] = v
              else:
                  new_item[k] = v
          result.append(new_item)
-
      return result
 
 
- # ============ Additional Utilities ============
+ def _clean_null_fields(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+     """Strip null fields added by Polars, preserving the original data structure."""
+     return [{k: v for k, v in item.items() if v is not None} for item in data]
+
+
+ # ============ Streaming Utilities ============
+
 
  def sample_data(
      data: List[Dict[str, Any]],
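The new _serialize_complex_fields/_deserialize_complex_fields pair is what lets nested values survive the columnar formats: lists and dicts become JSON strings on write, and any string that starts with "[" or "{" is tentatively parsed back on read. A worked round trip of that scheme:

    import orjson

    row = {"id": 7, "spans": [[0, 4], [5, 9]], "meta": {"lang": "en"}}

    # write side: nested values -> JSON strings
    flat = {k: orjson.dumps(v).decode("utf-8") if isinstance(v, (list, dict)) else v
            for k, v in row.items()}

    # read side: JSON-looking strings -> values
    restored = {k: orjson.loads(v) if isinstance(v, str) and v.startswith(("[", "{")) else v
                for k, v in flat.items()}
    assert restored == row

The heuristic has a known cost: a legitimate string field that happens to start with "[" or "{" and parses as valid JSON will be converted into a list or dict on load.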
@@ -290,28 +296,12 @@ def sample_data(
 
      Args:
          data: List of data items
-         num: Number of items to sample.
-             - num > 0: sample specified number of items
-             - num = 0: sample all data
-             - num < 0: Python slice style (e.g., -1 means last 1, -10 means last 10)
+         num: Number of items to sample
          sample_type: Sampling method - "random", "head", or "tail"
-         seed: Random seed for reproducibility (only for random sampling)
+         seed: Random seed for reproducibility
 
      Returns:
          Sampled data list
-
-     Examples:
-         >>> data = [{"id": i} for i in range(100)]
-         >>> sample_data(data, num=5, sample_type="head")
-         [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}, {'id': 4}]
-         >>> sample_data(data, num=3, sample_type="tail")
-         [{'id': 97}, {'id': 98}, {'id': 99}]
-         >>> len(sample_data(data, num=0))  # 0 means all
-         100
-         >>> sample_data(data, num=-1, sample_type="head")  # last 1 item
-         [{'id': 99}]
-         >>> sample_data(data, num=-3, sample_type="tail")  # last 3 items
-         [{'id': 97}, {'id': 98}, {'id': 99}]
      """
      import random as rand_module
 
@@ -320,15 +310,11 @@ def sample_data(
 
      total = len(data)
 
-     # Determine actual number to sample
      if num == 0:
-         # 0 means sample all data
          actual_num = total
      elif num < 0:
-         # Negative number: Python slice style (e.g., -1 means 1 item, -10 means 10 items)
          actual_num = min(abs(num), total)
      else:
-         # Positive number: normal sampling
          actual_num = min(num, total)
 
      if sample_type == "head":
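The docstring trim above removes the worked examples, but the num semantics are still encoded in the branches that follow: num == 0 selects everything, and a negative num takes |num| items, slice-style. The removed doctests summarized the behavior:

    data = [{"id": i} for i in range(100)]

    sample_data(data, num=5, sample_type="head")   # first 5 items
    sample_data(data, num=3, sample_type="tail")   # last 3 items
    len(sample_data(data, num=0))                  # 100 -- 0 means all
    sample_data(data, num=-3, sample_type="tail")  # also the last 3 items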
@@ -349,32 +335,23 @@ def sample_file(
      output: Optional[str] = None,
  ) -> List[Dict[str, Any]]:
      """
-     Sample data from a file with streaming support for large files.
-
-     For head/tail sampling, streaming reads are supported; the whole file need not be loaded into memory.
-     For random sampling, JSONL uses reservoir sampling; other formats must load all data.
+     Sample data from a file with streaming support.
 
      Args:
-         filepath: Input file path (supports csv, xlsx, jsonl, json, parquet, arrow, feather)
+         filepath: Input file path
          num: Number of items to sample
-         sample_type: Sampling method - "random", "head", or "tail"
-         seed: Random seed for reproducibility (only for random sampling)
-         output: Output file path (optional, if provided, saves sampled data)
+         sample_type: Sampling method
+         seed: Random seed
+         output: Output file path
 
      Returns:
          Sampled data list
-
-     Examples:
-         >>> sampled = sample_file("data.jsonl", num=100, sample_type="random")
-         >>> sample_file("data.csv", num=50, output="sampled.jsonl")
      """
      filepath = Path(filepath)
      file_format = _detect_format(filepath)
 
-     # Try streaming sampling first
      sampled = _stream_sample(filepath, file_format, num, sample_type, seed)
 
-     # Save if output specified
      if output:
          save_data(sampled, output)
 
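The examples dropped from this docstring still describe the intended call pattern (the explicit seed shown here is optional):

    # random sample of 100 records from a JSONL file, reproducible via seed
    sampled = sample_file("data.jsonl", num=100, sample_type="random", seed=42)

    # head-sample a CSV and persist the result as JSONL
    sample_file("data.csv", num=50, output="sampled.jsonl")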
@@ -388,18 +365,7 @@ def _stream_sample(
      sample_type: str,
      seed: Optional[int],
  ) -> List[Dict[str, Any]]:
-     """
-     Streaming sampling implementation.
-
-     Supported streaming optimizations:
-     - head: jsonl, csv, parquet, arrow, excel
-     - tail: jsonl (reverse read)
-     - random: jsonl (reservoir sampling)
-
-     num == 0 means sample all data; falls back to a full load.
-     num < 0 means Python slice style; falls back to a full load.
-     """
-     # num == 0 samples everything, num < 0 is slice style; both need a full load
+     """Streaming sampling implementation."""
      if num <= 0:
          data = load_data(str(filepath))
          return sample_data(data, num=num, sample_type=sample_type, seed=seed)
@@ -417,13 +383,27 @@ def _stream_sample(
      elif file_format == "excel":
          return _stream_head_excel(filepath, num)
 
-     # tail sampling optimization (JSONL only)
-     if sample_type == "tail" and file_format == "jsonl":
-         return _stream_tail_jsonl(filepath, num)
+     # tail sampling optimization
+     if sample_type == "tail":
+         if file_format == "jsonl":
+             return _stream_tail_jsonl(filepath, num)
+         elif file_format == "csv":
+             return _stream_tail_csv(filepath, num)
+         elif file_format == "parquet":
+             return _stream_tail_parquet(filepath, num)
+         elif file_format == "arrow":
+             return _stream_tail_arrow(filepath, num)
 
-     # random sampling optimization (JSONL only, reservoir sampling)
-     if sample_type == "random" and file_format == "jsonl":
-         return _stream_random_jsonl(filepath, num, seed)
+     # random sampling optimization
+     if sample_type == "random":
+         if file_format == "jsonl":
+             return _stream_random_jsonl(filepath, num, seed)
+         elif file_format == "csv":
+             return _stream_random_csv(filepath, num, seed)
+         elif file_format == "parquet":
+             return _stream_random_parquet(filepath, num, seed)
+         elif file_format == "arrow":
+             return _stream_random_arrow(filepath, num, seed)
 
      # Fall back to a full load in all other cases
      data = load_data(str(filepath))
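All of the new tail and random branches funnel into the same Polars idiom: build a LazyFrame with a scan_* call, attach head/tail to the query plan, and only then collect, letting Polars limit how much of the file is actually read. A minimal sketch of that pattern, with an illustrative path:

    import polars as pl

    lf = pl.scan_parquet("data.parquet")   # lazy: nothing is read yet

    first_10 = lf.head(10).collect().to_dicts()   # the limit is part of the plan
    last_10 = lf.tail(10).collect().to_dicts()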
@@ -431,262 +411,288 @@
 
 
  def _stream_head_jsonl(filepath: Path, num: int) -> List[Dict[str, Any]]:
-     """Stream-read the first N lines of a JSONL file."""
-     result = []
-     with open(filepath, "r", encoding="utf-8") as f:
-         for line in f:
-             line = line.strip()
-             if line:
-                 result.append(json.loads(line))
-                 if len(result) >= num:
-                     break
-     return result
+     """Stream-read the first N lines of a JSONL file (via Polars ndjson)."""
+     try:
+         df = pl.scan_ndjson(filepath).head(num).collect()
+         return _clean_null_fields(df.to_dicts())
+     except Exception as e:
+         # Fall back to the Python implementation
+         import sys
+         print(f"[Warning] Polars ndjson parsing failed, falling back to Python: {type(e).__name__}", file=sys.stderr)
+
+         result = []
+         with open(filepath, "rb") as f:
+             for line in f:
+                 line = line.strip()
+                 if line:
+                     try:
+                         result.append(orjson.loads(line))
+                     except orjson.JSONDecodeError:
+                         continue  # skip invalid lines
+                     if len(result) >= num:
+                         break
+         return result
 
 
  def _stream_head_csv(filepath: Path, num: int) -> List[Dict[str, Any]]:
-     """Stream-read the first N rows of a CSV file."""
-     try:
-         import pandas as pd
-     except ImportError:
-         raise ImportError("pandas is required for CSV support. Install with: pip install pandas")
-
-     df = pd.read_csv(filepath, encoding="utf-8", nrows=num)
-     return df.to_dict("records")
+     """Stream-read the first N rows of a CSV file (via Polars LazyFrame)."""
+     df = pl.scan_csv(filepath).head(num).collect()
+     return _deserialize_complex_fields(df.to_dicts())
 
 
  def _stream_head_parquet(filepath: Path, num: int) -> List[Dict[str, Any]]:
-     """Truly stream the first N rows of a Parquet file (iter_batches avoids a full load)."""
-     try:
-         import pyarrow.parquet as pq
-     except ImportError:
-         raise ImportError("pyarrow is required for Parquet support. Install with: pip install pyarrow")
+     """Stream-read the first N rows of a Parquet file (via Polars LazyFrame)."""
+     df = pl.scan_parquet(filepath).head(num).collect()
+     return _deserialize_complex_fields(df.to_dicts())
 
-     parquet_file = pq.ParquetFile(filepath)
-     result = []
 
-     # Use iter_batches for true streaming; read only the data that is needed
-     for batch in parquet_file.iter_batches(batch_size=min(num, 10000)):
-         batch_data = batch.to_pylist()
-         result.extend(batch_data)
-         if len(result) >= num:
-             break
+ def _stream_head_arrow(filepath: Path, num: int) -> List[Dict[str, Any]]:
+     """Stream-read the first N rows of an Arrow file (via Polars LazyFrame)."""
+     df = pl.scan_ipc(filepath).head(num).collect()
+     return _deserialize_complex_fields(df.to_dicts())
 
-     return result[:num]
 
+ def _stream_head_excel(filepath: Path, num: int) -> List[Dict[str, Any]]:
+     """Read the first N rows of an Excel file."""
+     # Excel does not support lazy scans; use a plain read
+     df = pl.read_excel(filepath).head(num)
+     return _deserialize_complex_fields(df.to_dicts())
 
- def _stream_head_arrow(filepath: Path, num: int) -> List[Dict[str, Any]]:
-     """Stream-read the first N rows of an Arrow/Feather file."""
+
+ def _stream_tail_jsonl(filepath: Path, num: int) -> List[Dict[str, Any]]:
+     """Stream-read the last N lines of a JSONL file (via Polars ndjson)."""
      try:
-         import pyarrow.feather as feather
-     except ImportError:
-         raise ImportError("pyarrow is required for Arrow support. Install with: pip install pyarrow")
+         df = pl.scan_ndjson(filepath).tail(num).collect()
+         return _clean_null_fields(df.to_dicts())
+     except Exception as e:
+         # Fall back to a two-pass Python implementation
+         import sys
+         print(f"[Warning] Polars ndjson parsing failed, falling back to Python: {type(e).__name__}", file=sys.stderr)
 
-     table = feather.read_table(filepath)
-     sliced = table.slice(0, min(num, table.num_rows))
-     return _deserialize_arrow_data(sliced.to_pylist())
+         total_lines = 0
+         with open(filepath, "rb") as f:
+             for _ in f:
+                 total_lines += 1
 
+         if total_lines <= num:
+             return _load_jsonl(filepath)
 
- def _deserialize_arrow_data(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-     """Deserialize JSON-string fields in Arrow data."""
-     result = []
-     for item in data:
-         new_item = {}
-         for k, v in item.items():
-             if isinstance(v, str) and v.startswith(("[", "{")):
-                 try:
-                     new_item[k] = json.loads(v)
-                 except json.JSONDecodeError:
-                     new_item[k] = v
-             else:
-                 new_item[k] = v
-         result.append(new_item)
-     return result
+         skip_count = total_lines - num
+         result = []
+         with open(filepath, "rb") as f:
+             for i, line in enumerate(f):
+                 if i < skip_count:
+                     continue
+                 line = line.strip()
+                 if line:
+                     try:
+                         result.append(orjson.loads(line))
+                     except orjson.JSONDecodeError:
+                         continue  # skip invalid lines
+         return result
 
 
- def _stream_head_excel(filepath: Path, num: int) -> List[Dict[str, Any]]:
-     """Stream-read the first N rows of an Excel file."""
-     try:
-         import pandas as pd
-     except ImportError:
-         raise ImportError("pandas and openpyxl are required for Excel support")
+ def _stream_tail_csv(filepath: Path, num: int) -> List[Dict[str, Any]]:
+     """Stream-read the last N rows of a CSV file (via Polars LazyFrame)."""
+     df = pl.scan_csv(filepath).tail(num).collect()
+     return _deserialize_complex_fields(df.to_dicts())
 
-     df = pd.read_excel(filepath, nrows=num)
-     return df.to_dict("records")
 
+ def _stream_tail_parquet(filepath: Path, num: int) -> List[Dict[str, Any]]:
+     """Stream-read the last N rows of a Parquet file (via Polars LazyFrame)."""
+     df = pl.scan_parquet(filepath).tail(num).collect()
+     return _deserialize_complex_fields(df.to_dicts())
 
- def append_to_file(data: List[Dict[str, Any]],
-                    filepath: str,
-                    file_format: str = 'jsonl') -> None:
-     """
-     Append data to an existing file.
 
-     Args:
-         data: List of data items to append
-         filepath: Path to file
-         file_format: File format (only 'jsonl' supported for append)
-     """
-     filepath = Path(filepath)
+ def _stream_tail_arrow(filepath: Path, num: int) -> List[Dict[str, Any]]:
+     """Stream-read the last N rows of an Arrow file (via Polars LazyFrame)."""
+     df = pl.scan_ipc(filepath).tail(num).collect()
+     return _deserialize_complex_fields(df.to_dicts())
 
-     if file_format != 'jsonl':
-         raise ValueError("Only JSONL format supports appending")
 
-     filepath.parent.mkdir(parents=True, exist_ok=True)
-
-     with open(filepath, 'a', encoding='utf-8') as f:
-         for item in data:
-             json_line = json.dumps(item, ensure_ascii=False)
-             f.write(json_line + '\n')
+ # File-size threshold: above this, use Python streaming sampling; otherwise use Polars
+ _STREAM_THRESHOLD_BYTES = 100 * 1024 * 1024  # 100MB
 
 
- def count_lines(filepath: str) -> int:
-     """
-     Count number of lines in a JSONL file without loading all data.
-
-     Args:
-         filepath: Path to JSONL file
+ def _count_sample_jsonl(
+     filepath: Path, num: int, seed: Optional[int] = None
+ ) -> List[Dict[str, Any]]:
+     """JSONL streaming sample (Polars line count + selective Python parsing)
 
-     Returns:
-         Number of lines
+     Strategy:
+     1. Use Polars to get the line count quickly (about 4x faster than Python)
+     2. Generate random indices
+     3. Walk the file in Python, parsing only the selected lines
      """
-     count = 0
-     with open(filepath, 'r', encoding='utf-8') as f:
-         for _ in f:
-             count += 1
-     return count
+     import random
 
+     # Step 1: get the line count quickly with Polars
+     try:
+         total_lines = pl.scan_ndjson(filepath).select(pl.len()).collect().item()
+     except Exception:
+         # Fall back to counting in Python
+         with open(filepath, "rb") as f:
+             total_lines = sum(1 for _ in f)
 
- def stream_jsonl(filepath: str, chunk_size: int = 1000):
-     """
-     Stream JSONL file in chunks.
+     if total_lines == 0:
+         return []
 
-     Args:
-         filepath: Path to JSONL file
-         chunk_size: Number of items per chunk
+     # Sample size reaches the total line count: read everything
+     if num >= total_lines:
+         return _load_jsonl(filepath)
 
-     Yields:
-         Chunks of data items
-     """
-     chunk = []
-     with open(filepath, 'r', encoding='utf-8') as f:
-         for line in f:
-             line = line.strip()
-             if line:
-                 chunk.append(json.loads(line))
-                 if len(chunk) >= chunk_size:
-                     yield chunk
-                     chunk = []
+     # Step 2: generate random indices
+     if seed is not None:
+         random.seed(seed)
+     selected_indices = set(random.sample(range(total_lines), num))
 
-     if chunk:
-         yield chunk
+     # Step 3: parse only the selected lines
+     result = []
+     with open(filepath, "rb") as f:
+         for i, line in enumerate(f):
+             if i in selected_indices:
+                 line = line.strip()
+                 if line:
+                     try:
+                         result.append(orjson.loads(line))
+                     except orjson.JSONDecodeError:
+                         continue
+                 if len(result) >= num:
+                     break
 
+     return result
 
- # ============ JSONL streaming sampling optimizations ============
 
+ def _stream_random_jsonl(
+     filepath: Path, num: int, seed: Optional[int] = None
+ ) -> List[Dict[str, Any]]:
+     """Random sampling of JSONL
 
- def _stream_tail_jsonl(filepath: Path, num: int) -> List[Dict[str, Any]]:
+     Strategy:
+     - Small files (<100MB): use Polars collect + sample
+     - Large files (>=100MB): use count + sample streaming (faster and memory-friendly)
      """
-     Read the last N lines of a JSONL file in reverse (avoids a full load).
+     file_size = filepath.stat().st_size
 
-     Uses a double-ended queue to keep the last N lines; memory usage is O(num) rather than O(total).
-     """
-     from collections import deque
+     # Large files use streaming sampling (faster)
+     if file_size >= _STREAM_THRESHOLD_BYTES:
+         return _count_sample_jsonl(filepath, num, seed)
 
-     # Use deque's maxlen to automatically keep the last N elements
-     buffer = deque(maxlen=num)
+     # Small files: try Polars first
+     try:
+         df = pl.scan_ndjson(filepath).collect()
+         if len(df) <= num:
+             return _clean_null_fields(df.to_dicts())
+         sampled = df.sample(n=num, seed=seed)
+         return _clean_null_fields(sampled.to_dicts())
+     except Exception as e:
+         import sys
+         print(f"[Warning] Polars ndjson parsing failed, falling back to streaming sampling: {type(e).__name__}", file=sys.stderr)
+         return _count_sample_jsonl(filepath, num, seed)
+
+
+ def _stream_random_csv(
+     filepath: Path, num: int, seed: Optional[int] = None
+ ) -> List[Dict[str, Any]]:
+     """Random sampling of CSV (via Polars)."""
+     df = pl.scan_csv(filepath).collect()
+     if len(df) <= num:
+         return _deserialize_complex_fields(df.to_dicts())
+     sampled = df.sample(n=num, seed=seed)
+     return _deserialize_complex_fields(sampled.to_dicts())
 
-     with open(filepath, "r", encoding="utf-8") as f:
-         for line in f:
-             line = line.strip()
-             if line:
-                 buffer.append(json.loads(line))
 
-     return list(buffer)
+ def _stream_random_parquet(
+     filepath: Path, num: int, seed: Optional[int] = None
+ ) -> List[Dict[str, Any]]:
+     """Random sampling of Parquet (via Polars)."""
+     df = pl.scan_parquet(filepath).collect()
+     if len(df) <= num:
+         return _deserialize_complex_fields(df.to_dicts())
+     sampled = df.sample(n=num, seed=seed)
+     return _deserialize_complex_fields(sampled.to_dicts())
 
 
- def _stream_random_jsonl(
+ def _stream_random_arrow(
      filepath: Path, num: int, seed: Optional[int] = None
  ) -> List[Dict[str, Any]]:
-     """
-     JSONL reservoir sampling.
+     """Random sampling of Arrow (via Polars)."""
+     df = pl.scan_ipc(filepath).collect()
+     if len(df) <= num:
+         return _deserialize_complex_fields(df.to_dicts())
+     sampled = df.sample(n=num, seed=seed)
+     return _deserialize_complex_fields(sampled.to_dicts())
 
-     A single pass over the file with O(num) memory; suited to random sampling of very large files.
-     The algorithm gives every record an equal probability of being selected.
-     """
-     import random
 
-     if seed is not None:
-         random.seed(seed)
+ # ============ Additional Utilities ============
+
+
+ def append_to_file(
+     data: List[Dict[str, Any]], filepath: str, file_format: str = "jsonl"
+ ) -> None:
+     """Append data to an existing file (only JSONL supported)."""
+     filepath = Path(filepath)
+
+     if file_format != "jsonl":
+         raise ValueError("Only JSONL format supports appending")
+
+     filepath.parent.mkdir(parents=True, exist_ok=True)
+
+     with open(filepath, "ab") as f:
+         for item in data:
+             f.write(orjson.dumps(item) + b"\n")
 
-     reservoir = []  # the reservoir
 
+ def count_lines(filepath: str) -> int:
+     """Count number of lines in a JSONL file."""
+     count = 0
      with open(filepath, "r", encoding="utf-8") as f:
-         for i, line in enumerate(f):
-             line = line.strip()
-             if not line:
-                 continue
+         for _ in f:
+             count += 1
+     return count
 
-             item = json.loads(line)
 
-             if len(reservoir) < num:
-                 # Reservoir not yet full: append directly
-                 reservoir.append(item)
-             else:
-                 # Reservoir full: replace with probability num/(i+1)
-                 j = random.randint(0, i)
-                 if j < num:
-                     reservoir[j] = item
+ def stream_jsonl(filepath: str, chunk_size: int = 1000):
+     """Stream JSONL file in chunks."""
+     chunk = []
+     with open(filepath, "rb") as f:
+         for line in f:
+             line = line.strip()
+             if line:
+                 chunk.append(orjson.loads(line))
+                 if len(chunk) >= chunk_size:
+                     yield chunk
+                     chunk = []
 
-     return reservoir
+     if chunk:
+         yield chunk
 
 
  # ============ FlaxKV Format ============
 
 
- def _save_flaxkv(data: List[Dict[str, Any]], filepath: Path) -> None:
-     """
-     Save data in FlaxKV format.
 
-     Args:
-         data: List of data items to save
-         filepath: Path to FlaxKV database (directory)
-     """
+ def _save_flaxkv(data: List[Dict[str, Any]], filepath: Path) -> None:
+     """Save data in FlaxKV format."""
      from flaxkv2 import FlaxKV
 
-     # Use the directory name as the database name
      db_name = filepath.stem if filepath.stem else "data"
      db_path = filepath.parent
 
-     # Create FlaxKV database
      with FlaxKV(db_name, str(db_path)) as db:
-         # Store metadata
-         db["_metadata"] = {
-             "total": len(data),
-             "format": "flaxkv"
-         }
+         db["_metadata"] = {"total": len(data), "format": "flaxkv"}
 
-         # Store each item with index as key
          for i, item in enumerate(data):
              db[f"item:{i}"] = item
 
 
  def _load_flaxkv(filepath: Path) -> List[Dict[str, Any]]:
-     """
-     Load data from FlaxKV format.
-
-     Args:
-         filepath: Path to FlaxKV database (directory)
-
-     Returns:
-         List of data items
-     """
+     """Load data from FlaxKV format."""
      from flaxkv2 import FlaxKV
 
-     # Use the directory name as the database name
      db_name = filepath.stem if filepath.stem else "data"
      db_path = filepath.parent
 
-     # Open FlaxKV database
      with FlaxKV(db_name, str(db_path)) as db:
-         # Collect all items
          items = []
          for key in sorted(db.keys()):
              if key.startswith("item:"):
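The headline change in this final hunk is the count+sample strategy in _count_sample_jsonl: count lines cheaply, draw the target indices up front, then parse only the selected lines in a single pass. A self-contained sketch of the idea, without the Polars fast path:

    import random
    from pathlib import Path
    from typing import Optional

    import orjson

    def count_sample(path: Path, num: int, seed: Optional[int] = None) -> list:
        """Sample num JSONL records by index, parsing only the chosen lines."""
        with open(path, "rb") as f:
            total = sum(1 for _ in f)      # pass 1: count lines, no JSON parsing
        num = min(num, total)
        chosen = set(random.Random(seed).sample(range(total), num))
        result = []
        with open(path, "rb") as f:        # pass 2: decode the selected lines only
            for i, line in enumerate(f):
                if i in chosen and line.strip():
                    result.append(orjson.loads(line))
        return result

Memory stays proportional to the sample size (the result list plus the index set), and orjson decoding cost is paid only on the sampled lines.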