dtflow 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +1 -1
- dtflow/cli/common.py +13 -9
- dtflow/cli/stats.py +114 -36
- dtflow/core.py +66 -10
- dtflow/lineage.py +17 -0
- dtflow/streaming.py +93 -35
- dtflow/tokenizers.py +84 -29
- dtflow/utils/field_path.py +6 -2
- {dtflow-0.5.0.dist-info → dtflow-0.5.2.dist-info}/METADATA +11 -1
- {dtflow-0.5.0.dist-info → dtflow-0.5.2.dist-info}/RECORD +12 -12
- {dtflow-0.5.0.dist-info → dtflow-0.5.2.dist-info}/WHEEL +0 -0
- {dtflow-0.5.0.dist-info → dtflow-0.5.2.dist-info}/entry_points.txt +0 -0
dtflow/__init__.py
CHANGED
dtflow/cli/common.py
CHANGED
@@ -57,7 +57,7 @@ def _get_file_row_count(filepath: Path) -> Optional[int]:
         return None


-def _format_value(value: Any, max_len: int = 80) -> str:
+def _format_value(value: Any, max_len: int = 120) -> str:
     """Format a single value; truncate long text."""
     if value is None:
         return "[dim]null[/dim]"
@@ -66,18 +66,22 @@ def _format_value(value: Any, max_len: int = 80) -> str:
     if isinstance(value, (int, float)):
         return f"[cyan]{value}[/cyan]"
     if isinstance(value, str):
+        half_len = max_len // 2
         # Handle multiline text
         if "\n" in value:
             lines = value.split("\n")
-
-
-
-
-
-
+            preview = value.replace("\n", "\\n")
+            if len(preview) > max_len:
+                # first half + elision marker + second half
+                head = preview[:half_len]
+                tail = preview[-half_len:]
+                return f'"{head} [yellow]<<<{len(lines)}行>>>[/yellow] {tail}"'
             return f'"{preview}"'
         if len(value) > max_len:
-
+            # first half + elision marker + second half
+            head = value[:half_len]
+            tail = value[-half_len:]
+            return f'"{head} [yellow]<<<{len(value)}字符>>>[/yellow] {tail}"'
         return f'"{value}"'
     return str(value)

@@ -86,7 +90,7 @@ def _format_nested(
     value: Any,
     indent: str = "",
     is_last: bool = True,
-    max_len: int =
+    max_len: int = 120,
 ) -> List[str]:
     """
     Recursively format a nested structure, returning a list of lines.
dtflow/cli/stats.py
CHANGED
@@ -465,34 +465,65 @@ def token_stats(
         return

     total = len(data)
-    print(f" 共 {total} 条数据")
-    print(f"🔢 统计 Token (模型: {model}, 字段: {field})...")
+    print(f" 共 {total:,} 条数据")

     # Check the field type and pick a suitable stats method (nested paths supported)
     sample = data[0]
     field_value = get_field_with_spec(sample, field)

+    # Try to use a rich progress bar
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
+
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[bold blue]统计 Token"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TextColumn(f"(模型: {model})"),
+        ) as progress:
+            task = progress.add_task("", total=total)
+
+            def update_progress(current: int, total_count: int):
+                progress.update(task, completed=current)
+
+            if isinstance(field_value, list) and field_value and isinstance(field_value[0], dict):
+                from ..tokenizers import messages_token_stats
+
+                stats_result = messages_token_stats(
+                    data, messages_field=field, model=model, progress_callback=update_progress
+                )
+                _print_messages_token_stats(stats_result, detailed)
+            else:
+                from ..tokenizers import token_stats as compute_token_stats
+
+                stats_result = compute_token_stats(
+                    data, fields=field, model=model, progress_callback=update_progress
+                )
+                _print_text_token_stats(stats_result, detailed)
+
+    except ImportError:
+        # rich not installed: fall back to simple progress output
+        print(f"🔢 统计 Token (模型: {model}, 字段: {field})...")
+        try:
+            if isinstance(field_value, list) and field_value and isinstance(field_value[0], dict):
+                from ..tokenizers import messages_token_stats
+
+                stats_result = messages_token_stats(data, messages_field=field, model=model)
+                _print_messages_token_stats(stats_result, detailed)
+            else:
+                from ..tokenizers import token_stats as compute_token_stats

-
+                stats_result = compute_token_stats(data, fields=field, model=model)
+                _print_text_token_stats(stats_result, detailed)
+        except ImportError as e:
+            print(f"错误: {e}")
+            return
+    except Exception as e:
+        print(f"错误: 统计失败 - {e}")
+        import traceback
+
+        traceback.print_exc()


 def _print_messages_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
@@ -505,21 +536,39 @@ def _print_messages_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
         console = Console()

         # Overview
+        std = stats.get("std_tokens", 0)
         overview = (
             f"[bold]总样本数:[/bold] {stats['count']:,}\n"
             f"[bold]总 Token:[/bold] {stats['total_tokens']:,}\n"
-            f"[bold]平均 Token:[/bold] {stats['avg_tokens']:,}\n"
-            f"[bold]中位数:[/bold] {stats['median_tokens']:,}\n"
+            f"[bold]平均 Token:[/bold] {stats['avg_tokens']:,} (std: {std:.1f})\n"
             f"[bold]范围:[/bold] {stats['min_tokens']:,} - {stats['max_tokens']:,}"
         )
         console.print(Panel(overview, title="📊 Token 统计概览", expand=False))

+        # Percentile table
+        table = Table(title="📈 分布统计")
+        table.add_column("百分位", style="cyan", justify="center")
+        table.add_column("Token 数", justify="right")
+        percentiles = [
+            ("Min", stats["min_tokens"]),
+            ("P25", stats.get("p25", "-")),
+            ("P50 (中位数)", stats.get("median_tokens", "-")),
+            ("P75", stats.get("p75", "-")),
+            ("P90", stats.get("p90", "-")),
+            ("P95", stats.get("p95", "-")),
+            ("P99", stats.get("p99", "-")),
+            ("Max", stats["max_tokens"]),
+        ]
+        for name, val in percentiles:
+            table.add_row(name, f"{val:,}" if isinstance(val, int) else str(val))
+        console.print(table)
+
         if detailed:
-            #
-
-
-
-
+            # Per-role stats
+            role_table = Table(title="📋 分角色统计")
+            role_table.add_column("角色", style="cyan")
+            role_table.add_column("Token 数", justify="right")
+            role_table.add_column("占比", justify="right")

             total = stats["total_tokens"]
             for role, key in [
@@ -529,22 +578,27 @@ def _print_messages_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
             ]:
                 tokens = stats.get(key, 0)
                 pct = tokens / total * 100 if total > 0 else 0
-
+                role_table.add_row(role, f"{tokens:,}", f"{pct:.1f}%")

-            console.print(
+            console.print(role_table)
             console.print(f"\n平均对话轮数: {stats.get('avg_turns', 0)}")

     except ImportError:
         # rich not installed: plain printing
+        std = stats.get("std_tokens", 0)
         print(f"\n{'=' * 40}")
         print("📊 Token 统计概览")
         print(f"{'=' * 40}")
         print(f"总样本数: {stats['count']:,}")
         print(f"总 Token: {stats['total_tokens']:,}")
-        print(f"平均 Token: {stats['avg_tokens']:,}")
-        print(f"中位数: {stats['median_tokens']:,}")
+        print(f"平均 Token: {stats['avg_tokens']:,} (std: {std:.1f})")
         print(f"范围: {stats['min_tokens']:,} - {stats['max_tokens']:,}")

+        print(f"\n📈 百分位分布:")
+        print(f" P25: {stats.get('p25', '-'):,} P50: {stats.get('median_tokens', '-'):,}")
+        print(f" P75: {stats.get('p75', '-'):,} P90: {stats.get('p90', '-'):,}")
+        print(f" P95: {stats.get('p95', '-'):,} P99: {stats.get('p99', '-'):,}")
+
         if detailed:
             print(f"\n{'=' * 40}")
             print("📋 分角色统计")
@@ -566,24 +620,48 @@ def _print_text_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
     try:
         from rich.console import Console
         from rich.panel import Panel
+        from rich.table import Table

         console = Console()

+        std = stats.get("std_tokens", 0)
         overview = (
             f"[bold]总样本数:[/bold] {stats['count']:,}\n"
             f"[bold]总 Token:[/bold] {stats['total_tokens']:,}\n"
-            f"[bold]平均 Token:[/bold] {stats['avg_tokens']:.1f}\n"
-            f"[bold]中位数:[/bold] {stats['median_tokens']:,}\n"
+            f"[bold]平均 Token:[/bold] {stats['avg_tokens']:.1f} (std: {std:.1f})\n"
             f"[bold]范围:[/bold] {stats['min_tokens']:,} - {stats['max_tokens']:,}"
         )
         console.print(Panel(overview, title="📊 Token 统计", expand=False))

+        # Percentile table
+        table = Table(title="📈 分布统计")
+        table.add_column("百分位", style="cyan", justify="center")
+        table.add_column("Token 数", justify="right")
+        percentiles = [
+            ("Min", stats["min_tokens"]),
+            ("P25", stats.get("p25", "-")),
+            ("P50 (中位数)", stats.get("median_tokens", "-")),
+            ("P75", stats.get("p75", "-")),
+            ("P90", stats.get("p90", "-")),
+            ("P95", stats.get("p95", "-")),
+            ("P99", stats.get("p99", "-")),
+            ("Max", stats["max_tokens"]),
+        ]
+        for name, val in percentiles:
+            table.add_row(name, f"{val:,}" if isinstance(val, int) else str(val))
+        console.print(table)
+
     except ImportError:
+        std = stats.get("std_tokens", 0)
         print(f"\n{'=' * 40}")
         print("📊 Token 统计")
         print(f"{'=' * 40}")
         print(f"总样本数: {stats['count']:,}")
         print(f"总 Token: {stats['total_tokens']:,}")
-        print(f"平均 Token: {stats['avg_tokens']:.1f}")
-        print(f"中位数: {stats['median_tokens']:,}")
+        print(f"平均 Token: {stats['avg_tokens']:.1f} (std: {std:.1f})")
         print(f"范围: {stats['min_tokens']:,} - {stats['max_tokens']:,}")
+
+        print(f"\n📈 百分位分布:")
+        print(f" P25: {stats.get('p25', '-'):,} P50: {stats.get('median_tokens', '-'):,}")
+        print(f" P75: {stats.get('p75', '-'):,} P90: {stats.get('p90', '-'):,}")
+        print(f" P95: {stats.get('p95', '-'):,} P99: {stats.get('p99', '-'):,}")
dtflow/core.py
CHANGED
@@ -793,19 +793,29 @@ class DataTransformer:
             seed: Random seed

         Returns:
-            (train, test) as two DataTransformers
+            (train, test) as two DataTransformers, each with its own lineage tracker
         """
         data = self.shuffle(seed).data
         split_idx = int(len(data) * ratio)

-        #
+        # After a split, each lineage tracker is independent (deep copies avoid cross-talk)
         tracker = self._lineage_tracker
+        train_tracker = None
+        test_tracker = None
+
         if tracker:
             tracker.record("split", {"ratio": ratio, "seed": seed}, len(self._data), len(data))
+            # Create an independent tracker copy for each subset
+            train_tracker = tracker.copy()
+            train_tracker.record("split_part", {"part": "train", "ratio": ratio}, len(data), split_idx)
+            test_tracker = tracker.copy()
+            test_tracker.record(
+                "split_part", {"part": "test", "ratio": 1 - ratio}, len(data), len(data) - split_idx
+            )

         return (
-            DataTransformer(data[:split_idx], _lineage_tracker=
-            DataTransformer(data[split_idx:], _lineage_tracker=
+            DataTransformer(data[:split_idx], _lineage_tracker=train_tracker),
+            DataTransformer(data[split_idx:], _lineage_tracker=test_tracker),
         )

     # ============ Parallel processing ============
@@ -815,6 +825,7 @@ class DataTransformer:
         func: Callable[[Dict], Any],
         workers: Optional[int] = None,
         chunksize: int = 1000,
+        timeout: Optional[float] = None,
     ) -> List[Any]:
         """
         Run a transform function in parallel (multiprocessing).
@@ -825,24 +836,46 @@ class DataTransformer:
             func: Transform function; takes a raw dict and returns the transformed result
             workers: Number of processes; defaults to the CPU core count
             chunksize: Size of the data chunk each process handles
+            timeout: Timeout in seconds; None means no timeout

         Returns:
             List of transformed results

+        Raises:
+            TypeError: If func cannot be pickled (e.g. a lambda)
+            RuntimeError: If a worker process fails or times out
+
         Examples:
             >>> def transform(item):
             ...     return {"id": item["id"], "text": item["text"].upper()}
             >>> results = dt.map_parallel(transform)
         """
-        from multiprocessing import Pool, cpu_count
+        from multiprocessing import Pool, TimeoutError, cpu_count
+        import pickle

         if not self._data:
             return []

+        # Check that the function can be pickled
+        try:
+            pickle.dumps(func)
+        except (pickle.PicklingError, AttributeError, TypeError) as e:
+            func_name = getattr(func, "__name__", str(func))
+            raise TypeError(
+                f"函数 '{func_name}' 无法被 pickle,不能用于并行处理。"
+                f"请使用模块级函数而非 lambda 或闭包。错误: {e}"
+            ) from e
+
         workers = workers or cpu_count()

-
-
+        try:
+            with Pool(workers) as pool:
+                async_result = pool.map_async(func, self._data, chunksize=chunksize)
+                results = async_result.get(timeout=timeout)
+        except TimeoutError:
+            raise RuntimeError(f"并行处理超时({timeout}秒)")
+        except Exception as e:
+            raise RuntimeError(f"并行处理失败: {type(e).__name__}: {e}") from e

         return results

@@ -851,6 +884,7 @@ class DataTransformer:
         func: Callable[[Dict], bool],
         workers: Optional[int] = None,
         chunksize: int = 1000,
+        timeout: Optional[float] = None,
     ) -> "DataTransformer":
         """
         Run a filter function in parallel (multiprocessing).
@@ -861,24 +895,46 @@ class DataTransformer:
             func: Filter function; takes a raw dict and returns True to keep the item
             workers: Number of processes; defaults to the CPU core count
             chunksize: Size of the data chunk each process handles
+            timeout: Timeout in seconds; None means no timeout

         Returns:
             A new, filtered DataTransformer

+        Raises:
+            TypeError: If func cannot be pickled (e.g. a lambda)
+            RuntimeError: If a worker process fails or times out
+
         Examples:
             >>> def is_valid(item):
             ...     return len(item["text"]) > 10
             >>> filtered = dt.filter_parallel(is_valid)
         """
-        from multiprocessing import Pool, cpu_count
+        from multiprocessing import Pool, TimeoutError, cpu_count
+        import pickle

         if not self._data:
             return DataTransformer([])

+        # Check that the function can be pickled
+        try:
+            pickle.dumps(func)
+        except (pickle.PicklingError, AttributeError, TypeError) as e:
+            func_name = getattr(func, "__name__", str(func))
+            raise TypeError(
+                f"函数 '{func_name}' 无法被 pickle,不能用于并行处理。"
+                f"请使用模块级函数而非 lambda 或闭包。错误: {e}"
+            ) from e
+
         workers = workers or cpu_count()

-
-
+        try:
+            with Pool(workers) as pool:
+                async_result = pool.map_async(func, self._data, chunksize=chunksize)
+                mask = async_result.get(timeout=timeout)
+        except TimeoutError:
+            raise RuntimeError(f"并行处理超时({timeout}秒)")
+        except Exception as e:
+            raise RuntimeError(f"并行处理失败: {type(e).__name__}: {e}") from e

         filtered = [item for item, keep in zip(self._data, mask) if keep]
         return DataTransformer(filtered)
dtflow/lineage.py
CHANGED
@@ -237,6 +237,23 @@ class LineageTracker:

         return lineage_path

+    def copy(self) -> "LineageTracker":
+        """
+        Create a deep copy of the tracker.
+
+        Used by split() and similar operations so each sub-dataset gets
+        independent lineage tracking.
+
+        Returns:
+            A new LineageTracker instance
+        """
+        import copy as copy_module
+
+        new_tracker = LineageTracker.__new__(LineageTracker)
+        new_tracker.source_path = self.source_path
+        new_tracker.source_lineage = self.source_lineage  # LineageRecord is immutable; safe to share
+        new_tracker.operations = copy_module.deepcopy(self.operations)
+        return new_tracker
+

 def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]:
     """
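copy() deep-copies only the mutable operations list while sharing the immutable source records. A toy illustration (Tracker is a stand-in, not the real class) of why the deep copy matters once split() records diverging histories:

import copy

class Tracker:
    """Toy stand-in for LineageTracker, to show why operations need deepcopy."""
    def __init__(self):
        self.operations = []

    def record(self, op, params):
        self.operations.append({"op": op, "params": params})

    def copy(self):
        new = Tracker()
        new.operations = copy.deepcopy(self.operations)  # independent history
        return new

base = Tracker()
base.record("split", {"ratio": 0.8})
train, test = base.copy(), base.copy()
train.record("split_part", {"part": "train"})
assert len(train.operations) == 2 and len(test.operations) == 1  # histories diverge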
dtflow/streaming.py
CHANGED
@@ -365,50 +365,108 @@ class StreamingTransformer:
         """
         Batched streaming save (CSV/Parquet/Arrow).

-
+        Truly streaming writes: data is processed in batches, and memory is
+        released after each batch is written, so usage is O(batch_size), not O(n).
         """
         path = Path(filepath)
-
+        count = 0
+        batch = []
+        first_batch = True

-
-
-        if self._total is not None:
-            columns = [
-                SpinnerColumn(),
-                TextColumn("[progress.description]{task.description}"),
-                BarColumn(),
-                TaskProgressColumn(),
-                MofNCompleteColumn(),
-                TimeElapsedColumn(),
-                TimeRemainingColumn(),
-            ]
-        else:
-            columns = [
-                SpinnerColumn(),
-                TextColumn("[progress.description]{task.description}"),
-                MofNCompleteColumn(),
-                TimeElapsedColumn(),
-            ]
+        # Progress bar configuration
+        progress_columns = self._get_progress_columns()

-
-
-
-
-
-
-        for item in self._iterator:
-            all_items.append(item)
+        def write_batch(items: List[Dict], is_first: bool, writer_state: Dict):
+            """Write one batch of data."""
+            if not items:
+                return
+
+            df = pl.DataFrame(items)

-        if all_items:
-            df = pl.DataFrame(all_items)
             if fmt == "csv":
-
+                if is_first:
+                    df.write_csv(path)
+                else:
+                    # CSV append mode: skip the header
+                    with open(path, "ab") as f:
+                        f.write(df.write_csv(include_header=False).encode("utf-8"))
+
             elif fmt == "parquet":
-
+                import pyarrow as pa
+                import pyarrow.parquet as pq
+
+                table = df.to_arrow()
+                if is_first:
+                    writer_state["writer"] = pq.ParquetWriter(str(path), table.schema)
+                writer_state["writer"].write_table(table)
+
             elif fmt == "arrow":
-
+                import pyarrow as pa
+
+                table = df.to_arrow()
+                if is_first:
+                    writer_state["writer"] = pa.ipc.new_file(str(path), table.schema)
+                for record_batch in table.to_batches():
+                    writer_state["writer"].write_batch(record_batch)
+
+        writer_state: Dict[str, Any] = {}
+
+        try:
+            if show_progress:
+                with Progress(*progress_columns) as progress:
+                    task = progress.add_task("处理中", total=self._total)
+                    for item in self._iterator:
+                        batch.append(item)
+                        count += 1
+                        progress.update(task, advance=1)
+
+                        if len(batch) >= batch_size:
+                            write_batch(batch, first_batch, writer_state)
+                            first_batch = False
+                            batch = []  # release memory
+
+                    # Write the final batch
+                    if batch:
+                        write_batch(batch, first_batch, writer_state)
+            else:
+                for item in self._iterator:
+                    batch.append(item)
+                    count += 1
+
+                    if len(batch) >= batch_size:
+                        write_batch(batch, first_batch, writer_state)
+                        first_batch = False
+                        batch = []

-
+                if batch:
+                    write_batch(batch, first_batch, writer_state)
+
+        finally:
+            # Close the writer
+            if "writer" in writer_state:
+                writer_state["writer"].close()
+
+        return count
+
+    def _get_progress_columns(self):
+        """Progress bar column configuration."""
+        if self._total is not None:
+            return [
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                MofNCompleteColumn(),
+                TimeElapsedColumn(),
+                TimeRemainingColumn(),
+            ]
+        else:
+            return [
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                MofNCompleteColumn(),
+                TimeElapsedColumn(),
+            ]

     def save_sharded(
         self,
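The rewritten save path keeps one ParquetWriter open across batches instead of materializing the whole dataset. The same pattern reduced to a standalone sketch (stream_to_parquet is illustrative; it assumes polars and pyarrow are installed):

import polars as pl
import pyarrow.parquet as pq

def stream_to_parquet(rows, path, batch_size=1000):
    """Write an iterable of dicts to Parquet in O(batch_size) memory."""
    writer = None
    try:
        batch = []
        for row in rows:
            batch.append(row)
            if len(batch) >= batch_size:
                table = pl.DataFrame(batch).to_arrow()
                if writer is None:
                    # Open the writer lazily, using the first batch's schema
                    writer = pq.ParquetWriter(path, table.schema)
                writer.write_table(table)
                batch = []  # memory stays bounded by the batch size
        if batch:  # flush the final partial batch
            table = pl.DataFrame(batch).to_arrow()
            if writer is None:
                writer = pq.ParquetWriter(path, table.schema)
            writer.write_table(table)
    finally:
        if writer is not None:
            writer.close()

stream_to_parquet(({"i": i} for i in range(10_000)), "out.parquet")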
dtflow/tokenizers.py
CHANGED
@@ -210,7 +210,10 @@ def token_counter(
     Create a token-counting transform function.

     Args:
-        fields:
+        fields: Field(s) to count (one or more); nested path syntax is supported
+            - simple field: "text"
+            - nested field: "meta.content", "data.text"
+            - index: "messages[0].content", "messages[-1].content"
         model: Model name or alias, e.g. "qwen2.5", "gpt-4", "llama3"
         backend: Backend choice; None auto-detects
         output_field: Name of the output field
@@ -221,6 +224,7 @@ def token_counter(
     Examples:
         >>> dt.transform(token_counter("text"))
        >>> dt.transform(token_counter(["question", "answer"], model="qwen3"))
+        >>> dt.transform(token_counter("messages[-1].content"))  # the last message
     """
     if isinstance(fields, str):
         fields = [fields]
@@ -229,7 +233,7 @@ def token_counter(
         result = item.to_dict() if hasattr(item, "to_dict") else dict(item)
         total = 0
         for field in fields:
-            value = item
+            value = get_field_with_spec(item, field, default="")
             if value:
                 total += count_tokens(str(value), model=model, backend=backend)
         result[output_field] = total
@@ -249,7 +253,10 @@ def token_filter(
     Create a filter function based on token length.

     Args:
-        fields:
+        fields: Field(s) to count (one or more); nested path syntax is supported
+            - simple field: "text"
+            - nested field: "meta.content", "data.text"
+            - index: "messages[0].content", "messages[-1].content"
         min_tokens: Minimum token count (inclusive)
         max_tokens: Maximum token count (inclusive)
         model: Model name
@@ -261,6 +268,7 @@ def token_filter(
     Examples:
         >>> dt.filter(token_filter("text", min_tokens=10, max_tokens=512))
         >>> dt.filter(token_filter(["q", "a"], max_tokens=2048))
+        >>> dt.filter(token_filter("messages[-1].content", max_tokens=1024))
     """
     if isinstance(fields, str):
         fields = [fields]
@@ -268,7 +276,7 @@ def token_filter(
     def filter_func(item) -> bool:
         total = 0
         for field in fields:
-            value = item
+            value = get_field_with_spec(item, field, default="")
             if value:
                 total += count_tokens(str(value), model=model, backend=backend)

@@ -281,11 +289,32 @@ def token_filter(
     return filter_func


+def _percentile(sorted_data: List[int], p: float) -> int:
+    """Compute a percentile."""
+    n = len(sorted_data)
+    if n == 0:
+        return 0
+    idx = (n - 1) * p / 100
+    lower = int(idx)
+    upper = min(lower + 1, n - 1)
+    weight = idx - lower
+    return int(sorted_data[lower] * (1 - weight) + sorted_data[upper] * weight)
+
+
+def _std(counts: List[int], avg: float) -> float:
+    """Compute the standard deviation."""
+    if len(counts) < 2:
+        return 0.0
+    variance = sum((x - avg) ** 2 for x in counts) / len(counts)
+    return variance**0.5
+
+
 def token_stats(
     data: List[Dict[str, Any]],
     fields: Union[str, List[str]],
     model: str = DEFAULT_MODEL,
     backend: Optional[str] = None,
+    progress_callback: Optional[Callable[[int, int], None]] = None,
 ) -> Dict[str, Any]:
     """
     Compute token statistics for a dataset.
@@ -295,9 +324,17 @@ def token_stats(
         fields: Fields to count; nested path syntax is supported (e.g. meta.text, messages[-1].content)
         model: Model name or alias, e.g. "qwen2.5", "gpt-4"
         backend: Backend choice; None auto-detects
+        progress_callback: Progress callback receiving (current, total)

     Returns:
-
+        Statistics dict containing:
+        - total_tokens: total token count
+        - count: number of samples
+        - avg_tokens: mean token count
+        - std_tokens: standard deviation
+        - min_tokens, max_tokens: minimum/maximum
+        - median_tokens: median (p50)
+        - p25, p75, p90, p95, p99: percentiles
     """
     if isinstance(fields, str):
         fields = [fields]
@@ -306,21 +343,33 @@ def token_stats(
         return {"total_tokens": 0, "count": 0}

     counts = []
-
+    total_items = len(data)
+    for i, item in enumerate(data):
         total = 0
         for field in fields:
             value = get_field_with_spec(item, field, default="")
             if value:
                 total += count_tokens(str(value), model=model, backend=backend)
         counts.append(total)
+        if progress_callback:
+            progress_callback(i + 1, total_items)
+
+    sorted_counts = sorted(counts)
+    avg = sum(counts) / len(counts)

     return {
         "total_tokens": sum(counts),
         "count": len(counts),
-        "avg_tokens":
+        "avg_tokens": avg,
+        "std_tokens": _std(counts, avg),
         "min_tokens": min(counts),
         "max_tokens": max(counts),
-        "median_tokens":
+        "median_tokens": _percentile(sorted_counts, 50),
+        "p25": _percentile(sorted_counts, 25),
+        "p75": _percentile(sorted_counts, 75),
+        "p90": _percentile(sorted_counts, 90),
+        "p95": _percentile(sorted_counts, 95),
+        "p99": _percentile(sorted_counts, 99),
     }


@@ -504,6 +553,7 @@ def messages_token_stats(
     messages_field: str = "messages",
     model: str = DEFAULT_MODEL,
     backend: Optional[str] = None,
+    progress_callback: Optional[Callable[[int, int], None]] = None,
 ) -> Dict[str, Any]:
     """
     Compute token statistics over a dataset's messages.
@@ -513,25 +563,18 @@ def messages_token_stats(
         messages_field: Name of the messages field; nested path syntax is supported (e.g. conversation.messages)
         model: Model name or alias
         backend: Backend; None auto-detects
+        progress_callback: Progress callback receiving (current, total)

     Returns:
-
-
-
-
-
-
-
-
-
-        "user_tokens": 200000,
-        "assistant_tokens": 290000,
-        "system_tokens": 10000,
-        "avg_tokens": 500,
-        "max_tokens": 2048,
-        "min_tokens": 50,
-        "avg_turns": 4,
-        }
+        Statistics dict containing:
+        - count: number of samples
+        - total_tokens: total token count
+        - user_tokens, assistant_tokens, system_tokens: per-role token counts
+        - avg_tokens, std_tokens: mean and standard deviation
+        - min_tokens, max_tokens: minimum/maximum
+        - median_tokens: median
+        - p25, p75, p90, p95, p99: percentiles
+        - avg_turns: average number of conversation turns
     """
     _backend = backend or _auto_backend(model)

@@ -539,24 +582,36 @@ def messages_token_stats(
         return {"count": 0, "total_tokens": 0}

     all_stats = []
-
+    total_items = len(data)
+    for i, item in enumerate(data):
         messages = get_field_with_spec(item, messages_field, default=[])
         if messages:
             all_stats.append(_count_messages_tokens(messages, model=model, backend=_backend))
+        if progress_callback:
+            progress_callback(i + 1, total_items)

     if not all_stats:
         return {"count": 0, "total_tokens": 0}

     totals = [s["total"] for s in all_stats]
+    sorted_totals = sorted(totals)
+    avg = sum(totals) / len(totals)
+
     return {
         "count": len(all_stats),
         "total_tokens": sum(totals),
         "user_tokens": sum(s["user"] for s in all_stats),
         "assistant_tokens": sum(s["assistant"] for s in all_stats),
         "system_tokens": sum(s["system"] for s in all_stats),
-        "avg_tokens":
-        "
+        "avg_tokens": int(avg),
+        "std_tokens": _std(totals, avg),
         "min_tokens": min(totals),
-        "
+        "max_tokens": max(totals),
+        "median_tokens": _percentile(sorted_totals, 50),
+        "p25": _percentile(sorted_totals, 25),
+        "p75": _percentile(sorted_totals, 75),
+        "p90": _percentile(sorted_totals, 90),
+        "p95": _percentile(sorted_totals, 95),
+        "p99": _percentile(sorted_totals, 99),
         "avg_turns": sum(s["turns"] for s in all_stats) // len(all_stats),
     }
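The new _percentile interpolates linearly between the two neighbouring ranks, like numpy's default percentile method, and then truncates to int. A worked check of the arithmetic (the function body is copied from the diff; the data is made up):

def percentile(sorted_data, p):
    n = len(sorted_data)
    if n == 0:
        return 0
    idx = (n - 1) * p / 100
    lower = int(idx)
    upper = min(lower + 1, n - 1)
    weight = idx - lower
    return int(sorted_data[lower] * (1 - weight) + sorted_data[upper] * weight)

data = [10, 20, 30, 40]
# p50: idx = 3 * 0.5 = 1.5 -> halfway between 20 and 30 -> 25
assert percentile(data, 50) == 25
# p25: idx = 0.75 -> 10 * 0.25 + 20 * 0.75 = 17.5, truncated to 17 by int()
assert percentile(data, 25) == 17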
dtflow/utils/field_path.py
CHANGED
@@ -96,7 +96,9 @@ def _parse_path(path: str) -> List[Union[str, int, Literal["*", "#"]]]:
             continue

         # Parse the field[index] format
-        match = re.match(
+        match = re.match(
+            r"([a-zA-Z_\u4e00-\u9fff][a-zA-Z0-9_\u4e00-\u9fff]*)?(?:\[(-?\d+|\*)\])?", part
+        )
         if match:
             field_name, index = match.groups()

@@ -175,10 +177,12 @@ def _get_value_by_segments(

         return values

-    #
+    # Dict field access (supports dict and dict-like objects such as DictWrapper)
     if isinstance(seg, str):
         if isinstance(current, dict):
             current = current.get(seg)
+        elif hasattr(current, "get"):
+            current = current.get(seg)
         else:
             return None
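The widened regex accepts negative indices, the * wildcard, and CJK characters in field names. A quick check of what it captures, using the pattern copied from the diff (the sample paths are made up):

import re

PATTERN = r"([a-zA-Z_\u4e00-\u9fff][a-zA-Z0-9_\u4e00-\u9fff]*)?(?:\[(-?\d+|\*)\])?"

for part in ["text", "messages[-1]", "字段[0]", "items[*]"]:
    m = re.match(PATTERN, part)
    print(part, "->", m.groups())
# text -> ('text', None)
# messages[-1] -> ('messages', '-1')
# 字段[0] -> ('字段', '0')
# items[*] -> ('items', '*')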
{dtflow-0.5.0.dist-info → dtflow-0.5.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dtflow
-Version: 0.5.0
+Version: 0.5.2
 Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
 Project-URL: Homepage, https://github.com/yourusername/DataTransformer
 Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -32,16 +32,26 @@ Requires-Dist: orjson>=3.9.0
 Requires-Dist: polars>=0.20.0
 Requires-Dist: pyyaml>=5.4.0
 Requires-Dist: rich>=10.0.0
+Requires-Dist: tiktoken>=0.5.0
 Requires-Dist: typer>=0.9.0
 Provides-Extra: converters
 Requires-Dist: datasets>=2.0.0; extra == 'converters'
 Provides-Extra: dev
 Requires-Dist: black>=21.0; extra == 'dev'
+Requires-Dist: datasets>=2.0.0; extra == 'dev'
+Requires-Dist: datasketch>=1.5.0; extra == 'dev'
 Requires-Dist: flake8>=3.9.0; extra == 'dev'
+Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
 Requires-Dist: isort>=5.9.0; extra == 'dev'
 Requires-Dist: mypy>=0.910; extra == 'dev'
+Requires-Dist: pyarrow; extra == 'dev'
 Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
 Requires-Dist: pytest>=6.0.0; extra == 'dev'
+Requires-Dist: rich>=10.0.0; extra == 'dev'
+Requires-Dist: scikit-learn>=0.24.0; extra == 'dev'
+Requires-Dist: tiktoken>=0.5.0; extra == 'dev'
+Requires-Dist: tokenizers>=0.15.0; extra == 'dev'
+Requires-Dist: toolong>=1.5.0; extra == 'dev'
 Provides-Extra: display
 Provides-Extra: docs
 Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
@@ -1,23 +1,23 @@
|
|
|
1
|
-
dtflow/__init__.py,sha256=
|
|
1
|
+
dtflow/__init__.py,sha256=PTqh_6-F6eEwg1RxQ0ueP6CYnZauMuqYhlZe2BJphr0,3031
|
|
2
2
|
dtflow/__main__.py,sha256=ySpqvEn7k-vsrYFPx-8O6p-yx_24KccgnOSPd2XybhM,12572
|
|
3
3
|
dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
|
|
4
|
-
dtflow/core.py,sha256=
|
|
4
|
+
dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
|
|
5
5
|
dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
|
|
6
|
-
dtflow/lineage.py,sha256=
|
|
6
|
+
dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
|
|
7
7
|
dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
|
|
8
8
|
dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
|
|
9
9
|
dtflow/schema.py,sha256=IFcij22_UFKcgKT1YWwRg2QJO0vcAvCb1arZmsGByts,16824
|
|
10
|
-
dtflow/streaming.py,sha256=
|
|
11
|
-
dtflow/tokenizers.py,sha256=
|
|
10
|
+
dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
|
|
11
|
+
dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
|
|
12
12
|
dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
|
|
13
13
|
dtflow/cli/clean.py,sha256=y9VCRibgK1j8WIY3h0XZX0m93EdELQC7TdnseMWwS-0,17799
|
|
14
14
|
dtflow/cli/commands.py,sha256=ST65Ox_MKu-CKAtPVaxECAPXYOJiF7BhL32A4nsZZl0,1175
|
|
15
|
-
dtflow/cli/common.py,sha256=
|
|
15
|
+
dtflow/cli/common.py,sha256=nIPc9GBK61r6kmaI9OS3IyhcfPqShpDEHx1ddjFPnlM,13131
|
|
16
16
|
dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
|
|
17
17
|
dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
|
|
18
18
|
dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
|
|
19
19
|
dtflow/cli/sample.py,sha256=vPTQlF0OXEry4QjO8uaD9vOae4AQbX9zDwVYOxg59ZI,10339
|
|
20
|
-
dtflow/cli/stats.py,sha256=
|
|
20
|
+
dtflow/cli/stats.py,sha256=u4ehCfgw1X8WuOyAjrApMRgcIO3BVmINbsTjxEscQro,24086
|
|
21
21
|
dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
|
|
22
22
|
dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
|
|
23
23
|
dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
|
|
@@ -29,8 +29,8 @@ dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,36
|
|
|
29
29
|
dtflow/storage/io.py,sha256=ZH2aSE-S89gpy3z4oTqhcqWf4u10OdkDoyul7o_YBDI,23374
|
|
30
30
|
dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
|
|
31
31
|
dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
|
|
32
|
-
dtflow/utils/field_path.py,sha256=
|
|
33
|
-
dtflow-0.5.
|
|
34
|
-
dtflow-0.5.
|
|
35
|
-
dtflow-0.5.
|
|
36
|
-
dtflow-0.5.
|
|
32
|
+
dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
|
|
33
|
+
dtflow-0.5.2.dist-info/METADATA,sha256=RlpGaySrAIgTviom_Wyn6o2LWzQQVihff12Jpazy10o,22544
|
|
34
|
+
dtflow-0.5.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
35
|
+
dtflow-0.5.2.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
36
|
+
dtflow-0.5.2.dist-info/RECORD,,
|
{dtflow-0.5.0.dist-info → dtflow-0.5.2.dist-info}/WHEEL
File without changes

{dtflow-0.5.0.dist-info → dtflow-0.5.2.dist-info}/entry_points.txt
File without changes