dtflow 0.5.0__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/__init__.py CHANGED
@@ -26,6 +26,12 @@ from .converters import ( # LLaMA-Factory extensions; ms-swift
      to_swift_vlm,
  )
  from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
+ from .framework import (
+     CompatibilityResult,
+     check_compatibility,
+     detect_format,
+     export_for,
+ )
  from .presets import get_preset, list_presets
  from .schema import (
      Field,
@@ -38,12 +44,6 @@ from .schema import (
      sharegpt_schema,
      validate_data,
  )
- from .framework import (
-     CompatibilityResult,
-     check_compatibility,
-     detect_format,
-     export_for,
- )
  from .storage import load_data, sample_file, save_data
  from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
  from .tokenizers import (
@@ -60,7 +60,7 @@ from .tokenizers import (
      token_stats,
  )

- __version__ = "0.5.0"
+ __version__ = "0.5.3"

  __all__ = [
      # core
dtflow/cli/common.py CHANGED
@@ -57,7 +57,7 @@ def _get_file_row_count(filepath: Path) -> Optional[int]:
      return None


- def _format_value(value: Any, max_len: int = 80) -> str:
+ def _format_value(value: Any, max_len: int = 120) -> str:
      """Format a single value, truncating long text."""
      if value is None:
          return "[dim]null[/dim]"
@@ -66,18 +66,22 @@ def _format_value(value: Any, max_len: int = 80) -> str:
      if isinstance(value, (int, float)):
          return f"[cyan]{value}[/cyan]"
      if isinstance(value, str):
+         half_len = max_len // 2
          # Handle multi-line text
          if "\n" in value:
              lines = value.split("\n")
-             if len(lines) > 3:
-                 preview = lines[0][:max_len] + f"... [dim]({len(lines)} lines)[/dim]"
-             else:
-                 preview = value.replace("\n", "\\n")
-                 if len(preview) > max_len:
-                     preview = preview[:max_len] + "..."
+             preview = value.replace("\n", "\\n")
+             if len(preview) > max_len:
+                 # head + ellipsis marker + tail
+                 head = preview[:half_len]
+                 tail = preview[-half_len:]
+                 return f'"{head} [yellow]<<<{len(lines)} lines>>>[/yellow] {tail}"'
              return f'"{preview}"'
          if len(value) > max_len:
-             return f'"{value[:max_len]}..." [dim]({len(value)} chars)[/dim]'
+             # head + ellipsis marker + tail
+             head = value[:half_len]
+             tail = value[-half_len:]
+             return f'"{head} [yellow]<<<{len(value)} chars>>>[/yellow] {tail}"'
          return f'"{value}"'
      return str(value)

@@ -86,7 +90,7 @@ def _format_nested(
      value: Any,
      indent: str = "",
      is_last: bool = True,
-     max_len: int = 80,
+     max_len: int = 120,
  ) -> List[str]:
      """
      Recursively format a nested structure, returning a list of lines.
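Note: the `_format_value` change above replaces head-only truncation (first `max_len` chars plus "...") with symmetric head/tail truncation around a visible marker, so both ends of a long value stay readable. A minimal standalone sketch of the new behavior, with the Rich markup omitted and the hypothetical helper name `preview`:

    def preview(value: str, max_len: int = 120) -> str:
        # Head + marker + tail, mirroring the new _format_value logic
        if len(value) <= max_len:
            return f'"{value}"'
        half = max_len // 2
        return f'"{value[:half]} <<<{len(value)} chars>>> {value[-half:]}"'

    print(preview("x" * 300, max_len=20))
    # "xxxxxxxxxx <<<300 chars>>> xxxxxxxxxx"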
dtflow/cli/stats.py CHANGED
@@ -465,34 +465,65 @@ def token_stats(
      return

      total = len(data)
-     print(f" {total} records in total")
-     print(f"🔢 Counting tokens (model: {model}, field: {field})...")
+     print(f" {total:,} records in total")

      # Check the field type and pick a suitable stats method (nested paths supported)
      sample = data[0]
      field_value = get_field_with_spec(sample, field)

+     # Try to use a rich progress bar
      try:
-         if isinstance(field_value, list) and field_value and isinstance(field_value[0], dict):
-             # messages format
-             from ..tokenizers import messages_token_stats
-
-             stats_result = messages_token_stats(data, messages_field=field, model=model)
-             _print_messages_token_stats(stats_result, detailed)
-         else:
-             # plain text field
-             from ..tokenizers import token_stats as compute_token_stats
-
-             stats_result = compute_token_stats(data, fields=field, model=model)
-             _print_text_token_stats(stats_result, detailed)
-     except ImportError as e:
-         print(f"Error: {e}")
-         return
-     except Exception as e:
-         print(f"Error: stats failed - {e}")
-         import traceback
+         from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
+
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[bold blue]Counting tokens"),
+             BarColumn(),
+             TaskProgressColumn(),
+             TextColumn(f"(model: {model})"),
+         ) as progress:
+             task = progress.add_task("", total=total)
+
+             def update_progress(current: int, total_count: int):
+                 progress.update(task, completed=current)
+
+             if isinstance(field_value, list) and field_value and isinstance(field_value[0], dict):
+                 from ..tokenizers import messages_token_stats
+
+                 stats_result = messages_token_stats(
+                     data, messages_field=field, model=model, progress_callback=update_progress
+                 )
+                 _print_messages_token_stats(stats_result, detailed)
+             else:
+                 from ..tokenizers import token_stats as compute_token_stats
+
+                 stats_result = compute_token_stats(
+                     data, fields=field, model=model, progress_callback=update_progress
+                 )
+                 _print_text_token_stats(stats_result, detailed)
+
+     except ImportError:
+         # rich is unavailable; show simple progress
+         print(f"🔢 Counting tokens (model: {model}, field: {field})...")
+         try:
+             if isinstance(field_value, list) and field_value and isinstance(field_value[0], dict):
+                 from ..tokenizers import messages_token_stats
+
+                 stats_result = messages_token_stats(data, messages_field=field, model=model)
+                 _print_messages_token_stats(stats_result, detailed)
+             else:
+                 from ..tokenizers import token_stats as compute_token_stats

-             traceback.print_exc()
+                 stats_result = compute_token_stats(data, fields=field, model=model)
+                 _print_text_token_stats(stats_result, detailed)
+         except ImportError as e:
+             print(f"Error: {e}")
+             return
+         except Exception as e:
+             print(f"Error: stats failed - {e}")
+             import traceback
+
+             traceback.print_exc()


  def _print_messages_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
@@ -505,21 +536,39 @@ def _print_messages_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
          console = Console()

          # Overview
+         std = stats.get("std_tokens", 0)
          overview = (
              f"[bold]Total samples:[/bold] {stats['count']:,}\n"
              f"[bold]Total tokens:[/bold] {stats['total_tokens']:,}\n"
-             f"[bold]Avg tokens:[/bold] {stats['avg_tokens']:,}\n"
-             f"[bold]Median:[/bold] {stats['median_tokens']:,}\n"
+             f"[bold]Avg tokens:[/bold] {stats['avg_tokens']:,} (std: {std:.1f})\n"
              f"[bold]Range:[/bold] {stats['min_tokens']:,} - {stats['max_tokens']:,}"
          )
          console.print(Panel(overview, title="📊 Token stats overview", expand=False))

+         # Percentile table
+         table = Table(title="📈 Distribution")
+         table.add_column("Percentile", style="cyan", justify="center")
+         table.add_column("Tokens", justify="right")
+         percentiles = [
+             ("Min", stats["min_tokens"]),
+             ("P25", stats.get("p25", "-")),
+             ("P50 (median)", stats.get("median_tokens", "-")),
+             ("P75", stats.get("p75", "-")),
+             ("P90", stats.get("p90", "-")),
+             ("P95", stats.get("p95", "-")),
+             ("P99", stats.get("p99", "-")),
+             ("Max", stats["max_tokens"]),
+         ]
+         for name, val in percentiles:
+             table.add_row(name, f"{val:,}" if isinstance(val, int) else str(val))
+         console.print(table)
+
          if detailed:
-             # Detailed stats
-             table = Table(title="📋 Per-role stats")
-             table.add_column("Role", style="cyan")
-             table.add_column("Tokens", justify="right")
-             table.add_column("Share", justify="right")
+             # Per-role stats
+             role_table = Table(title="📋 Per-role stats")
+             role_table.add_column("Role", style="cyan")
+             role_table.add_column("Tokens", justify="right")
+             role_table.add_column("Share", justify="right")

              total = stats["total_tokens"]
              for role, key in [
@@ -529,22 +578,27 @@ def _print_messages_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
              ]:
                  tokens = stats.get(key, 0)
                  pct = tokens / total * 100 if total > 0 else 0
-                 table.add_row(role, f"{tokens:,}", f"{pct:.1f}%")
+                 role_table.add_row(role, f"{tokens:,}", f"{pct:.1f}%")

-             console.print(table)
+             console.print(role_table)
          console.print(f"\nAvg conversation turns: {stats.get('avg_turns', 0)}")

      except ImportError:
          # rich is unavailable; use plain prints
+         std = stats.get("std_tokens", 0)
          print(f"\n{'=' * 40}")
          print("📊 Token stats overview")
          print(f"{'=' * 40}")
          print(f"Total samples: {stats['count']:,}")
          print(f"Total tokens: {stats['total_tokens']:,}")
-         print(f"Avg tokens: {stats['avg_tokens']:,}")
-         print(f"Median: {stats['median_tokens']:,}")
+         print(f"Avg tokens: {stats['avg_tokens']:,} (std: {std:.1f})")
          print(f"Range: {stats['min_tokens']:,} - {stats['max_tokens']:,}")

+         print(f"\n📈 Percentiles:")
+         print(f"  P25: {stats.get('p25', '-'):,}  P50: {stats.get('median_tokens', '-'):,}")
+         print(f"  P75: {stats.get('p75', '-'):,}  P90: {stats.get('p90', '-'):,}")
+         print(f"  P95: {stats.get('p95', '-'):,}  P99: {stats.get('p99', '-'):,}")
+
          if detailed:
              print(f"\n{'=' * 40}")
              print("📋 Per-role stats")
@@ -566,24 +620,48 @@ def _print_text_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
      try:
          from rich.console import Console
          from rich.panel import Panel
+         from rich.table import Table

          console = Console()

+         std = stats.get("std_tokens", 0)
          overview = (
              f"[bold]Total samples:[/bold] {stats['count']:,}\n"
              f"[bold]Total tokens:[/bold] {stats['total_tokens']:,}\n"
-             f"[bold]Avg tokens:[/bold] {stats['avg_tokens']:.1f}\n"
-             f"[bold]Median:[/bold] {stats['median_tokens']:,}\n"
+             f"[bold]Avg tokens:[/bold] {stats['avg_tokens']:.1f} (std: {std:.1f})\n"
              f"[bold]Range:[/bold] {stats['min_tokens']:,} - {stats['max_tokens']:,}"
          )
          console.print(Panel(overview, title="📊 Token stats", expand=False))

+         # Percentile table
+         table = Table(title="📈 Distribution")
+         table.add_column("Percentile", style="cyan", justify="center")
+         table.add_column("Tokens", justify="right")
+         percentiles = [
+             ("Min", stats["min_tokens"]),
+             ("P25", stats.get("p25", "-")),
+             ("P50 (median)", stats.get("median_tokens", "-")),
+             ("P75", stats.get("p75", "-")),
+             ("P90", stats.get("p90", "-")),
+             ("P95", stats.get("p95", "-")),
+             ("P99", stats.get("p99", "-")),
+             ("Max", stats["max_tokens"]),
+         ]
+         for name, val in percentiles:
+             table.add_row(name, f"{val:,}" if isinstance(val, int) else str(val))
+         console.print(table)
+
      except ImportError:
+         std = stats.get("std_tokens", 0)
          print(f"\n{'=' * 40}")
          print("📊 Token stats")
          print(f"{'=' * 40}")
          print(f"Total samples: {stats['count']:,}")
          print(f"Total tokens: {stats['total_tokens']:,}")
-         print(f"Avg tokens: {stats['avg_tokens']:.1f}")
-         print(f"Median: {stats['median_tokens']:,}")
+         print(f"Avg tokens: {stats['avg_tokens']:.1f} (std: {std:.1f})")
          print(f"Range: {stats['min_tokens']:,} - {stats['max_tokens']:,}")
+
+         print(f"\n📈 Percentiles:")
+         print(f"  P25: {stats.get('p25', '-'):,}  P50: {stats.get('median_tokens', '-'):,}")
+         print(f"  P75: {stats.get('p75', '-'):,}  P90: {stats.get('p90', '-'):,}")
+         print(f"  P95: {stats.get('p95', '-'):,}  P99: {stats.get('p99', '-'):,}")
dtflow/converters.py CHANGED
@@ -4,7 +4,7 @@
  Provides conversions between HuggingFace datasets and other common formats.
  """

- from typing import Any, Callable, Dict, List, Optional, Union
+ from typing import Any, Callable, Dict, List, Optional


  def to_hf_dataset(data: List[Dict[str, Any]]):
@@ -143,14 +143,16 @@ def to_openai_batch(
      >>> batch_input = dt.to(to_openai_batch(model="gpt-4o"))
      """

-     def transform(item, idx=[0]) -> dict:
+     counter = {"idx": 0}
+
+     def transform(item) -> dict:
          messages = item.get(messages_field, []) if hasattr(item, "get") else item[messages_field]

          if custom_id_field:
              custom_id = item.get(custom_id_field) if hasattr(item, "get") else item[custom_id_field]
          else:
-             custom_id = f"request-{idx[0]}"
-             idx[0] += 1
+             custom_id = f"request-{counter['idx']}"
+             counter["idx"] += 1

          return {
              "custom_id": str(custom_id),
@@ -196,7 +198,7 @@ def to_llama_factory(
      """

      def transform(item) -> dict:
-         get = lambda f: (item.get(f, "") if hasattr(item, "get") else item.get(f, ""))
+         get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")

          result = {
              "instruction": get(instruction_field),
@@ -248,7 +250,7 @@ def to_axolotl(
          conversations = (
              item.get(conversations_field, [])
              if hasattr(item, "get")
-             else item.get(conversations_field, [])
+             else getattr(item, conversations_field, [])
          )

          # Return as-is if already in the right format
@@ -257,7 +259,9 @@ def to_axolotl(
              return {"conversations": conversations}

          # Try converting from the messages format
-         messages = item.get("messages", []) if hasattr(item, "get") else item.get("messages", [])
+         messages = (
+             item.get("messages", []) if hasattr(item, "get") else getattr(item, "messages", [])
+         )
          if messages:
              role_map = {"user": "human", "assistant": "gpt", "system": "system"}
              conversations = [
@@ -312,7 +316,7 @@ def to_llama_factory_sharegpt(
          }

      def transform(item) -> dict:
-         get = lambda f: (item.get(f, "") if hasattr(item, "get") else item.get(f, ""))
+         get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
          messages = get(messages_field) or []

          conversations = []
@@ -385,7 +389,7 @@ def to_llama_factory_vlm(
      """

      def transform(item) -> dict:
-         get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
+         get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
          messages = get(messages_field) or []

          instruction = ""
@@ -467,7 +471,7 @@ def to_llama_factory_vlm_sharegpt(
      role_map = {"user": "human", "assistant": "gpt", "system": "system"}

      def transform(item) -> dict:
-         get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
+         get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
          messages = get(messages_field) or []

          conversations = []
@@ -541,7 +545,7 @@ def to_swift_messages(
      """

      def transform(item) -> dict:
-         get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
+         get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
          messages = get(messages_field) or []

          # Copy messages to avoid mutating the source data
@@ -600,7 +604,7 @@ def to_swift_query_response(
      """

      def transform(item) -> dict:
-         get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
+         get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)

          query = get(query_field)
          response = get(response_field)
@@ -693,7 +697,7 @@ def to_swift_vlm(
      """

      def transform(item) -> dict:
-         get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
+         get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
          messages = get(messages_field) or []

          result_messages = []
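Note: the recurring change in this file fixes a broken fallback: both branches of the old conditional called `item.get(...)`, so the `hasattr` check did nothing and attribute-style items raised AttributeError. The new form falls back to `getattr`. `to_openai_batch` also replaces the mutable default argument counter (`idx=[0]`) with an explicit closure dict, so request IDs no longer share state through a default-argument list. A small sketch of the lookup pattern (the `Record` class and `get_field` name are illustrative only):

    class Record:
        messages = [{"role": "user", "content": "hi"}]

    def get_field(item, f):
        # dict-like items use .get(); anything else falls back to attribute access
        return item.get(f) if hasattr(item, "get") else getattr(item, f, None)

    print(get_field({"messages": []}, "messages"))  # [] via dict.get
    print(get_field(Record(), "messages"))          # attribute fallback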
dtflow/core.py CHANGED
@@ -793,19 +793,29 @@ class DataTransformer:
              seed: random seed

          Returns:
-             (train, test) as two DataTransformers
+             (train, test) as two DataTransformers, each with an independent lineage tracker
          """
          data = self.shuffle(seed).data
          split_idx = int(len(data) * ratio)

-         # Lineage trackers are independent after a split
+         # Lineage trackers are independent after a split (deep copies avoid cross-talk)
          tracker = self._lineage_tracker
+         train_tracker = None
+         test_tracker = None
+
          if tracker:
              tracker.record("split", {"ratio": ratio, "seed": seed}, len(self._data), len(data))
+             # Give each sub-dataset its own tracker copy
+             train_tracker = tracker.copy()
+             train_tracker.record("split_part", {"part": "train", "ratio": ratio}, len(data), split_idx)
+             test_tracker = tracker.copy()
+             test_tracker.record(
+                 "split_part", {"part": "test", "ratio": 1 - ratio}, len(data), len(data) - split_idx
+             )

          return (
-             DataTransformer(data[:split_idx], _lineage_tracker=tracker),
-             DataTransformer(data[split_idx:], _lineage_tracker=tracker),
+             DataTransformer(data[:split_idx], _lineage_tracker=train_tracker),
+             DataTransformer(data[split_idx:], _lineage_tracker=test_tracker),
          )

      # ============ Parallel processing ============
@@ -815,6 +825,7 @@ class DataTransformer:
          func: Callable[[Dict], Any],
          workers: Optional[int] = None,
          chunksize: int = 1000,
+         timeout: Optional[float] = None,
      ) -> List[Any]:
          """
          Run a transform function in parallel (multiprocessing).
@@ -825,24 +836,46 @@ class DataTransformer:
              func: transform function; receives the raw dict and returns the transformed result
              workers: number of processes, defaults to the CPU count
              chunksize: size of the data chunk each process handles
+             timeout: timeout in seconds; None means no timeout

          Returns:
              list of transformed results

+         Raises:
+             TypeError: if func cannot be pickled (e.g. a lambda)
+             RuntimeError: if a worker process fails or times out
+
          Examples:
              >>> def transform(item):
              ...     return {"id": item["id"], "text": item["text"].upper()}
              >>> results = dt.map_parallel(transform)
          """
-         from multiprocessing import Pool, cpu_count
+         from multiprocessing import Pool, TimeoutError, cpu_count
+         import pickle

          if not self._data:
              return []

+         # Check that the function can be pickled
+         try:
+             pickle.dumps(func)
+         except (pickle.PicklingError, AttributeError, TypeError) as e:
+             func_name = getattr(func, "__name__", str(func))
+             raise TypeError(
+                 f"Function '{func_name}' cannot be pickled, so it cannot be used for parallel processing. "
+                 f"Use a module-level function instead of a lambda or closure. Error: {e}"
+             ) from e
+
          workers = workers or cpu_count()

-         with Pool(workers) as pool:
-             results = pool.map(func, self._data, chunksize=chunksize)
+         try:
+             with Pool(workers) as pool:
+                 async_result = pool.map_async(func, self._data, chunksize=chunksize)
+                 results = async_result.get(timeout=timeout)
+         except TimeoutError:
+             raise RuntimeError(f"Parallel processing timed out ({timeout}s)")
+         except Exception as e:
+             raise RuntimeError(f"Parallel processing failed: {type(e).__name__}: {e}") from e

          return results

@@ -851,6 +884,7 @@ class DataTransformer:
          func: Callable[[Dict], bool],
          workers: Optional[int] = None,
          chunksize: int = 1000,
+         timeout: Optional[float] = None,
      ) -> "DataTransformer":
          """
          Run a filter function in parallel (multiprocessing).
@@ -861,24 +895,46 @@ class DataTransformer:
              func: filter function; receives the raw dict, returns True to keep the item
              workers: number of processes, defaults to the CPU count
              chunksize: size of the data chunk each process handles
+             timeout: timeout in seconds; None means no timeout

          Returns:
              a new, filtered DataTransformer

+         Raises:
+             TypeError: if func cannot be pickled (e.g. a lambda)
+             RuntimeError: if a worker process fails or times out
+
          Examples:
              >>> def is_valid(item):
              ...     return len(item["text"]) > 10
              >>> filtered = dt.filter_parallel(is_valid)
          """
-         from multiprocessing import Pool, cpu_count
+         from multiprocessing import Pool, TimeoutError, cpu_count
+         import pickle

          if not self._data:
              return DataTransformer([])

+         # Check that the function can be pickled
+         try:
+             pickle.dumps(func)
+         except (pickle.PicklingError, AttributeError, TypeError) as e:
+             func_name = getattr(func, "__name__", str(func))
+             raise TypeError(
+                 f"Function '{func_name}' cannot be pickled, so it cannot be used for parallel processing. "
+                 f"Use a module-level function instead of a lambda or closure. Error: {e}"
+             ) from e
+
          workers = workers or cpu_count()

-         with Pool(workers) as pool:
-             mask = pool.map(func, self._data, chunksize=chunksize)
+         try:
+             with Pool(workers) as pool:
+                 async_result = pool.map_async(func, self._data, chunksize=chunksize)
+                 mask = async_result.get(timeout=timeout)
+         except TimeoutError:
+             raise RuntimeError(f"Parallel processing timed out ({timeout}s)")
+         except Exception as e:
+             raise RuntimeError(f"Parallel processing failed: {type(e).__name__}: {e}") from e

          filtered = [item for item, keep in zip(self._data, mask) if keep]
          return DataTransformer(filtered)
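Note: `map_parallel`/`filter_parallel` now fail fast with a `TypeError` when the function cannot be pickled, instead of dying inside the worker pool, and the new `timeout` maps `multiprocessing.TimeoutError` to `RuntimeError`. A usage sketch, assuming `dt` is an existing `DataTransformer` (the function is adapted from the docstring example above):

    # Module-level function: picklable, so it works across processes
    def upper_text(item):
        return {"id": item["id"], "text": item["text"].upper()}

    results = dt.map_parallel(upper_text, workers=4, timeout=60.0)

    # A lambda now fails the pickle pre-check before any worker starts:
    # dt.map_parallel(lambda item: item)  # raises TypeError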
dtflow/lineage.py CHANGED
@@ -237,6 +237,23 @@ class LineageTracker:

          return lineage_path

+     def copy(self) -> "LineageTracker":
+         """
+         Create a deep copy of this tracker.
+
+         Used by split() and similar operations so that each sub-dataset gets
+         independent lineage tracking.
+
+         Returns:
+             a new LineageTracker instance
+         """
+         import copy as copy_module
+
+         new_tracker = LineageTracker.__new__(LineageTracker)
+         new_tracker.source_path = self.source_path
+         new_tracker.source_lineage = self.source_lineage  # LineageRecord is immutable and can be shared
+         new_tracker.operations = copy_module.deepcopy(self.operations)
+         return new_tracker
+

  def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]:
      """
dtflow/presets.py CHANGED
@@ -6,6 +6,8 @@

  from typing import Any, Callable

+ from dtflow.utils.helpers import get_field_value
+

  def openai_chat(
      user_field: str = "q", assistant_field: str = "a", system_prompt: str = None
@@ -33,8 +35,8 @@
          if system_prompt:
              messages.append({"role": "system", "content": system_prompt})

-         user_content = getattr(item, user_field, None) or item.get(user_field, "")
-         assistant_content = getattr(item, assistant_field, None) or item.get(assistant_field, "")
+         user_content = get_field_value(item, user_field)
+         assistant_content = get_field_value(item, assistant_field)

          messages.append({"role": "user", "content": user_content})
          messages.append({"role": "assistant", "content": assistant_content})
@@ -60,10 +62,9 @@ def alpaca(

      def transform(item: Any) -> dict:
          return {
-             "instruction": getattr(item, instruction_field, None)
-             or item.get(instruction_field, ""),
-             "input": getattr(item, input_field, None) or item.get(input_field, ""),
-             "output": getattr(item, output_field, None) or item.get(output_field, ""),
+             "instruction": get_field_value(item, instruction_field),
+             "input": get_field_value(item, input_field),
+             "output": get_field_value(item, output_field),
          }

      return transform
@@ -84,9 +85,7 @@ def sharegpt(conversations_field: str = "conversations", role_mapping: dict = No
      role_mapping = role_mapping or {"user": "human", "assistant": "gpt"}

      def transform(item: Any) -> dict:
-         conversations = getattr(item, conversations_field, None) or item.get(
-             conversations_field, []
-         )
+         conversations = get_field_value(item, conversations_field, [])

          # Return as-is if already in conversation format
          if conversations:
@@ -102,7 +101,7 @@
              ("answer", "gpt"),
              ("output", "gpt"),
          ]:
-             value = getattr(item, field, None) or item.get(field, None)
+             value = get_field_value(item, field, None)
              if value:
                  result.append({"from": role, "value": value})

@@ -127,9 +126,9 @@ def dpo_pair(

      def transform(item: Any) -> dict:
          return {
-             "prompt": getattr(item, prompt_field, None) or item.get(prompt_field, ""),
-             "chosen": getattr(item, chosen_field, None) or item.get(chosen_field, ""),
-             "rejected": getattr(item, rejected_field, None) or item.get(rejected_field, ""),
+             "prompt": get_field_value(item, prompt_field),
+             "chosen": get_field_value(item, chosen_field),
+             "rejected": get_field_value(item, rejected_field),
          }

      return transform
@@ -148,8 +147,8 @@ def simple_qa(question_field: str = "q", answer_field: str = "a") -> Callable:

      def transform(item: Any) -> dict:
          return {
-             "question": getattr(item, question_field, None) or item.get(question_field, ""),
-             "answer": getattr(item, answer_field, None) or item.get(answer_field, ""),
+             "question": get_field_value(item, question_field),
+             "answer": get_field_value(item, answer_field),
          }

      return transform
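Note: every preset now reads fields through the shared `get_field_value` helper (new in dtflow/utils/helpers.py, shown below) instead of repeating the `getattr(...) or item.get(...)` pattern. A usage sketch, assuming the alpaca preset's default field names are instruction/input/output (the defaults are truncated out of the hunk header above):

    from dtflow.presets import alpaca

    transform = alpaca()
    print(transform({"instruction": "Translate", "input": "", "output": "Bonjour"}))
    # {'instruction': 'Translate', 'input': '', 'output': 'Bonjour'}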
dtflow/streaming.py CHANGED
@@ -365,50 +365,108 @@ class StreamingTransformer:
          """
          Batch streaming save (CSV/Parquet/Arrow).

-         Reading and processing are streaming, but items are collected and written in one go.
+         Truly streaming writes: process in batches and release memory after each batch.
+         Memory usage is O(batch_size) rather than O(n).
          """
          path = Path(filepath)
-         all_items = []
+         count = 0
+         batch = []
+         first_batch = True

-         if show_progress:
-             # Pick the progress-bar style depending on whether a total is known
-             if self._total is not None:
-                 columns = [
-                     SpinnerColumn(),
-                     TextColumn("[progress.description]{task.description}"),
-                     BarColumn(),
-                     TaskProgressColumn(),
-                     MofNCompleteColumn(),
-                     TimeElapsedColumn(),
-                     TimeRemainingColumn(),
-                 ]
-             else:
-                 columns = [
-                     SpinnerColumn(),
-                     TextColumn("[progress.description]{task.description}"),
-                     MofNCompleteColumn(),
-                     TimeElapsedColumn(),
-                 ]
+         # Progress bar configuration
+         progress_columns = self._get_progress_columns()

-             with Progress(*columns) as progress:
-                 task = progress.add_task("Processing", total=self._total)
-                 for item in self._iterator:
-                     all_items.append(item)
-                     progress.update(task, advance=1)
-         else:
-             for item in self._iterator:
-                 all_items.append(item)
+         def write_batch(items: List[Dict], is_first: bool, writer_state: Dict):
+             """Write one batch of data"""
+             if not items:
+                 return
+
+             df = pl.DataFrame(items)

-         if all_items:
-             df = pl.DataFrame(all_items)
              if fmt == "csv":
-                 df.write_csv(path)
+                 if is_first:
+                     df.write_csv(path)
+                 else:
+                     # CSV append mode: skip the header
+                     with open(path, "ab") as f:
+                         f.write(df.write_csv(include_header=False).encode("utf-8"))
+
              elif fmt == "parquet":
-                 df.write_parquet(path)
+                 import pyarrow as pa
+                 import pyarrow.parquet as pq
+
+                 table = df.to_arrow()
+                 if is_first:
+                     writer_state["writer"] = pq.ParquetWriter(str(path), table.schema)
+                 writer_state["writer"].write_table(table)
+
              elif fmt == "arrow":
-                 df.write_ipc(path)
+                 import pyarrow as pa
+
+                 table = df.to_arrow()
+                 if is_first:
+                     writer_state["writer"] = pa.ipc.new_file(str(path), table.schema)
+                 for record_batch in table.to_batches():
+                     writer_state["writer"].write_batch(record_batch)
+
+         writer_state: Dict[str, Any] = {}
+
+         try:
+             if show_progress:
+                 with Progress(*progress_columns) as progress:
+                     task = progress.add_task("Processing", total=self._total)
+                     for item in self._iterator:
+                         batch.append(item)
+                         count += 1
+                         progress.update(task, advance=1)
+
+                         if len(batch) >= batch_size:
+                             write_batch(batch, first_batch, writer_state)
+                             first_batch = False
+                             batch = []  # release memory
+
+                     # Write the final batch
+                     if batch:
+                         write_batch(batch, first_batch, writer_state)
+             else:
+                 for item in self._iterator:
+                     batch.append(item)
+                     count += 1
+
+                     if len(batch) >= batch_size:
+                         write_batch(batch, first_batch, writer_state)
+                         first_batch = False
+                         batch = []

-         return len(all_items)
+                 if batch:
+                     write_batch(batch, first_batch, writer_state)
+
+         finally:
+             # Close the writer
+             if "writer" in writer_state:
+                 writer_state["writer"].close()
+
+         return count
+
+     def _get_progress_columns(self):
+         """Progress-bar column configuration"""
+         if self._total is not None:
+             return [
+                 SpinnerColumn(),
+                 TextColumn("[progress.description]{task.description}"),
+                 BarColumn(),
+                 TaskProgressColumn(),
+                 MofNCompleteColumn(),
+                 TimeElapsedColumn(),
+                 TimeRemainingColumn(),
+             ]
+         else:
+             return [
+                 SpinnerColumn(),
+                 TextColumn("[progress.description]{task.description}"),
+                 MofNCompleteColumn(),
+                 TimeElapsedColumn(),
+             ]

      def save_sharded(
          self,
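Note: the batch save path above moves from collect-then-write to true incremental writes: CSV appends without headers, while Parquet/Arrow keep a single writer open across batches and close it in `finally`. A minimal standalone sketch of the Parquet variant (the file name and batch data are illustrative):

    import pyarrow as pa
    import pyarrow.parquet as pq

    writer = None
    try:
        for rows in ([{"a": 1}, {"a": 2}], [{"a": 3}]):
            table = pa.Table.from_pylist(rows)
            if writer is None:
                # Open the writer lazily, on the first batch's schema
                writer = pq.ParquetWriter("out.parquet", table.schema)
            writer.write_table(table)
    finally:
        if writer is not None:
            writer.close()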
dtflow/tokenizers.py CHANGED
@@ -210,7 +210,10 @@
      Create a token-counting transform function.

      Args:
-         fields: field(s) to count (one or more)
+         fields: field(s) to count (one or more), with nested-path syntax:
+             - simple field: "text"
+             - nested field: "meta.content", "data.text"
+             - index: "messages[0].content", "messages[-1].content"
          model: model name or alias, e.g. "qwen2.5", "gpt-4", "llama3"
          backend: backend choice; None auto-detects
          output_field: output field name
@@ -221,6 +224,7 @@
      Examples:
          >>> dt.transform(token_counter("text"))
          >>> dt.transform(token_counter(["question", "answer"], model="qwen3"))
+         >>> dt.transform(token_counter("messages[-1].content"))  # last message
      """
      if isinstance(fields, str):
          fields = [fields]
@@ -229,7 +233,7 @@
          result = item.to_dict() if hasattr(item, "to_dict") else dict(item)
          total = 0
          for field in fields:
-             value = item.get(field, "") if hasattr(item, "get") else item[field]
+             value = get_field_with_spec(item, field, default="")
              if value:
                  total += count_tokens(str(value), model=model, backend=backend)
          result[output_field] = total
@@ -249,7 +253,10 @@
      Create a token-length-based filter function.

      Args:
-         fields: field(s) to count (one or more)
+         fields: field(s) to count (one or more), with nested-path syntax:
+             - simple field: "text"
+             - nested field: "meta.content", "data.text"
+             - index: "messages[0].content", "messages[-1].content"
          min_tokens: minimum token count (inclusive)
          max_tokens: maximum token count (inclusive)
          model: model name
@@ -261,6 +268,7 @@
      Examples:
          >>> dt.filter(token_filter("text", min_tokens=10, max_tokens=512))
          >>> dt.filter(token_filter(["q", "a"], max_tokens=2048))
+         >>> dt.filter(token_filter("messages[-1].content", max_tokens=1024))
      """
      if isinstance(fields, str):
          fields = [fields]
@@ -268,7 +276,7 @@
      def filter_func(item) -> bool:
          total = 0
          for field in fields:
-             value = item.get(field, "") if hasattr(item, "get") else item[field]
+             value = get_field_with_spec(item, field, default="")
              if value:
                  total += count_tokens(str(value), model=model, backend=backend)

@@ -281,11 +289,32 @@
      return filter_func


+ def _percentile(sorted_data: List[int], p: float) -> int:
+     """Compute a percentile with linear interpolation"""
+     n = len(sorted_data)
+     if n == 0:
+         return 0
+     idx = (n - 1) * p / 100
+     lower = int(idx)
+     upper = min(lower + 1, n - 1)
+     weight = idx - lower
+     return int(sorted_data[lower] * (1 - weight) + sorted_data[upper] * weight)
+
+
+ def _std(counts: List[int], avg: float) -> float:
+     """Compute the standard deviation"""
+     if len(counts) < 2:
+         return 0.0
+     variance = sum((x - avg) ** 2 for x in counts) / len(counts)
+     return variance**0.5
+
+
  def token_stats(
      data: List[Dict[str, Any]],
      fields: Union[str, List[str]],
      model: str = DEFAULT_MODEL,
      backend: Optional[str] = None,
+     progress_callback: Optional[Callable[[int, int], None]] = None,
  ) -> Dict[str, Any]:
      """
      Compute token statistics for a dataset.
@@ -295,9 +324,17 @@
          fields: field(s) to count; nested-path syntax supported (e.g. meta.text, messages[-1].content)
          model: model name or alias, e.g. "qwen2.5", "gpt-4"
          backend: backend choice; None auto-detects
+         progress_callback: progress callback receiving (current, total)

      Returns:
-         statistics dict
+         statistics dict containing:
+         - total_tokens: total token count
+         - count: sample count
+         - avg_tokens: mean token count
+         - std_tokens: standard deviation
+         - min_tokens, max_tokens: minimum/maximum
+         - median_tokens: median (p50)
+         - p25, p75, p90, p95, p99: percentiles
      """
      if isinstance(fields, str):
          fields = [fields]
@@ -306,21 +343,33 @@
          return {"total_tokens": 0, "count": 0}

      counts = []
-     for item in data:
+     total_items = len(data)
+     for i, item in enumerate(data):
          total = 0
          for field in fields:
              value = get_field_with_spec(item, field, default="")
              if value:
                  total += count_tokens(str(value), model=model, backend=backend)
          counts.append(total)
+         if progress_callback:
+             progress_callback(i + 1, total_items)
+
+     sorted_counts = sorted(counts)
+     avg = sum(counts) / len(counts)

      return {
          "total_tokens": sum(counts),
          "count": len(counts),
-         "avg_tokens": sum(counts) / len(counts),
+         "avg_tokens": avg,
+         "std_tokens": _std(counts, avg),
          "min_tokens": min(counts),
          "max_tokens": max(counts),
-         "median_tokens": sorted(counts)[len(counts) // 2],
+         "median_tokens": _percentile(sorted_counts, 50),
+         "p25": _percentile(sorted_counts, 25),
+         "p75": _percentile(sorted_counts, 75),
+         "p90": _percentile(sorted_counts, 90),
+         "p95": _percentile(sorted_counts, 95),
+         "p99": _percentile(sorted_counts, 99),
      }


@@ -504,6 +553,7 @@ def messages_token_stats(
      messages_field: str = "messages",
      model: str = DEFAULT_MODEL,
      backend: Optional[str] = None,
+     progress_callback: Optional[Callable[[int, int], None]] = None,
  ) -> Dict[str, Any]:
      """
      Compute token statistics for the messages in a dataset.
@@ -513,25 +563,18 @@ def messages_token_stats(
          messages_field: name of the messages field; nested-path syntax supported (e.g. conversation.messages)
          model: model name or alias
          backend: backend; None auto-detects
+         progress_callback: progress callback receiving (current, total)

      Returns:
-         statistics dict
-
-     Examples:
-         >>> stats = messages_token_stats(dt.data)  # defaults to qwen2.5
-         >>> stats = messages_token_stats(dt.data, model="qwen3")
-         >>> print(stats)
-         {
-             "count": 1000,
-             "total_tokens": 500000,
-             "user_tokens": 200000,
-             "assistant_tokens": 290000,
-             "system_tokens": 10000,
-             "avg_tokens": 500,
-             "max_tokens": 2048,
-             "min_tokens": 50,
-             "avg_turns": 4,
-         }
+         statistics dict containing:
+         - count: sample count
+         - total_tokens: total token count
+         - user_tokens, assistant_tokens, system_tokens: per-role token counts
+         - avg_tokens, std_tokens: mean and standard deviation
+         - min_tokens, max_tokens: minimum/maximum
+         - median_tokens: median
+         - p25, p75, p90, p95, p99: percentiles
+         - avg_turns: average number of conversation turns
      """
      _backend = backend or _auto_backend(model)

@@ -539,24 +582,36 @@ def messages_token_stats(
          return {"count": 0, "total_tokens": 0}

      all_stats = []
-     for item in data:
+     total_items = len(data)
+     for i, item in enumerate(data):
          messages = get_field_with_spec(item, messages_field, default=[])
          if messages:
              all_stats.append(_count_messages_tokens(messages, model=model, backend=_backend))
+         if progress_callback:
+             progress_callback(i + 1, total_items)

      if not all_stats:
          return {"count": 0, "total_tokens": 0}

      totals = [s["total"] for s in all_stats]
+     sorted_totals = sorted(totals)
+     avg = sum(totals) / len(totals)
+
      return {
          "count": len(all_stats),
          "total_tokens": sum(totals),
          "user_tokens": sum(s["user"] for s in all_stats),
          "assistant_tokens": sum(s["assistant"] for s in all_stats),
          "system_tokens": sum(s["system"] for s in all_stats),
-         "avg_tokens": sum(totals) // len(totals),
-         "max_tokens": max(totals),
+         "avg_tokens": int(avg),
+         "std_tokens": _std(totals, avg),
          "min_tokens": min(totals),
-         "median_tokens": sorted(totals)[len(totals) // 2],
+         "max_tokens": max(totals),
+         "median_tokens": _percentile(sorted_totals, 50),
+         "p25": _percentile(sorted_totals, 25),
+         "p75": _percentile(sorted_totals, 75),
+         "p90": _percentile(sorted_totals, 90),
+         "p95": _percentile(sorted_totals, 95),
+         "p99": _percentile(sorted_totals, 99),
          "avg_turns": sum(s["turns"] for s in all_stats) // len(all_stats),
      }
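Note: `median_tokens` changes meaning slightly here: it was the upper-middle element (`sorted(counts)[len(counts) // 2]`) and is now the interpolated P50 from `_percentile`, which linearly interpolates between the two nearest ranks. A worked example using the `_percentile` function added above:

    sorted_counts = [10, 20, 30, 40]           # n = 4
    # p90: idx = (4 - 1) * 90 / 100 = 2.7 -> lower = 2, upper = 3, weight = 0.7
    #      30 * 0.3 + 40 * 0.7 = 37
    print(_percentile(sorted_counts, 90))      # 37
    # p50: idx = 1.5 -> 20 * 0.5 + 30 * 0.5 = 25 (the old median would be 30)
    print(_percentile(sorted_counts, 50))      # 25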
dtflow/utils/__init__.py CHANGED
@@ -9,6 +9,7 @@ from .field_path import (
      get_field_with_spec,
      parse_field_spec,
  )
+ from .helpers import get_field_value

  __all__ = [
      "display_data",
@@ -20,4 +21,6 @@ __all__ = [
      "extract",
      "extract_with_spec",
      "ExpandMode",
+     # helpers
+     "get_field_value",
  ]
dtflow/utils/field_path.py CHANGED
@@ -96,7 +96,9 @@ def _parse_path(path: str) -> List[Union[str, int, Literal["*", "#"]]]:
              continue

          # Parse the field[index] form
-         match = re.match(r"([a-zA-Z_\u4e00-\u9fff][a-zA-Z0-9_\u4e00-\u9fff]*)?(?:\[(-?\d+|\*)\])?", part)
+         match = re.match(
+             r"([a-zA-Z_\u4e00-\u9fff][a-zA-Z0-9_\u4e00-\u9fff]*)?(?:\[(-?\d+|\*)\])?", part
+         )
          if match:
              field_name, index = match.groups()

@@ -175,10 +177,12 @@ def _get_value_by_segments(

          return values

-     # Dict field access
+     # Dict field access (supports dict and dict-like objects such as DictWrapper)
      if isinstance(seg, str):
          if isinstance(current, dict):
              current = current.get(seg)
+         elif hasattr(current, "get"):
+             current = current.get(seg)
          else:
              return None

dtflow/utils/helpers.py ADDED
@@ -0,0 +1,30 @@
+ """Shared helper functions"""
+
+ from typing import Any
+
+
+ def get_field_value(item: Any, field: str, default: Any = "") -> Any:
+     """
+     Get a field value; supports both DictWrapper and plain dict.
+
+     Tries dict.get() first; falls back to getattr() when the object has no get method.
+
+     Args:
+         item: data object (dict or DictWrapper)
+         field: field name
+         default: default value
+
+     Returns:
+         the field value, or the default
+
+     Examples:
+         >>> get_field_value({"name": "test"}, "name")
+         'test'
+         >>> get_field_value({"name": ""}, "name", "default")
+         'default'
+     """
+     if hasattr(item, "get"):
+         value = item.get(field, default)
+     else:
+         value = getattr(item, field, default)
+     return value if value else default
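Note: the trailing `value if value else default` means any falsy value (empty string, 0, empty list) collapses to the default, which is exactly what the docstring's empty-string example shows; callers that need to distinguish a literal 0 or "" from a missing field should not rely on this helper. For example:

    print(get_field_value({"n": 0}, "n", "-"))   # '-': 0 is falsy, the default wins
    print(get_field_value({"n": 5}, "n", "-"))   # 5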
dtflow-0.5.0.dist-info/METADATA → dtflow-0.5.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dtflow
- Version: 0.5.0
+ Version: 0.5.3
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -32,16 +32,26 @@ Requires-Dist: orjson>=3.9.0
  Requires-Dist: polars>=0.20.0
  Requires-Dist: pyyaml>=5.4.0
  Requires-Dist: rich>=10.0.0
+ Requires-Dist: tiktoken>=0.5.0
  Requires-Dist: typer>=0.9.0
  Provides-Extra: converters
  Requires-Dist: datasets>=2.0.0; extra == 'converters'
  Provides-Extra: dev
  Requires-Dist: black>=21.0; extra == 'dev'
+ Requires-Dist: datasets>=2.0.0; extra == 'dev'
+ Requires-Dist: datasketch>=1.5.0; extra == 'dev'
  Requires-Dist: flake8>=3.9.0; extra == 'dev'
+ Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
  Requires-Dist: isort>=5.9.0; extra == 'dev'
  Requires-Dist: mypy>=0.910; extra == 'dev'
+ Requires-Dist: pyarrow; extra == 'dev'
  Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
  Requires-Dist: pytest>=6.0.0; extra == 'dev'
+ Requires-Dist: rich>=10.0.0; extra == 'dev'
+ Requires-Dist: scikit-learn>=0.24.0; extra == 'dev'
+ Requires-Dist: tiktoken>=0.5.0; extra == 'dev'
+ Requires-Dist: tokenizers>=0.15.0; extra == 'dev'
+ Requires-Dist: toolong>=1.5.0; extra == 'dev'
  Provides-Extra: display
  Provides-Extra: docs
  Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
dtflow-0.5.0.dist-info/RECORD → dtflow-0.5.3.dist-info/RECORD CHANGED
@@ -1,23 +1,23 @@
- dtflow/__init__.py,sha256=fOkG8g8VXS1HFk2ztmaJpjHBXmArHGBW8WE8tHPHXts,3031
+ dtflow/__init__.py,sha256=RJql_KmINJNbq2FEqU7jD9Z0c5ETkxQJPvUUPKiFt74,3031
  dtflow/__main__.py,sha256=ySpqvEn7k-vsrYFPx-8O6p-yx_24KccgnOSPd2XybhM,12572
- dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
- dtflow/core.py,sha256=HJAlxOaCtwvLOWF9JSC-2li3fsyRE2Q-H9unj9GQJ6M,35445
+ dtflow/converters.py,sha256=yXafSDeRC7DB2MMj8fD1NWjAG8HoAGh5Ay2A5Z7s6xA,22206
+ dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
  dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
- dtflow/lineage.py,sha256=vQ06lxBHftu-Ma5HlISp3F2eiIvwagQSnUGaLeABDZY,12190
+ dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
  dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
- dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
+ dtflow/presets.py,sha256=qa8WQJhbNMuGxqqgA9BFadEBwDB9s0zWNxxhzF3q1K8,4701
  dtflow/schema.py,sha256=IFcij22_UFKcgKT1YWwRg2QJO0vcAvCb1arZmsGByts,16824
- dtflow/streaming.py,sha256=jtWQjkhhZqfyzIaFskXNvooGAYDQBn1b6X8FHgaCZYk,22704
- dtflow/tokenizers.py,sha256=zxE6XZGjZ_DOGCjRSClI9xaAbFVf8FS6jwwssGoi_9U,18111
+ dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
+ dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
  dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
  dtflow/cli/clean.py,sha256=y9VCRibgK1j8WIY3h0XZX0m93EdELQC7TdnseMWwS-0,17799
  dtflow/cli/commands.py,sha256=ST65Ox_MKu-CKAtPVaxECAPXYOJiF7BhL32A4nsZZl0,1175
- dtflow/cli/common.py,sha256=FsDFVNcLj_874qSg2dGef4V7mqPU9THLchT8PxJpBt8,12955
+ dtflow/cli/common.py,sha256=nIPc9GBK61r6kmaI9OS3IyhcfPqShpDEHx1ddjFPnlM,13131
  dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
  dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
  dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
  dtflow/cli/sample.py,sha256=vPTQlF0OXEry4QjO8uaD9vOae4AQbX9zDwVYOxg59ZI,10339
- dtflow/cli/stats.py,sha256=HByF0sFMqY1kM75dnjTcJbMKDdQNdOt4iDba4au_-pI,20495
+ dtflow/cli/stats.py,sha256=u4ehCfgw1X8WuOyAjrApMRgcIO3BVmINbsTjxEscQro,24086
  dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
  dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
  dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
@@ -27,10 +27,11 @@ dtflow/mcp/docs.py,sha256=DI2Vf-eFo4chRP_bDLsv4Uc3kJt8_1emz8N-NBSVirM,8834
  dtflow/mcp/server.py,sha256=Nf0UlqDGhV55ndGuEglfr7VRjDWAC_9rRsNhdr0-ssM,4275
  dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,362
  dtflow/storage/io.py,sha256=ZH2aSE-S89gpy3z4oTqhcqWf4u10OdkDoyul7o_YBDI,23374
- dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
+ dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
  dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
- dtflow/utils/field_path.py,sha256=WcNA-LZh3H61a77FEzB_R7YAyyZl3M8ofdq05ytQGmI,7459
- dtflow-0.5.0.dist-info/METADATA,sha256=chELFIevPb1h7ZydbWtH9rM7RiA2n3Ep-XWL1qbaHk0,22084
- dtflow-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- dtflow-0.5.0.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
- dtflow-0.5.0.dist-info/RECORD,,
+ dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
+ dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
+ dtflow-0.5.3.dist-info/METADATA,sha256=5joXihL8gkmnNEaUTqRpe0_U-y8osaIfdX0v91WVtK8,22544
+ dtflow-0.5.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ dtflow-0.5.3.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
+ dtflow-0.5.3.dist-info/RECORD,,