dtflow-0.4.0-py3-none-any.whl → dtflow-0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/__init__.py CHANGED
@@ -42,7 +42,7 @@ from .tokenizers import (
     token_stats,
 )
 
-__version__ = "0.4.0"
+__version__ = "0.4.2"
 
 __all__ = [
     # core
dtflow/__main__.py CHANGED
@@ -56,7 +56,8 @@ app = typer.Typer(
 @app.command()
 def sample(
     filename: str = typer.Argument(..., help="Input file path"),
-    num: int = typer.Argument(10, help="Number of samples"),
+    num_arg: Optional[int] = typer.Argument(None, help="Number of samples", metavar="NUM"),
+    num: int = typer.Option(10, "--num", "-n", help="Number of samples", show_default=True),
     type: str = typer.Option("head", "--type", "-t", help="Sampling mode: random/head/tail"),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file path"),
     seed: Optional[int] = typer.Option(None, "--seed", help="Random seed"),
@@ -65,29 +66,36 @@ def sample(
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="Only show the given fields (comma-separated)"),
 ):
     """Sample the given number of records from a data file"""
-    _sample(filename, num, type, output, seed, by, uniform, fields)
+    actual_num = num_arg if num_arg is not None else num
+    _sample(filename, actual_num, type, output, seed, by, uniform, fields)
 
 
 @app.command()
 def head(
     filename: str = typer.Argument(..., help="Input file path"),
-    num: int = typer.Argument(10, help="Number of records to show"),
+    num_arg: Optional[int] = typer.Argument(None, help="Number of records to show", metavar="NUM"),
+    num: int = typer.Option(10, "--num", "-n", help="Number of records to show", show_default=True),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file path"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="Only show the given fields"),
 ):
     """Show the first N records of a file"""
-    _head(filename, num, output, fields)
+    # The positional argument takes precedence over the option
+    actual_num = num_arg if num_arg is not None else num
+    _head(filename, actual_num, output, fields)
 
 
 @app.command()
 def tail(
     filename: str = typer.Argument(..., help="Input file path"),
-    num: int = typer.Argument(10, help="Number of records to show"),
+    num_arg: Optional[int] = typer.Argument(None, help="Number of records to show", metavar="NUM"),
+    num: int = typer.Option(10, "--num", "-n", help="Number of records to show", show_default=True),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file path"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="Only show the given fields"),
 ):
     """Show the last N records of a file"""
-    _tail(filename, num, output, fields)
+    # The positional argument takes precedence over the option
+    actual_num = num_arg if num_arg is not None else num
+    _tail(filename, actual_num, output, fields)
 
 
 # ============ Data conversion commands ============
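The change above keeps backward compatibility: the count can still be passed positionally (`dt head data.jsonl 20`), while `--num`/`-n` becomes the documented option. A minimal sketch of how the two spellings resolve (the file name is illustrative):

    # dt head data.jsonl 20        -> num_arg=20,   num=10 -> actual_num=20
    # dt head data.jsonl --num 20  -> num_arg=None, num=20 -> actual_num=20
    # dt head data.jsonl           -> num_arg=None, num=10 -> actual_num=10 (default)
    actual_num = num_arg if num_arg is not None else num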
@@ -161,9 +169,10 @@ def clean(
 def stats(
     filename: str = typer.Argument(..., help="Input file path"),
     top: int = typer.Option(10, "--top", "-n", help="Show the top N values"),
+    full: bool = typer.Option(False, "--full", "-f", help="Full mode: value distributions, unique counts, and other details"),
 ):
     """Show statistics for a data file"""
-    _stats(filename, top)
+    _stats(filename, top, full)
 
 
 @app.command("token-stats")
dtflow/cli/commands.py CHANGED
@@ -796,6 +796,17 @@ def _generate_default_transform(field_names: List[str]) -> str:
     return "\n".join(lines) if lines else "    # Define the output fields here"
 
 
+def _unwrap(obj: Any) -> Any:
+    """Recursively convert DictWrapper objects into plain dicts"""
+    if hasattr(obj, "to_dict"):
+        return _unwrap(obj.to_dict())
+    if isinstance(obj, dict):
+        return {k: _unwrap(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_unwrap(v) for v in obj]
+    return obj
+
+
 def _execute_transform(
     input_path: Path,
     config_path: Path,
@@ -829,7 +840,8 @@ def _execute_transform(
     try:
         # Wrap the transform function to support attribute access (the Item class defined in the config file)
         def wrapped_transform(item):
-            return transform_func(DictWrapper(item))
+            result = transform_func(DictWrapper(item))
+            return _unwrap(result)
 
         st = load_stream(str(input_path))
         if num:
@@ -926,7 +938,8 @@ def _execute_preset_transform(
     try:
         # Wrap the transform function to support attribute access
         def wrapped_transform(item):
-            return transform_func(DictWrapper(item))
+            result = transform_func(DictWrapper(item))
+            return _unwrap(result)
 
         st = load_stream(str(input_path))
         if num:
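For context on why `_unwrap` is needed in both hunks: a transform may return the `DictWrapper` itself, or nest it inside its output, and downstream serialization expects plain dicts. A standalone sketch of the round-trip, using a simplified stand-in for dtflow's real `DictWrapper`:

    from typing import Any

    def _unwrap(obj: Any) -> Any:  # same logic as the hunk above
        if hasattr(obj, "to_dict"):
            return _unwrap(obj.to_dict())
        if isinstance(obj, dict):
            return {k: _unwrap(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [_unwrap(v) for v in obj]
        return obj

    class DictWrapper:  # simplified stand-in; the real class has more features
        def __init__(self, data: dict):
            self._data = data

        def __getattr__(self, name: str) -> Any:
            try:
                return self._data[name]
            except KeyError:
                raise AttributeError(name)

        def to_dict(self) -> dict:
            return self._data

    item = DictWrapper({"q": "hi", "meta": {"lang": "en"}})
    result = {"question": item.q, "raw": item}  # the transform returns the wrapper itself
    print(_unwrap(result))
    # {'question': 'hi', 'raw': {'q': 'hi', 'meta': {'lang': 'en'}}}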
@@ -1276,17 +1289,23 @@ def _concat_streaming(file_paths: List[Path], output: str) -> int:
 def stats(
     filename: str,
     top: int = 10,
+    full: bool = False,
 ) -> None:
     """
-    Show statistics for a data file (similar to pandas df.info() + df.describe()).
+    Show statistics for a data file.
+
+    Fast mode (default): only count rows and report the field structure.
+    Full mode (--full): value distributions, unique counts, lengths, and other details.
 
     Args:
         filename: Input file path; csv/excel/jsonl/json/parquet/arrow/feather formats are supported
-        top: Show the N most frequent values, default 10
+        top: Show the N most frequent values, default 10 (full mode only)
+        full: Full mode; computes value distributions, unique counts, and other details
 
     Examples:
-        dt stats data.jsonl
-        dt stats data.csv --top=5
+        dt stats data.jsonl           # fast mode (default)
+        dt stats data.jsonl --full    # full mode
+        dt stats data.csv -f --top=5  # full mode, show top 5
     """
     filepath = Path(filename)
 
@@ -1297,6 +1316,10 @@ def stats(
     if not _check_file_format(filepath):
         return
 
+    if not full:
+        _quick_stats(filepath)
+        return
+
     # Load the data
     try:
         data = load_data(str(filepath))
@@ -1316,6 +1339,142 @@ def stats(
     _print_stats(filepath.name, total, field_stats)
 
 
+def _quick_stats(filepath: Path) -> None:
+    """
+    Quick statistics mode: only count rows and report the field structure,
+    without iterating over all of the data.
+
+    Characteristics:
+    - Counts rows in a streaming fashion without loading all data into memory
+    - Reads only the first few records to infer the field structure
+    - Skips expensive statistics such as value distributions and unique counts
+    """
+    import orjson
+
+    from ..streaming import _count_rows_fast
+
+    ext = filepath.suffix.lower()
+    file_size = filepath.stat().st_size
+
+    # Format the file size
+    def format_size(size: int) -> str:
+        for unit in ["B", "KB", "MB", "GB"]:
+            if size < 1024:
+                return f"{size:.1f} {unit}"
+            size /= 1024
+        return f"{size:.1f} TB"
+
+    # Fast row count
+    total = _count_rows_fast(str(filepath))
+    if total is None:
+        # Fallback: count lines manually
+        total = 0
+        try:
+            with open(filepath, "rb") as f:
+                for line in f:
+                    if line.strip():
+                        total += 1
+        except Exception:
+            total = -1
+
+    # Read the first few records to infer the field structure
+    sample_data = []
+    sample_size = 5
+    try:
+        if ext == ".jsonl":
+            with open(filepath, "rb") as f:
+                for i, line in enumerate(f):
+                    if i >= sample_size:
+                        break
+                    line = line.strip()
+                    if line:
+                        sample_data.append(orjson.loads(line))
+        elif ext == ".csv":
+            import polars as pl
+
+            df = pl.scan_csv(str(filepath)).head(sample_size).collect()
+            sample_data = df.to_dicts()
+        elif ext == ".parquet":
+            import polars as pl
+
+            df = pl.scan_parquet(str(filepath)).head(sample_size).collect()
+            sample_data = df.to_dicts()
+        elif ext in (".arrow", ".feather"):
+            import polars as pl
+
+            df = pl.scan_ipc(str(filepath)).head(sample_size).collect()
+            sample_data = df.to_dicts()
+        elif ext == ".json":
+            with open(filepath, "rb") as f:
+                data = orjson.loads(f.read())
+                if isinstance(data, list):
+                    sample_data = data[:sample_size]
+    except Exception:
+        pass
+
+    # Analyze the field structure
+    fields = []
+    if sample_data:
+        all_keys = set()
+        for item in sample_data:
+            all_keys.update(item.keys())
+
+        for key in sorted(all_keys):
+            # Infer the type from the sampled data
+            sample_values = [item.get(key) for item in sample_data if key in item]
+            non_null = [v for v in sample_values if v is not None]
+            if non_null:
+                field_type = _infer_type(non_null)
+            else:
+                field_type = "unknown"
+            fields.append({"field": key, "type": field_type})
+
+    # Output
+    try:
+        from rich.console import Console
+        from rich.panel import Panel
+        from rich.table import Table
+
+        console = Console()
+
+        # Overview
+        console.print(
+            Panel(
+                f"[bold]File:[/bold] {filepath.name}\n"
+                f"[bold]Size:[/bold] {format_size(file_size)}\n"
+                f"[bold]Rows:[/bold] {total:,}\n"
+                f"[bold]Fields:[/bold] {len(fields)}",
+                title="📊 Quick Stats",
+                expand=False,
+            )
+        )
+
+        if fields:
+            table = Table(title="📋 Field Structure", show_header=True, header_style="bold cyan")
+            table.add_column("#", style="dim", justify="right")
+            table.add_column("Field", style="green")
+            table.add_column("Type", style="yellow")
+
+            for i, f in enumerate(fields, 1):
+                table.add_row(str(i), f["field"], f["type"])
+
+            console.print(table)
+
+    except ImportError:
+        # rich is unavailable; fall back to plain printing
+        print(f"\n{'=' * 40}")
+        print("📊 Quick Stats")
+        print(f"{'=' * 40}")
+        print(f"File: {filepath.name}")
+        print(f"Size: {format_size(file_size)}")
+        print(f"Rows: {total:,}")
+        print(f"Fields: {len(fields)}")
+
+        if fields:
+            print(f"\n📋 Field Structure:")
+            for i, f in enumerate(fields, 1):
+                print(f"  {i}. {f['field']} ({f['type']})")
+
+
 def _compute_field_stats(data: List[Dict], top: int) -> List[Dict[str, Any]]:
     """
     Compute statistics for every field in a single pass.
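One trade-off worth noting: fast mode infers the schema from only the first `sample_size` (5) records, so a field that first appears later in the file won't be listed until you run with `--full`. A small illustration with a hypothetical file:

    import json
    import random

    # Build a file where one field only appears in later rows
    with open("rows.jsonl", "w") as f:
        for i in range(1000):
            row = {"q": f"question-{i}"}
            if i >= 500:
                row["score"] = random.random()  # 'score' never appears in the first 5 rows
            f.write(json.dumps(row) + "\n")

    # dt stats rows.jsonl         -> fast mode samples 5 rows, reports only: q
    # dt stats rows.jsonl --full  -> scans every row, reports: q, score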
@@ -1397,22 +1556,34 @@ def _count_unique(values: List[Any], field_type: str) -> int:
     """
     Count the number of unique values.
 
-    Simple types are compared directly; list/dict values are hashed to save memory.
+    Simple types are compared directly; list/dict or mixed-type values are hashed.
     """
     if field_type in ("list", "dict"):
-        # Complex types: serialize with orjson and hash the result
-        import hashlib
+        return _count_unique_by_hash(values)
+    else:
+        # Simple types: try direct comparison; fall back to hashing on failure
+        try:
+            return len(set(values))
+        except TypeError:
+            # Mixed types (e.g. a field holding both str and dict values): fall back to hashing
+            return _count_unique_by_hash(values)
+
 
-        import orjson
+def _count_unique_by_hash(values: List[Any]) -> int:
+    """Count unique values by hashing their orjson serialization"""
+    import hashlib
 
-        seen = set()
-        for v in values:
+    import orjson
+
+    seen = set()
+    for v in values:
+        try:
             h = hashlib.md5(orjson.dumps(v, option=orjson.OPT_SORT_KEYS)).digest()
             seen.add(h)
-        return len(seen)
-    else:
-        # Simple types: direct comparison
-        return len(set(values))
+        except TypeError:
+            # Values orjson cannot serialize: fall back to repr
+            seen.add(repr(v))
+    return len(seen)
 
 
 def _infer_type(values: List[Any]) -> str:
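For intuition on the hash path: serializing with `orjson.OPT_SORT_KEYS` before hashing gives a stable fingerprint for unhashable values, so two dicts with the same content but different key order count as one unique value. A standalone sketch:

    import hashlib

    import orjson

    values = [
        {"a": 1, "b": 2},
        {"b": 2, "a": 1},  # same content, different key order
        ["x", "y"],
        ["x", "y"],
    ]

    seen = set()
    for v in values:
        fingerprint = hashlib.md5(orjson.dumps(v, option=orjson.OPT_SORT_KEYS)).digest()
        seen.add(fingerprint)

    print(len(seen))  # 2 -- OPT_SORT_KEYS collapses the two dicts to one fingerprint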
dtflow/streaming.py CHANGED
@@ -84,6 +84,8 @@ class StreamingTransformer:
         self._source_path = source_path
         self._total = total
         self._operations: List[Dict[str, Any]] = []
+        self._error_count = 0
+        self._first_error: Optional[str] = None
 
     @classmethod
     def load_stream(cls, filepath: str, batch_size: int = 10000) -> "StreamingTransformer":
@@ -194,17 +196,20 @@ class StreamingTransformer:
         Returns:
             A new StreamingTransformer (lazy; not executed immediately)
         """
+        # transform is a 1:1 mapping, so the total is preserved
+        new_st = StreamingTransformer(iter([]), self._source_path, total=self._total)
+        new_st._operations = self._operations + [{"type": "transform", "func": func}]
 
         def transformed_iterator():
             for item in self._iterator:
                 try:
                     yield func(item)
-                except Exception:
-                    pass  # skip errors
+                except Exception as e:
+                    new_st._error_count += 1
+                    if new_st._first_error is None:
+                        new_st._first_error = f"{type(e).__name__}: {e}"
 
-        # transform is a 1:1 mapping, so the total is preserved
-        new_st = StreamingTransformer(transformed_iterator(), self._source_path, total=self._total)
-        new_st._operations = self._operations + [{"type": "transform", "func": func}]
+        new_st._iterator = transformed_iterator()
         return new_st
 
     def head(self, n: int) -> "StreamingTransformer":
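Note that `transform` stays lazy: `_error_count` is only populated while the stream is consumed, so it reads 0 until something like `save()` drains the iterator, at which point the summary in the next hunk is printed. A sketch under that assumption, using `load_stream`/`save` as they appear in this diff (file names are illustrative):

    from dtflow.streaming import StreamingTransformer

    def parse(item):
        return {"n_chars": len(item["text"])}  # raises KeyError when 'text' is missing

    st = StreamingTransformer.load_stream("data.jsonl").transform(parse)
    # Nothing has run yet; the error counter is still zero at this point.

    st.save("out.jsonl")
    # Iteration happens during save(); bad records are skipped and counted,
    # then a summary like "⚠️ Skipped 3 bad records: KeyError: 'text'" is printed.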
@@ -299,16 +304,21 @@ class StreamingTransformer:
         ext = path.suffix.lower()
 
         if ext == ".jsonl":
-            return self._save_jsonl(filepath, show_progress)
+            count = self._save_jsonl(filepath, show_progress)
         elif ext == ".csv":
-            return self._save_batched(filepath, "csv", batch_size, show_progress)
+            count = self._save_batched(filepath, "csv", batch_size, show_progress)
         elif ext == ".parquet":
-            return self._save_batched(filepath, "parquet", batch_size, show_progress)
+            count = self._save_batched(filepath, "parquet", batch_size, show_progress)
         elif ext in (".arrow", ".feather"):
-            return self._save_batched(filepath, "arrow", batch_size, show_progress)
+            count = self._save_batched(filepath, "arrow", batch_size, show_progress)
         else:
-            # Default: JSONL
-            return self._save_jsonl(filepath, show_progress)
+            count = self._save_jsonl(filepath, show_progress)
+
+        # Print an error summary
+        if self._error_count > 0:
+            print(f"⚠️ Skipped {self._error_count} bad records: {self._first_error}")
+
+        return count
 
     def _save_jsonl(self, filepath: str, show_progress: bool) -> int:
         """Line-by-line streaming JSONL save (uses orjson)"""
dtflow-0.4.0.dist-info/METADATA → dtflow-0.4.2.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dtflow
-Version: 0.4.0
+Version: 0.4.2
 Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
 Project-URL: Homepage, https://github.com/yourusername/DataTransformer
 Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
dtflow-0.4.0.dist-info/RECORD → dtflow-0.4.2.dist-info/RECORD RENAMED
@@ -1,14 +1,14 @@
-dtflow/__init__.py,sha256=OF6TdEQPvEpcAsuBBsHeycXo6OfDY_Ar_YWaMPhiBFI,2347
-dtflow/__main__.py,sha256=7lKluJTruDPN4CKSK2mWLUxSUlVLtkrqXyRMjlGk7SY,10595
+dtflow/__init__.py,sha256=outMIxwn2hlX6wMUfLFrBkDVknGEzJOkNUHOUWfEGuw,2347
+dtflow/__main__.py,sha256=eVzGbJQxhoOvvmk1076AzT7VscOXSJu18VGGdZ9r9h8,11359
 dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
 dtflow/core.py,sha256=szm9qmRVe1Q97O18UTGz7xTsdV-V8L4D6Bl1bxBJCWk,28778
 dtflow/lineage.py,sha256=vQ06lxBHftu-Ma5HlISp3F2eiIvwagQSnUGaLeABDZY,12190
 dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
 dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
-dtflow/streaming.py,sha256=O8waTXDOEk_6ES_H3-TKTc3zyc-EC8DjOfgepAKV96A,21556
+dtflow/streaming.py,sha256=lYf9gi5U-3oqr7oEe5mENx1r-LtRb2YfGNq1fP3_sw4,21972
 dtflow/tokenizers.py,sha256=zxE6XZGjZ_DOGCjRSClI9xaAbFVf8FS6jwwssGoi_9U,18111
 dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
-dtflow/cli/commands.py,sha256=1NEHcwNq68le-YEy70j5bacn4RLWSJj_HWcZkOUl2bI,84537
+dtflow/cli/commands.py,sha256=IZ2rHnJ7RHmXOW19JLjtHPfzbfNj5vq_FT2YDSao2SI,90303
 dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
 dtflow/mcp/__main__.py,sha256=PoT2ZZmJq9xDZxDACJfqDW9Ld_ukHrGNK-0XUd7WGnY,448
 dtflow/mcp/cli.py,sha256=ck0oOS_642cNktxULaMRE7BJfMxsBCwotmCj3PSPwVk,13110
@@ -19,7 +19,7 @@ dtflow/storage/io.py,sha256=XNWLL10a7jgOjM1IfTN9kIuW23dwzFE1nnaw4E3LaiU,21885
 dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
 dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
 dtflow/utils/field_path.py,sha256=WcNA-LZh3H61a77FEzB_R7YAyyZl3M8ofdq05ytQGmI,7459
-dtflow-0.4.0.dist-info/METADATA,sha256=HJhvSYxPG6wmYZPx0qLVQLSbmyK0CIp8qiu3ppe65mA,18306
-dtflow-0.4.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-dtflow-0.4.0.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
-dtflow-0.4.0.dist-info/RECORD,,
+dtflow-0.4.2.dist-info/METADATA,sha256=Rck3BDh1Vvpr24fUvCAcnmPXQOrZxTu_OYMAvJHADnU,18306
+dtflow-0.4.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+dtflow-0.4.2.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
+dtflow-0.4.2.dist-info/RECORD,,