dtflow 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/__init__.py CHANGED
@@ -42,7 +42,7 @@ from .tokenizers import (
42
42
  token_stats,
43
43
  )
44
44
 
45
- __version__ = "0.4.1"
45
+ __version__ = "0.4.2"
46
46
 
47
47
  __all__ = [
48
48
  # core
dtflow/__main__.py CHANGED
@@ -56,7 +56,8 @@ app = typer.Typer(
56
56
  @app.command()
57
57
  def sample(
58
58
  filename: str = typer.Argument(..., help="输入文件路径"),
59
- num: int = typer.Argument(10, help="采样数量"),
59
+ num_arg: Optional[int] = typer.Argument(None, help="采样数量", metavar="NUM"),
60
+ num: int = typer.Option(10, "--num", "-n", help="采样数量", show_default=True),
60
61
  type: str = typer.Option("head", "--type", "-t", help="采样方式: random/head/tail"),
61
62
  output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
62
63
  seed: Optional[int] = typer.Option(None, "--seed", help="随机种子"),
@@ -65,29 +66,36 @@ def sample(
65
66
  fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段(逗号分隔)"),
66
67
  ):
67
68
  """从数据文件中采样指定数量的数据"""
68
- _sample(filename, num, type, output, seed, by, uniform, fields)
69
+ actual_num = num_arg if num_arg is not None else num
70
+ _sample(filename, actual_num, type, output, seed, by, uniform, fields)
69
71
 
70
72
 
71
73
  @app.command()
72
74
  def head(
73
75
  filename: str = typer.Argument(..., help="输入文件路径"),
74
- num: int = typer.Argument(10, help="显示数量"),
76
+ num_arg: Optional[int] = typer.Argument(None, help="显示数量", metavar="NUM"),
77
+ num: int = typer.Option(10, "--num", "-n", help="显示数量", show_default=True),
75
78
  output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
76
79
  fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段"),
77
80
  ):
78
81
  """显示文件的前 N 条数据"""
79
- _head(filename, num, output, fields)
82
+ # 位置参数优先于选项参数
83
+ actual_num = num_arg if num_arg is not None else num
84
+ _head(filename, actual_num, output, fields)
80
85
 
81
86
 
82
87
  @app.command()
83
88
  def tail(
84
89
  filename: str = typer.Argument(..., help="输入文件路径"),
85
- num: int = typer.Argument(10, help="显示数量"),
90
+ num_arg: Optional[int] = typer.Argument(None, help="显示数量", metavar="NUM"),
91
+ num: int = typer.Option(10, "--num", "-n", help="显示数量", show_default=True),
86
92
  output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
87
93
  fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段"),
88
94
  ):
89
95
  """显示文件的后 N 条数据"""
90
- _tail(filename, num, output, fields)
96
+ # 位置参数优先于选项参数
97
+ actual_num = num_arg if num_arg is not None else num
98
+ _tail(filename, actual_num, output, fields)
91
99
 
92
100
 
93
101
  # ============ 数据转换命令 ============
@@ -161,9 +169,10 @@ def clean(
161
169
  def stats(
162
170
  filename: str = typer.Argument(..., help="输入文件路径"),
163
171
  top: int = typer.Option(10, "--top", "-n", help="显示 Top N 值"),
172
+ full: bool = typer.Option(False, "--full", "-f", help="完整模式:统计值分布、唯一值等详细信息"),
164
173
  ):
165
174
  """显示数据文件的统计信息"""
166
- _stats(filename, top)
175
+ _stats(filename, top, full)
167
176
 
168
177
 
169
178
  @app.command("token-stats")
dtflow/cli/commands.py CHANGED
@@ -1289,17 +1289,23 @@ def _concat_streaming(file_paths: List[Path], output: str) -> int:
1289
1289
  def stats(
1290
1290
  filename: str,
1291
1291
  top: int = 10,
1292
+ full: bool = False,
1292
1293
  ) -> None:
1293
1294
  """
1294
- 显示数据文件的统计信息(类似 pandas df.info() + df.describe())。
1295
+ 显示数据文件的统计信息。
1296
+
1297
+ 默认快速模式:只统计行数和字段结构。
1298
+ 完整模式(--full):统计值分布、唯一值、长度等详细信息。
1295
1299
 
1296
1300
  Args:
1297
1301
  filename: 输入文件路径,支持 csv/excel/jsonl/json/parquet/arrow/feather 格式
1298
- top: 显示频率最高的前 N 个值,默认 10
1302
+ top: 显示频率最高的前 N 个值,默认 10(仅完整模式)
1303
+ full: 完整模式,统计值分布、唯一值等详细信息
1299
1304
 
1300
1305
  Examples:
1301
- dt stats data.jsonl
1302
- dt stats data.csv --top=5
1306
+ dt stats data.jsonl # 快速模式(默认)
1307
+ dt stats data.jsonl --full # 完整模式
1308
+ dt stats data.csv -f --top=5 # 完整模式,显示 Top 5
1303
1309
  """
1304
1310
  filepath = Path(filename)
1305
1311
 
@@ -1310,6 +1316,10 @@ def stats(
1310
1316
  if not _check_file_format(filepath):
1311
1317
  return
1312
1318
 
1319
+ if not full:
1320
+ _quick_stats(filepath)
1321
+ return
1322
+
1313
1323
  # 加载数据
1314
1324
  try:
1315
1325
  data = load_data(str(filepath))
@@ -1329,6 +1339,142 @@ def stats(
1329
1339
  _print_stats(filepath.name, total, field_stats)
1330
1340
 
1331
1341
 
1342
+ def _quick_stats(filepath: Path) -> None:
1343
+ """
1344
+ 快速统计模式:只统计行数和字段结构,不遍历全部数据。
1345
+
1346
+ 特点:
1347
+ - 使用流式计数,不加载全部数据到内存
1348
+ - 只读取前几条数据来推断字段结构
1349
+ - 不计算值分布、唯一值等耗时统计
1350
+ """
1351
+ import orjson
1352
+
1353
+ from ..streaming import _count_rows_fast
1354
+
1355
+ ext = filepath.suffix.lower()
1356
+ file_size = filepath.stat().st_size
1357
+
1358
+ # 格式化文件大小
1359
+ def format_size(size: int) -> str:
1360
+ for unit in ["B", "KB", "MB", "GB"]:
1361
+ if size < 1024:
1362
+ return f"{size:.1f} {unit}"
1363
+ size /= 1024
1364
+ return f"{size:.1f} TB"
1365
+
1366
+ # 快速统计行数
1367
+ total = _count_rows_fast(str(filepath))
1368
+ if total is None:
1369
+ # 回退:手动计数
1370
+ total = 0
1371
+ try:
1372
+ with open(filepath, "rb") as f:
1373
+ for line in f:
1374
+ if line.strip():
1375
+ total += 1
1376
+ except Exception:
1377
+ total = -1
1378
+
1379
+ # 读取前几条数据推断字段结构
1380
+ sample_data = []
1381
+ sample_size = 5
1382
+ try:
1383
+ if ext == ".jsonl":
1384
+ with open(filepath, "rb") as f:
1385
+ for i, line in enumerate(f):
1386
+ if i >= sample_size:
1387
+ break
1388
+ line = line.strip()
1389
+ if line:
1390
+ sample_data.append(orjson.loads(line))
1391
+ elif ext == ".csv":
1392
+ import polars as pl
1393
+
1394
+ df = pl.scan_csv(str(filepath)).head(sample_size).collect()
1395
+ sample_data = df.to_dicts()
1396
+ elif ext == ".parquet":
1397
+ import polars as pl
1398
+
1399
+ df = pl.scan_parquet(str(filepath)).head(sample_size).collect()
1400
+ sample_data = df.to_dicts()
1401
+ elif ext in (".arrow", ".feather"):
1402
+ import polars as pl
1403
+
1404
+ df = pl.scan_ipc(str(filepath)).head(sample_size).collect()
1405
+ sample_data = df.to_dicts()
1406
+ elif ext == ".json":
1407
+ with open(filepath, "rb") as f:
1408
+ data = orjson.loads(f.read())
1409
+ if isinstance(data, list):
1410
+ sample_data = data[:sample_size]
1411
+ except Exception:
1412
+ pass
1413
+
1414
+ # 分析字段结构
1415
+ fields = []
1416
+ if sample_data:
1417
+ all_keys = set()
1418
+ for item in sample_data:
1419
+ all_keys.update(item.keys())
1420
+
1421
+ for key in sorted(all_keys):
1422
+ # 从采样数据中推断类型
1423
+ sample_values = [item.get(key) for item in sample_data if key in item]
1424
+ non_null = [v for v in sample_values if v is not None]
1425
+ if non_null:
1426
+ field_type = _infer_type(non_null)
1427
+ else:
1428
+ field_type = "unknown"
1429
+ fields.append({"field": key, "type": field_type})
1430
+
1431
+ # 输出
1432
+ try:
1433
+ from rich.console import Console
1434
+ from rich.panel import Panel
1435
+ from rich.table import Table
1436
+
1437
+ console = Console()
1438
+
1439
+ # 概览
1440
+ console.print(
1441
+ Panel(
1442
+ f"[bold]文件:[/bold] {filepath.name}\n"
1443
+ f"[bold]大小:[/bold] {format_size(file_size)}\n"
1444
+ f"[bold]总数:[/bold] {total:,} 条\n"
1445
+ f"[bold]字段:[/bold] {len(fields)} 个",
1446
+ title="📊 快速统计",
1447
+ expand=False,
1448
+ )
1449
+ )
1450
+
1451
+ if fields:
1452
+ table = Table(title="📋 字段结构", show_header=True, header_style="bold cyan")
1453
+ table.add_column("#", style="dim", justify="right")
1454
+ table.add_column("字段", style="green")
1455
+ table.add_column("类型", style="yellow")
1456
+
1457
+ for i, f in enumerate(fields, 1):
1458
+ table.add_row(str(i), f["field"], f["type"])
1459
+
1460
+ console.print(table)
1461
+
1462
+ except ImportError:
1463
+ # 没有 rich,使用普通打印
1464
+ print(f"\n{'=' * 40}")
1465
+ print("📊 快速统计")
1466
+ print(f"{'=' * 40}")
1467
+ print(f"文件: {filepath.name}")
1468
+ print(f"大小: {format_size(file_size)}")
1469
+ print(f"总数: {total:,} 条")
1470
+ print(f"字段: {len(fields)} 个")
1471
+
1472
+ if fields:
1473
+ print(f"\n📋 字段结构:")
1474
+ for i, f in enumerate(fields, 1):
1475
+ print(f" {i}. {f['field']} ({f['type']})")
1476
+
1477
+
1332
1478
  def _compute_field_stats(data: List[Dict], top: int) -> List[Dict[str, Any]]:
1333
1479
  """
1334
1480
  单次遍历计算每个字段的统计信息。
@@ -1410,22 +1556,34 @@ def _count_unique(values: List[Any], field_type: str) -> int:
1410
1556
  """
1411
1557
  计算唯一值数量。
1412
1558
 
1413
- 对于简单类型直接比较,对于 list/dict 使用 hash 节省内存。
1559
+ 对于简单类型直接比较,对于 list/dict 或混合类型使用 hash
1414
1560
  """
1415
1561
  if field_type in ("list", "dict"):
1416
- # 复杂类型:使用 orjson 序列化后计算 hash
1417
- import hashlib
1562
+ return _count_unique_by_hash(values)
1563
+ else:
1564
+ # 简单类型:尝试直接比较,失败则回退到 hash 方式
1565
+ try:
1566
+ return len(set(values))
1567
+ except TypeError:
1568
+ # 混合类型(如字段中既有 str 又有 dict),回退到 hash
1569
+ return _count_unique_by_hash(values)
1570
+
1418
1571
 
1419
- import orjson
1572
+ def _count_unique_by_hash(values: List[Any]) -> int:
1573
+ """使用 orjson 序列化后计算 hash 来统计唯一值"""
1574
+ import hashlib
1420
1575
 
1421
- seen = set()
1422
- for v in values:
1576
+ import orjson
1577
+
1578
+ seen = set()
1579
+ for v in values:
1580
+ try:
1423
1581
  h = hashlib.md5(orjson.dumps(v, option=orjson.OPT_SORT_KEYS)).digest()
1424
1582
  seen.add(h)
1425
- return len(seen)
1426
- else:
1427
- # 简单类型:直接比较
1428
- return len(set(values))
1583
+ except TypeError:
1584
+ # 无法序列化的值,用 repr 兜底
1585
+ seen.add(repr(v))
1586
+ return len(seen)
1429
1587
 
1430
1588
 
1431
1589
  def _infer_type(values: List[Any]) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dtflow
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
5
5
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
6
6
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -1,5 +1,5 @@
1
- dtflow/__init__.py,sha256=Gd9Us_BDXaxmMIGlz51E6OZDohqzweOrvB-2j8k3KVs,2347
2
- dtflow/__main__.py,sha256=7lKluJTruDPN4CKSK2mWLUxSUlVLtkrqXyRMjlGk7SY,10595
1
+ dtflow/__init__.py,sha256=outMIxwn2hlX6wMUfLFrBkDVknGEzJOkNUHOUWfEGuw,2347
2
+ dtflow/__main__.py,sha256=eVzGbJQxhoOvvmk1076AzT7VscOXSJu18VGGdZ9r9h8,11359
3
3
  dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
4
4
  dtflow/core.py,sha256=szm9qmRVe1Q97O18UTGz7xTsdV-V8L4D6Bl1bxBJCWk,28778
5
5
  dtflow/lineage.py,sha256=vQ06lxBHftu-Ma5HlISp3F2eiIvwagQSnUGaLeABDZY,12190
@@ -8,7 +8,7 @@ dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
8
8
  dtflow/streaming.py,sha256=lYf9gi5U-3oqr7oEe5mENx1r-LtRb2YfGNq1fP3_sw4,21972
9
9
  dtflow/tokenizers.py,sha256=zxE6XZGjZ_DOGCjRSClI9xaAbFVf8FS6jwwssGoi_9U,18111
10
10
  dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
11
- dtflow/cli/commands.py,sha256=8t_HgFuFqGt1HXPpEDV47qB2fwMD5C6d9Bjj-VNb37I,84958
11
+ dtflow/cli/commands.py,sha256=IZ2rHnJ7RHmXOW19JLjtHPfzbfNj5vq_FT2YDSao2SI,90303
12
12
  dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
13
13
  dtflow/mcp/__main__.py,sha256=PoT2ZZmJq9xDZxDACJfqDW9Ld_ukHrGNK-0XUd7WGnY,448
14
14
  dtflow/mcp/cli.py,sha256=ck0oOS_642cNktxULaMRE7BJfMxsBCwotmCj3PSPwVk,13110
@@ -19,7 +19,7 @@ dtflow/storage/io.py,sha256=XNWLL10a7jgOjM1IfTN9kIuW23dwzFE1nnaw4E3LaiU,21885
19
19
  dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
20
20
  dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
21
21
  dtflow/utils/field_path.py,sha256=WcNA-LZh3H61a77FEzB_R7YAyyZl3M8ofdq05ytQGmI,7459
22
- dtflow-0.4.1.dist-info/METADATA,sha256=-rdgDNFMy3pPO5mpMcKlB_quxSlD9mUIoe_tIUXoPP4,18306
23
- dtflow-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
24
- dtflow-0.4.1.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
25
- dtflow-0.4.1.dist-info/RECORD,,
22
+ dtflow-0.4.2.dist-info/METADATA,sha256=Rck3BDh1Vvpr24fUvCAcnmPXQOrZxTu_OYMAvJHADnU,18306
23
+ dtflow-0.4.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
24
+ dtflow-0.4.2.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
25
+ dtflow-0.4.2.dist-info/RECORD,,
File without changes