dtflow 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +16 -7
- dtflow/cli/commands.py +172 -14
- {dtflow-0.4.1.dist-info → dtflow-0.4.2.dist-info}/METADATA +1 -1
- {dtflow-0.4.1.dist-info → dtflow-0.4.2.dist-info}/RECORD +7 -7
- {dtflow-0.4.1.dist-info → dtflow-0.4.2.dist-info}/WHEEL +0 -0
- {dtflow-0.4.1.dist-info → dtflow-0.4.2.dist-info}/entry_points.txt +0 -0
dtflow/__init__.py
CHANGED
dtflow/__main__.py
CHANGED
|
@@ -56,7 +56,8 @@ app = typer.Typer(
|
|
|
56
56
|
@app.command()
|
|
57
57
|
def sample(
|
|
58
58
|
filename: str = typer.Argument(..., help="输入文件路径"),
|
|
59
|
-
|
|
59
|
+
num_arg: Optional[int] = typer.Argument(None, help="采样数量", metavar="NUM"),
|
|
60
|
+
num: int = typer.Option(10, "--num", "-n", help="采样数量", show_default=True),
|
|
60
61
|
type: str = typer.Option("head", "--type", "-t", help="采样方式: random/head/tail"),
|
|
61
62
|
output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
|
|
62
63
|
seed: Optional[int] = typer.Option(None, "--seed", help="随机种子"),
|
|
@@ -65,29 +66,36 @@ def sample(
|
|
|
65
66
|
fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段(逗号分隔)"),
|
|
66
67
|
):
|
|
67
68
|
"""从数据文件中采样指定数量的数据"""
|
|
68
|
-
|
|
69
|
+
actual_num = num_arg if num_arg is not None else num
|
|
70
|
+
_sample(filename, actual_num, type, output, seed, by, uniform, fields)
|
|
69
71
|
|
|
70
72
|
|
|
71
73
|
@app.command()
def head(
    filename: str = typer.Argument(..., help="输入文件路径"),
    num_arg: Optional[int] = typer.Argument(None, help="显示数量", metavar="NUM"),
    num: int = typer.Option(10, "--num", "-n", help="显示数量", show_default=True),
    output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
    fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段"),
):
    """显示文件的前 N 条数据"""
    # The row count can be given positionally (NUM) or via --num/-n.
    # A positional value, when supplied, wins over the option.
    if num_arg is None:
        limit = num
    else:
        limit = num_arg
    _head(filename, limit, output, fields)
|
|
80
85
|
|
|
81
86
|
|
|
82
87
|
@app.command()
def tail(
    filename: str = typer.Argument(..., help="输入文件路径"),
    num_arg: Optional[int] = typer.Argument(None, help="显示数量", metavar="NUM"),
    num: int = typer.Option(10, "--num", "-n", help="显示数量", show_default=True),
    output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
    fields: Optional[str] = typer.Option(None, "--fields", "-f", help="只显示指定字段"),
):
    """显示文件的后 N 条数据"""
    # The row count can be given positionally (NUM) or via --num/-n.
    # A positional value, when supplied, wins over the option.
    if num_arg is None:
        limit = num
    else:
        limit = num_arg
    _tail(filename, limit, output, fields)
|
|
91
99
|
|
|
92
100
|
|
|
93
101
|
# ============ 数据转换命令 ============
|
|
@@ -161,9 +169,10 @@ def clean(
|
|
|
161
169
|
def stats(
|
|
162
170
|
filename: str = typer.Argument(..., help="输入文件路径"),
|
|
163
171
|
top: int = typer.Option(10, "--top", "-n", help="显示 Top N 值"),
|
|
172
|
+
full: bool = typer.Option(False, "--full", "-f", help="完整模式:统计值分布、唯一值等详细信息"),
|
|
164
173
|
):
|
|
165
174
|
"""显示数据文件的统计信息"""
|
|
166
|
-
_stats(filename, top)
|
|
175
|
+
_stats(filename, top, full)
|
|
167
176
|
|
|
168
177
|
|
|
169
178
|
@app.command("token-stats")
|
dtflow/cli/commands.py
CHANGED
|
@@ -1289,17 +1289,23 @@ def _concat_streaming(file_paths: List[Path], output: str) -> int:
|
|
|
1289
1289
|
def stats(
|
|
1290
1290
|
filename: str,
|
|
1291
1291
|
top: int = 10,
|
|
1292
|
+
full: bool = False,
|
|
1292
1293
|
) -> None:
|
|
1293
1294
|
"""
|
|
1294
|
-
|
|
1295
|
+
显示数据文件的统计信息。
|
|
1296
|
+
|
|
1297
|
+
默认快速模式:只统计行数和字段结构。
|
|
1298
|
+
完整模式(--full):统计值分布、唯一值、长度等详细信息。
|
|
1295
1299
|
|
|
1296
1300
|
Args:
|
|
1297
1301
|
filename: 输入文件路径,支持 csv/excel/jsonl/json/parquet/arrow/feather 格式
|
|
1298
|
-
top: 显示频率最高的前 N 个值,默认 10
|
|
1302
|
+
top: 显示频率最高的前 N 个值,默认 10(仅完整模式)
|
|
1303
|
+
full: 完整模式,统计值分布、唯一值等详细信息
|
|
1299
1304
|
|
|
1300
1305
|
Examples:
|
|
1301
|
-
dt stats data.jsonl
|
|
1302
|
-
dt stats data.
|
|
1306
|
+
dt stats data.jsonl # 快速模式(默认)
|
|
1307
|
+
dt stats data.jsonl --full # 完整模式
|
|
1308
|
+
dt stats data.csv -f --top=5 # 完整模式,显示 Top 5
|
|
1303
1309
|
"""
|
|
1304
1310
|
filepath = Path(filename)
|
|
1305
1311
|
|
|
@@ -1310,6 +1316,10 @@ def stats(
|
|
|
1310
1316
|
if not _check_file_format(filepath):
|
|
1311
1317
|
return
|
|
1312
1318
|
|
|
1319
|
+
if not full:
|
|
1320
|
+
_quick_stats(filepath)
|
|
1321
|
+
return
|
|
1322
|
+
|
|
1313
1323
|
# 加载数据
|
|
1314
1324
|
try:
|
|
1315
1325
|
data = load_data(str(filepath))
|
|
@@ -1329,6 +1339,142 @@ def stats(
|
|
|
1329
1339
|
_print_stats(filepath.name, total, field_stats)
|
|
1330
1340
|
|
|
1331
1341
|
|
|
1342
|
+
def _quick_stats(filepath: Path) -> None:
    """
    Print a quick overview of a data file: name, size, row count and fields.

    The row count comes from ``_count_rows_fast``; when that returns ``None``
    the non-blank lines of the file are counted instead. Field names and
    types are inferred from at most five sampled records, so no value
    distributions or unique counts are computed here.

    NOTE(review): the ``.json`` branch still parses the whole file before
    taking its sample, so it is not constant-memory for large JSON files.

    Args:
        filepath: Path of the file to summarise. Its lower-cased suffix
            selects how records are sampled (.jsonl, .csv, .parquet,
            .arrow/.feather, .json); any other suffix reports no fields.
    """
    import orjson

    from ..streaming import _count_rows_fast

    ext = filepath.suffix.lower()
    file_size = filepath.stat().st_size

    def format_size(size: float) -> str:
        """Render a byte count with one decimal and a B/KB/MB/GB/TB unit."""
        for unit in ["B", "KB", "MB", "GB"]:
            if size < 1024:
                return f"{size:.1f} {unit}"
            size /= 1024
        return f"{size:.1f} TB"

    # Row count: fast path first, then fall back to counting non-blank lines.
    total = _count_rows_fast(str(filepath))
    if total is None:
        try:
            with open(filepath, "rb") as f:
                total = sum(1 for line in f if line.strip())
        except OSError:
            # Leave the count unknown rather than printing a fake number.
            total = None
    total_text = f"{total:,} 条" if total is not None else "未知"

    # Read a handful of records to infer the field structure.
    sample_size = 5
    sample_data = []
    try:
        if ext == ".jsonl":
            with open(filepath, "rb") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Blank lines must not use up the sample budget.
                        continue
                    sample_data.append(orjson.loads(line))
                    if len(sample_data) >= sample_size:
                        break
        elif ext in (".csv", ".parquet", ".arrow", ".feather"):
            import polars as pl

            scanners = {
                ".csv": pl.scan_csv,
                ".parquet": pl.scan_parquet,
                ".arrow": pl.scan_ipc,
                ".feather": pl.scan_ipc,
            }
            df = scanners[ext](str(filepath)).head(sample_size).collect()
            sample_data = df.to_dicts()
        elif ext == ".json":
            with open(filepath, "rb") as f:
                data = orjson.loads(f.read())
            if isinstance(data, list):
                sample_data = data[:sample_size]
    except Exception:
        # Deliberately best effort: a file that cannot be sampled still gets
        # its size and row count reported, just without a field listing.
        pass

    # Only mapping-like records carry fields; scalar or list items (e.g. a
    # JSON array of numbers) are skipped instead of crashing on ``.keys()``.
    records = [item for item in sample_data if isinstance(item, dict)]

    all_keys = set()
    for record in records:
        all_keys.update(record)

    fields = []
    for key in sorted(all_keys):
        # Infer the type from the sampled non-null values of this key.
        non_null = [record[key] for record in records if record.get(key) is not None]
        field_type = _infer_type(non_null) if non_null else "unknown"
        fields.append({"field": key, "type": field_type})

    # Output
    try:
        from rich.console import Console
        from rich.markup import escape
        from rich.panel import Panel
        from rich.table import Table
    except ImportError:
        # rich is not installed: fall back to plain printing.
        print(f"\n{'=' * 40}")
        print("📊 快速统计")
        print(f"{'=' * 40}")
        print(f"文件: {filepath.name}")
        print(f"大小: {format_size(file_size)}")
        print(f"总数: {total_text}")
        print(f"字段: {len(fields)} 个")

        if fields:
            print("\n📋 字段结构:")
            for i, field in enumerate(fields, 1):
                print(f" {i}. {field['field']} ({field['type']})")
        return

    console = Console()

    # Overview panel. The file name is escaped so that brackets in it are
    # shown literally instead of being parsed as rich markup.
    console.print(
        Panel(
            f"[bold]文件:[/bold] {escape(filepath.name)}\n"
            f"[bold]大小:[/bold] {format_size(file_size)}\n"
            f"[bold]总数:[/bold] {total_text}\n"
            f"[bold]字段:[/bold] {len(fields)} 个",
            title="📊 快速统计",
            expand=False,
        )
    )

    if fields:
        table = Table(title="📋 字段结构", show_header=True, header_style="bold cyan")
        table.add_column("#", style="dim", justify="right")
        table.add_column("字段", style="green")
        table.add_column("类型", style="yellow")

        for i, field in enumerate(fields, 1):
            table.add_row(str(i), escape(str(field["field"])), field["type"])

        console.print(table)
|
|
1476
|
+
|
|
1477
|
+
|
|
1332
1478
|
def _compute_field_stats(data: List[Dict], top: int) -> List[Dict[str, Any]]:
|
|
1333
1479
|
"""
|
|
1334
1480
|
单次遍历计算每个字段的统计信息。
|
|
@@ -1410,22 +1556,34 @@ def _count_unique(values: List[Any], field_type: str) -> int:
|
|
|
1410
1556
|
"""
|
|
1411
1557
|
计算唯一值数量。
|
|
1412
1558
|
|
|
1413
|
-
对于简单类型直接比较,对于 list/dict
|
|
1559
|
+
对于简单类型直接比较,对于 list/dict 或混合类型使用 hash。
|
|
1414
1560
|
"""
|
|
1415
1561
|
if field_type in ("list", "dict"):
|
|
1416
|
-
|
|
1417
|
-
|
|
1562
|
+
return _count_unique_by_hash(values)
|
|
1563
|
+
else:
|
|
1564
|
+
# 简单类型:尝试直接比较,失败则回退到 hash 方式
|
|
1565
|
+
try:
|
|
1566
|
+
return len(set(values))
|
|
1567
|
+
except TypeError:
|
|
1568
|
+
# 混合类型(如字段中既有 str 又有 dict),回退到 hash
|
|
1569
|
+
return _count_unique_by_hash(values)
|
|
1570
|
+
|
|
1418
1571
|
|
|
1419
|
-
|
|
1572
|
+
def _count_unique_by_hash(values: List[Any]) -> int:
    """
    Count distinct values by fingerprinting their orjson serialization.

    Each value is serialized with sorted keys and reduced to an MD5 digest;
    a value whose serialization raises ``TypeError`` is represented by its
    ``repr`` instead.

    Args:
        values: Values to de-duplicate; they do not need to be hashable.

    Returns:
        The number of distinct fingerprints collected.
    """
    import hashlib

    import orjson

    fingerprints = set()
    for value in values:
        try:
            payload = orjson.dumps(value, option=orjson.OPT_SORT_KEYS)
        except TypeError:
            # Not serializable: fall back to the textual representation.
            fingerprints.add(repr(value))
        else:
            fingerprints.add(hashlib.md5(payload).digest())
    return len(fingerprints)
|
|
1429
1587
|
|
|
1430
1588
|
|
|
1431
1589
|
def _infer_type(values: List[Any]) -> str:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dtflow
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
|
|
5
5
|
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
|
|
6
6
|
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
dtflow/__init__.py,sha256=
|
|
2
|
-
dtflow/__main__.py,sha256=
|
|
1
|
+
dtflow/__init__.py,sha256=outMIxwn2hlX6wMUfLFrBkDVknGEzJOkNUHOUWfEGuw,2347
|
|
2
|
+
dtflow/__main__.py,sha256=eVzGbJQxhoOvvmk1076AzT7VscOXSJu18VGGdZ9r9h8,11359
|
|
3
3
|
dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
|
|
4
4
|
dtflow/core.py,sha256=szm9qmRVe1Q97O18UTGz7xTsdV-V8L4D6Bl1bxBJCWk,28778
|
|
5
5
|
dtflow/lineage.py,sha256=vQ06lxBHftu-Ma5HlISp3F2eiIvwagQSnUGaLeABDZY,12190
|
|
@@ -8,7 +8,7 @@ dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
|
|
|
8
8
|
dtflow/streaming.py,sha256=lYf9gi5U-3oqr7oEe5mENx1r-LtRb2YfGNq1fP3_sw4,21972
|
|
9
9
|
dtflow/tokenizers.py,sha256=zxE6XZGjZ_DOGCjRSClI9xaAbFVf8FS6jwwssGoi_9U,18111
|
|
10
10
|
dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
|
|
11
|
-
dtflow/cli/commands.py,sha256=
|
|
11
|
+
dtflow/cli/commands.py,sha256=IZ2rHnJ7RHmXOW19JLjtHPfzbfNj5vq_FT2YDSao2SI,90303
|
|
12
12
|
dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
|
|
13
13
|
dtflow/mcp/__main__.py,sha256=PoT2ZZmJq9xDZxDACJfqDW9Ld_ukHrGNK-0XUd7WGnY,448
|
|
14
14
|
dtflow/mcp/cli.py,sha256=ck0oOS_642cNktxULaMRE7BJfMxsBCwotmCj3PSPwVk,13110
|
|
@@ -19,7 +19,7 @@ dtflow/storage/io.py,sha256=XNWLL10a7jgOjM1IfTN9kIuW23dwzFE1nnaw4E3LaiU,21885
|
|
|
19
19
|
dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
|
|
20
20
|
dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
|
|
21
21
|
dtflow/utils/field_path.py,sha256=WcNA-LZh3H61a77FEzB_R7YAyyZl3M8ofdq05ytQGmI,7459
|
|
22
|
-
dtflow-0.4.
|
|
23
|
-
dtflow-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
24
|
-
dtflow-0.4.1.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
25
|
-
dtflow-0.4.1.dist-info/RECORD,,
|
|
22
|
+
dtflow-0.4.2.dist-info/METADATA,sha256=Rck3BDh1Vvpr24fUvCAcnmPXQOrZxTu_OYMAvJHADnU,18306
|
|
23
|
+
dtflow-0.4.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
24
|
+
dtflow-0.4.2.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
25
|
+
dtflow-0.4.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|