dtflow 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/SKILL.md +245 -0
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +70 -55
- dtflow/cli/clean.py +204 -8
- dtflow/cli/commands.py +16 -10
- dtflow/cli/skill.py +72 -0
- dtflow/cli/stats.py +247 -40
- dtflow/cli/validate.py +52 -19
- dtflow/parallel.py +115 -0
- dtflow/schema.py +99 -13
- dtflow/tokenizers.py +104 -21
- {dtflow-0.5.6.dist-info → dtflow-0.5.8.dist-info}/METADATA +47 -4
- {dtflow-0.5.6.dist-info → dtflow-0.5.8.dist-info}/RECORD +15 -17
- dtflow/mcp/__init__.py +0 -29
- dtflow/mcp/__main__.py +0 -18
- dtflow/mcp/cli.py +0 -388
- dtflow/mcp/docs.py +0 -416
- dtflow/mcp/server.py +0 -153
- {dtflow-0.5.6.dist-info → dtflow-0.5.8.dist-info}/WHEEL +0 -0
- {dtflow-0.5.6.dist-info → dtflow-0.5.8.dist-info}/entry_points.txt +0 -0
dtflow/cli/stats.py
CHANGED
|
@@ -3,7 +3,7 @@ CLI 数据统计相关命令
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Dict, List
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
7
|
|
|
8
8
|
import orjson
|
|
9
9
|
|
|
@@ -22,6 +22,8 @@ def stats(
|
|
|
22
22
|
filename: str,
|
|
23
23
|
top: int = 10,
|
|
24
24
|
full: bool = False,
|
|
25
|
+
fields: Optional[List[str]] = None,
|
|
26
|
+
expand_fields: Optional[List[str]] = None,
|
|
25
27
|
) -> None:
|
|
26
28
|
"""
|
|
27
29
|
显示数据文件的统计信息。
|
|
@@ -33,11 +35,15 @@ def stats(
|
|
|
33
35
|
filename: 输入文件路径,支持 csv/excel/jsonl/json/parquet/arrow/feather 格式
|
|
34
36
|
top: 显示频率最高的前 N 个值,默认 10(仅完整模式)
|
|
35
37
|
full: 完整模式,统计值分布、唯一值等详细信息
|
|
38
|
+
fields: 指定统计的字段列表(支持嵌套路径)
|
|
39
|
+
expand_fields: 展开 list 字段统计的字段列表
|
|
36
40
|
|
|
37
41
|
Examples:
|
|
38
42
|
dt stats data.jsonl # 快速模式(默认)
|
|
39
43
|
dt stats data.jsonl --full # 完整模式
|
|
40
44
|
dt stats data.csv -f --top=5 # 完整模式,显示 Top 5
|
|
45
|
+
dt stats data.jsonl --full --field=category # 指定字段
|
|
46
|
+
dt stats data.jsonl --full --expand=tags # 展开 list 字段
|
|
41
47
|
"""
|
|
42
48
|
filepath = Path(filename)
|
|
43
49
|
|
|
@@ -48,7 +54,10 @@ def stats(
|
|
|
48
54
|
if not _check_file_format(filepath):
|
|
49
55
|
return
|
|
50
56
|
|
|
57
|
+
# 快速模式:忽略 --field 和 --expand 参数
|
|
51
58
|
if not full:
|
|
59
|
+
if fields or expand_fields:
|
|
60
|
+
print("⚠️ 警告: --field 和 --expand 参数仅在完整模式 (--full) 下生效")
|
|
52
61
|
_quick_stats(filepath)
|
|
53
62
|
return
|
|
54
63
|
|
|
@@ -65,7 +74,7 @@ def stats(
|
|
|
65
74
|
|
|
66
75
|
# 计算统计信息
|
|
67
76
|
total = len(data)
|
|
68
|
-
field_stats = _compute_field_stats(data, top)
|
|
77
|
+
field_stats = _compute_field_stats(data, top, fields, expand_fields)
|
|
69
78
|
|
|
70
79
|
# 输出统计信息
|
|
71
80
|
_print_stats(filepath.name, total, field_stats)
|
|
@@ -200,16 +209,104 @@ def _quick_stats(filepath: Path) -> None:
|
|
|
200
209
|
print(f"字段: {len(fields)} 个")
|
|
201
210
|
|
|
202
211
|
if fields:
|
|
203
|
-
print(
|
|
212
|
+
print("\n📋 字段结构:")
|
|
204
213
|
for i, f in enumerate(fields, 1):
|
|
205
214
|
print(f" {i}. {f['field']} ({f['type']})")
|
|
206
215
|
|
|
207
216
|
|
|
208
|
-
def
|
|
217
|
+
def _extract_with_wildcard(item: dict, field_spec: str) -> List[Any]:
|
|
218
|
+
"""处理包含 [*] 的字段路径,返回所有值"""
|
|
219
|
+
if "[*]" not in field_spec:
|
|
220
|
+
# 无 [*],直接返回单个值的列表
|
|
221
|
+
value = get_field_with_spec(item, field_spec)
|
|
222
|
+
return [value] if value is not None else []
|
|
223
|
+
|
|
224
|
+
# 分割路径:messages[*].role -> ("messages", ".role")
|
|
225
|
+
before, after = field_spec.split("[*]", 1)
|
|
226
|
+
after = after.lstrip(".") # 移除开头的点
|
|
227
|
+
|
|
228
|
+
# 获取数组
|
|
229
|
+
array = get_field_with_spec(item, before) if before else item
|
|
230
|
+
if not isinstance(array, list):
|
|
231
|
+
return []
|
|
232
|
+
|
|
233
|
+
# 提取每个元素的后续路径
|
|
234
|
+
results = []
|
|
235
|
+
for elem in array:
|
|
236
|
+
if after:
|
|
237
|
+
val = get_field_with_spec(elem, after)
|
|
238
|
+
else:
|
|
239
|
+
val = elem
|
|
240
|
+
if val is not None:
|
|
241
|
+
results.append(val)
|
|
242
|
+
|
|
243
|
+
return results
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _extract_field_values(
|
|
247
|
+
data: List[Dict],
|
|
248
|
+
field_spec: str,
|
|
249
|
+
expand: bool = False,
|
|
250
|
+
) -> List[Any]:
|
|
251
|
+
"""
|
|
252
|
+
从数据中提取字段值。
|
|
253
|
+
|
|
254
|
+
Args:
|
|
255
|
+
data: 数据列表
|
|
256
|
+
field_spec: 字段路径规格(如 "messages[*].role")
|
|
257
|
+
expand: 是否展开 list
|
|
258
|
+
|
|
259
|
+
Returns:
|
|
260
|
+
值列表(展开或不展开)
|
|
261
|
+
"""
|
|
262
|
+
all_values = []
|
|
263
|
+
|
|
264
|
+
for item in data:
|
|
265
|
+
if "[*]" in field_spec or expand:
|
|
266
|
+
# 使用通配符提取所有值
|
|
267
|
+
values = _extract_with_wildcard(item, field_spec)
|
|
268
|
+
|
|
269
|
+
if expand and len(values) == 1 and isinstance(values[0], list):
|
|
270
|
+
# 展开模式:如果返回单个列表,展开其元素
|
|
271
|
+
all_values.extend(values[0])
|
|
272
|
+
elif expand and values and isinstance(values[0], list):
|
|
273
|
+
# 多个列表,全部展开
|
|
274
|
+
for v in values:
|
|
275
|
+
if isinstance(v, list):
|
|
276
|
+
all_values.extend(v)
|
|
277
|
+
else:
|
|
278
|
+
all_values.append(v)
|
|
279
|
+
else:
|
|
280
|
+
# 不展开或非列表值
|
|
281
|
+
all_values.extend(values)
|
|
282
|
+
else:
|
|
283
|
+
# 普通字段路径
|
|
284
|
+
value = get_field_with_spec(item, field_spec)
|
|
285
|
+
if expand and isinstance(value, list):
|
|
286
|
+
# 展开 list
|
|
287
|
+
all_values.extend(value)
|
|
288
|
+
else:
|
|
289
|
+
all_values.append(value)
|
|
290
|
+
|
|
291
|
+
return all_values
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _compute_field_stats(
|
|
295
|
+
data: List[Dict],
|
|
296
|
+
top: int,
|
|
297
|
+
fields: Optional[List[str]] = None,
|
|
298
|
+
expand_fields: Optional[List[str]] = None,
|
|
299
|
+
) -> List[Dict[str, Any]]:
|
|
209
300
|
"""
|
|
210
301
|
单次遍历计算每个字段的统计信息。
|
|
211
302
|
|
|
212
303
|
优化:将多次遍历合并为单次遍历,在遍历过程中同时收集所有统计数据。
|
|
304
|
+
|
|
305
|
+
Args:
|
|
306
|
+
data: 数据列表
|
|
307
|
+
top: Top N 值数量
|
|
308
|
+
fields: 指定统计的字段列表
|
|
309
|
+
expand_fields: 展开 list 字段统计的字段列表
|
|
213
310
|
"""
|
|
214
311
|
from collections import Counter, defaultdict
|
|
215
312
|
|
|
@@ -218,38 +315,115 @@ def _compute_field_stats(data: List[Dict], top: int) -> List[Dict[str, Any]]:
|
|
|
218
315
|
|
|
219
316
|
total = len(data)
|
|
220
317
|
|
|
221
|
-
#
|
|
222
|
-
|
|
223
|
-
|
|
318
|
+
# 如果没有指定字段,统计所有顶层字段(保持向后兼容)
|
|
319
|
+
if not fields and not expand_fields:
|
|
320
|
+
# 单次遍历收集所有字段的值和统计信息
|
|
321
|
+
field_values = defaultdict(list) # 存储每个字段的所有值
|
|
322
|
+
field_counters = defaultdict(Counter) # 存储每个字段的值频率(用于 top N)
|
|
323
|
+
|
|
324
|
+
for item in data:
|
|
325
|
+
for k, v in item.items():
|
|
326
|
+
field_values[k].append(v)
|
|
327
|
+
# 对值进行截断后计数(用于 top N 显示)
|
|
328
|
+
displayable = _truncate(v if v is not None else "", 30)
|
|
329
|
+
field_counters[k][displayable] += 1
|
|
330
|
+
|
|
331
|
+
# 根据收集的数据计算统计信息
|
|
332
|
+
stats_list = []
|
|
333
|
+
for field in sorted(field_values.keys()):
|
|
334
|
+
values = field_values[field]
|
|
335
|
+
non_null = [v for v in values if v is not None and v != ""]
|
|
336
|
+
non_null_count = len(non_null)
|
|
337
|
+
|
|
338
|
+
# 推断类型(从第一个非空值)
|
|
339
|
+
field_type = _infer_type(non_null)
|
|
340
|
+
|
|
341
|
+
# 基础统计
|
|
342
|
+
stat = {
|
|
343
|
+
"field": field,
|
|
344
|
+
"non_null": non_null_count,
|
|
345
|
+
"null_rate": f"{non_null_count / total * 100:.1f}%",
|
|
346
|
+
"type": field_type,
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
# 类型特定统计
|
|
350
|
+
if non_null:
|
|
351
|
+
# 唯一值计数(对复杂类型使用 hash 节省内存)
|
|
352
|
+
stat["unique"] = _count_unique(non_null, field_type)
|
|
353
|
+
|
|
354
|
+
# 字符串类型:计算长度统计
|
|
355
|
+
if field_type == "str":
|
|
356
|
+
lengths = [len(str(v)) for v in non_null]
|
|
357
|
+
stat["len_min"] = min(lengths)
|
|
358
|
+
stat["len_max"] = max(lengths)
|
|
359
|
+
stat["len_avg"] = sum(lengths) / len(lengths)
|
|
360
|
+
|
|
361
|
+
# 数值类型:计算数值统计
|
|
362
|
+
elif field_type in ("int", "float"):
|
|
363
|
+
nums = [float(v) for v in non_null if _is_numeric(v)]
|
|
364
|
+
if nums:
|
|
365
|
+
stat["min"] = min(nums)
|
|
366
|
+
stat["max"] = max(nums)
|
|
367
|
+
stat["avg"] = sum(nums) / len(nums)
|
|
368
|
+
|
|
369
|
+
# 列表类型:计算长度统计
|
|
370
|
+
elif field_type == "list":
|
|
371
|
+
lengths = [len(v) if isinstance(v, list) else 0 for v in non_null]
|
|
372
|
+
stat["len_min"] = min(lengths)
|
|
373
|
+
stat["len_max"] = max(lengths)
|
|
374
|
+
stat["len_avg"] = sum(lengths) / len(lengths)
|
|
375
|
+
|
|
376
|
+
# Top N 值(已在遍历时收集)
|
|
377
|
+
stat["top_values"] = field_counters[field].most_common(top)
|
|
378
|
+
|
|
379
|
+
stats_list.append(stat)
|
|
380
|
+
|
|
381
|
+
return stats_list
|
|
382
|
+
|
|
383
|
+
# 指定了字段:收集指定字段的统计
|
|
384
|
+
stats_list = []
|
|
385
|
+
expand_set = set(expand_fields) if expand_fields else set()
|
|
224
386
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
# 对值进行截断后计数(用于 top N 显示)
|
|
229
|
-
displayable = _truncate(v if v is not None else "", 30)
|
|
230
|
-
field_counters[k][displayable] += 1
|
|
387
|
+
# 合并字段列表
|
|
388
|
+
all_fields = set(fields) if fields else set()
|
|
389
|
+
all_fields.update(expand_set)
|
|
231
390
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
391
|
+
for field_spec in sorted(all_fields):
|
|
392
|
+
is_expanded = field_spec in expand_set
|
|
393
|
+
|
|
394
|
+
# 提取字段值
|
|
395
|
+
values = _extract_field_values(data, field_spec, expand=is_expanded)
|
|
396
|
+
|
|
397
|
+
# 过滤 None 和空值
|
|
236
398
|
non_null = [v for v in values if v is not None and v != ""]
|
|
237
399
|
non_null_count = len(non_null)
|
|
238
400
|
|
|
239
|
-
#
|
|
401
|
+
# 推断类型
|
|
240
402
|
field_type = _infer_type(non_null)
|
|
241
403
|
|
|
242
404
|
# 基础统计
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
405
|
+
if is_expanded:
|
|
406
|
+
# 展开模式:显示元素总数和平均数,而非非空率
|
|
407
|
+
stat = {
|
|
408
|
+
"field": field_spec,
|
|
409
|
+
"non_null": non_null_count,
|
|
410
|
+
"null_rate": f"总元素: {len(values)}",
|
|
411
|
+
"type": field_type,
|
|
412
|
+
"is_expanded": is_expanded,
|
|
413
|
+
}
|
|
414
|
+
else:
|
|
415
|
+
# 普通模式:显示非空率
|
|
416
|
+
stat = {
|
|
417
|
+
"field": field_spec,
|
|
418
|
+
"non_null": non_null_count,
|
|
419
|
+
"null_rate": f"{non_null_count / total * 100:.1f}%",
|
|
420
|
+
"type": field_type,
|
|
421
|
+
"is_expanded": is_expanded,
|
|
422
|
+
}
|
|
249
423
|
|
|
250
424
|
# 类型特定统计
|
|
251
425
|
if non_null:
|
|
252
|
-
#
|
|
426
|
+
# 唯一值计数
|
|
253
427
|
stat["unique"] = _count_unique(non_null, field_type)
|
|
254
428
|
|
|
255
429
|
# 字符串类型:计算长度统计
|
|
@@ -274,8 +448,12 @@ def _compute_field_stats(data: List[Dict], top: int) -> List[Dict[str, Any]]:
|
|
|
274
448
|
stat["len_max"] = max(lengths)
|
|
275
449
|
stat["len_avg"] = sum(lengths) / len(lengths)
|
|
276
450
|
|
|
277
|
-
# Top N
|
|
278
|
-
|
|
451
|
+
# Top N 值(需要重新计数)
|
|
452
|
+
counter = Counter()
|
|
453
|
+
for v in non_null:
|
|
454
|
+
displayable = _truncate(v if v is not None else "", 30)
|
|
455
|
+
counter[displayable] += 1
|
|
456
|
+
stat["top_values"] = counter.most_common(top)
|
|
279
457
|
|
|
280
458
|
stats_list.append(stat)
|
|
281
459
|
|
|
@@ -343,9 +521,18 @@ def _print_stats(filename: str, total: int, field_stats: List[Dict[str, Any]]) -
|
|
|
343
521
|
table.add_column("统计", style="dim")
|
|
344
522
|
|
|
345
523
|
for stat in field_stats:
|
|
346
|
-
|
|
524
|
+
# 使用 stat 中的 null_rate(支持展开模式的特殊显示)
|
|
525
|
+
if "null_rate" in stat:
|
|
526
|
+
non_null_rate = stat["null_rate"]
|
|
527
|
+
else:
|
|
528
|
+
non_null_rate = f"{stat['non_null'] / total * 100:.0f}%"
|
|
347
529
|
unique = str(stat.get("unique", "-"))
|
|
348
530
|
|
|
531
|
+
# 字段名(添加展开标记)
|
|
532
|
+
field_name = stat["field"]
|
|
533
|
+
if stat.get("is_expanded"):
|
|
534
|
+
field_name += " (展开)"
|
|
535
|
+
|
|
349
536
|
# 构建统计信息字符串
|
|
350
537
|
extra = []
|
|
351
538
|
if "len_avg" in stat:
|
|
@@ -363,7 +550,7 @@ def _print_stats(filename: str, total: int, field_stats: List[Dict[str, Any]]) -
|
|
|
363
550
|
)
|
|
364
551
|
|
|
365
552
|
table.add_row(
|
|
366
|
-
|
|
553
|
+
field_name,
|
|
367
554
|
stat["type"],
|
|
368
555
|
non_null_rate,
|
|
369
556
|
unique,
|
|
@@ -387,12 +574,19 @@ def _print_stats(filename: str, total: int, field_stats: List[Dict[str, Any]]) -
|
|
|
387
574
|
if unique_ratio > 0.9 and stat.get("unique", 0) > 100:
|
|
388
575
|
continue
|
|
389
576
|
|
|
577
|
+
# 字段名(添加展开标记)
|
|
578
|
+
field_display = stat["field"]
|
|
579
|
+
if stat.get("is_expanded"):
|
|
580
|
+
field_display += " (展开)"
|
|
581
|
+
|
|
390
582
|
console.print(
|
|
391
|
-
f"\n[bold cyan]{
|
|
583
|
+
f"\n[bold cyan]{field_display}[/bold cyan] 值分布 (Top {len(top_values)}):"
|
|
392
584
|
)
|
|
393
585
|
max_count = max(c for _, c in top_values) if top_values else 1
|
|
586
|
+
# 展开模式下使用 non_null(元素总数),否则使用 total(数据条数)
|
|
587
|
+
base_count = stat["non_null"] if stat.get("is_expanded") else total
|
|
394
588
|
for value, count in top_values:
|
|
395
|
-
pct = count /
|
|
589
|
+
pct = count / base_count * 100 if base_count > 0 else 0
|
|
396
590
|
bar_len = int(count / max_count * 20) # 按相对比例,最长 20 字符
|
|
397
591
|
bar = "█" * bar_len
|
|
398
592
|
display_value = value if value else "[空]"
|
|
@@ -403,14 +597,14 @@ def _print_stats(filename: str, total: int, field_stats: List[Dict[str, Any]]) -
|
|
|
403
597
|
except ImportError:
|
|
404
598
|
# 没有 rich,使用普通打印
|
|
405
599
|
print(f"\n{'=' * 50}")
|
|
406
|
-
print(
|
|
600
|
+
print("📊 数据概览")
|
|
407
601
|
print(f"{'=' * 50}")
|
|
408
602
|
print(f"文件: {filename}")
|
|
409
603
|
print(f"总数: {total:,} 条")
|
|
410
604
|
print(f"字段: {len(field_stats)} 个")
|
|
411
605
|
|
|
412
606
|
print(f"\n{'=' * 50}")
|
|
413
|
-
print(
|
|
607
|
+
print("📋 字段统计")
|
|
414
608
|
print(f"{'=' * 50}")
|
|
415
609
|
print(f"{'字段':<20} {'类型':<8} {'非空率':<8} {'唯一值':<8}")
|
|
416
610
|
print("-" * 50)
|
|
@@ -426,6 +620,7 @@ def token_stats(
|
|
|
426
620
|
field: str = "messages",
|
|
427
621
|
model: str = "cl100k_base",
|
|
428
622
|
detailed: bool = False,
|
|
623
|
+
workers: Optional[int] = None,
|
|
429
624
|
) -> None:
|
|
430
625
|
"""
|
|
431
626
|
统计数据集的 Token 信息。
|
|
@@ -435,6 +630,7 @@ def token_stats(
|
|
|
435
630
|
field: 要统计的字段(默认 messages),支持嵌套路径语法
|
|
436
631
|
model: 分词器: cl100k_base (默认), qwen2.5, llama3, gpt-4 等
|
|
437
632
|
detailed: 是否显示详细统计
|
|
633
|
+
workers: 并行进程数,None 自动检测,1 禁用并行
|
|
438
634
|
|
|
439
635
|
Examples:
|
|
440
636
|
dt token-stats data.jsonl
|
|
@@ -442,6 +638,7 @@ def token_stats(
|
|
|
442
638
|
dt token-stats data.jsonl --field=conversation.messages
|
|
443
639
|
dt token-stats data.jsonl --field=messages[-1].content # 统计最后一条消息
|
|
444
640
|
dt token-stats data.jsonl --detailed
|
|
641
|
+
dt token-stats data.jsonl --workers=4 # 使用 4 进程
|
|
445
642
|
"""
|
|
446
643
|
filepath = Path(filename)
|
|
447
644
|
|
|
@@ -473,7 +670,7 @@ def token_stats(
|
|
|
473
670
|
|
|
474
671
|
# 尝试使用 rich 进度条
|
|
475
672
|
try:
|
|
476
|
-
from rich.progress import Progress, SpinnerColumn,
|
|
673
|
+
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
|
|
477
674
|
|
|
478
675
|
with Progress(
|
|
479
676
|
SpinnerColumn(),
|
|
@@ -491,14 +688,22 @@ def token_stats(
|
|
|
491
688
|
from ..tokenizers import messages_token_stats
|
|
492
689
|
|
|
493
690
|
stats_result = messages_token_stats(
|
|
494
|
-
data,
|
|
691
|
+
data,
|
|
692
|
+
messages_field=field,
|
|
693
|
+
model=model,
|
|
694
|
+
progress_callback=update_progress,
|
|
695
|
+
workers=workers,
|
|
495
696
|
)
|
|
496
697
|
_print_messages_token_stats(stats_result, detailed)
|
|
497
698
|
else:
|
|
498
699
|
from ..tokenizers import token_stats as compute_token_stats
|
|
499
700
|
|
|
500
701
|
stats_result = compute_token_stats(
|
|
501
|
-
data,
|
|
702
|
+
data,
|
|
703
|
+
fields=field,
|
|
704
|
+
model=model,
|
|
705
|
+
progress_callback=update_progress,
|
|
706
|
+
workers=workers,
|
|
502
707
|
)
|
|
503
708
|
_print_text_token_stats(stats_result, detailed)
|
|
504
709
|
|
|
@@ -509,12 +714,14 @@ def token_stats(
|
|
|
509
714
|
if isinstance(field_value, list) and field_value and isinstance(field_value[0], dict):
|
|
510
715
|
from ..tokenizers import messages_token_stats
|
|
511
716
|
|
|
512
|
-
stats_result = messages_token_stats(
|
|
717
|
+
stats_result = messages_token_stats(
|
|
718
|
+
data, messages_field=field, model=model, workers=workers
|
|
719
|
+
)
|
|
513
720
|
_print_messages_token_stats(stats_result, detailed)
|
|
514
721
|
else:
|
|
515
722
|
from ..tokenizers import token_stats as compute_token_stats
|
|
516
723
|
|
|
517
|
-
stats_result = compute_token_stats(data, fields=field, model=model)
|
|
724
|
+
stats_result = compute_token_stats(data, fields=field, model=model, workers=workers)
|
|
518
725
|
_print_text_token_stats(stats_result, detailed)
|
|
519
726
|
except ImportError as e:
|
|
520
727
|
print(f"错误: {e}")
|
|
@@ -594,7 +801,7 @@ def _print_messages_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
|
|
|
594
801
|
print(f"平均 Token: {stats['avg_tokens']:,} (std: {std:.1f})")
|
|
595
802
|
print(f"范围: {stats['min_tokens']:,} - {stats['max_tokens']:,}")
|
|
596
803
|
|
|
597
|
-
print(
|
|
804
|
+
print("\n📈 百分位分布:")
|
|
598
805
|
print(f" P25: {stats.get('p25', '-'):,} P50: {stats.get('median_tokens', '-'):,}")
|
|
599
806
|
print(f" P75: {stats.get('p75', '-'):,} P90: {stats.get('p90', '-'):,}")
|
|
600
807
|
print(f" P95: {stats.get('p95', '-'):,} P99: {stats.get('p99', '-'):,}")
|
|
@@ -661,7 +868,7 @@ def _print_text_token_stats(stats: Dict[str, Any], detailed: bool) -> None:
|
|
|
661
868
|
print(f"平均 Token: {stats['avg_tokens']:.1f} (std: {std:.1f})")
|
|
662
869
|
print(f"范围: {stats['min_tokens']:,} - {stats['max_tokens']:,}")
|
|
663
870
|
|
|
664
|
-
print(
|
|
871
|
+
print("\n📈 百分位分布:")
|
|
665
872
|
print(f" P25: {stats.get('p25', '-'):,} P50: {stats.get('median_tokens', '-'):,}")
|
|
666
873
|
print(f" P75: {stats.get('p75', '-'):,} P90: {stats.get('p90', '-'):,}")
|
|
667
874
|
print(f" P95: {stats.get('p95', '-'):,} P99: {stats.get('p99', '-'):,}")
|
dtflow/cli/validate.py
CHANGED
|
@@ -6,8 +6,6 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Optional
|
|
7
7
|
|
|
8
8
|
from ..schema import (
|
|
9
|
-
Schema,
|
|
10
|
-
Field,
|
|
11
9
|
alpaca_schema,
|
|
12
10
|
dpo_schema,
|
|
13
11
|
openai_chat_schema,
|
|
@@ -16,7 +14,6 @@ from ..schema import (
|
|
|
16
14
|
from ..storage.io import load_data, save_data
|
|
17
15
|
from .common import _check_file_format
|
|
18
16
|
|
|
19
|
-
|
|
20
17
|
# 预设 Schema 映射
|
|
21
18
|
PRESET_SCHEMAS = {
|
|
22
19
|
"openai_chat": openai_chat_schema,
|
|
@@ -36,6 +33,7 @@ def validate(
|
|
|
36
33
|
filter_invalid: bool = False,
|
|
37
34
|
max_errors: int = 20,
|
|
38
35
|
verbose: bool = False,
|
|
36
|
+
workers: Optional[int] = None,
|
|
39
37
|
) -> None:
|
|
40
38
|
"""
|
|
41
39
|
使用 Schema 验证数据文件。
|
|
@@ -47,11 +45,13 @@ def validate(
|
|
|
47
45
|
filter_invalid: 过滤无效数据并保存
|
|
48
46
|
max_errors: 最多显示的错误数量
|
|
49
47
|
verbose: 显示详细信息
|
|
48
|
+
workers: 并行进程数,None 自动检测,1 禁用并行
|
|
50
49
|
|
|
51
50
|
Examples:
|
|
52
51
|
dt validate data.jsonl --preset=openai_chat
|
|
53
52
|
dt validate data.jsonl --preset=alpaca -o valid.jsonl
|
|
54
53
|
dt validate data.jsonl --preset=chat --filter
|
|
54
|
+
dt validate data.jsonl --preset=chat --workers=4
|
|
55
55
|
"""
|
|
56
56
|
filepath = Path(filename)
|
|
57
57
|
|
|
@@ -99,19 +99,54 @@ def validate(
|
|
|
99
99
|
print(f"总记录数: {total}")
|
|
100
100
|
print()
|
|
101
101
|
|
|
102
|
-
#
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
102
|
+
# 验证(使用并行或串行)
|
|
103
|
+
use_parallel = workers != 1 and total >= 1000
|
|
104
|
+
|
|
105
|
+
if use_parallel:
|
|
106
|
+
# 使用进度条(如果有 rich)
|
|
107
|
+
try:
|
|
108
|
+
from rich.progress import (
|
|
109
|
+
BarColumn,
|
|
110
|
+
Progress,
|
|
111
|
+
SpinnerColumn,
|
|
112
|
+
TaskProgressColumn,
|
|
113
|
+
TextColumn,
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
with Progress(
|
|
117
|
+
SpinnerColumn(),
|
|
118
|
+
TextColumn("[bold blue]验证数据"),
|
|
119
|
+
BarColumn(),
|
|
120
|
+
TaskProgressColumn(),
|
|
121
|
+
) as progress:
|
|
122
|
+
task = progress.add_task("", total=total)
|
|
123
|
+
|
|
124
|
+
def update_progress(current: int, total_count: int):
|
|
125
|
+
progress.update(task, completed=current)
|
|
126
|
+
|
|
127
|
+
valid_data, invalid_results = schema.validate_parallel(
|
|
128
|
+
data, workers=workers, progress_callback=update_progress
|
|
129
|
+
)
|
|
130
|
+
except ImportError:
|
|
131
|
+
print("🔍 验证数据...")
|
|
132
|
+
valid_data, invalid_results = schema.validate_parallel(data, workers=workers)
|
|
133
|
+
|
|
134
|
+
invalid_count = len(invalid_results)
|
|
135
|
+
error_samples = invalid_results[:max_errors]
|
|
136
|
+
else:
|
|
137
|
+
# 串行验证
|
|
138
|
+
valid_data = []
|
|
139
|
+
invalid_count = 0
|
|
140
|
+
error_samples = []
|
|
141
|
+
|
|
142
|
+
for i, item in enumerate(data):
|
|
143
|
+
result = schema.validate(item)
|
|
144
|
+
if result.valid:
|
|
145
|
+
valid_data.append(item)
|
|
146
|
+
else:
|
|
147
|
+
invalid_count += 1
|
|
148
|
+
if len(error_samples) < max_errors:
|
|
149
|
+
error_samples.append((i, result))
|
|
115
150
|
|
|
116
151
|
valid_count = len(valid_data)
|
|
117
152
|
valid_ratio = valid_count / total * 100 if total > 0 else 0
|
|
@@ -138,9 +173,7 @@ def validate(
|
|
|
138
173
|
|
|
139
174
|
# 保存有效数据
|
|
140
175
|
if output or filter_invalid:
|
|
141
|
-
output_path = output or str(filepath).replace(
|
|
142
|
-
filepath.suffix, f"_valid{filepath.suffix}"
|
|
143
|
-
)
|
|
176
|
+
output_path = output or str(filepath).replace(filepath.suffix, f"_valid{filepath.suffix}")
|
|
144
177
|
save_data(valid_data, output_path)
|
|
145
178
|
print(f"✅ 有效数据已保存: {output_path} ({valid_count} 条)")
|
|
146
179
|
|