dtflow 0.5.8__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/tokenizers.py CHANGED
@@ -122,8 +122,8 @@ def _get_tiktoken_encoder(model: str):
             _tokenizer_cache[model] = tiktoken.get_encoding(model)
         else:
             _tokenizer_cache[model] = tiktoken.encoding_for_model(model)
-    except ImportError as e:
-        raise ImportError("tiktoken is required: pip install tiktoken") from e
+    except ImportError:
+        raise ImportError("tiktoken is required: pip install tiktoken")
     return _tokenizer_cache[model]
 
 
@@ -149,12 +149,12 @@ def _get_hf_tokenizer(model: str):
 
             tokenizer = AutoTokenizer.from_pretrained(resolved, trust_remote_code=True)
             _tokenizer_cache[resolved] = ("transformers", tokenizer)
-    except ImportError as e:
+    except ImportError:
         raise ImportError(
             "tokenizers or transformers is required:\n"
             "  pip install tokenizers huggingface_hub (recommended, lighter weight)\n"
             "  pip install transformers"
-        ) from e
+        )
     return _tokenizer_cache[resolved]
 
 
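Note: both hunks above drop explicit exception chaining (`from e`) when re-raising the install hint. A minimal sketch of the difference, outside dtflow, using a hypothetical missing module `missing_dep`:

```python
# Minimal sketch, not dtflow code; "missing_dep" is a hypothetical module name.
def load_old_style():
    try:
        import missing_dep
    except ImportError as e:
        # 0.5.8 style: the traceback marks the original ImportError as the direct cause.
        raise ImportError("missing_dep is required: pip install missing_dep") from e


def load_new_style():
    try:
        import missing_dep
    except ImportError:
        # 0.5.9 style: the original error still appears, but only as implicit context
        # ("During handling of the above exception, another exception occurred").
        raise ImportError("missing_dep is required: pip install missing_dep")
```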
@@ -309,29 +309,12 @@ def _std(counts: List[int], avg: float) -> float:
     return variance**0.5
 
 
-def _count_item_tokens(args: tuple) -> int:
-    """
-    Count tokens for a single item (used for multiprocessing).
-
-    Args:
-        args: (item, fields, model, backend) tuple
-    """
-    item, fields, model, backend = args
-    total = 0
-    for field in fields:
-        value = get_field_with_spec(item, field, default="")
-        if value:
-            total += count_tokens(str(value), model=model, backend=backend)
-    return total
-
-
 def token_stats(
     data: List[Dict[str, Any]],
     fields: Union[str, List[str]],
     model: str = DEFAULT_MODEL,
     backend: Optional[str] = None,
     progress_callback: Optional[Callable[[int, int], None]] = None,
-    workers: Optional[int] = None,
 ) -> Dict[str, Any]:
     """
     Compute token statistics for a dataset.
@@ -342,7 +325,6 @@ def token_stats(
         model: Model name or alias, e.g. "qwen2.5", "gpt-4"
         backend: Backend to use; None auto-detects
         progress_callback: Progress callback receiving (current, total)
-        workers: Number of processes; None auto-detects, 1 disables parallelism
 
     Returns:
         Statistics dict containing:
@@ -360,42 +342,17 @@ def token_stats(
     if not data:
         return {"total_tokens": 0, "count": 0}
 
+    counts = []
     total_items = len(data)
-    _backend = backend or _auto_backend(model)
-
-    # Decide whether to use multiprocessing
-    use_parallel = workers != 1 and total_items >= 1000
-
-    if use_parallel:
-        from .parallel import get_optimal_workers, parallel_imap
-
-        actual_workers = get_optimal_workers(total_items, workers)
-        # Prepare arguments
-        args_list = [(item, fields, model, _backend) for item in data]
-        counts = []
-        for i, result in enumerate(
-            parallel_imap(
-                _count_item_tokens,
-                args_list,
-                workers=actual_workers,
-                threshold=1000,
-            )
-        ):
-            counts.append(result)
-            if progress_callback:
-                progress_callback(i + 1, total_items)
-    else:
-        # Serial processing
-        counts = []
-        for i, item in enumerate(data):
-            total = 0
-            for field in fields:
-                value = get_field_with_spec(item, field, default="")
-                if value:
-                    total += count_tokens(str(value), model=model, backend=_backend)
-            counts.append(total)
-            if progress_callback:
-                progress_callback(i + 1, total_items)
+    for i, item in enumerate(data):
+        total = 0
+        for field in fields:
+            value = get_field_with_spec(item, field, default="")
+            if value:
+                total += count_tokens(str(value), model=model, backend=backend)
+        counts.append(total)
+        if progress_callback:
+            progress_callback(i + 1, total_items)
 
     sorted_counts = sorted(counts)
     avg = sum(counts) / len(counts)
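With the parallel path removed, `token_stats` always iterates serially and the `workers` argument is gone; the rest of the call contract is unchanged. A small usage sketch based on the signature above (the sample data and the assumption that a tokenizer backend such as tiktoken is installed are illustrative only):

```python
from dtflow.tokenizers import token_stats

data = [
    {"question": "What is 2 + 2?", "answer": "4"},
    {"question": "Name a prime number.", "answer": "7"},
]

def on_progress(current: int, total: int) -> None:
    print(f"{current}/{total}", end="\r")

# fields accepts a single field name or a list of field paths.
stats = token_stats(data, fields=["question", "answer"], progress_callback=on_progress)
print(stats)  # statistics dict, e.g. total_tokens and count
```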
@@ -591,27 +548,12 @@ def messages_token_filter(
     return filter_func
 
 
-def _count_messages_tokens_wrapper(args: tuple) -> Optional[Dict[str, int]]:
-    """
-    Count tokens for a single messages list (used for multiprocessing).
-
-    Args:
-        args: (item, messages_field, model, backend) tuple
-    """
-    item, messages_field, model, backend = args
-    messages = get_field_with_spec(item, messages_field, default=[])
-    if messages:
-        return _count_messages_tokens(messages, model=model, backend=backend)
-    return None
-
-
 def messages_token_stats(
     data: List[Dict[str, Any]],
     messages_field: str = "messages",
     model: str = DEFAULT_MODEL,
     backend: Optional[str] = None,
     progress_callback: Optional[Callable[[int, int], None]] = None,
-    workers: Optional[int] = None,
 ) -> Dict[str, Any]:
     """
     Compute token statistics for the messages in a dataset.
@@ -622,7 +564,6 @@ def messages_token_stats(
         model: Model name or alias
         backend: Backend to use; None auto-detects
         progress_callback: Progress callback receiving (current, total)
-        workers: Number of processes; None auto-detects, 1 disables parallelism
 
     Returns:
         Statistics dict containing:
@@ -640,38 +581,14 @@ def messages_token_stats(
     if not data:
         return {"count": 0, "total_tokens": 0}
 
-    total_items = len(data)
-
-    # Decide whether to use multiprocessing
-    use_parallel = workers != 1 and total_items >= 1000
-
     all_stats = []
-    if use_parallel:
-        from .parallel import get_optimal_workers, parallel_imap
-
-        actual_workers = get_optimal_workers(total_items, workers)
-        args_list = [(item, messages_field, model, _backend) for item in data]
-
-        for i, result in enumerate(
-            parallel_imap(
-                _count_messages_tokens_wrapper,
-                args_list,
-                workers=actual_workers,
-                threshold=1000,
-            )
-        ):
-            if result is not None:
-                all_stats.append(result)
-            if progress_callback:
-                progress_callback(i + 1, total_items)
-    else:
-        # Serial processing
-        for i, item in enumerate(data):
-            messages = get_field_with_spec(item, messages_field, default=[])
-            if messages:
-                all_stats.append(_count_messages_tokens(messages, model=model, backend=_backend))
-            if progress_callback:
-                progress_callback(i + 1, total_items)
+    total_items = len(data)
+    for i, item in enumerate(data):
+        messages = get_field_with_spec(item, messages_field, default=[])
+        if messages:
+            all_stats.append(_count_messages_tokens(messages, model=model, backend=_backend))
+        if progress_callback:
+            progress_callback(i + 1, total_items)
 
     if not all_stats:
         return {"count": 0, "total_tokens": 0}
dtflow/utils/text_parser.py ADDED
@@ -0,0 +1,124 @@
+"""
+Text cleaning utilities
+
+Common cleaning helpers for LLM output:
+- strip_think_tags: remove <think>...</think> chain-of-thought content
+- extract_code_snippets: extract ``` code blocks
+- parse_generic_tags: parse <tag>content</tag> style tags
+"""
+
+import re
+from typing import Dict, List
+
+
+def strip_think_tags(text: str) -> str:
+    """Remove content wrapped in <think>...</think>
+
+    Args:
+        text: Input text
+
+    Returns:
+        Text with the chain-of-thought removed
+
+    Examples:
+        >>> strip_think_tags("<think>让我想想...</think>答案是42")
+        '答案是42'
+    """
+    if not text:
+        return text
+    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
+
+
+def extract_code_snippets(text: str, strict: bool = True) -> List[Dict[str, str]]:
+    """Extract ``` code blocks
+
+    Args:
+        text: Input text
+        strict: True matches only the ```lang...``` form; False additionally matches {...} spans
+
+    Returns:
+        List of code snippets, each {"language": ..., "code": ...}
+
+    Examples:
+        >>> extract_code_snippets("```json\\n{\"a\": 1}\\n```")
+        [{'language': 'json', 'code': '{"a": 1}'}]
+    """
+    pattern = r"```(\w+)?\s*([\s\S]*?)```"
+    matches = re.findall(pattern, text)
+
+    code_snippets = []
+    for lang, code in matches:
+        code_snippets.append(
+            {
+                "language": lang.strip() if lang else "unknown",
+                "code": code.strip(),
+            }
+        )
+
+    if not strict:
+        # Remove the matched ``` blocks, then match { ... } in the remaining text
+        text = re.sub(pattern, "", text)
+        brace_matches = re.findall(r"\{[\s\S]*?\}", text)
+        for code in brace_matches:
+            code_snippets.append(
+                {
+                    "language": "unknown",
+                    "code": code.strip(),
+                }
+            )
+
+    return code_snippets
+
+
+def parse_generic_tags(text: str, strict: bool = False) -> Dict[str, str]:
+    """Parse XML-style tags
+
+    Two modes are supported:
+    - strict=True: match only closed tags <label>content</label>
+    - strict=False: also match open-ended tags <label>content; closed tags take priority
+
+    Args:
+        text: Input text
+        strict: Whether to use strict mode
+
+    Returns:
+        {tag name: content} dict
+
+    Examples:
+        >>> parse_generic_tags("<标签>内容</标签>")
+        {'标签': '内容'}
+        >>> parse_generic_tags("<a>hello<b>world", strict=False)
+        {'a': 'hello', 'b': 'world'}
+    """
+    if not text:
+        return {}
+
+    result = {}
+
+    if strict:
+        pattern_closed = r"<([^>]+)>\s*(.*?)\s*</\1>"
+        matches = re.findall(pattern_closed, text, re.DOTALL)
+        for label, content in matches:
+            result[label.strip()] = content.strip()
+    else:
+        remaining_text = str(text)
+
+        # 1. Handle closed tags first
+        def process_closed_tag(match_obj):
+            label = match_obj.group(1).strip()
+            content = match_obj.group(2).strip()
+            result[label] = content
+            return ""
+
+        pattern_closed = r"<([^>]+)>\s*(.*?)\s*</\1>"
+        remaining_text = re.sub(pattern_closed, process_closed_tag, remaining_text, flags=re.DOTALL)
+
+        # 2. Handle open-ended tags in the remaining text
+        pattern_open = r"<([^>]+)>\s*(.*?)(?=<[^>]+>|$)"
+        matches_open = re.findall(pattern_open, remaining_text, re.DOTALL)
+        for label, content in matches_open:
+            label_stripped = label.strip()
+            if label_stripped not in result:
+                result[label_stripped] = content.strip()
+
+    return result
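The new module is plain `re`-based post-processing for LLM output. A short usage sketch of the three helpers (the sample strings are illustrative; the import path follows the `dtflow/utils/text_parser.py` entry in the RECORD below):

```python
from dtflow.utils.text_parser import (
    extract_code_snippets,
    parse_generic_tags,
    strip_think_tags,
)

raw = "<think>reasoning goes here</think>The answer is 42"
print(strip_think_tags(raw))         # -> 'The answer is 42'

reply = 'Here you go:\n```json\n{"a": 1}\n```'
print(extract_code_snippets(reply))  # -> [{'language': 'json', 'code': '{"a": 1}'}]

tagged = "<answer>42</answer><confidence>high"
print(parse_generic_tags(tagged))    # -> {'answer': '42', 'confidence': 'high'}
```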
dtflow-0.5.8.dist-info/METADATA → dtflow-0.5.9.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dtflow
-Version: 0.5.8
+Version: 0.5.9
 Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
 Project-URL: Homepage, https://github.com/yourusername/DataTransformer
 Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -44,6 +44,7 @@ Requires-Dist: flake8>=3.9.0; extra == 'dev'
 Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
 Requires-Dist: isort>=5.9.0; extra == 'dev'
 Requires-Dist: mypy>=0.910; extra == 'dev'
+Requires-Dist: pandas>=1.3.0; extra == 'dev'
 Requires-Dist: pyarrow; extra == 'dev'
 Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
 Requires-Dist: pytest>=6.0.0; extra == 'dev'
@@ -57,10 +58,14 @@ Provides-Extra: docs
 Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
 Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == 'docs'
 Requires-Dist: sphinx>=4.0.0; extra == 'docs'
+Provides-Extra: eval
+Requires-Dist: pandas>=1.3.0; extra == 'eval'
+Requires-Dist: scikit-learn>=0.24.0; extra == 'eval'
 Provides-Extra: full
 Requires-Dist: datasets>=2.0.0; extra == 'full'
 Requires-Dist: datasketch>=1.5.0; extra == 'full'
 Requires-Dist: huggingface-hub>=0.20.0; extra == 'full'
+Requires-Dist: pandas>=1.3.0; extra == 'full'
 Requires-Dist: pyarrow; extra == 'full'
 Requires-Dist: rich>=10.0.0; extra == 'full'
 Requires-Dist: scikit-learn>=0.24.0; extra == 'full'
@@ -435,6 +440,13 @@ dt sample data.jsonl 1000 --by=messages.#   # stratified sampling by message count
 dt sample data.jsonl --where="category=tech"     # filter, then sample
 dt sample data.jsonl --where="messages.#>=2"     # multiple filter conditions
 
+# View rows by range (Python slice syntax)
+dt slice data.jsonl 10:20                        # rows 10-19 (0-based, half-open)
+dt slice data.jsonl :100                         # first 100 rows
+dt slice data.jsonl 100:                         # from row 100 to the end
+dt slice data.jsonl 10:20 -o sliced.jsonl        # save to a file
+dt slice data.jsonl 10:20 -f question,answer     # show only the given fields
+
 # Data transformation - preset mode
 dt transform data.jsonl --preset=openai_chat
 dt transform data.jsonl --preset=alpaca
@@ -452,7 +464,6 @@ dt run pipeline.yaml --input=new_data.jsonl --output=result.jsonl
 dt token-stats data.jsonl --field=messages --model=gpt-4
 dt token-stats data.jsonl --field=messages[-1].content   # count only the last message
 dt token-stats data.jsonl --field=text --detailed
-dt token-stats data.jsonl --workers=4                    # multiprocessing speed-up (auto-enabled for large datasets)
 
 # Data comparison
 dt diff v1/train.jsonl v2/train.jsonl
@@ -469,6 +480,9 @@ dt clean data.jsonl --max-len=messages[-1].content:500   # last message at most 500 chars
 dt clean data.jsonl --keep=question,answer   # keep only these fields
 dt clean data.jsonl --drop=metadata          # drop the given fields
 dt clean data.jsonl --strip                  # strip leading/trailing whitespace from strings
+dt clean data.jsonl --min-tokens=content:10         # at least 10 tokens
+dt clean data.jsonl --max-tokens=content:1000       # at most 1000 tokens
+dt clean data.jsonl --min-tokens=text:50 -m gpt-4   # choose the tokenizer
 
 # Data deduplication
 dt dedupe data.jsonl                         # exact dedup on all fields
@@ -477,6 +491,17 @@ dt dedupe data.jsonl --key=meta.id   # dedup by a nested field
 dt dedupe data.jsonl --key=messages[0].content   # dedup by the first message's content
 dt dedupe data.jsonl --key=text --similar=0.8    # similarity-based dedup
 
+# Dataset splitting
+dt split data.jsonl --ratio=0.8 --seed=42        # two-way: train/test
+dt split data.jsonl --ratio=0.7,0.15,0.15        # three-way: train/val/test
+dt split data.jsonl --ratio=0.8 -o /tmp/output   # choose the output directory
+
+# Export for training frameworks
+dt export data.jsonl --framework=llama-factory   # export to LLaMA-Factory
+dt export data.jsonl -f swift -o ./swift_out     # export to ms-swift
+dt export data.jsonl -f axolotl                  # export to Axolotl
+dt export data.jsonl -f llama-factory --check    # compatibility check only
+
 # File concatenation
 dt concat a.jsonl b.jsonl -o merged.jsonl
 
@@ -496,7 +521,6 @@ dt validate data.jsonl --preset=openai_chat   # validate against a preset schema
 dt validate data.jsonl --preset=alpaca --verbose                           # verbose output
 dt validate data.jsonl --preset=sharegpt --filter-invalid -o valid.jsonl   # keep only valid records
 dt validate data.jsonl --preset=dpo --max-errors=100                       # limit the number of reported errors
-dt validate data.jsonl --preset=openai_chat --workers=4                    # multiprocessing speed-up
 ```
 
 ### Field Path Syntax
@@ -522,6 +546,8 @@ Field arguments in CLI commands support nested path syntax for accessing deeply nested
 | `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
 | `clean` | `--min-len=` | `--min-len=messages.#:2` |
 | `clean` | `--max-len=` | `--max-len=messages[-1].content:500` |
+| `clean` | `--min-tokens=` | `--min-tokens=content:10` |
+| `clean` | `--max-tokens=` | `--max-tokens=content:1000` |
 | `token-stats` | `--field=` | `--field=messages[-1].content` |
 | `diff` | `--key=` | `--key=meta.uuid` |
dtflow-0.5.8.dist-info/RECORD → dtflow-0.5.9.dist-info/RECORD RENAMED
@@ -1,35 +1,39 @@
-dtflow/SKILL.md,sha256=nh12TTq_eRzl5O2CTgsiS809BBVR49kmpZ8n7UprMHI,9552
-dtflow/__init__.py,sha256=tofhUr_PMnsONnB3Hu-mwUrD4Q3bV7Kw_0S6dQw6ig8,3031
-dtflow/__main__.py,sha256=p8oZKQhwq04shCB3y_pkXjf-SZ4PZvg5PXdyUP-5rYA,13497
+dtflow/SKILL.md,sha256=hPxJhroGmNbBv8MLZUkOA2yW1TDdUKEUYYlz9tW2mao,10393
+dtflow/__init__.py,sha256=9ZqhqD8qQM9w2dfHKyUWIaqSX-X4elWtbaQN4CNBhgg,3031
+dtflow/__main__.py,sha256=gg3v7u-Ot7AicgKrP1fuyKtMJXVduNuLmhy7L1LUPDg,17710
 dtflow/converters.py,sha256=X3qeFD7FCOMnfiP3MicL5MXimOm4XUYBs5pczIkudU0,22331
 dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
+dtflow/eval.py,sha256=_c-XP2zsOBznYltSyKEScOqvmPVX2orqepg5cNhXXB0,9836
 dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
 dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
-dtflow/parallel.py,sha256=EnIdGEGMrZUNT2-CBIV93UFfpqr_jU_heqqvdGXcP-Y,3046
 dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
 dtflow/presets.py,sha256=qa8WQJhbNMuGxqqgA9BFadEBwDB9s0zWNxxhzF3q1K8,4701
-dtflow/schema.py,sha256=zCZNEAqTMT1BS_p2t0CYczR5S9rqyDREa7ZsYI5pFGA,19885
+dtflow/schema.py,sha256=IFcij22_UFKcgKT1YWwRg2QJO0vcAvCb1arZmsGByts,16824
 dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
-dtflow/tokenizers.py,sha256=GFQsuLSLn2GHn2kaXhJkP8G85lgsdLzYtJNbppQhYPE,23408
+dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
 dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
-dtflow/cli/clean.py,sha256=KuE9ODjD9gSZUIHaD2mQLTDO-1PDwN7EqUpj8EQfVCs,25663
-dtflow/cli/commands.py,sha256=zKUG-B9Az-spqyqM00cR8Sgc2UgeOPQDThJFHWDNO_w,1336
+dtflow/cli/clean.py,sha256=BEQQlH2q6luCbx51M3oxxOwcnwlOA8vo9WX3Fp7I6AY,29498
+dtflow/cli/commands.py,sha256=LvyDQ_nWUM7UlPDEFQadRdw5O2ZKDLgF41_xAJRhYxI,1583
 dtflow/cli/common.py,sha256=gCwnF5Sw2ploqfZJO_z3Ms9mR1HNT7Lj6ydHn0uVaIw,13817
+dtflow/cli/eval.py,sha256=c53kCRH86k2Q_6vESKFlcepcNnTpO9O68agWK4_oJj8,9582
+dtflow/cli/export.py,sha256=loRfVPwEVsDw3ZMKEYGp0Hy38kYZG2QT8JCMbz1dRzU,2156
 dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
 dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
 dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
-dtflow/cli/sample.py,sha256=pubpx4AIzsarBEalD150MC2apYQSt4bal70IZkTfFO0,15475
+dtflow/cli/sample.py,sha256=etbro5I0pyNgn0Qfhp1M6Bh-95JN-AntDa5AwVe_oKY,18269
 dtflow/cli/skill.py,sha256=opiTEBejA7JHKrEMftMOPDQlOgZ4n59rwaHXGU1Nukk,2022
-dtflow/cli/stats.py,sha256=HkTZD80h4tzYXTtMnfpjLUMP6kl_es6ifcmExxzGdMU,31813
+dtflow/cli/split.py,sha256=96bhWnxHnjIqifoliLgciApkLbwQU8bWHovK8bcMk9g,3667
+dtflow/cli/stats.py,sha256=Jx3d4X0ftgpzU5q5RAWZEVJWwXviQTF4EAwBmz1IliA,31366
 dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
-dtflow/cli/validate.py,sha256=Frs-jKcDHmYozpmIYZueDSX5o2i1Xn-WW81FGUyUrng,5796
+dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
 dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,362
 dtflow/storage/io.py,sha256=ZH2aSE-S89gpy3z4oTqhcqWf4u10OdkDoyul7o_YBDI,23374
 dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
 dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
 dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
 dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
-dtflow-0.5.8.dist-info/METADATA,sha256=Tm_dfdQfGlShyDt95fNQ87JXiBRnf6mfDgx827h3Rnc,24487
-dtflow-0.5.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-dtflow-0.5.8.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
-dtflow-0.5.8.dist-info/RECORD,,
+dtflow/utils/text_parser.py,sha256=0t2TMOSha4dTiDu9H4ygdb67cI20zhtBH1XavDspL_g,3727
+dtflow-0.5.9.dist-info/METADATA,sha256=Pu92Dz2vj7U_dki4A0e5xgka36BTT9K2PnN1LIeEhN0,25839
+dtflow-0.5.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+dtflow-0.5.9.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
+dtflow-0.5.9.dist-info/RECORD,,
dtflow/parallel.py DELETED
@@ -1,115 +0,0 @@
-"""
-Parallel processing module
-
-Multiprocessing helpers for speeding up token statistics and schema validation on large datasets.
-"""
-
-from multiprocessing import Pool, cpu_count
-from typing import Callable, List, Optional, TypeVar
-
-T = TypeVar("T")
-R = TypeVar("R")
-
-
-def parallel_map(
-    func: Callable[[T], R],
-    data: List[T],
-    workers: Optional[int] = None,
-    threshold: int = 1000,
-    chunksize: Optional[int] = None,
-) -> List[R]:
-    """
-    Parallel map.
-
-    Args:
-        func: Processing function (must be picklable; not a lambda or closure)
-        data: List of items
-        workers: Number of processes; None uses the CPU core count
-        threshold: Data-size threshold below which processing is serial
-        chunksize: Task chunk size per process; None computes it automatically
-
-    Returns:
-        List of results (order preserved)
-    """
-    n = len(data)
-
-    # Small input or single process requested: run serially
-    if n < threshold or workers == 1:
-        return [func(item) for item in data]
-
-    workers = workers or cpu_count()
-    workers = min(workers, n)  # No more processes than items
-
-    # Compute chunksize automatically
-    if chunksize is None:
-        chunksize = max(1, n // (workers * 4))
-
-    with Pool(processes=workers) as pool:
-        return pool.map(func, data, chunksize=chunksize)
-
-
-def parallel_imap(
-    func: Callable[[T], R],
-    data: List[T],
-    workers: Optional[int] = None,
-    threshold: int = 1000,
-    chunksize: Optional[int] = None,
-):
-    """
-    Parallel imap (lazy iterator variant, supports progress callbacks).
-
-    Args:
-        func: Processing function (must be picklable)
-        data: List of items
-        workers: Number of processes; None uses the CPU core count
-        threshold: Data-size threshold below which processing is serial
-        chunksize: Task chunk size per process
-
-    Yields:
-        Results (in order)
-    """
-    n = len(data)
-
-    # Small input or single process requested: run serially
-    if n < threshold or workers == 1:
-        for item in data:
-            yield func(item)
-        return
-
-    workers = workers or cpu_count()
-    workers = min(workers, n)
-
-    if chunksize is None:
-        chunksize = max(1, n // (workers * 4))
-
-    with Pool(processes=workers) as pool:
-        for result in pool.imap(func, data, chunksize=chunksize):
-            yield result
-
-
-def get_optimal_workers(data_size: int, default: Optional[int] = None) -> int:
-    """
-    Compute the optimal number of processes for a given data size.
-
-    Args:
-        data_size: Number of items
-        default: User-specified number of processes; None computes automatically
-
-    Returns:
-        Optimal number of processes
-    """
-    if default is not None:
-        return default
-
-    cpu_cores = cpu_count()
-
-    # Below the threshold: single process
-    if data_size < 1000:
-        return 1
-
-    # Medium-sized data: use half the CPUs
-    if data_size < 10000:
-        return max(1, cpu_cores // 2)
-
-    # Large data: use all CPUs
-    return cpu_cores
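Since 0.5.9 removes this module along with the `--workers` flags, callers that still want parallel token counting can wrap `count_tokens` with the standard library themselves. A minimal sketch (the sample texts, model choice, and chunk size are arbitrary, and a tokenizer backend such as tiktoken must be installed):

```python
from multiprocessing import Pool, cpu_count

from dtflow.tokenizers import count_tokens


def _count(text: str) -> int:
    # Top-level function so it stays picklable for the process pool.
    return count_tokens(text, model="gpt-4")


if __name__ == "__main__":
    texts = ["hello world"] * 10_000
    with Pool(processes=cpu_count()) as pool:
        counts = pool.map(_count, texts, chunksize=256)
    print(sum(counts))
```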
File without changes