dtflow 0.4.1__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {dtflow-0.4.1 → dtflow-0.4.3}/PKG-INFO +12 -1
  2. {dtflow-0.4.1 → dtflow-0.4.3}/README.md +11 -0
  3. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/__init__.py +1 -1
  4. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/__main__.py +19 -7
  5. dtflow-0.4.3/dtflow/cli/clean.py +486 -0
  6. dtflow-0.4.3/dtflow/cli/commands.py +56 -0
  7. dtflow-0.4.3/dtflow/cli/common.py +384 -0
  8. dtflow-0.4.3/dtflow/cli/io_ops.py +385 -0
  9. dtflow-0.4.3/dtflow/cli/lineage.py +49 -0
  10. dtflow-0.4.3/dtflow/cli/pipeline.py +54 -0
  11. dtflow-0.4.3/dtflow/cli/sample.py +294 -0
  12. dtflow-0.4.3/dtflow/cli/stats.py +589 -0
  13. dtflow-0.4.3/dtflow/cli/transform.py +486 -0
  14. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/core.py +35 -0
  15. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/storage/io.py +49 -6
  16. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/streaming.py +25 -4
  17. {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_transformer.py +33 -4
  18. dtflow-0.4.1/dtflow/cli/commands.py +0 -2482
  19. {dtflow-0.4.1 → dtflow-0.4.3}/.gitignore +0 -0
  20. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/cli/__init__.py +0 -0
  21. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/converters.py +0 -0
  22. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/lineage.py +0 -0
  23. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/__init__.py +0 -0
  24. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/__main__.py +0 -0
  25. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/cli.py +0 -0
  26. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/docs.py +0 -0
  27. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/server.py +0 -0
  28. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/pipeline.py +0 -0
  29. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/presets.py +0 -0
  30. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/storage/__init__.py +0 -0
  31. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/tokenizers.py +0 -0
  32. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/utils/__init__.py +0 -0
  33. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/utils/display.py +0 -0
  34. {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/utils/field_path.py +0 -0
  35. {dtflow-0.4.1 → dtflow-0.4.3}/pyproject.toml +0 -0
  36. {dtflow-0.4.1 → dtflow-0.4.3}/tests/benchmark_io.py +0 -0
  37. {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_converters.py +0 -0
  38. {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_field_path.py +0 -0
  39. {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_io.py +0 -0
  40. {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_lineage.py +0 -0
  41. {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_pipeline.py +0 -0
  42. {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_streaming.py +0 -0
  43. {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_tokenizers.py +0 -0

{dtflow-0.4.1 → dtflow-0.4.3}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dtflow
- Version: 0.4.1
+ Version: 0.4.3
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme

@@ -126,6 +126,17 @@ dt.filter(lambda x: x.score > 0.8)
  dt.filter(lambda x: x.language == "zh")
  ```

+ ### Data validation
+
+ ```python
+ # Validate records; returns the list of records that fail the check
+ errors = dt.validate(lambda x: len(x.messages) >= 2)
+
+ if errors:
+     for e in errors[:5]:
+         print(f"Row {e.index}: {e.error}")
+ ```
+
  ### Data transformation

  ```python

{dtflow-0.4.1 → dtflow-0.4.3}/README.md
@@ -50,6 +50,17 @@ dt.filter(lambda x: x.score > 0.8)
  dt.filter(lambda x: x.language == "zh")
  ```

+ ### Data validation
+
+ ```python
+ # Validate records; returns the list of records that fail the check
+ errors = dt.validate(lambda x: len(x.messages) >= 2)
+
+ if errors:
+     for e in errors[:5]:
+         print(f"Row {e.index}: {e.error}")
+ ```
+
  ### Data transformation

  ```python

{dtflow-0.4.1 → dtflow-0.4.3}/dtflow/__init__.py
@@ -42,7 +42,7 @@ from .tokenizers import (
      token_stats,
  )

- __version__ = "0.4.1"
+ __version__ = "0.4.3"

  __all__ = [
      # core

{dtflow-0.4.1 → dtflow-0.4.3}/dtflow/__main__.py
@@ -56,38 +56,49 @@ app = typer.Typer(
  @app.command()
  def sample(
      filename: str = typer.Argument(..., help="Input file path"),
-     num: int = typer.Argument(10, help="Number of samples"),
+     num_arg: Optional[int] = typer.Argument(None, help="Number of samples", metavar="NUM"),
+     num: int = typer.Option(10, "--num", "-n", help="Number of samples", show_default=True),
      type: str = typer.Option("head", "--type", "-t", help="Sampling mode: random/head/tail"),
      output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file path"),
      seed: Optional[int] = typer.Option(None, "--seed", help="Random seed"),
      by: Optional[str] = typer.Option(None, "--by", help="Field for stratified sampling"),
      uniform: bool = typer.Option(False, "--uniform", help="Uniform sampling mode"),
      fields: Optional[str] = typer.Option(None, "--fields", "-f", help="Show only the given fields (comma-separated)"),
+     raw: bool = typer.Option(False, "--raw", "-r", help="Output raw JSON (no truncation)"),
  ):
      """Sample the given number of records from a data file"""
-     _sample(filename, num, type, output, seed, by, uniform, fields)
+     actual_num = num_arg if num_arg is not None else num
+     _sample(filename, actual_num, type, output, seed, by, uniform, fields, raw)


  @app.command()
  def head(
      filename: str = typer.Argument(..., help="Input file path"),
-     num: int = typer.Argument(10, help="Number of records to show"),
+     num_arg: Optional[int] = typer.Argument(None, help="Number of records to show", metavar="NUM"),
+     num: int = typer.Option(10, "--num", "-n", help="Number of records to show", show_default=True),
      output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file path"),
      fields: Optional[str] = typer.Option(None, "--fields", "-f", help="Show only the given fields"),
+     raw: bool = typer.Option(False, "--raw", "-r", help="Output raw JSON (no truncation)"),
  ):
      """Show the first N records of a file"""
-     _head(filename, num, output, fields)
+     # The positional argument takes precedence over the option
+     actual_num = num_arg if num_arg is not None else num
+     _head(filename, actual_num, output, fields, raw)


  @app.command()
  def tail(
      filename: str = typer.Argument(..., help="Input file path"),
-     num: int = typer.Argument(10, help="Number of records to show"),
+     num_arg: Optional[int] = typer.Argument(None, help="Number of records to show", metavar="NUM"),
+     num: int = typer.Option(10, "--num", "-n", help="Number of records to show", show_default=True),
      output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file path"),
      fields: Optional[str] = typer.Option(None, "--fields", "-f", help="Show only the given fields"),
+     raw: bool = typer.Option(False, "--raw", "-r", help="Output raw JSON (no truncation)"),
  ):
      """Show the last N records of a file"""
-     _tail(filename, num, output, fields)
+     # The positional argument takes precedence over the option
+     actual_num = num_arg if num_arg is not None else num
+     _tail(filename, actual_num, output, fields, raw)


  # ============ Data transformation commands ============
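
With this change, `sample`, `head`, and `tail` accept the record count either positionally or via `--num`/`-n`, and the positional value wins when both are given. A minimal standalone sketch of the same Typer pattern (the command below is illustrative, not part of dtflow):

```python
# Sketch of the positional-overrides-option pattern used above.
# Assumes typer is installed (pip install typer).
from typing import Optional

import typer

app = typer.Typer()


@app.command()
def head(
    filename: str = typer.Argument(..., help="Input file path"),
    num_arg: Optional[int] = typer.Argument(None, metavar="NUM", help="Number of records"),
    num: int = typer.Option(10, "--num", "-n", help="Number of records"),
):
    """`head data.jsonl 5` and `head data.jsonl -n 5` behave identically."""
    actual_num = num_arg if num_arg is not None else num  # positional wins
    print(f"showing first {actual_num} records of {filename}")


if __name__ == "__main__":
    app()
```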

@@ -161,9 +172,10 @@ def clean(
  def stats(
      filename: str = typer.Argument(..., help="Input file path"),
      top: int = typer.Option(10, "--top", "-n", help="Show the top N values"),
+     full: bool = typer.Option(False, "--full", "-f", help="Full mode: value distributions, unique counts, and other details"),
  ):
      """Show statistics for a data file"""
-     _stats(filename, top)
+     _stats(filename, top, full)


  @app.command("token-stats")

dtflow-0.4.3/dtflow/cli/clean.py
@@ -0,0 +1,486 @@
+ """
+ CLI commands for data cleaning and deduplication
+ """
+
+ import os
+ import shutil
+ import tempfile
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from ..core import DataTransformer
+ from ..storage.io import save_data
+ from ..streaming import load_stream
+ from ..utils.field_path import get_field_with_spec
+ from .common import (
+     _check_file_format,
+     _get_value_len,
+     _is_empty_value,
+     _is_streaming_supported,
+     _parse_field_list,
+ )
+
+
+ def dedupe(
+     filename: str,
+     key: Optional[str] = None,
+     similar: Optional[float] = None,
+     output: Optional[str] = None,
+ ) -> None:
+     """
+     Deduplicate data.
+
+     Two modes are supported:
+     1. Exact dedupe (default): only identical records are treated as duplicates
+     2. Similarity dedupe: MinHash+LSH; records above the similarity threshold are treated as duplicates
+
+     Args:
+         filename: input file path; csv/excel/jsonl/json/parquet/arrow/feather are supported
+         key: field(s) to dedupe on; nested path syntax is supported:
+             - meta.source            nested field
+             - messages[0].role       array index
+             - messages[-1].content   negative index
+             - messages.#             array length
+             - messages[*].role:join  expand all elements
+             Multiple fields are comma-separated; if omitted, dedupe on whole records
+         similar: similarity threshold (0-1); when given, enables similarity mode and requires --key
+         output: output file path; if omitted, the input file is overwritten in place
+
+     Examples:
+         dt dedupe data.jsonl                             # exact dedupe on whole records
+         dt dedupe data.jsonl --key=text                  # exact dedupe on the text field
+         dt dedupe data.jsonl --key=user,timestamp        # exact dedupe on a field combination
+         dt dedupe data.jsonl --key=meta.id               # dedupe on a nested field
+         dt dedupe data.jsonl --key=messages[0].content   # dedupe on the first message's content
+         dt dedupe data.jsonl --key=text --similar=0.8    # similarity dedupe
+     """
+     filepath = Path(filename)
+
+     if not filepath.exists():
+         print(f"Error: file not found - {filename}")
+         return
+
+     if not _check_file_format(filepath):
+         return
+
+     # Similarity mode requires a key
+     if similar is not None and not key:
+         print("Error: similarity dedupe requires the --key parameter")
+         return
+
+     if similar is not None and (similar <= 0 or similar > 1):
+         print("Error: --similar must be between 0 and 1")
+         return
+
+     # Load the data
+     print(f"📊 Loading data: {filepath}")
+     try:
+         dt = DataTransformer.load(str(filepath))
+     except Exception as e:
+         print(f"Error: cannot read file - {e}")
+         return
+
+     original_count = len(dt)
+     print(f"  {original_count} records in total")
+
+     # Run the dedupe
+     if similar is not None:
+         # Similarity mode
+         print(f"🔑 Similarity dedupe: field={key}, threshold={similar}")
+         print("🔄 Deduplicating (MinHash+LSH)...")
+         try:
+             result = dt.dedupe_similar(key, threshold=similar)
+         except ImportError as e:
+             print(f"Error: {e}")
+             return
+     else:
+         # Exact mode
+         dedupe_key: Any = None
+         if key:
+             keys = [k.strip() for k in key.split(",")]
+             if len(keys) == 1:
+                 dedupe_key = keys[0]
+                 print(f"🔑 Exact dedupe on field: {dedupe_key}")
+             else:
+                 dedupe_key = keys
+                 print(f"🔑 Exact dedupe on field combination: {', '.join(dedupe_key)}")
+         else:
+             print("🔑 Exact dedupe on whole records")
+
+         print("🔄 Deduplicating...")
+         result = dt.dedupe(dedupe_key)
+
+     dedupe_count = len(result)
+     removed_count = original_count - dedupe_count
+
+     # Save the result
+     output_path = output or str(filepath)
+     print(f"💾 Saving result: {output_path}")
+     try:
+         result.save(output_path)
+     except Exception as e:
+         print(f"Error: cannot save file - {e}")
+         return
+
+     print(f"\n✅ Done! Removed {removed_count} duplicate records, {dedupe_count} remain")
+
+
+ def clean(
+     filename: str,
+     drop_empty: Optional[str] = None,
+     min_len: Optional[str] = None,
+     max_len: Optional[str] = None,
+     keep: Optional[str] = None,
+     drop: Optional[str] = None,
+     strip: bool = False,
+     output: Optional[str] = None,
+ ) -> None:
+     """
+     Clean data (streaming by default).
+
+     Args:
+         filename: input file path; csv/excel/jsonl/json/parquet/arrow/feather are supported
+         drop_empty: drop records with empty values; nested path syntax is supported
+             - without a value: drop records where any field is empty
+             - with fields: drop records where the given fields are empty (comma-separated)
+         min_len: minimum length filter in the form "field:length"; the field supports nested paths
+         max_len: maximum length filter in the form "field:length"; the field supports nested paths
+         keep: keep only the given fields (comma-separated; top-level fields only)
+         drop: drop the given fields (comma-separated; top-level fields only)
+         strip: strip leading/trailing whitespace from all string fields
+         output: output file path; if omitted, the input file is overwritten in place
+
+     Examples:
+         dt clean data.jsonl --drop-empty                        # drop records with any empty field
+         dt clean data.jsonl --drop-empty=text,answer            # drop records where these fields are empty
+         dt clean data.jsonl --drop-empty=meta.source            # drop records where a nested field is empty
+         dt clean data.jsonl --min-len=text:10                   # text must have at least 10 characters
+         dt clean data.jsonl --min-len=messages.#:2              # at least 2 messages
+         dt clean data.jsonl --max-len=messages[-1].content:500  # last message at most 500 characters
+         dt clean data.jsonl --keep=question,answer              # keep only these fields
+         dt clean data.jsonl --drop=metadata,timestamp           # drop these fields
+         dt clean data.jsonl --strip                             # strip whitespace from strings
+     """
+     filepath = Path(filename)
+
+     if not filepath.exists():
+         print(f"Error: file not found - {filename}")
+         return
+
+     if not _check_file_format(filepath):
+         return
+
+     # Parse parameters
+     min_len_field, min_len_value = _parse_len_param(min_len) if min_len else (None, None)
+     max_len_field, max_len_value = _parse_len_param(max_len) if max_len else (None, None)
+     keep_fields = _parse_field_list(keep) if keep else None
+     drop_fields_set = set(_parse_field_list(drop)) if drop else None
+     keep_set = set(keep_fields) if keep_fields else None
+
+     # Build the cleaning configuration
+     empty_fields = None
+     if drop_empty is not None:
+         if drop_empty == "" or drop_empty is True:
+             print("🔄 Dropping records where any field is empty...")
+             empty_fields = []
+         else:
+             empty_fields = _parse_field_list(drop_empty)
+             print(f"🔄 Dropping records with empty fields: {', '.join(empty_fields)}")
+
+     if strip:
+         print("🔄 Stripping leading/trailing whitespace from strings...")
+     if min_len_field:
+         print(f"🔄 Filtering records where {min_len_field} length < {min_len_value}...")
+     if max_len_field:
+         print(f"🔄 Filtering records where {max_len_field} length > {max_len_value}...")
+     if keep_fields:
+         print(f"🔄 Keeping only fields: {', '.join(keep_fields)}")
+     if drop_fields_set:
+         print(f"🔄 Dropping fields: {', '.join(drop_fields_set)}")
+
+     output_path = output or str(filepath)
+
+     # Check whether input and output are the same file (streaming needs a temp file)
+     input_resolved = filepath.resolve()
+     output_resolved = Path(output_path).resolve()
+     use_temp_file = input_resolved == output_resolved
+
+     # Use streaming for JSONL files
+     if _is_streaming_supported(filepath):
+         print(f"📊 Streaming load: {filepath}")
+
+         # If input and output are the same file, go through a temp file
+         if use_temp_file:
+             print("⚠ Output file is the same as the input file; a temporary file will be used")
+             temp_fd, temp_path = tempfile.mkstemp(
+                 suffix=output_resolved.suffix,
+                 prefix=".tmp_",
+                 dir=output_resolved.parent,
+             )
+             os.close(temp_fd)
+             actual_output = temp_path
+         else:
+             actual_output = output_path
+
+         try:
+             count = _clean_streaming(
+                 str(filepath),
+                 actual_output,
+                 strip=strip,
+                 empty_fields=empty_fields,
+                 min_len_field=min_len_field,
+                 min_len_value=min_len_value,
+                 max_len_field=max_len_field,
+                 max_len_value=max_len_value,
+                 keep_set=keep_set,
+                 drop_fields_set=drop_fields_set,
+             )
+
+             # If a temp file was used, move it into place
+             if use_temp_file:
+                 shutil.move(temp_path, output_path)
+
+             print(f"💾 Saving result: {output_path}")
+             print(f"\n✅ Done! {count} records after cleaning")
+         except Exception as e:
+             # Clean up the temp file
+             if use_temp_file and os.path.exists(temp_path):
+                 os.unlink(temp_path)
+             print(f"Error: cleaning failed - {e}")
+             import traceback
+
+             traceback.print_exc()
+         return
+
+     # Non-JSONL files are processed the traditional way
+     print(f"📊 Loading data: {filepath}")
+     try:
+         dt = DataTransformer.load(str(filepath))
+     except Exception as e:
+         print(f"Error: cannot read file - {e}")
+         return
+
+     original_count = len(dt)
+     print(f"  {original_count} records in total")
+
+     # Run all cleaning operations in a single pass
+     data, step_stats = _clean_data_single_pass(
+         dt.data,
+         strip=strip,
+         empty_fields=empty_fields,
+         min_len_field=min_len_field,
+         min_len_value=min_len_value,
+         max_len_field=max_len_field,
+         max_len_value=max_len_value,
+         keep_fields=keep_fields,
+         drop_fields=drop_fields_set,
+     )
+
+     # Save the result
+     final_count = len(data)
+     print(f"💾 Saving result: {output_path}")
+
+     try:
+         save_data(data, output_path)
+     except Exception as e:
+         print(f"Error: cannot save file - {e}")
+         return
+
+     # Print statistics
+     removed_count = original_count - final_count
+     print(f"\n✅ Done!")
+     print(f"  Original: {original_count} records -> after cleaning: {final_count} records ({removed_count} removed)")
+     if step_stats:
+         print(f"  Steps: {' | '.join(step_stats)}")
+
+
+ def _parse_len_param(param: str) -> tuple:
+     """Parse a length parameter of the form 'field:length'."""
+     if ":" not in param:
+         raise ValueError(f"Malformed length parameter: {param}; expected 'field:length'")
+     parts = param.split(":", 1)
+     field = parts[0].strip()
+     try:
+         length = int(parts[1].strip())
+     except ValueError:
+         raise ValueError(f"Length must be an integer: {parts[1]}")
+     return field, length
+
+
+ def _clean_data_single_pass(
+     data: List[Dict],
+     strip: bool = False,
+     empty_fields: Optional[List[str]] = None,
+     min_len_field: Optional[str] = None,
+     min_len_value: Optional[int] = None,
+     max_len_field: Optional[str] = None,
+     max_len_value: Optional[int] = None,
+     keep_fields: Optional[List[str]] = None,
+     drop_fields: Optional[set] = None,
+ ) -> tuple:
+     """
+     Run all cleaning operations in a single pass.
+
+     Args:
+         data: the raw data list
+         strip: whether to strip leading/trailing whitespace from strings
+         empty_fields: fields to check for empty values (nested paths supported); an empty list means check all fields, None means skip the check
+         min_len_field: field for the minimum length check (nested paths supported)
+         min_len_value: minimum length
+         max_len_field: field for the maximum length check (nested paths supported)
+         max_len_value: maximum length
+         keep_fields: fields to keep (top-level fields only)
+         drop_fields: set of fields to drop (top-level fields only)
+
+     Returns:
+         (cleaned data, list of step statistics)
+     """
+     result = []
+     stats = {
+         "drop_empty": 0,
+         "min_len": 0,
+         "max_len": 0,
+     }
+
+     # Precompute the keep_fields set (if any)
+     keep_set = set(keep_fields) if keep_fields else None
+
+     for item in data:
+         # 1. strip (done before filtering so that empty-value detection is more accurate)
+         if strip:
+             item = {k: v.strip() if isinstance(v, str) else v for k, v in item.items()}
+
+         # 2. Empty-value filtering
+         if empty_fields is not None:
+             if len(empty_fields) == 0:
+                 # Check all fields
+                 if any(_is_empty_value(v) for v in item.values()):
+                     stats["drop_empty"] += 1
+                     continue
+             else:
+                 # Check the given fields (nested paths supported)
+                 if any(_is_empty_value(get_field_with_spec(item, f)) for f in empty_fields):
+                     stats["drop_empty"] += 1
+                     continue
+
+         # 3. Minimum length filtering (nested paths supported)
+         if min_len_field is not None:
+             if _get_value_len(get_field_with_spec(item, min_len_field, default="")) < min_len_value:
+                 stats["min_len"] += 1
+                 continue
+
+         # 4. Maximum length filtering (nested paths supported)
+         if max_len_field is not None:
+             if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
+                 stats["max_len"] += 1
+                 continue
+
+         # 5. Field management (keep/drop)
+         if keep_set is not None:
+             item = {k: v for k, v in item.items() if k in keep_set}
+         elif drop_fields is not None:
+             item = {k: v for k, v in item.items() if k not in drop_fields}
+
+         result.append(item)
+
+     # Build the list of step statistics strings
+     step_stats = []
+     if strip:
+         step_stats.append("strip")
+     if stats["drop_empty"] > 0:
+         step_stats.append(f"drop-empty: -{stats['drop_empty']}")
+     if stats["min_len"] > 0:
+         step_stats.append(f"min-len: -{stats['min_len']}")
+     if stats["max_len"] > 0:
+         step_stats.append(f"max-len: -{stats['max_len']}")
+     if keep_fields:
+         step_stats.append(f"keep: {len(keep_fields)} fields")
+     if drop_fields:
+         step_stats.append(f"drop: {len(drop_fields)} fields")
+
+     return result, step_stats
+
+
+ def _clean_streaming(
+     input_path: str,
+     output_path: str,
+     strip: bool = False,
+     empty_fields: Optional[List[str]] = None,
+     min_len_field: Optional[str] = None,
+     min_len_value: Optional[int] = None,
+     max_len_field: Optional[str] = None,
+     max_len_value: Optional[int] = None,
+     keep_set: Optional[set] = None,
+     drop_fields_set: Optional[set] = None,
+ ) -> int:
+     """
+     Clean data in streaming mode.
+
+     Returns:
+         The number of records after processing
+     """
+
+     def clean_filter(item: Dict) -> bool:
+         """Filter function: True keeps the record, False drops it (nested paths supported)."""
+         # Empty-value filtering
+         if empty_fields is not None:
+             if len(empty_fields) == 0:
+                 if any(_is_empty_value(v) for v in item.values()):
+                     return False
+             else:
+                 # Nested paths supported
+                 if any(_is_empty_value(get_field_with_spec(item, f)) for f in empty_fields):
+                     return False
+
+         # Minimum length filtering (nested paths supported)
+         if min_len_field is not None:
+             if _get_value_len(get_field_with_spec(item, min_len_field, default="")) < min_len_value:
+                 return False
+
+         # Maximum length filtering (nested paths supported)
+         if max_len_field is not None:
+             if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
+                 return False
+
+         return True
+
+     def clean_transform(item: Dict) -> Dict:
+         """Transform function: strip + field management."""
+         # strip
+         if strip:
+             item = {k: v.strip() if isinstance(v, str) else v for k, v in item.items()}
+
+         # Field management
+         if keep_set is not None:
+             item = {k: v for k, v in item.items() if k in keep_set}
+         elif drop_fields_set is not None:
+             item = {k: v for k, v in item.items() if k not in drop_fields_set}
+
+         return item
+
+     # Build the streaming pipeline
+     st = load_stream(input_path)
+
+     # If strip is requested, apply it first (before filtering, so empty-value detection is more accurate)
+     if strip:
+         st = st.transform(
+             lambda x: {k: v.strip() if isinstance(v, str) else v for k, v in x.items()}
+         )
+
+     # Apply filtering
+     if empty_fields is not None or min_len_field is not None or max_len_field is not None:
+         st = st.filter(clean_filter)
+
+     # Apply field management (also needed here when strip is off)
+     if keep_set is not None or drop_fields_set is not None:
+
+         def field_transform(item):
+             if keep_set is not None:
+                 return {k: v for k, v in item.items() if k in keep_set}
+             elif drop_fields_set is not None:
+                 return {k: v for k, v in item.items() if k not in drop_fields_set}
+             return item
+
+         st = st.transform(field_transform)
+
+     return st.save(output_path)
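
Every `--key`, `--drop-empty`, `--min-len`, and `--max-len` spec above is resolved through `get_field_with_spec`. A hedged illustration of how the documented path syntax resolves against a record (the sample record is made up; the expected results follow the syntax described in the `dedupe` docstring):

```python
from dtflow.utils.field_path import get_field_with_spec

# A made-up record for illustration
item = {
    "meta": {"source": "web"},
    "messages": [
        {"role": "user", "content": "hi"},
        {"role": "assistant", "content": "hello"},
    ],
}

get_field_with_spec(item, "meta.source")               # nested field    -> "web"
get_field_with_spec(item, "messages[0].role")          # array index     -> "user"
get_field_with_spec(item, "messages[-1].content")      # negative index  -> "hello"
get_field_with_spec(item, "messages.#")                # array length    -> 2
get_field_with_spec(item, "missing.path", default="")  # fallback, as used in clean.py
```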

dtflow-0.4.3/dtflow/cli/commands.py
@@ -0,0 +1,56 @@
+ """
+ Unified re-export entry point for CLI commands
+
+ Commands are split into modules by function:
+ - sample.py     sampling (sample, head, tail)
+ - transform.py  transformation (transform)
+ - stats.py      statistics (stats, token_stats)
+ - clean.py      cleaning (clean, dedupe)
+ - io_ops.py     IO operations (concat, diff)
+ - pipeline.py   pipeline (run)
+ - lineage.py    lineage tracking (history)
+ - common.py     shared utility functions
+ """
+
+ # Sampling commands
+ from .sample import head, sample, tail
+
+ # Transformation commands
+ from .transform import transform
+
+ # Statistics commands
+ from .stats import stats, token_stats
+
+ # Cleaning commands
+ from .clean import clean, dedupe
+
+ # IO operation commands
+ from .io_ops import concat, diff
+
+ # Pipeline commands
+ from .pipeline import run
+
+ # Lineage tracking commands
+ from .lineage import history
+
+ __all__ = [
+     # sampling
+     "sample",
+     "head",
+     "tail",
+     # transformation
+     "transform",
+     # statistics
+     "stats",
+     "token_stats",
+     # cleaning
+     "clean",
+     "dedupe",
+     # IO operations
+     "concat",
+     "diff",
+     # pipeline
+     "run",
+     # lineage
+     "history",
+ ]
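
Since the aggregator re-exports every command, imports that targeted the 0.4.1 monolithic `commands.py` should keep working after the split; a quick sketch (assuming dtflow 0.4.3 is installed):

```python
# The old aggregator path and the new per-module path resolve to the same callables
from dtflow.cli.commands import clean, dedupe
from dtflow.cli.clean import clean as clean_direct, dedupe as dedupe_direct

assert clean is clean_direct
assert dedupe is dedupe_direct
```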