dtflow 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/SKILL.md +22 -2
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +39 -4
- dtflow/cli/clean.py +204 -8
- dtflow/cli/stats.py +247 -40
- dtflow/cli/validate.py +52 -19
- dtflow/parallel.py +115 -0
- dtflow/schema.py +99 -13
- dtflow/tokenizers.py +104 -21
- {dtflow-0.5.7.dist-info → dtflow-0.5.8.dist-info}/METADATA +8 -2
- {dtflow-0.5.7.dist-info → dtflow-0.5.8.dist-info}/RECORD +13 -12
- {dtflow-0.5.7.dist-info → dtflow-0.5.8.dist-info}/WHEEL +0 -0
- {dtflow-0.5.7.dist-info → dtflow-0.5.8.dist-info}/entry_points.txt +0 -0
dtflow/SKILL.md
CHANGED
|
@@ -1,6 +1,16 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: dtflow
|
|
3
|
-
description:
|
|
3
|
+
description: >
|
|
4
|
+
当用户需要处理 JSONL/CSV/Parquet/JSON/Arrow 数据文件时使用此 skill。
|
|
5
|
+
提供 CLI 工具 `dt` 和 Python API `DataTransformer`。
|
|
6
|
+
适用场景:(1) 查看数据:dt sample/head/tail 采样预览,dt stats 统计字段分布;
|
|
7
|
+
(2) 数据清洗:dt clean 支持 --drop-empty/--min-len/--max-len 过滤行,--keep/--drop/--rename/--promote/--add-field/--fill/--reorder 操作字段;
|
|
8
|
+
(3) 去重:dt dedupe 精确去重或 --similar 相似度去重;
|
|
9
|
+
(4) 格式转换:dt transform 预设模板(openai_chat/alpaca/sharegpt/dpo)或自定义配置;
|
|
10
|
+
(5) Schema 验证:dt validate --preset 验证数据格式;
|
|
11
|
+
(6) ML 训练框架导出:export_for("llama-factory"/"swift"/"axolotl") 一键生成训练配置;
|
|
12
|
+
(7) 大文件流式处理:load_stream() O(1) 内存处理 100GB+ 文件。
|
|
13
|
+
注意:此工具专注数据文件的结构化处理,不涉及 LLM 调用(LLM 调用请用 flexllm)。
|
|
4
14
|
---
|
|
5
15
|
|
|
6
16
|
# dtflow - 机器学习训练数据格式转换工具
|
|
@@ -135,12 +145,15 @@ dt.stats() # 统计
|
|
|
135
145
|
dt stats data.jsonl # 基本统计(文件大小、条数、字段)
|
|
136
146
|
dt stats data.jsonl --full # 完整模式:值分布、唯一值、非空率
|
|
137
147
|
dt stats data.jsonl --full -n 20 # 显示 Top 20 值分布
|
|
148
|
+
dt stats data.jsonl --field=meta.source # 只统计指定字段(支持嵌套路径,可多次使用)
|
|
149
|
+
dt stats data.jsonl --expand=tags # 展开 list 字段统计(可多次使用)
|
|
138
150
|
|
|
139
151
|
# Token 统计
|
|
140
152
|
dt token-stats data.jsonl # 默认统计 messages 字段
|
|
141
153
|
dt token-stats data.jsonl -f text # 指定统计字段
|
|
142
154
|
dt token-stats data.jsonl -m qwen2.5 # 指定分词器 (cl100k_base/qwen2.5/llama3)
|
|
143
155
|
dt token-stats data.jsonl --detailed # 显示详细统计
|
|
156
|
+
dt token-stats data.jsonl -w 4 # 多进程加速(数据量>=1000时自动启用)
|
|
144
157
|
|
|
145
158
|
# 采样(支持字段路径语法)
|
|
146
159
|
dt sample data.jsonl 100 # 随机采样 100 条
|
|
@@ -165,14 +178,21 @@ dt clean data.jsonl --max-len=text:2000 # 最大长度过滤
|
|
|
165
178
|
dt clean data.jsonl --min-len=messages.#:2 # 最少 2 条消息
|
|
166
179
|
dt clean data.jsonl --keep=question,answer # 只保留指定字段
|
|
167
180
|
dt clean data.jsonl --drop=metadata # 删除指定字段
|
|
181
|
+
dt clean data.jsonl --rename=question:instruction,answer:output # 重命名字段
|
|
182
|
+
dt clean data.jsonl --promote=meta.label # 提升嵌套字段到顶层
|
|
183
|
+
dt clean data.jsonl --promote=meta.label:tag # 提升并自定义名称
|
|
184
|
+
dt clean data.jsonl --add-field=source:web # 添加常量字段
|
|
185
|
+
dt clean data.jsonl --fill=label:unknown # 填充空值/缺失字段
|
|
186
|
+
dt clean data.jsonl --reorder=id,text,label # 控制字段输出顺序
|
|
168
187
|
dt clean data.jsonl --strip # 去除字符串首尾空白
|
|
169
|
-
dt clean data.jsonl --
|
|
188
|
+
dt clean data.jsonl --promote=meta.label --drop=meta --fill=label:unknown # 组合使用
|
|
170
189
|
|
|
171
190
|
# 验证
|
|
172
191
|
dt validate data.jsonl --preset=openai_chat # 预设: openai_chat/alpaca/dpo/sharegpt
|
|
173
192
|
dt validate data.jsonl -p alpaca -f -o valid.jsonl # 过滤无效数据并保存
|
|
174
193
|
dt validate data.jsonl -p openai_chat -v # 显示详细信息
|
|
175
194
|
dt validate data.jsonl -p openai_chat --max-errors=50 # 最多显示 50 条错误
|
|
195
|
+
dt validate data.jsonl -p openai_chat -w 4 # 多进程加速
|
|
176
196
|
|
|
177
197
|
# 转换
|
|
178
198
|
dt transform data.jsonl --preset=openai_chat
|
dtflow/__init__.py
CHANGED
dtflow/__main__.py
CHANGED
|
@@ -164,11 +164,34 @@ def clean(
|
|
|
164
164
|
max_len: Optional[str] = typer.Option(None, "--max-len", help="最大长度过滤 (字段:长度)"),
|
|
165
165
|
keep: Optional[str] = typer.Option(None, "--keep", help="只保留指定字段"),
|
|
166
166
|
drop: Optional[str] = typer.Option(None, "--drop", help="删除指定字段"),
|
|
167
|
+
rename: Optional[str] = typer.Option(None, "--rename", help="重命名字段 (old:new,old2:new2)"),
|
|
168
|
+
promote: Optional[str] = typer.Option(
|
|
169
|
+
None, "--promote", help="提升嵌套字段到顶层 (meta.label 或 meta.label:tag)"
|
|
170
|
+
),
|
|
171
|
+
add_field: Optional[str] = typer.Option(None, "--add-field", help="添加常量字段 (key:value)"),
|
|
172
|
+
fill: Optional[str] = typer.Option(None, "--fill", help="填充空值 (field:default_value)"),
|
|
173
|
+
reorder: Optional[str] = typer.Option(
|
|
174
|
+
None, "--reorder", help="控制字段顺序 (field1,field2,...)"
|
|
175
|
+
),
|
|
167
176
|
strip: bool = typer.Option(False, "--strip", help="去除字符串首尾空白"),
|
|
168
177
|
output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
|
|
169
178
|
):
|
|
170
179
|
"""数据清洗"""
|
|
171
|
-
_clean(
|
|
180
|
+
_clean(
|
|
181
|
+
filename,
|
|
182
|
+
drop_empty,
|
|
183
|
+
min_len,
|
|
184
|
+
max_len,
|
|
185
|
+
keep,
|
|
186
|
+
drop,
|
|
187
|
+
rename,
|
|
188
|
+
promote,
|
|
189
|
+
add_field,
|
|
190
|
+
fill,
|
|
191
|
+
reorder,
|
|
192
|
+
strip,
|
|
193
|
+
output,
|
|
194
|
+
)
|
|
172
195
|
|
|
173
196
|
|
|
174
197
|
# ============ 数据统计命令 ============
|
|
@@ -179,9 +202,15 @@ def stats(
|
|
|
179
202
|
filename: str = typer.Argument(..., help="输入文件路径"),
|
|
180
203
|
top: int = typer.Option(10, "--top", "-n", help="显示 Top N 值"),
|
|
181
204
|
full: bool = typer.Option(False, "--full", "-f", help="完整模式:统计值分布、唯一值等详细信息"),
|
|
205
|
+
field: Optional[List[str]] = typer.Option(
|
|
206
|
+
None, "--field", help="指定统计字段(可多次使用),支持嵌套路径"
|
|
207
|
+
),
|
|
208
|
+
expand: Optional[List[str]] = typer.Option(
|
|
209
|
+
None, "--expand", help="展开 list 字段统计(可多次使用)"
|
|
210
|
+
),
|
|
182
211
|
):
|
|
183
212
|
"""显示数据文件的统计信息"""
|
|
184
|
-
_stats(filename, top, full)
|
|
213
|
+
_stats(filename, top, full, field, expand)
|
|
185
214
|
|
|
186
215
|
|
|
187
216
|
@app.command("token-stats")
|
|
@@ -192,9 +221,12 @@ def token_stats(
|
|
|
192
221
|
"cl100k_base", "--model", "-m", help="分词器: cl100k_base (默认), qwen2.5, llama3, gpt-4 等"
|
|
193
222
|
),
|
|
194
223
|
detailed: bool = typer.Option(False, "--detailed", "-d", help="显示详细统计"),
|
|
224
|
+
workers: Optional[int] = typer.Option(
|
|
225
|
+
None, "--workers", "-w", help="并行进程数 (默认自动, 1 禁用并行)"
|
|
226
|
+
),
|
|
195
227
|
):
|
|
196
228
|
"""统计数据集的 Token 信息"""
|
|
197
|
-
_token_stats(filename, field, model, detailed)
|
|
229
|
+
_token_stats(filename, field, model, detailed, workers)
|
|
198
230
|
|
|
199
231
|
|
|
200
232
|
@app.command()
|
|
@@ -230,9 +262,12 @@ def validate(
|
|
|
230
262
|
filter: bool = typer.Option(False, "--filter", "-f", help="过滤无效数据并保存"),
|
|
231
263
|
max_errors: int = typer.Option(20, "--max-errors", help="最多显示的错误数量"),
|
|
232
264
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="显示详细信息"),
|
|
265
|
+
workers: Optional[int] = typer.Option(
|
|
266
|
+
None, "--workers", "-w", help="并行进程数 (默认自动, 1 禁用并行)"
|
|
267
|
+
),
|
|
233
268
|
):
|
|
234
269
|
"""使用预设 Schema 验证数据格式"""
|
|
235
|
-
_validate(filename, preset, output, filter, max_errors, verbose)
|
|
270
|
+
_validate(filename, preset, output, filter, max_errors, verbose, workers)
|
|
236
271
|
|
|
237
272
|
|
|
238
273
|
# ============ 工具命令 ============
|
dtflow/cli/clean.py
CHANGED
|
@@ -132,6 +132,11 @@ def clean(
|
|
|
132
132
|
max_len: Optional[str] = None,
|
|
133
133
|
keep: Optional[str] = None,
|
|
134
134
|
drop: Optional[str] = None,
|
|
135
|
+
rename: Optional[str] = None,
|
|
136
|
+
promote: Optional[str] = None,
|
|
137
|
+
add_field: Optional[str] = None,
|
|
138
|
+
fill: Optional[str] = None,
|
|
139
|
+
reorder: Optional[str] = None,
|
|
135
140
|
strip: bool = False,
|
|
136
141
|
output: Optional[str] = None,
|
|
137
142
|
) -> None:
|
|
@@ -147,18 +152,26 @@ def clean(
|
|
|
147
152
|
max_len: 最大长度过滤,格式 "字段:长度",字段支持嵌套路径
|
|
148
153
|
keep: 只保留指定字段(逗号分隔,仅支持顶层字段)
|
|
149
154
|
drop: 删除指定字段(逗号分隔,仅支持顶层字段)
|
|
155
|
+
rename: 重命名字段,格式 "old:new" 或 "old1:new1,old2:new2"
|
|
156
|
+
promote: 提升嵌套字段到顶层,格式 "path" 或 "path:name"(逗号分隔多个)
|
|
157
|
+
add_field: 添加常量字段,格式 "key:value"(逗号分隔多个)
|
|
158
|
+
fill: 填充空值,格式 "field:default_value"(逗号分隔多个)
|
|
159
|
+
reorder: 控制字段顺序(逗号分隔),未列出的字段追加在后面
|
|
150
160
|
strip: 去除所有字符串字段的首尾空白
|
|
151
161
|
output: 输出文件路径,不指定则覆盖原文件
|
|
152
162
|
|
|
153
163
|
Examples:
|
|
154
164
|
dt clean data.jsonl --drop-empty # 删除任意空值记录
|
|
155
165
|
dt clean data.jsonl --drop-empty=text,answer # 删除指定字段为空的记录
|
|
156
|
-
dt clean data.jsonl --drop-empty=meta.source # 删除嵌套字段为空的记录
|
|
157
166
|
dt clean data.jsonl --min-len=text:10 # text 字段最少 10 字符
|
|
158
|
-
dt clean data.jsonl --min-len=messages.#:2 # 至少 2 条消息
|
|
159
|
-
dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最多 500 字符
|
|
160
167
|
dt clean data.jsonl --keep=question,answer # 只保留这些字段
|
|
161
168
|
dt clean data.jsonl --drop=metadata,timestamp # 删除这些字段
|
|
169
|
+
dt clean data.jsonl --rename=question:instruction # 重命名字段
|
|
170
|
+
dt clean data.jsonl --promote=meta.label # 提升嵌套字段到顶层
|
|
171
|
+
dt clean data.jsonl --promote=meta.label:tag # 提升并自定义名称
|
|
172
|
+
dt clean data.jsonl --add-field=source:web # 添加常量字段
|
|
173
|
+
dt clean data.jsonl --fill=label:unknown # 填充空值
|
|
174
|
+
dt clean data.jsonl --reorder=id,text,label # 控制字段顺序
|
|
162
175
|
dt clean data.jsonl --strip # 去除字符串首尾空白
|
|
163
176
|
"""
|
|
164
177
|
filepath = Path(filename)
|
|
@@ -175,6 +188,11 @@ def clean(
|
|
|
175
188
|
max_len_field, max_len_value = _parse_len_param(max_len) if max_len else (None, None)
|
|
176
189
|
keep_fields = _parse_field_list(keep) if keep else None
|
|
177
190
|
drop_fields_set = set(_parse_field_list(drop)) if drop else None
|
|
191
|
+
rename_map = _parse_rename_param(rename) if rename else None
|
|
192
|
+
promote_list = _parse_promote_param(promote) if promote else None
|
|
193
|
+
add_field_map = _parse_kv_param(add_field, "add-field") if add_field else None
|
|
194
|
+
fill_map = _parse_kv_param(fill, "fill") if fill else None
|
|
195
|
+
reorder_fields = _parse_field_list(reorder) if reorder else None
|
|
178
196
|
keep_set = set(keep_fields) if keep_fields else None
|
|
179
197
|
|
|
180
198
|
# 构建清洗配置
|
|
@@ -197,6 +215,20 @@ def clean(
|
|
|
197
215
|
print(f"🔄 只保留字段: {', '.join(keep_fields)}")
|
|
198
216
|
if drop_fields_set:
|
|
199
217
|
print(f"🔄 删除字段: {', '.join(drop_fields_set)}")
|
|
218
|
+
if rename_map:
|
|
219
|
+
rename_desc = ", ".join(f"{k} → {v}" for k, v in rename_map.items())
|
|
220
|
+
print(f"🔄 重命名字段: {rename_desc}")
|
|
221
|
+
if promote_list:
|
|
222
|
+
promote_desc = ", ".join(f"{src} → {dst}" for src, dst in promote_list)
|
|
223
|
+
print(f"🔄 提升字段: {promote_desc}")
|
|
224
|
+
if add_field_map:
|
|
225
|
+
add_desc = ", ".join(f"{k}={v}" for k, v in add_field_map.items())
|
|
226
|
+
print(f"🔄 添加字段: {add_desc}")
|
|
227
|
+
if fill_map:
|
|
228
|
+
fill_desc = ", ".join(f"{k}={v}" for k, v in fill_map.items())
|
|
229
|
+
print(f"🔄 填充空值: {fill_desc}")
|
|
230
|
+
if reorder_fields:
|
|
231
|
+
print(f"🔄 字段排序: {', '.join(reorder_fields)}")
|
|
200
232
|
|
|
201
233
|
output_path = output or str(filepath)
|
|
202
234
|
|
|
@@ -234,6 +266,11 @@ def clean(
|
|
|
234
266
|
max_len_value=max_len_value,
|
|
235
267
|
keep_set=keep_set,
|
|
236
268
|
drop_fields_set=drop_fields_set,
|
|
269
|
+
rename_map=rename_map,
|
|
270
|
+
promote_list=promote_list,
|
|
271
|
+
add_field_map=add_field_map,
|
|
272
|
+
fill_map=fill_map,
|
|
273
|
+
reorder_fields=reorder_fields,
|
|
237
274
|
)
|
|
238
275
|
|
|
239
276
|
# 如果使用了临时文件,移动到目标位置
|
|
@@ -274,6 +311,11 @@ def clean(
|
|
|
274
311
|
max_len_value=max_len_value,
|
|
275
312
|
keep_fields=keep_fields,
|
|
276
313
|
drop_fields=drop_fields_set,
|
|
314
|
+
rename_map=rename_map,
|
|
315
|
+
promote_list=promote_list,
|
|
316
|
+
add_field_map=add_field_map,
|
|
317
|
+
fill_map=fill_map,
|
|
318
|
+
reorder_fields=reorder_fields,
|
|
277
319
|
)
|
|
278
320
|
|
|
279
321
|
# 保存结果
|
|
@@ -288,12 +330,106 @@ def clean(
|
|
|
288
330
|
|
|
289
331
|
# 打印统计
|
|
290
332
|
removed_count = original_count - final_count
|
|
291
|
-
print(
|
|
333
|
+
print("\n✅ 完成!")
|
|
292
334
|
print(f" 原始: {original_count} 条 -> 清洗后: {final_count} 条 (删除 {removed_count} 条)")
|
|
293
335
|
if step_stats:
|
|
294
336
|
print(f" 步骤: {' | '.join(step_stats)}")
|
|
295
337
|
|
|
296
338
|
|
|
339
|
+
def _parse_rename_param(param: str) -> Dict[str, str]:
|
|
340
|
+
"""解析重命名参数,格式 'old:new' 或 'old1:new1,old2:new2'"""
|
|
341
|
+
rename_map = {}
|
|
342
|
+
for pair in param.split(","):
|
|
343
|
+
pair = pair.strip()
|
|
344
|
+
if ":" not in pair:
|
|
345
|
+
raise ValueError(f"重命名参数格式错误: {pair},应为 'old:new'")
|
|
346
|
+
old, new = pair.split(":", 1)
|
|
347
|
+
old, new = old.strip(), new.strip()
|
|
348
|
+
if not old or not new:
|
|
349
|
+
raise ValueError(f"重命名参数格式错误: {pair},字段名不能为空")
|
|
350
|
+
rename_map[old] = new
|
|
351
|
+
return rename_map
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _parse_promote_param(param: str) -> List[tuple]:
|
|
355
|
+
"""
|
|
356
|
+
解析提升参数,格式 'path' 或 'path:name'(逗号分隔多个)。
|
|
357
|
+
|
|
358
|
+
Returns:
|
|
359
|
+
[(source_path, target_name), ...]
|
|
360
|
+
"""
|
|
361
|
+
result = []
|
|
362
|
+
for item in param.split(","):
|
|
363
|
+
item = item.strip()
|
|
364
|
+
if ":" in item:
|
|
365
|
+
src, dst = item.split(":", 1)
|
|
366
|
+
src, dst = src.strip(), dst.strip()
|
|
367
|
+
else:
|
|
368
|
+
src = item
|
|
369
|
+
# 默认用路径最后一段作为目标名
|
|
370
|
+
dst = src.rsplit(".", 1)[-1] if "." in src else src
|
|
371
|
+
if not src or not dst:
|
|
372
|
+
raise ValueError(f"promote 参数格式错误: {item}")
|
|
373
|
+
result.append((src, dst))
|
|
374
|
+
return result
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _parse_kv_param(param: str, param_name: str) -> Dict[str, str]:
|
|
378
|
+
"""解析 key:value 格式参数(通用),用于 --add-field 和 --fill"""
|
|
379
|
+
kv_map = {}
|
|
380
|
+
for pair in param.split(","):
|
|
381
|
+
pair = pair.strip()
|
|
382
|
+
if ":" not in pair:
|
|
383
|
+
raise ValueError(f"{param_name} 参数格式错误: {pair},应为 'key:value'")
|
|
384
|
+
key, value = pair.split(":", 1)
|
|
385
|
+
key, value = key.strip(), value.strip()
|
|
386
|
+
if not key:
|
|
387
|
+
raise ValueError(f"{param_name} 参数格式错误: {pair},key 不能为空")
|
|
388
|
+
kv_map[key] = value
|
|
389
|
+
return kv_map
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _rename_item(item: Dict, rename_map: Dict[str, str]) -> Dict:
|
|
393
|
+
"""重命名字段,保持字段顺序"""
|
|
394
|
+
return {rename_map.get(k, k): v for k, v in item.items()}
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _promote_fields(item: Dict, promote_list: List[tuple]) -> Dict:
    """Copy nested values to top-level keys on a shallow copy of ``item``.

    For every ``(source_path, target_name)`` pair, the target key is always
    assigned whatever ``get_field_with_spec`` returns for the source path.
    The original docstring notes the field is added even when that value is
    None. Lookups run against the working copy, so a later pair sees keys
    written by earlier pairs.
    """
    promoted = dict(item)
    for pair in promote_list:
        source_path, target_name = pair
        promoted[target_name] = get_field_with_spec(promoted, source_path)
    return promoted
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def _add_fields(item: Dict, add_field_map: Dict[str, str]) -> Dict:
|
|
406
|
+
"""添加常量字段"""
|
|
407
|
+
item = dict(item)
|
|
408
|
+
item.update(add_field_map)
|
|
409
|
+
return item
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _fill_empty(item: Dict, fill_map: Dict[str, str]) -> Dict:
    """Return a shallow copy of ``item`` with defaults filled in.

    A default from ``fill_map`` is written when the field is missing from
    the record or when ``_is_empty_value`` reports its current value as
    empty. All other fields keep their values.
    """
    filled = dict(item)
    for name, fallback in fill_map.items():
        if name in filled and not _is_empty_value(filled[name]):
            # Field exists and holds a real value: leave it alone.
            continue
        filled[name] = fallback
    return filled
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def _reorder_item(item: Dict, reorder_fields: List[str]) -> Dict:
|
|
422
|
+
"""按指定顺序重排字段,未列出的字段追加在后面"""
|
|
423
|
+
ordered = {}
|
|
424
|
+
for f in reorder_fields:
|
|
425
|
+
if f in item:
|
|
426
|
+
ordered[f] = item[f]
|
|
427
|
+
for k, v in item.items():
|
|
428
|
+
if k not in ordered:
|
|
429
|
+
ordered[k] = v
|
|
430
|
+
return ordered
|
|
431
|
+
|
|
432
|
+
|
|
297
433
|
def _parse_len_param(param: str) -> tuple:
|
|
298
434
|
"""解析长度参数,格式 'field:length'"""
|
|
299
435
|
if ":" not in param:
|
|
@@ -302,8 +438,8 @@ def _parse_len_param(param: str) -> tuple:
|
|
|
302
438
|
field = parts[0].strip()
|
|
303
439
|
try:
|
|
304
440
|
length = int(parts[1].strip())
|
|
305
|
-
except ValueError:
|
|
306
|
-
raise ValueError(f"长度必须是整数: {parts[1]}")
|
|
441
|
+
except ValueError as e:
|
|
442
|
+
raise ValueError(f"长度必须是整数: {parts[1]}") from e
|
|
307
443
|
return field, length
|
|
308
444
|
|
|
309
445
|
|
|
@@ -317,6 +453,11 @@ def _clean_data_single_pass(
|
|
|
317
453
|
max_len_value: Optional[int] = None,
|
|
318
454
|
keep_fields: Optional[List[str]] = None,
|
|
319
455
|
drop_fields: Optional[set] = None,
|
|
456
|
+
rename_map: Optional[Dict[str, str]] = None,
|
|
457
|
+
promote_list: Optional[List[tuple]] = None,
|
|
458
|
+
add_field_map: Optional[Dict[str, str]] = None,
|
|
459
|
+
fill_map: Optional[Dict[str, str]] = None,
|
|
460
|
+
reorder_fields: Optional[List[str]] = None,
|
|
320
461
|
) -> tuple:
|
|
321
462
|
"""
|
|
322
463
|
单次遍历执行所有清洗操作。
|
|
@@ -375,12 +516,32 @@ def _clean_data_single_pass(
|
|
|
375
516
|
stats["max_len"] += 1
|
|
376
517
|
continue
|
|
377
518
|
|
|
378
|
-
# 5.
|
|
519
|
+
# 5. 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
|
|
520
|
+
if promote_list is not None:
|
|
521
|
+
item = _promote_fields(item, promote_list)
|
|
522
|
+
|
|
523
|
+
# 6. 字段管理(keep/drop)
|
|
379
524
|
if keep_set is not None:
|
|
380
525
|
item = {k: v for k, v in item.items() if k in keep_set}
|
|
381
526
|
elif drop_fields is not None:
|
|
382
527
|
item = {k: v for k, v in item.items() if k not in drop_fields}
|
|
383
528
|
|
|
529
|
+
# 7. 字段重命名
|
|
530
|
+
if rename_map is not None:
|
|
531
|
+
item = _rename_item(item, rename_map)
|
|
532
|
+
|
|
533
|
+
# 8. 添加常量字段
|
|
534
|
+
if add_field_map is not None:
|
|
535
|
+
item = _add_fields(item, add_field_map)
|
|
536
|
+
|
|
537
|
+
# 9. 填充空值
|
|
538
|
+
if fill_map is not None:
|
|
539
|
+
item = _fill_empty(item, fill_map)
|
|
540
|
+
|
|
541
|
+
# 10. 字段排序(最后执行)
|
|
542
|
+
if reorder_fields is not None:
|
|
543
|
+
item = _reorder_item(item, reorder_fields)
|
|
544
|
+
|
|
384
545
|
result.append(item)
|
|
385
546
|
|
|
386
547
|
# 构建统计信息字符串列表
|
|
@@ -397,6 +558,16 @@ def _clean_data_single_pass(
|
|
|
397
558
|
step_stats.append(f"keep: {len(keep_fields)} 字段")
|
|
398
559
|
if drop_fields:
|
|
399
560
|
step_stats.append(f"drop: {len(drop_fields)} 字段")
|
|
561
|
+
if rename_map:
|
|
562
|
+
step_stats.append(f"rename: {len(rename_map)} 字段")
|
|
563
|
+
if promote_list:
|
|
564
|
+
step_stats.append(f"promote: {len(promote_list)} 字段")
|
|
565
|
+
if add_field_map:
|
|
566
|
+
step_stats.append(f"add-field: {len(add_field_map)} 字段")
|
|
567
|
+
if fill_map:
|
|
568
|
+
step_stats.append(f"fill: {len(fill_map)} 字段")
|
|
569
|
+
if reorder_fields:
|
|
570
|
+
step_stats.append("reorder")
|
|
400
571
|
|
|
401
572
|
return result, step_stats
|
|
402
573
|
|
|
@@ -412,6 +583,11 @@ def _clean_streaming(
|
|
|
412
583
|
max_len_value: Optional[int] = None,
|
|
413
584
|
keep_set: Optional[set] = None,
|
|
414
585
|
drop_fields_set: Optional[set] = None,
|
|
586
|
+
rename_map: Optional[Dict[str, str]] = None,
|
|
587
|
+
promote_list: Optional[List[tuple]] = None,
|
|
588
|
+
add_field_map: Optional[Dict[str, str]] = None,
|
|
589
|
+
fill_map: Optional[Dict[str, str]] = None,
|
|
590
|
+
reorder_fields: Optional[List[str]] = None,
|
|
415
591
|
) -> int:
|
|
416
592
|
"""
|
|
417
593
|
流式清洗数据。
|
|
@@ -471,7 +647,11 @@ def _clean_streaming(
|
|
|
471
647
|
if empty_fields is not None or min_len_field is not None or max_len_field is not None:
|
|
472
648
|
st = st.filter(clean_filter)
|
|
473
649
|
|
|
474
|
-
#
|
|
650
|
+
# 提升嵌套字段(在 drop 之前,否则父字段被删后无法提取)
|
|
651
|
+
if promote_list is not None:
|
|
652
|
+
st = st.transform(lambda item: _promote_fields(item, promote_list))
|
|
653
|
+
|
|
654
|
+
# 执行字段管理(keep/drop)
|
|
475
655
|
if keep_set is not None or drop_fields_set is not None:
|
|
476
656
|
|
|
477
657
|
def field_transform(item):
|
|
@@ -483,4 +663,20 @@ def _clean_streaming(
|
|
|
483
663
|
|
|
484
664
|
st = st.transform(field_transform)
|
|
485
665
|
|
|
666
|
+
# 执行字段重命名
|
|
667
|
+
if rename_map is not None:
|
|
668
|
+
st = st.transform(lambda item: _rename_item(item, rename_map))
|
|
669
|
+
|
|
670
|
+
# 添加常量字段
|
|
671
|
+
if add_field_map is not None:
|
|
672
|
+
st = st.transform(lambda item: _add_fields(item, add_field_map))
|
|
673
|
+
|
|
674
|
+
# 填充空值
|
|
675
|
+
if fill_map is not None:
|
|
676
|
+
st = st.transform(lambda item: _fill_empty(item, fill_map))
|
|
677
|
+
|
|
678
|
+
# 字段排序(最后执行)
|
|
679
|
+
if reorder_fields is not None:
|
|
680
|
+
st = st.transform(lambda item: _reorder_item(item, reorder_fields))
|
|
681
|
+
|
|
486
682
|
return st.save(output_path)
|