dtflow 0.4.1__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dtflow-0.4.1 → dtflow-0.4.3}/PKG-INFO +12 -1
- {dtflow-0.4.1 → dtflow-0.4.3}/README.md +11 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/__init__.py +1 -1
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/__main__.py +19 -7
- dtflow-0.4.3/dtflow/cli/clean.py +486 -0
- dtflow-0.4.3/dtflow/cli/commands.py +56 -0
- dtflow-0.4.3/dtflow/cli/common.py +384 -0
- dtflow-0.4.3/dtflow/cli/io_ops.py +385 -0
- dtflow-0.4.3/dtflow/cli/lineage.py +49 -0
- dtflow-0.4.3/dtflow/cli/pipeline.py +54 -0
- dtflow-0.4.3/dtflow/cli/sample.py +294 -0
- dtflow-0.4.3/dtflow/cli/stats.py +589 -0
- dtflow-0.4.3/dtflow/cli/transform.py +486 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/core.py +35 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/storage/io.py +49 -6
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/streaming.py +25 -4
- {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_transformer.py +33 -4
- dtflow-0.4.1/dtflow/cli/commands.py +0 -2482
- {dtflow-0.4.1 → dtflow-0.4.3}/.gitignore +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/cli/__init__.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/converters.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/lineage.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/__init__.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/__main__.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/cli.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/docs.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/mcp/server.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/pipeline.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/presets.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/storage/__init__.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/tokenizers.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/utils/__init__.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/utils/display.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/dtflow/utils/field_path.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/pyproject.toml +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/tests/benchmark_io.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_converters.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_field_path.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_io.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_lineage.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_pipeline.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_streaming.py +0 -0
- {dtflow-0.4.1 → dtflow-0.4.3}/tests/test_tokenizers.py +0 -0

{dtflow-0.4.1 → dtflow-0.4.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dtflow
-Version: 0.4.1
+Version: 0.4.3
 Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
 Project-URL: Homepage, https://github.com/yourusername/DataTransformer
 Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme

@@ -126,6 +126,17 @@ dt.filter(lambda x: x.score > 0.8)
 dt.filter(lambda x: x.language == "zh")
 ```
 
+### Data validation
+
+```python
+# Validate the data; returns a list of failing records
+errors = dt.validate(lambda x: len(x.messages) >= 2)
+
+if errors:
+    for e in errors[:5]:
+        print(f"Row {e.index}: {e.error}")
+```
+
 ### Data transformation
 
 ```python

{dtflow-0.4.1 → dtflow-0.4.3}/README.md

@@ -50,6 +50,17 @@ dt.filter(lambda x: x.score > 0.8)
 dt.filter(lambda x: x.language == "zh")
 ```
 
+### Data validation
+
+```python
+# Validate the data; returns a list of failing records
+errors = dt.validate(lambda x: len(x.messages) >= 2)
+
+if errors:
+    for e in errors[:5]:
+        print(f"Row {e.index}: {e.error}")
+```
+
 ### Data transformation
 
 ```python
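
A note on the new `dt.validate` API shown above: the README implies it collects failing records rather than raising on the first failure. Below is a minimal sketch of that contract; the `index` and `error` attribute names come from the README example, while the `ValidationError` dataclass and everything else here are assumptions, not dtflow's actual implementation.

```python
from dataclasses import dataclass
from typing import Any, Callable, List

@dataclass
class ValidationError:
    index: int   # position of the failing record
    error: str   # why the check failed

def validate(data: List[Any], check: Callable[[Any], bool]) -> List[ValidationError]:
    """Run `check` on every record; collect failures instead of raising."""
    errors = []
    for i, record in enumerate(data):
        try:
            if not check(record):
                errors.append(ValidationError(i, "check returned False"))
        except Exception as e:  # a broken record should not abort the scan
            errors.append(ValidationError(i, str(e)))
    return errors
```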

{dtflow-0.4.1 → dtflow-0.4.3}/dtflow/__main__.py

@@ -56,38 +56,49 @@ app = typer.Typer(
 @app.command()
 def sample(
     filename: str = typer.Argument(..., help="Input file path"),
-    num: int = typer.Option(10, "--num", "-n", help="Number of samples"),
+    num_arg: Optional[int] = typer.Argument(None, help="Number of samples", metavar="NUM"),
+    num: int = typer.Option(10, "--num", "-n", help="Number of samples", show_default=True),
     type: str = typer.Option("head", "--type", "-t", help="Sampling mode: random/head/tail"),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file path"),
     seed: Optional[int] = typer.Option(None, "--seed", help="Random seed"),
     by: Optional[str] = typer.Option(None, "--by", help="Field for stratified sampling"),
     uniform: bool = typer.Option(False, "--uniform", help="Uniform sampling mode"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="Only show the given fields (comma-separated)"),
+    raw: bool = typer.Option(False, "--raw", "-r", help="Output raw JSON (no truncation)"),
 ):
     """Sample a given number of records from a data file"""
-    _sample(filename, num, type, output, seed, by, uniform, fields)
+    actual_num = num_arg if num_arg is not None else num
+    _sample(filename, actual_num, type, output, seed, by, uniform, fields, raw)
 
 
 @app.command()
 def head(
     filename: str = typer.Argument(..., help="Input file path"),
-    num: int = typer.Option(10, "--num", "-n", help="Number of records to show"),
+    num_arg: Optional[int] = typer.Argument(None, help="Number of records to show", metavar="NUM"),
+    num: int = typer.Option(10, "--num", "-n", help="Number of records to show", show_default=True),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file path"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="Only show the given fields"),
+    raw: bool = typer.Option(False, "--raw", "-r", help="Output raw JSON (no truncation)"),
 ):
     """Show the first N records of a file"""
-    _head(filename, num, output, fields)
+    # The positional argument takes precedence over the option
+    actual_num = num_arg if num_arg is not None else num
+    _head(filename, actual_num, output, fields, raw)
 
 
 @app.command()
 def tail(
     filename: str = typer.Argument(..., help="Input file path"),
-    num: int = typer.Option(10, "--num", "-n", help="Number of records to show"),
+    num_arg: Optional[int] = typer.Argument(None, help="Number of records to show", metavar="NUM"),
+    num: int = typer.Option(10, "--num", "-n", help="Number of records to show", show_default=True),
     output: Optional[str] = typer.Option(None, "--output", "-o", help="Output file path"),
     fields: Optional[str] = typer.Option(None, "--fields", "-f", help="Only show the given fields"),
+    raw: bool = typer.Option(False, "--raw", "-r", help="Output raw JSON (no truncation)"),
 ):
     """Show the last N records of a file"""
-    _tail(filename, num, output, fields)
+    # The positional argument takes precedence over the option
+    actual_num = num_arg if num_arg is not None else num
+    _tail(filename, actual_num, output, fields, raw)
 
 
 # ============ Data transformation commands ============
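
The change above is the same in all three commands: an optional positional NUM argument is added ahead of the existing `--num` option, and the positional value wins when both are given, so `dt head data.jsonl 5` and `dt head data.jsonl -n 5` behave alike. A standalone sketch of that Typer pattern (the `show` command name is made up for illustration):

```python
from typing import Optional

import typer

app = typer.Typer()

@app.command()
def show(
    filename: str = typer.Argument(..., help="Input file path"),
    num_arg: Optional[int] = typer.Argument(None, metavar="NUM", help="Number of records"),
    num: int = typer.Option(10, "--num", "-n", help="Number of records"),
):
    # The positional NUM, when present, takes precedence over --num,
    # so `show data.jsonl 5` and `show data.jsonl -n 5` do the same thing.
    actual_num = num_arg if num_arg is not None else num
    typer.echo(f"showing {actual_num} records from {filename}")

if __name__ == "__main__":
    app()
```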

{dtflow-0.4.1 → dtflow-0.4.3}/dtflow/__main__.py

@@ -161,9 +172,10 @@ def clean(
 def stats(
     filename: str = typer.Argument(..., help="Input file path"),
     top: int = typer.Option(10, "--top", "-n", help="Show the top N values"),
+    full: bool = typer.Option(False, "--full", "-f", help="Full mode: detailed info such as value distributions and unique counts"),
 ):
     """Show statistics for a data file"""
-    _stats(filename, top)
+    _stats(filename, top, full)
 
 
 @app.command("token-stats")

dtflow-0.4.3/dtflow/cli/clean.py (new file)

@@ -0,0 +1,486 @@
+"""
+CLI commands for data cleaning and deduplication
+"""
+
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from ..core import DataTransformer
+from ..storage.io import save_data
+from ..streaming import load_stream
+from ..utils.field_path import get_field_with_spec
+from .common import (
+    _check_file_format,
+    _get_value_len,
+    _is_empty_value,
+    _is_streaming_supported,
+    _parse_field_list,
+)
+
+
+def dedupe(
+    filename: str,
+    key: Optional[str] = None,
+    similar: Optional[float] = None,
+    output: Optional[str] = None,
+) -> None:
+    """
+    Deduplicate data.
+
+    Two modes are supported:
+    1. Exact dedupe (default): only fully identical records are removed
+    2. Similarity dedupe: MinHash+LSH; records above the similarity threshold are removed
+
+    Args:
+        filename: input file path; csv/excel/jsonl/json/parquet/arrow/feather are supported
+        key: field(s) to dedupe on, with nested-path syntax:
+            - meta.source            nested field
+            - messages[0].role       array index
+            - messages[-1].content   negative index
+            - messages.#             array length
+            - messages[*].role:join  expand all elements
+            Separate multiple fields with commas; if omitted, whole records are compared
+        similar: similarity threshold (0-1); enables similarity mode, requires --key
+        output: output file path; if omitted, the input file is overwritten
+
+    Examples:
+        dt dedupe data.jsonl                             # exact dedupe on whole records
+        dt dedupe data.jsonl --key=text                  # exact dedupe on the text field
+        dt dedupe data.jsonl --key=user,timestamp        # exact dedupe on a field combination
+        dt dedupe data.jsonl --key=meta.id               # dedupe on a nested field
+        dt dedupe data.jsonl --key=messages[0].content   # dedupe on the first message's content
+        dt dedupe data.jsonl --key=text --similar=0.8    # similarity dedupe
+    """
+    filepath = Path(filename)
+
+    if not filepath.exists():
+        print(f"Error: file not found - {filename}")
+        return
+
+    if not _check_file_format(filepath):
+        return
+
+    # Similarity mode requires a key
+    if similar is not None and not key:
+        print("Error: similarity dedupe requires the --key option")
+        return
+
+    if similar is not None and (similar <= 0 or similar > 1):
+        print("Error: --similar must be between 0 and 1")
+        return
+
+    # Load the data
+    print(f"📊 Loading data: {filepath}")
+    try:
+        dt = DataTransformer.load(str(filepath))
+    except Exception as e:
+        print(f"Error: cannot read file - {e}")
+        return
+
+    original_count = len(dt)
+    print(f" {original_count} records in total")
+
+    # Run the dedupe
+    if similar is not None:
+        # Similarity mode
+        print(f"🔑 Similarity dedupe: field={key}, threshold={similar}")
+        print("🔄 Deduplicating (MinHash+LSH)...")
+        try:
+            result = dt.dedupe_similar(key, threshold=similar)
+        except ImportError as e:
+            print(f"Error: {e}")
+            return
+    else:
+        # Exact mode
+        dedupe_key: Any = None
+        if key:
+            keys = [k.strip() for k in key.split(",")]
+            if len(keys) == 1:
+                dedupe_key = keys[0]
+                print(f"🔑 Exact dedupe on field: {dedupe_key}")
+            else:
+                dedupe_key = keys
+                print(f"🔑 Exact dedupe on field combination: {', '.join(dedupe_key)}")
+        else:
+            print("🔑 Exact dedupe on whole records")
+
+        print("🔄 Deduplicating...")
+        result = dt.dedupe(dedupe_key)
+
+    dedupe_count = len(result)
+    removed_count = original_count - dedupe_count
+
+    # Save the result
+    output_path = output or str(filepath)
+    print(f"💾 Saving result: {output_path}")
+    try:
+        result.save(output_path)
+    except Exception as e:
+        print(f"Error: cannot save file - {e}")
+        return
+
+    print(f"\n✅ Done! Removed {removed_count} duplicates, {dedupe_count} records remaining")
+
+
+def clean(
+    filename: str,
+    drop_empty: Optional[str] = None,
+    min_len: Optional[str] = None,
+    max_len: Optional[str] = None,
+    keep: Optional[str] = None,
+    drop: Optional[str] = None,
+    strip: bool = False,
+    output: Optional[str] = None,
+) -> None:
+    """
+    Clean data (streaming by default).
+
+    Args:
+        filename: input file path; csv/excel/jsonl/json/parquet/arrow/feather are supported
+        drop_empty: drop records with empty values; supports nested-path syntax
+            - without a value: drop records where any field is empty
+            - with fields: drop records where the given fields are empty (comma-separated)
+        min_len: minimum-length filter in the form "field:length"; the field supports nested paths
+        max_len: maximum-length filter in the form "field:length"; the field supports nested paths
+        keep: keep only the given fields (comma-separated, top-level fields only)
+        drop: drop the given fields (comma-separated, top-level fields only)
+        strip: strip surrounding whitespace from all string fields
+        output: output file path; if omitted, the input file is overwritten
+
+    Examples:
+        dt clean data.jsonl --drop-empty                        # drop records with any empty field
+        dt clean data.jsonl --drop-empty=text,answer            # drop records where these fields are empty
+        dt clean data.jsonl --drop-empty=meta.source            # drop records where a nested field is empty
+        dt clean data.jsonl --min-len=text:10                   # text must be at least 10 characters
+        dt clean data.jsonl --min-len=messages.#:2              # at least 2 messages
+        dt clean data.jsonl --max-len=messages[-1].content:500  # last message at most 500 characters
+        dt clean data.jsonl --keep=question,answer              # keep only these fields
+        dt clean data.jsonl --drop=metadata,timestamp           # drop these fields
+        dt clean data.jsonl --strip                             # strip surrounding whitespace from strings
+    """
+    filepath = Path(filename)
+
+    if not filepath.exists():
+        print(f"Error: file not found - {filename}")
+        return
+
+    if not _check_file_format(filepath):
+        return
+
+    # Parse the options
+    min_len_field, min_len_value = _parse_len_param(min_len) if min_len else (None, None)
+    max_len_field, max_len_value = _parse_len_param(max_len) if max_len else (None, None)
+    keep_fields = _parse_field_list(keep) if keep else None
+    drop_fields_set = set(_parse_field_list(drop)) if drop else None
+    keep_set = set(keep_fields) if keep_fields else None
+
+    # Build the cleaning configuration
+    empty_fields = None
+    if drop_empty is not None:
+        if drop_empty == "" or drop_empty is True:
+            print("🔄 Dropping records where any field is empty...")
+            empty_fields = []
+        else:
+            empty_fields = _parse_field_list(drop_empty)
+            print(f"🔄 Dropping records with empty fields: {', '.join(empty_fields)}")
+
+    if strip:
+        print("🔄 Stripping surrounding whitespace from strings...")
+    if min_len_field:
+        print(f"🔄 Filtering records with {min_len_field} length < {min_len_value}...")
+    if max_len_field:
+        print(f"🔄 Filtering records with {max_len_field} length > {max_len_value}...")
+    if keep_fields:
+        print(f"🔄 Keeping only fields: {', '.join(keep_fields)}")
+    if drop_fields_set:
+        print(f"🔄 Dropping fields: {', '.join(drop_fields_set)}")
+
+    output_path = output or str(filepath)
+
+    # Check whether input and output are the same file (streaming needs a temp file)
+    input_resolved = filepath.resolve()
+    output_resolved = Path(output_path).resolve()
+    use_temp_file = input_resolved == output_resolved
+
+    # Use streaming for JSONL files
+    if _is_streaming_supported(filepath):
+        print(f"📊 Streaming load: {filepath}")
+
+        # If input and output are the same, go through a temp file
+        if use_temp_file:
+            print("⚠ Output file is the same as the input file; using a temporary file")
+            temp_fd, temp_path = tempfile.mkstemp(
+                suffix=output_resolved.suffix,
+                prefix=".tmp_",
+                dir=output_resolved.parent,
+            )
+            os.close(temp_fd)
+            actual_output = temp_path
+        else:
+            actual_output = output_path
+
+        try:
+            count = _clean_streaming(
+                str(filepath),
+                actual_output,
+                strip=strip,
+                empty_fields=empty_fields,
+                min_len_field=min_len_field,
+                min_len_value=min_len_value,
+                max_len_field=max_len_field,
+                max_len_value=max_len_value,
+                keep_set=keep_set,
+                drop_fields_set=drop_fields_set,
+            )
+
+            # If a temp file was used, move it into place
+            if use_temp_file:
+                shutil.move(temp_path, output_path)
+
+            print(f"💾 Saving result: {output_path}")
+            print(f"\n✅ Done! {count} records after cleaning")
+        except Exception as e:
+            # Clean up the temp file
+            if use_temp_file and os.path.exists(temp_path):
+                os.unlink(temp_path)
+            print(f"Error: cleaning failed - {e}")
+            import traceback
+
+            traceback.print_exc()
+        return
+
+    # Fall back to the non-streaming path for other formats
+    print(f"📊 Loading data: {filepath}")
+    try:
+        dt = DataTransformer.load(str(filepath))
+    except Exception as e:
+        print(f"Error: cannot read file - {e}")
+        return
+
+    original_count = len(dt)
+    print(f" {original_count} records in total")
+
+    # Run all cleaning steps in a single pass
+    data, step_stats = _clean_data_single_pass(
+        dt.data,
+        strip=strip,
+        empty_fields=empty_fields,
+        min_len_field=min_len_field,
+        min_len_value=min_len_value,
+        max_len_field=max_len_field,
+        max_len_value=max_len_value,
+        keep_fields=keep_fields,
+        drop_fields=drop_fields_set,
+    )
+
+    # Save the result
+    final_count = len(data)
+    print(f"💾 Saving result: {output_path}")
+
+    try:
+        save_data(data, output_path)
+    except Exception as e:
+        print(f"Error: cannot save file - {e}")
+        return
+
+    # Print the statistics
+    removed_count = original_count - final_count
+    print(f"\n✅ Done!")
+    print(f" Original: {original_count} -> after cleaning: {final_count} (removed {removed_count})")
+    if step_stats:
+        print(f" Steps: {' | '.join(step_stats)}")
+
+
+def _parse_len_param(param: str) -> tuple:
+    """Parse a length option in the form 'field:length'."""
+    if ":" not in param:
+        raise ValueError(f"Bad length option: {param}; expected 'field:length'")
+    parts = param.split(":", 1)
+    field = parts[0].strip()
+    try:
+        length = int(parts[1].strip())
+    except ValueError:
+        raise ValueError(f"Length must be an integer: {parts[1]}")
+    return field, length
+
+
+def _clean_data_single_pass(
+    data: List[Dict],
+    strip: bool = False,
+    empty_fields: Optional[List[str]] = None,
+    min_len_field: Optional[str] = None,
+    min_len_value: Optional[int] = None,
+    max_len_field: Optional[str] = None,
+    max_len_value: Optional[int] = None,
+    keep_fields: Optional[List[str]] = None,
+    drop_fields: Optional[set] = None,
+) -> tuple:
+    """
+    Run all cleaning steps in a single pass over the data.
+
+    Args:
+        data: the raw list of records
+        strip: whether to strip surrounding whitespace from strings
+        empty_fields: fields to check for empty values (nested paths supported);
+            an empty list means check all fields, None means skip the check
+        min_len_field: field for the minimum-length check (nested paths supported)
+        min_len_value: the minimum length
+        max_len_field: field for the maximum-length check (nested paths supported)
+        max_len_value: the maximum length
+        keep_fields: fields to keep (top-level fields only)
+        drop_fields: set of fields to drop (top-level fields only)
+
+    Returns:
+        (cleaned data, list of per-step statistics)
+    """
+    result = []
+    stats = {
+        "drop_empty": 0,
+        "min_len": 0,
+        "max_len": 0,
+    }
+
+    # Precompute the keep_fields set, if any
+    keep_set = set(keep_fields) if keep_fields else None
+
+    for item in data:
+        # 1. strip (before filtering, so the empty-value check is more accurate)
+        if strip:
+            item = {k: v.strip() if isinstance(v, str) else v for k, v in item.items()}
+
+        # 2. Empty-value filtering
+        if empty_fields is not None:
+            if len(empty_fields) == 0:
+                # Check all fields
+                if any(_is_empty_value(v) for v in item.values()):
+                    stats["drop_empty"] += 1
+                    continue
+            else:
+                # Check the given fields (nested paths supported)
+                if any(_is_empty_value(get_field_with_spec(item, f)) for f in empty_fields):
+                    stats["drop_empty"] += 1
+                    continue
+
+        # 3. Minimum-length filtering (nested paths supported)
+        if min_len_field is not None:
+            if _get_value_len(get_field_with_spec(item, min_len_field, default="")) < min_len_value:
+                stats["min_len"] += 1
+                continue
+
+        # 4. Maximum-length filtering (nested paths supported)
+        if max_len_field is not None:
+            if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
+                stats["max_len"] += 1
+                continue
+
+        # 5. Field management (keep/drop)
+        if keep_set is not None:
+            item = {k: v for k, v in item.items() if k in keep_set}
+        elif drop_fields is not None:
+            item = {k: v for k, v in item.items() if k not in drop_fields}
+
+        result.append(item)
+
+    # Build the list of per-step statistics strings
+    step_stats = []
+    if strip:
+        step_stats.append("strip")
+    if stats["drop_empty"] > 0:
+        step_stats.append(f"drop-empty: -{stats['drop_empty']}")
+    if stats["min_len"] > 0:
+        step_stats.append(f"min-len: -{stats['min_len']}")
+    if stats["max_len"] > 0:
+        step_stats.append(f"max-len: -{stats['max_len']}")
+    if keep_fields:
+        step_stats.append(f"keep: {len(keep_fields)} fields")
+    if drop_fields:
+        step_stats.append(f"drop: {len(drop_fields)} fields")
+
+    return result, step_stats
+
+
+def _clean_streaming(
+    input_path: str,
+    output_path: str,
+    strip: bool = False,
+    empty_fields: Optional[List[str]] = None,
+    min_len_field: Optional[str] = None,
+    min_len_value: Optional[int] = None,
+    max_len_field: Optional[str] = None,
+    max_len_value: Optional[int] = None,
+    keep_set: Optional[set] = None,
+    drop_fields_set: Optional[set] = None,
+) -> int:
+    """
+    Clean data in streaming mode.
+
+    Returns:
+        the number of records after cleaning
+    """
+
+    def clean_filter(item: Dict) -> bool:
+        """Filter: return True to keep, False to drop (nested paths supported)."""
+        # Empty-value filtering
+        if empty_fields is not None:
+            if len(empty_fields) == 0:
+                if any(_is_empty_value(v) for v in item.values()):
+                    return False
+            else:
+                # Nested paths supported
+                if any(_is_empty_value(get_field_with_spec(item, f)) for f in empty_fields):
+                    return False
+
+        # Minimum-length filtering (nested paths supported)
+        if min_len_field is not None:
+            if _get_value_len(get_field_with_spec(item, min_len_field, default="")) < min_len_value:
+                return False
+
+        # Maximum-length filtering (nested paths supported)
+        if max_len_field is not None:
+            if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
+                return False
+
+        return True
+
+    def clean_transform(item: Dict) -> Dict:
+        """Transform: strip + field management."""
+        # strip
+        if strip:
+            item = {k: v.strip() if isinstance(v, str) else v for k, v in item.items()}
+
+        # Field management
+        if keep_set is not None:
+            item = {k: v for k, v in item.items() if k in keep_set}
+        elif drop_fields_set is not None:
+            item = {k: v for k, v in item.items() if k not in drop_fields_set}
+
+        return item
+
+    # Build the streaming pipeline
+    st = load_stream(input_path)
+
+    # If stripping, run the strip transform first (before filtering, so the empty-value check is more accurate)
+    if strip:
+        st = st.transform(
+            lambda x: {k: v.strip() if isinstance(v, str) else v for k, v in x.items()}
+        )
+
+    # Apply the filters
+    if empty_fields is not None or min_len_field is not None or max_len_field is not None:
+        st = st.filter(clean_filter)
+
+    # Apply field management (needed here even when strip is off)
+    if keep_set is not None or drop_fields_set is not None:
+
+        def field_transform(item):
+            if keep_set is not None:
+                return {k: v for k, v in item.items() if k in keep_set}
+            elif drop_fields_set is not None:
+                return {k: v for k, v in item.items() if k not in drop_fields_set}
+            return item
+
+        st = st.transform(field_transform)
+
+    return st.save(output_path)
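
Two notes on the new module. First, `dedupe` delegates similarity mode to `dt.dedupe_similar`, whose implementation is not part of this diff; the `ImportError` handling suggests it needs an optional dependency. Below is a hedged sketch of MinHash+LSH near-duplicate removal using the `datasketch` package; the library choice, the whitespace tokenization, and `num_perm=128` are all assumptions, not dtflow's actual code.

```python
# Near-duplicate removal with MinHash + LSH (illustrative only).
from datasketch import MinHash, MinHashLSH

def dedupe_similar(texts, threshold=0.8, num_perm=128):
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    kept = []
    for i, text in enumerate(texts):
        m = MinHash(num_perm=num_perm)
        for token in text.split():          # naive whitespace shingling
            m.update(token.encode("utf-8"))
        if lsh.query(m):                    # a similar record was already kept
            continue
        lsh.insert(str(i), m)
        kept.append(text)
    return kept

print(dedupe_similar([
    "the cat sat on the mat",
    "the cat sat on a mat",
    "an entirely different sentence",
]))
```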
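
Second, the nested-path syntax used throughout the docstrings (`meta.source`, `messages[0].role`, negative indices, a trailing `.#` for length) is resolved by `dtflow.utils.field_path.get_field_with_spec`, which this diff does not show. A toy resolver for the dot/index/length forms (the `[*].field:join` form is omitted; this is a sketch, not the real helper):

```python
import re
from typing import Any

# Tokens are either a plain name or a bracketed (possibly negative) index.
_TOKEN = re.compile(r"([^.\[\]]+)|\[(-?\d+)\]")

def get_path(item: Any, spec: str, default: Any = None) -> Any:
    want_len = spec.endswith(".#")          # trailing ".#" asks for the length
    if want_len:
        spec = spec[:-2]
    cur = item
    for name, index in _TOKEN.findall(spec):
        try:
            cur = cur[name] if name else cur[int(index)]
        except (KeyError, IndexError, TypeError):
            return default
    return len(cur) if want_len else cur

row = {
    "meta": {"source": "web"},
    "messages": [{"role": "user", "content": "hi"},
                 {"role": "assistant", "content": "hello"}],
}
assert get_path(row, "meta.source") == "web"
assert get_path(row, "messages[0].role") == "user"
assert get_path(row, "messages[-1].content") == "hello"
assert get_path(row, "messages.#") == 2
```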

dtflow-0.4.3/dtflow/cli/commands.py (new file)

@@ -0,0 +1,56 @@
+"""
+Unified re-export entry point for the CLI commands
+
+The commands are now split into separate modules by function:
+- sample.py    sampling (sample, head, tail)
+- transform.py transformation (transform)
+- stats.py     statistics (stats, token_stats)
+- clean.py     cleaning (clean, dedupe)
+- io_ops.py    IO operations (concat, diff)
+- pipeline.py  pipelines (run)
+- lineage.py   lineage tracking (history)
+- common.py    shared utility functions
+"""
+
+# Sampling commands
+from .sample import head, sample, tail
+
+# Transformation commands
+from .transform import transform
+
+# Statistics commands
+from .stats import stats, token_stats
+
+# Cleaning commands
+from .clean import clean, dedupe
+
+# IO commands
+from .io_ops import concat, diff
+
+# Pipeline commands
+from .pipeline import run
+
+# Lineage commands
+from .lineage import history
+
+__all__ = [
+    # Sampling
+    "sample",
+    "head",
+    "tail",
+    # Transformation
+    "transform",
+    # Statistics
+    "stats",
+    "token_stats",
+    # Cleaning
+    "clean",
+    "dedupe",
+    # IO
+    "concat",
+    "diff",
+    # Pipeline
+    "run",
+    # Lineage
+    "history",
+]
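
Since `commands.py` only re-exports, imports written against the old module path keep working after the split. A quick check of that property, assuming dtflow 0.4.3 is installed:

```python
from dtflow.cli import commands
from dtflow.cli.clean import dedupe

# The re-exported name and the module-level definition are the same object,
# so code written against dtflow.cli.commands is unaffected by the split.
assert commands.dedupe is dedupe
```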