dtflow 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +6 -3
- dtflow/cli/clean.py +486 -0
- dtflow/cli/commands.py +53 -2637
- dtflow/cli/common.py +384 -0
- dtflow/cli/io_ops.py +385 -0
- dtflow/cli/lineage.py +49 -0
- dtflow/cli/pipeline.py +54 -0
- dtflow/cli/sample.py +294 -0
- dtflow/cli/stats.py +589 -0
- dtflow/cli/transform.py +486 -0
- dtflow/core.py +35 -0
- dtflow/storage/io.py +49 -6
- dtflow/streaming.py +25 -4
- {dtflow-0.4.2.dist-info → dtflow-0.4.3.dist-info}/METADATA +12 -1
- dtflow-0.4.3.dist-info/RECORD +33 -0
- dtflow-0.4.2.dist-info/RECORD +0 -25
- {dtflow-0.4.2.dist-info → dtflow-0.4.3.dist-info}/WHEEL +0 -0
- {dtflow-0.4.2.dist-info → dtflow-0.4.3.dist-info}/entry_points.txt +0 -0
dtflow/cli/transform.py
ADDED
@@ -0,0 +1,486 @@
+"""
+CLI 数据转换相关命令
+"""
+
+import os
+import shutil
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import orjson
+
+from ..core import DataTransformer, DictWrapper
+from ..presets import get_preset, list_presets
+from ..storage.io import load_data, save_data
+from ..streaming import load_stream
+from .common import _check_file_format, _is_streaming_supported
+
+CONFIG_DIR = ".dt"
+
+
+def _get_config_path(input_path: Path, config_override: Optional[str] = None) -> Path:
+    """获取配置文件路径"""
+    if config_override:
+        return Path(config_override)
+
+    # 使用输入文件名(不含扩展名)作为配置文件名
+    config_name = input_path.stem + ".py"
+    return input_path.parent / CONFIG_DIR / config_name
+
+
+def transform(
+    filename: str,
+    num: Optional[int] = None,
+    preset: Optional[str] = None,
+    config: Optional[str] = None,
+    output: Optional[str] = None,
+) -> None:
+    """
+    转换数据格式。
+
+    两种使用方式:
+    1. 配置文件模式(默认):自动生成配置文件,编辑后再次运行
+    2. 预设模式:使用 --preset 直接转换
+
+    Args:
+        filename: 输入文件路径,支持 csv/excel/jsonl/json/parquet/arrow/feather 格式
+        num: 只转换前 N 条数据(可选)
+        preset: 使用预设模板(openai_chat, alpaca, sharegpt, dpo_pair, simple_qa)
+        config: 配置文件路径(可选,默认 .dt/<filename>.py)
+        output: 输出文件路径
+
+    Examples:
+        dt transform data.jsonl  # 首次生成配置
+        dt transform data.jsonl 10  # 只转换前 10 条
+        dt transform data.jsonl --preset=openai_chat  # 使用预设
+        dt transform data.jsonl 100 --preset=alpaca  # 预设 + 限制数量
+    """
+    filepath = Path(filename)
+    if not filepath.exists():
+        print(f"错误: 文件不存在 - {filename}")
+        return
+
+    if not _check_file_format(filepath):
+        return
+
+    # 预设模式:直接使用预设转换
+    if preset:
+        _execute_preset_transform(filepath, preset, output, num)
+        return
+
+    # 配置文件模式
+    config_path = _get_config_path(filepath, config)
+
+    if not config_path.exists():
+        _generate_config(filepath, config_path)
+    else:
+        _execute_transform(filepath, config_path, output, num)
+
+
+def _generate_config(input_path: Path, config_path: Path) -> None:
+    """分析输入数据并生成配置文件"""
+    print(f"📊 分析输入数据: {input_path}")
+
+    # 读取数据
+    try:
+        data = load_data(str(input_path))
+    except Exception as e:
+        print(f"错误: 无法读取文件 - {e}")
+        return
+
+    if not data:
+        print("错误: 文件为空")
+        return
+
+    total_count = len(data)
+    sample_item = data[0]
+
+    print(f" 检测到 {total_count} 条数据")
+
+    # 生成配置内容
+    config_content = _build_config_content(sample_item, input_path.name, total_count)
+
+    # 确保配置目录存在
+    config_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # 写入配置文件
+    config_path.write_text(config_content, encoding="utf-8")
+
+    print(f"\n📝 已生成配置文件: {config_path}")
+    print("\n👉 下一步:")
+    print(f" 1. 编辑 {config_path},定义 transform 函数")
+    print(f" 2. 再次执行 dt transform {input_path.name} 完成转换")
+
+
+def _build_config_content(sample: Dict[str, Any], filename: str, total: int) -> str:
+    """构建配置文件内容"""
+    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    # 生成 Item 类的字段定义
+    fields_def = _generate_fields_definition(sample)
+
+    # 生成默认的 transform 函数(简单重命名)
+    field_names = list(sample.keys())
+
+    # 生成规范化的字段名用于示例
+    safe_field1 = _sanitize_field_name(field_names[0])[0] if field_names else "field1"
+    safe_field2 = _sanitize_field_name(field_names[1])[0] if len(field_names) > 1 else "field2"
+
+    # 生成默认输出文件名
+    base_name = Path(filename).stem
+    output_filename = f"{base_name}_output.jsonl"
+
+    config = f'''"""
+DataTransformer 配置文件
+生成时间: {now}
+输入文件: {filename} ({total} 条)
+"""
+
+
+# ===== 输入数据结构(自动生成,IDE 可补全)=====
+
+class Item:
+{fields_def}
+
+
+# ===== 定义转换逻辑 =====
+# 提示:输入 item. 后 IDE 会自动补全可用字段
+
+def transform(item: Item):
+    return {{
+{_generate_default_transform(field_names)}
+    }}
+
+
+# 输出文件路径
+output = "{output_filename}"
+
+
+# ===== 示例 =====
+#
+# 示例1: 构建 OpenAI Chat 格式
+# def transform(item: Item):
+#     return {{
+#         "messages": [
+#             {{"role": "user", "content": item.{safe_field1}}},
+#             {{"role": "assistant", "content": item.{safe_field2}}},
+#         ]
+#     }}
+#
+# 示例2: Alpaca 格式
+# def transform(item: Item):
+#     return {{
+#         "instruction": item.{safe_field1},
+#         "input": "",
+#         "output": item.{safe_field2},
+#     }}
+'''
+    return config
+
+
+def _generate_fields_definition(sample: Dict[str, Any], indent: int = 4) -> str:
+    """生成 Item 类的字段定义"""
+    lines = []
+    prefix = " " * indent
+
+    for key, value in sample.items():
+        type_name = _get_type_name(value)
+        example = _format_example_value(value)
+        safe_key, changed = _sanitize_field_name(key)
+        comment = f" # 原字段名: {key}" if changed else ""
+        lines.append(f"{prefix}{safe_key}: {type_name} = {example}{comment}")
+
+    return "\n".join(lines) if lines else f"{prefix}pass"
+
+
+def _get_type_name(value: Any) -> str:
+    """获取值的类型名称"""
+    if value is None:
+        return "str"
+    if isinstance(value, str):
+        return "str"
+    if isinstance(value, bool):
+        return "bool"
+    if isinstance(value, int):
+        return "int"
+    if isinstance(value, float):
+        return "float"
+    if isinstance(value, list):
+        return "list"
+    if isinstance(value, dict):
+        return "dict"
+    return "str"
+
+
+def _format_example_value(value: Any, max_len: int = 50) -> str:
+    """格式化示例值"""
+    if value is None:
+        return '""'
+    if isinstance(value, str):
+        # 截断长字符串
+        if len(value) > max_len:
+            value = value[:max_len] + "..."
+        # 使用 repr() 自动处理所有转义字符
+        return repr(value)
+    if isinstance(value, bool):
+        return str(value)
+    if isinstance(value, (int, float)):
+        return str(value)
+    if isinstance(value, (list, dict)):
+        s = orjson.dumps(value).decode("utf-8")
+        if len(s) > max_len:
+            return repr(s[:max_len] + "...")
+        return s
+    return '""'
+
+
+def _sanitize_field_name(name: str) -> tuple:
+    """
+    将字段名规范化为合法的 Python 标识符。
+
+    Returns:
+        tuple: (规范化后的名称, 是否被修改)
+    """
+    if name.isidentifier():
+        return name, False
+
+    # 替换常见的非法字符
+    sanitized = name.replace("-", "_").replace(" ", "_").replace(".", "_")
+
+    # 如果以数字开头,添加前缀
+    if sanitized and sanitized[0].isdigit():
+        sanitized = "f_" + sanitized
+
+    # 移除其他非法字符
+    sanitized = "".join(c if c.isalnum() or c == "_" else "_" for c in sanitized)
+
+    # 确保不为空
+    if not sanitized:
+        sanitized = "field"
+
+    return sanitized, True
+
+
+def _generate_default_transform(field_names: List[str]) -> str:
+    """生成默认的 transform 函数体"""
+    lines = []
+    for name in field_names[:5]: # 最多显示 5 个字段
+        safe_name, _ = _sanitize_field_name(name)
+        lines.append(f'        "{name}": item.{safe_name},')
+    return "\n".join(lines) if lines else "        # 在这里定义输出字段"
+
+
+def _unwrap(obj: Any) -> Any:
+    """递归将 DictWrapper 转换为普通 dict"""
+    if hasattr(obj, "to_dict"):
+        return _unwrap(obj.to_dict())
+    if isinstance(obj, dict):
+        return {k: _unwrap(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_unwrap(v) for v in obj]
+    return obj
+
+
+def _execute_transform(
+    input_path: Path,
+    config_path: Path,
+    output_override: Optional[str],
+    num: Optional[int],
+) -> None:
+    """执行数据转换(默认流式处理)"""
+    print(f"📂 加载配置: {config_path}")
+
+    # 动态加载配置文件
+    try:
+        config_ns = _load_config(config_path)
+    except Exception as e:
+        print(f"错误: 无法加载配置文件 - {e}")
+        return
+
+    # 获取 transform 函数
+    if "transform" not in config_ns:
+        print("错误: 配置文件中未定义 transform 函数")
+        return
+
+    transform_func = config_ns["transform"]
+
+    # 获取输出路径
+    output_path = output_override or config_ns.get("output", "output.jsonl")
+
+    # 对于 JSONL 文件使用流式处理
+    if _is_streaming_supported(input_path):
+        print(f"📊 流式加载: {input_path}")
+        print("🔄 执行转换...")
+        try:
+            # 包装转换函数以支持属性访问(配置文件中定义的 Item 类)
+            def wrapped_transform(item):
+                result = transform_func(DictWrapper(item))
+                return _unwrap(result)
+
+            st = load_stream(str(input_path))
+            if num:
+                st = st.head(num)
+            count = st.transform(wrapped_transform).save(output_path)
+            print(f"💾 保存结果: {output_path}")
+            print(f"\n✅ 完成! 已转换 {count} 条数据到 {output_path}")
+        except Exception as e:
+            print(f"错误: 转换失败 - {e}")
+            import traceback
+
+            traceback.print_exc()
+        return
+
+    # 非 JSONL 文件使用传统方式
+    print(f"📊 加载数据: {input_path}")
+    try:
+        dt = DataTransformer.load(str(input_path))
+    except Exception as e:
+        print(f"错误: 无法读取文件 - {e}")
+        return
+
+    total = len(dt)
+    if num:
+        dt = DataTransformer(dt.data[:num])
+        print(f" 处理前 {len(dt)}/{total} 条数据")
+    else:
+        print(f" 共 {total} 条数据")
+
+    # 执行转换(使用 Core 的 to 方法,自动支持属性访问)
+    print("🔄 执行转换...")
+    try:
+        results = dt.to(transform_func)
+    except Exception as e:
+        print(f"错误: 转换失败 - {e}")
+        import traceback
+
+        traceback.print_exc()
+        return
+
+    # 保存结果
+    print(f"💾 保存结果: {output_path}")
+    try:
+        save_data(results, output_path)
+    except Exception as e:
+        print(f"错误: 无法保存文件 - {e}")
+        return
+
+    print(f"\n✅ 完成! 已转换 {len(results)} 条数据到 {output_path}")
+
+
+def _execute_preset_transform(
+    input_path: Path,
+    preset_name: str,
+    output_override: Optional[str],
+    num: Optional[int],
+) -> None:
+    """使用预设模板执行转换(默认流式处理)"""
+    print(f"📂 使用预设: {preset_name}")
+
+    # 获取预设函数
+    try:
+        transform_func = get_preset(preset_name)
+    except ValueError as e:
+        print(f"错误: {e}")
+        print(f"可用预设: {', '.join(list_presets())}")
+        return
+
+    output_path = output_override or f"{input_path.stem}_{preset_name}.jsonl"
+
+    # 检查输入输出是否相同
+    input_resolved = input_path.resolve()
+    output_resolved = Path(output_path).resolve()
+    use_temp_file = input_resolved == output_resolved
+
+    # 对于 JSONL 文件使用流式处理
+    if _is_streaming_supported(input_path):
+        print(f"📊 流式加载: {input_path}")
+        print("🔄 执行转换...")
+
+        # 如果输入输出相同,使用临时文件
+        if use_temp_file:
+            print("⚠ 检测到输出文件与输入文件相同,将使用临时文件")
+            temp_fd, temp_path = tempfile.mkstemp(
+                suffix=output_resolved.suffix,
+                prefix=".tmp_",
+                dir=output_resolved.parent,
+            )
+            os.close(temp_fd)
+            actual_output = temp_path
+        else:
+            actual_output = output_path
+
+        try:
+            # 包装转换函数以支持属性访问
+            def wrapped_transform(item):
+                result = transform_func(DictWrapper(item))
+                return _unwrap(result)
+
+            st = load_stream(str(input_path))
+            if num:
+                st = st.head(num)
+            count = st.transform(wrapped_transform).save(actual_output)
+
+            # 如果使用了临时文件,移动到目标位置
+            if use_temp_file:
+                shutil.move(temp_path, output_path)
+
+            print(f"💾 保存结果: {output_path}")
+            print(f"\n✅ 完成! 已转换 {count} 条数据到 {output_path}")
+        except Exception as e:
+            # 清理临时文件
+            if use_temp_file and os.path.exists(temp_path):
+                os.unlink(temp_path)
+            print(f"错误: 转换失败 - {e}")
+            import traceback
+
+            traceback.print_exc()
+        return
+
+    # 非 JSONL 文件使用传统方式
+    print(f"📊 加载数据: {input_path}")
+    try:
+        dt = DataTransformer.load(str(input_path))
+    except Exception as e:
+        print(f"错误: 无法读取文件 - {e}")
+        return
+
+    total = len(dt)
+    if num:
+        dt = DataTransformer(dt.data[:num])
+        print(f" 处理前 {len(dt)}/{total} 条数据")
+    else:
+        print(f" 共 {total} 条数据")
+
+    # 执行转换
+    print("🔄 执行转换...")
+    try:
+        results = dt.to(transform_func)
+    except Exception as e:
+        print(f"错误: 转换失败 - {e}")
+        import traceback
+
+        traceback.print_exc()
+        return
+
+    # 保存结果
+    print(f"💾 保存结果: {output_path}")
+    try:
+        save_data(results, output_path)
+    except Exception as e:
+        print(f"错误: 无法保存文件 - {e}")
+        return
+
+    print(f"\n✅ 完成! 已转换 {len(results)} 条数据到 {output_path}")
+
+
+def _load_config(config_path: Path) -> Dict[str, Any]:
+    """动态加载 Python 配置文件"""
+    import importlib.util
+
+    spec = importlib.util.spec_from_file_location("dt_config", config_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    return {name: getattr(module, name) for name in dir(module) if not name.startswith("_")}
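For orientation, this is a sketch of the config file that `_generate_config` and `_build_config_content` above would write to `.dt/data.py` for a hypothetical `data.jsonl` whose first record is `{"question": ..., "answer": ...}`; the file name, field names, example values, and record count are illustrative, not taken from the package:

```python
"""
DataTransformer 配置文件
生成时间: 2024-01-01 12:00:00
输入文件: data.jsonl (1000 条)
"""


# ===== 输入数据结构(自动生成,IDE 可补全)=====

class Item:
    question: str = 'What is dtflow?'
    answer: str = 'A flexible data transformation tool.'


# ===== 定义转换逻辑 =====
# 提示:输入 item. 后 IDE 会自动补全可用字段

def transform(item: Item):
    return {
        "question": item.question,
        "answer": item.answer,
    }


# 输出文件路径
output = "data_output.jsonl"
```

Editing `transform` in this file and re-running `dt transform data.jsonl` then takes the `_execute_transform` path and writes `data_output.jsonl`.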
dtflow/core.py
CHANGED
@@ -351,6 +351,41 @@ class DataTransformer:
         tracker.record("tail", {"n": n}, len(self._data), len(data))
         return DataTransformer(data, _lineage_tracker=tracker)
 
+    def validate(
+        self,
+        func: Callable[[Any], bool],
+        raw: bool = False,
+    ) -> List[TransformError]:
+        """
+        验证数据,返回不通过的记录列表。
+
+        Args:
+            func: 验证函数,返回 True 表示通过,False 表示失败
+            raw: 原始模式,跳过 DictWrapper 包装
+
+        Returns:
+            验证失败的记录列表(TransformError)
+
+        Examples:
+            >>> dt = DataTransformer([{"a": 1}, {"a": -1}])
+            >>> errors = dt.validate(lambda x: x.a > 0)
+            >>> len(errors) # 1
+            >>> errors[0].index # 1
+        """
+        errors = []
+        wrapper_func = (lambda x: x) if raw else DictWrapper
+
+        for i, item in enumerate(self._data):
+            try:
+                if not func(wrapper_func(item)):
+                    errors.append(
+                        TransformError(index=i, item=item, error=ValueError("验证未通过"))
+                    )
+            except Exception as e:
+                errors.append(TransformError(index=i, item=item, error=e))
+
+        return errors
+
     def dedupe(
         self,
         key: Union[None, str, List[str], Callable[[Any], Any]] = None,
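A minimal usage sketch of the new `validate` method, following its docstring; the top-level `from dtflow import DataTransformer` import is assumed here:

```python
from dtflow import DataTransformer

dt = DataTransformer([
    {"question": "What is dtflow?", "answer": "A data tool."},
    {"question": "", "answer": None},
])

# Records for which the predicate returns False (or raises) come back as TransformError objects.
errors = dt.validate(lambda x: len(x.question) > 0 and x.answer is not None)

for e in errors:
    print(e.index, e.error)  # -> 1 验证未通过
```

Exceptions raised inside the predicate are not swallowed: they are captured as the `error` field of the corresponding `TransformError`.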
dtflow/storage/io.py
CHANGED
@@ -115,13 +115,40 @@ def _save_jsonl(data: List[Dict[str, Any]], filepath: Path) -> None:
 
 
 def _load_jsonl(filepath: Path) -> List[Dict[str, Any]]:
-    """Load data from JSONL format.
+    """Load data from JSONL format.
+
+    使用 orjson 解析,如果失败(如遇到 NaN 等非标准 JSON)则回退到标准 json。
+    """
+    import json
+    import sys
+
     data = []
+    use_fallback = False
+
     with open(filepath, "rb") as f:
-        for line in f:
+        for i, line in enumerate(f):
             line = line.strip()
-            if line:
-                data.append(orjson.loads(line))
+            if not line:
+                continue
+
+            if use_fallback:
+                # 已确认需要回退,直接用标准 json
+                data.append(json.loads(line))
+            else:
+                try:
+                    data.append(orjson.loads(line))
+                except orjson.JSONDecodeError:
+                    # orjson 解析失败,尝试标准 json(支持 NaN/Infinity)
+                    try:
+                        data.append(json.loads(line))
+                        use_fallback = True
+                        print(
+                            f"[Warning] 第 {i+1} 行包含非标准 JSON(如 NaN),已切换到标准 json 解析",
+                            file=sys.stderr,
+                        )
+                    except json.JSONDecodeError:
+                        raise # 标准 json 也失败,抛出原始错误
+
     return data
 
 
@@ -135,9 +162,25 @@ def _save_json(data: List[Dict[str, Any]], filepath: Path) -> None:
 
 
 def _load_json(filepath: Path) -> List[Dict[str, Any]]:
-    """Load data from JSON format.
+    """Load data from JSON format.
+
+    使用 orjson 解析,如果失败(如遇到 NaN 等非标准 JSON)则回退到标准 json。
+    """
+    import json
+    import sys
+
     with open(filepath, "rb") as f:
-        data = orjson.loads(f.read())
+        content = f.read()
+
+    try:
+        data = orjson.loads(content)
+    except orjson.JSONDecodeError:
+        # orjson 解析失败,回退到标准 json
+        print(
+            f"[Warning] 文件包含非标准 JSON(如 NaN),使用标准 json 解析",
+            file=sys.stderr,
+        )
+        data = json.loads(content)
 
     if not isinstance(data, list):
         data = [data]
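The fallback added to `_load_jsonl` and `_load_json` relies on a real difference between the two parsers: `orjson` rejects the non-standard `NaN`/`Infinity` tokens (they are not valid JSON per RFC 8259), while the stdlib `json` module accepts them. A standalone sketch of that difference:

```python
import json

import orjson

line = b'{"score": NaN}'  # non-standard JSON that some tools still emit

try:
    record = orjson.loads(line)   # strict: raises on NaN/Infinity
except orjson.JSONDecodeError:
    record = json.loads(line)     # lenient: parses NaN as float("nan")

print(record)  # {'score': nan}
```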
dtflow/streaming.py
CHANGED
@@ -622,12 +622,33 @@ def process_shards(
 
 
 def _stream_jsonl(filepath: str) -> Generator[Dict[str, Any], None, None]:
-    """JSONL 流式读取(使用 orjson)"""
+    """JSONL 流式读取(使用 orjson,失败时回退到标准 json)"""
+    import json
+    import sys
+
+    use_fallback = False
+
     with open(filepath, "rb") as f:
-        for line in f:
+        for i, line in enumerate(f):
             line = line.strip()
-            if line:
-                yield orjson.loads(line)
+            if not line:
+                continue
+
+            if use_fallback:
+                yield json.loads(line)
+            else:
+                try:
+                    yield orjson.loads(line)
+                except orjson.JSONDecodeError:
+                    try:
+                        yield json.loads(line)
+                        use_fallback = True
+                        print(
+                            f"[Warning] 第 {i+1} 行包含非标准 JSON(如 NaN),已切换到标准 json 解析",
+                            file=sys.stderr,
+                        )
+                    except json.JSONDecodeError:
+                        raise
 
 
 def _stream_csv(filepath: str, batch_size: int = 10000) -> Generator[Dict[str, Any], None, None]:
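As the CLI code in `dtflow/cli/transform.py` above shows, the streaming path chains `load_stream(...).head(n).transform(fn).save(path)`, with `save()` returning the number of records written. A small sketch of that flow (the file and field names are illustrative):

```python
from dtflow.streaming import load_stream

# Stream a JSONL file, keep the first 100 records, reshape each one, and write the result.
count = (
    load_stream("data.jsonl")
    .head(100)
    .transform(lambda item: {"text": item["question"] + "\n" + item["answer"]})
    .save("data_out.jsonl")
)
print(f"converted {count} records")
```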
{dtflow-0.4.2.dist-info → dtflow-0.4.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dtflow
-Version: 0.4.2
+Version: 0.4.3
 Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
 Project-URL: Homepage, https://github.com/yourusername/DataTransformer
 Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -126,6 +126,17 @@ dt.filter(lambda x: x.score > 0.8)
 dt.filter(lambda x: x.language == "zh")
 ```
 
+### 数据验证
+
+```python
+# 验证数据,返回不通过的记录列表
+errors = dt.validate(lambda x: len(x.messages) >= 2)
+
+if errors:
+    for e in errors[:5]:
+        print(f"第 {e.index} 行: {e.error}")
+```
+
 ### 数据转换
 
 ```python
dtflow-0.4.3.dist-info/RECORD
ADDED
@@ -0,0 +1,33 @@
+dtflow/__init__.py,sha256=F5fBna3PxmrOK34SnZxvolzmyi_pjIxNeDkNzGg8wvA,2347
+dtflow/__main__.py,sha256=vuX2--_gXI25vy-Xb1uqhURFzu05VeMUYxIJ2q-XE7M,11656
+dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
+dtflow/core.py,sha256=5XivbEdcKMrj1wSfju6MDkqdCqkO_mRS-ALWJ3DOcKo,29937
+dtflow/lineage.py,sha256=vQ06lxBHftu-Ma5HlISp3F2eiIvwagQSnUGaLeABDZY,12190
+dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
+dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
+dtflow/streaming.py,sha256=jtWQjkhhZqfyzIaFskXNvooGAYDQBn1b6X8FHgaCZYk,22704
+dtflow/tokenizers.py,sha256=zxE6XZGjZ_DOGCjRSClI9xaAbFVf8FS6jwwssGoi_9U,18111
+dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
+dtflow/cli/clean.py,sha256=y9VCRibgK1j8WIY3h0XZX0m93EdELQC7TdnseMWwS-0,17799
+dtflow/cli/commands.py,sha256=ExcD8Z_uXQhcewvgcPtIlPzsQG4QF93K8Bg6C3uUJHk,1094
+dtflow/cli/common.py,sha256=FsDFVNcLj_874qSg2dGef4V7mqPU9THLchT8PxJpBt8,12955
+dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
+dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
+dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
+dtflow/cli/sample.py,sha256=vPTQlF0OXEry4QjO8uaD9vOae4AQbX9zDwVYOxg59ZI,10339
+dtflow/cli/stats.py,sha256=HByF0sFMqY1kM75dnjTcJbMKDdQNdOt4iDba4au_-pI,20495
+dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
+dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
+dtflow/mcp/__main__.py,sha256=PoT2ZZmJq9xDZxDACJfqDW9Ld_ukHrGNK-0XUd7WGnY,448
+dtflow/mcp/cli.py,sha256=ck0oOS_642cNktxULaMRE7BJfMxsBCwotmCj3PSPwVk,13110
+dtflow/mcp/docs.py,sha256=DI2Vf-eFo4chRP_bDLsv4Uc3kJt8_1emz8N-NBSVirM,8834
+dtflow/mcp/server.py,sha256=Nf0UlqDGhV55ndGuEglfr7VRjDWAC_9rRsNhdr0-ssM,4275
+dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,362
+dtflow/storage/io.py,sha256=ZH2aSE-S89gpy3z4oTqhcqWf4u10OdkDoyul7o_YBDI,23374
+dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
+dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
+dtflow/utils/field_path.py,sha256=WcNA-LZh3H61a77FEzB_R7YAyyZl3M8ofdq05ytQGmI,7459
+dtflow-0.4.3.dist-info/METADATA,sha256=LdjtTbPEuHlqxthUuMX2lqrj2l2kBIfVf-u6jDZmCeI,18524
+dtflow-0.4.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+dtflow-0.4.3.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
+dtflow-0.4.3.dist-info/RECORD,,