dtflow 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +1 -1
- dtflow/cli/commands.py +15 -2
- dtflow/streaming.py +21 -11
- {dtflow-0.4.0.dist-info → dtflow-0.4.1.dist-info}/METADATA +1 -1
- {dtflow-0.4.0.dist-info → dtflow-0.4.1.dist-info}/RECORD +7 -7
- {dtflow-0.4.0.dist-info → dtflow-0.4.1.dist-info}/WHEEL +0 -0
- {dtflow-0.4.0.dist-info → dtflow-0.4.1.dist-info}/entry_points.txt +0 -0
dtflow/__init__.py
CHANGED
dtflow/cli/commands.py
CHANGED
|
@@ -796,6 +796,17 @@ def _generate_default_transform(field_names: List[str]) -> str:
|
|
|
796
796
|
return "\n".join(lines) if lines else " # 在这里定义输出字段"
|
|
797
797
|
|
|
798
798
|
|
|
799
|
+
def _unwrap(obj: Any) -> Any:
|
|
800
|
+
"""递归将 DictWrapper 转换为普通 dict"""
|
|
801
|
+
if hasattr(obj, "to_dict"):
|
|
802
|
+
return _unwrap(obj.to_dict())
|
|
803
|
+
if isinstance(obj, dict):
|
|
804
|
+
return {k: _unwrap(v) for k, v in obj.items()}
|
|
805
|
+
if isinstance(obj, list):
|
|
806
|
+
return [_unwrap(v) for v in obj]
|
|
807
|
+
return obj
|
|
808
|
+
|
|
809
|
+
|
|
799
810
|
def _execute_transform(
|
|
800
811
|
input_path: Path,
|
|
801
812
|
config_path: Path,
|
|
@@ -829,7 +840,8 @@ def _execute_transform(
|
|
|
829
840
|
try:
|
|
830
841
|
# 包装转换函数以支持属性访问(配置文件中定义的 Item 类)
|
|
831
842
|
def wrapped_transform(item):
|
|
832
|
-
|
|
843
|
+
result = transform_func(DictWrapper(item))
|
|
844
|
+
return _unwrap(result)
|
|
833
845
|
|
|
834
846
|
st = load_stream(str(input_path))
|
|
835
847
|
if num:
|
|
@@ -926,7 +938,8 @@ def _execute_preset_transform(
|
|
|
926
938
|
try:
|
|
927
939
|
# 包装转换函数以支持属性访问
|
|
928
940
|
def wrapped_transform(item):
|
|
929
|
-
|
|
941
|
+
result = transform_func(DictWrapper(item))
|
|
942
|
+
return _unwrap(result)
|
|
930
943
|
|
|
931
944
|
st = load_stream(str(input_path))
|
|
932
945
|
if num:
|
dtflow/streaming.py
CHANGED
|
@@ -84,6 +84,8 @@ class StreamingTransformer:
|
|
|
84
84
|
self._source_path = source_path
|
|
85
85
|
self._total = total
|
|
86
86
|
self._operations: List[Dict[str, Any]] = []
|
|
87
|
+
self._error_count = 0
|
|
88
|
+
self._first_error: Optional[str] = None
|
|
87
89
|
|
|
88
90
|
@classmethod
|
|
89
91
|
def load_stream(cls, filepath: str, batch_size: int = 10000) -> "StreamingTransformer":
|
|
@@ -194,17 +196,20 @@ class StreamingTransformer:
|
|
|
194
196
|
Returns:
|
|
195
197
|
新的 StreamingTransformer(惰性,不立即执行)
|
|
196
198
|
"""
|
|
199
|
+
# transform 是 1:1 转换,保留 total
|
|
200
|
+
new_st = StreamingTransformer(iter([]), self._source_path, total=self._total)
|
|
201
|
+
new_st._operations = self._operations + [{"type": "transform", "func": func}]
|
|
197
202
|
|
|
198
203
|
def transformed_iterator():
|
|
199
204
|
for item in self._iterator:
|
|
200
205
|
try:
|
|
201
206
|
yield func(item)
|
|
202
|
-
except Exception:
|
|
203
|
-
|
|
207
|
+
except Exception as e:
|
|
208
|
+
new_st._error_count += 1
|
|
209
|
+
if new_st._first_error is None:
|
|
210
|
+
new_st._first_error = f"{type(e).__name__}: {e}"
|
|
204
211
|
|
|
205
|
-
|
|
206
|
-
new_st = StreamingTransformer(transformed_iterator(), self._source_path, total=self._total)
|
|
207
|
-
new_st._operations = self._operations + [{"type": "transform", "func": func}]
|
|
212
|
+
new_st._iterator = transformed_iterator()
|
|
208
213
|
return new_st
|
|
209
214
|
|
|
210
215
|
def head(self, n: int) -> "StreamingTransformer":
|
|
@@ -299,16 +304,21 @@ class StreamingTransformer:
|
|
|
299
304
|
ext = path.suffix.lower()
|
|
300
305
|
|
|
301
306
|
if ext == ".jsonl":
|
|
302
|
-
|
|
307
|
+
count = self._save_jsonl(filepath, show_progress)
|
|
303
308
|
elif ext == ".csv":
|
|
304
|
-
|
|
309
|
+
count = self._save_batched(filepath, "csv", batch_size, show_progress)
|
|
305
310
|
elif ext == ".parquet":
|
|
306
|
-
|
|
311
|
+
count = self._save_batched(filepath, "parquet", batch_size, show_progress)
|
|
307
312
|
elif ext in (".arrow", ".feather"):
|
|
308
|
-
|
|
313
|
+
count = self._save_batched(filepath, "arrow", batch_size, show_progress)
|
|
309
314
|
else:
|
|
310
|
-
|
|
311
|
-
|
|
315
|
+
count = self._save_jsonl(filepath, show_progress)
|
|
316
|
+
|
|
317
|
+
# 打印错误摘要
|
|
318
|
+
if self._error_count > 0:
|
|
319
|
+
print(f"⚠️ 跳过 {self._error_count} 条错误记录: {self._first_error}")
|
|
320
|
+
|
|
321
|
+
return count
|
|
312
322
|
|
|
313
323
|
def _save_jsonl(self, filepath: str, show_progress: bool) -> int:
|
|
314
324
|
"""JSONL 逐行流式保存(使用 orjson)"""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dtflow
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
|
|
5
5
|
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
|
|
6
6
|
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
dtflow/__init__.py,sha256=
|
|
1
|
+
dtflow/__init__.py,sha256=Gd9Us_BDXaxmMIGlz51E6OZDohqzweOrvB-2j8k3KVs,2347
|
|
2
2
|
dtflow/__main__.py,sha256=7lKluJTruDPN4CKSK2mWLUxSUlVLtkrqXyRMjlGk7SY,10595
|
|
3
3
|
dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
|
|
4
4
|
dtflow/core.py,sha256=szm9qmRVe1Q97O18UTGz7xTsdV-V8L4D6Bl1bxBJCWk,28778
|
|
5
5
|
dtflow/lineage.py,sha256=vQ06lxBHftu-Ma5HlISp3F2eiIvwagQSnUGaLeABDZY,12190
|
|
6
6
|
dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
|
|
7
7
|
dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
|
|
8
|
-
dtflow/streaming.py,sha256=
|
|
8
|
+
dtflow/streaming.py,sha256=lYf9gi5U-3oqr7oEe5mENx1r-LtRb2YfGNq1fP3_sw4,21972
|
|
9
9
|
dtflow/tokenizers.py,sha256=zxE6XZGjZ_DOGCjRSClI9xaAbFVf8FS6jwwssGoi_9U,18111
|
|
10
10
|
dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
|
|
11
|
-
dtflow/cli/commands.py,sha256=
|
|
11
|
+
dtflow/cli/commands.py,sha256=8t_HgFuFqGt1HXPpEDV47qB2fwMD5C6d9Bjj-VNb37I,84958
|
|
12
12
|
dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
|
|
13
13
|
dtflow/mcp/__main__.py,sha256=PoT2ZZmJq9xDZxDACJfqDW9Ld_ukHrGNK-0XUd7WGnY,448
|
|
14
14
|
dtflow/mcp/cli.py,sha256=ck0oOS_642cNktxULaMRE7BJfMxsBCwotmCj3PSPwVk,13110
|
|
@@ -19,7 +19,7 @@ dtflow/storage/io.py,sha256=XNWLL10a7jgOjM1IfTN9kIuW23dwzFE1nnaw4E3LaiU,21885
|
|
|
19
19
|
dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
|
|
20
20
|
dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
|
|
21
21
|
dtflow/utils/field_path.py,sha256=WcNA-LZh3H61a77FEzB_R7YAyyZl3M8ofdq05ytQGmI,7459
|
|
22
|
-
dtflow-0.4.
|
|
23
|
-
dtflow-0.4.
|
|
24
|
-
dtflow-0.4.
|
|
25
|
-
dtflow-0.4.
|
|
22
|
+
dtflow-0.4.1.dist-info/METADATA,sha256=-rdgDNFMy3pPO5mpMcKlB_quxSlD9mUIoe_tIUXoPP4,18306
|
|
23
|
+
dtflow-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
24
|
+
dtflow-0.4.1.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
25
|
+
dtflow-0.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|