dtflow 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/__init__.py CHANGED
@@ -42,7 +42,7 @@ from .tokenizers import (
42
42
  token_stats,
43
43
  )
44
44
 
45
- __version__ = "0.4.0"
45
+ __version__ = "0.4.1"
46
46
 
47
47
  __all__ = [
48
48
  # core
dtflow/cli/commands.py CHANGED
@@ -796,6 +796,17 @@ def _generate_default_transform(field_names: List[str]) -> str:
796
796
  return "\n".join(lines) if lines else " # 在这里定义输出字段"
797
797
 
798
798
 
799
+ def _unwrap(obj: Any) -> Any:
800
+ """递归将 DictWrapper 转换为普通 dict"""
801
+ if hasattr(obj, "to_dict"):
802
+ return _unwrap(obj.to_dict())
803
+ if isinstance(obj, dict):
804
+ return {k: _unwrap(v) for k, v in obj.items()}
805
+ if isinstance(obj, list):
806
+ return [_unwrap(v) for v in obj]
807
+ return obj
808
+
809
+
799
810
  def _execute_transform(
800
811
  input_path: Path,
801
812
  config_path: Path,
@@ -829,7 +840,8 @@ def _execute_transform(
829
840
  try:
830
841
  # 包装转换函数以支持属性访问(配置文件中定义的 Item 类)
831
842
  def wrapped_transform(item):
832
- return transform_func(DictWrapper(item))
843
+ result = transform_func(DictWrapper(item))
844
+ return _unwrap(result)
833
845
 
834
846
  st = load_stream(str(input_path))
835
847
  if num:
@@ -926,7 +938,8 @@ def _execute_preset_transform(
926
938
  try:
927
939
  # 包装转换函数以支持属性访问
928
940
  def wrapped_transform(item):
929
- return transform_func(DictWrapper(item))
941
+ result = transform_func(DictWrapper(item))
942
+ return _unwrap(result)
930
943
 
931
944
  st = load_stream(str(input_path))
932
945
  if num:
dtflow/streaming.py CHANGED
@@ -84,6 +84,8 @@ class StreamingTransformer:
84
84
  self._source_path = source_path
85
85
  self._total = total
86
86
  self._operations: List[Dict[str, Any]] = []
87
+ self._error_count = 0
88
+ self._first_error: Optional[str] = None
87
89
 
88
90
  @classmethod
89
91
  def load_stream(cls, filepath: str, batch_size: int = 10000) -> "StreamingTransformer":
@@ -194,17 +196,20 @@ class StreamingTransformer:
194
196
  Returns:
195
197
  新的 StreamingTransformer(惰性,不立即执行)
196
198
  """
199
+ # transform 是 1:1 转换,保留 total
200
+ new_st = StreamingTransformer(iter([]), self._source_path, total=self._total)
201
+ new_st._operations = self._operations + [{"type": "transform", "func": func}]
197
202
 
198
203
  def transformed_iterator():
199
204
  for item in self._iterator:
200
205
  try:
201
206
  yield func(item)
202
- except Exception:
203
- pass # 跳过错误
207
+ except Exception as e:
208
+ new_st._error_count += 1
209
+ if new_st._first_error is None:
210
+ new_st._first_error = f"{type(e).__name__}: {e}"
204
211
 
205
- # transform 是 1:1 转换,保留 total
206
- new_st = StreamingTransformer(transformed_iterator(), self._source_path, total=self._total)
207
- new_st._operations = self._operations + [{"type": "transform", "func": func}]
212
+ new_st._iterator = transformed_iterator()
208
213
  return new_st
209
214
 
210
215
  def head(self, n: int) -> "StreamingTransformer":
@@ -299,16 +304,21 @@ class StreamingTransformer:
299
304
  ext = path.suffix.lower()
300
305
 
301
306
  if ext == ".jsonl":
302
- return self._save_jsonl(filepath, show_progress)
307
+ count = self._save_jsonl(filepath, show_progress)
303
308
  elif ext == ".csv":
304
- return self._save_batched(filepath, "csv", batch_size, show_progress)
309
+ count = self._save_batched(filepath, "csv", batch_size, show_progress)
305
310
  elif ext == ".parquet":
306
- return self._save_batched(filepath, "parquet", batch_size, show_progress)
311
+ count = self._save_batched(filepath, "parquet", batch_size, show_progress)
307
312
  elif ext in (".arrow", ".feather"):
308
- return self._save_batched(filepath, "arrow", batch_size, show_progress)
313
+ count = self._save_batched(filepath, "arrow", batch_size, show_progress)
309
314
  else:
310
- # 默认 JSONL
311
- return self._save_jsonl(filepath, show_progress)
315
+ count = self._save_jsonl(filepath, show_progress)
316
+
317
+ # 打印错误摘要
318
+ if self._error_count > 0:
319
+ print(f"⚠️ 跳过 {self._error_count} 条错误记录: {self._first_error}")
320
+
321
+ return count
312
322
 
313
323
  def _save_jsonl(self, filepath: str, show_progress: bool) -> int:
314
324
  """JSONL 逐行流式保存(使用 orjson)"""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dtflow
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
5
5
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
6
6
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -1,14 +1,14 @@
1
- dtflow/__init__.py,sha256=OF6TdEQPvEpcAsuBBsHeycXo6OfDY_Ar_YWaMPhiBFI,2347
1
+ dtflow/__init__.py,sha256=Gd9Us_BDXaxmMIGlz51E6OZDohqzweOrvB-2j8k3KVs,2347
2
2
  dtflow/__main__.py,sha256=7lKluJTruDPN4CKSK2mWLUxSUlVLtkrqXyRMjlGk7SY,10595
3
3
  dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
4
4
  dtflow/core.py,sha256=szm9qmRVe1Q97O18UTGz7xTsdV-V8L4D6Bl1bxBJCWk,28778
5
5
  dtflow/lineage.py,sha256=vQ06lxBHftu-Ma5HlISp3F2eiIvwagQSnUGaLeABDZY,12190
6
6
  dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
7
7
  dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
8
- dtflow/streaming.py,sha256=O8waTXDOEk_6ES_H3-TKTc3zyc-EC8DjOfgepAKV96A,21556
8
+ dtflow/streaming.py,sha256=lYf9gi5U-3oqr7oEe5mENx1r-LtRb2YfGNq1fP3_sw4,21972
9
9
  dtflow/tokenizers.py,sha256=zxE6XZGjZ_DOGCjRSClI9xaAbFVf8FS6jwwssGoi_9U,18111
10
10
  dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
11
- dtflow/cli/commands.py,sha256=1NEHcwNq68le-YEy70j5bacn4RLWSJj_HWcZkOUl2bI,84537
11
+ dtflow/cli/commands.py,sha256=8t_HgFuFqGt1HXPpEDV47qB2fwMD5C6d9Bjj-VNb37I,84958
12
12
  dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
13
13
  dtflow/mcp/__main__.py,sha256=PoT2ZZmJq9xDZxDACJfqDW9Ld_ukHrGNK-0XUd7WGnY,448
14
14
  dtflow/mcp/cli.py,sha256=ck0oOS_642cNktxULaMRE7BJfMxsBCwotmCj3PSPwVk,13110
@@ -19,7 +19,7 @@ dtflow/storage/io.py,sha256=XNWLL10a7jgOjM1IfTN9kIuW23dwzFE1nnaw4E3LaiU,21885
19
19
  dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
20
20
  dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
21
21
  dtflow/utils/field_path.py,sha256=WcNA-LZh3H61a77FEzB_R7YAyyZl3M8ofdq05ytQGmI,7459
22
- dtflow-0.4.0.dist-info/METADATA,sha256=HJhvSYxPG6wmYZPx0qLVQLSbmyK0CIp8qiu3ppe65mA,18306
23
- dtflow-0.4.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
24
- dtflow-0.4.0.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
25
- dtflow-0.4.0.dist-info/RECORD,,
22
+ dtflow-0.4.1.dist-info/METADATA,sha256=-rdgDNFMy3pPO5mpMcKlB_quxSlD9mUIoe_tIUXoPP4,18306
23
+ dtflow-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
24
+ dtflow-0.4.1.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
25
+ dtflow-0.4.1.dist-info/RECORD,,
File without changes