dtflow 0.4.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {dtflow-0.4.0 → dtflow-0.4.1}/PKG-INFO +1 -1
  2. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/__init__.py +1 -1
  3. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/cli/commands.py +15 -2
  4. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/streaming.py +21 -11
  5. {dtflow-0.4.0 → dtflow-0.4.1}/tests/test_streaming.py +19 -0
  6. {dtflow-0.4.0 → dtflow-0.4.1}/tests/test_transformer.py +49 -0
  7. {dtflow-0.4.0 → dtflow-0.4.1}/.gitignore +0 -0
  8. {dtflow-0.4.0 → dtflow-0.4.1}/README.md +0 -0
  9. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/__main__.py +0 -0
  10. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/cli/__init__.py +0 -0
  11. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/converters.py +0 -0
  12. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/core.py +0 -0
  13. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/lineage.py +0 -0
  14. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/mcp/__init__.py +0 -0
  15. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/mcp/__main__.py +0 -0
  16. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/mcp/cli.py +0 -0
  17. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/mcp/docs.py +0 -0
  18. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/mcp/server.py +0 -0
  19. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/pipeline.py +0 -0
  20. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/presets.py +0 -0
  21. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/storage/__init__.py +0 -0
  22. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/storage/io.py +0 -0
  23. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/tokenizers.py +0 -0
  24. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/utils/__init__.py +0 -0
  25. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/utils/display.py +0 -0
  26. {dtflow-0.4.0 → dtflow-0.4.1}/dtflow/utils/field_path.py +0 -0
  27. {dtflow-0.4.0 → dtflow-0.4.1}/pyproject.toml +0 -0
  28. {dtflow-0.4.0 → dtflow-0.4.1}/tests/benchmark_io.py +0 -0
  29. {dtflow-0.4.0 → dtflow-0.4.1}/tests/test_converters.py +0 -0
  30. {dtflow-0.4.0 → dtflow-0.4.1}/tests/test_field_path.py +0 -0
  31. {dtflow-0.4.0 → dtflow-0.4.1}/tests/test_io.py +0 -0
  32. {dtflow-0.4.0 → dtflow-0.4.1}/tests/test_lineage.py +0 -0
  33. {dtflow-0.4.0 → dtflow-0.4.1}/tests/test_pipeline.py +0 -0
  34. {dtflow-0.4.0 → dtflow-0.4.1}/tests/test_tokenizers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dtflow
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
5
5
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
6
6
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -42,7 +42,7 @@ from .tokenizers import (
42
42
  token_stats,
43
43
  )
44
44
 
45
- __version__ = "0.4.0"
45
+ __version__ = "0.4.1"
46
46
 
47
47
  __all__ = [
48
48
  # core
@@ -796,6 +796,17 @@ def _generate_default_transform(field_names: List[str]) -> str:
796
796
  return "\n".join(lines) if lines else " # 在这里定义输出字段"
797
797
 
798
798
 
799
+ def _unwrap(obj: Any) -> Any:
800
+ """递归将 DictWrapper 转换为普通 dict"""
801
+ if hasattr(obj, "to_dict"):
802
+ return _unwrap(obj.to_dict())
803
+ if isinstance(obj, dict):
804
+ return {k: _unwrap(v) for k, v in obj.items()}
805
+ if isinstance(obj, list):
806
+ return [_unwrap(v) for v in obj]
807
+ return obj
808
+
809
+
799
810
  def _execute_transform(
800
811
  input_path: Path,
801
812
  config_path: Path,
@@ -829,7 +840,8 @@ def _execute_transform(
829
840
  try:
830
841
  # 包装转换函数以支持属性访问(配置文件中定义的 Item 类)
831
842
  def wrapped_transform(item):
832
- return transform_func(DictWrapper(item))
843
+ result = transform_func(DictWrapper(item))
844
+ return _unwrap(result)
833
845
 
834
846
  st = load_stream(str(input_path))
835
847
  if num:
@@ -926,7 +938,8 @@ def _execute_preset_transform(
926
938
  try:
927
939
  # 包装转换函数以支持属性访问
928
940
  def wrapped_transform(item):
929
- return transform_func(DictWrapper(item))
941
+ result = transform_func(DictWrapper(item))
942
+ return _unwrap(result)
930
943
 
931
944
  st = load_stream(str(input_path))
932
945
  if num:
@@ -84,6 +84,8 @@ class StreamingTransformer:
84
84
  self._source_path = source_path
85
85
  self._total = total
86
86
  self._operations: List[Dict[str, Any]] = []
87
+ self._error_count = 0
88
+ self._first_error: Optional[str] = None
87
89
 
88
90
  @classmethod
89
91
  def load_stream(cls, filepath: str, batch_size: int = 10000) -> "StreamingTransformer":
@@ -194,17 +196,20 @@ class StreamingTransformer:
194
196
  Returns:
195
197
  新的 StreamingTransformer(惰性,不立即执行)
196
198
  """
199
+ # transform 是 1:1 转换,保留 total
200
+ new_st = StreamingTransformer(iter([]), self._source_path, total=self._total)
201
+ new_st._operations = self._operations + [{"type": "transform", "func": func}]
197
202
 
198
203
  def transformed_iterator():
199
204
  for item in self._iterator:
200
205
  try:
201
206
  yield func(item)
202
- except Exception:
203
- pass # 跳过错误
207
+ except Exception as e:
208
+ new_st._error_count += 1
209
+ if new_st._first_error is None:
210
+ new_st._first_error = f"{type(e).__name__}: {e}"
204
211
 
205
- # transform 是 1:1 转换,保留 total
206
- new_st = StreamingTransformer(transformed_iterator(), self._source_path, total=self._total)
207
- new_st._operations = self._operations + [{"type": "transform", "func": func}]
212
+ new_st._iterator = transformed_iterator()
208
213
  return new_st
209
214
 
210
215
  def head(self, n: int) -> "StreamingTransformer":
@@ -299,16 +304,21 @@ class StreamingTransformer:
299
304
  ext = path.suffix.lower()
300
305
 
301
306
  if ext == ".jsonl":
302
- return self._save_jsonl(filepath, show_progress)
307
+ count = self._save_jsonl(filepath, show_progress)
303
308
  elif ext == ".csv":
304
- return self._save_batched(filepath, "csv", batch_size, show_progress)
309
+ count = self._save_batched(filepath, "csv", batch_size, show_progress)
305
310
  elif ext == ".parquet":
306
- return self._save_batched(filepath, "parquet", batch_size, show_progress)
311
+ count = self._save_batched(filepath, "parquet", batch_size, show_progress)
307
312
  elif ext in (".arrow", ".feather"):
308
- return self._save_batched(filepath, "arrow", batch_size, show_progress)
313
+ count = self._save_batched(filepath, "arrow", batch_size, show_progress)
309
314
  else:
310
- # 默认 JSONL
311
- return self._save_jsonl(filepath, show_progress)
315
+ count = self._save_jsonl(filepath, show_progress)
316
+
317
+ # 打印错误摘要
318
+ if self._error_count > 0:
319
+ print(f"⚠️ 跳过 {self._error_count} 条错误记录: {self._first_error}")
320
+
321
+ return count
312
322
 
313
323
  def _save_jsonl(self, filepath: str, show_progress: bool) -> int:
314
324
  """JSONL 逐行流式保存(使用 orjson)"""
@@ -279,3 +279,22 @@ class TestEdgeCases:
279
279
  items2 = st.collect()
280
280
  assert len(items2) == 0
281
281
  os.unlink(temp_jsonl)
282
+
283
+ def test_transform_error_tracking(self, temp_jsonl, tmp_path):
284
+ """测试转换错误跟踪"""
285
+ st = load_stream(temp_jsonl)
286
+
287
+ def bad_transform(x):
288
+ if x["id"] % 10 == 5: # 5, 15, 25, ... 共 10 条会出错
289
+ raise KeyError("missing_key")
290
+ return {"new_id": x["id"]}
291
+
292
+ transformed = st.transform(bad_transform)
293
+ output_path = tmp_path / "output.jsonl"
294
+ count = transformed.save(str(output_path), show_progress=False)
295
+
296
+ # 验证结果
297
+ assert count == 90 # 100 - 10 = 90
298
+ assert transformed._error_count == 10
299
+ assert "KeyError" in transformed._first_error
300
+ os.unlink(temp_jsonl)
@@ -610,5 +610,54 @@ class TestConcat:
610
610
  assert len(dt2) == 1
611
611
 
612
612
 
613
+ class TestUnwrap:
614
+ """测试 _unwrap 函数(DictWrapper 转 dict)"""
615
+
616
+ def test_unwrap_simple(self):
617
+ """测试简单 DictWrapper 转换"""
618
+ from dtflow.cli.commands import _unwrap
619
+
620
+ wrapper = DictWrapper({"a": 1, "b": "text"})
621
+ result = _unwrap(wrapper)
622
+
623
+ assert result == {"a": 1, "b": "text"}
624
+ assert type(result) is dict
625
+
626
+ def test_unwrap_nested(self):
627
+ """测试嵌套 DictWrapper 转换"""
628
+ from dtflow.cli.commands import _unwrap
629
+
630
+ data = {"outer": {"inner": {"value": 123}}}
631
+ wrapper = DictWrapper(data)
632
+ # 访问嵌套会产生新的 DictWrapper
633
+ nested_wrapper = wrapper.outer.inner
634
+
635
+ result = _unwrap({"data": nested_wrapper, "list": [wrapper.outer]})
636
+
637
+ assert result == {"data": {"value": 123}, "list": [{"inner": {"value": 123}}]}
638
+ assert type(result["data"]) is dict
639
+ assert type(result["list"][0]) is dict
640
+
641
+ def test_unwrap_in_list(self):
642
+ """测试列表中的 DictWrapper 转换"""
643
+ from dtflow.cli.commands import _unwrap
644
+
645
+ wrapper = DictWrapper({"x": 1})
646
+ result = _unwrap([wrapper, {"y": 2}, wrapper])
647
+
648
+ assert result == [{"x": 1}, {"y": 2}, {"x": 1}]
649
+ assert all(type(item) is dict for item in result)
650
+
651
+ def test_unwrap_plain_dict(self):
652
+ """测试普通 dict 不受影响"""
653
+ from dtflow.cli.commands import _unwrap
654
+
655
+ data = {"a": 1, "nested": {"b": 2}}
656
+ result = _unwrap(data)
657
+
658
+ assert result == data
659
+ assert type(result) is dict
660
+
661
+
613
662
  if __name__ == "__main__":
614
663
  pytest.main([__file__, "-v"])
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes