dtflow 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/__init__.py CHANGED
@@ -26,6 +26,12 @@ from .converters import ( # LLaMA-Factory 扩展; ms-swift
26
26
  to_swift_vlm,
27
27
  )
28
28
  from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
29
+ from .framework import (
30
+ CompatibilityResult,
31
+ check_compatibility,
32
+ detect_format,
33
+ export_for,
34
+ )
29
35
  from .presets import get_preset, list_presets
30
36
  from .schema import (
31
37
  Field,
@@ -38,12 +44,6 @@ from .schema import (
38
44
  sharegpt_schema,
39
45
  validate_data,
40
46
  )
41
- from .framework import (
42
- CompatibilityResult,
43
- check_compatibility,
44
- detect_format,
45
- export_for,
46
- )
47
47
  from .storage import load_data, sample_file, save_data
48
48
  from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
49
49
  from .tokenizers import (
@@ -60,7 +60,7 @@ from .tokenizers import (
60
60
  token_stats,
61
61
  )
62
62
 
63
- __version__ = "0.5.2"
63
+ __version__ = "0.5.3"
64
64
 
65
65
  __all__ = [
66
66
  # core
dtflow/converters.py CHANGED
@@ -4,7 +4,7 @@
4
4
  提供与 HuggingFace datasets 等常用格式的互转功能。
5
5
  """
6
6
 
7
- from typing import Any, Callable, Dict, List, Optional, Union
7
+ from typing import Any, Callable, Dict, List, Optional
8
8
 
9
9
 
10
10
  def to_hf_dataset(data: List[Dict[str, Any]]):
@@ -143,14 +143,16 @@ def to_openai_batch(
143
143
  >>> batch_input = dt.to(to_openai_batch(model="gpt-4o"))
144
144
  """
145
145
 
146
- def transform(item, idx=[0]) -> dict:
146
+ counter = {"idx": 0}
147
+
148
+ def transform(item) -> dict:
147
149
  messages = item.get(messages_field, []) if hasattr(item, "get") else item[messages_field]
148
150
 
149
151
  if custom_id_field:
150
152
  custom_id = item.get(custom_id_field) if hasattr(item, "get") else item[custom_id_field]
151
153
  else:
152
- custom_id = f"request-{idx[0]}"
153
- idx[0] += 1
154
+ custom_id = f"request-{counter['idx']}"
155
+ counter["idx"] += 1
154
156
 
155
157
  return {
156
158
  "custom_id": str(custom_id),
@@ -196,7 +198,7 @@ def to_llama_factory(
196
198
  """
197
199
 
198
200
  def transform(item) -> dict:
199
- get = lambda f: (item.get(f, "") if hasattr(item, "get") else item.get(f, ""))
201
+ get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
200
202
 
201
203
  result = {
202
204
  "instruction": get(instruction_field),
@@ -248,7 +250,7 @@ def to_axolotl(
248
250
  conversations = (
249
251
  item.get(conversations_field, [])
250
252
  if hasattr(item, "get")
251
- else item.get(conversations_field, [])
253
+ else getattr(item, conversations_field, [])
252
254
  )
253
255
 
254
256
  # 如果已经是正确格式,直接返回
@@ -257,7 +259,9 @@ def to_axolotl(
257
259
  return {"conversations": conversations}
258
260
 
259
261
  # 尝试从 messages 格式转换
260
- messages = item.get("messages", []) if hasattr(item, "get") else item.get("messages", [])
262
+ messages = (
263
+ item.get("messages", []) if hasattr(item, "get") else getattr(item, "messages", [])
264
+ )
261
265
  if messages:
262
266
  role_map = {"user": "human", "assistant": "gpt", "system": "system"}
263
267
  conversations = [
@@ -312,7 +316,7 @@ def to_llama_factory_sharegpt(
312
316
  }
313
317
 
314
318
  def transform(item) -> dict:
315
- get = lambda f: (item.get(f, "") if hasattr(item, "get") else item.get(f, ""))
319
+ get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
316
320
  messages = get(messages_field) or []
317
321
 
318
322
  conversations = []
@@ -385,7 +389,7 @@ def to_llama_factory_vlm(
385
389
  """
386
390
 
387
391
  def transform(item) -> dict:
388
- get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
392
+ get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
389
393
  messages = get(messages_field) or []
390
394
 
391
395
  instruction = ""
@@ -467,7 +471,7 @@ def to_llama_factory_vlm_sharegpt(
467
471
  role_map = {"user": "human", "assistant": "gpt", "system": "system"}
468
472
 
469
473
  def transform(item) -> dict:
470
- get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
474
+ get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
471
475
  messages = get(messages_field) or []
472
476
 
473
477
  conversations = []
@@ -541,7 +545,7 @@ def to_swift_messages(
541
545
  """
542
546
 
543
547
  def transform(item) -> dict:
544
- get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
548
+ get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
545
549
  messages = get(messages_field) or []
546
550
 
547
551
  # 复制 messages,避免修改原数据
@@ -600,7 +604,7 @@ def to_swift_query_response(
600
604
  """
601
605
 
602
606
  def transform(item) -> dict:
603
- get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
607
+ get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
604
608
 
605
609
  query = get(query_field)
606
610
  response = get(response_field)
@@ -693,7 +697,7 @@ def to_swift_vlm(
693
697
  """
694
698
 
695
699
  def transform(item) -> dict:
696
- get = lambda f: item.get(f) if hasattr(item, "get") else item.get(f)
700
+ get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
697
701
  messages = get(messages_field) or []
698
702
 
699
703
  result_messages = []
dtflow/presets.py CHANGED
@@ -6,6 +6,8 @@
6
6
 
7
7
  from typing import Any, Callable
8
8
 
9
+ from dtflow.utils.helpers import get_field_value
10
+
9
11
 
10
12
  def openai_chat(
11
13
  user_field: str = "q", assistant_field: str = "a", system_prompt: str = None
@@ -33,8 +35,8 @@ def openai_chat(
33
35
  if system_prompt:
34
36
  messages.append({"role": "system", "content": system_prompt})
35
37
 
36
- user_content = getattr(item, user_field, None) or item.get(user_field, "")
37
- assistant_content = getattr(item, assistant_field, None) or item.get(assistant_field, "")
38
+ user_content = get_field_value(item, user_field)
39
+ assistant_content = get_field_value(item, assistant_field)
38
40
 
39
41
  messages.append({"role": "user", "content": user_content})
40
42
  messages.append({"role": "assistant", "content": assistant_content})
@@ -60,10 +62,9 @@ def alpaca(
60
62
 
61
63
  def transform(item: Any) -> dict:
62
64
  return {
63
- "instruction": getattr(item, instruction_field, None)
64
- or item.get(instruction_field, ""),
65
- "input": getattr(item, input_field, None) or item.get(input_field, ""),
66
- "output": getattr(item, output_field, None) or item.get(output_field, ""),
65
+ "instruction": get_field_value(item, instruction_field),
66
+ "input": get_field_value(item, input_field),
67
+ "output": get_field_value(item, output_field),
67
68
  }
68
69
 
69
70
  return transform
@@ -84,9 +85,7 @@ def sharegpt(conversations_field: str = "conversations", role_mapping: dict = No
84
85
  role_mapping = role_mapping or {"user": "human", "assistant": "gpt"}
85
86
 
86
87
  def transform(item: Any) -> dict:
87
- conversations = getattr(item, conversations_field, None) or item.get(
88
- conversations_field, []
89
- )
88
+ conversations = get_field_value(item, conversations_field, [])
90
89
 
91
90
  # 如果已经是对话格式,直接返回
92
91
  if conversations:
@@ -102,7 +101,7 @@ def sharegpt(conversations_field: str = "conversations", role_mapping: dict = No
102
101
  ("answer", "gpt"),
103
102
  ("output", "gpt"),
104
103
  ]:
105
- value = getattr(item, field, None) or item.get(field, None)
104
+ value = get_field_value(item, field, None)
106
105
  if value:
107
106
  result.append({"from": role, "value": value})
108
107
 
@@ -127,9 +126,9 @@ def dpo_pair(
127
126
 
128
127
  def transform(item: Any) -> dict:
129
128
  return {
130
- "prompt": getattr(item, prompt_field, None) or item.get(prompt_field, ""),
131
- "chosen": getattr(item, chosen_field, None) or item.get(chosen_field, ""),
132
- "rejected": getattr(item, rejected_field, None) or item.get(rejected_field, ""),
129
+ "prompt": get_field_value(item, prompt_field),
130
+ "chosen": get_field_value(item, chosen_field),
131
+ "rejected": get_field_value(item, rejected_field),
133
132
  }
134
133
 
135
134
  return transform
@@ -148,8 +147,8 @@ def simple_qa(question_field: str = "q", answer_field: str = "a") -> Callable:
148
147
 
149
148
  def transform(item: Any) -> dict:
150
149
  return {
151
- "question": getattr(item, question_field, None) or item.get(question_field, ""),
152
- "answer": getattr(item, answer_field, None) or item.get(answer_field, ""),
150
+ "question": get_field_value(item, question_field),
151
+ "answer": get_field_value(item, answer_field),
153
152
  }
154
153
 
155
154
  return transform
dtflow/utils/__init__.py CHANGED
@@ -9,6 +9,7 @@ from .field_path import (
9
9
  get_field_with_spec,
10
10
  parse_field_spec,
11
11
  )
12
+ from .helpers import get_field_value
12
13
 
13
14
  __all__ = [
14
15
  "display_data",
@@ -20,4 +21,6 @@ __all__ = [
20
21
  "extract",
21
22
  "extract_with_spec",
22
23
  "ExpandMode",
24
+ # helpers
25
+ "get_field_value",
23
26
  ]
@@ -0,0 +1,30 @@
1
+ """公共辅助函数"""
2
+
3
+ from typing import Any
4
+
5
+
6
+ def get_field_value(item: Any, field: str, default: Any = "") -> Any:
7
+ """
8
+ 获取字段值,支持 DictWrapper 和普通 dict。
9
+
10
+ 优先尝试 dict.get(),如果没有 get 方法则使用 getattr()。
11
+
12
+ Args:
13
+ item: 数据对象(dict 或 DictWrapper)
14
+ field: 字段名
15
+ default: 默认值
16
+
17
+ Returns:
18
+ 字段值或默认值
19
+
20
+ Examples:
21
+ >>> get_field_value({"name": "test"}, "name")
22
+ 'test'
23
+ >>> get_field_value({"name": ""}, "name", "default")
24
+ 'default'
25
+ """
26
+ if hasattr(item, "get"):
27
+ value = item.get(field, default)
28
+ else:
29
+ value = getattr(item, field, default)
30
+ return value if value else default
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dtflow
3
- Version: 0.5.2
3
+ Version: 0.5.3
4
4
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
5
5
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
6
6
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -1,11 +1,11 @@
1
- dtflow/__init__.py,sha256=PTqh_6-F6eEwg1RxQ0ueP6CYnZauMuqYhlZe2BJphr0,3031
1
+ dtflow/__init__.py,sha256=RJql_KmINJNbq2FEqU7jD9Z0c5ETkxQJPvUUPKiFt74,3031
2
2
  dtflow/__main__.py,sha256=ySpqvEn7k-vsrYFPx-8O6p-yx_24KccgnOSPd2XybhM,12572
3
- dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
3
+ dtflow/converters.py,sha256=yXafSDeRC7DB2MMj8fD1NWjAG8HoAGh5Ay2A5Z7s6xA,22206
4
4
  dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
5
5
  dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
6
6
  dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
7
7
  dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
8
- dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
8
+ dtflow/presets.py,sha256=qa8WQJhbNMuGxqqgA9BFadEBwDB9s0zWNxxhzF3q1K8,4701
9
9
  dtflow/schema.py,sha256=IFcij22_UFKcgKT1YWwRg2QJO0vcAvCb1arZmsGByts,16824
10
10
  dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
11
11
  dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
@@ -27,10 +27,11 @@ dtflow/mcp/docs.py,sha256=DI2Vf-eFo4chRP_bDLsv4Uc3kJt8_1emz8N-NBSVirM,8834
27
27
  dtflow/mcp/server.py,sha256=Nf0UlqDGhV55ndGuEglfr7VRjDWAC_9rRsNhdr0-ssM,4275
28
28
  dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,362
29
29
  dtflow/storage/io.py,sha256=ZH2aSE-S89gpy3z4oTqhcqWf4u10OdkDoyul7o_YBDI,23374
30
- dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
30
+ dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
31
31
  dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
32
32
  dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
33
- dtflow-0.5.2.dist-info/METADATA,sha256=RlpGaySrAIgTviom_Wyn6o2LWzQQVihff12Jpazy10o,22544
34
- dtflow-0.5.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
35
- dtflow-0.5.2.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
36
- dtflow-0.5.2.dist-info/RECORD,,
33
+ dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
34
+ dtflow-0.5.3.dist-info/METADATA,sha256=5joXihL8gkmnNEaUTqRpe0_U-y8osaIfdX0v91WVtK8,22544
35
+ dtflow-0.5.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
36
+ dtflow-0.5.3.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
37
+ dtflow-0.5.3.dist-info/RECORD,,
File without changes