dtflow 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +7 -7
- dtflow/converters.py +17 -13
- dtflow/presets.py +14 -15
- dtflow/utils/__init__.py +3 -0
- dtflow/utils/helpers.py +30 -0
- {dtflow-0.5.2.dist-info → dtflow-0.5.3.dist-info}/METADATA +1 -1
- {dtflow-0.5.2.dist-info → dtflow-0.5.3.dist-info}/RECORD +9 -8
- {dtflow-0.5.2.dist-info → dtflow-0.5.3.dist-info}/WHEEL +0 -0
- {dtflow-0.5.2.dist-info → dtflow-0.5.3.dist-info}/entry_points.txt +0 -0
dtflow/__init__.py
CHANGED
|
@@ -26,6 +26,12 @@ from .converters import ( # LLaMA-Factory 扩展; ms-swift
|
|
|
26
26
|
to_swift_vlm,
|
|
27
27
|
)
|
|
28
28
|
from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
|
|
29
|
+
from .framework import (
|
|
30
|
+
CompatibilityResult,
|
|
31
|
+
check_compatibility,
|
|
32
|
+
detect_format,
|
|
33
|
+
export_for,
|
|
34
|
+
)
|
|
29
35
|
from .presets import get_preset, list_presets
|
|
30
36
|
from .schema import (
|
|
31
37
|
Field,
|
|
@@ -38,12 +44,6 @@ from .schema import (
|
|
|
38
44
|
sharegpt_schema,
|
|
39
45
|
validate_data,
|
|
40
46
|
)
|
|
41
|
-
from .framework import (
|
|
42
|
-
CompatibilityResult,
|
|
43
|
-
check_compatibility,
|
|
44
|
-
detect_format,
|
|
45
|
-
export_for,
|
|
46
|
-
)
|
|
47
47
|
from .storage import load_data, sample_file, save_data
|
|
48
48
|
from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
|
|
49
49
|
from .tokenizers import (
|
|
@@ -60,7 +60,7 @@ from .tokenizers import (
|
|
|
60
60
|
token_stats,
|
|
61
61
|
)
|
|
62
62
|
|
|
63
|
-
__version__ = "0.5.2"
|
|
63
|
+
__version__ = "0.5.3"
|
|
64
64
|
|
|
65
65
|
__all__ = [
|
|
66
66
|
# core
|
dtflow/converters.py
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
提供与 HuggingFace datasets 等常用格式的互转功能。
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
from typing import Any, Callable, Dict, List, Optional
|
|
7
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def to_hf_dataset(data: List[Dict[str, Any]]):
|
|
@@ -143,14 +143,16 @@ def to_openai_batch(
|
|
|
143
143
|
>>> batch_input = dt.to(to_openai_batch(model="gpt-4o"))
|
|
144
144
|
"""
|
|
145
145
|
|
|
146
|
-
|
|
146
|
+
counter = {"idx": 0}
|
|
147
|
+
|
|
148
|
+
def transform(item) -> dict:
|
|
147
149
|
messages = item.get(messages_field, []) if hasattr(item, "get") else item[messages_field]
|
|
148
150
|
|
|
149
151
|
if custom_id_field:
|
|
150
152
|
custom_id = item.get(custom_id_field) if hasattr(item, "get") else item[custom_id_field]
|
|
151
153
|
else:
|
|
152
|
-
custom_id = f"request-{idx
|
|
153
|
-
idx
|
|
154
|
+
custom_id = f"request-{counter['idx']}"
|
|
155
|
+
counter["idx"] += 1
|
|
154
156
|
|
|
155
157
|
return {
|
|
156
158
|
"custom_id": str(custom_id),
|
|
@@ -196,7 +198,7 @@ def to_llama_factory(
|
|
|
196
198
|
"""
|
|
197
199
|
|
|
198
200
|
def transform(item) -> dict:
|
|
199
|
-
get = lambda f:
|
|
201
|
+
get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
|
|
200
202
|
|
|
201
203
|
result = {
|
|
202
204
|
"instruction": get(instruction_field),
|
|
@@ -248,7 +250,7 @@ def to_axolotl(
|
|
|
248
250
|
conversations = (
|
|
249
251
|
item.get(conversations_field, [])
|
|
250
252
|
if hasattr(item, "get")
|
|
251
|
-
else item
|
|
253
|
+
else getattr(item, conversations_field, [])
|
|
252
254
|
)
|
|
253
255
|
|
|
254
256
|
# 如果已经是正确格式,直接返回
|
|
@@ -257,7 +259,9 @@ def to_axolotl(
|
|
|
257
259
|
return {"conversations": conversations}
|
|
258
260
|
|
|
259
261
|
# 尝试从 messages 格式转换
|
|
260
|
-
messages =
|
|
262
|
+
messages = (
|
|
263
|
+
item.get("messages", []) if hasattr(item, "get") else getattr(item, "messages", [])
|
|
264
|
+
)
|
|
261
265
|
if messages:
|
|
262
266
|
role_map = {"user": "human", "assistant": "gpt", "system": "system"}
|
|
263
267
|
conversations = [
|
|
@@ -312,7 +316,7 @@ def to_llama_factory_sharegpt(
|
|
|
312
316
|
}
|
|
313
317
|
|
|
314
318
|
def transform(item) -> dict:
|
|
315
|
-
get = lambda f:
|
|
319
|
+
get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
|
|
316
320
|
messages = get(messages_field) or []
|
|
317
321
|
|
|
318
322
|
conversations = []
|
|
@@ -385,7 +389,7 @@ def to_llama_factory_vlm(
|
|
|
385
389
|
"""
|
|
386
390
|
|
|
387
391
|
def transform(item) -> dict:
|
|
388
|
-
get = lambda f: item.get(f) if hasattr(item, "get") else item
|
|
392
|
+
get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
389
393
|
messages = get(messages_field) or []
|
|
390
394
|
|
|
391
395
|
instruction = ""
|
|
@@ -467,7 +471,7 @@ def to_llama_factory_vlm_sharegpt(
|
|
|
467
471
|
role_map = {"user": "human", "assistant": "gpt", "system": "system"}
|
|
468
472
|
|
|
469
473
|
def transform(item) -> dict:
|
|
470
|
-
get = lambda f: item.get(f) if hasattr(item, "get") else item
|
|
474
|
+
get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
471
475
|
messages = get(messages_field) or []
|
|
472
476
|
|
|
473
477
|
conversations = []
|
|
@@ -541,7 +545,7 @@ def to_swift_messages(
|
|
|
541
545
|
"""
|
|
542
546
|
|
|
543
547
|
def transform(item) -> dict:
|
|
544
|
-
get = lambda f: item.get(f) if hasattr(item, "get") else item
|
|
548
|
+
get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
545
549
|
messages = get(messages_field) or []
|
|
546
550
|
|
|
547
551
|
# 复制 messages,避免修改原数据
|
|
@@ -600,7 +604,7 @@ def to_swift_query_response(
|
|
|
600
604
|
"""
|
|
601
605
|
|
|
602
606
|
def transform(item) -> dict:
|
|
603
|
-
get = lambda f: item.get(f) if hasattr(item, "get") else item
|
|
607
|
+
get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
604
608
|
|
|
605
609
|
query = get(query_field)
|
|
606
610
|
response = get(response_field)
|
|
@@ -693,7 +697,7 @@ def to_swift_vlm(
|
|
|
693
697
|
"""
|
|
694
698
|
|
|
695
699
|
def transform(item) -> dict:
|
|
696
|
-
get = lambda f: item.get(f) if hasattr(item, "get") else item
|
|
700
|
+
get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
697
701
|
messages = get(messages_field) or []
|
|
698
702
|
|
|
699
703
|
result_messages = []
|
dtflow/presets.py
CHANGED
|
@@ -6,6 +6,8 @@
|
|
|
6
6
|
|
|
7
7
|
from typing import Any, Callable
|
|
8
8
|
|
|
9
|
+
from dtflow.utils.helpers import get_field_value
|
|
10
|
+
|
|
9
11
|
|
|
10
12
|
def openai_chat(
|
|
11
13
|
user_field: str = "q", assistant_field: str = "a", system_prompt: str = None
|
|
@@ -33,8 +35,8 @@ def openai_chat(
|
|
|
33
35
|
if system_prompt:
|
|
34
36
|
messages.append({"role": "system", "content": system_prompt})
|
|
35
37
|
|
|
36
|
-
user_content =
|
|
37
|
-
assistant_content =
|
|
38
|
+
user_content = get_field_value(item, user_field)
|
|
39
|
+
assistant_content = get_field_value(item, assistant_field)
|
|
38
40
|
|
|
39
41
|
messages.append({"role": "user", "content": user_content})
|
|
40
42
|
messages.append({"role": "assistant", "content": assistant_content})
|
|
@@ -60,10 +62,9 @@ def alpaca(
|
|
|
60
62
|
|
|
61
63
|
def transform(item: Any) -> dict:
|
|
62
64
|
return {
|
|
63
|
-
"instruction":
|
|
64
|
-
|
|
65
|
-
"
|
|
66
|
-
"output": getattr(item, output_field, None) or item.get(output_field, ""),
|
|
65
|
+
"instruction": get_field_value(item, instruction_field),
|
|
66
|
+
"input": get_field_value(item, input_field),
|
|
67
|
+
"output": get_field_value(item, output_field),
|
|
67
68
|
}
|
|
68
69
|
|
|
69
70
|
return transform
|
|
@@ -84,9 +85,7 @@ def sharegpt(conversations_field: str = "conversations", role_mapping: dict = No
|
|
|
84
85
|
role_mapping = role_mapping or {"user": "human", "assistant": "gpt"}
|
|
85
86
|
|
|
86
87
|
def transform(item: Any) -> dict:
|
|
87
|
-
conversations =
|
|
88
|
-
conversations_field, []
|
|
89
|
-
)
|
|
88
|
+
conversations = get_field_value(item, conversations_field, [])
|
|
90
89
|
|
|
91
90
|
# 如果已经是对话格式,直接返回
|
|
92
91
|
if conversations:
|
|
@@ -102,7 +101,7 @@ def sharegpt(conversations_field: str = "conversations", role_mapping: dict = No
|
|
|
102
101
|
("answer", "gpt"),
|
|
103
102
|
("output", "gpt"),
|
|
104
103
|
]:
|
|
105
|
-
value =
|
|
104
|
+
value = get_field_value(item, field, None)
|
|
106
105
|
if value:
|
|
107
106
|
result.append({"from": role, "value": value})
|
|
108
107
|
|
|
@@ -127,9 +126,9 @@ def dpo_pair(
|
|
|
127
126
|
|
|
128
127
|
def transform(item: Any) -> dict:
|
|
129
128
|
return {
|
|
130
|
-
"prompt":
|
|
131
|
-
"chosen":
|
|
132
|
-
"rejected":
|
|
129
|
+
"prompt": get_field_value(item, prompt_field),
|
|
130
|
+
"chosen": get_field_value(item, chosen_field),
|
|
131
|
+
"rejected": get_field_value(item, rejected_field),
|
|
133
132
|
}
|
|
134
133
|
|
|
135
134
|
return transform
|
|
@@ -148,8 +147,8 @@ def simple_qa(question_field: str = "q", answer_field: str = "a") -> Callable:
|
|
|
148
147
|
|
|
149
148
|
def transform(item: Any) -> dict:
|
|
150
149
|
return {
|
|
151
|
-
"question":
|
|
152
|
-
"answer":
|
|
150
|
+
"question": get_field_value(item, question_field),
|
|
151
|
+
"answer": get_field_value(item, answer_field),
|
|
153
152
|
}
|
|
154
153
|
|
|
155
154
|
return transform
|
dtflow/utils/__init__.py
CHANGED
|
@@ -9,6 +9,7 @@ from .field_path import (
|
|
|
9
9
|
get_field_with_spec,
|
|
10
10
|
parse_field_spec,
|
|
11
11
|
)
|
|
12
|
+
from .helpers import get_field_value
|
|
12
13
|
|
|
13
14
|
__all__ = [
|
|
14
15
|
"display_data",
|
|
@@ -20,4 +21,6 @@ __all__ = [
|
|
|
20
21
|
"extract",
|
|
21
22
|
"extract_with_spec",
|
|
22
23
|
"ExpandMode",
|
|
24
|
+
# helpers
|
|
25
|
+
"get_field_value",
|
|
23
26
|
]
|
dtflow/utils/helpers.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""公共辅助函数"""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_field_value(item: Any, field: str, default: Any = "") -> Any:
|
|
7
|
+
"""
|
|
8
|
+
获取字段值,支持 DictWrapper 和普通 dict。
|
|
9
|
+
|
|
10
|
+
优先尝试 dict.get(),如果没有 get 方法则使用 getattr()。
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
item: 数据对象(dict 或 DictWrapper)
|
|
14
|
+
field: 字段名
|
|
15
|
+
default: 默认值
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
字段值或默认值
|
|
19
|
+
|
|
20
|
+
Examples:
|
|
21
|
+
>>> get_field_value({"name": "test"}, "name")
|
|
22
|
+
'test'
|
|
23
|
+
>>> get_field_value({"name": ""}, "name", "default")
|
|
24
|
+
'default'
|
|
25
|
+
"""
|
|
26
|
+
if hasattr(item, "get"):
|
|
27
|
+
value = item.get(field, default)
|
|
28
|
+
else:
|
|
29
|
+
value = getattr(item, field, default)
|
|
30
|
+
return value if value else default
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dtflow
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
|
|
5
5
|
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
|
|
6
6
|
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
dtflow/__init__.py,sha256=
|
|
1
|
+
dtflow/__init__.py,sha256=RJql_KmINJNbq2FEqU7jD9Z0c5ETkxQJPvUUPKiFt74,3031
|
|
2
2
|
dtflow/__main__.py,sha256=ySpqvEn7k-vsrYFPx-8O6p-yx_24KccgnOSPd2XybhM,12572
|
|
3
|
-
dtflow/converters.py,sha256=
|
|
3
|
+
dtflow/converters.py,sha256=yXafSDeRC7DB2MMj8fD1NWjAG8HoAGh5Ay2A5Z7s6xA,22206
|
|
4
4
|
dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
|
|
5
5
|
dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
|
|
6
6
|
dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
|
|
7
7
|
dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
|
|
8
|
-
dtflow/presets.py,sha256=
|
|
8
|
+
dtflow/presets.py,sha256=qa8WQJhbNMuGxqqgA9BFadEBwDB9s0zWNxxhzF3q1K8,4701
|
|
9
9
|
dtflow/schema.py,sha256=IFcij22_UFKcgKT1YWwRg2QJO0vcAvCb1arZmsGByts,16824
|
|
10
10
|
dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
|
|
11
11
|
dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
|
|
@@ -27,10 +27,11 @@ dtflow/mcp/docs.py,sha256=DI2Vf-eFo4chRP_bDLsv4Uc3kJt8_1emz8N-NBSVirM,8834
|
|
|
27
27
|
dtflow/mcp/server.py,sha256=Nf0UlqDGhV55ndGuEglfr7VRjDWAC_9rRsNhdr0-ssM,4275
|
|
28
28
|
dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,362
|
|
29
29
|
dtflow/storage/io.py,sha256=ZH2aSE-S89gpy3z4oTqhcqWf4u10OdkDoyul7o_YBDI,23374
|
|
30
|
-
dtflow/utils/__init__.py,sha256=
|
|
30
|
+
dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
|
|
31
31
|
dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
|
|
32
32
|
dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
|
|
33
|
-
dtflow
|
|
34
|
-
dtflow-0.5.
|
|
35
|
-
dtflow-0.5.
|
|
36
|
-
dtflow-0.5.
|
|
33
|
+
dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
|
|
34
|
+
dtflow-0.5.3.dist-info/METADATA,sha256=5joXihL8gkmnNEaUTqRpe0_U-y8osaIfdX0v91WVtK8,22544
|
|
35
|
+
dtflow-0.5.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
36
|
+
dtflow-0.5.3.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
37
|
+
dtflow-0.5.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|