dtflow-0.5.3-py3-none-any.whl → dtflow-0.5.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +1 -1
- dtflow/cli/common.py +18 -3
- dtflow/cli/sample.py +9 -5
- dtflow/converters.py +25 -13
- {dtflow-0.5.3.dist-info → dtflow-0.5.4.dist-info}/METADATA +1 -1
- {dtflow-0.5.3.dist-info → dtflow-0.5.4.dist-info}/RECORD +8 -8
- {dtflow-0.5.3.dist-info → dtflow-0.5.4.dist-info}/WHEEL +0 -0
- {dtflow-0.5.3.dist-info → dtflow-0.5.4.dist-info}/entry_points.txt +0 -0
dtflow/__init__.py
CHANGED
dtflow/cli/common.py
CHANGED
|
@@ -100,8 +100,6 @@ def _format_nested(
|
|
|
100
100
|
└─ 最后一项
|
|
101
101
|
"""
|
|
102
102
|
lines = []
|
|
103
|
-
branch = "└─ " if is_last else "├─ "
|
|
104
|
-
cont = " " if is_last else "│ "
|
|
105
103
|
|
|
106
104
|
if isinstance(value, dict):
|
|
107
105
|
items = list(value.items())
|
|
@@ -183,6 +181,7 @@ def _print_samples(
|
|
|
183
181
|
filename: Optional[str] = None,
|
|
184
182
|
total_count: Optional[int] = None,
|
|
185
183
|
fields: Optional[List[str]] = None,
|
|
184
|
+
file_size: Optional[int] = None,
|
|
186
185
|
) -> None:
|
|
187
186
|
"""
|
|
188
187
|
打印采样结果。
|
|
@@ -190,8 +189,9 @@ def _print_samples(
|
|
|
190
189
|
Args:
|
|
191
190
|
samples: 采样数据列表
|
|
192
191
|
filename: 文件名(用于显示概览)
|
|
193
|
-
total_count:
|
|
192
|
+
total_count: 文件总行数(用于显示概览),大文件时可能为 None
|
|
194
193
|
fields: 只显示指定字段
|
|
194
|
+
file_size: 文件大小(字节),当 total_count 为 None 时显示
|
|
195
195
|
"""
|
|
196
196
|
if not samples:
|
|
197
197
|
print("没有数据")
|
|
@@ -219,6 +219,8 @@ def _print_samples(
|
|
|
219
219
|
|
|
220
220
|
if total_count is not None:
|
|
221
221
|
info = f"总行数: {total_count:,} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
|
|
222
|
+
elif file_size is not None:
|
|
223
|
+
info = f"文件大小: {_format_file_size(file_size)} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
|
|
222
224
|
else:
|
|
223
225
|
info = f"采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
|
|
224
226
|
|
|
@@ -266,6 +268,10 @@ def _print_samples(
|
|
|
266
268
|
print(
|
|
267
269
|
f" 总行数: {total_count:,} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
|
|
268
270
|
)
|
|
271
|
+
elif file_size is not None:
|
|
272
|
+
print(
|
|
273
|
+
f" 文件大小: {_format_file_size(file_size)} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
|
|
274
|
+
)
|
|
269
275
|
else:
|
|
270
276
|
print(f" 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个")
|
|
271
277
|
print(f" 字段: {', '.join(sorted(all_fields))}")
|
|
@@ -287,6 +293,15 @@ def _parse_field_list(value: Any) -> List[str]:
|
|
|
287
293
|
return [str(value)]
|
|
288
294
|
|
|
289
295
|
|
|
296
|
+
def _format_file_size(size: int) -> str:
|
|
297
|
+
"""格式化文件大小"""
|
|
298
|
+
for unit in ["B", "KB", "MB", "GB"]:
|
|
299
|
+
if size < 1024:
|
|
300
|
+
return f"{size:.1f} {unit}"
|
|
301
|
+
size /= 1024
|
|
302
|
+
return f"{size:.1f} TB"
|
|
303
|
+
|
|
304
|
+
|
|
290
305
|
def _is_empty_value(v: Any) -> bool:
|
|
291
306
|
"""判断值是否为空"""
|
|
292
307
|
if v is None:
|
dtflow/cli/sample.py
CHANGED
|
@@ -99,11 +99,15 @@ def sample(
|
|
|
99
99
|
for item in sampled:
|
|
100
100
|
print(orjson.dumps(item, option=orjson.OPT_INDENT_2).decode("utf-8"))
|
|
101
101
|
else:
|
|
102
|
-
#
|
|
103
|
-
|
|
102
|
+
# 大文件跳过行数统计(50MB 阈值)
|
|
103
|
+
file_size = filepath.stat().st_size
|
|
104
|
+
if file_size < 50 * 1024 * 1024:
|
|
105
|
+
total_count = _get_file_row_count(filepath)
|
|
106
|
+
else:
|
|
107
|
+
total_count = None
|
|
104
108
|
# 解析 fields 参数
|
|
105
109
|
field_list = _parse_field_list(fields) if fields else None
|
|
106
|
-
_print_samples(sampled, filepath.name, total_count, field_list)
|
|
110
|
+
_print_samples(sampled, filepath.name, total_count, field_list, file_size)
|
|
107
111
|
|
|
108
112
|
|
|
109
113
|
def _stratified_sample(
|
|
@@ -196,7 +200,7 @@ def _stratified_sample(
|
|
|
196
200
|
|
|
197
201
|
# 执行各组采样
|
|
198
202
|
result = []
|
|
199
|
-
print(
|
|
203
|
+
print("🔄 执行采样...")
|
|
200
204
|
for key in group_keys:
|
|
201
205
|
group_data = groups[key]
|
|
202
206
|
target = min(sample_counts[key], len(group_data))
|
|
@@ -215,7 +219,7 @@ def _stratified_sample(
|
|
|
215
219
|
result.extend(sampled)
|
|
216
220
|
|
|
217
221
|
# 打印采样结果
|
|
218
|
-
print(
|
|
222
|
+
print("\n📋 采样结果:")
|
|
219
223
|
result_groups: Dict[Any, int] = defaultdict(int)
|
|
220
224
|
for item in result:
|
|
221
225
|
key = item.get(stratify_field, "__null__")
|
dtflow/converters.py
CHANGED
|
@@ -23,8 +23,8 @@ def to_hf_dataset(data: List[Dict[str, Any]]):
|
|
|
23
23
|
"""
|
|
24
24
|
try:
|
|
25
25
|
from datasets import Dataset
|
|
26
|
-
except ImportError:
|
|
27
|
-
raise ImportError("需要安装 datasets: pip install datasets")
|
|
26
|
+
except ImportError as e:
|
|
27
|
+
raise ImportError("需要安装 datasets: pip install datasets") from e
|
|
28
28
|
|
|
29
29
|
return Dataset.from_list(data)
|
|
30
30
|
|
|
@@ -45,9 +45,9 @@ def from_hf_dataset(dataset, split: Optional[str] = None) -> List[Dict[str, Any]
|
|
|
45
45
|
>>> data = from_hf_dataset(my_dataset, split="train")
|
|
46
46
|
"""
|
|
47
47
|
try:
|
|
48
|
-
from datasets import
|
|
49
|
-
except ImportError:
|
|
50
|
-
raise ImportError("需要安装 datasets: pip install datasets")
|
|
48
|
+
from datasets import load_dataset
|
|
49
|
+
except ImportError as e:
|
|
50
|
+
raise ImportError("需要安装 datasets: pip install datasets") from e
|
|
51
51
|
|
|
52
52
|
# 如果是字符串,加载数据集
|
|
53
53
|
if isinstance(dataset, str):
|
|
@@ -198,7 +198,8 @@ def to_llama_factory(
|
|
|
198
198
|
"""
|
|
199
199
|
|
|
200
200
|
def transform(item) -> dict:
|
|
201
|
-
|
|
201
|
+
def get(f):
|
|
202
|
+
return item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
|
|
202
203
|
|
|
203
204
|
result = {
|
|
204
205
|
"instruction": get(instruction_field),
|
|
@@ -316,7 +317,9 @@ def to_llama_factory_sharegpt(
|
|
|
316
317
|
}
|
|
317
318
|
|
|
318
319
|
def transform(item) -> dict:
|
|
319
|
-
|
|
320
|
+
def get(f):
|
|
321
|
+
return item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
|
|
322
|
+
|
|
320
323
|
messages = get(messages_field) or []
|
|
321
324
|
|
|
322
325
|
conversations = []
|
|
@@ -389,7 +392,9 @@ def to_llama_factory_vlm(
|
|
|
389
392
|
"""
|
|
390
393
|
|
|
391
394
|
def transform(item) -> dict:
|
|
392
|
-
|
|
395
|
+
def get(f):
|
|
396
|
+
return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
397
|
+
|
|
393
398
|
messages = get(messages_field) or []
|
|
394
399
|
|
|
395
400
|
instruction = ""
|
|
@@ -471,7 +476,9 @@ def to_llama_factory_vlm_sharegpt(
|
|
|
471
476
|
role_map = {"user": "human", "assistant": "gpt", "system": "system"}
|
|
472
477
|
|
|
473
478
|
def transform(item) -> dict:
|
|
474
|
-
|
|
479
|
+
def get(f):
|
|
480
|
+
return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
481
|
+
|
|
475
482
|
messages = get(messages_field) or []
|
|
476
483
|
|
|
477
484
|
conversations = []
|
|
@@ -545,7 +552,9 @@ def to_swift_messages(
|
|
|
545
552
|
"""
|
|
546
553
|
|
|
547
554
|
def transform(item) -> dict:
|
|
548
|
-
|
|
555
|
+
def get(f):
|
|
556
|
+
return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
557
|
+
|
|
549
558
|
messages = get(messages_field) or []
|
|
550
559
|
|
|
551
560
|
# 复制 messages,避免修改原数据
|
|
@@ -604,7 +613,8 @@ def to_swift_query_response(
|
|
|
604
613
|
"""
|
|
605
614
|
|
|
606
615
|
def transform(item) -> dict:
|
|
607
|
-
|
|
616
|
+
def get(f):
|
|
617
|
+
return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
608
618
|
|
|
609
619
|
query = get(query_field)
|
|
610
620
|
response = get(response_field)
|
|
@@ -617,7 +627,7 @@ def to_swift_query_response(
|
|
|
617
627
|
current_query = ""
|
|
618
628
|
current_response = ""
|
|
619
629
|
|
|
620
|
-
for
|
|
630
|
+
for _i, msg in enumerate(messages):
|
|
621
631
|
role = msg.get("role", "")
|
|
622
632
|
content = msg.get("content", "")
|
|
623
633
|
|
|
@@ -697,7 +707,9 @@ def to_swift_vlm(
|
|
|
697
707
|
"""
|
|
698
708
|
|
|
699
709
|
def transform(item) -> dict:
|
|
700
|
-
|
|
710
|
+
def get(f):
|
|
711
|
+
return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
|
|
712
|
+
|
|
701
713
|
messages = get(messages_field) or []
|
|
702
714
|
|
|
703
715
|
result_messages = []
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dtflow
|
|
3
|
-
Version: 0.5.3
|
|
3
|
+
Version: 0.5.4
|
|
4
4
|
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
|
|
5
5
|
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
|
|
6
6
|
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
dtflow/__init__.py,sha256=
|
|
1
|
+
dtflow/__init__.py,sha256=yUwvKuVAmhDnp-1tYhZGlZcTdiEnZ3Jh-IJymgMIUhA,3031
|
|
2
2
|
dtflow/__main__.py,sha256=ySpqvEn7k-vsrYFPx-8O6p-yx_24KccgnOSPd2XybhM,12572
|
|
3
|
-
dtflow/converters.py,sha256=
|
|
3
|
+
dtflow/converters.py,sha256=X3qeFD7FCOMnfiP3MicL5MXimOm4XUYBs5pczIkudU0,22331
|
|
4
4
|
dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
|
|
5
5
|
dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
|
|
6
6
|
dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
|
|
@@ -12,11 +12,11 @@ dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
|
|
|
12
12
|
dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
|
|
13
13
|
dtflow/cli/clean.py,sha256=y9VCRibgK1j8WIY3h0XZX0m93EdELQC7TdnseMWwS-0,17799
|
|
14
14
|
dtflow/cli/commands.py,sha256=ST65Ox_MKu-CKAtPVaxECAPXYOJiF7BhL32A4nsZZl0,1175
|
|
15
|
-
dtflow/cli/common.py,sha256=
|
|
15
|
+
dtflow/cli/common.py,sha256=gCwnF5Sw2ploqfZJO_z3Ms9mR1HNT7Lj6ydHn0uVaIw,13817
|
|
16
16
|
dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
|
|
17
17
|
dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
|
|
18
18
|
dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
|
|
19
|
-
dtflow/cli/sample.py,sha256=
|
|
19
|
+
dtflow/cli/sample.py,sha256=LRCkpFi9t0CI2QjRKADmvwWMdGfLriqdNkoFG6_wQkY,10497
|
|
20
20
|
dtflow/cli/stats.py,sha256=u4ehCfgw1X8WuOyAjrApMRgcIO3BVmINbsTjxEscQro,24086
|
|
21
21
|
dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
|
|
22
22
|
dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
|
|
@@ -31,7 +31,7 @@ dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
|
|
|
31
31
|
dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
|
|
32
32
|
dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
|
|
33
33
|
dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
|
|
34
|
-
dtflow-0.5.
|
|
35
|
-
dtflow-0.5.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
36
|
-
dtflow-0.5.3.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
37
|
-
dtflow-0.5.3.dist-info/RECORD,,
|
|
34
|
+
dtflow-0.5.4.dist-info/METADATA,sha256=mQIIV3B-6VBOuNSRiPQjqOwdLTs6Nir6to1_FIER3d0,22544
|
|
35
|
+
dtflow-0.5.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
36
|
+
dtflow-0.5.4.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
37
|
+
dtflow-0.5.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|