dtflow 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/__init__.py CHANGED
@@ -60,7 +60,7 @@ from .tokenizers import (
60
60
  token_stats,
61
61
  )
62
62
 
63
- __version__ = "0.5.3"
63
+ __version__ = "0.5.4"
64
64
 
65
65
  __all__ = [
66
66
  # core
dtflow/cli/common.py CHANGED
@@ -100,8 +100,6 @@ def _format_nested(
100
100
  └─ 最后一项
101
101
  """
102
102
  lines = []
103
- branch = "└─ " if is_last else "├─ "
104
- cont = " " if is_last else "│ "
105
103
 
106
104
  if isinstance(value, dict):
107
105
  items = list(value.items())
@@ -183,6 +181,7 @@ def _print_samples(
183
181
  filename: Optional[str] = None,
184
182
  total_count: Optional[int] = None,
185
183
  fields: Optional[List[str]] = None,
184
+ file_size: Optional[int] = None,
186
185
  ) -> None:
187
186
  """
188
187
  打印采样结果。
@@ -190,8 +189,9 @@ def _print_samples(
190
189
  Args:
191
190
  samples: 采样数据列表
192
191
  filename: 文件名(用于显示概览)
193
- total_count: 文件总行数(用于显示概览)
192
+ total_count: 文件总行数(用于显示概览),大文件时可能为 None
194
193
  fields: 只显示指定字段
194
+ file_size: 文件大小(字节),当 total_count 为 None 时显示
195
195
  """
196
196
  if not samples:
197
197
  print("没有数据")
@@ -219,6 +219,8 @@ def _print_samples(
219
219
 
220
220
  if total_count is not None:
221
221
  info = f"总行数: {total_count:,} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
222
+ elif file_size is not None:
223
+ info = f"文件大小: {_format_file_size(file_size)} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
222
224
  else:
223
225
  info = f"采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
224
226
 
@@ -266,6 +268,10 @@ def _print_samples(
266
268
  print(
267
269
  f" 总行数: {total_count:,} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
268
270
  )
271
+ elif file_size is not None:
272
+ print(
273
+ f" 文件大小: {_format_file_size(file_size)} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
274
+ )
269
275
  else:
270
276
  print(f" 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个")
271
277
  print(f" 字段: {', '.join(sorted(all_fields))}")
@@ -287,6 +293,15 @@ def _parse_field_list(value: Any) -> List[str]:
287
293
  return [str(value)]
288
294
 
289
295
 
296
+ def _format_file_size(size: int) -> str:
297
+ """格式化文件大小"""
298
+ for unit in ["B", "KB", "MB", "GB"]:
299
+ if size < 1024:
300
+ return f"{size:.1f} {unit}"
301
+ size /= 1024
302
+ return f"{size:.1f} TB"
303
+
304
+
290
305
  def _is_empty_value(v: Any) -> bool:
291
306
  """判断值是否为空"""
292
307
  if v is None:
dtflow/cli/sample.py CHANGED
@@ -99,11 +99,15 @@ def sample(
99
99
  for item in sampled:
100
100
  print(orjson.dumps(item, option=orjson.OPT_INDENT_2).decode("utf-8"))
101
101
  else:
102
- # 获取文件总行数用于显示
103
- total_count = _get_file_row_count(filepath)
102
+ # 大文件跳过行数统计(50MB 阈值)
103
+ file_size = filepath.stat().st_size
104
+ if file_size < 50 * 1024 * 1024:
105
+ total_count = _get_file_row_count(filepath)
106
+ else:
107
+ total_count = None
104
108
  # 解析 fields 参数
105
109
  field_list = _parse_field_list(fields) if fields else None
106
- _print_samples(sampled, filepath.name, total_count, field_list)
110
+ _print_samples(sampled, filepath.name, total_count, field_list, file_size)
107
111
 
108
112
 
109
113
  def _stratified_sample(
@@ -196,7 +200,7 @@ def _stratified_sample(
196
200
 
197
201
  # 执行各组采样
198
202
  result = []
199
- print(f"🔄 执行采样...")
203
+ print("🔄 执行采样...")
200
204
  for key in group_keys:
201
205
  group_data = groups[key]
202
206
  target = min(sample_counts[key], len(group_data))
@@ -215,7 +219,7 @@ def _stratified_sample(
215
219
  result.extend(sampled)
216
220
 
217
221
  # 打印采样结果
218
- print(f"\n📋 采样结果:")
222
+ print("\n📋 采样结果:")
219
223
  result_groups: Dict[Any, int] = defaultdict(int)
220
224
  for item in result:
221
225
  key = item.get(stratify_field, "__null__")
dtflow/converters.py CHANGED
@@ -23,8 +23,8 @@ def to_hf_dataset(data: List[Dict[str, Any]]):
23
23
  """
24
24
  try:
25
25
  from datasets import Dataset
26
- except ImportError:
27
- raise ImportError("需要安装 datasets: pip install datasets")
26
+ except ImportError as e:
27
+ raise ImportError("需要安装 datasets: pip install datasets") from e
28
28
 
29
29
  return Dataset.from_list(data)
30
30
 
@@ -45,9 +45,9 @@ def from_hf_dataset(dataset, split: Optional[str] = None) -> List[Dict[str, Any]
45
45
  >>> data = from_hf_dataset(my_dataset, split="train")
46
46
  """
47
47
  try:
48
- from datasets import Dataset, DatasetDict, load_dataset
49
- except ImportError:
50
- raise ImportError("需要安装 datasets: pip install datasets")
48
+ from datasets import load_dataset
49
+ except ImportError as e:
50
+ raise ImportError("需要安装 datasets: pip install datasets") from e
51
51
 
52
52
  # 如果是字符串,加载数据集
53
53
  if isinstance(dataset, str):
@@ -198,7 +198,8 @@ def to_llama_factory(
198
198
  """
199
199
 
200
200
  def transform(item) -> dict:
201
- get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
201
+ def get(f):
202
+ return item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
202
203
 
203
204
  result = {
204
205
  "instruction": get(instruction_field),
@@ -316,7 +317,9 @@ def to_llama_factory_sharegpt(
316
317
  }
317
318
 
318
319
  def transform(item) -> dict:
319
- get = lambda f: item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
320
+ def get(f):
321
+ return item.get(f, "") if hasattr(item, "get") else getattr(item, f, "")
322
+
320
323
  messages = get(messages_field) or []
321
324
 
322
325
  conversations = []
@@ -389,7 +392,9 @@ def to_llama_factory_vlm(
389
392
  """
390
393
 
391
394
  def transform(item) -> dict:
392
- get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
395
+ def get(f):
396
+ return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
397
+
393
398
  messages = get(messages_field) or []
394
399
 
395
400
  instruction = ""
@@ -471,7 +476,9 @@ def to_llama_factory_vlm_sharegpt(
471
476
  role_map = {"user": "human", "assistant": "gpt", "system": "system"}
472
477
 
473
478
  def transform(item) -> dict:
474
- get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
479
+ def get(f):
480
+ return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
481
+
475
482
  messages = get(messages_field) or []
476
483
 
477
484
  conversations = []
@@ -545,7 +552,9 @@ def to_swift_messages(
545
552
  """
546
553
 
547
554
  def transform(item) -> dict:
548
- get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
555
+ def get(f):
556
+ return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
557
+
549
558
  messages = get(messages_field) or []
550
559
 
551
560
  # 复制 messages,避免修改原数据
@@ -604,7 +613,8 @@ def to_swift_query_response(
604
613
  """
605
614
 
606
615
  def transform(item) -> dict:
607
- get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
616
+ def get(f):
617
+ return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
608
618
 
609
619
  query = get(query_field)
610
620
  response = get(response_field)
@@ -617,7 +627,7 @@ def to_swift_query_response(
617
627
  current_query = ""
618
628
  current_response = ""
619
629
 
620
- for i, msg in enumerate(messages):
630
+ for _i, msg in enumerate(messages):
621
631
  role = msg.get("role", "")
622
632
  content = msg.get("content", "")
623
633
 
@@ -697,7 +707,9 @@ def to_swift_vlm(
697
707
  """
698
708
 
699
709
  def transform(item) -> dict:
700
- get = lambda f: item.get(f) if hasattr(item, "get") else getattr(item, f, None)
710
+ def get(f):
711
+ return item.get(f) if hasattr(item, "get") else getattr(item, f, None)
712
+
701
713
  messages = get(messages_field) or []
702
714
 
703
715
  result_messages = []
{dtflow-0.5.3.dist-info → dtflow-0.5.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dtflow
3
- Version: 0.5.3
3
+ Version: 0.5.4
4
4
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
5
5
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
6
6
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
{dtflow-0.5.3.dist-info → dtflow-0.5.4.dist-info}/RECORD CHANGED
@@ -1,6 +1,6 @@
1
- dtflow/__init__.py,sha256=RJql_KmINJNbq2FEqU7jD9Z0c5ETkxQJPvUUPKiFt74,3031
1
+ dtflow/__init__.py,sha256=yUwvKuVAmhDnp-1tYhZGlZcTdiEnZ3Jh-IJymgMIUhA,3031
2
2
  dtflow/__main__.py,sha256=ySpqvEn7k-vsrYFPx-8O6p-yx_24KccgnOSPd2XybhM,12572
3
- dtflow/converters.py,sha256=yXafSDeRC7DB2MMj8fD1NWjAG8HoAGh5Ay2A5Z7s6xA,22206
3
+ dtflow/converters.py,sha256=X3qeFD7FCOMnfiP3MicL5MXimOm4XUYBs5pczIkudU0,22331
4
4
  dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
5
5
  dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
6
6
  dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
@@ -12,11 +12,11 @@ dtflow/tokenizers.py,sha256=7ZAelSmcDxLWH5kICgH9Q1ULH3_BfDZb9suHMjJJRZU,20589
12
12
  dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
13
13
  dtflow/cli/clean.py,sha256=y9VCRibgK1j8WIY3h0XZX0m93EdELQC7TdnseMWwS-0,17799
14
14
  dtflow/cli/commands.py,sha256=ST65Ox_MKu-CKAtPVaxECAPXYOJiF7BhL32A4nsZZl0,1175
15
- dtflow/cli/common.py,sha256=nIPc9GBK61r6kmaI9OS3IyhcfPqShpDEHx1ddjFPnlM,13131
15
+ dtflow/cli/common.py,sha256=gCwnF5Sw2ploqfZJO_z3Ms9mR1HNT7Lj6ydHn0uVaIw,13817
16
16
  dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
17
17
  dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
18
18
  dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
19
- dtflow/cli/sample.py,sha256=vPTQlF0OXEry4QjO8uaD9vOae4AQbX9zDwVYOxg59ZI,10339
19
+ dtflow/cli/sample.py,sha256=LRCkpFi9t0CI2QjRKADmvwWMdGfLriqdNkoFG6_wQkY,10497
20
20
  dtflow/cli/stats.py,sha256=u4ehCfgw1X8WuOyAjrApMRgcIO3BVmINbsTjxEscQro,24086
21
21
  dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
22
22
  dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
@@ -31,7 +31,7 @@ dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
31
31
  dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
32
32
  dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
33
33
  dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
34
- dtflow-0.5.3.dist-info/METADATA,sha256=5joXihL8gkmnNEaUTqRpe0_U-y8osaIfdX0v91WVtK8,22544
35
- dtflow-0.5.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
36
- dtflow-0.5.3.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
37
- dtflow-0.5.3.dist-info/RECORD,,
34
+ dtflow-0.5.4.dist-info/METADATA,sha256=mQIIV3B-6VBOuNSRiPQjqOwdLTs6Nir6to1_FIER3d0,22544
35
+ dtflow-0.5.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
36
+ dtflow-0.5.4.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
37
+ dtflow-0.5.4.dist-info/RECORD,,
File without changes