dtflow 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/__init__.py CHANGED
@@ -60,7 +60,7 @@ from .tokenizers import (
60
60
  token_stats,
61
61
  )
62
62
 
63
- __version__ = "0.5.4"
63
+ __version__ = "0.5.5"
64
64
 
65
65
  __all__ = [
66
66
  # core
dtflow/__main__.py CHANGED
@@ -60,7 +60,7 @@ def sample(
60
60
  filename: str = typer.Argument(..., help="输入文件路径"),
61
61
  num_arg: Optional[int] = typer.Argument(None, help="采样数量", metavar="NUM"),
62
62
  num: int = typer.Option(10, "--num", "-n", help="采样数量", show_default=True),
63
- type: str = typer.Option("head", "--type", "-t", help="采样方式: random/head/tail"),
63
+ type: str = typer.Option("random", "--type", "-t", help="采样方式: random/head/tail"),
64
64
  output: Optional[str] = typer.Option(None, "--output", "-o", help="输出文件路径"),
65
65
  seed: Optional[int] = typer.Option(None, "--seed", help="随机种子"),
66
66
  by: Optional[str] = typer.Option(None, "--by", help="分层采样字段"),
@@ -223,9 +223,7 @@ def validate(
223
223
  None, "--preset", "-p", help="预设 Schema: openai_chat, alpaca, dpo, sharegpt"
224
224
  ),
225
225
  output: Optional[str] = typer.Option(None, "--output", "-o", help="输出有效数据的文件路径"),
226
- filter: bool = typer.Option(
227
- False, "--filter", "-f", help="过滤无效数据并保存"
228
- ),
226
+ filter: bool = typer.Option(False, "--filter", "-f", help="过滤无效数据并保存"),
229
227
  max_errors: int = typer.Option(20, "--max-errors", help="最多显示的错误数量"),
230
228
  verbose: bool = typer.Option(False, "--verbose", "-v", help="显示详细信息"),
231
229
  ):
dtflow/cli/sample.py CHANGED
@@ -20,7 +20,7 @@ from .common import (
20
20
  def sample(
21
21
  filename: str,
22
22
  num: int = 10,
23
- type: Literal["random", "head", "tail"] = "head",
23
+ type: Literal["random", "head", "tail"] = "random",
24
24
  output: Optional[str] = None,
25
25
  seed: Optional[int] = None,
26
26
  by: Optional[str] = None,
@@ -37,7 +37,7 @@ def sample(
37
37
  - num > 0: 采样指定数量
38
38
  - num = 0: 采样所有数据
39
39
  - num < 0: Python 切片风格(如 -1 表示最后 1 条,-10 表示最后 10 条)
40
- type: 采样方式,可选 random/head/tail,默认 head
40
+ type: 采样方式,可选 random/head/tail,默认 random
41
41
  output: 输出文件路径,不指定则打印到控制台
42
42
  seed: 随机种子(仅在 type=random 时有效)
43
43
  by: 分层采样字段名,按该字段的值分组采样
dtflow-0.5.4.dist-info/METADATA → dtflow-0.5.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dtflow
3
- Version: 0.5.4
3
+ Version: 0.5.5
4
4
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
5
5
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
6
6
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
dtflow-0.5.4.dist-info/RECORD → dtflow-0.5.5.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
1
- dtflow/__init__.py,sha256=yUwvKuVAmhDnp-1tYhZGlZcTdiEnZ3Jh-IJymgMIUhA,3031
2
- dtflow/__main__.py,sha256=ySpqvEn7k-vsrYFPx-8O6p-yx_24KccgnOSPd2XybhM,12572
1
+ dtflow/__init__.py,sha256=6vE7u7NxILhzgsAoQzel0huISJhLLGgqVW2obUMz1xM,3031
2
+ dtflow/__main__.py,sha256=htJk04NOEZbDoZILnwCMKjG--NPWuwv-2ZbFXlk7jjM,12558
3
3
  dtflow/converters.py,sha256=X3qeFD7FCOMnfiP3MicL5MXimOm4XUYBs5pczIkudU0,22331
4
4
  dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
5
5
  dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
@@ -16,7 +16,7 @@ dtflow/cli/common.py,sha256=gCwnF5Sw2ploqfZJO_z3Ms9mR1HNT7Lj6ydHn0uVaIw,13817
16
16
  dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
17
17
  dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
18
18
  dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
19
- dtflow/cli/sample.py,sha256=LRCkpFi9t0CI2QjRKADmvwWMdGfLriqdNkoFG6_wQkY,10497
19
+ dtflow/cli/sample.py,sha256=y_yRG2vBr300HZD4MosHZ4uAYhiTSqDxVN-ZMWTURog,10501
20
20
  dtflow/cli/stats.py,sha256=u4ehCfgw1X8WuOyAjrApMRgcIO3BVmINbsTjxEscQro,24086
21
21
  dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
22
22
  dtflow/cli/validate.py,sha256=65aGVlMS_Rq0Ch0YQ-TclVJ03RQP4CnG137wthzb8Ao,4384
@@ -31,7 +31,7 @@ dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
31
31
  dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
32
32
  dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
33
33
  dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
34
- dtflow-0.5.4.dist-info/METADATA,sha256=mQIIV3B-6VBOuNSRiPQjqOwdLTs6Nir6to1_FIER3d0,22544
35
- dtflow-0.5.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
36
- dtflow-0.5.4.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
37
- dtflow-0.5.4.dist-info/RECORD,,
34
+ dtflow-0.5.5.dist-info/METADATA,sha256=SzaIuZqLFK-Ncnim1NmBUs7bdnqIJiGtPikbnrM-gsA,22544
35
+ dtflow-0.5.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
36
+ dtflow-0.5.5.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
37
+ dtflow-0.5.5.dist-info/RECORD,,
File without changes