dtflow 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +1 -1
- dtflow/cli/commands.py +83 -40
- dtflow/core.py +32 -7
- dtflow/streaming.py +21 -11
- dtflow/tokenizers.py +6 -4
- dtflow/utils/__init__.py +19 -1
- dtflow/utils/field_path.py +274 -0
- {dtflow-0.3.2.dist-info → dtflow-0.4.1.dist-info}/METADATA +48 -3
- {dtflow-0.3.2.dist-info → dtflow-0.4.1.dist-info}/RECORD +11 -10
- {dtflow-0.3.2.dist-info → dtflow-0.4.1.dist-info}/WHEEL +0 -0
- {dtflow-0.3.2.dist-info → dtflow-0.4.1.dist-info}/entry_points.txt +0 -0
dtflow/__init__.py
CHANGED
dtflow/cli/commands.py
CHANGED
|
@@ -17,6 +17,7 @@ from ..pipeline import run_pipeline, validate_pipeline
|
|
|
17
17
|
from ..presets import get_preset, list_presets
|
|
18
18
|
from ..storage.io import load_data, sample_file, save_data
|
|
19
19
|
from ..streaming import load_stream
|
|
20
|
+
from ..utils.field_path import get_field_with_spec
|
|
20
21
|
|
|
21
22
|
# 支持的文件格式
|
|
22
23
|
SUPPORTED_FORMATS = {".csv", ".jsonl", ".json", ".xlsx", ".xls", ".parquet", ".arrow", ".feather"}
|
|
@@ -137,7 +138,12 @@ def _stratified_sample(
|
|
|
137
138
|
Args:
|
|
138
139
|
filepath: 文件路径
|
|
139
140
|
num: 目标采样总数
|
|
140
|
-
stratify_field:
|
|
141
|
+
stratify_field: 分层字段,支持嵌套路径语法:
|
|
142
|
+
- meta.source 嵌套字段
|
|
143
|
+
- messages[0].role 数组索引
|
|
144
|
+
- messages[-1].role 负索引
|
|
145
|
+
- messages.# 数组长度
|
|
146
|
+
- messages[*].role 展开所有元素(可加 :join/:unique 模式)
|
|
141
147
|
uniform: 是否均匀采样(各组相同数量)
|
|
142
148
|
seed: 随机种子
|
|
143
149
|
sample_type: 采样方式(用于组内采样)
|
|
@@ -158,10 +164,13 @@ def _stratified_sample(
|
|
|
158
164
|
if num <= 0 or num > total:
|
|
159
165
|
num = total
|
|
160
166
|
|
|
161
|
-
#
|
|
167
|
+
# 按字段分组(支持嵌套路径语法)
|
|
162
168
|
groups: Dict[Any, List[Dict]] = defaultdict(list)
|
|
163
169
|
for item in data:
|
|
164
|
-
key = item
|
|
170
|
+
key = get_field_with_spec(item, stratify_field, default="__null__")
|
|
171
|
+
# 确保 key 可哈希
|
|
172
|
+
if isinstance(key, list):
|
|
173
|
+
key = tuple(key)
|
|
165
174
|
groups[key].append(item)
|
|
166
175
|
|
|
167
176
|
group_keys = list(groups.keys())
|
|
@@ -787,6 +796,17 @@ def _generate_default_transform(field_names: List[str]) -> str:
|
|
|
787
796
|
return "\n".join(lines) if lines else " # 在这里定义输出字段"
|
|
788
797
|
|
|
789
798
|
|
|
799
|
+
def _unwrap(obj: Any) -> Any:
|
|
800
|
+
"""递归将 DictWrapper 转换为普通 dict"""
|
|
801
|
+
if hasattr(obj, "to_dict"):
|
|
802
|
+
return _unwrap(obj.to_dict())
|
|
803
|
+
if isinstance(obj, dict):
|
|
804
|
+
return {k: _unwrap(v) for k, v in obj.items()}
|
|
805
|
+
if isinstance(obj, list):
|
|
806
|
+
return [_unwrap(v) for v in obj]
|
|
807
|
+
return obj
|
|
808
|
+
|
|
809
|
+
|
|
790
810
|
def _execute_transform(
|
|
791
811
|
input_path: Path,
|
|
792
812
|
config_path: Path,
|
|
@@ -820,7 +840,8 @@ def _execute_transform(
|
|
|
820
840
|
try:
|
|
821
841
|
# 包装转换函数以支持属性访问(配置文件中定义的 Item 类)
|
|
822
842
|
def wrapped_transform(item):
|
|
823
|
-
|
|
843
|
+
result = transform_func(DictWrapper(item))
|
|
844
|
+
return _unwrap(result)
|
|
824
845
|
|
|
825
846
|
st = load_stream(str(input_path))
|
|
826
847
|
if num:
|
|
@@ -917,7 +938,8 @@ def _execute_preset_transform(
|
|
|
917
938
|
try:
|
|
918
939
|
# 包装转换函数以支持属性访问
|
|
919
940
|
def wrapped_transform(item):
|
|
920
|
-
|
|
941
|
+
result = transform_func(DictWrapper(item))
|
|
942
|
+
return _unwrap(result)
|
|
921
943
|
|
|
922
944
|
st = load_stream(str(input_path))
|
|
923
945
|
if num:
|
|
@@ -1006,7 +1028,13 @@ def dedupe(
|
|
|
1006
1028
|
|
|
1007
1029
|
Args:
|
|
1008
1030
|
filename: 输入文件路径,支持 csv/excel/jsonl/json/parquet/arrow/feather 格式
|
|
1009
|
-
key:
|
|
1031
|
+
key: 去重依据字段,支持嵌套路径语法:
|
|
1032
|
+
- meta.source 嵌套字段
|
|
1033
|
+
- messages[0].role 数组索引
|
|
1034
|
+
- messages[-1].content 负索引
|
|
1035
|
+
- messages.# 数组长度
|
|
1036
|
+
- messages[*].role:join 展开所有元素
|
|
1037
|
+
多个字段用逗号分隔。不指定则全量去重
|
|
1010
1038
|
similar: 相似度阈值(0-1),指定后启用相似度去重模式,需要指定 --key
|
|
1011
1039
|
output: 输出文件路径,不指定则覆盖原文件
|
|
1012
1040
|
|
|
@@ -1014,8 +1042,9 @@ def dedupe(
|
|
|
1014
1042
|
dt dedupe data.jsonl # 全量精确去重
|
|
1015
1043
|
dt dedupe data.jsonl --key=text # 按 text 字段精确去重
|
|
1016
1044
|
dt dedupe data.jsonl --key=user,timestamp # 按多字段组合精确去重
|
|
1017
|
-
dt dedupe data.jsonl --key=
|
|
1018
|
-
dt dedupe data.jsonl --
|
|
1045
|
+
dt dedupe data.jsonl --key=meta.id # 按嵌套字段去重
|
|
1046
|
+
dt dedupe data.jsonl --key=messages[0].content # 按第一条消息内容去重
|
|
1047
|
+
dt dedupe data.jsonl --key=text --similar=0.8 # 相似度去重
|
|
1019
1048
|
"""
|
|
1020
1049
|
filepath = Path(filename)
|
|
1021
1050
|
|
|
@@ -1596,25 +1625,26 @@ def clean(
|
|
|
1596
1625
|
|
|
1597
1626
|
Args:
|
|
1598
1627
|
filename: 输入文件路径,支持 csv/excel/jsonl/json/parquet/arrow/feather 格式
|
|
1599
|
-
drop_empty:
|
|
1628
|
+
drop_empty: 删除空值记录,支持嵌套路径语法
|
|
1600
1629
|
- 不带值:删除任意字段为空的记录
|
|
1601
1630
|
- 指定字段:删除指定字段为空的记录(逗号分隔)
|
|
1602
|
-
min_len: 最小长度过滤,格式 "字段:长度"
|
|
1603
|
-
max_len: 最大长度过滤,格式 "字段:长度"
|
|
1604
|
-
keep:
|
|
1605
|
-
drop:
|
|
1631
|
+
min_len: 最小长度过滤,格式 "字段:长度",字段支持嵌套路径
|
|
1632
|
+
max_len: 最大长度过滤,格式 "字段:长度",字段支持嵌套路径
|
|
1633
|
+
keep: 只保留指定字段(逗号分隔,仅支持顶层字段)
|
|
1634
|
+
drop: 删除指定字段(逗号分隔,仅支持顶层字段)
|
|
1606
1635
|
strip: 去除所有字符串字段的首尾空白
|
|
1607
1636
|
output: 输出文件路径,不指定则覆盖原文件
|
|
1608
1637
|
|
|
1609
1638
|
Examples:
|
|
1610
1639
|
dt clean data.jsonl --drop-empty # 删除任意空值记录
|
|
1611
1640
|
dt clean data.jsonl --drop-empty=text,answer # 删除指定字段为空的记录
|
|
1641
|
+
dt clean data.jsonl --drop-empty=meta.source # 删除嵌套字段为空的记录
|
|
1612
1642
|
dt clean data.jsonl --min-len=text:10 # text 字段最少 10 字符
|
|
1613
|
-
dt clean data.jsonl --
|
|
1643
|
+
dt clean data.jsonl --min-len=messages.#:2 # 至少 2 条消息
|
|
1644
|
+
dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最多 500 字符
|
|
1614
1645
|
dt clean data.jsonl --keep=question,answer # 只保留这些字段
|
|
1615
1646
|
dt clean data.jsonl --drop=metadata,timestamp # 删除这些字段
|
|
1616
1647
|
dt clean data.jsonl --strip # 去除字符串首尾空白
|
|
1617
|
-
dt clean data.jsonl --drop-empty --strip -o out.jsonl
|
|
1618
1648
|
"""
|
|
1619
1649
|
filepath = Path(filename)
|
|
1620
1650
|
|
|
@@ -1784,9 +1814,18 @@ def _is_empty_value(v: Any) -> bool:
|
|
|
1784
1814
|
|
|
1785
1815
|
|
|
1786
1816
|
def _get_value_len(value: Any) -> int:
|
|
1787
|
-
"""
|
|
1817
|
+
"""
|
|
1818
|
+
获取值的长度。
|
|
1819
|
+
|
|
1820
|
+
- str/list/dict: 返回 len()
|
|
1821
|
+
- int/float: 直接返回该数值(用于 messages.# 这种返回数量的场景)
|
|
1822
|
+
- None: 返回 0
|
|
1823
|
+
- 其他: 转为字符串后返回长度
|
|
1824
|
+
"""
|
|
1788
1825
|
if value is None:
|
|
1789
1826
|
return 0
|
|
1827
|
+
if isinstance(value, (int, float)):
|
|
1828
|
+
return int(value)
|
|
1790
1829
|
if isinstance(value, (str, list, dict)):
|
|
1791
1830
|
return len(value)
|
|
1792
1831
|
return len(str(value))
|
|
@@ -1809,13 +1848,13 @@ def _clean_data_single_pass(
|
|
|
1809
1848
|
Args:
|
|
1810
1849
|
data: 原始数据列表
|
|
1811
1850
|
strip: 是否去除字符串首尾空白
|
|
1812
|
-
empty_fields:
|
|
1813
|
-
min_len_field:
|
|
1851
|
+
empty_fields: 检查空值的字段列表(支持嵌套路径),空列表表示检查所有字段,None 表示不检查
|
|
1852
|
+
min_len_field: 最小长度检查的字段(支持嵌套路径)
|
|
1814
1853
|
min_len_value: 最小长度值
|
|
1815
|
-
max_len_field:
|
|
1854
|
+
max_len_field: 最大长度检查的字段(支持嵌套路径)
|
|
1816
1855
|
max_len_value: 最大长度值
|
|
1817
|
-
keep_fields:
|
|
1818
|
-
drop_fields:
|
|
1856
|
+
keep_fields: 只保留的字段列表(仅支持顶层字段)
|
|
1857
|
+
drop_fields: 要删除的字段集合(仅支持顶层字段)
|
|
1819
1858
|
|
|
1820
1859
|
Returns:
|
|
1821
1860
|
(清洗后的数据, 统计信息列表)
|
|
@@ -1843,20 +1882,20 @@ def _clean_data_single_pass(
|
|
|
1843
1882
|
stats["drop_empty"] += 1
|
|
1844
1883
|
continue
|
|
1845
1884
|
else:
|
|
1846
|
-
#
|
|
1847
|
-
if any(_is_empty_value(item
|
|
1885
|
+
# 检查指定字段(支持嵌套路径)
|
|
1886
|
+
if any(_is_empty_value(get_field_with_spec(item, f)) for f in empty_fields):
|
|
1848
1887
|
stats["drop_empty"] += 1
|
|
1849
1888
|
continue
|
|
1850
1889
|
|
|
1851
|
-
# 3.
|
|
1890
|
+
# 3. 最小长度过滤(支持嵌套路径)
|
|
1852
1891
|
if min_len_field is not None:
|
|
1853
|
-
if _get_value_len(item
|
|
1892
|
+
if _get_value_len(get_field_with_spec(item, min_len_field, default="")) < min_len_value:
|
|
1854
1893
|
stats["min_len"] += 1
|
|
1855
1894
|
continue
|
|
1856
1895
|
|
|
1857
|
-
# 4.
|
|
1896
|
+
# 4. 最大长度过滤(支持嵌套路径)
|
|
1858
1897
|
if max_len_field is not None:
|
|
1859
|
-
if _get_value_len(item
|
|
1898
|
+
if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
|
|
1860
1899
|
stats["max_len"] += 1
|
|
1861
1900
|
continue
|
|
1862
1901
|
|
|
@@ -1906,24 +1945,25 @@ def _clean_streaming(
|
|
|
1906
1945
|
"""
|
|
1907
1946
|
|
|
1908
1947
|
def clean_filter(item: Dict) -> bool:
|
|
1909
|
-
"""过滤函数:返回 True 保留,False
|
|
1948
|
+
"""过滤函数:返回 True 保留,False 过滤(支持嵌套路径)"""
|
|
1910
1949
|
# 空值过滤
|
|
1911
1950
|
if empty_fields is not None:
|
|
1912
1951
|
if len(empty_fields) == 0:
|
|
1913
1952
|
if any(_is_empty_value(v) for v in item.values()):
|
|
1914
1953
|
return False
|
|
1915
1954
|
else:
|
|
1916
|
-
|
|
1955
|
+
# 支持嵌套路径
|
|
1956
|
+
if any(_is_empty_value(get_field_with_spec(item, f)) for f in empty_fields):
|
|
1917
1957
|
return False
|
|
1918
1958
|
|
|
1919
|
-
#
|
|
1959
|
+
# 最小长度过滤(支持嵌套路径)
|
|
1920
1960
|
if min_len_field is not None:
|
|
1921
|
-
if _get_value_len(item
|
|
1961
|
+
if _get_value_len(get_field_with_spec(item, min_len_field, default="")) < min_len_value:
|
|
1922
1962
|
return False
|
|
1923
1963
|
|
|
1924
|
-
#
|
|
1964
|
+
# 最大长度过滤(支持嵌套路径)
|
|
1925
1965
|
if max_len_field is not None:
|
|
1926
|
-
if _get_value_len(item
|
|
1966
|
+
if _get_value_len(get_field_with_spec(item, max_len_field, default="")) > max_len_value:
|
|
1927
1967
|
return False
|
|
1928
1968
|
|
|
1929
1969
|
return True
|
|
@@ -2033,13 +2073,15 @@ def token_stats(
|
|
|
2033
2073
|
|
|
2034
2074
|
Args:
|
|
2035
2075
|
filename: 输入文件路径
|
|
2036
|
-
field: 要统计的字段(默认 messages
|
|
2076
|
+
field: 要统计的字段(默认 messages),支持嵌套路径语法
|
|
2037
2077
|
model: 分词器: cl100k_base (默认), qwen2.5, llama3, gpt-4 等
|
|
2038
2078
|
detailed: 是否显示详细统计
|
|
2039
2079
|
|
|
2040
2080
|
Examples:
|
|
2041
2081
|
dt token-stats data.jsonl
|
|
2042
2082
|
dt token-stats data.jsonl --field=text --model=qwen2.5
|
|
2083
|
+
dt token-stats data.jsonl --field=conversation.messages
|
|
2084
|
+
dt token-stats data.jsonl --field=messages[-1].content # 统计最后一条消息
|
|
2043
2085
|
dt token-stats data.jsonl --detailed
|
|
2044
2086
|
"""
|
|
2045
2087
|
filepath = Path(filename)
|
|
@@ -2067,9 +2109,9 @@ def token_stats(
|
|
|
2067
2109
|
print(f" 共 {total} 条数据")
|
|
2068
2110
|
print(f"🔢 统计 Token (模型: {model}, 字段: {field})...")
|
|
2069
2111
|
|
|
2070
|
-
#
|
|
2112
|
+
# 检查字段类型并选择合适的统计方法(支持嵌套路径)
|
|
2071
2113
|
sample = data[0]
|
|
2072
|
-
field_value = sample
|
|
2114
|
+
field_value = get_field_with_spec(sample, field)
|
|
2073
2115
|
|
|
2074
2116
|
try:
|
|
2075
2117
|
if isinstance(field_value, list) and field_value and isinstance(field_value[0], dict):
|
|
@@ -2203,12 +2245,13 @@ def diff(
|
|
|
2203
2245
|
Args:
|
|
2204
2246
|
file1: 第一个文件路径
|
|
2205
2247
|
file2: 第二个文件路径
|
|
2206
|
-
key:
|
|
2248
|
+
key: 用于匹配的键字段,支持嵌套路径语法(可选)
|
|
2207
2249
|
output: 差异报告输出路径(可选)
|
|
2208
2250
|
|
|
2209
2251
|
Examples:
|
|
2210
2252
|
dt diff v1/train.jsonl v2/train.jsonl
|
|
2211
2253
|
dt diff a.jsonl b.jsonl --key=id
|
|
2254
|
+
dt diff a.jsonl b.jsonl --key=meta.uuid # 按嵌套字段匹配
|
|
2212
2255
|
dt diff a.jsonl b.jsonl --output=diff_report.json
|
|
2213
2256
|
"""
|
|
2214
2257
|
path1 = Path(file1)
|
|
@@ -2271,9 +2314,9 @@ def _compute_diff(
|
|
|
2271
2314
|
}
|
|
2272
2315
|
|
|
2273
2316
|
if key:
|
|
2274
|
-
# 基于 key
|
|
2275
|
-
dict1 = {item
|
|
2276
|
-
dict2 = {item
|
|
2317
|
+
# 基于 key 的精确匹配(支持嵌套路径)
|
|
2318
|
+
dict1 = {get_field_with_spec(item, key): item for item in data1 if get_field_with_spec(item, key) is not None}
|
|
2319
|
+
dict2 = {get_field_with_spec(item, key): item for item in data2 if get_field_with_spec(item, key) is not None}
|
|
2277
2320
|
|
|
2278
2321
|
keys1 = set(dict1.keys())
|
|
2279
2322
|
keys2 = set(dict2.keys())
|
dtflow/core.py
CHANGED
|
@@ -12,6 +12,7 @@ import orjson
|
|
|
12
12
|
|
|
13
13
|
from .lineage import LineageTracker
|
|
14
14
|
from .storage.io import load_data, save_data
|
|
15
|
+
from .utils.field_path import get_field_with_spec
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
def _fast_json_dumps(obj: Any) -> str:
|
|
@@ -393,16 +394,35 @@ class DataTransformer:
|
|
|
393
394
|
item: Dict[str, Any],
|
|
394
395
|
key: Union[None, str, List[str], Callable[[Any], Any]],
|
|
395
396
|
) -> Any:
|
|
396
|
-
"""
|
|
397
|
+
"""
|
|
398
|
+
获取去重用的 key。
|
|
399
|
+
|
|
400
|
+
支持字段路径语法:
|
|
401
|
+
- meta.source 嵌套字段
|
|
402
|
+
- messages[0].role 数组索引
|
|
403
|
+
- messages[-1].role 负索引
|
|
404
|
+
- messages.# 数组长度
|
|
405
|
+
- messages[*].role 展开所有元素(可加 :join/:unique 模式)
|
|
406
|
+
"""
|
|
397
407
|
if key is None:
|
|
398
408
|
# 全量去重:使用快速 JSON 序列化
|
|
399
409
|
return _fast_json_dumps(item)
|
|
400
410
|
elif isinstance(key, str):
|
|
401
|
-
#
|
|
402
|
-
|
|
411
|
+
# 单字段(支持嵌套路径)
|
|
412
|
+
val = get_field_with_spec(item, key)
|
|
413
|
+
# 确保可哈希
|
|
414
|
+
if isinstance(val, list):
|
|
415
|
+
return tuple(val)
|
|
416
|
+
return val
|
|
403
417
|
elif isinstance(key, list):
|
|
404
|
-
#
|
|
405
|
-
|
|
418
|
+
# 多字段组合(每个字段都支持嵌套路径)
|
|
419
|
+
vals = []
|
|
420
|
+
for k in key:
|
|
421
|
+
v = get_field_with_spec(item, k)
|
|
422
|
+
if isinstance(v, list):
|
|
423
|
+
v = tuple(v)
|
|
424
|
+
vals.append(v)
|
|
425
|
+
return tuple(vals)
|
|
406
426
|
elif callable(key):
|
|
407
427
|
# 自定义函数
|
|
408
428
|
return key(DictWrapper(item))
|
|
@@ -506,9 +526,14 @@ class DataTransformer:
|
|
|
506
526
|
item: Dict[str, Any],
|
|
507
527
|
key: Union[str, Callable[[Any], str]],
|
|
508
528
|
) -> str:
|
|
509
|
-
"""
|
|
529
|
+
"""
|
|
530
|
+
获取用于相似度比较的文本。
|
|
531
|
+
|
|
532
|
+
支持字段路径语法(同 _get_dedupe_key)。
|
|
533
|
+
"""
|
|
510
534
|
if isinstance(key, str):
|
|
511
|
-
|
|
535
|
+
val = get_field_with_spec(item, key, default="")
|
|
536
|
+
return str(val) if val else ""
|
|
512
537
|
elif callable(key):
|
|
513
538
|
return str(key(DictWrapper(item)))
|
|
514
539
|
else:
|
dtflow/streaming.py
CHANGED
|
@@ -84,6 +84,8 @@ class StreamingTransformer:
|
|
|
84
84
|
self._source_path = source_path
|
|
85
85
|
self._total = total
|
|
86
86
|
self._operations: List[Dict[str, Any]] = []
|
|
87
|
+
self._error_count = 0
|
|
88
|
+
self._first_error: Optional[str] = None
|
|
87
89
|
|
|
88
90
|
@classmethod
|
|
89
91
|
def load_stream(cls, filepath: str, batch_size: int = 10000) -> "StreamingTransformer":
|
|
@@ -194,17 +196,20 @@ class StreamingTransformer:
|
|
|
194
196
|
Returns:
|
|
195
197
|
新的 StreamingTransformer(惰性,不立即执行)
|
|
196
198
|
"""
|
|
199
|
+
# transform 是 1:1 转换,保留 total
|
|
200
|
+
new_st = StreamingTransformer(iter([]), self._source_path, total=self._total)
|
|
201
|
+
new_st._operations = self._operations + [{"type": "transform", "func": func}]
|
|
197
202
|
|
|
198
203
|
def transformed_iterator():
|
|
199
204
|
for item in self._iterator:
|
|
200
205
|
try:
|
|
201
206
|
yield func(item)
|
|
202
|
-
except Exception:
|
|
203
|
-
|
|
207
|
+
except Exception as e:
|
|
208
|
+
new_st._error_count += 1
|
|
209
|
+
if new_st._first_error is None:
|
|
210
|
+
new_st._first_error = f"{type(e).__name__}: {e}"
|
|
204
211
|
|
|
205
|
-
|
|
206
|
-
new_st = StreamingTransformer(transformed_iterator(), self._source_path, total=self._total)
|
|
207
|
-
new_st._operations = self._operations + [{"type": "transform", "func": func}]
|
|
212
|
+
new_st._iterator = transformed_iterator()
|
|
208
213
|
return new_st
|
|
209
214
|
|
|
210
215
|
def head(self, n: int) -> "StreamingTransformer":
|
|
@@ -299,16 +304,21 @@ class StreamingTransformer:
|
|
|
299
304
|
ext = path.suffix.lower()
|
|
300
305
|
|
|
301
306
|
if ext == ".jsonl":
|
|
302
|
-
|
|
307
|
+
count = self._save_jsonl(filepath, show_progress)
|
|
303
308
|
elif ext == ".csv":
|
|
304
|
-
|
|
309
|
+
count = self._save_batched(filepath, "csv", batch_size, show_progress)
|
|
305
310
|
elif ext == ".parquet":
|
|
306
|
-
|
|
311
|
+
count = self._save_batched(filepath, "parquet", batch_size, show_progress)
|
|
307
312
|
elif ext in (".arrow", ".feather"):
|
|
308
|
-
|
|
313
|
+
count = self._save_batched(filepath, "arrow", batch_size, show_progress)
|
|
309
314
|
else:
|
|
310
|
-
|
|
311
|
-
|
|
315
|
+
count = self._save_jsonl(filepath, show_progress)
|
|
316
|
+
|
|
317
|
+
# 打印错误摘要
|
|
318
|
+
if self._error_count > 0:
|
|
319
|
+
print(f"⚠️ 跳过 {self._error_count} 条错误记录: {self._first_error}")
|
|
320
|
+
|
|
321
|
+
return count
|
|
312
322
|
|
|
313
323
|
def _save_jsonl(self, filepath: str, show_progress: bool) -> int:
|
|
314
324
|
"""JSONL 逐行流式保存(使用 orjson)"""
|
dtflow/tokenizers.py
CHANGED
|
@@ -7,6 +7,8 @@ Token 统计模块
|
|
|
7
7
|
|
|
8
8
|
from typing import Any, Callable, Dict, List, Optional, Union
|
|
9
9
|
|
|
10
|
+
from .utils.field_path import get_field_with_spec
|
|
11
|
+
|
|
10
12
|
# 延迟导入,避免未安装时报错
|
|
11
13
|
_tokenizer_cache = {}
|
|
12
14
|
|
|
@@ -290,7 +292,7 @@ def token_stats(
|
|
|
290
292
|
|
|
291
293
|
Args:
|
|
292
294
|
data: 数据列表
|
|
293
|
-
fields:
|
|
295
|
+
fields: 要统计的字段,支持嵌套路径语法(如 meta.text, messages[-1].content)
|
|
294
296
|
model: 模型名称或别名,如 "qwen2.5", "gpt-4" 等
|
|
295
297
|
backend: 后端选择,None 则自动检测
|
|
296
298
|
|
|
@@ -307,7 +309,7 @@ def token_stats(
|
|
|
307
309
|
for item in data:
|
|
308
310
|
total = 0
|
|
309
311
|
for field in fields:
|
|
310
|
-
value = item
|
|
312
|
+
value = get_field_with_spec(item, field, default="")
|
|
311
313
|
if value:
|
|
312
314
|
total += count_tokens(str(value), model=model, backend=backend)
|
|
313
315
|
counts.append(total)
|
|
@@ -508,7 +510,7 @@ def messages_token_stats(
|
|
|
508
510
|
|
|
509
511
|
Args:
|
|
510
512
|
data: 数据列表
|
|
511
|
-
messages_field: messages
|
|
513
|
+
messages_field: messages 字段名,支持嵌套路径语法(如 conversation.messages)
|
|
512
514
|
model: 模型名称或别名
|
|
513
515
|
backend: 后端,None 则自动检测
|
|
514
516
|
|
|
@@ -538,7 +540,7 @@ def messages_token_stats(
|
|
|
538
540
|
|
|
539
541
|
all_stats = []
|
|
540
542
|
for item in data:
|
|
541
|
-
messages = item
|
|
543
|
+
messages = get_field_with_spec(item, messages_field, default=[])
|
|
542
544
|
if messages:
|
|
543
545
|
all_stats.append(_count_messages_tokens(messages, model=model, backend=_backend))
|
|
544
546
|
|
dtflow/utils/__init__.py
CHANGED
|
@@ -1,5 +1,23 @@
|
|
|
1
1
|
"""工具函数"""
|
|
2
2
|
|
|
3
3
|
from .display import display_data, print_stats
|
|
4
|
+
from .field_path import (
|
|
5
|
+
ExpandMode,
|
|
6
|
+
extract,
|
|
7
|
+
extract_with_spec,
|
|
8
|
+
get_field,
|
|
9
|
+
get_field_with_spec,
|
|
10
|
+
parse_field_spec,
|
|
11
|
+
)
|
|
4
12
|
|
|
5
|
-
__all__ = [
|
|
13
|
+
__all__ = [
|
|
14
|
+
"display_data",
|
|
15
|
+
"print_stats",
|
|
16
|
+
# field_path
|
|
17
|
+
"get_field",
|
|
18
|
+
"get_field_with_spec",
|
|
19
|
+
"parse_field_spec",
|
|
20
|
+
"extract",
|
|
21
|
+
"extract_with_spec",
|
|
22
|
+
"ExpandMode",
|
|
23
|
+
]
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""
|
|
2
|
+
字段路径解析模块
|
|
3
|
+
|
|
4
|
+
支持的语法:
|
|
5
|
+
a.b.c 嵌套字段访问
|
|
6
|
+
a[0].b 数组索引访问
|
|
7
|
+
a[-1].b 负索引访问
|
|
8
|
+
a.# 数组长度
|
|
9
|
+
a[*].b 展开所有元素
|
|
10
|
+
|
|
11
|
+
展开模式 (用于 [*]):
|
|
12
|
+
first 取第一个值(默认)
|
|
13
|
+
join 拼接为字符串(用 | 分隔)
|
|
14
|
+
unique 去重后排序拼接
|
|
15
|
+
|
|
16
|
+
用法:
|
|
17
|
+
from dtflow.utils.field_path import get_field
|
|
18
|
+
|
|
19
|
+
# 基础用法
|
|
20
|
+
get_field(item, "meta.source")
|
|
21
|
+
get_field(item, "messages[0].role")
|
|
22
|
+
get_field(item, "messages[-1].content")
|
|
23
|
+
get_field(item, "messages.#")
|
|
24
|
+
|
|
25
|
+
# 展开模式
|
|
26
|
+
get_field(item, "messages[*].role") # 默认取第一个
|
|
27
|
+
get_field(item, "messages[*].role", mode="join") # 拼接: "system|user|assistant"
|
|
28
|
+
get_field(item, "messages[*].role", mode="unique") # 去重: "assistant|system|user"
|
|
29
|
+
|
|
30
|
+
# 解析路径语法
|
|
31
|
+
path, mode = parse_field_spec("messages[*].role:unique")
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
import re
|
|
35
|
+
from typing import Any, List, Literal, Optional, Tuple, Union
|
|
36
|
+
|
|
37
|
+
# 展开模式类型
|
|
38
|
+
ExpandMode = Literal["first", "join", "unique"]
|
|
39
|
+
|
|
40
|
+
# 路径段解析正则
|
|
41
|
+
# 匹配: field, field[0], field[-1], field[*], field.#
|
|
42
|
+
_SEGMENT_PATTERN = re.compile(
|
|
43
|
+
r"([a-zA-Z_\u4e00-\u9fff][a-zA-Z0-9_\u4e00-\u9fff]*)" # 字段名(支持中文)
|
|
44
|
+
r"(?:\[(-?\d+|\*)\])?" # 可选的索引 [0], [-1], [*]
|
|
45
|
+
r"|(#)" # 或者长度操作符 #
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def parse_field_spec(spec: str) -> "Tuple[str, ExpandMode]":
    """Split a field spec into its path and its expand mode.

    The mode is the text after the last ``:`` and is only recognised when it
    is one of ``first``, ``join`` or ``unique``. In every other case the
    whole spec is treated as the path and the mode is ``"first"``.

    Args:
        spec: Field spec, e.g. ``"messages[*].role:unique"``.

    Returns:
        A ``(path, mode)`` tuple.

    Examples:
        >>> parse_field_spec("meta.source")
        ('meta.source', 'first')
        >>> parse_field_spec("messages[*].role:join")
        ('messages[*].role', 'join')
    """
    path, separator, suffix = spec.rpartition(":")
    if separator and suffix in ("first", "join", "unique"):
        return path, suffix  # type: ignore
    # Either there is no colon, or the text after it is not a mode and the
    # colon may simply be part of the field name.
    return spec, "first"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _parse_path(path: str) -> List[Union[str, int, Literal["*", "#"]]]:
|
|
75
|
+
"""
|
|
76
|
+
解析路径字符串为段列表
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
path: 路径字符串,如 "messages[0].role" 或 "meta.source"
|
|
80
|
+
|
|
81
|
+
Returns:
|
|
82
|
+
段列表,如 ["messages", 0, "role"] 或 ["meta", "source"]
|
|
83
|
+
"""
|
|
84
|
+
segments: List[Union[str, int, Literal["*", "#"]]] = []
|
|
85
|
+
|
|
86
|
+
# 按点分割,但保留方括号内容
|
|
87
|
+
parts = path.replace("][", "].[").split(".")
|
|
88
|
+
|
|
89
|
+
for part in parts:
|
|
90
|
+
if not part:
|
|
91
|
+
continue
|
|
92
|
+
|
|
93
|
+
# 检查是否是长度操作符
|
|
94
|
+
if part == "#":
|
|
95
|
+
segments.append("#")
|
|
96
|
+
continue
|
|
97
|
+
|
|
98
|
+
# 解析 field[index] 格式
|
|
99
|
+
match = re.match(r"([a-zA-Z_\u4e00-\u9fff][a-zA-Z0-9_\u4e00-\u9fff]*)?(?:\[(-?\d+|\*)\])?", part)
|
|
100
|
+
if match:
|
|
101
|
+
field_name, index = match.groups()
|
|
102
|
+
|
|
103
|
+
if field_name:
|
|
104
|
+
segments.append(field_name)
|
|
105
|
+
|
|
106
|
+
if index is not None:
|
|
107
|
+
if index == "*":
|
|
108
|
+
segments.append("*")
|
|
109
|
+
else:
|
|
110
|
+
segments.append(int(index))
|
|
111
|
+
|
|
112
|
+
return segments
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _get_value_by_segments(
|
|
116
|
+
data: Any,
|
|
117
|
+
segments: List[Union[str, int, Literal["*", "#"]]],
|
|
118
|
+
mode: ExpandMode = "first",
|
|
119
|
+
) -> Any:
|
|
120
|
+
"""
|
|
121
|
+
根据段列表从数据中提取值
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
data: 源数据
|
|
125
|
+
segments: 路径段列表
|
|
126
|
+
mode: 展开模式
|
|
127
|
+
|
|
128
|
+
Returns:
|
|
129
|
+
提取的值
|
|
130
|
+
"""
|
|
131
|
+
if not segments:
|
|
132
|
+
return data
|
|
133
|
+
|
|
134
|
+
current = data
|
|
135
|
+
i = 0
|
|
136
|
+
|
|
137
|
+
while i < len(segments):
|
|
138
|
+
seg = segments[i]
|
|
139
|
+
|
|
140
|
+
if current is None:
|
|
141
|
+
return None
|
|
142
|
+
|
|
143
|
+
# 长度操作符
|
|
144
|
+
if seg == "#":
|
|
145
|
+
if isinstance(current, (list, tuple, str)):
|
|
146
|
+
return len(current)
|
|
147
|
+
return None
|
|
148
|
+
|
|
149
|
+
# 展开操作符
|
|
150
|
+
if seg == "*":
|
|
151
|
+
if not isinstance(current, (list, tuple)):
|
|
152
|
+
return None
|
|
153
|
+
|
|
154
|
+
# 获取剩余路径
|
|
155
|
+
remaining = segments[i + 1 :]
|
|
156
|
+
|
|
157
|
+
# 对每个元素递归获取值
|
|
158
|
+
values = []
|
|
159
|
+
for item in current:
|
|
160
|
+
val = _get_value_by_segments(item, remaining, mode="first")
|
|
161
|
+
if val is not None:
|
|
162
|
+
values.append(val)
|
|
163
|
+
|
|
164
|
+
# 根据模式处理结果
|
|
165
|
+
if not values:
|
|
166
|
+
return None
|
|
167
|
+
|
|
168
|
+
if mode == "first":
|
|
169
|
+
return values[0]
|
|
170
|
+
elif mode == "join":
|
|
171
|
+
return "|".join(str(v) for v in values)
|
|
172
|
+
elif mode == "unique":
|
|
173
|
+
unique_vals = sorted(set(str(v) for v in values))
|
|
174
|
+
return "|".join(unique_vals)
|
|
175
|
+
|
|
176
|
+
return values
|
|
177
|
+
|
|
178
|
+
# 字典字段访问
|
|
179
|
+
if isinstance(seg, str):
|
|
180
|
+
if isinstance(current, dict):
|
|
181
|
+
current = current.get(seg)
|
|
182
|
+
else:
|
|
183
|
+
return None
|
|
184
|
+
|
|
185
|
+
# 数组索引访问
|
|
186
|
+
elif isinstance(seg, int):
|
|
187
|
+
if isinstance(current, (list, tuple)):
|
|
188
|
+
try:
|
|
189
|
+
current = current[seg]
|
|
190
|
+
except IndexError:
|
|
191
|
+
return None
|
|
192
|
+
else:
|
|
193
|
+
return None
|
|
194
|
+
|
|
195
|
+
i += 1
|
|
196
|
+
|
|
197
|
+
return current
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def get_field(
    data: dict,
    path: str,
    mode: ExpandMode = "first",
    default: Any = None,
) -> Any:
    """Get a (possibly nested) field value from a dict.

    A top-level key whose name equals ``path`` exactly takes precedence, so
    records whose keys literally contain ``.`` or ``[`` (for example
    flattened columns such as ``"meta.source"``) stay addressable. Otherwise
    the path is parsed and followed through the nested structure.

    Args:
        data: Source dict.
        path: Field path.
        mode: Expand mode; only relevant when the path contains ``[*]``.
        default: Value returned when the path is empty or resolves to
            ``None``.

    Returns:
        The field value, or ``default``.

    Examples:
        >>> data = {"meta": {"source": "wiki"}, "messages": [{"role": "user"}, {"role": "assistant"}]}

        # Nested field
        >>> get_field(data, "meta.source")
        'wiki'

        # List index
        >>> get_field(data, "messages[0].role")
        'user'
        >>> get_field(data, "messages[-1].role")
        'assistant'

        # List length
        >>> get_field(data, "messages.#")
        2

        # Expand all elements
        >>> get_field(data, "messages[*].role")
        'user'
        >>> get_field(data, "messages[*].role", mode="join")
        'user|assistant'
        >>> get_field(data, "messages[*].role", mode="unique")
        'assistant|user'
    """
    if not path:
        return default

    # Exact top-level match first. For a plain key this is the same lookup
    # the parsed path would perform; for keys containing path syntax it is
    # the only way to reach them.
    if isinstance(data, dict):
        exact = data.get(path)
        if exact is not None:
            return exact

    result = _get_value_by_segments(data, _parse_path(path), mode)
    return default if result is None else result
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def get_field_with_spec(data: dict, spec: str, default: Any = None) -> Any:
    """Resolve a full field spec (path plus optional ``:mode``) against ``data``.

    The spec is split by ``parse_field_spec`` and the resulting path and
    mode are handed to ``get_field``.

    Args:
        data: Source dict.
        spec: Field spec, e.g. ``"messages[*].role:unique"``.
        default: Value passed through to ``get_field`` as its default.

    Returns:
        Whatever ``get_field`` returns for the parsed path and mode.

    Examples:
        >>> get_field_with_spec(data, "messages[*].role:join")
        'user|assistant'
    """
    field_path, expand_mode = parse_field_spec(spec)
    value = get_field(data, field_path, mode=expand_mode, default=default)
    return value
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# 便捷别名
|
|
273
|
+
extract = get_field
|
|
274
|
+
extract_with_spec = get_field_with_spec
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dtflow
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
|
|
5
5
|
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
|
|
6
6
|
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
|
|
@@ -301,6 +301,8 @@ dt.shuffle(seed=42)
|
|
|
301
301
|
dt sample data.jsonl --num=10
|
|
302
302
|
dt sample data.csv --num=100 --sample_type=head
|
|
303
303
|
dt sample data.jsonl 1000 --by=category # 分层采样
|
|
304
|
+
dt sample data.jsonl 1000 --by=meta.source # 按嵌套字段分层采样
|
|
305
|
+
dt sample data.jsonl 1000 --by=messages.# # 按消息数量分层采样
|
|
304
306
|
|
|
305
307
|
# 数据转换 - 预设模式
|
|
306
308
|
dt transform data.jsonl --preset=openai_chat
|
|
@@ -317,25 +319,30 @@ dt run pipeline.yaml --input=new_data.jsonl --output=result.jsonl
|
|
|
317
319
|
|
|
318
320
|
# Token 统计
|
|
319
321
|
dt token-stats data.jsonl --field=messages --model=gpt-4
|
|
322
|
+
dt token-stats data.jsonl --field=messages[-1].content # 统计最后一条消息
|
|
320
323
|
dt token-stats data.jsonl --field=text --detailed
|
|
321
324
|
|
|
322
325
|
# 数据对比
|
|
323
326
|
dt diff v1/train.jsonl v2/train.jsonl
|
|
324
327
|
dt diff a.jsonl b.jsonl --key=id
|
|
328
|
+
dt diff a.jsonl b.jsonl --key=meta.uuid # 按嵌套字段匹配
|
|
325
329
|
|
|
326
330
|
# 数据清洗
|
|
327
331
|
dt clean data.jsonl --drop-empty # 删除任意空值记录
|
|
328
332
|
dt clean data.jsonl --drop-empty=text,answer # 删除指定字段为空的记录
|
|
333
|
+
dt clean data.jsonl --drop-empty=meta.source # 删除嵌套字段为空的记录
|
|
329
334
|
dt clean data.jsonl --min-len=text:10 # text 字段最少 10 字符
|
|
330
|
-
dt clean data.jsonl --
|
|
335
|
+
dt clean data.jsonl --min-len=messages.#:2 # 至少 2 条消息
|
|
336
|
+
dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最多 500 字符
|
|
331
337
|
dt clean data.jsonl --keep=question,answer # 只保留这些字段
|
|
332
338
|
dt clean data.jsonl --drop=metadata # 删除指定字段
|
|
333
339
|
dt clean data.jsonl --strip # 去除字符串首尾空白
|
|
334
|
-
dt clean data.jsonl --strip --drop-empty=text --min-len=text:10 -o clean.jsonl # 组合使用
|
|
335
340
|
|
|
336
341
|
# 数据去重
|
|
337
342
|
dt dedupe data.jsonl # 全量精确去重
|
|
338
343
|
dt dedupe data.jsonl --key=text # 按字段精确去重
|
|
344
|
+
dt dedupe data.jsonl --key=meta.id # 按嵌套字段去重
|
|
345
|
+
dt dedupe data.jsonl --key=messages[0].content # 按第一条消息内容去重
|
|
339
346
|
dt dedupe data.jsonl --key=text --similar=0.8 # 相似度去重
|
|
340
347
|
|
|
341
348
|
# 文件拼接
|
|
@@ -345,6 +352,44 @@ dt concat a.jsonl b.jsonl -o merged.jsonl
|
|
|
345
352
|
dt stats data.jsonl
|
|
346
353
|
```
|
|
347
354
|
|
|
355
|
+
### 字段路径语法
|
|
356
|
+
|
|
357
|
+
CLI 命令中的字段参数支持嵌套路径语法,可访问深层嵌套的数据:
|
|
358
|
+
|
|
359
|
+
| 语法 | 含义 | 示例 |
|
|
360
|
+
|------|------|------|
|
|
361
|
+
| `a.b.c` | 嵌套字段 | `meta.source` |
|
|
362
|
+
| `a[0].b` | 数组索引 | `messages[0].role` |
|
|
363
|
+
| `a[-1].b` | 负索引 | `messages[-1].content` |
|
|
364
|
+
| `a.#` | 数组长度 | `messages.#` |
|
|
365
|
+
| `a[*].b` | 展开所有元素 | `messages[*].role` |
|
|
366
|
+
| `a[*].b:join` | 展开并用 `\|` 拼接 | `messages[*].role:join` |
|
|
367
|
+
| `a[*].b:unique` | 展开去重后拼接 | `messages[*].role:unique` |
|
|
368
|
+
|
|
369
|
+
支持字段路径的命令参数:
|
|
370
|
+
|
|
371
|
+
| 命令 | 参数 | 示例 |
|
|
372
|
+
|------|------|------|
|
|
373
|
+
| `sample` | `--by=` | `--by=meta.source`、`--by=messages.#` |
|
|
374
|
+
| `dedupe` | `--key=` | `--key=meta.id`、`--key=messages[0].content` |
|
|
375
|
+
| `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
|
|
376
|
+
| `clean` | `--min-len=` | `--min-len=messages.#:2` |
|
|
377
|
+
| `clean` | `--max-len=` | `--max-len=messages[-1].content:500` |
|
|
378
|
+
| `token-stats` | `--field=` | `--field=messages[-1].content` |
|
|
379
|
+
| `diff` | `--key=` | `--key=meta.uuid` |
|
|
380
|
+
|
|
381
|
+
示例数据:
|
|
382
|
+
```json
|
|
383
|
+
{"meta": {"source": "wiki"}, "messages": [{"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]}
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
- `meta.source` → `"wiki"`
|
|
387
|
+
- `messages[0].role` → `"user"`
|
|
388
|
+
- `messages[-1].content` → `"hello"`
|
|
389
|
+
- `messages.#` → `2`
|
|
390
|
+
- `messages[*].role` → `"user"` (默认取第一个)
|
|
391
|
+
- `messages[*].role:join` → `"user|assistant"`
|
|
392
|
+
|
|
348
393
|
### Pipeline 配置
|
|
349
394
|
|
|
350
395
|
使用 YAML 配置文件定义可复现的数据处理流程:
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
dtflow/__init__.py,sha256=
|
|
1
|
+
dtflow/__init__.py,sha256=Gd9Us_BDXaxmMIGlz51E6OZDohqzweOrvB-2j8k3KVs,2347
|
|
2
2
|
dtflow/__main__.py,sha256=7lKluJTruDPN4CKSK2mWLUxSUlVLtkrqXyRMjlGk7SY,10595
|
|
3
3
|
dtflow/converters.py,sha256=gyy-K15zjzGBawFnZa8D9JX37JZ47rey2GhjKa2pxFo,22081
|
|
4
|
-
dtflow/core.py,sha256=
|
|
4
|
+
dtflow/core.py,sha256=szm9qmRVe1Q97O18UTGz7xTsdV-V8L4D6Bl1bxBJCWk,28778
|
|
5
5
|
dtflow/lineage.py,sha256=vQ06lxBHftu-Ma5HlISp3F2eiIvwagQSnUGaLeABDZY,12190
|
|
6
6
|
dtflow/pipeline.py,sha256=zZaC4fg5vsp_30Fhbg75vu0yggsdvf28bWBiVDWzZ6Y,13901
|
|
7
7
|
dtflow/presets.py,sha256=OP1nnM5NFk5Kli9FsXK0xAot48E5OQ6-VOIJT9ffXPg,5023
|
|
8
|
-
dtflow/streaming.py,sha256=
|
|
9
|
-
dtflow/tokenizers.py,sha256=
|
|
8
|
+
dtflow/streaming.py,sha256=lYf9gi5U-3oqr7oEe5mENx1r-LtRb2YfGNq1fP3_sw4,21972
|
|
9
|
+
dtflow/tokenizers.py,sha256=zxE6XZGjZ_DOGCjRSClI9xaAbFVf8FS6jwwssGoi_9U,18111
|
|
10
10
|
dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
|
|
11
|
-
dtflow/cli/commands.py,sha256=
|
|
11
|
+
dtflow/cli/commands.py,sha256=8t_HgFuFqGt1HXPpEDV47qB2fwMD5C6d9Bjj-VNb37I,84958
|
|
12
12
|
dtflow/mcp/__init__.py,sha256=huEJ3rXDbxDRjsLPEvjNT2u3tWs6Poiv6fokPIrByjw,897
|
|
13
13
|
dtflow/mcp/__main__.py,sha256=PoT2ZZmJq9xDZxDACJfqDW9Ld_ukHrGNK-0XUd7WGnY,448
|
|
14
14
|
dtflow/mcp/cli.py,sha256=ck0oOS_642cNktxULaMRE7BJfMxsBCwotmCj3PSPwVk,13110
|
|
@@ -16,9 +16,10 @@ dtflow/mcp/docs.py,sha256=DI2Vf-eFo4chRP_bDLsv4Uc3kJt8_1emz8N-NBSVirM,8834
|
|
|
16
16
|
dtflow/mcp/server.py,sha256=Nf0UlqDGhV55ndGuEglfr7VRjDWAC_9rRsNhdr0-ssM,4275
|
|
17
17
|
dtflow/storage/__init__.py,sha256=C0jpWNQU808Ezz7lWneddABal3wILy8ijFUNiSKbHV4,362
|
|
18
18
|
dtflow/storage/io.py,sha256=XNWLL10a7jgOjM1IfTN9kIuW23dwzFE1nnaw4E3LaiU,21885
|
|
19
|
-
dtflow/utils/__init__.py,sha256=
|
|
19
|
+
dtflow/utils/__init__.py,sha256=f8v9HJZMWRI5AL64Vjr76Pf2Na_whOF9nJBKgPbXXYg,429
|
|
20
20
|
dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
|
|
21
|
-
dtflow
|
|
22
|
-
dtflow-0.
|
|
23
|
-
dtflow-0.
|
|
24
|
-
dtflow-0.
|
|
21
|
+
dtflow/utils/field_path.py,sha256=WcNA-LZh3H61a77FEzB_R7YAyyZl3M8ofdq05ytQGmI,7459
|
|
22
|
+
dtflow-0.4.1.dist-info/METADATA,sha256=-rdgDNFMy3pPO5mpMcKlB_quxSlD9mUIoe_tIUXoPP4,18306
|
|
23
|
+
dtflow-0.4.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
24
|
+
dtflow-0.4.1.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
25
|
+
dtflow-0.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|