dtflow 0.5.8__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/SKILL.md +22 -6
- dtflow/__init__.py +1 -1
- dtflow/__main__.py +106 -6
- dtflow/cli/clean.py +90 -1
- dtflow/cli/commands.py +17 -1
- dtflow/cli/eval.py +288 -0
- dtflow/cli/export.py +81 -0
- dtflow/cli/sample.py +90 -3
- dtflow/cli/split.py +138 -0
- dtflow/eval.py +276 -0
- dtflow/utils/text_parser.py +124 -0
- {dtflow-0.5.8.dist-info → dtflow-0.5.10.dist-info}/METADATA +29 -1
- {dtflow-0.5.8.dist-info → dtflow-0.5.10.dist-info}/RECORD +15 -10
- {dtflow-0.5.8.dist-info → dtflow-0.5.10.dist-info}/WHEEL +0 -0
- {dtflow-0.5.8.dist-info → dtflow-0.5.10.dist-info}/entry_points.txt +0 -0
dtflow/eval.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
"""
|
|
2
|
+
评估指标计算模块
|
|
3
|
+
|
|
4
|
+
提供分类任务的指标计算和评估报告导出:
|
|
5
|
+
- MetricsCalculator: 计算 accuracy/precision/recall/F1/混淆矩阵
|
|
6
|
+
- export_eval_report: 生成 metrics.md + result.jsonl + bad_case.jsonl
|
|
7
|
+
|
|
8
|
+
依赖: scikit-learn, pandas
|
|
9
|
+
安装: pip install dtflow[eval]
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import TYPE_CHECKING, Optional
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from pandas import DataFrame
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _check_eval_deps():
|
|
22
|
+
"""检查 eval 依赖是否已安装"""
|
|
23
|
+
try:
|
|
24
|
+
import pandas # noqa: F401
|
|
25
|
+
import sklearn # noqa: F401
|
|
26
|
+
except ImportError as e:
|
|
27
|
+
missing = str(e).split("'")[1] if "'" in str(e) else str(e)
|
|
28
|
+
raise ImportError(
|
|
29
|
+
f"eval 功能需要额外依赖: {missing}\n" f"请运行: pip install dtflow[eval]"
|
|
30
|
+
) from e
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class MetricsCalculator:
    """Classification metrics calculator.

    Computes accuracy / precision / recall / F1 / confusion matrix /
    classification report via scikit-learn.

    Args:
        df: DataFrame containing the prediction and label columns.
        pred_col: Name of the prediction column.
        label_col: Name of the ground-truth label column.
        include_macro_micro_avg: Include macro/micro averages in the report.
        remove_matrix_zero_row: Drop confusion-matrix rows whose support is 0.
    """

    def __init__(
        self,
        df: "DataFrame",
        pred_col: str = "predict",
        label_col: str = "label",
        include_macro_micro_avg: bool = False,
        remove_matrix_zero_row: bool = False,
    ):
        _check_eval_deps()
        self.df = df
        self.y_pred = df[pred_col]
        self.y_true = df[label_col]
        # Union of labels seen in either column, in deterministic sorted order.
        self.all_labels = sorted(set(self.y_true.unique()).union(set(self.y_pred.unique())))
        self.needed_labels = None  # filled in by _calculate_metrics()
        self.remove_matrix_zero_row = remove_matrix_zero_row
        self.include_macro_micro_avg = include_macro_micro_avg
        self.metrics = self._calculate_metrics()

    def _calculate_metrics(self):
        """Compute all metrics once; returns the dict served by get_metrics()."""
        from sklearn.metrics import (
            accuracy_score,
            classification_report,
            confusion_matrix,
            precision_score,
            recall_score,
        )

        accuracy = accuracy_score(self.y_true, self.y_pred)
        precision = precision_score(
            self.y_true, self.y_pred, labels=self.all_labels, average="weighted", zero_division=0
        )
        recall = recall_score(
            self.y_true, self.y_pred, labels=self.all_labels, average="weighted", zero_division=0
        )
        conf_matrix = confusion_matrix(self.y_true, self.y_pred, labels=self.all_labels)
        report = classification_report(
            self.y_true, self.y_pred, labels=self.all_labels, output_dict=True, zero_division=0
        )

        # classification_report keys are always strings, while all_labels may
        # hold non-string labels (e.g. ints).  Map str(label) -> label so the
        # filters below work for both; behavior is unchanged for str labels.
        label_by_str = {str(lb): lb for lb in self.all_labels}

        # By default keep only the per-class rows plus the weighted average.
        if not self.include_macro_micro_avg:
            report = {
                name: row
                for name, row in report.items()
                if name in label_by_str or name == "weighted avg"
            }

        # Drop classes with support == 0.  The "accuracy" entry is a float,
        # not a dict, so the isinstance check also removes it.
        report = {
            name: row
            for name, row in report.items()
            if isinstance(row, dict) and row.get("support", 0) > 0
        }

        self.needed_labels = [name for name in report if name in label_by_str]

        # Optionally drop confusion-matrix rows for classes with no support.
        if self.remove_matrix_zero_row:
            needed_idx = [
                self.all_labels.index(label_by_str[name]) for name in self.needed_labels
            ]
            conf_matrix = conf_matrix[needed_idx]

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "confusion_matrix": conf_matrix,
            "classification_report": report,
        }

    def get_metrics(self):
        """Return the metrics dict computed at construction time."""
        return self.metrics

    def format_classification_report_as_markdown(self):
        """Render the classification report as a Markdown table."""
        report = self.metrics["classification_report"]
        header = "| Label | Precision | Recall | F1-score | Support |\n"
        separator = "|-------|-----------|--------|----------|---------|\n"
        rows = []
        for label, metrics in report.items():
            if isinstance(metrics, dict):
                rows.append(
                    f"| {label} | {metrics['precision']:.2f} | {metrics['recall']:.2f} "
                    f"| {metrics['f1-score']:.2f} | {metrics['support']:.0f} |"
                )
        return header + separator + "\n".join(rows)

    def _clean_label_for_markdown(self, label, max_length=20):
        """Sanitize a label so it renders safely inside a Markdown table cell."""
        label = str(label).replace("\n", " ")
        label = label.replace("|", "\\|")
        label = label.replace("-", "\\-")
        # Escape angle brackets so labels such as "<think>" are not eaten as
        # HTML tags by Markdown renderers.  (The previous replace("<", "<")
        # and replace(">", ">") calls were no-ops.)
        label = label.replace("<", "&lt;")
        label = label.replace(">", "&gt;")
        if len(label) > max_length:
            label = label[:max_length] + "..."
        label = label.strip()
        if not label:
            label = "(empty)"
        return label

    def format_confusion_matrix_as_markdown(self, max_label_length=20):
        """Render the confusion matrix (rows = true labels) as a Markdown table."""
        matrix = self.metrics["confusion_matrix"]

        # Row labels must match whatever _calculate_metrics left in the matrix.
        if self.remove_matrix_zero_row:
            labels = self.needed_labels
        else:
            labels = self.all_labels

        processed_labels = [self._clean_label_for_markdown(lb, max_label_length) for lb in labels]

        header = "| 真实值/预测值 | " + " | ".join(processed_labels) + " |\n"
        separator_parts = [":---:"] * (len(processed_labels) + 1)
        separator = "| " + " | ".join(separator_parts) + " |\n"

        rows = []
        for i, row in enumerate(matrix):
            row_label = self._clean_label_for_markdown(labels[i], max_label_length)
            formatted_row = [f"{num:,}" for num in row]
            rows.append(f"| {row_label} | " + " | ".join(formatted_row) + " |")

        return header + separator + "\n".join(rows)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def export_eval_report(
    df: "DataFrame",
    pred_col: str,
    label_col: str,
    record_folder: str = "record",
    input_name: Optional[str] = None,
):
    """Generate an evaluation report and save it under ``record_folder``.

    Output files:
        - metrics.md: overview + classification report + confusion matrix
        - result.jsonl: full prediction results
        - bad_case.jsonl: mispredicted samples
        - result.csv / bad_case.csv: best-effort CSV copies

    Args:
        df: DataFrame containing predictions and labels.
        pred_col: Name of the prediction column.
        label_col: Name of the label column.
        record_folder: Root output directory.
        input_name: Input file name, used to name a sub-directory.

    Returns:
        Path of the directory the report was written to.
    """
    # All rich imports together (the Table import used to sit mid-function).
    from rich.console import Console
    from rich.markdown import Markdown
    from rich.table import Table

    calculator = MetricsCalculator(df, pred_col=pred_col, label_col=label_col)
    metrics = calculator.get_metrics()

    # Terminal overview table (Rich replaces tabulate here).
    overview_table = Table(title="指标概览", show_header=True)
    overview_table.add_column("Accuracy", justify="center")
    overview_table.add_column("Precision", justify="center")
    overview_table.add_column("Recall", justify="center")
    overview_table.add_row(
        f"{metrics['accuracy']:.4f}",
        f"{metrics['precision']:.4f}",
        f"{metrics['recall']:.4f}",
    )

    # Markdown report body.
    md = (
        f"\n\n### 指标概览\n\n"
        f"| Accuracy | Precision | Recall |\n"
        f"|----------|-----------|--------|\n"
        f"| {metrics['accuracy']:.4f} | {metrics['precision']:.4f} | {metrics['recall']:.4f} |"
    )
    metrics_md = calculator.format_classification_report_as_markdown()
    confusion_md = calculator.format_confusion_matrix_as_markdown()
    md += f"\n\n### Classification Report\n{metrics_md}\n\n### Confusion Matrix\n{confusion_md}"

    # Output directory: <record_folder>[/<input_name>]/<idx>-<timestamp>,
    # where idx is one greater than the largest existing "N-..." sub-dir.
    now = datetime.now().strftime("%Y%m%d-%H-%M-%S")
    record_path = Path(record_folder)
    if input_name:
        record_path = record_path / input_name

    if record_path.exists():
        idx = (
            max(
                (
                    int(d.name.split("-", 1)[0])
                    for d in record_path.iterdir()
                    if d.is_dir() and d.name.split("-", 1)[0].isdigit()
                ),
                default=0,
            )
            + 1
        )
    else:
        idx = 1

    record_path = record_path / f"{idx}-{now}"
    record_path.mkdir(parents=True, exist_ok=True)

    # Terminal output.
    console = Console()
    console.print(overview_table)
    console.print(Markdown(md))

    # Persist the Markdown report.
    (record_path / "metrics.md").write_text(md, encoding="utf-8")

    bad_case_df = df[df[pred_col] != df[label_col]]

    # JSONL exports: full results plus mispredictions only.
    df.to_json(record_path / "result.jsonl", orient="records", lines=True, force_ascii=False)
    bad_case_df.to_json(
        record_path / "bad_case.jsonl", orient="records", lines=True, force_ascii=False
    )

    # CSV is best-effort: nested or unserializable cells may fail.
    try:
        df.to_csv(record_path / "result.csv", index=False)
        bad_case_df.to_csv(record_path / "bad_case.csv", index=False)
    except Exception:
        pass

    console.print(f"\n[green]报告已保存到: {record_path}[/green]")
    console.print(f"[dim] - metrics.md ({len(df)} 条数据, {len(bad_case_df)} 条错误)[/dim]")

    return record_path
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
文本清洗工具
|
|
3
|
+
|
|
4
|
+
提供 LLM 输出的常见清洗函数:
|
|
5
|
+
- strip_think_tags: 去除 <think>...</think> 思考链内容
|
|
6
|
+
- extract_code_snippets: 提取 ``` 代码块
|
|
7
|
+
- parse_generic_tags: 解析 <tag>content</tag> 格式标签
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from typing import Dict, List
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def strip_think_tags(text: str) -> str:
    """Remove any ``<think>...</think>`` chain-of-thought spans from *text*.

    Empty or falsy input is returned unchanged; otherwise the cleaned
    text is also stripped of surrounding whitespace.

    Args:
        text: Input text.

    Returns:
        Text with all think spans removed.

    Examples:
        >>> strip_think_tags("<think>hmm...</think>42")
        '42'
    """
    if not text:
        return text
    think_span = re.compile(r"<think>.*?</think>", re.DOTALL)
    return think_span.sub("", text).strip()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_code_snippets(text: str, strict: bool = True) -> List[Dict[str, str]]:
    """Extract fenced ``` code blocks from *text*.

    Args:
        text: Input text.
        strict: When True, only ```lang ... ``` fences are matched.  When
            False, ``{ ... }`` spans in the remaining text are also collected
            (non-greedy, so nested braces are matched at the innermost close).

    Returns:
        List of snippets, each ``{"language": ..., "code": ...}``; the
        language falls back to ``"unknown"`` when no fence tag is given.

    Examples:
        >>> extract_code_snippets("```json\\n{\"a\": 1}\\n```")
        [{'language': 'json', 'code': '{"a": 1}'}]
    """
    fence_re = r"```(\w+)?\s*([\s\S]*?)```"
    snippets = [
        {
            "language": tag.strip() if tag else "unknown",
            "code": body.strip(),
        }
        for tag, body in re.findall(fence_re, text)
    ]

    if not strict:
        # Strip already-captured fences, then scan the leftover text
        # for bare brace-delimited spans.
        leftover = re.sub(fence_re, "", text)
        snippets.extend(
            {"language": "unknown", "code": blob.strip()}
            for blob in re.findall(r"\{[\s\S]*?\}", leftover)
        )

    return snippets
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def parse_generic_tags(text: str, strict: bool = False) -> Dict[str, str]:
    """Parse XML-style tags out of *text*.

    Two modes are supported:
        - strict=True: only properly closed ``<label>content</label>`` pairs.
        - strict=False: closed pairs are consumed first, then open-ended
          ``<label>content`` tags are matched in the remaining text; a
          closed tag wins over an open one with the same name.

    Args:
        text: Input text.
        strict: Whether to require closing tags.

    Returns:
        Mapping of tag name to (stripped) content.

    Examples:
        >>> parse_generic_tags("<tag>content</tag>")
        {'tag': 'content'}
        >>> parse_generic_tags("<a>hello<b>world", strict=False)
        {'a': 'hello', 'b': 'world'}
    """
    if not text:
        return {}

    closed_re = r"<([^>]+)>\s*(.*?)\s*</\1>"
    parsed: Dict[str, str] = {}

    if strict:
        for tag, body in re.findall(closed_re, text, re.DOTALL):
            parsed[tag.strip()] = body.strip()
        return parsed

    # 1. Consume closed pairs first, recording them as we erase them.
    def _record(match):
        parsed[match.group(1).strip()] = match.group(2).strip()
        return ""

    leftover = re.sub(closed_re, _record, str(text), flags=re.DOTALL)

    # 2. Match open-ended tags in what remains; content runs until the
    #    next tag or end of string.  Closed tags keep precedence.
    open_re = r"<([^>]+)>\s*(.*?)(?=<[^>]+>|$)"
    for tag, body in re.findall(open_re, leftover, re.DOTALL):
        parsed.setdefault(tag.strip(), body.strip())

    return parsed
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dtflow
|
|
3
|
-
Version: 0.5.8
|
|
3
|
+
Version: 0.5.10
|
|
4
4
|
Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
|
|
5
5
|
Project-URL: Homepage, https://github.com/yourusername/DataTransformer
|
|
6
6
|
Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
|
|
@@ -44,6 +44,7 @@ Requires-Dist: flake8>=3.9.0; extra == 'dev'
|
|
|
44
44
|
Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
|
|
45
45
|
Requires-Dist: isort>=5.9.0; extra == 'dev'
|
|
46
46
|
Requires-Dist: mypy>=0.910; extra == 'dev'
|
|
47
|
+
Requires-Dist: pandas>=1.3.0; extra == 'dev'
|
|
47
48
|
Requires-Dist: pyarrow; extra == 'dev'
|
|
48
49
|
Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
|
|
49
50
|
Requires-Dist: pytest>=6.0.0; extra == 'dev'
|
|
@@ -57,10 +58,14 @@ Provides-Extra: docs
|
|
|
57
58
|
Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
|
|
58
59
|
Requires-Dist: sphinx-rtd-theme>=0.5.0; extra == 'docs'
|
|
59
60
|
Requires-Dist: sphinx>=4.0.0; extra == 'docs'
|
|
61
|
+
Provides-Extra: eval
|
|
62
|
+
Requires-Dist: pandas>=1.3.0; extra == 'eval'
|
|
63
|
+
Requires-Dist: scikit-learn>=0.24.0; extra == 'eval'
|
|
60
64
|
Provides-Extra: full
|
|
61
65
|
Requires-Dist: datasets>=2.0.0; extra == 'full'
|
|
62
66
|
Requires-Dist: datasketch>=1.5.0; extra == 'full'
|
|
63
67
|
Requires-Dist: huggingface-hub>=0.20.0; extra == 'full'
|
|
68
|
+
Requires-Dist: pandas>=1.3.0; extra == 'full'
|
|
64
69
|
Requires-Dist: pyarrow; extra == 'full'
|
|
65
70
|
Requires-Dist: rich>=10.0.0; extra == 'full'
|
|
66
71
|
Requires-Dist: scikit-learn>=0.24.0; extra == 'full'
|
|
@@ -435,6 +440,13 @@ dt sample data.jsonl 1000 --by=messages.# # 按消息数量分层采样
|
|
|
435
440
|
dt sample data.jsonl --where="category=tech" # 筛选后采样
|
|
436
441
|
dt sample data.jsonl --where="messages.#>=2" # 多条件筛选
|
|
437
442
|
|
|
443
|
+
# 按行范围查看(Python 切片语法)
|
|
444
|
+
dt slice data.jsonl 10:20 # 第 10-19 行(0-based,左闭右开)
|
|
445
|
+
dt slice data.jsonl :100 # 前 100 行
|
|
446
|
+
dt slice data.jsonl 100: # 第 100 行到末尾
|
|
447
|
+
dt slice data.jsonl 10:20 -o sliced.jsonl # 保存到文件
|
|
448
|
+
dt slice data.jsonl 10:20 -f question,answer # 只显示指定字段
|
|
449
|
+
|
|
438
450
|
# 数据转换 - 预设模式
|
|
439
451
|
dt transform data.jsonl --preset=openai_chat
|
|
440
452
|
dt transform data.jsonl --preset=alpaca
|
|
@@ -469,6 +481,9 @@ dt clean data.jsonl --max-len=messages[-1].content:500 # 最后一条消息最
|
|
|
469
481
|
dt clean data.jsonl --keep=question,answer # 只保留这些字段
|
|
470
482
|
dt clean data.jsonl --drop=metadata # 删除指定字段
|
|
471
483
|
dt clean data.jsonl --strip # 去除字符串首尾空白
|
|
484
|
+
dt clean data.jsonl --min-tokens=content:10 # 最少 10 tokens
|
|
485
|
+
dt clean data.jsonl --max-tokens=content:1000 # 最多 1000 tokens
|
|
486
|
+
dt clean data.jsonl --min-tokens=text:50 -m gpt-4 # 指定分词器
|
|
472
487
|
|
|
473
488
|
# 数据去重
|
|
474
489
|
dt dedupe data.jsonl # 全量精确去重
|
|
@@ -477,6 +492,17 @@ dt dedupe data.jsonl --key=meta.id # 按嵌套字段去重
|
|
|
477
492
|
dt dedupe data.jsonl --key=messages[0].content # 按第一条消息内容去重
|
|
478
493
|
dt dedupe data.jsonl --key=text --similar=0.8 # 相似度去重
|
|
479
494
|
|
|
495
|
+
# 数据集切分
|
|
496
|
+
dt split data.jsonl --ratio=0.8 --seed=42 # 二分: train/test
|
|
497
|
+
dt split data.jsonl --ratio=0.7,0.15,0.15 # 三分: train/val/test
|
|
498
|
+
dt split data.jsonl --ratio=0.8 -o /tmp/output # 指定输出目录
|
|
499
|
+
|
|
500
|
+
# 训练框架导出
|
|
501
|
+
dt export data.jsonl --framework=llama-factory # 导出到 LLaMA-Factory
|
|
502
|
+
dt export data.jsonl -f swift -o ./swift_out # 导出到 ms-swift
|
|
503
|
+
dt export data.jsonl -f axolotl # 导出到 Axolotl
|
|
504
|
+
dt export data.jsonl -f llama-factory --check # 仅检查兼容性
|
|
505
|
+
|
|
480
506
|
# 文件拼接
|
|
481
507
|
dt concat a.jsonl b.jsonl -o merged.jsonl
|
|
482
508
|
|
|
@@ -522,6 +548,8 @@ CLI 命令中的字段参数支持嵌套路径语法,可访问深层嵌套的
|
|
|
522
548
|
| `clean` | `--drop-empty=` | `--drop-empty=meta.source` |
|
|
523
549
|
| `clean` | `--min-len=` | `--min-len=messages.#:2` |
|
|
524
550
|
| `clean` | `--max-len=` | `--max-len=messages[-1].content:500` |
|
|
551
|
+
| `clean` | `--min-tokens=` | `--min-tokens=content:10` |
|
|
552
|
+
| `clean` | `--max-tokens=` | `--max-tokens=content:1000` |
|
|
525
553
|
| `token-stats` | `--field=` | `--field=messages[-1].content` |
|
|
526
554
|
| `diff` | `--key=` | `--key=meta.uuid` |
|
|
527
555
|
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
dtflow/SKILL.md,sha256=
|
|
2
|
-
dtflow/__init__.py,sha256=
|
|
3
|
-
dtflow/__main__.py,sha256=
|
|
1
|
+
dtflow/SKILL.md,sha256=Oq8Kb5JghZMJ1WoP8OWhX3qAWaUY9Sip_iWAv8S2eMg,10567
|
|
2
|
+
dtflow/__init__.py,sha256=2A-P6k9VBIWZXRgXwYPFOwHMCmgkfKZVYuGuBziqqhc,3032
|
|
3
|
+
dtflow/__main__.py,sha256=_wrpYfOog6G83I17yuBe-hryBsaCrIwbXSEnzT-r28g,18008
|
|
4
4
|
dtflow/converters.py,sha256=X3qeFD7FCOMnfiP3MicL5MXimOm4XUYBs5pczIkudU0,22331
|
|
5
5
|
dtflow/core.py,sha256=qMo6B3LK--TWRK7ZBKObGcs3pKFnd0NPoaM0T8JC7Jw,38135
|
|
6
|
+
dtflow/eval.py,sha256=_c-XP2zsOBznYltSyKEScOqvmPVX2orqepg5cNhXXB0,9836
|
|
6
7
|
dtflow/framework.py,sha256=jyICi_RWHjX7WfsXdSbWmP1SL7y1OWSPyd5G5Y-lvg4,17578
|
|
7
8
|
dtflow/lineage.py,sha256=jie3OL1qK90-_cOOqqLbhSJ1oGUktDM1x5HRpQ5Qiyc,12800
|
|
8
9
|
dtflow/parallel.py,sha256=EnIdGEGMrZUNT2-CBIV93UFfpqr_jU_heqqvdGXcP-Y,3046
|
|
@@ -12,14 +13,17 @@ dtflow/schema.py,sha256=zCZNEAqTMT1BS_p2t0CYczR5S9rqyDREa7ZsYI5pFGA,19885
|
|
|
12
13
|
dtflow/streaming.py,sha256=dxpNd1-Wz_PTLTdvM5qn06_2TJr5NRlIIuw0LOSS2Iw,24755
|
|
13
14
|
dtflow/tokenizers.py,sha256=GFQsuLSLn2GHn2kaXhJkP8G85lgsdLzYtJNbppQhYPE,23408
|
|
14
15
|
dtflow/cli/__init__.py,sha256=QhZ-thgx9IBTFII7T_hdoWFUl0CCsdGQHN5ZEZw2XB0,423
|
|
15
|
-
dtflow/cli/clean.py,sha256=
|
|
16
|
-
dtflow/cli/commands.py,sha256=
|
|
16
|
+
dtflow/cli/clean.py,sha256=BEQQlH2q6luCbx51M3oxxOwcnwlOA8vo9WX3Fp7I6AY,29498
|
|
17
|
+
dtflow/cli/commands.py,sha256=LvyDQ_nWUM7UlPDEFQadRdw5O2ZKDLgF41_xAJRhYxI,1583
|
|
17
18
|
dtflow/cli/common.py,sha256=gCwnF5Sw2ploqfZJO_z3Ms9mR1HNT7Lj6ydHn0uVaIw,13817
|
|
19
|
+
dtflow/cli/eval.py,sha256=c53kCRH86k2Q_6vESKFlcepcNnTpO9O68agWK4_oJj8,9582
|
|
20
|
+
dtflow/cli/export.py,sha256=loRfVPwEVsDw3ZMKEYGp0Hy38kYZG2QT8JCMbz1dRzU,2156
|
|
18
21
|
dtflow/cli/io_ops.py,sha256=BMDisP6dxzzmSjYwmeFwaHmpHHPqirmXAWeNTD-9MQM,13254
|
|
19
22
|
dtflow/cli/lineage.py,sha256=_lNh35nF9AA0Zy6FyZ4g8IzrXH2ZQnp3inF-o2Hs1pw,1383
|
|
20
23
|
dtflow/cli/pipeline.py,sha256=QNEo-BJlaC1CVnVeRZr7TwfuZYloJ4TebIzJ5ALzry0,1426
|
|
21
|
-
dtflow/cli/sample.py,sha256=
|
|
24
|
+
dtflow/cli/sample.py,sha256=etbro5I0pyNgn0Qfhp1M6Bh-95JN-AntDa5AwVe_oKY,18269
|
|
22
25
|
dtflow/cli/skill.py,sha256=opiTEBejA7JHKrEMftMOPDQlOgZ4n59rwaHXGU1Nukk,2022
|
|
26
|
+
dtflow/cli/split.py,sha256=96bhWnxHnjIqifoliLgciApkLbwQU8bWHovK8bcMk9g,3667
|
|
23
27
|
dtflow/cli/stats.py,sha256=HkTZD80h4tzYXTtMnfpjLUMP6kl_es6ifcmExxzGdMU,31813
|
|
24
28
|
dtflow/cli/transform.py,sha256=w6xqMOxPxQvL2u_BPCfpDHuPSC9gmcqMPVN8s-B6bbY,15052
|
|
25
29
|
dtflow/cli/validate.py,sha256=Frs-jKcDHmYozpmIYZueDSX5o2i1Xn-WW81FGUyUrng,5796
|
|
@@ -29,7 +33,8 @@ dtflow/utils/__init__.py,sha256=Pn-ltwV04fBQmeZG7FxInDQmzH29LYOi90LgeLMEuQk,506
|
|
|
29
33
|
dtflow/utils/display.py,sha256=OeOdTh6mbDwSkDWlmkjfpTjy2QG8ZUaYU0NpHUWkpEQ,5881
|
|
30
34
|
dtflow/utils/field_path.py,sha256=K8nU196RxTSJ1OoieTWGcYOWl9KjGq2iSxCAkfjECuM,7621
|
|
31
35
|
dtflow/utils/helpers.py,sha256=JXN176_B2pm53GLVyZ1wj3wrmBJG52Tkw6AMQSdj7M8,791
|
|
32
|
-
dtflow
|
|
33
|
-
dtflow-0.5.
|
|
34
|
-
dtflow-0.5.
|
|
35
|
-
dtflow-0.5.
|
|
36
|
+
dtflow/utils/text_parser.py,sha256=0t2TMOSha4dTiDu9H4ygdb67cI20zhtBH1XavDspL_g,3727
|
|
37
|
+
dtflow-0.5.10.dist-info/METADATA,sha256=OGefMoe17by5IbxdxZgqoJ1Y6OWPt_iGEFM4KgltRZw,26023
|
|
38
|
+
dtflow-0.5.10.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
39
|
+
dtflow-0.5.10.dist-info/entry_points.txt,sha256=dadIDOK7Iu9pMxnMPBfpb4aAPe4hQbBOshpQYjVYpGc,44
|
|
40
|
+
dtflow-0.5.10.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|