mysphinx-forge 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/PKG-INFO +57 -13
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/README.md +56 -12
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/cli.py +60 -38
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/config.py +1 -1
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/sft_dataset.py +86 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge.egg-info/PKG-INFO +57 -13
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/pyproject.toml +1 -1
- mysphinx_forge-0.2.1/tests/test_sft_dataset.py +177 -0
- mysphinx_forge-0.2.0/tests/test_sft_dataset.py +0 -86
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/__init__.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/cleaning.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/cluster_labeling.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/cluster_reporting.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/clustering.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/deduplication.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/embedding.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/env_utils.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/file_io.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/http_client.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/logging_utils.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/model_eval.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/model_testing.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/openai_responses.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/progress.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/semantic_deduplication.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge/splitting.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge.egg-info/SOURCES.txt +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge.egg-info/dependency_links.txt +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge.egg-info/entry_points.txt +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge.egg-info/requires.txt +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/mysphinx_forge.egg-info/top_level.txt +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/setup.cfg +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_cleaning.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_cli.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_cluster_labeling.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_cluster_reporting.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_clustering.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_deduplication.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_file_io.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_http_client.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_model_eval.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_model_testing.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_semantic_deduplication.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_sft_cli.py +0 -0
- {mysphinx_forge-0.2.0 → mysphinx_forge-0.2.1}/tests/test_splitting.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mysphinx-forge
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Data and model workflow toolkit for cleaning, clustering, generation, and evaluation
|
|
5
5
|
Keywords: data-cleaning,deduplication,clustering,nlp,cli
|
|
6
6
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -65,7 +65,7 @@ Requires-Dist: transformers>=4.55.0; extra == "all"
|
|
|
65
65
|
| `cluster` | 文本聚类,并导出汇总、投影和 HTML 报告 | `*_clustered.*` |
|
|
66
66
|
| `split` | 切分 train / valid / test | `*_split_train.*` 等 |
|
|
67
67
|
| `model-test` | 批量执行模型推理或单条烟雾测试,含预期结果列时自动输出评估报告 | `*_model_tested.*` 或终端输出 |
|
|
68
|
-
| `convert-sft` | 转换表格数据为
|
|
68
|
+
| `convert-sft` | 转换表格数据为 SFT 数据(`alpaca` / `pa` 格式) | `*_alpaca.json` / `*_pa.jsonl`(超 10000 条自动切分) |
|
|
69
69
|
|
|
70
70
|
## 项目结构
|
|
71
71
|
|
|
@@ -174,10 +174,10 @@ uv run python main.py ...
|
|
|
174
174
|
- `mysphinx-forge.yaml`
|
|
175
175
|
- 程序内置默认值
|
|
176
176
|
|
|
177
|
-
配置文件中的**相对路径**(`input_file`、`output`、`system_prompt_file`、`
|
|
177
|
+
配置文件中的**相对路径**(`input_file`、`output`、`system_prompt_file`、`sft_system_prompt_file`、`embedding_model_path`、`test_model_path` 等)以**执行时的当前工作目录**为基准展开,与命令行参数行为一致。因此推荐从项目根目录运行工具,配置文件中的路径直接写相对于项目根目录的路径即可:
|
|
178
178
|
|
|
179
179
|
```yaml
|
|
180
|
-
# configs/
|
|
180
|
+
# configs/06_model_test.yaml
|
|
181
181
|
action: model-test
|
|
182
182
|
input_file: data/raw_deduplicated_split_test.xlsx # 相对于运行时 cwd(项目根目录)
|
|
183
183
|
model-test:
|
|
@@ -186,7 +186,7 @@ model-test:
|
|
|
186
186
|
|
|
187
187
|
```bash
|
|
188
188
|
# 从项目根目录运行,路径正确展开
|
|
189
|
-
mysphinx-forge --config configs/
|
|
189
|
+
mysphinx-forge --config configs/06_model_test.yaml
|
|
190
190
|
```
|
|
191
191
|
|
|
192
192
|
配置文件支持三种层级:
|
|
@@ -332,19 +332,23 @@ mysphinx-forge --action clean-deduplicate --input-file data/input.xlsx
|
|
|
332
332
|
mysphinx-forge --action clean-deduplicate --input-file data/input.xlsx --dedupe-mode semantic
|
|
333
333
|
```
|
|
334
334
|
|
|
335
|
-
### 导出
|
|
335
|
+
### 导出 SFT 数据
|
|
336
336
|
|
|
337
|
-
`convert-sft`
|
|
338
|
-
|
|
339
|
-
> **关于命名**:这里的 `sft` 指**训练阶段**(Supervised Fine-Tuning,监督微调),而非训练方法。`alpaca` 的 `instruction / input / output` 结构正是 SFT 阶段的标准监督数据,无法用于 `pt`(纯文本预训练)或 `dpo` / `rm`(需要 `chosen` / `rejected` 偏好对)等其它阶段。
|
|
340
|
-
>
|
|
341
|
-
> **与训练方法无关**:训练方法(`LoRA` / 全参数 / freeze)和数据格式是两个正交的维度。本命令产出的这份 `alpaca` JSON,无论你用 `LoRA` 还是全参数微调都**通用,一个字都不用改**——只需在 `LLaMA-Factory` 的训练配置里切换 `finetuning_type` 即可。
|
|
337
|
+
`convert-sft` 支持将表格转换为多种 SFT 格式,通过 `--sft-format` 参数选择,默认为 `alpaca`。
|
|
342
338
|
|
|
343
339
|
默认会自动探测:
|
|
344
340
|
|
|
345
341
|
- 输入列:`text` / `用户问题` / `客户问题` / `用户输入`
|
|
346
342
|
- 输出列:`category` / `label` / `intent` / `output` / `response` / `answer` / `target`
|
|
347
343
|
|
|
344
|
+
#### alpaca 格式(默认)
|
|
345
|
+
|
|
346
|
+
可直接给 `LLaMA-Factory` 做 `SFT` 训练。
|
|
347
|
+
|
|
348
|
+
> **关于命名**:这里的 `sft` 指**训练阶段**(Supervised Fine-Tuning,监督微调),而非训练方法。`alpaca` 的 `instruction / input / output` 结构正是 SFT 阶段的标准监督数据,无法用于 `pt`(纯文本预训练)或 `dpo` / `rm`(需要 `chosen` / `rejected` 偏好对)等其它阶段。
|
|
349
|
+
>
|
|
350
|
+
> **与训练方法无关**:训练方法(`LoRA` / 全参数 / freeze)和数据格式是两个正交的维度。本命令产出的这份 `alpaca` JSON,无论你用 `LoRA` 还是全参数微调都**通用,一个字都不用改**——只需在 `LLaMA-Factory` 的训练配置里切换 `finetuning_type` 即可。
|
|
351
|
+
|
|
348
352
|
基础示例:
|
|
349
353
|
|
|
350
354
|
```bash
|
|
@@ -369,7 +373,7 @@ mysphinx-forge --action convert-sft --input-file data/input.xlsx --sft-output-co
|
|
|
369
373
|
mysphinx-forge --action convert-sft --input-file data/input.xlsx --sft-output-column category --sft-instruction "请判断用户问题所属分类,只输出分类标签。" --sft-system-prompt "你是一个证券问答分类助手。"
|
|
370
374
|
```
|
|
371
375
|
|
|
372
|
-
|
|
376
|
+
##### `instruction` 与 `input` 字段的两种模式
|
|
373
377
|
|
|
374
378
|
**默认模式(`--sft-user-query-as-instruction`,默认开启)**
|
|
375
379
|
|
|
@@ -412,9 +416,49 @@ convert-sft:
|
|
|
412
416
|
|
|
413
417
|
> 注意:如果同时传入了显式 `--sft-instruction`,则该固定文本始终作为 `instruction`,`input` 照常存放用户输入,`--sft-user-query-as-instruction` 不影响此情形。
|
|
414
418
|
|
|
419
|
+
#### pa 格式
|
|
420
|
+
|
|
421
|
+
企业内部自定义格式,输出为 JSONL 文件,每行一条 JSON 数据,结构如下:
|
|
422
|
+
|
|
423
|
+
```json
|
|
424
|
+
{"conversations": [{"context": "系统提示(可选)", "role": "system"}, {"context": "用户输入", "role": "human"}, {"context": "模型回复", "role": "assistant"}], "id": "1"}
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
字段说明:
|
|
428
|
+
|
|
429
|
+
- `id`:数据编号,从 1 开始累计
|
|
430
|
+
- `conversations`:对话内容列表
|
|
431
|
+
- `role`:`system`(可选,对应 `--sft-system-prompt` / `--sft-system-column`)、`human`(用户输入)、`assistant`(模型回复)
|
|
432
|
+
- `context`:会话内容
|
|
433
|
+
|
|
434
|
+
基础示例:
|
|
435
|
+
|
|
436
|
+
```bash
|
|
437
|
+
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input_deduplicated_split_train.xlsx
|
|
438
|
+
```
|
|
439
|
+
|
|
440
|
+
默认输出:
|
|
441
|
+
|
|
442
|
+
```bash
|
|
443
|
+
data/input_deduplicated_split_train_pa.jsonl
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
带 system prompt:
|
|
447
|
+
|
|
448
|
+
```bash
|
|
449
|
+
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-system-prompt "你是证券领域用户意图识别专家。"
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
**自动切分**:当转换结果超过 10000 条时,自动按 10000 条一份切分为多个 JSONL 文件,文件名末尾追加序号。例如输入文件 `input_deduplicated_split_train.xlsx` 包含 12000 条数据,输出为:
|
|
453
|
+
|
|
454
|
+
```
|
|
455
|
+
input_deduplicated_split_train_pa_1.jsonl # 前 10000 条
|
|
456
|
+
input_deduplicated_split_train_pa_2.jsonl # 后 2000 条
|
|
457
|
+
```
|
|
458
|
+
|
|
415
459
|
说明:
|
|
416
460
|
|
|
417
|
-
-
|
|
461
|
+
- 支持 `alpaca`(默认)和 `pa` 两种格式,通过 `--sft-format` 切换
|
|
418
462
|
- 空输入行和空输出行会自动跳过
|
|
419
463
|
- 会同时生成日志文件和 `*.meta.json` 元数据文件
|
|
420
464
|
|
|
@@ -28,7 +28,7 @@
|
|
|
28
28
|
| `cluster` | 文本聚类,并导出汇总、投影和 HTML 报告 | `*_clustered.*` |
|
|
29
29
|
| `split` | 切分 train / valid / test | `*_split_train.*` 等 |
|
|
30
30
|
| `model-test` | 批量执行模型推理或单条烟雾测试,含预期结果列时自动输出评估报告 | `*_model_tested.*` 或终端输出 |
|
|
31
|
-
| `convert-sft` | 转换表格数据为
|
|
31
|
+
| `convert-sft` | 转换表格数据为 SFT 数据(`alpaca` / `pa` 格式) | `*_alpaca.json` / `*_pa.jsonl`(超 10000 条自动切分) |
|
|
32
32
|
|
|
33
33
|
## 项目结构
|
|
34
34
|
|
|
@@ -137,10 +137,10 @@ uv run python main.py ...
|
|
|
137
137
|
- `mysphinx-forge.yaml`
|
|
138
138
|
- 程序内置默认值
|
|
139
139
|
|
|
140
|
-
配置文件中的**相对路径**(`input_file`、`output`、`system_prompt_file`、`
|
|
140
|
+
配置文件中的**相对路径**(`input_file`、`output`、`system_prompt_file`、`sft_system_prompt_file`、`embedding_model_path`、`test_model_path` 等)以**执行时的当前工作目录**为基准展开,与命令行参数行为一致。因此推荐从项目根目录运行工具,配置文件中的路径直接写相对于项目根目录的路径即可:
|
|
141
141
|
|
|
142
142
|
```yaml
|
|
143
|
-
# configs/
|
|
143
|
+
# configs/06_model_test.yaml
|
|
144
144
|
action: model-test
|
|
145
145
|
input_file: data/raw_deduplicated_split_test.xlsx # 相对于运行时 cwd(项目根目录)
|
|
146
146
|
model-test:
|
|
@@ -149,7 +149,7 @@ model-test:
|
|
|
149
149
|
|
|
150
150
|
```bash
|
|
151
151
|
# 从项目根目录运行,路径正确展开
|
|
152
|
-
mysphinx-forge --config configs/
|
|
152
|
+
mysphinx-forge --config configs/06_model_test.yaml
|
|
153
153
|
```
|
|
154
154
|
|
|
155
155
|
配置文件支持三种层级:
|
|
@@ -295,19 +295,23 @@ mysphinx-forge --action clean-deduplicate --input-file data/input.xlsx
|
|
|
295
295
|
mysphinx-forge --action clean-deduplicate --input-file data/input.xlsx --dedupe-mode semantic
|
|
296
296
|
```
|
|
297
297
|
|
|
298
|
-
### 导出
|
|
298
|
+
### 导出 SFT 数据
|
|
299
299
|
|
|
300
|
-
`convert-sft`
|
|
301
|
-
|
|
302
|
-
> **关于命名**:这里的 `sft` 指**训练阶段**(Supervised Fine-Tuning,监督微调),而非训练方法。`alpaca` 的 `instruction / input / output` 结构正是 SFT 阶段的标准监督数据,无法用于 `pt`(纯文本预训练)或 `dpo` / `rm`(需要 `chosen` / `rejected` 偏好对)等其它阶段。
|
|
303
|
-
>
|
|
304
|
-
> **与训练方法无关**:训练方法(`LoRA` / 全参数 / freeze)和数据格式是两个正交的维度。本命令产出的这份 `alpaca` JSON,无论你用 `LoRA` 还是全参数微调都**通用,一个字都不用改**——只需在 `LLaMA-Factory` 的训练配置里切换 `finetuning_type` 即可。
|
|
300
|
+
`convert-sft` 支持将表格转换为多种 SFT 格式,通过 `--sft-format` 参数选择,默认为 `alpaca`。
|
|
305
301
|
|
|
306
302
|
默认会自动探测:
|
|
307
303
|
|
|
308
304
|
- 输入列:`text` / `用户问题` / `客户问题` / `用户输入`
|
|
309
305
|
- 输出列:`category` / `label` / `intent` / `output` / `response` / `answer` / `target`
|
|
310
306
|
|
|
307
|
+
#### alpaca 格式(默认)
|
|
308
|
+
|
|
309
|
+
可直接给 `LLaMA-Factory` 做 `SFT` 训练。
|
|
310
|
+
|
|
311
|
+
> **关于命名**:这里的 `sft` 指**训练阶段**(Supervised Fine-Tuning,监督微调),而非训练方法。`alpaca` 的 `instruction / input / output` 结构正是 SFT 阶段的标准监督数据,无法用于 `pt`(纯文本预训练)或 `dpo` / `rm`(需要 `chosen` / `rejected` 偏好对)等其它阶段。
|
|
312
|
+
>
|
|
313
|
+
> **与训练方法无关**:训练方法(`LoRA` / 全参数 / freeze)和数据格式是两个正交的维度。本命令产出的这份 `alpaca` JSON,无论你用 `LoRA` 还是全参数微调都**通用,一个字都不用改**——只需在 `LLaMA-Factory` 的训练配置里切换 `finetuning_type` 即可。
|
|
314
|
+
|
|
311
315
|
基础示例:
|
|
312
316
|
|
|
313
317
|
```bash
|
|
@@ -332,7 +336,7 @@ mysphinx-forge --action convert-sft --input-file data/input.xlsx --sft-output-co
|
|
|
332
336
|
mysphinx-forge --action convert-sft --input-file data/input.xlsx --sft-output-column category --sft-instruction "请判断用户问题所属分类,只输出分类标签。" --sft-system-prompt "你是一个证券问答分类助手。"
|
|
333
337
|
```
|
|
334
338
|
|
|
335
|
-
|
|
339
|
+
##### `instruction` 与 `input` 字段的两种模式
|
|
336
340
|
|
|
337
341
|
**默认模式(`--sft-user-query-as-instruction`,默认开启)**
|
|
338
342
|
|
|
@@ -375,9 +379,49 @@ convert-sft:
|
|
|
375
379
|
|
|
376
380
|
> 注意:如果同时传入了显式 `--sft-instruction`,则该固定文本始终作为 `instruction`,`input` 照常存放用户输入,`--sft-user-query-as-instruction` 不影响此情形。
|
|
377
381
|
|
|
382
|
+
#### pa 格式
|
|
383
|
+
|
|
384
|
+
企业内部自定义格式,输出为 JSONL 文件,每行一条 JSON 数据,结构如下:
|
|
385
|
+
|
|
386
|
+
```json
|
|
387
|
+
{"conversations": [{"context": "系统提示(可选)", "role": "system"}, {"context": "用户输入", "role": "human"}, {"context": "模型回复", "role": "assistant"}], "id": "1"}
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
字段说明:
|
|
391
|
+
|
|
392
|
+
- `id`:数据编号,从 1 开始累计
|
|
393
|
+
- `conversations`:对话内容列表
|
|
394
|
+
- `role`:`system`(可选,对应 `--sft-system-prompt` / `--sft-system-column`)、`human`(用户输入)、`assistant`(模型回复)
|
|
395
|
+
- `context`:会话内容
|
|
396
|
+
|
|
397
|
+
基础示例:
|
|
398
|
+
|
|
399
|
+
```bash
|
|
400
|
+
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input_deduplicated_split_train.xlsx
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
默认输出:
|
|
404
|
+
|
|
405
|
+
```bash
|
|
406
|
+
data/input_deduplicated_split_train_pa.jsonl
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
带 system prompt:
|
|
410
|
+
|
|
411
|
+
```bash
|
|
412
|
+
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-system-prompt "你是证券领域用户意图识别专家。"
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
**自动切分**:当转换结果超过 10000 条时,自动按 10000 条一份切分为多个 JSONL 文件,文件名末尾追加序号。例如输入文件 `input_deduplicated_split_train.xlsx` 包含 12000 条数据,输出为:
|
|
416
|
+
|
|
417
|
+
```
|
|
418
|
+
input_deduplicated_split_train_pa_1.jsonl # 前 10000 条
|
|
419
|
+
input_deduplicated_split_train_pa_2.jsonl # 后 2000 条
|
|
420
|
+
```
|
|
421
|
+
|
|
378
422
|
说明:
|
|
379
423
|
|
|
380
|
-
-
|
|
424
|
+
- 支持 `alpaca`(默认)和 `pa` 两种格式,通过 `--sft-format` 切换
|
|
381
425
|
- 空输入行和空输出行会自动跳过
|
|
382
426
|
- 会同时生成日志文件和 `*.meta.json` 元数据文件
|
|
383
427
|
|
|
@@ -61,9 +61,12 @@ from mysphinx_forge.semantic_deduplication import (
|
|
|
61
61
|
)
|
|
62
62
|
from mysphinx_forge.sft_dataset import (
|
|
63
63
|
DEFAULT_SFT_FORMAT,
|
|
64
|
+
PA_SFT_FORMAT,
|
|
64
65
|
SftConversionStats,
|
|
65
66
|
convert_dataframe_to_alpaca,
|
|
67
|
+
convert_dataframe_to_pa,
|
|
66
68
|
write_alpaca_dataset,
|
|
69
|
+
write_pa_dataset,
|
|
67
70
|
)
|
|
68
71
|
from mysphinx_forge.splitting import (
|
|
69
72
|
DEFAULT_SPLIT_RANDOM_SEED,
|
|
@@ -286,8 +289,8 @@ def main() -> int:
|
|
|
286
289
|
)
|
|
287
290
|
if args.action == "convert-sft":
|
|
288
291
|
try:
|
|
289
|
-
|
|
290
|
-
args.
|
|
292
|
+
resolved_sft_system_prompt = _resolve_sft_system_prompt(
|
|
293
|
+
args.sft_system_prompt_file, args.sft_system_prompt
|
|
291
294
|
)
|
|
292
295
|
except ValueError as exc:
|
|
293
296
|
print(str(exc))
|
|
@@ -298,8 +301,8 @@ def main() -> int:
|
|
|
298
301
|
args.target_column,
|
|
299
302
|
args.sft_format,
|
|
300
303
|
args.sft_output_column,
|
|
301
|
-
|
|
302
|
-
|
|
304
|
+
args.sft_instruction,
|
|
305
|
+
resolved_sft_system_prompt,
|
|
303
306
|
args.sft_system_column,
|
|
304
307
|
args.sft_user_query_as_instruction,
|
|
305
308
|
)
|
|
@@ -616,9 +619,9 @@ def _build_parser(
|
|
|
616
619
|
)
|
|
617
620
|
parser.add_argument(
|
|
618
621
|
"--sft-format",
|
|
619
|
-
choices=[DEFAULT_SFT_FORMAT],
|
|
622
|
+
choices=[DEFAULT_SFT_FORMAT, PA_SFT_FORMAT],
|
|
620
623
|
default=config_defaults.get("sft_format", DEFAULT_SFT_FORMAT),
|
|
621
|
-
help=f"SFT
|
|
624
|
+
help=f"SFT 导出格式,支持 {DEFAULT_SFT_FORMAT}(默认)和 {PA_SFT_FORMAT}。",
|
|
622
625
|
)
|
|
623
626
|
parser.add_argument(
|
|
624
627
|
"--sft-output-column",
|
|
@@ -630,16 +633,16 @@ def _build_parser(
|
|
|
630
633
|
default=config_defaults.get("sft_instruction", ""),
|
|
631
634
|
help="SFT Alpaca 样本中的 instruction。未指定时根据输出列自动生成。",
|
|
632
635
|
)
|
|
633
|
-
parser.add_argument(
|
|
634
|
-
"--sft-instruction-file",
|
|
635
|
-
dest="sft_instruction_file",
|
|
636
|
-
default=config_defaults.get("sft_instruction_file", ""),
|
|
637
|
-
help="SFT instruction 文件路径。文件内容优先于 --sft-instruction;两者都未指定时根据输出列自动生成。",
|
|
638
|
-
)
|
|
639
636
|
parser.add_argument(
|
|
640
637
|
"--sft-system-prompt",
|
|
641
638
|
default=config_defaults.get("sft_system_prompt", ""),
|
|
642
|
-
help="SFT
|
|
639
|
+
help="SFT 样本中的固定 system 字段(行内文本)。未指定则不写入。",
|
|
640
|
+
)
|
|
641
|
+
parser.add_argument(
|
|
642
|
+
"--sft-system-prompt-file",
|
|
643
|
+
dest="sft_system_prompt_file",
|
|
644
|
+
default=config_defaults.get("sft_system_prompt_file", ""),
|
|
645
|
+
help="SFT system prompt 文件路径(支持 .md 等文本文件)。文件内容优先于 --sft-system-prompt;两者都未指定则不写入 system 字段。",
|
|
643
646
|
)
|
|
644
647
|
parser.add_argument(
|
|
645
648
|
"--sft-system-column",
|
|
@@ -1491,23 +1494,20 @@ def _resolve_system_prompt(system_prompt_file: str) -> str:
|
|
|
1491
1494
|
return system_prompt
|
|
1492
1495
|
|
|
1493
1496
|
|
|
1494
|
-
def
|
|
1495
|
-
#
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
return instruction_inline
|
|
1499
|
-
if instruction_file:
|
|
1500
|
-
file_path = Path(instruction_file)
|
|
1497
|
+
def _resolve_sft_system_prompt(system_prompt_file: str, system_prompt_inline: str) -> str:
    """Pick the system prompt text used for SFT export.

    Precedence: file content > inline text > empty string (an empty result
    means no system field is written).

    Args:
        system_prompt_file: Path of a UTF-8 text file holding the prompt, or
            an empty string when no file was given.
        system_prompt_inline: Prompt text passed inline; returned unchanged
            when no file path is given.

    Returns:
        The stripped file content when a file path is given, otherwise the
        inline text as-is.

    Raises:
        ValueError: If the file cannot be read, is not valid UTF-8, or is
            empty after stripping whitespace.
    """
    if not system_prompt_file:
        return system_prompt_inline

    file_path = Path(system_prompt_file)
    try:
        content = file_path.read_text(encoding="utf-8").strip()
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is not an OSError; catching it here keeps the
        # file path in the message for badly encoded files as well.
        raise ValueError(
            f"读取 sft-system-prompt 文件失败:{file_path},{type(exc).__name__}: {exc}"
        ) from exc
    if not content:
        raise ValueError(f"sft-system-prompt 文件内容为空:{file_path}")
    return content
|
|
1511
1511
|
|
|
1512
1512
|
|
|
1513
1513
|
def _run_clean_csv_stream(
|
|
@@ -1759,24 +1759,37 @@ def _run_convert_sft(
|
|
|
1759
1759
|
run_stage("读取文件", logger=logger)
|
|
1760
1760
|
dataframe = load_dataframe(input_file)
|
|
1761
1761
|
run_stage("转换 SFT 数据", logger=logger)
|
|
1762
|
-
if sft_format
|
|
1762
|
+
if sft_format == PA_SFT_FORMAT:
|
|
1763
|
+
records, stats = convert_dataframe_to_pa(
|
|
1764
|
+
dataframe,
|
|
1765
|
+
target_column=target_column,
|
|
1766
|
+
output_column=sft_output_column,
|
|
1767
|
+
system_prompt=sft_system_prompt,
|
|
1768
|
+
system_column=sft_system_column,
|
|
1769
|
+
)
|
|
1770
|
+
elif sft_format == DEFAULT_SFT_FORMAT:
|
|
1771
|
+
records, stats = convert_dataframe_to_alpaca(
|
|
1772
|
+
dataframe,
|
|
1773
|
+
target_column=target_column,
|
|
1774
|
+
output_column=sft_output_column,
|
|
1775
|
+
instruction=sft_instruction,
|
|
1776
|
+
system_prompt=sft_system_prompt,
|
|
1777
|
+
system_column=sft_system_column,
|
|
1778
|
+
user_query_as_instruction=sft_user_query_as_instruction,
|
|
1779
|
+
)
|
|
1780
|
+
else:
|
|
1763
1781
|
raise ValueError(f"暂不支持的 SFT 格式:{sft_format}")
|
|
1764
|
-
records, stats = convert_dataframe_to_alpaca(
|
|
1765
|
-
dataframe,
|
|
1766
|
-
target_column=target_column,
|
|
1767
|
-
output_column=sft_output_column,
|
|
1768
|
-
instruction=sft_instruction,
|
|
1769
|
-
system_prompt=sft_system_prompt,
|
|
1770
|
-
system_column=sft_system_column,
|
|
1771
|
-
user_query_as_instruction=sft_user_query_as_instruction,
|
|
1772
|
-
)
|
|
1773
1782
|
except ValueError as exc:
|
|
1774
1783
|
_emit_error(str(exc), logger)
|
|
1775
1784
|
close_logger()
|
|
1776
1785
|
return 1
|
|
1777
1786
|
|
|
1778
1787
|
run_stage("写出结果", logger=logger)
|
|
1779
|
-
|
|
1788
|
+
if sft_format == PA_SFT_FORMAT:
|
|
1789
|
+
written_paths = write_pa_dataset(records, output_path)
|
|
1790
|
+
else:
|
|
1791
|
+
write_alpaca_dataset(records, output_path)
|
|
1792
|
+
written_paths = [output_path]
|
|
1780
1793
|
_write_meta(
|
|
1781
1794
|
output_path=output_path,
|
|
1782
1795
|
action="convert-sft",
|
|
@@ -1791,8 +1804,11 @@ def _run_convert_sft(
|
|
|
1791
1804
|
"sft_user_query_as_instruction": sft_user_query_as_instruction,
|
|
1792
1805
|
},
|
|
1793
1806
|
sft_conversion_stats=stats,
|
|
1807
|
+
extra_output_files={f"output_file_{i + 1}": p for i, p in enumerate(written_paths)}
|
|
1808
|
+
if len(written_paths) > 1
|
|
1809
|
+
else None,
|
|
1794
1810
|
)
|
|
1795
|
-
_print_sft_conversion_stats(stats,
|
|
1811
|
+
_print_sft_conversion_stats(stats, written_paths, logger)
|
|
1796
1812
|
close_logger()
|
|
1797
1813
|
return 0
|
|
1798
1814
|
|
|
@@ -1841,7 +1857,8 @@ def _resolve_split_part_output_paths(base_output_path: Path) -> tuple[Path, Path
|
|
|
1841
1857
|
def _resolve_sft_output_path(input_path: Path, output_arg: str | None, sft_format: str) -> Path:
    """Return the path the SFT export should be written to.

    An explicit ``output_arg`` is used as given. Otherwise the file is placed
    next to ``input_path`` and named ``<input stem>_<sft_format>`` with a
    ``.jsonl`` extension for the PA format and ``.json`` for anything else.
    """
    if not output_arg:
        if sft_format == PA_SFT_FORMAT:
            extension = ".jsonl"
        else:
            extension = ".json"
        default_name = f"{input_path.stem}_{sft_format}{extension}"
        return input_path.with_name(default_name)
    return Path(output_arg)
|
|
1845
1862
|
|
|
1846
1863
|
|
|
1847
1864
|
def _resolve_match_output_path(output_path: Path) -> Path:
|
|
@@ -2014,10 +2031,15 @@ def _print_split_stats(
|
|
|
2014
2031
|
|
|
2015
2032
|
def _print_sft_conversion_stats(
|
|
2016
2033
|
stats: SftConversionStats,
|
|
2017
|
-
|
|
2034
|
+
output_paths: list[Path],
|
|
2018
2035
|
logger: Logger,
|
|
2019
2036
|
) -> None:
|
|
2020
|
-
|
|
2037
|
+
if len(output_paths) == 1:
|
|
2038
|
+
_emit_message(f"SFT 数据转换完成,输出文件:{output_paths[0]}", logger)
|
|
2039
|
+
else:
|
|
2040
|
+
_emit_message(f"SFT 数据转换完成,共生成 {len(output_paths)} 个文件:", logger)
|
|
2041
|
+
for p in output_paths:
|
|
2042
|
+
_emit_message(f" {p}", logger)
|
|
2021
2043
|
_emit_message(f"SFT 格式:{stats.format_name}", logger)
|
|
2022
2044
|
_emit_message(f"输入列:{stats.input_column}", logger)
|
|
2023
2045
|
_emit_message(f"输出列:{stats.output_column}", logger)
|
|
@@ -9,6 +9,8 @@ import pandas as pd
|
|
|
9
9
|
from mysphinx_forge.cleaning import resolve_target_column
|
|
10
10
|
|
|
11
11
|
DEFAULT_SFT_FORMAT = "alpaca"
|
|
12
|
+
PA_SFT_FORMAT = "pa"
|
|
13
|
+
PA_MAX_RECORDS_PER_FILE = 10_000
|
|
12
14
|
DEFAULT_SFT_OUTPUT_COLUMNS = (
|
|
13
15
|
"category",
|
|
14
16
|
"label",
|
|
@@ -107,6 +109,90 @@ def write_alpaca_dataset(records: list[dict[str, str]], output_path: str | Path)
|
|
|
107
109
|
)
|
|
108
110
|
|
|
109
111
|
|
|
112
|
+
def convert_dataframe_to_pa(
    dataframe: pd.DataFrame,
    *,
    target_column: str = "text",
    output_column: str = "",
    system_prompt: str = "",
    system_column: str = "",
) -> tuple[list[dict], SftConversionStats]:
    """Convert table rows into PA-format conversation records.

    Each kept row becomes ``{"conversations": [...], "id": "<n>"}``. The
    conversation list holds an optional ``system`` turn followed by a
    ``human`` turn (input text) and an ``assistant`` turn (output text).
    Ids are strings that count kept records starting at "1".

    Args:
        dataframe: Source table.
        target_column: Requested input column, resolved through
            ``resolve_target_column``.
        output_column: Requested output column, resolved through
            ``resolve_sft_output_column``.
        system_prompt: Fixed system text; surrounding whitespace is stripped.
        system_column: Optional per-row system column. A non-empty cell
            overrides ``system_prompt`` for that row.

    Returns:
        The list of records and a ``SftConversionStats`` describing the run.
        Rows with empty input text are skipped and counted first; rows with
        empty output text are skipped and counted next.
    """
    input_col = resolve_target_column(dataframe, target_column)
    output_col = resolve_sft_output_column(dataframe, output_column)
    system_col = _resolve_optional_column(dataframe, system_column)
    default_system = system_prompt.strip()

    samples: list[dict] = []
    blank_inputs = 0
    blank_outputs = 0

    for _, row in dataframe.iterrows():
        question = _cell_to_text(row[input_col])
        answer = _cell_to_text(row[output_col])
        if not question:
            blank_inputs += 1
            continue
        if not answer:
            blank_outputs += 1
            continue

        # A non-empty per-row system cell wins over the fixed prompt.
        row_system = _cell_to_text(row[system_col]) if system_col is not None else ""
        system_text = row_system or default_system

        turns: list[dict[str, str]] = (
            [{"context": system_text, "role": "system"}] if system_text else []
        )
        turns.extend(
            (
                {"context": question, "role": "human"},
                {"context": answer, "role": "assistant"},
            )
        )
        samples.append({"conversations": turns, "id": str(len(samples) + 1)})

    return samples, SftConversionStats(
        format_name=PA_SFT_FORMAT,
        input_column=input_col,
        output_column=output_col,
        total_rows=len(dataframe),
        converted_rows=len(samples),
        skipped_blank_input_rows=blank_inputs,
        skipped_blank_output_rows=blank_outputs,
    )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def write_pa_dataset(records: list[dict], output_path: str | Path) -> list[Path]:
    """Write PA-format records to one or more JSONL files.

    When the record count does not exceed ``PA_MAX_RECORDS_PER_FILE`` a
    single file is written at ``output_path``. Otherwise the records are cut
    into consecutive chunks of that size, and each chunk is written next to
    ``output_path`` as ``<stem>_<k><suffix>`` with ``k`` starting at 1.

    Returns:
        The paths that were written, in chunk order.
    """
    destination = Path(output_path)
    if len(records) <= PA_MAX_RECORDS_PER_FILE:
        _write_pa_jsonl(records, destination)
        return [destination]

    chunk_paths: list[Path] = []
    offsets = range(0, len(records), PA_MAX_RECORDS_PER_FILE)
    for number, offset in enumerate(offsets, start=1):
        part_path = destination.parent / f"{destination.stem}_{number}{destination.suffix}"
        _write_pa_jsonl(records[offset : offset + PA_MAX_RECORDS_PER_FILE], part_path)
        chunk_paths.append(part_path)
    return chunk_paths
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _write_pa_jsonl(records: list[dict], path: Path) -> None:
    """Write ``records`` to ``path`` as UTF-8 JSONL, one JSON object per line.

    Non-ASCII characters are written as-is rather than escaped. Every record
    line ends with a newline; an empty record list produces an empty file.
    """
    payload = "".join(
        f"{json.dumps(item, ensure_ascii=False)}\n" for item in records
    )
    path.write_text(payload, encoding="utf-8")
|
|
194
|
+
|
|
195
|
+
|
|
110
196
|
def resolve_sft_output_column(dataframe: pd.DataFrame, output_column: str) -> str:
|
|
111
197
|
if output_column:
|
|
112
198
|
if output_column in dataframe.columns:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mysphinx-forge
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Data and model workflow toolkit for cleaning, clustering, generation, and evaluation
|
|
5
5
|
Keywords: data-cleaning,deduplication,clustering,nlp,cli
|
|
6
6
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -65,7 +65,7 @@ Requires-Dist: transformers>=4.55.0; extra == "all"
|
|
|
65
65
|
| `cluster` | 文本聚类,并导出汇总、投影和 HTML 报告 | `*_clustered.*` |
|
|
66
66
|
| `split` | 切分 train / valid / test | `*_split_train.*` 等 |
|
|
67
67
|
| `model-test` | 批量执行模型推理或单条烟雾测试,含预期结果列时自动输出评估报告 | `*_model_tested.*` 或终端输出 |
|
|
68
|
-
| `convert-sft` | 转换表格数据为
|
|
68
|
+
| `convert-sft` | 转换表格数据为 SFT 数据(`alpaca` / `pa` 格式) | `*_alpaca.json` / `*_pa.jsonl`(超 10000 条自动切分) |
|
|
69
69
|
|
|
70
70
|
## 项目结构
|
|
71
71
|
|
|
@@ -174,10 +174,10 @@ uv run python main.py ...
|
|
|
174
174
|
- `mysphinx-forge.yaml`
|
|
175
175
|
- 程序内置默认值
|
|
176
176
|
|
|
177
|
-
配置文件中的**相对路径**(`input_file`、`output`、`system_prompt_file`、`
|
|
177
|
+
配置文件中的**相对路径**(`input_file`、`output`、`system_prompt_file`、`sft_system_prompt_file`、`embedding_model_path`、`test_model_path` 等)以**执行时的当前工作目录**为基准展开,与命令行参数行为一致。因此推荐从项目根目录运行工具,配置文件中的路径直接写相对于项目根目录的路径即可:
|
|
178
178
|
|
|
179
179
|
```yaml
|
|
180
|
-
# configs/
|
|
180
|
+
# configs/06_model_test.yaml
|
|
181
181
|
action: model-test
|
|
182
182
|
input_file: data/raw_deduplicated_split_test.xlsx # 相对于运行时 cwd(项目根目录)
|
|
183
183
|
model-test:
|
|
@@ -186,7 +186,7 @@ model-test:
|
|
|
186
186
|
|
|
187
187
|
```bash
|
|
188
188
|
# 从项目根目录运行,路径正确展开
|
|
189
|
-
mysphinx-forge --config configs/
|
|
189
|
+
mysphinx-forge --config configs/06_model_test.yaml
|
|
190
190
|
```
|
|
191
191
|
|
|
192
192
|
配置文件支持三种层级:
|
|
@@ -332,19 +332,23 @@ mysphinx-forge --action clean-deduplicate --input-file data/input.xlsx
|
|
|
332
332
|
mysphinx-forge --action clean-deduplicate --input-file data/input.xlsx --dedupe-mode semantic
|
|
333
333
|
```
|
|
334
334
|
|
|
335
|
-
### 导出
|
|
335
|
+
### 导出 SFT 数据
|
|
336
336
|
|
|
337
|
-
`convert-sft`
|
|
338
|
-
|
|
339
|
-
> **关于命名**:这里的 `sft` 指**训练阶段**(Supervised Fine-Tuning,监督微调),而非训练方法。`alpaca` 的 `instruction / input / output` 结构正是 SFT 阶段的标准监督数据,无法用于 `pt`(纯文本预训练)或 `dpo` / `rm`(需要 `chosen` / `rejected` 偏好对)等其它阶段。
|
|
340
|
-
>
|
|
341
|
-
> **与训练方法无关**:训练方法(`LoRA` / 全参数 / freeze)和数据格式是两个正交的维度。本命令产出的这份 `alpaca` JSON,无论你用 `LoRA` 还是全参数微调都**通用,一个字都不用改**——只需在 `LLaMA-Factory` 的训练配置里切换 `finetuning_type` 即可。
|
|
337
|
+
`convert-sft` 支持将表格转换为多种 SFT 格式,通过 `--sft-format` 参数选择,默认为 `alpaca`。
|
|
342
338
|
|
|
343
339
|
默认会自动探测:
|
|
344
340
|
|
|
345
341
|
- 输入列:`text` / `用户问题` / `客户问题` / `用户输入`
|
|
346
342
|
- 输出列:`category` / `label` / `intent` / `output` / `response` / `answer` / `target`
|
|
347
343
|
|
|
344
|
+
#### alpaca 格式(默认)
|
|
345
|
+
|
|
346
|
+
可直接给 `LLaMA-Factory` 做 `SFT` 训练。
|
|
347
|
+
|
|
348
|
+
> **关于命名**:这里的 `sft` 指**训练阶段**(Supervised Fine-Tuning,监督微调),而非训练方法。`alpaca` 的 `instruction / input / output` 结构正是 SFT 阶段的标准监督数据,无法用于 `pt`(纯文本预训练)或 `dpo` / `rm`(需要 `chosen` / `rejected` 偏好对)等其它阶段。
|
|
349
|
+
>
|
|
350
|
+
> **与训练方法无关**:训练方法(`LoRA` / 全参数 / freeze)和数据格式是两个正交的维度。本命令产出的这份 `alpaca` JSON,无论你用 `LoRA` 还是全参数微调都**通用,一个字都不用改**——只需在 `LLaMA-Factory` 的训练配置里切换 `finetuning_type` 即可。
|
|
351
|
+
|
|
348
352
|
基础示例:
|
|
349
353
|
|
|
350
354
|
```bash
|
|
@@ -369,7 +373,7 @@ mysphinx-forge --action convert-sft --input-file data/input.xlsx --sft-output-co
|
|
|
369
373
|
mysphinx-forge --action convert-sft --input-file data/input.xlsx --sft-output-column category --sft-instruction "请判断用户问题所属分类,只输出分类标签。" --sft-system-prompt "你是一个证券问答分类助手。"
|
|
370
374
|
```
|
|
371
375
|
|
|
372
|
-
|
|
376
|
+
##### `instruction` 与 `input` 字段的两种模式
|
|
373
377
|
|
|
374
378
|
**默认模式(`--sft-user-query-as-instruction`,默认开启)**
|
|
375
379
|
|
|
@@ -412,9 +416,49 @@ convert-sft:
|
|
|
412
416
|
|
|
413
417
|
> 注意:如果同时传入了显式 `--sft-instruction`,则该固定文本始终作为 `instruction`,`input` 照常存放用户输入,`--sft-user-query-as-instruction` 不影响此情形。
|
|
414
418
|
|
|
419
|
+
#### pa 格式
|
|
420
|
+
|
|
421
|
+
企业内部自定义格式,输出为 JSONL 文件,每行一条 JSON 数据,结构如下:
|
|
422
|
+
|
|
423
|
+
```json
|
|
424
|
+
{"conversations": [{"context": "系统提示(可选)", "role": "system"}, {"context": "用户输入", "role": "human"}, {"context": "模型回复", "role": "assistant"}], "id": "1"}
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
字段说明:
|
|
428
|
+
|
|
429
|
+
- `id`:数据编号(字符串),从 1 开始依次递增
|
|
430
|
+
- `conversations`:对话内容列表
|
|
431
|
+
- `role`:`system`(可选,对应 `--sft-system-prompt` / `--sft-system-column`)、`human`(用户输入)、`assistant`(模型回复)
|
|
432
|
+
- `context`:会话内容
|
|
433
|
+
|
|
434
|
+
基础示例:
|
|
435
|
+
|
|
436
|
+
```bash
|
|
437
|
+
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input_deduplicated_split_train.xlsx
|
|
438
|
+
```
|
|
439
|
+
|
|
440
|
+
默认输出:
|
|
441
|
+
|
|
442
|
+
```bash
|
|
443
|
+
data/input_deduplicated_split_train_pa.jsonl
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
带 system prompt:
|
|
447
|
+
|
|
448
|
+
```bash
|
|
449
|
+
mysphinx-forge --action convert-sft --sft-format pa --input-file data/input.xlsx --sft-system-prompt "你是证券领域用户意图识别专家。"
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
**自动切分**:当转换结果超过 10000 条时,自动按 10000 条一份切分为多个 JSONL 文件,文件名末尾追加序号。例如输入文件 `input_deduplicated_split_train.xlsx` 包含 12000 条数据,输出为:
|
|
453
|
+
|
|
454
|
+
```
|
|
455
|
+
input_deduplicated_split_train_pa_1.jsonl # 前 10000 条
|
|
456
|
+
input_deduplicated_split_train_pa_2.jsonl # 后 2000 条
|
|
457
|
+
```
|
|
458
|
+
|
|
415
459
|
说明:
|
|
416
460
|
|
|
417
|
-
-
|
|
461
|
+
- 支持 `alpaca`(默认)和 `pa` 两种格式,通过 `--sft-format` 切换
|
|
418
462
|
- 空输入行和空输出行会自动跳过
|
|
419
463
|
- 会同时生成日志文件和 `*.meta.json` 元数据文件
|
|
420
464
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mysphinx-forge"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.2.1"
|
|
8
8
|
description = "Data and model workflow toolkit for cleaning, clustering, generation, and evaluation"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.12"
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from mysphinx_forge.sft_dataset import (
|
|
7
|
+
PA_MAX_RECORDS_PER_FILE,
|
|
8
|
+
convert_dataframe_to_alpaca,
|
|
9
|
+
convert_dataframe_to_pa,
|
|
10
|
+
resolve_sft_output_column,
|
|
11
|
+
write_pa_dataset,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_convert_dataframe_to_alpaca_user_query_as_instruction_default() -> None:
|
|
16
|
+
dataframe = pd.DataFrame(
|
|
17
|
+
{
|
|
18
|
+
"用户输入": ["怎么买基金", "", "港股通怎么开"],
|
|
19
|
+
"category": ["基金", "股票", ""],
|
|
20
|
+
}
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
records, stats = convert_dataframe_to_alpaca(dataframe)
|
|
24
|
+
|
|
25
|
+
assert records == [
|
|
26
|
+
{
|
|
27
|
+
"instruction": "怎么买基金",
|
|
28
|
+
"input": "",
|
|
29
|
+
"output": "基金",
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
assert stats.input_column == "用户输入"
|
|
33
|
+
assert stats.output_column == "category"
|
|
34
|
+
assert stats.total_rows == 3
|
|
35
|
+
assert stats.converted_rows == 1
|
|
36
|
+
assert stats.skipped_blank_input_rows == 1
|
|
37
|
+
assert stats.skipped_blank_output_rows == 1
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_convert_dataframe_to_alpaca_user_query_as_input_when_disabled() -> None:
|
|
41
|
+
dataframe = pd.DataFrame(
|
|
42
|
+
{
|
|
43
|
+
"用户输入": ["怎么买基金", "", "港股通怎么开"],
|
|
44
|
+
"category": ["基金", "股票", ""],
|
|
45
|
+
}
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
records, stats = convert_dataframe_to_alpaca(dataframe, user_query_as_instruction=False)
|
|
49
|
+
|
|
50
|
+
assert records == [
|
|
51
|
+
{
|
|
52
|
+
"instruction": "请根据用户输入判断其category,只输出category。",
|
|
53
|
+
"input": "怎么买基金",
|
|
54
|
+
"output": "基金",
|
|
55
|
+
}
|
|
56
|
+
]
|
|
57
|
+
assert stats.converted_rows == 1
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_convert_dataframe_to_alpaca_prefers_system_column_over_fixed_prompt() -> None:
|
|
61
|
+
dataframe = pd.DataFrame(
|
|
62
|
+
{
|
|
63
|
+
"text": ["基金赎回多久到账"],
|
|
64
|
+
"label": ["基金"],
|
|
65
|
+
"system_text": ["你是证券分类助手,只输出标签。"],
|
|
66
|
+
}
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
records, stats = convert_dataframe_to_alpaca(
|
|
70
|
+
dataframe,
|
|
71
|
+
output_column="label",
|
|
72
|
+
system_prompt="固定 system",
|
|
73
|
+
system_column="system_text",
|
|
74
|
+
instruction="请分类。",
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
assert records == [
|
|
78
|
+
{
|
|
79
|
+
"instruction": "请分类。",
|
|
80
|
+
"input": "基金赎回多久到账",
|
|
81
|
+
"output": "基金",
|
|
82
|
+
"system": "你是证券分类助手,只输出标签。",
|
|
83
|
+
}
|
|
84
|
+
]
|
|
85
|
+
assert stats.output_column == "label"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_convert_dataframe_to_pa_basic() -> None:
|
|
89
|
+
dataframe = pd.DataFrame(
|
|
90
|
+
{
|
|
91
|
+
"用户输入": ["怎么查询持仓", "", "怎么买基金"],
|
|
92
|
+
"category": ["查持仓", "股票", ""],
|
|
93
|
+
}
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
records, stats = convert_dataframe_to_pa(dataframe)
|
|
97
|
+
|
|
98
|
+
assert len(records) == 1
|
|
99
|
+
record = records[0]
|
|
100
|
+
assert record["id"] == "1"
|
|
101
|
+
conversations = record["conversations"]
|
|
102
|
+
assert len(conversations) == 2
|
|
103
|
+
assert conversations[0] == {"context": "怎么查询持仓", "role": "human"}
|
|
104
|
+
assert conversations[1] == {"context": "查持仓", "role": "assistant"}
|
|
105
|
+
assert stats.format_name == "pa"
|
|
106
|
+
assert stats.converted_rows == 1
|
|
107
|
+
assert stats.skipped_blank_input_rows == 1
|
|
108
|
+
assert stats.skipped_blank_output_rows == 1
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def test_convert_dataframe_to_pa_with_system_prompt() -> None:
|
|
112
|
+
dataframe = pd.DataFrame(
|
|
113
|
+
{
|
|
114
|
+
"text": ["怎么查询持仓"],
|
|
115
|
+
"label": ["查持仓"],
|
|
116
|
+
}
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
records, _ = convert_dataframe_to_pa(dataframe, system_prompt="你是证券助手。")
|
|
120
|
+
|
|
121
|
+
conversations = records[0]["conversations"]
|
|
122
|
+
assert len(conversations) == 3
|
|
123
|
+
assert conversations[0] == {"context": "你是证券助手。", "role": "system"}
|
|
124
|
+
assert conversations[1] == {"context": "怎么查询持仓", "role": "human"}
|
|
125
|
+
assert conversations[2] == {"context": "查持仓", "role": "assistant"}
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def test_convert_dataframe_to_pa_with_system_column_overrides_prompt() -> None:
|
|
129
|
+
dataframe = pd.DataFrame(
|
|
130
|
+
{
|
|
131
|
+
"text": ["怎么查询持仓"],
|
|
132
|
+
"label": ["查持仓"],
|
|
133
|
+
"sys": ["每行不同系统提示"],
|
|
134
|
+
}
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
records, _ = convert_dataframe_to_pa(
|
|
138
|
+
dataframe, system_prompt="固定提示", system_column="sys"
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
conversations = records[0]["conversations"]
|
|
142
|
+
assert conversations[0] == {"context": "每行不同系统提示", "role": "system"}
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def test_write_pa_dataset_single_file(tmp_path) -> None:
|
|
146
|
+
records = [{"conversations": [], "id": str(i + 1)} for i in range(5)]
|
|
147
|
+
out = tmp_path / "output_pa.jsonl"
|
|
148
|
+
written = write_pa_dataset(records, out)
|
|
149
|
+
assert written == [out]
|
|
150
|
+
lines = out.read_text(encoding="utf-8").strip().splitlines()
|
|
151
|
+
assert len(lines) == 5
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def test_write_pa_dataset_splits_when_exceeds_limit(tmp_path, monkeypatch) -> None:
|
|
155
|
+
import mysphinx_forge.sft_dataset as sft_mod
|
|
156
|
+
|
|
157
|
+
monkeypatch.setattr(sft_mod, "PA_MAX_RECORDS_PER_FILE", 3)
|
|
158
|
+
records = [{"conversations": [], "id": str(i + 1)} for i in range(7)]
|
|
159
|
+
out = tmp_path / "output_pa.jsonl"
|
|
160
|
+
written = write_pa_dataset(records, out)
|
|
161
|
+
assert len(written) == 3
|
|
162
|
+
assert written[0].name == "output_pa_1.jsonl"
|
|
163
|
+
assert written[1].name == "output_pa_2.jsonl"
|
|
164
|
+
assert written[2].name == "output_pa_3.jsonl"
|
|
165
|
+
lines1 = written[0].read_text(encoding="utf-8").strip().splitlines()
|
|
166
|
+
lines2 = written[1].read_text(encoding="utf-8").strip().splitlines()
|
|
167
|
+
lines3 = written[2].read_text(encoding="utf-8").strip().splitlines()
|
|
168
|
+
assert len(lines1) == 3
|
|
169
|
+
assert len(lines2) == 3
|
|
170
|
+
assert len(lines3) == 1
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def test_resolve_sft_output_column_requires_supported_or_explicit_column() -> None:
|
|
174
|
+
dataframe = pd.DataFrame({"text": ["a"], "other": ["b"]})
|
|
175
|
+
|
|
176
|
+
with pytest.raises(ValueError, match="未找到 SFT 输出列"):
|
|
177
|
+
resolve_sft_output_column(dataframe, "")
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import pandas as pd
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from mysphinx_forge.sft_dataset import convert_dataframe_to_alpaca, resolve_sft_output_column
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def test_convert_dataframe_to_alpaca_user_query_as_instruction_default() -> None:
|
|
10
|
-
dataframe = pd.DataFrame(
|
|
11
|
-
{
|
|
12
|
-
"用户输入": ["怎么买基金", "", "港股通怎么开"],
|
|
13
|
-
"category": ["基金", "股票", ""],
|
|
14
|
-
}
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
records, stats = convert_dataframe_to_alpaca(dataframe)
|
|
18
|
-
|
|
19
|
-
assert records == [
|
|
20
|
-
{
|
|
21
|
-
"instruction": "怎么买基金",
|
|
22
|
-
"input": "",
|
|
23
|
-
"output": "基金",
|
|
24
|
-
}
|
|
25
|
-
]
|
|
26
|
-
assert stats.input_column == "用户输入"
|
|
27
|
-
assert stats.output_column == "category"
|
|
28
|
-
assert stats.total_rows == 3
|
|
29
|
-
assert stats.converted_rows == 1
|
|
30
|
-
assert stats.skipped_blank_input_rows == 1
|
|
31
|
-
assert stats.skipped_blank_output_rows == 1
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def test_convert_dataframe_to_alpaca_user_query_as_input_when_disabled() -> None:
|
|
35
|
-
dataframe = pd.DataFrame(
|
|
36
|
-
{
|
|
37
|
-
"用户输入": ["怎么买基金", "", "港股通怎么开"],
|
|
38
|
-
"category": ["基金", "股票", ""],
|
|
39
|
-
}
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
records, stats = convert_dataframe_to_alpaca(dataframe, user_query_as_instruction=False)
|
|
43
|
-
|
|
44
|
-
assert records == [
|
|
45
|
-
{
|
|
46
|
-
"instruction": "请根据用户输入判断其category,只输出category。",
|
|
47
|
-
"input": "怎么买基金",
|
|
48
|
-
"output": "基金",
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
assert stats.converted_rows == 1
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def test_convert_dataframe_to_alpaca_prefers_system_column_over_fixed_prompt() -> None:
|
|
55
|
-
dataframe = pd.DataFrame(
|
|
56
|
-
{
|
|
57
|
-
"text": ["基金赎回多久到账"],
|
|
58
|
-
"label": ["基金"],
|
|
59
|
-
"system_text": ["你是证券分类助手,只输出标签。"],
|
|
60
|
-
}
|
|
61
|
-
)
|
|
62
|
-
|
|
63
|
-
records, stats = convert_dataframe_to_alpaca(
|
|
64
|
-
dataframe,
|
|
65
|
-
output_column="label",
|
|
66
|
-
system_prompt="固定 system",
|
|
67
|
-
system_column="system_text",
|
|
68
|
-
instruction="请分类。",
|
|
69
|
-
)
|
|
70
|
-
|
|
71
|
-
assert records == [
|
|
72
|
-
{
|
|
73
|
-
"instruction": "请分类。",
|
|
74
|
-
"input": "基金赎回多久到账",
|
|
75
|
-
"output": "基金",
|
|
76
|
-
"system": "你是证券分类助手,只输出标签。",
|
|
77
|
-
}
|
|
78
|
-
]
|
|
79
|
-
assert stats.output_column == "label"
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
def test_resolve_sft_output_column_requires_supported_or_explicit_column() -> None:
|
|
83
|
-
dataframe = pd.DataFrame({"text": ["a"], "other": ["b"]})
|
|
84
|
-
|
|
85
|
-
with pytest.raises(ValueError, match="未找到 SFT 输出列"):
|
|
86
|
-
resolve_sft_output_column(dataframe, "")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|