@icyfenix-dmla/cli 2026.5.24-16 → 2026.5.24-2045
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/shared/llm/__init__.py +2 -1
- package/shared/llm/sftdataset.py +108 -0
- package/src/commands/data.js +144 -10
- package/src/commands/manage.js +1 -1
- package/src/commands/server.js +1 -1
- package/src/commands/update.js +1 -1
- package/src/index.js +8 -0
- package/src/verbose.js +51 -0
- package/version.json +2 -2
package/package.json
CHANGED
package/shared/llm/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# LLM 模块
|
|
2
2
|
from .mini_mind_config import MiniMindConfig, RMSNorm, Attention, FeedForward, MiniMindBlock, MiniMindModel, MiniMindForCausalLM, precompute_freqs_cis, apply_rotary_pos_emb, repeat_kv
|
|
3
3
|
from .pretrain_dataset import PretrainDataset
|
|
4
|
+
from .sftdataset import SFTDataset, pre_processing_chat
|
|
4
5
|
|
|
5
|
-
__all__ = ['MiniMindConfig', 'RMSNorm', 'Attention', 'FeedForward', 'MiniMindBlock', 'MiniMindModel', 'MiniMindForCausalLM', 'precompute_freqs_cis', 'apply_rotary_pos_emb', 'repeat_kv', 'PretrainDataset']
|
|
6
|
+
__all__ = ['MiniMindConfig', 'RMSNorm', 'Attention', 'FeedForward', 'MiniMindBlock', 'MiniMindModel', 'MiniMindForCausalLM', 'precompute_freqs_cis', 'apply_rotary_pos_emb', 'repeat_kv', 'PretrainDataset', 'SFTDataset', 'pre_processing_chat']
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# SFTDataset, pre_processing_chat 定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import random
|
|
7
|
+
import torch
|
|
8
|
+
from datasets import load_dataset, Features, Value
|
|
9
|
+
from torch.utils.data import Dataset
|
|
10
|
+
|
|
11
|
+
class SFTDataset(Dataset):
    """Supervised fine-tuning dataset that tokenizes chat data for next-token prediction.

    Differences from ``PretrainDataset`` (per the original author's notes):

    - records have the shape ``{"conversations": [...]}`` instead of ``{"text": "..."}``
    - label masking: only assistant replies keep their token ids as labels;
      every other position is labelled ``-100`` so it is left out of the loss
    - conversations are rendered to text with ``tokenizer.apply_chat_template``
    """

    def __init__(self, jsonl_path, tokenizer, max_length=768):
        """Load the JSONL conversations and pre-tokenize the assistant span markers.

        Args:
            jsonl_path: Path (or paths) handed to ``load_dataset('json', data_files=...)``.
            tokenizer: Tokenizer exposing ``bos_token``, ``eos_token``, ``pad_token_id``,
                ``apply_chat_template`` and being callable on text.
            max_length: Fixed length every sample is truncated / padded to.
        """
        super().__init__()
        # NOTE(review): presumably silences the tokenizers fork/parallelism warning
        # when used with DataLoader workers -- confirm. This mutates process-wide state.
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Explicit schema: every message field is read as a string.
        features = Features({
            'conversations': [{'role': Value('string'), 'content': Value('string'),
                               'reasoning_content': Value('string'), 'tools': Value('string'),
                               'tool_calls': Value('string')}]
        })
        self.samples = load_dataset('json', data_files=jsonl_path, split='train', features=features)
        # Token-id sequences that open and close an assistant reply, computed once.
        self.bos_id = tokenizer(f'{tokenizer.bos_token}assistant\n', add_special_tokens=False).input_ids
        self.eos_id = tokenizer(f'{tokenizer.eos_token}\n', add_special_tokens=False).input_ids

    def __len__(self):
        """Return the number of conversation samples."""
        return len(self.samples)

    def create_chat_prompt(self, conversations):
        """Render a list of message dicts to a single prompt string via the chat template.

        JSON-encoded ``tools`` (taken from a system message) and ``tool_calls``
        strings are decoded before being passed to the template.
        """
        messages = []
        tools = None
        for message in conversations:
            # Copy so the decoded ``tool_calls`` never leaks back into the caller's data.
            message = dict(message)
            if message.get("role") == "system" and message.get("tools"):
                tools = json.loads(message["tools"]) if isinstance(message["tools"], str) else message["tools"]
            if message.get("tool_calls") and isinstance(message["tool_calls"], str):
                message["tool_calls"] = json.loads(message["tool_calls"])
            messages.append(message)
        return self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False, tools=tools
        )

    def generate_labels(self, input_ids):
        """Build the label list: assistant reply tokens keep their id, the rest are ``-100``.

        A reply span starts right after an occurrence of ``self.bos_id`` and runs
        through the next occurrence of ``self.eos_id`` (inclusive). If no end
        marker is found (e.g. the reply was truncated) the span runs to the end
        of the sequence.

        Args:
            input_ids: List of token ids.

        Returns:
            A list of the same length as ``input_ids``.
        """
        total = len(input_ids)
        bos_len = len(self.bos_id)
        eos_len = len(self.eos_id)
        labels = [-100] * total
        i = 0
        while i < total:
            if input_ids[i:i + bos_len] == self.bos_id:
                start = i + bos_len
                end = start
                # Scan forward for the matching end marker.
                while end < total:
                    if input_ids[end:end + eos_len] == self.eos_id:
                        break
                    end += 1
                # Label the reply including its end marker. The span is capped by
                # both max_length and the real sequence length: capping with
                # max_length alone overran ``labels`` on sequences shorter than
                # max_length whose reply had no end marker.
                stop = min(end + eos_len, self.max_length, total)
                for j in range(start, stop):
                    labels[j] = input_ids[j]
                i = end + eos_len if end < total else total
            else:
                i += 1
        return labels

    def __getitem__(self, index):
        """Return ``(input_ids, labels)`` as two long tensors of length ``max_length``."""
        sample = self.samples[index]
        conversations = pre_processing_chat(sample['conversations'])
        prompt = self.create_chat_prompt(conversations)
        input_ids = self.tokenizer(prompt).input_ids[:self.max_length]
        # Right-pad to the fixed length.
        input_ids += [self.tokenizer.pad_token_id] * (self.max_length - len(input_ids))
        labels = self.generate_labels(input_ids)
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def pre_processing_chat(conversations, add_system_ratio=0.2):
    """Pre-process a conversation by probabilistically prepending a system prompt.

    Args:
        conversations: List of message dicts (each read with ``.get('role')`` /
            ``.get('tools')``).
        add_system_ratio: Probability of prepending a system message when the
            conversation does not already start with one.

    Returns:
        The original list unchanged, or a new list with one system message in
        front. The input list is never mutated.
    """
    # An empty conversation has no first message to inspect; return it as-is
    # instead of failing on conversations[0].
    if not conversations:
        return conversations

    # Tool-use samples are kept exactly as they are.
    if any(conv.get('tools') for conv in conversations):
        return conversations

    system_prompts = (
        "你是一个知识丰富的AI,尽力为用户提供准确的信息。",
        "你是minimind,一个小巧但有用的语言模型。",
        "你是一个专业的AI助手,请提供有价值的回答。",
        "你是minimind,请尽力帮助用户解决问题。",
        "你是一个可靠的AI,请给出准确的回答。",
        "You are a helpful AI assistant.",
        "You are minimind, a lightweight intelligent assistant.",
        "You are a friendly chatbot. Please answer the user's questions carefully.",
        "You are a knowledgeable AI. Try your best to provide accurate information.",
        "You are minimind, a small but useful language model.",
    )

    # Only roll the dice when there is no leading system message already.
    if conversations[0].get('role') != 'system' and random.random() < add_system_ratio:
        return [{'role': 'system', 'content': random.choice(system_prompts)}] + conversations
    return conversations
|
package/src/commands/data.js
CHANGED
|
@@ -8,8 +8,7 @@ const { prompt } = pkg
|
|
|
8
8
|
import fs from 'fs'
|
|
9
9
|
import path from 'path'
|
|
10
10
|
import os from 'os'
|
|
11
|
-
import { spawn } from '
|
|
12
|
-
import { execSync } from 'child_process'
|
|
11
|
+
import { spawn, execSync } from '../verbose.js'
|
|
13
12
|
import AdmZip from 'adm-zip'
|
|
14
13
|
|
|
15
14
|
// 配置文件路径
|
|
@@ -74,11 +73,20 @@ const DATASETS = [
|
|
|
74
73
|
{
|
|
75
74
|
id: 'minimind-pretrain',
|
|
76
75
|
name: 'MiniMind Pretrain (LLM预训练语料)',
|
|
77
|
-
url: 'https://www.modelscope.cn/datasets/icyfenix/
|
|
76
|
+
url: 'https://www.modelscope.cn/datasets/icyfenix/Minimind_Pretrain.git',
|
|
78
77
|
size: '~1.2GB',
|
|
79
78
|
format: 'git',
|
|
80
79
|
targetDir: 'datasets/minimind-pretrain',
|
|
81
80
|
source: 'ModelScope (icyfenix)'
|
|
81
|
+
},
|
|
82
|
+
{
|
|
83
|
+
id: 'minimind-sft',
|
|
84
|
+
name: 'MiniMind SFT (LLM监督微调语料)',
|
|
85
|
+
url: 'https://www.modelscope.cn/datasets/icyfenix/Minimind_SFT.git',
|
|
86
|
+
size: '~500MB',
|
|
87
|
+
format: 'git',
|
|
88
|
+
targetDir: 'datasets/minimind-sft',
|
|
89
|
+
source: 'ModelScope (icyfenix)'
|
|
82
90
|
}
|
|
83
91
|
]
|
|
84
92
|
|
|
@@ -301,10 +309,11 @@ async function showMainMenu(dataPath) {
|
|
|
301
309
|
const choices = [
|
|
302
310
|
{ name: '1', message: '挂载路径设置 ' + chalk.gray(`[当前: ${dataPath}]`) },
|
|
303
311
|
{ name: '2', message: '下载数据集' },
|
|
304
|
-
{ name: '3', message: '
|
|
305
|
-
{ name: '4', message: '
|
|
306
|
-
{ name: '5', message: '
|
|
307
|
-
{ name: '6', message: '
|
|
312
|
+
{ name: '3', message: '删除数据集' },
|
|
313
|
+
{ name: '4', message: '查看数据集列表' },
|
|
314
|
+
{ name: '5', message: '清空数据内容' },
|
|
315
|
+
{ name: '6', message: '删除数据卷' },
|
|
316
|
+
{ name: '7', message: '退出' }
|
|
308
317
|
]
|
|
309
318
|
|
|
310
319
|
const { action } = await prompt({
|
|
@@ -452,6 +461,117 @@ async function removeData() {
|
|
|
452
461
|
console.log(chalk.green('数据卷已删除'))
|
|
453
462
|
}
|
|
454
463
|
|
|
464
|
+
/**
 * Interactive sub-menu for deleting datasets from the data volume.
 *
 * Lists every known dataset that exists on disk, lets the user multi-select
 * and confirm, then removes each selected directory and drops its id from the
 * config's `installedDatasets` / `incompleteDatasets` lists.
 *
 * A user-cancel error raised by a prompt returns quietly; any other error is
 * re-thrown to the caller.
 */
async function deleteDatasets() {
  const dataPath = getDataVolumePath()

  console.log()
  console.log(chalk.bold('删除数据集'))
  console.log()

  // Datasets present on disk, including incomplete downloads.
  const existingDatasets = DATASETS.filter((candidate) => isDatasetExists(dataPath, candidate.id))
  if (!existingDatasets.length) {
    console.log(chalk.yellow('没有已下载的数据集'))
    return
  }

  // One prompt choice per dataset, suffixed with its on-disk status.
  const choices = existingDatasets.map(({ id, name, size }) => {
    const downloaded = isDatasetDownloaded(dataPath, id)
    const incomplete = isDatasetIncomplete(dataPath, id)
    const status = downloaded ? ' [可用]' : incomplete ? ' [不完整]' : ' [存在]'
    return { name: id, message: `${name} (${size})${status}` }
  })

  console.log(chalk.gray('操作: 上下键移动,空格勾选/取消,回车确认,ESC 返回'))
  console.log()

  try {
    const selection = await prompt({
      type: 'multiselect',
      name: 'selected',
      message: '选择要删除的数据集',
      choices,
      hint: '空格选择,回车确认删除',
      styles: {
        primary: chalk.cyan.bold
      }
    })
    const { selected } = selection

    if (!selected || selected.length === 0) {
      console.log(chalk.yellow('未选择任何数据集'))
      return
    }

    // Show the human-readable names before asking for confirmation.
    const selectedNames = selected.map((id) => existingDatasets.find((d) => d.id === id).name)

    console.log()
    console.log(chalk.red(`将删除以下数据集: ${selectedNames.join(', ')}`))

    const answer = await prompt({
      type: 'confirm',
      name: 'confirm',
      message: '确认删除?',
      initial: false
    })

    if (!answer.confirm) {
      console.log(chalk.yellow('操作已取消'))
      return
    }

    // Remove each dataset directory, then persist the updated config.
    for (const datasetId of selected) {
      const dataset = DATASETS.find((d) => d.id === datasetId)
      const targetDir = path.join(dataPath, dataset.targetDir)

      if (fs.existsSync(targetDir)) {
        fs.rmSync(targetDir, { recursive: true, force: true })
        console.log(chalk.green(`已删除: ${dataset.name}`))
      }

      // The config is re-read and re-written per dataset.
      const config = readConfig()
      for (const key of ['installedDatasets', 'incompleteDatasets']) {
        if (config[key]) {
          config[key] = config[key].filter((id) => id !== datasetId)
        }
      }
      writeConfig(config)
    }

    console.log()
    console.log(chalk.green('删除完成'))
  } catch (error) {
    if (!isUserCancel(error)) {
      throw error
    }
    console.log(chalk.gray('返回上一级'))
  }
}
|
|
574
|
+
|
|
455
575
|
/**
|
|
456
576
|
* 查看数据集列表
|
|
457
577
|
*/
|
|
@@ -993,9 +1113,20 @@ export async function runDataTUI() {
|
|
|
993
1113
|
}
|
|
994
1114
|
break
|
|
995
1115
|
case 3:
|
|
996
|
-
|
|
1116
|
+
try {
|
|
1117
|
+
await deleteDatasets()
|
|
1118
|
+
} catch (error) {
|
|
1119
|
+
if (isUserCancel(error)) {
|
|
1120
|
+
console.log(chalk.gray('返回主菜单'))
|
|
1121
|
+
} else {
|
|
1122
|
+
throw error
|
|
1123
|
+
}
|
|
1124
|
+
}
|
|
997
1125
|
break
|
|
998
1126
|
case 4:
|
|
1127
|
+
listDatasets()
|
|
1128
|
+
break
|
|
1129
|
+
case 5:
|
|
999
1130
|
try {
|
|
1000
1131
|
await clearData()
|
|
1001
1132
|
} catch (error) {
|
|
@@ -1006,7 +1137,7 @@ export async function runDataTUI() {
|
|
|
1006
1137
|
}
|
|
1007
1138
|
}
|
|
1008
1139
|
break
|
|
1009
|
-
case
|
|
1140
|
+
case 6:
|
|
1010
1141
|
try {
|
|
1011
1142
|
await removeData()
|
|
1012
1143
|
} catch (error) {
|
|
@@ -1017,7 +1148,7 @@ export async function runDataTUI() {
|
|
|
1017
1148
|
}
|
|
1018
1149
|
}
|
|
1019
1150
|
break
|
|
1020
|
-
case
|
|
1151
|
+
case 7:
|
|
1021
1152
|
console.log()
|
|
1022
1153
|
console.log(chalk.gray('已退出数据管理'))
|
|
1023
1154
|
console.log()
|
|
@@ -1068,6 +1199,9 @@ export async function runDataCommand(subCommand, options) {
|
|
|
1068
1199
|
case 'download':
|
|
1069
1200
|
await downloadDatasets()
|
|
1070
1201
|
break
|
|
1202
|
+
case 'delete':
|
|
1203
|
+
await deleteDatasets()
|
|
1204
|
+
break
|
|
1071
1205
|
default:
|
|
1072
1206
|
// 无子命令时进入 TUI
|
|
1073
1207
|
await runDataTUI()
|
package/src/commands/manage.js
CHANGED
package/src/commands/server.js
CHANGED
package/src/commands/update.js
CHANGED
package/src/index.js
CHANGED
|
@@ -12,6 +12,7 @@ import { runDoctor } from './commands/manage.js'
|
|
|
12
12
|
import { runDataTUI, runDataCommand } from './commands/data.js'
|
|
13
13
|
import { runImagesTUI } from './commands/images.js'
|
|
14
14
|
import { runUpdate } from './commands/update.js'
|
|
15
|
+
import { setVerbose } from './verbose.js'
|
|
15
16
|
|
|
16
17
|
// 从 package.json 读取版本号
|
|
17
18
|
const __filename = fileURLToPath(import.meta.url)
|
|
@@ -85,6 +86,13 @@ program
|
|
|
85
86
|
.version(VERSION, '-v, --version', '显示版本号')
|
|
86
87
|
.helpOption('-h, --help', '显示帮助信息')
|
|
87
88
|
.addHelpCommand('help [command]', '显示命令帮助信息')
|
|
89
|
+
.option('--verbose', '显示所有执行的外部命令,便于调试')
|
|
90
|
+
|
|
91
|
+
// 解析全局选项(需要在子命令 action 之前解析,以便设置 verbose 开关)
|
|
92
|
+
program.parseOptions(process.argv)
|
|
93
|
+
if (program.opts().verbose) {
|
|
94
|
+
setVerbose(true)
|
|
95
|
+
}
|
|
88
96
|
|
|
89
97
|
// ─────────────────────────────────────────────────────────────
|
|
90
98
|
// start 命令
|
package/src/verbose.js
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 命令执行封装
|
|
3
|
+
* --verbose 模式下打印所有执行的外部命令,方便调试
|
|
4
|
+
*/
|
|
5
|
+
import { execSync as nodeExecSync, spawn as nodeSpawn } from 'child_process'
|
|
6
|
+
|
|
7
|
+
// 全局 verbose 开关
|
|
8
|
+
let verboseEnabled = false
|
|
9
|
+
|
|
10
|
+
/**
 * Turn verbose mode on or off.
 *
 * @param {*} enabled - Any value; it is coerced to a boolean before being stored.
 */
export function setVerbose(enabled) {
  verboseEnabled = Boolean(enabled)
}
|
|
16
|
+
|
|
17
|
+
/**
 * Report whether verbose mode is currently enabled.
 *
 * @returns {boolean} The current value of the module-level verbose switch.
 */
export function isVerbose() {
  return verboseEnabled
}
|
|
23
|
+
|
|
24
|
+
/**
 * Print a command line to stdout, only when verbose mode is enabled.
 *
 * @param {string} cmd - Command (or full shell command string).
 * @param {Array} [args] - Optional argument list; each entry is wrapped in
 *   single quotes and appended after the command.
 */
function verboseLog(cmd, args) {
  if (!verboseEnabled) {
    return
  }

  let commandLine = cmd
  if (args && args.length > 0) {
    const quoted = args.map((arg) => `'${arg}'`)
    commandLine = `${cmd} ${quoted.join(' ')}`
  }

  console.log(`[verbose] $ ${commandLine}`)
}
|
|
34
|
+
|
|
35
|
+
/**
 * Wrapper around child_process.execSync that logs the command in verbose mode.
 *
 * Both parameters are forwarded unchanged to the native execSync, and its
 * return value is handed back as-is.
 */
export function execSync(command, options) {
  verboseLog(command)
  const output = nodeExecSync(command, options)
  return output
}
|
|
43
|
+
|
|
44
|
+
/**
 * Wrapper around child_process.spawn that logs the command in verbose mode.
 *
 * All three parameters are forwarded unchanged to the native spawn, and the
 * child process object it returns is handed back as-is.
 */
export function spawn(command, args, options) {
  verboseLog(command, args)
  const child = nodeSpawn(command, args, options)
  return child
}
|
package/version.json
CHANGED