npm - @icyfenix-dmla/cli - Versions diffs - 2026.5.24-16 → 2026.5.24-2045 - Mend

@icyfenix-dmla/cli 2026.5.24-16 → 2026.5.24-2045

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/package.json +1 -1
package/shared/llm/__init__.py +2 -1
package/shared/llm/sftdataset.py +108 -0
package/src/commands/data.js +144 -10
package/src/commands/manage.js +1 -1
package/src/commands/server.js +1 -1
package/src/commands/update.js +1 -1
package/src/index.js +8 -0
package/src/verbose.js +51 -0
package/version.json +2 -2

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@icyfenix-dmla/cli",
-  "version": "2026.5.24-16",
+  "version": "2026.5.24-2045",
   "description": "DMLA 沙箱服务命令行工具",
   "type": "module",
   "main": "src/index.js",

package/shared/llm/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # LLM 模块
 from .mini_mind_config import MiniMindConfig, RMSNorm, Attention, FeedForward, MiniMindBlock, MiniMindModel, MiniMindForCausalLM, precompute_freqs_cis, apply_rotary_pos_emb, repeat_kv
 from .pretrain_dataset import PretrainDataset
+from .sftdataset import SFTDataset, pre_processing_chat
-__all__ = ['MiniMindConfig', 'RMSNorm', 'Attention', 'FeedForward', 'MiniMindBlock', 'MiniMindModel', 'MiniMindForCausalLM', 'precompute_freqs_cis', 'apply_rotary_pos_emb', 'repeat_kv', 'PretrainDataset']
+__all__ = ['MiniMindConfig', 'RMSNorm', 'Attention', 'FeedForward', 'MiniMindBlock', 'MiniMindModel', 'MiniMindForCausalLM', 'precompute_freqs_cis', 'apply_rotary_pos_emb', 'repeat_kv', 'PretrainDataset', 'SFTDataset', 'pre_processing_chat']

package/shared/llm/sftdataset.py ADDED Viewed

@@ -0,0 +1,108 @@
+# SFTDataset, pre_processing_chat 定义
+# 从文档自动提取生成
+import json
+import os
+import random
+import torch
+from datasets import load_dataset, Features, Value
+from torch.utils.data import Dataset
+class SFTDataset(Dataset):
+    """
+    SFT 数据集：将对话数据 tokenize 为 next-token prediction 格式
+    与 PretrainDataset 的核心差异：
+    - 数据格式从 {"text": "..."} 变为 {"conversations": [...]}
+    - 标签掩码：仅 assistant 回答部分参与 loss，其余标记为 -100
+    - 使用 apply_chat_template 将对话转为 ChatML 格式
+    """
+    def __init__(self, jsonl_path, tokenizer, max_length=768):
+        super().__init__()
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        features = Features({
+            'conversations': [{'role': Value('string'), 'content': Value('string'),
+                              'reasoning_content': Value('string'), 'tools': Value('string'),
+                              'tool_calls': Value('string')}]
+        })
+        self.samples = load_dataset('json', data_files=jsonl_path, split='train', features=features)
+        # 预计算 assistant 回答的起止标记 ID
+        self.bos_id = tokenizer(f'{tokenizer.bos_token}assistant\n', add_special_tokens=False).input_ids
+        self.eos_id = tokenizer(f'{tokenizer.eos_token}\n', add_special_tokens=False).input_ids
+    def __len__(self):
+        return len(self.samples)
+    def create_chat_prompt(self, conversations):
+        """将对话列表应用 chat template 转为文本"""
+        messages = []
+        tools = None
+        for message in conversations:
+            message = dict(message)
+            if message.get("role") == "system" and message.get("tools"):
+                tools = json.loads(message["tools"]) if isinstance(message["tools"], str) else message["tools"]
+            if message.get("tool_calls") and isinstance(message["tool_calls"], str):
+                message["tool_calls"] = json.loads(message["tool_calls"])
+            messages.append(message)
+        return self.tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=False, tools=tools
+        )
+    def generate_labels(self, input_ids):
+        """生成标签：assistant 回答部分保留原始 ID，其余设为 -100"""
+        labels = [-100] * len(input_ids)
+        i = 0
+        while i < len(input_ids):
+            # 检测 <|im_start|>assistant\n 的位置
+            if input_ids[i:i + len(self.bos_id)] == self.bos_id:
+                start = i + len(self.bos_id)
+                end = start
+                # 查找对应的 <|im_end|>\n
+                while end < len(input_ids):
+                    if input_ids[end:end + len(self.eos_id)] == self.eos_id:
+                        break
+                    end += 1
+                # 标记回答区间（包含 eos）
+                for j in range(start, min(end + len(self.eos_id), self.max_length)):
+                    labels[j] = input_ids[j]
+                i = end + len(self.eos_id) if end < len(input_ids) else len(input_ids)
+            else:
+                i += 1
+        return labels
+    def __getitem__(self, index):
+        sample = self.samples[index]
+        conversations = pre_processing_chat(sample['conversations'])
+        prompt = self.create_chat_prompt(conversations)
+        input_ids = self.tokenizer(prompt).input_ids[:self.max_length]
+        # 填充到固定长度
+        input_ids += [self.tokenizer.pad_token_id] * (self.max_length - len(input_ids))
+        labels = self.generate_labels(input_ids)
+        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)
+def pre_processing_chat(conversations, add_system_ratio=0.2):
+    """预处理对话数据：概率性添加系统提示词"""
+    # tool use 数据完整保留不做处理
+    if any(conv.get('tools') for conv in conversations):
+        return conversations
+    SYSTEM_PROMPTS = [
+        "你是一个知识丰富的AI，尽力为用户提供准确的信息。",
+        "你是minimind，一个小巧但有用的语言模型。",
+        "你是一个专业的AI助手，请提供有价值的回答。",
+        "你是minimind，请尽力帮助用户解决问题。",
+        "你是一个可靠的AI，请给出准确的回答。",
+        "You are a helpful AI assistant.",
+        "You are minimind, a lightweight intelligent assistant.",
+        "You are a friendly chatbot. Please answer the user's questions carefully.",
+        "You are a knowledgeable AI. Try your best to provide accurate information.",
+        "You are minimind, a small but useful language model."
+    ]
+    # 概率性添加 system
+    if conversations[0].get('role') != 'system':
+        if random.random() < add_system_ratio:
+            return [{'role': 'system', 'content': random.choice(SYSTEM_PROMPTS)}] + conversations
+    return conversations

package/src/commands/data.js CHANGED Viewed

@@ -8,8 +8,7 @@ const { prompt } = pkg
 import fs from 'fs'
 import path from 'path'
 import os from 'os'
-import { spawn } from 'child_process'
-import { execSync } from 'child_process'
+import { spawn, execSync } from '../verbose.js'
 import AdmZip from 'adm-zip'
 // 配置文件路径
@@ -74,11 +73,20 @@ const DATASETS = [
   {
     id: 'minimind-pretrain',
     name: 'MiniMind Pretrain (LLM预训练语料)',
-    url: 'https://www.modelscope.cn/datasets/icyfenix/minimind_pretrain.git',
+    url: 'https://www.modelscope.cn/datasets/icyfenix/Minimind_Pretrain.git',
     size: '~1.2GB',
     format: 'git',
     targetDir: 'datasets/minimind-pretrain',
     source: 'ModelScope (icyfenix)'
+  },
+  {
+    id: 'minimind-sft',
+    name: 'MiniMind SFT (LLM监督微调语料)',
+    url: 'https://www.modelscope.cn/datasets/icyfenix/Minimind_SFT.git',
+    size: '~500MB',
+    format: 'git',
+    targetDir: 'datasets/minimind-sft',
+    source: 'ModelScope (icyfenix)'
   }
 ]
@@ -301,10 +309,11 @@ async function showMainMenu(dataPath) {
   const choices = [
     { name: '1', message: '挂载路径设置        ' + chalk.gray(`[当前: ${dataPath}]`) },
     { name: '2', message: '下载数据集' },
-    { name: '3', message: '查看数据集列表' },
-    { name: '4', message: '清空数据内容' },
-    { name: '5', message: '删除数据卷' },
-    { name: '6', message: '退出' }
+    { name: '3', message: '删除数据集' },
+    { name: '4', message: '查看数据集列表' },
+    { name: '5', message: '清空数据内容' },
+    { name: '6', message: '删除数据卷' },
+    { name: '7', message: '退出' }
   ]
   const { action } = await prompt({
@@ -452,6 +461,117 @@ async function removeData() {
   console.log(chalk.green('数据卷已删除'))
 }
+/**
+ * 删除数据集子菜单
+ */
+async function deleteDatasets() {
+  const dataPath = getDataVolumePath()
+  console.log()
+  console.log(chalk.bold('删除数据集'))
+  console.log()
+  // 收集已下载（含不完整）的数据集
+  const existingDatasets = DATASETS.filter(d => isDatasetExists(dataPath, d.id))
+  if (existingDatasets.length === 0) {
+    console.log(chalk.yellow('没有已下载的数据集'))
+    return
+  }
+  // 构建选项列表
+  const choices = existingDatasets.map((dataset) => {
+    const downloaded = isDatasetDownloaded(dataPath, dataset.id)
+    const incomplete = isDatasetIncomplete(dataPath, dataset.id)
+    let message = `${dataset.name} (${dataset.size})`
+    if (downloaded) {
+      message += ' [可用]'
+    } else if (incomplete) {
+      message += ' [不完整]'
+    } else {
+      message += ' [存在]'
+    }
+    return {
+      name: dataset.id,
+      message
+    }
+  })
+  console.log(chalk.gray('操作: 上下键移动，空格勾选/取消，回车确认，ESC 返回'))
+  console.log()
+  try {
+    const { selected } = await prompt({
+      type: 'multiselect',
+      name: 'selected',
+      message: '选择要删除的数据集',
+      choices,
+      hint: '空格选择，回车确认删除',
+      styles: {
+        primary: chalk.cyan.bold
+      }
+    })
+    if (!selected || selected.length === 0) {
+      console.log(chalk.yellow('未选择任何数据集'))
+      return
+    }
+    // 确认删除
+    const selectedNames = selected.map(id => {
+      const ds = existingDatasets.find(d => d.id === id)
+      return ds.name
+    })
+    console.log()
+    console.log(chalk.red(`将删除以下数据集: ${selectedNames.join(', ')}`))
+    const { confirm } = await prompt({
+      type: 'confirm',
+      name: 'confirm',
+      message: '确认删除?',
+      initial: false
+    })
+    if (!confirm) {
+      console.log(chalk.yellow('操作已取消'))
+      return
+    }
+    // 执行删除
+    for (const datasetId of selected) {
+      const dataset = DATASETS.find(d => d.id === datasetId)
+      const targetDir = path.join(dataPath, dataset.targetDir)
+      if (fs.existsSync(targetDir)) {
+        fs.rmSync(targetDir, { recursive: true, force: true })
+        console.log(chalk.green(`已删除: ${dataset.name}`))
+      }
+      // 更新配置
+      const config = readConfig()
+      if (config.installedDatasets) {
+        config.installedDatasets = config.installedDatasets.filter(id => id !== datasetId)
+      }
+      if (config.incompleteDatasets) {
+        config.incompleteDatasets = config.incompleteDatasets.filter(id => id !== datasetId)
+      }
+      writeConfig(config)
+    }
+    console.log()
+    console.log(chalk.green('删除完成'))
+  } catch (error) {
+    if (isUserCancel(error)) {
+      console.log(chalk.gray('返回上一级'))
+      return
+    }
+    throw error
+  }
+}
 /**
  * 查看数据集列表
  */
@@ -993,9 +1113,20 @@ export async function runDataTUI() {
           }
           break
         case 3:
-          listDatasets()
+          try {
+            await deleteDatasets()
+          } catch (error) {
+            if (isUserCancel(error)) {
+              console.log(chalk.gray('返回主菜单'))
+            } else {
+              throw error
+            }
+          }
           break
         case 4:
+          listDatasets()
+          break
+        case 5:
           try {
             await clearData()
           } catch (error) {
@@ -1006,7 +1137,7 @@ export async function runDataTUI() {
             }
           }
           break
-        case 5:
+        case 6:
           try {
             await removeData()
           } catch (error) {
@@ -1017,7 +1148,7 @@ export async function runDataTUI() {
             }
           }
           break
-        case 6:
+        case 7:
           console.log()
           console.log(chalk.gray('已退出数据管理'))
           console.log()
@@ -1068,6 +1199,9 @@ export async function runDataCommand(subCommand, options) {
     case 'download':
       await downloadDatasets()
       break
+    case 'delete':
+      await deleteDatasets()
+      break
     default:
       // 无子命令时进入 TUI
       await runDataTUI()

package/src/commands/manage.js CHANGED Viewed

@@ -3,7 +3,7 @@
  */
 import chalk from 'chalk'
 import Docker from 'dockerode'
-import { spawn, execSync } from 'child_process'
+import { spawn, execSync } from '../verbose.js'
 import http from 'http'
 import path from 'path'
 import { fileURLToPath } from 'url'

package/src/commands/server.js CHANGED Viewed

@@ -3,7 +3,7 @@
  */
 import chalk from 'chalk'
 import Docker from 'dockerode'
-import { spawn } from 'child_process'
+import { spawn, execSync } from '../verbose.js'
 import http from 'http'
 import path from 'path'
 import { fileURLToPath, pathToFileURL } from 'url'

package/src/commands/update.js CHANGED Viewed

@@ -3,7 +3,7 @@
  * 通过 npm 更新程序
  */
 import chalk from 'chalk'
-import { execSync } from 'child_process'
+import { execSync } from '../verbose.js'
 /**
  * 运行 update 命令

package/src/index.js CHANGED Viewed

@@ -12,6 +12,7 @@ import { runDoctor } from './commands/manage.js'
 import { runDataTUI, runDataCommand } from './commands/data.js'
 import { runImagesTUI } from './commands/images.js'
 import { runUpdate } from './commands/update.js'
+import { setVerbose } from './verbose.js'
 // 从 package.json 读取版本号
 const __filename = fileURLToPath(import.meta.url)
@@ -85,6 +86,13 @@ program
   .version(VERSION, '-v, --version', '显示版本号')
   .helpOption('-h, --help', '显示帮助信息')
   .addHelpCommand('help [command]', '显示命令帮助信息')
+  .option('--verbose', '显示所有执行的外部命令，便于调试')
+// 解析全局选项（需要在子命令 action 之前解析，以便设置 verbose 开关）
+program.parseOptions(process.argv)
+if (program.opts().verbose) {
+  setVerbose(true)
+}
 // ─────────────────────────────────────────────────────────────
 // start 命令

package/src/verbose.js ADDED Viewed

@@ -0,0 +1,51 @@
+/**
+ * 命令执行封装
+ * --verbose 模式下打印所有执行的外部命令，方便调试
+ */
+import { execSync as nodeExecSync, spawn as nodeSpawn } from 'child_process'
+// 全局 verbose 开关
+let verboseEnabled = false
+/**
+ * 启用/禁用 verbose 模式
+ */
+export function setVerbose(enabled) {
+  verboseEnabled = !!enabled
+}
+/**
+ * 查询 verbose 模式状态
+ */
+export function isVerbose() {
+  return verboseEnabled
+}
+/**
+ * 打印 verbose 日志（仅在 verbose 模式下输出）
+ */
+function verboseLog(cmd, args) {
+  if (!verboseEnabled) return
+  const fullCmd = args && args.length > 0
+    ? `${cmd} ${args.map(a => `'${a}'`).join(' ')}`
+    : cmd
+  console.log(`[verbose] $ ${fullCmd}`)
+}
+/**
+ * 封装 execSync，verbose 模式下打印命令
+ * 参数与原生 execSync 完全一致
+ */
+export function execSync(command, options) {
+  verboseLog(command)
+  return nodeExecSync(command, options)
+}
+/**
+ * 封装 spawn，verbose 模式下打印命令
+ * 参数与原生 spawn 完全一致
+ */
+export function spawn(command, args, options) {
+  verboseLog(command, args)
+  return nodeSpawn(command, args, options)
+}

package/version.json CHANGED Viewed

@@ -1,4 +1,4 @@
 {
-  "buildTime": "2026-05-23T16:41:15.404Z",
-  "cliVersion": "2026.5.24-16"
+  "buildTime": "2026-05-24T12:46:48.154Z",
+  "cliVersion": "2026.5.24-2045"
 }