@icyfenix-dmla/cli 2026.5.14-2 → 2026.5.24-1015

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@icyfenix-dmla/cli",
3
- "version": "2026.5.14-2",
3
+ "version": "2026.5.24-1015",
4
4
  "description": "DMLA 沙箱服务命令行工具",
5
5
  "type": "module",
6
6
  "main": "src/index.js",
@@ -1,19 +1,9 @@
1
1
  # shared 模块包初始化
2
2
  # 包含统计学习系列文档中可复用的类定义
3
3
  #
4
- # 注意:不在此处自动导入子模块,避免加载不必要的依赖
5
- # 使用时请直接导入需要的模块,例如:
6
- # from shared.sequence_models.poetry_lstm import PoetryLSTM
7
- # from shared.linear.logistic_regression import LogisticRegression
4
+ # 使用方式:显式导入需要的模块
5
+ # from shared.sequence_models import PoetryLSTM, PoetryDataset
6
+ # from shared.cnn import AlexNet
7
+ # from shared.linear import LogisticRegression
8
8
 
9
- __all__ = [
10
- 'bayesian',
11
- 'cnn',
12
- 'gan',
13
- 'linear',
14
- 'neural',
15
- 'sequence_models',
16
- 'svm',
17
- 'tree',
18
- 'unsupervised',
19
- ]
9
+ __all__ = ['bayesian', 'cnn', 'gan', 'linear', 'llm', 'neural', 'sequence_models', 'svm', 'tree', 'unsupervised']
@@ -1,6 +1,7 @@
1
- # GaussianMixtureModel 类定义
1
+ # GaussianMixtureModel 定义
2
2
  # 从文档自动提取生成
3
3
 
4
+ import matplotlib.pyplot as plt
4
5
  import numpy as np
5
6
 
6
7
  class GaussianMixtureModel:
@@ -1,6 +1,7 @@
1
- # MultinomialNaiveBayes 类定义
1
+ # MultinomialNaiveBayes 定义
2
2
  # 从文档自动提取生成
3
3
 
4
+ import matplotlib.pyplot as plt
4
5
  import numpy as np
5
6
 
6
7
  class MultinomialNaiveBayes:
@@ -1,6 +1,8 @@
1
- # SimpleBayesianNetwork 类定义
1
+ # SimpleBayesianNetwork 定义
2
2
  # 从文档自动提取生成
3
3
 
4
+ import matplotlib.pyplot as plt
5
+
4
6
  class SimpleBayesianNetwork:
5
7
  """
6
8
  简单贝叶斯网络实现
@@ -1,11 +1,15 @@
1
1
  # CNN 模块
2
2
  from .alexnet import AlexNet
3
- from .lmdb_dataset import LMDBDataset
4
- from .lmdb_dataset import LMDBValDataset
5
- from .lmdbpreprocess_cache import LMDBPreprocessCache
3
+ try:
4
+ from .lmdb_dataset import LMDBDataset, LMDBValDataset
5
+ except ImportError:
6
+ pass # 可选依赖 lmdb 未安装
7
+ try:
8
+ from .lmdbpreprocess_cache import LMDBPreprocessCache
9
+ except ImportError:
10
+ pass # 可选依赖 lmdb 未安装
6
11
  from .minimal_preprocess_cache import MinimalPreprocessCache
7
- from .realtime_dataset import RealtimeAugmentDataset
8
- from .realtime_dataset import RealtimeValDataset
12
+ from .realtime_dataset import RealtimeAugmentDataset, RealtimeValDataset, _get_perf_log
9
13
  from .tiny_imagenet_dataset import TinyImageNetDataset
10
14
 
11
- __all__ = ['AlexNet', 'LMDBDataset', 'LMDBValDataset', 'LMDBPreprocessCache', 'MinimalPreprocessCache', 'RealtimeAugmentDataset', 'RealtimeValDataset', 'TinyImageNetDataset']
15
+ __all__ = ['AlexNet', 'LMDBDataset', 'LMDBValDataset', 'LMDBPreprocessCache', 'MinimalPreprocessCache', 'RealtimeAugmentDataset', 'RealtimeValDataset', '_get_perf_log', 'TinyImageNetDataset']
@@ -1,4 +1,4 @@
1
- # AlexNet 类定义
1
+ # AlexNet 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import torch
@@ -1,7 +1,9 @@
1
- # LMDBPreprocessCache 类定义
1
+ # LMDBPreprocessCache 定义
2
2
  # 从文档自动提取生成
3
3
 
4
+ import json
4
5
  import os
6
+ from dmla_progress import ProgressReporter
5
7
  from PIL import Image
6
8
 
7
9
  class LMDBPreprocessCache:
@@ -1,4 +1,4 @@
1
- # DCGANGenerator 类定义
1
+ # DCGANGenerator 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import torch
@@ -1,8 +1,15 @@
1
- # ImageVAE 类定义
1
+ # ImageVAE 定义
2
2
  # 从文档自动提取生成
3
3
 
4
+ import gzip
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ import os
8
+ import struct
4
9
  import torch
5
10
  import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from dmla_progress import ProgressReporter
6
13
  from PIL import Image
7
14
 
8
15
  class ImageVAE(nn.Module):
@@ -1,8 +1,7 @@
1
1
  # LINEAR 模块
2
2
  from .lasso_regression import LassoRegression
3
3
  from .logistic_regression import LogisticRegression
4
- from .naive_bayes import MultinomialNaiveBayes
5
- from .naive_bayes import GaussianNaiveBayes
4
+ from .naive_bayes import MultinomialNaiveBayes, GaussianNaiveBayes
6
5
  from .ridge_regression import RidgeRegression
7
6
 
8
7
  __all__ = ['LassoRegression', 'LogisticRegression', 'MultinomialNaiveBayes', 'GaussianNaiveBayes', 'RidgeRegression']
@@ -1,4 +1,4 @@
1
- # LassoRegression 类定义
1
+ # LassoRegression 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import numpy as np
@@ -1,6 +1,7 @@
1
- # LogisticRegression 类定义
1
+ # LogisticRegression 定义
2
2
  # 从文档自动提取生成
3
3
 
4
+ import matplotlib.pyplot as plt
4
5
  import numpy as np
5
6
 
6
7
  class LogisticRegression:
@@ -1,4 +1,4 @@
1
- # RidgeRegression 类定义
1
+ # RidgeRegression 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import numpy as np
@@ -0,0 +1,6 @@
1
+ # LLM 模块
2
+ from .mini_mind_config import MiniMindConfig, RMSNorm, Attention, FeedForward, MiniMindBlock, MiniMindModel, MiniMindForCausalLM, precompute_freqs_cis, apply_rotary_pos_emb, repeat_kv
3
+ from .pretrain_dataset import PretrainDataset
4
+ from .sftdataset import SFTDataset, pre_processing_chat
5
+
6
+ __all__ = ['MiniMindConfig', 'RMSNorm', 'Attention', 'FeedForward', 'MiniMindBlock', 'MiniMindModel', 'MiniMindForCausalLM', 'precompute_freqs_cis', 'apply_rotary_pos_emb', 'repeat_kv', 'PretrainDataset', 'SFTDataset', 'pre_processing_chat']
@@ -0,0 +1,296 @@
1
+ # MiniMindConfig, RMSNorm, precompute_freqs_cis, apply_rotary_pos_emb, repeat_kv, Attention, FeedForward, MiniMindBlock, MiniMindModel, MiniMindForCausalLM 定义
2
+ # 从文档自动提取生成
3
+
4
+ import math
5
+ import os
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from transformers import PreTrainedModel, GenerationMixin, PretrainedConfig
10
+ from transformers.activations import ACT2FN
11
+ from transformers.modeling_outputs import MoeCausalLMOutputWithPast
12
+ from typing import Optional, Tuple, List, Dict
13
+
14
class MiniMindConfig(PretrainedConfig):
    """Hyper-parameter container for the MiniMind model.

    ``hidden_size``, ``num_hidden_layers`` and ``use_moe`` are explicit
    parameters. Every other option is looked up in ``**kwargs`` and falls back
    to the default written next to it below. All keyword arguments are also
    forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "minimind"

    def __init__(self, hidden_size=768, num_hidden_layers=8, use_moe=False, **kwargs):
        super().__init__(**kwargs)
        option = kwargs.get

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.use_moe = use_moe

        self.dropout = option("dropout", 0.0)
        self.vocab_size = option("vocab_size", 6400)
        self.bos_token_id = option("bos_token_id", 1)
        self.eos_token_id = option("eos_token_id", 2)
        self.flash_attn = option("flash_attn", True)

        self.num_attention_heads = option("num_attention_heads", 8)
        self.num_key_value_heads = option("num_key_value_heads", 4)
        # Default head width splits the hidden size evenly across query heads.
        default_head_dim = self.hidden_size // self.num_attention_heads
        self.head_dim = option("head_dim", default_head_dim)

        self.hidden_act = option("hidden_act", 'silu')
        # Default FFN width: hidden_size * pi, rounded up to a multiple of 64.
        default_ffn_width = math.ceil(hidden_size * math.pi / 64) * 64
        self.intermediate_size = option("intermediate_size", default_ffn_width)

        self.max_position_embeddings = option("max_position_embeddings", 32768)
        self.rms_norm_eps = option("rms_norm_eps", 1e-6)
        self.rope_theta = option("rope_theta", 1e6)
        self.tie_word_embeddings = option("tie_word_embeddings", True)

        # The RoPE scaling table is only populated when explicitly requested.
        self.inference_rope_scaling = option("inference_rope_scaling", False)
        if self.inference_rope_scaling:
            self.rope_scaling = {
                "beta_fast": 32,
                "beta_slow": 1,
                "factor": 16,
                "original_max_position_embeddings": 2048,
                "attention_factor": 1.0,
                "type": "yarn",
            }
        else:
            self.rope_scaling = None
42
+
43
+
44
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learnable gain.

    Unlike LayerNorm, no mean is subtracted; the input is only divided by the
    root of its mean square (plus ``eps``).
    """

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def norm(self, x):
        """Return ``x`` scaled by the reciprocal RMS of its last axis."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """Normalise in float32, apply the gain, and cast back to ``x``'s dtype."""
        scaled = self.weight * self.norm(x.float())
        return scaled.type_as(x)
56
+
57
+
58
def precompute_freqs_cis(dim, end=32768, rope_base=1e6, rope_scaling=None):
    """Precompute the rotary-position-embedding cosine and sine tables.

    Args:
        dim: Per-head feature width; ``dim // 2`` base frequencies are used.
        end: Number of positions to tabulate.
        rope_base: Base of the geometric frequency progression.
        rope_scaling: Optional dict of scaling options. The keys read here are
            ``original_max_position_embeddings``, ``factor``, ``beta_fast``,
            ``beta_slow`` and ``attention_factor``. Frequencies are only
            rescaled when ``end`` exceeds the original maximum length.

    Returns:
        ``(freqs_cos, freqs_sin)``, each of shape ``(end, 2 * (dim // 2))``,
        with the half-width table duplicated along the last axis and
        multiplied by the attention factor.
    """
    half = dim // 2
    exponents = torch.arange(0, dim, 2)[:half].float() / dim
    freqs = 1.0 / (rope_base ** exponents)
    attn_factor = 1.0

    if rope_scaling is not None:
        orig_max = rope_scaling.get("original_max_position_embeddings", 2048)
        factor = rope_scaling.get("factor", 16)
        beta_fast = rope_scaling.get("beta_fast", 32.0)
        beta_slow = rope_scaling.get("beta_slow", 1.0)
        attn_factor = rope_scaling.get("attention_factor", 1.0)

        if end / orig_max > 1.0:
            def boundary_index(beta):
                # Frequency index derived from a rotation-count threshold `beta`.
                numerator = dim * math.log(orig_max / (beta * 2 * math.pi))
                return numerator / (2 * math.log(rope_base))

            low = max(math.floor(boundary_index(beta_fast)), 0)
            high = min(math.ceil(boundary_index(beta_slow)), half - 1)
            # Linear ramp from 0 (keep frequency) to 1 (divide by `factor`);
            # the 0.001 floor avoids division by zero when low == high.
            index = torch.arange(half, device=freqs.device).float()
            ramp = torch.clamp((index - low) / max(high - low, 0.001), 0, 1)
            freqs = freqs * (1 - ramp + ramp / factor)

    positions = torch.arange(end, device=freqs.device)
    freqs = torch.outer(positions, freqs).float()
    cos_half = torch.cos(freqs)
    sin_half = torch.sin(freqs)
    freqs_cos = torch.cat([cos_half, cos_half], dim=-1) * attn_factor
    freqs_sin = torch.cat([sin_half, sin_half], dim=-1) * attn_factor
    return freqs_cos, freqs_sin
79
+
80
+
81
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Apply rotary position embeddings to the query and key tensors.

    ``cos`` and ``sin`` get an extra axis inserted at ``unsqueeze_dim`` so they
    broadcast against ``q`` and ``k``. Each output keeps the dtype of its input.

    Returns:
        ``(q_embed, k_embed)``.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def swap_halves(t):
        # (first_half, second_half) -> (-second_half, first_half) on the last axis.
        mid = t.shape[-1] // 2
        return torch.cat((-t[..., mid:], t[..., :mid]), dim=-1)

    def rotate(t):
        return ((t * cos_b) + (swap_halves(t) * sin_b)).to(t.dtype)

    return rotate(q), rotate(k)
88
+
89
+
90
def repeat_kv(x, n_rep):
    """Repeat every key/value head ``n_rep`` times along the head axis.

    ``x`` must be 4-D, laid out as ``(batch, seq, kv_heads, head_dim)``; the
    result has ``kv_heads * n_rep`` heads with the copies of one head adjacent.
    When ``n_rep`` is 1 the input tensor itself is returned.
    """
    bs, slen, num_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    expanded = x.unsqueeze(3).expand(bs, slen, num_kv_heads, n_rep, head_dim)
    return expanded.reshape(bs, slen, num_kv_heads * n_rep, head_dim)
96
+
97
+
98
class Attention(nn.Module):
    """Causal grouped-query self-attention with QK-norm and rotary embeddings.

    Queries use ``config.num_attention_heads`` heads; keys and values use
    ``config.num_key_value_heads`` heads (falling back to the query head count
    when that is ``None``) and are repeated to match the query heads.
    """

    def __init__(self, config):
        super().__init__()
        kv_heads = config.num_key_value_heads
        self.num_key_value_heads = config.num_attention_heads if kv_heads is None else kv_heads
        self.n_local_heads = config.num_attention_heads
        self.n_local_kv_heads = self.num_key_value_heads
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = config.head_dim
        self.is_causal = True

        q_width = config.num_attention_heads * self.head_dim
        kv_width = self.num_key_value_heads * self.head_dim
        # Construction order is kept as-is: it fixes parameter order and the
        # order in which the random initialiser is consumed.
        self.q_proj = nn.Linear(config.hidden_size, q_width, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, kv_width, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, kv_width, bias=False)
        self.o_proj = nn.Linear(q_width, config.hidden_size, bias=False)
        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.dropout = config.dropout
        # The fused kernel is used only if this torch build has it and the config allows it.
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and config.flash_attn

    def forward(self, x, position_embeddings, past_key_value=None, use_cache=False, attention_mask=None):
        """Run attention over ``x`` of shape ``(batch, seq_len, hidden)``.

        Args:
            x: Input hidden states.
            position_embeddings: ``(cos, sin)`` pair for the current positions.
            past_key_value: Optional ``(key, value)`` cache, concatenated in
                front of the new keys/values along the sequence axis.
            use_cache: When true, the updated ``(key, value)`` pair is returned.
            attention_mask: Optional mask where 1 marks positions to attend to.

        Returns:
            ``(output, past_kv)``; ``past_kv`` is ``None`` unless ``use_cache``.
        """
        bsz, seq_len, _ = x.shape
        xq = self.q_proj(x).view(bsz, seq_len, self.n_local_heads, self.head_dim)
        xk = self.k_proj(x).view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)
        xv = self.v_proj(x).view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)

        # QK-norm: RMS-normalise queries and keys per head before the rotation.
        xq = self.q_norm(xq)
        xk = self.k_norm(xk)
        cos, sin = position_embeddings
        xq, xk = apply_rotary_pos_emb(xq, xk, cos, sin)

        # KV cache: prepend the cached keys/values along the sequence axis.
        if past_key_value is not None:
            cached_k, cached_v = past_key_value[0], past_key_value[1]
            xk = torch.cat([cached_k, xk], dim=1)
            xv = torch.cat([cached_v, xv], dim=1)
        past_kv = (xk, xv) if use_cache else None

        # Move heads in front of the sequence axis: (batch, heads, seq, head_dim).
        xq = xq.transpose(1, 2)
        xk = repeat_kv(xk, self.n_rep).transpose(1, 2)
        xv = repeat_kv(xv, self.n_rep).transpose(1, 2)

        # Fused path only for a multi-token step with no cache (while causal)
        # and no masked-out positions. Checks stay lazy, as in a chained `and`.
        use_fused = bool(self.flash) and seq_len > 1
        if use_fused and self.is_causal and past_key_value is not None:
            use_fused = False
        if use_fused and attention_mask is not None:
            use_fused = bool(torch.all(attention_mask == 1))

        if use_fused:
            drop_p = self.dropout if self.training else 0.0
            output = F.scaled_dot_product_attention(xq, xk, xv, dropout_p=drop_p, is_causal=self.is_causal)
        else:
            scores = (xq @ xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
            if self.is_causal:
                # Mask the future only inside the block of newly added positions.
                future = torch.full((seq_len, seq_len), float("-inf"), device=scores.device).triu(1)
                scores[:, :, :, -seq_len:] += future
            if attention_mask is not None:
                # Positions with mask value 0 receive a large negative bias.
                scores += (1.0 - attention_mask.unsqueeze(1).unsqueeze(2)) * -1e9
            probs = F.softmax(scores.float(), dim=-1).type_as(xq)
            output = self.attn_dropout(probs) @ xv

        output = output.transpose(1, 2).reshape(bsz, seq_len, -1)
        output = self.resid_dropout(self.o_proj(output))
        return output, past_kv
148
+
149
+
150
class FeedForward(nn.Module):
    """Gated feed-forward network: ``down(act(gate(x)) * up(x))``.

    The activation is looked up in ``ACT2FN`` by ``config.hidden_act``. An
    explicit ``intermediate_size`` overrides ``config.intermediate_size``.
    """

    def __init__(self, config, intermediate_size=None):
        super().__init__()
        inner = intermediate_size or config.intermediate_size
        outer = config.hidden_size
        # Construction order is kept as-is (gate, down, up) so parameter order
        # and random initialisation are unchanged.
        self.gate_proj = nn.Linear(outer, inner, bias=False)
        self.down_proj = nn.Linear(inner, outer, bias=False)
        self.up_proj = nn.Linear(outer, inner, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        gate = self.act_fn(self.gate_proj(x))
        return self.down_proj(gate * self.up_proj(x))
162
+
163
+
164
class MiniMindBlock(nn.Module):
    """One pre-norm transformer layer: attention then feed-forward, each with a residual.

    ``layer_id`` is accepted for interface compatibility but is not used here.
    """

    def __init__(self, layer_id, config):
        super().__init__()
        self.self_attn = Attention(config)
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = FeedForward(config)

    def forward(self, hidden_states, position_embeddings, past_key_value=None, use_cache=False, attention_mask=None):
        """Return ``(hidden_states, present_key_value)`` for this layer."""
        shortcut = hidden_states
        normed = self.input_layernorm(hidden_states)
        attn_out, present_key_value = self.self_attn(
            normed, position_embeddings, past_key_value, use_cache, attention_mask
        )
        # The residual is added in place on the attention output.
        attn_out += shortcut
        mlp_out = self.mlp(self.post_attention_layernorm(attn_out))
        return attn_out + mlp_out, present_key_value
182
+
183
+
184
class MiniMindModel(nn.Module):
    """MiniMind backbone: token embedding, a stack of transformer layers, final norm."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.vocab_size = config.vocab_size
        self.num_hidden_layers = config.num_hidden_layers
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(config.dropout)
        blocks = [MiniMindBlock(layer_idx, config) for layer_idx in range(self.num_hidden_layers)]
        self.layers = nn.ModuleList(blocks)
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # RoPE cos/sin tables are non-persistent buffers, so they are not saved in the state dict.
        rope_cos, rope_sin = self._rope_tables()
        self.register_buffer("freqs_cos", rope_cos, persistent=False)
        self.register_buffer("freqs_sin", rope_sin, persistent=False)

    def _rope_tables(self):
        # Build the RoPE tables from the stored configuration.
        cfg = self.config
        return precompute_freqs_cis(
            dim=cfg.head_dim,
            end=cfg.max_position_embeddings,
            rope_base=cfg.rope_theta,
            rope_scaling=cfg.rope_scaling,
        )

    def forward(self, input_ids, attention_mask=None, past_key_values=None, use_cache=False, **kwargs):
        """Encode ``input_ids`` of shape ``(batch, seq)``.

        Returns:
            ``(hidden_states, presents, aux_loss)`` where ``presents`` holds one
            cache entry per layer and ``aux_loss`` is always a zero scalar.
        """
        _, seq_length = input_ids.shape
        # Cache objects exposing a `.layers` attribute are discarded rather than
        # used; only per-layer (key, value) sequences are consumed below.
        if hasattr(past_key_values, 'layers'):
            past_key_values = None
        if not past_key_values:
            past_key_values = [None] * len(self.layers)
        first_cache = past_key_values[0]
        # Number of positions already cached (axis 1 of the cached keys).
        start_pos = 0 if first_cache is None else first_cache[0].shape[1]

        hidden_states = self.dropout(self.embed_tokens(input_ids))

        # A leading zero in the cosine table is treated as "buffer not
        # populated" (the source attributes this to meta-device loading), in
        # which case both tables are recomputed on the activations' device.
        if self.freqs_cos[0, 0] == 0:
            rope_cos, rope_sin = self._rope_tables()
            self.freqs_cos = rope_cos.to(hidden_states.device)
            self.freqs_sin = rope_sin.to(hidden_states.device)

        stop_pos = start_pos + seq_length
        position_embeddings = (self.freqs_cos[start_pos:stop_pos], self.freqs_sin[start_pos:stop_pos])

        presents = []
        for block, layer_cache in zip(self.layers, past_key_values):
            hidden_states, present = block(
                hidden_states,
                position_embeddings,
                past_key_value=layer_cache,
                use_cache=use_cache,
                attention_mask=attention_mask,
            )
            presents.append(present)

        hidden_states = self.norm(hidden_states)
        aux_loss = hidden_states.new_zeros(1).squeeze()
        return hidden_states, presents, aux_loss
228
+
229
+
230
class MiniMindForCausalLM(PreTrainedModel, GenerationMixin):
    """MiniMind causal language model: backbone plus a linear LM head.

    Used for both training (pass ``labels`` to ``forward``) and token-by-token
    sampling (``generate``).
    """

    config_class = MiniMindConfig
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}

    def __init__(self, config=None):
        """Build the model; a default ``MiniMindConfig`` is used when none is given."""
        self.config = config or MiniMindConfig()
        super().__init__(self.config)
        self.model = MiniMindModel(self.config)
        self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
        if self.config.tie_word_embeddings:
            # Share a single weight tensor between the input embedding and the LM head.
            self.model.embed_tokens.weight = self.lm_head.weight
        self.post_init()

    def forward(self, input_ids, attention_mask=None, past_key_values=None, use_cache=False, logits_to_keep=0, labels=None, **kwargs):
        """Compute logits and, when ``labels`` is given, the next-token loss.

        Args:
            input_ids: Token ids of shape ``(batch, seq)``.
            attention_mask: Optional mask forwarded to the backbone.
            past_key_values: Optional per-layer key/value cache.
            use_cache: Whether to return the updated cache.
            logits_to_keep: An int keeps only that many trailing positions
                (0 keeps all); any other value is used directly as an index.
            labels: Optional targets aligned with ``input_ids``. The one-token
                shift is done here, so callers pass unshifted labels; entries
                equal to -100 are ignored.

        Returns:
            ``MoeCausalLMOutputWithPast`` with loss, aux_loss, logits, cache and
            final hidden states.
        """
        hidden_states, past_key_values, aux_loss = self.model(input_ids, attention_mask, past_key_values, use_cache, **kwargs)
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        loss = None
        if labels is not None:
            # Position t predicts token t + 1.
            x, y = logits[..., :-1, :].contiguous(), labels[..., 1:].contiguous()
            loss = F.cross_entropy(x.view(-1, x.size(-1)), y.view(-1), ignore_index=-100)
        return MoeCausalLMOutputWithPast(loss=loss, aux_loss=aux_loss, logits=logits, past_key_values=past_key_values, hidden_states=hidden_states)

    @torch.inference_mode()
    def generate(self, inputs=None, attention_mask=None, max_new_tokens=512, temperature=0.85, top_p=0.85, top_k=50, eos_token_id=2, streamer=None, use_cache=True, num_return_sequences=1, do_sample=True, repetition_penalty=1.0, **kwargs):
        """Autoregressively extend the prompt one token at a time.

        Supports temperature scaling, repetition penalty, top-k and top-p
        filtering, sampling or greedy selection, optional streaming, and early
        stop once every sequence has produced ``eos_token_id``.

        Args:
            inputs: Prompt token ids ``(batch, seq)``; may instead be passed as
                the ``input_ids`` keyword.
            top_k: Keep only the ``top_k`` highest logits; ``None`` or a
                non-positive value disables the filter. Values larger than the
                vocabulary are clamped.
            top_p: Nucleus threshold; ``None`` or a value >= 1 disables it.

        Returns:
            The prompt followed by the generated tokens.

        Raises:
            ValueError: If neither ``inputs`` nor ``input_ids`` is supplied.
        """
        prompt = kwargs.pop("input_ids", inputs)
        if prompt is None:
            raise ValueError("generate() requires `inputs` or `input_ids`")
        input_ids = prompt.repeat(num_return_sequences, 1)
        attention_mask = attention_mask.repeat(num_return_sequences, 1) if attention_mask is not None else None
        past_key_values = kwargs.pop("past_key_values", None)
        finished = torch.zeros(input_ids.shape[0], dtype=torch.bool, device=input_ids.device)
        if streamer:
            streamer.put(input_ids.cpu())
        for _ in range(max_new_tokens):
            # Only feed the tokens that are not yet covered by the cache.
            past_len = past_key_values[0][0].shape[1] if past_key_values else 0
            outputs = self.forward(input_ids[:, past_len:], attention_mask, past_key_values, use_cache=use_cache, **kwargs)
            attention_mask = torch.cat([attention_mask, attention_mask.new_ones(attention_mask.shape[0], 1)], -1) if attention_mask is not None else None
            logits = outputs.logits[:, -1, :] / temperature
            # Repetition penalty: push down the logits of tokens already present.
            if repetition_penalty != 1.0:
                for i in range(input_ids.shape[0]):
                    seen = torch.unique(input_ids[i])
                    score = logits[i, seen]
                    logits[i, seen] = torch.where(score > 0, score / repetition_penalty, score * repetition_penalty)
            # Top-k filter; k is clamped because torch.topk rejects k larger
            # than the size of the dimension it selects from.
            if top_k is not None and top_k > 0:
                k = min(top_k, logits.size(-1))
                logits[logits < torch.topk(logits, k)[0][..., -1, None]] = -float('inf')
            # Top-p (nucleus) filter.
            if top_p is not None and top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                mask = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1) > top_p
                # Shift right so the token that crosses the threshold is kept,
                # and always keep the most likely token.
                mask[..., 1:], mask[..., 0] = mask[..., :-1].clone(), 0
                logits[mask.scatter(1, sorted_indices, mask)] = -float('inf')
            # Sample, or pick the arg-max when sampling is disabled.
            next_token = torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1) if do_sample else torch.argmax(logits, dim=-1, keepdim=True)
            if eos_token_id is not None:
                # Sequences that already finished keep emitting EOS.
                next_token = torch.where(finished.unsqueeze(-1), next_token.new_full((next_token.shape[0], 1), eos_token_id), next_token)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            past_key_values = outputs.past_key_values if use_cache else None
            if streamer:
                streamer.put(next_token.cpu())
            if eos_token_id is not None:
                finished |= next_token.squeeze(-1).eq(eos_token_id)
                if finished.all():
                    break
        if streamer:
            streamer.end()
        return input_ids
@@ -0,0 +1,55 @@
1
+ # PretrainDataset 定义
2
+ # 从文档自动提取生成
3
+
4
+ import json
5
+ import torch
6
+ from torch.utils.data import Dataset
7
+
8
class PretrainDataset(Dataset):
    """Pre-training dataset backed by a JSONL file of ``{"text": "..."}`` records.

    Each item is a pair ``(input_ids, labels)`` of length ``max_length``:
    ``BOS + tokens + EOS`` followed by padding. ``labels`` is a copy of
    ``input_ids`` with the padded tail set to -100 so the cross-entropy loss
    ignores it. The labels are NOT shifted here; the consumer is expected to
    apply the one-token shift for next-token prediction.
    """

    def __init__(self, data_path, tokenizer, max_length=512):
        """Read every usable ``text`` field from ``data_path`` into memory.

        Blank lines, lines that are not valid JSON, records that are not JSON
        objects, and records whose ``text`` is missing, not a string or only
        whitespace are skipped.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(record, dict):
                    continue
                text = record.get('text')
                if isinstance(text, str) and text.strip():
                    self.samples.append(text)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        """Return ``(input_ids, labels)`` long tensors for sample ``index``."""
        text = self.samples[index]
        # Truncate to max_length - 2 to leave room for BOS and EOS.
        tokens = self.tokenizer(
            str(text),
            add_special_tokens=False,
            max_length=self.max_length - 2,
            truncation=True
        ).input_ids
        tokens = [self.tokenizer.bos_token_id] + list(tokens) + [self.tokenizer.eos_token_id]

        # Fall back to EOS as the pad id when the tokenizer defines no pad
        # token. This is safe because masking below is positional.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id

        real_len = len(tokens)
        padded = tokens + [pad_id] * (self.max_length - real_len)
        input_ids = torch.tensor(padded, dtype=torch.long)

        # Mask by position, not by value: when the pad id equals the EOS id
        # (or occurs in the text) a value-based mask would also hide real tokens.
        labels = input_ids.clone()
        labels[real_len:] = -100

        return input_ids, labels
@@ -0,0 +1,108 @@
1
+ # SFTDataset, pre_processing_chat 定义
2
+ # 从文档自动提取生成
3
+
4
+ import json
5
+ import os
6
+ import random
7
+ import torch
8
+ from datasets import load_dataset, Features, Value
9
+ from torch.utils.data import Dataset
10
+
11
class SFTDataset(Dataset):
    """Supervised fine-tuning dataset built from chat conversations.

    Differences from a plain-text pre-training dataset:
    - records look like ``{"conversations": [...]}`` instead of ``{"text": ...}``;
    - only assistant replies contribute to the loss, every other label is -100;
    - conversations are rendered to text with the tokenizer's chat template.
    """

    def __init__(self, jsonl_path, tokenizer, max_length=768):
        """Load the JSONL file and precompute the assistant span delimiters.

        Note: this sets the ``TOKENIZERS_PARALLELISM`` environment variable to
        ``"false"`` for the whole process.
        """
        super().__init__()
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        self.tokenizer = tokenizer
        self.max_length = max_length
        features = Features({
            'conversations': [{'role': Value('string'), 'content': Value('string'),
                               'reasoning_content': Value('string'), 'tools': Value('string'),
                               'tool_calls': Value('string')}]
        })
        self.samples = load_dataset('json', data_files=jsonl_path, split='train', features=features)
        # Token-id sequences marking the start and the end of an assistant reply.
        self.bos_id = tokenizer(f'{tokenizer.bos_token}assistant\n', add_special_tokens=False).input_ids
        self.eos_id = tokenizer(f'{tokenizer.eos_token}\n', add_special_tokens=False).input_ids

    def __len__(self):
        return len(self.samples)

    def create_chat_prompt(self, conversations):
        """Render a conversation to text through the tokenizer's chat template.

        ``tools`` found on a system message and string-valued ``tool_calls``
        are decoded from JSON before being handed to the template.
        """
        messages = []
        tools = None
        for message in conversations:
            message = dict(message)  # copy so the stored sample is not mutated
            if message.get("role") == "system" and message.get("tools"):
                tools = json.loads(message["tools"]) if isinstance(message["tools"], str) else message["tools"]
            if message.get("tool_calls") and isinstance(message["tool_calls"], str):
                message["tool_calls"] = json.loads(message["tool_calls"])
            messages.append(message)
        return self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False, tools=tools
        )

    def generate_labels(self, input_ids):
        """Build the label list: assistant replies keep their ids, the rest is -100.

        A reply starts right after an occurrence of ``self.bos_id`` and runs
        through the next occurrence of ``self.eos_id`` (inclusive). A reply
        with no terminator extends to the end of ``input_ids``. Labelled
        positions never exceed ``len(input_ids)`` or ``self.max_length``.
        """
        total = len(input_ids)
        bos_len, eos_len = len(self.bos_id), len(self.eos_id)
        labels = [-100] * total
        i = 0
        while i < total:
            if input_ids[i:i + bos_len] != self.bos_id:
                i += 1
                continue
            start = i + bos_len
            end = start
            # Scan forward for the end-of-reply marker.
            while end < total and input_ids[end:end + eos_len] != self.eos_id:
                end += 1
            # Clamp to the real sequence length so an unterminated reply cannot
            # index past the end of `labels`.
            stop = min(end + eos_len, total, self.max_length)
            for j in range(start, stop):
                labels[j] = input_ids[j]
            i = end + eos_len if end < total else total
        return labels

    def __getitem__(self, index):
        """Return ``(input_ids, labels)`` long tensors of length ``max_length``."""
        sample = self.samples[index]
        conversations = pre_processing_chat(sample['conversations'])
        prompt = self.create_chat_prompt(conversations)
        input_ids = list(self.tokenizer(prompt).input_ids[:self.max_length])
        # Label the real tokens first, then pad both sequences, so padding can
        # never become a training target even if the last reply is unterminated.
        labels = self.generate_labels(input_ids)
        pad_len = self.max_length - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
        labels = labels + [-100] * pad_len
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)
84
+
85
+
86
def pre_processing_chat(conversations, add_system_ratio=0.2):
    """Randomly prepend a system prompt to a conversation.

    Args:
        conversations: List of message dicts.
        add_system_ratio: Probability of prepending a system message when the
            conversation does not already start with one.

    Returns:
        The input list itself when nothing is added (empty input, tool-use
        data, an existing leading system message, or the random draw fails);
        otherwise a new list with a system message in front. The input is
        never mutated.
    """
    # Nothing to inspect; also avoids indexing into an empty conversation.
    if len(conversations) == 0:
        return conversations

    # Tool-use conversations are kept exactly as they are.
    if any(conv.get('tools') for conv in conversations):
        return conversations

    SYSTEM_PROMPTS = [
        "你是一个知识丰富的AI,尽力为用户提供准确的信息。",
        "你是minimind,一个小巧但有用的语言模型。",
        "你是一个专业的AI助手,请提供有价值的回答。",
        "你是minimind,请尽力帮助用户解决问题。",
        "你是一个可靠的AI,请给出准确的回答。",
        "You are a helpful AI assistant.",
        "You are minimind, a lightweight intelligent assistant.",
        "You are a friendly chatbot. Please answer the user's questions carefully.",
        "You are a knowledgeable AI. Try your best to provide accurate information.",
        "You are minimind, a small but useful language model."
    ]
    # Only conversations without a leading system message are candidates.
    if conversations[0].get('role') != 'system':
        if random.random() < add_system_ratio:
            return [{'role': 'system', 'content': random.choice(SYSTEM_PROMPTS)}] + conversations
    return conversations
@@ -1,6 +1,7 @@
1
- # Perceptron 类定义
1
+ # Perceptron 定义
2
2
  # 从文档自动提取生成
3
3
 
4
+ import matplotlib.pyplot as plt
4
5
  import numpy as np
5
6
 
6
7
  class Perceptron:
@@ -1,4 +1,5 @@
1
1
  # SEQUENCE_MODELS 模块
2
+ from .poetry_dataset import PoetryDataset
2
3
  from .poetry_lstm import PoetryLSTM
3
4
 
4
- __all__ = ['PoetryLSTM']
5
+ __all__ = ['PoetryDataset', 'PoetryLSTM']
@@ -0,0 +1,132 @@
1
+ # PoetryDataset 定义
2
+ # 从文档自动提取生成
3
+
4
+ import json
5
+ import os
6
+ import re
7
+ from collections import Counter, defaultdict, deque
8
+ from torch.utils.data import Dataset, DataLoader
9
+
10
class PoetryDataset:
    """Character-level dataset of classical Chinese poems.

    Loads poems from JSON files under ``data_dir``, builds a character
    vocabulary, and encodes every poem as a list of integer ids. Items are
    ``(input, target)`` pairs where the target is the input shifted by one
    character.
    """

    # Punctuation kept for phrase boundaries. Written with explicit escapes so
    # the full-width forms survive any re-encoding of this file.
    _KEPT_PUNCTUATION = (
        '\uff0c\u3002\uff01\uff1f\u3001\uff1b\uff1a'  # full-width , . ! ? and the enumeration comma, ; :
        '\u201c\u201d\u2018\u2019\uff08\uff09'        # curly double/single quotes, full-width parentheses
        ',!?;:"\'()'                                  # ASCII equivalents
    )
    # Matches every character that is neither a CJK ideograph in
    # U+4E00..U+9FA5 nor one of the kept punctuation marks.
    _STRIP_PATTERN = re.compile('[^\u4e00-\u9fa5' + re.escape(_KEPT_PUNCTUATION) + ']')

    def __init__(self, data_dir, min_length=10, max_length=100, vocab_size=4000):
        """Load, index and encode the corpus; progress is printed to stdout.

        Args:
            data_dir: Directory that contains one sub-directory per collection.
            min_length: Poems shorter than this (after cleaning) are dropped.
            max_length: Poems longer than this (after cleaning) are dropped.
            vocab_size: Maximum vocabulary size including the two special tokens.
        """
        self.min_length = min_length
        self.max_length = max_length
        self.vocab_size = vocab_size

        self.poems = self._load_poems(data_dir)
        print(f"加载完成: {len(self.poems)} 首诗词")

        self.char2idx, self.idx2char = self._build_vocab()
        print(f"词汇表大小: {len(self.char2idx)}")

        self.sequences = self._encode_poems()
        print(f"有效序列数: {len(self.sequences)}")

    def _load_poems(self, data_dir):
        """Collect the cleaned text of every valid poem under ``data_dir``.

        Missing collections are skipped. A file that cannot be read or parsed
        is reported on stdout and skipped (deliberate best effort).
        """
        poems = []

        datasets = ['全唐诗', '宋词', '诗经', '楚辞']

        for dataset in datasets:
            dataset_path = os.path.join(data_dir, dataset)
            if not os.path.exists(dataset_path):
                continue

            # os.listdir order is unspecified; sorting keeps poem order (and
            # vocabulary tie-breaking) reproducible across runs and platforms.
            json_files = sorted(f for f in os.listdir(dataset_path) if f.endswith('.json'))

            for jf in json_files:
                file_path = os.path.join(dataset_path, jf)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    for poem in data:
                        text = self._extract_text(poem)
                        if text and self._is_valid(text):
                            poems.append(text)
                except Exception as e:
                    print(f"加载 {jf} 失败: {e}")

        return poems

    def _extract_text(self, poem):
        """Return the cleaned body text of one record, or ``None`` if unusable.

        The body is taken from the first of ``text``, ``paragraphs`` or
        ``content`` that is present; list values are concatenated. Records that
        are not dicts, or whose body is not a string, yield ``None`` so one
        malformed entry does not abort the rest of its file.
        """
        if not isinstance(poem, dict):
            return None

        if 'text' in poem:
            text = poem['text']
        elif 'paragraphs' in poem:
            text = ''.join(poem['paragraphs'])
        elif 'content' in poem:
            # `content` may be a single string or a list of lines.
            content = poem['content']
            text = ''.join(content) if isinstance(content, list) else content
        else:
            return None

        if not isinstance(text, str):
            return None

        # Keep only ideographs and the punctuation used for phrase boundaries.
        return self._STRIP_PATTERN.sub('', text)

    def _is_valid(self, text):
        """Return True when the length is within bounds and no missing-glyph marker is present."""
        if len(text) < self.min_length or len(text) > self.max_length:
            return False

        # These squares mark characters missing from the source transcription.
        if '□' in text or '■' in text:
            return False

        return True

    def _build_vocab(self):
        """Build ``(char2idx, idx2char)`` from the most frequent characters.

        Ids 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``; at most
        ``vocab_size - 2`` characters follow in descending frequency order.
        """
        char_counter = Counter()
        for poem in self.poems:
            char_counter.update(poem)

        most_common = char_counter.most_common(self.vocab_size - 2)

        char2idx = {'<PAD>': 0, '<UNK>': 1}
        for i, (char, _) in enumerate(most_common, start=2):
            char2idx[char] = i

        idx2char = {idx: char for char, idx in char2idx.items()}

        return char2idx, idx2char

    def _encode_poems(self):
        """Map every poem to a list of ids; unknown characters become ``<UNK>``."""
        unk = self.char2idx['<UNK>']
        return [[self.char2idx.get(c, unk) for c in poem] for poem in self.poems]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(input, target)``: the sequence without its last / first element."""
        seq = self.sequences[idx]
        return seq[:-1], seq[1:]
@@ -1,4 +1,4 @@
1
- # PoetryLSTM 类定义
1
+ # PoetryLSTM 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import torch
@@ -1,4 +1,4 @@
1
- # KernelSVM 类定义
1
+ # KernelSVM 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import numpy as np
@@ -1,4 +1,4 @@
1
- # SimpleSVM 类定义
1
+ # SimpleSVM 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import numpy as np
@@ -1,4 +1,4 @@
1
- # AdaBoost 类定义
1
+ # AdaBoost 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import numpy as np
@@ -1,7 +1,9 @@
1
- # DecisionTreeClassifier 类定义
1
+ # DecisionTreeClassifier 定义
2
2
  # 从文档自动提取生成
3
3
 
4
+ import matplotlib.pyplot as plt
4
5
  import numpy as np
6
+ from sklearn.datasets import load_iris
5
7
 
6
8
  class DecisionTreeClassifier:
7
9
  """
@@ -1,7 +1,9 @@
1
- # RandomForestClassifier 类定义
1
+ # RandomForestClassifier 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import numpy as np
5
+ from sklearn.datasets import load_digits
6
+ from sklearn.model_selection import train_test_split
5
7
 
6
8
  class RandomForestClassifier:
7
9
  """
@@ -1,6 +1,7 @@
1
- # KMeans 类定义
1
+ # KMeans 定义
2
2
  # 从文档自动提取生成
3
3
 
4
+ import matplotlib.pyplot as plt
4
5
  import numpy as np
5
6
 
6
7
  class KMeans:
@@ -1,7 +1,8 @@
1
- # PCA 类定义
1
+ # PCA 定义
2
2
  # 从文档自动提取生成
3
3
 
4
4
  import numpy as np
5
+ from sklearn.datasets import load_iris
5
6
 
6
7
  class PCA:
7
8
  """
@@ -70,6 +70,24 @@ const DATASETS = [
70
70
  format: 'git',
71
71
  targetDir: 'datasets/chinese-poetry',
72
72
  source: 'ModelScope (icyfenix)'
73
+ },
74
+ {
75
+ id: 'minimind-pretrain',
76
+ name: 'MiniMind Pretrain (LLM预训练语料)',
77
+ url: 'https://www.modelscope.cn/datasets/icyfenix/Minimind_Pretrain.git',
78
+ size: '~1.2GB',
79
+ format: 'git',
80
+ targetDir: 'datasets/minimind-pretrain',
81
+ source: 'ModelScope (icyfenix)'
82
+ },
83
+ {
84
+ id: 'minimind-sft',
85
+ name: 'MiniMind SFT (LLM监督微调语料)',
86
+ url: 'https://www.modelscope.cn/datasets/icyfenix/Minimind_SFT.git',
87
+ size: '~500MB',
88
+ format: 'git',
89
+ targetDir: 'datasets/minimind-sft',
90
+ source: 'ModelScope (icyfenix)'
73
91
  }
74
92
  ]
75
93
 
@@ -25,7 +25,10 @@ const SOFT_DEPS = [
25
25
  'jupyter_client',
26
26
  'ipykernel',
27
27
  'lmdb',
28
- 'requests'
28
+ 'requests',
29
+ 'transformers',
30
+ 'tokenizers',
31
+ 'datasets'
29
32
  ]
30
33
 
31
34
  // 环境检测结果缓存
package/version.json CHANGED
@@ -1,4 +1,4 @@
1
1
  {
2
- "buildTime": "2026-05-13T16:02:49.085Z",
3
- "cliVersion": "2026.5.14-2"
2
+ "buildTime": "2026-05-24T02:17:04.071Z",
3
+ "cliVersion": "2026.5.24-1015"
4
4
  }