@icyfenix-dmla/cli 2026.5.14-2 → 2026.5.24-16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/shared/__init__.py +5 -15
- package/shared/bayesian/gaussian_mixture_model.py +2 -1
- package/shared/bayesian/multinomial_naive_bayes.py +2 -1
- package/shared/bayesian/simple_bayesian_network.py +3 -1
- package/shared/cnn/__init__.py +10 -6
- package/shared/cnn/alexnet.py +1 -1
- package/shared/cnn/lmdbpreprocess_cache.py +3 -1
- package/shared/gan/dcgan_generator.py +1 -1
- package/shared/gan/image_vae.py +8 -1
- package/shared/linear/__init__.py +1 -2
- package/shared/linear/lasso_regression.py +1 -1
- package/shared/linear/logistic_regression.py +2 -1
- package/shared/linear/ridge_regression.py +1 -1
- package/shared/llm/__init__.py +5 -0
- package/shared/llm/mini_mind_config.py +296 -0
- package/shared/llm/pretrain_dataset.py +55 -0
- package/shared/neural/perceptron.py +2 -1
- package/shared/sequence_models/__init__.py +2 -1
- package/shared/sequence_models/poetry_dataset.py +132 -0
- package/shared/sequence_models/poetry_lstm.py +1 -1
- package/shared/svm/kernel_svm.py +1 -1
- package/shared/svm/simple_svm.py +1 -1
- package/shared/tree/ada_boost.py +1 -1
- package/shared/tree/decision_tree_classifier.py +3 -1
- package/shared/tree/random_forest_classifier.py +3 -1
- package/shared/unsupervised/kmeans.py +2 -1
- package/shared/unsupervised/pca.py +2 -1
- package/src/commands/data.js +9 -0
- package/src/server/native_env_check.js +4 -1
- package/version.json +2 -2
package/package.json
CHANGED
package/shared/__init__.py
CHANGED
|
@@ -1,19 +1,9 @@
|
|
|
1
1
|
# shared 模块包初始化
|
|
2
2
|
# 包含统计学习系列文档中可复用的类定义
|
|
3
3
|
#
|
|
4
|
-
#
|
|
5
|
-
#
|
|
6
|
-
# from shared.
|
|
7
|
-
# from shared.linear
|
|
4
|
+
# 使用方式:显式导入需要的模块
|
|
5
|
+
# from shared.sequence_models import PoetryLSTM, PoetryDataset
|
|
6
|
+
# from shared.cnn import AlexNet
|
|
7
|
+
# from shared.linear import LogisticRegression
|
|
8
8
|
|
|
9
|
-
__all__ = [
|
|
10
|
-
'bayesian',
|
|
11
|
-
'cnn',
|
|
12
|
-
'gan',
|
|
13
|
-
'linear',
|
|
14
|
-
'neural',
|
|
15
|
-
'sequence_models',
|
|
16
|
-
'svm',
|
|
17
|
-
'tree',
|
|
18
|
-
'unsupervised',
|
|
19
|
-
]
|
|
9
|
+
__all__ = ['bayesian', 'cnn', 'gan', 'linear', 'llm', 'neural', 'sequence_models', 'svm', 'tree', 'unsupervised']
|
package/shared/cnn/__init__.py
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
# CNN 模块
|
|
2
2
|
from .alexnet import AlexNet
|
|
3
|
-
|
|
4
|
-
from .lmdb_dataset import LMDBValDataset
|
|
5
|
-
|
|
3
|
+
try:
|
|
4
|
+
from .lmdb_dataset import LMDBDataset, LMDBValDataset
|
|
5
|
+
except ImportError:
|
|
6
|
+
pass # 可选依赖 lmdb 未安装
|
|
7
|
+
try:
|
|
8
|
+
from .lmdbpreprocess_cache import LMDBPreprocessCache
|
|
9
|
+
except ImportError:
|
|
10
|
+
pass # 可选依赖 lmdb 未安装
|
|
6
11
|
from .minimal_preprocess_cache import MinimalPreprocessCache
|
|
7
|
-
from .realtime_dataset import RealtimeAugmentDataset
|
|
8
|
-
from .realtime_dataset import RealtimeValDataset
|
|
12
|
+
from .realtime_dataset import RealtimeAugmentDataset, RealtimeValDataset, _get_perf_log
|
|
9
13
|
from .tiny_imagenet_dataset import TinyImageNetDataset
|
|
10
14
|
|
|
11
|
-
__all__ = ['AlexNet', 'LMDBDataset', 'LMDBValDataset', 'LMDBPreprocessCache', 'MinimalPreprocessCache', 'RealtimeAugmentDataset', 'RealtimeValDataset', 'TinyImageNetDataset']
|
|
15
|
+
__all__ = ['AlexNet', 'LMDBDataset', 'LMDBValDataset', 'LMDBPreprocessCache', 'MinimalPreprocessCache', 'RealtimeAugmentDataset', 'RealtimeValDataset', '_get_perf_log', 'TinyImageNetDataset']
|
package/shared/cnn/alexnet.py
CHANGED
package/shared/gan/image_vae.py
CHANGED
|
@@ -1,8 +1,15 @@
|
|
|
1
|
-
# ImageVAE
|
|
1
|
+
# ImageVAE 定义
|
|
2
2
|
# 从文档自动提取生成
|
|
3
3
|
|
|
4
|
+
import gzip
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
import numpy as np
|
|
7
|
+
import os
|
|
8
|
+
import struct
|
|
4
9
|
import torch
|
|
5
10
|
import torch.nn as nn
|
|
11
|
+
import torch.nn.functional as F
|
|
12
|
+
from dmla_progress import ProgressReporter
|
|
6
13
|
from PIL import Image
|
|
7
14
|
|
|
8
15
|
class ImageVAE(nn.Module):
|
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
# LINEAR 模块
|
|
2
2
|
from .lasso_regression import LassoRegression
|
|
3
3
|
from .logistic_regression import LogisticRegression
|
|
4
|
-
from .naive_bayes import MultinomialNaiveBayes
|
|
5
|
-
from .naive_bayes import GaussianNaiveBayes
|
|
4
|
+
from .naive_bayes import MultinomialNaiveBayes, GaussianNaiveBayes
|
|
6
5
|
from .ridge_regression import RidgeRegression
|
|
7
6
|
|
|
8
7
|
__all__ = ['LassoRegression', 'LogisticRegression', 'MultinomialNaiveBayes', 'GaussianNaiveBayes', 'RidgeRegression']
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# LLM 模块
|
|
2
|
+
from .mini_mind_config import MiniMindConfig, RMSNorm, Attention, FeedForward, MiniMindBlock, MiniMindModel, MiniMindForCausalLM, precompute_freqs_cis, apply_rotary_pos_emb, repeat_kv
|
|
3
|
+
from .pretrain_dataset import PretrainDataset
|
|
4
|
+
|
|
5
|
+
__all__ = ['MiniMindConfig', 'RMSNorm', 'Attention', 'FeedForward', 'MiniMindBlock', 'MiniMindModel', 'MiniMindForCausalLM', 'precompute_freqs_cis', 'apply_rotary_pos_emb', 'repeat_kv', 'PretrainDataset']
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
# MiniMindConfig, RMSNorm, precompute_freqs_cis, apply_rotary_pos_emb, repeat_kv, Attention, FeedForward, MiniMindBlock, MiniMindModel, MiniMindForCausalLM 定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import math
|
|
5
|
+
import os
|
|
6
|
+
import torch
|
|
7
|
+
import torch.nn as nn
|
|
8
|
+
import torch.nn.functional as F
|
|
9
|
+
from transformers import PreTrainedModel, GenerationMixin, PretrainedConfig
|
|
10
|
+
from transformers.activations import ACT2FN
|
|
11
|
+
from transformers.modeling_outputs import MoeCausalLMOutputWithPast
|
|
12
|
+
from typing import Optional, Tuple, List, Dict
|
|
13
|
+
|
|
14
|
+
class MiniMindConfig(PretrainedConfig):
    """Configuration object holding the MiniMind hyper-parameters.

    ``hidden_size``, ``num_hidden_layers`` and ``use_moe`` are explicit
    arguments. Every other option is looked up in ``**kwargs`` and falls
    back to the default listed in ``__init__``. All keyword arguments are
    also forwarded to the parent constructor.
    """
    model_type = "minimind"

    def __init__(self, hidden_size=768, num_hidden_layers=8, use_moe=False, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.use_moe = use_moe

        # Plain options: value from kwargs when present, otherwise the default.
        for option, fallback in (
            ("dropout", 0.0),
            ("vocab_size", 6400),
            ("bos_token_id", 1),
            ("eos_token_id", 2),
            ("flash_attn", True),
            ("num_attention_heads", 8),
            ("num_key_value_heads", 4),
        ):
            setattr(self, option, kwargs.get(option, fallback))

        # Per-head width defaults to an even split of the hidden size.
        self.head_dim = kwargs.get("head_dim", self.hidden_size // self.num_attention_heads)

        # Default FFN width: hidden_size * pi, rounded up to a multiple of 64.
        ffn_width = math.ceil(hidden_size * math.pi / 64) * 64
        for option, fallback in (
            ("hidden_act", 'silu'),
            ("intermediate_size", ffn_width),
            ("max_position_embeddings", 32768),
            ("rms_norm_eps", 1e-6),
            ("rope_theta", 1e6),
            ("tie_word_embeddings", True),
            ("inference_rope_scaling", False),
        ):
            setattr(self, option, kwargs.get(option, fallback))

        # RoPE scaling parameters are only populated when inference-time
        # scaling is requested; otherwise the field stays None.
        if self.inference_rope_scaling:
            self.rope_scaling = {
                "beta_fast": 32, "beta_slow": 1, "factor": 16,
                "original_max_position_embeddings": 2048,
                "attention_factor": 1.0, "type": "yarn"
            }
        else:
            self.rope_scaling = None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension.

    The input is divided by the RMS of its last axis (no mean subtraction)
    and multiplied by a learnable per-feature ``weight`` initialised to ones.
    """

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def norm(self, x):
        """Return ``x`` scaled by the reciprocal RMS of its last axis."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inverse_rms = torch.rsqrt(mean_square + self.eps)
        return x * inverse_rms

    def forward(self, x):
        """Normalise in float32, apply the gain, and cast back to ``x``'s dtype."""
        normalised = self.norm(x.float())
        scaled = self.weight * normalised
        return scaled.type_as(x)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def precompute_freqs_cis(dim, end=32768, rope_base=1e6, rope_scaling=None):
    """Build the cos/sin lookup tables used by rotary position embeddings.

    Args:
        dim: Per-head feature width; ``dim // 2`` base frequencies are used.
        end: Number of positions to tabulate.
        rope_base: Base of the geometric frequency progression.
        rope_scaling: Optional dict with ``original_max_position_embeddings``,
            ``factor``, ``beta_fast``, ``beta_slow`` and ``attention_factor``.
            Frequencies are only rescaled when ``end`` exceeds the original
            maximum; ``attention_factor`` multiplies the tables whenever the
            dict is supplied.

    Returns:
        ``(freqs_cos, freqs_sin)``, each of shape ``(end, 2 * (dim // 2))``,
        where the second half of the last axis repeats the first half.
    """
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    freqs = 1.0 / (rope_base ** exponents)
    attn_factor = 1.0

    if rope_scaling is not None:
        orig_max = rope_scaling.get("original_max_position_embeddings", 2048)
        factor = rope_scaling.get("factor", 16)
        beta_fast = rope_scaling.get("beta_fast", 32.0)
        beta_slow = rope_scaling.get("beta_slow", 1.0)
        attn_factor = rope_scaling.get("attention_factor", 1.0)

        if end / orig_max > 1.0:
            def boundary_index(beta):
                # Frequency index at which a dimension completes `beta`
                # rotations over the original context length.
                return (dim * math.log(orig_max / (beta * 2 * math.pi))) / (2 * math.log(rope_base))

            low = max(math.floor(boundary_index(beta_fast)), 0)
            high = min(math.ceil(boundary_index(beta_slow)), dim // 2 - 1)
            span = max(high - low, 0.001)
            positions = torch.arange(dim // 2, device=freqs.device).float()
            # Linear ramp from 0 (keep frequency) to 1 (divide by `factor`).
            ramp = torch.clamp((positions - low) / span, 0, 1)
            freqs = freqs * (1 - ramp + ramp / factor)

    t = torch.arange(end, device=freqs.device)
    angles = torch.outer(t, freqs).float()
    cos_half = torch.cos(angles)
    sin_half = torch.sin(angles)
    freqs_cos = torch.cat([cos_half, cos_half], dim=-1) * attn_factor
    freqs_sin = torch.cat([sin_half, sin_half], dim=-1) * attn_factor
    return freqs_cos, freqs_sin
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Rotate query and key tensors with precomputed RoPE tables.

    ``cos`` and ``sin`` receive an extra axis at ``unsqueeze_dim`` so they
    broadcast against ``q`` and ``k``. Each result is cast back to the dtype
    of its own input.

    Returns:
        ``(q_embed, k_embed)``.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def swap_halves(t):
        # (a, b) -> (-b, a) along the last axis.
        half = t.shape[-1] // 2
        return torch.cat((-t[..., half:], t[..., :half]), dim=-1)

    def rotate(t):
        return ((t * cos_b) + (swap_halves(t) * sin_b)).to(t.dtype)

    q_embed = rotate(q)
    k_embed = rotate(k)
    return q_embed, k_embed
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def repeat_kv(x, n_rep):
    """Duplicate each key/value head ``n_rep`` times along the head axis.

    ``x`` must be 4-D ``(batch, seq, kv_heads, head_dim)``. With
    ``n_rep == 1`` the input tensor itself is returned; otherwise the result
    has ``kv_heads * n_rep`` heads, with copies of one head adjacent.
    """
    bs, slen, num_kv_heads, head_dim = x.shape
    if n_rep != 1:
        widened = x.unsqueeze(3).expand(bs, slen, num_kv_heads, n_rep, head_dim)
        return widened.reshape(bs, slen, num_kv_heads * n_rep, head_dim)
    return x
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class Attention(nn.Module):
    """Grouped-query attention (GQA) with QK-norm, rotary embeddings and a KV cache.

    Queries use ``config.num_attention_heads`` heads; keys and values use
    ``config.num_key_value_heads`` heads (falling back to the query head
    count when that is None) and are repeated ``n_rep`` times to match.
    """
    def __init__(self, config):
        super().__init__()
        self.num_key_value_heads = config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
        self.n_local_heads = config.num_attention_heads
        self.n_local_kv_heads = self.num_key_value_heads
        # Number of query heads that share one key/value head.
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = config.head_dim
        self.is_causal = True
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        # Raw dropout probability, passed to scaled_dot_product_attention.
        self.dropout = config.dropout
        # The fused kernel is used only if this torch build provides it and
        # the config enables it.
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and config.flash_attn

    def forward(self, x, position_embeddings, past_key_value=None, use_cache=False, attention_mask=None):
        """Run attention over ``x``.

        Args:
            x: Input of shape ``(batch, seq_len, hidden)``.
            position_embeddings: ``(cos, sin)`` pair handed to
                ``apply_rotary_pos_emb``.
            past_key_value: Optional ``(keys, values)`` from earlier steps,
                concatenated in front of the new ones along dim 1.
            use_cache: When true, the concatenated keys/values are returned.
            attention_mask: Optional mask where 1 marks positions to keep.
                NOTE(review): the broadcasting below suggests shape
                ``(batch, key_len)`` - confirm against callers.

        Returns:
            ``(output, past_kv)``: ``output`` has shape
            ``(batch, seq_len, hidden)``; ``past_kv`` is ``(keys, values)``
            or None when ``use_cache`` is false.
        """
        bsz, seq_len, _ = x.shape
        xq, xk, xv = self.q_proj(x), self.k_proj(x), self.v_proj(x)
        xq = xq.view(bsz, seq_len, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)
        # QK-Norm: RMS-normalise queries and keys per head (intended to
        # improve training stability).
        xq, xk = self.q_norm(xq), self.k_norm(xk)
        cos, sin = position_embeddings
        xq, xk = apply_rotary_pos_emb(xq, xk, cos, sin)
        # KV cache: during inference, prepend the cached keys/values.
        if past_key_value is not None:
            xk = torch.cat([past_key_value[0], xk], dim=1)
            xv = torch.cat([past_key_value[1], xv], dim=1)
        # The cache stores keys/values before head repetition.
        past_kv = (xk, xv) if use_cache else None
        # Move heads to dim 1 and expand KV heads to the query head count.
        xq, xk, xv = (xq.transpose(1, 2), repeat_kv(xk, self.n_rep).transpose(1, 2), repeat_kv(xv, self.n_rep).transpose(1, 2))
        # Prefer the fused scaled_dot_product_attention kernel. It is only
        # taken for multi-token inputs without a cache and without any
        # masked-out position; every other case uses the manual path below.
        if self.flash and (seq_len > 1) and (not self.is_causal or past_key_value is None) and (attention_mask is None or torch.all(attention_mask == 1)):
            output = F.scaled_dot_product_attention(xq, xk, xv, dropout_p=self.dropout if self.training else 0.0, is_causal=self.is_causal)
        else:
            scores = (xq @ xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
            if self.is_causal:
                # Causal mask on the last seq_len key columns (the new
                # tokens); cached keys to the left stay fully visible.
                scores[:, :, :, -seq_len:] += torch.full((seq_len, seq_len), float("-inf"), device=scores.device).triu(1)
            if attention_mask is not None:
                # Positions with mask value 0 get a large negative bias.
                scores += (1.0 - attention_mask.unsqueeze(1).unsqueeze(2)) * -1e9
            # Softmax in float32, then cast back to the query dtype.
            output = self.attn_dropout(F.softmax(scores.float(), dim=-1).type_as(xq)) @ xv
        # Merge heads back into the feature dimension.
        output = output.transpose(1, 2).reshape(bsz, seq_len, -1)
        output = self.resid_dropout(self.o_proj(output))
        return output, past_kv
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class FeedForward(nn.Module):
    """Gated feed-forward block: ``down(act(gate(x)) * up(x))``.

    The inner width is ``intermediate_size`` when a truthy value is passed,
    otherwise ``config.intermediate_size``. The activation is looked up in
    ``ACT2FN`` by ``config.hidden_act``.
    """

    def __init__(self, config, intermediate_size=None):
        super().__init__()
        inner_width = intermediate_size if intermediate_size else config.intermediate_size
        hidden_width = config.hidden_size
        # Creation order (gate, down, up) is kept so parameter ordering and
        # random initialisation match the original layout.
        self.gate_proj = nn.Linear(hidden_width, inner_width, bias=False)
        self.down_proj = nn.Linear(inner_width, hidden_width, bias=False)
        self.up_proj = nn.Linear(hidden_width, inner_width, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated projection to ``x`` and map back to the hidden width."""
        gate = self.act_fn(self.gate_proj(x))
        gated = gate * self.up_proj(x)
        return self.down_proj(gated)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class MiniMindBlock(nn.Module):
    """One pre-norm Transformer layer: attention then feed-forward, each with a residual.

    ``layer_id`` is accepted for interface compatibility and is not used.
    """

    def __init__(self, layer_id, config):
        super().__init__()
        norm_eps = config.rms_norm_eps
        self.self_attn = Attention(config)
        self.input_layernorm = RMSNorm(config.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=norm_eps)
        self.mlp = FeedForward(config)

    def forward(self, hidden_states, position_embeddings, past_key_value=None, use_cache=False, attention_mask=None):
        """Return ``(hidden_states, present_key_value)`` for this layer."""
        normed_input = self.input_layernorm(hidden_states)
        attn_out, present_key_value = self.self_attn(
            normed_input, position_embeddings,
            past_key_value, use_cache, attention_mask
        )
        # First residual: added in place onto the attention output.
        attn_out += hidden_states
        # Second residual around the feed-forward sub-layer.
        ffn_out = self.mlp(self.post_attention_layernorm(attn_out))
        return attn_out + ffn_out, present_key_value
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class MiniMindModel(nn.Module):
    """MiniMind backbone: token embedding, a stack of Transformer layers, and a final RMSNorm."""
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.vocab_size = config.vocab_size
        self.num_hidden_layers = config.num_hidden_layers
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(config.dropout)
        self.layers = nn.ModuleList([MiniMindBlock(l, config) for l in range(self.num_hidden_layers)])
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # Precompute the RoPE cos/sin tables and keep them as non-persistent
        # buffers (they are not written to the state dict).
        freqs_cos, freqs_sin = precompute_freqs_cis(
            dim=config.head_dim, end=config.max_position_embeddings,
            rope_base=config.rope_theta, rope_scaling=config.rope_scaling
        )
        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
        self.register_buffer("freqs_sin", freqs_sin, persistent=False)

    def forward(self, input_ids, attention_mask=None, past_key_values=None, use_cache=False, **kwargs):
        """Encode ``input_ids`` into final hidden states.

        Args:
            input_ids: 2-D tensor of token ids ``(batch, seq_len)``.
            attention_mask: Optional mask forwarded unchanged to every layer.
            past_key_values: Optional per-layer list of ``(keys, values)``.
                Any object exposing a ``layers`` attribute is discarded and
                treated as "no cache" (presumably a framework cache object -
                TODO confirm).
            use_cache: Whether each layer should return its keys/values.
            **kwargs: Accepted and ignored.

        Returns:
            ``(hidden_states, presents, aux)``: the normalised hidden
            states, the list of per-layer caches (entries are None when
            ``use_cache`` is false), and a zero scalar placeholder.
        """
        batch_size, seq_length = input_ids.shape
        if hasattr(past_key_values, 'layers'):
            past_key_values = None
        past_key_values = past_key_values or [None] * len(self.layers)
        # Length of the cached prefix = offset of the first new position.
        start_pos = past_key_values[0][0].shape[1] if past_key_values[0] is not None else 0
        hidden_states = self.dropout(self.embed_tokens(input_ids))
        # Rebuild the RoPE buffers if they appear to have been lost (e.g.
        # after meta-device initialisation); a zero in the first cos entry
        # is used as the signal.
        if self.freqs_cos[0, 0] == 0:
            freqs_cos, freqs_sin = precompute_freqs_cis(
                dim=self.config.head_dim, end=self.config.max_position_embeddings,
                rope_base=self.config.rope_theta, rope_scaling=self.config.rope_scaling
            )
            self.freqs_cos, self.freqs_sin = freqs_cos.to(hidden_states.device), freqs_sin.to(hidden_states.device)
        # Select the table rows for the positions covered by this call.
        position_embeddings = (self.freqs_cos[start_pos:start_pos + seq_length], self.freqs_sin[start_pos:start_pos + seq_length])
        presents = []
        for layer, past_key_value in zip(self.layers, past_key_values):
            hidden_states, present = layer(
                hidden_states, position_embeddings,
                past_key_value=past_key_value, use_cache=use_cache,
                attention_mask=attention_mask
            )
            presents.append(present)
        hidden_states = self.norm(hidden_states)
        # Third element is a zero scalar on the same device/dtype.
        return hidden_states, presents, hidden_states.new_zeros(1).squeeze()
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class MiniMindForCausalLM(PreTrainedModel, GenerationMixin):
    """MiniMind causal language model: backbone plus LM head, for pre-training and inference."""
    config_class = MiniMindConfig
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}

    def __init__(self, config=None):
        """Build the model; a default ``MiniMindConfig`` is used when ``config`` is falsy."""
        self.config = config or MiniMindConfig()
        super().__init__(self.config)
        self.model = MiniMindModel(self.config)
        self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
        if self.config.tie_word_embeddings:
            # Share one weight tensor between the input embedding and the head.
            self.model.embed_tokens.weight = self.lm_head.weight
        self.post_init()

    def forward(self, input_ids, attention_mask=None, past_key_values=None, use_cache=False, logits_to_keep=0, labels=None, **kwargs):
        """Compute logits and, when ``labels`` is given, the next-token loss.

        Args:
            input_ids: Token ids ``(batch, seq_len)``.
            attention_mask: Optional mask forwarded to the backbone.
            past_key_values: Optional per-layer key/value cache.
            use_cache: Whether to return updated caches.
            logits_to_keep: An int keeps the last ``n`` positions (0 keeps
                all of them); any other value is used directly as an index
                along the sequence axis.
            labels: Optional target ids. They are shifted by one position
                here, so callers pass labels aligned with ``input_ids``;
                entries equal to -100 are ignored.

        Returns:
            ``MoeCausalLMOutputWithPast`` with ``loss``, ``aux_loss``,
            ``logits``, ``past_key_values`` and ``hidden_states`` (the final
            hidden-state tensor).
        """
        hidden_states, past_key_values, aux_loss = self.model(input_ids, attention_mask, past_key_values, use_cache, **kwargs)
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        loss = None
        if labels is not None:
            # Position t predicts token t + 1.
            x, y = logits[..., :-1, :].contiguous(), labels[..., 1:].contiguous()
            loss = F.cross_entropy(x.view(-1, x.size(-1)), y.view(-1), ignore_index=-100)
        return MoeCausalLMOutputWithPast(loss=loss, aux_loss=aux_loss, logits=logits, past_key_values=past_key_values, hidden_states=hidden_states)

    @torch.inference_mode()
    def generate(self, inputs=None, attention_mask=None, max_new_tokens=512, temperature=0.85, top_p=0.85, top_k=50, eos_token_id=2, streamer=None, use_cache=True, num_return_sequences=1, do_sample=True, repetition_penalty=1.0, **kwargs):
        """Autoregressive generation with top-k, top-p and repetition penalty.

        Args:
            inputs: Prompt token ids ``(batch, seq_len)``; an ``input_ids``
                keyword argument takes precedence when given.
            attention_mask: Optional prompt mask, extended by one column of
                ones per generated token.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Logits are divided by this value; it must be non-zero.
            top_p: Nucleus threshold; values >= 1.0 disable the filter.
            top_k: Keep only the ``top_k`` highest logits; values <= 0
                disable the filter. Values larger than the vocabulary are
                clamped to the vocabulary size.
            eos_token_id: Sequences that emitted this id keep emitting it;
                generation stops once every sequence has. None disables this.
            streamer: Optional object with ``put(tensor)`` and ``end()``.
            use_cache: Reuse key/value caches between steps.
            num_return_sequences: Each prompt row is repeated this many times.
            do_sample: Sample from the filtered distribution; otherwise argmax.
            repetition_penalty: Values other than 1.0 penalise tokens that
                already appear in the sequence.

        Returns:
            Tensor of prompt ids followed by the generated ids.
        """
        input_ids = kwargs.pop("input_ids", inputs).repeat(num_return_sequences, 1)
        attention_mask = attention_mask.repeat(num_return_sequences, 1) if attention_mask is not None else None
        past_key_values = kwargs.pop("past_key_values", None)
        finished = torch.zeros(input_ids.shape[0], dtype=torch.bool, device=input_ids.device)
        if streamer:
            streamer.put(input_ids.cpu())
        for _ in range(max_new_tokens):
            # Only feed tokens that are not already covered by the cache.
            past_len = past_key_values[0][0].shape[1] if past_key_values else 0
            outputs = self.forward(input_ids[:, past_len:], attention_mask, past_key_values, use_cache=use_cache, **kwargs)
            attention_mask = torch.cat([attention_mask, attention_mask.new_ones(attention_mask.shape[0], 1)], -1) if attention_mask is not None else None
            logits = outputs.logits[:, -1, :] / temperature
            # Repetition penalty: make already-seen tokens less likely.
            if repetition_penalty != 1.0:
                for i in range(input_ids.shape[0]):
                    seen = torch.unique(input_ids[i])
                    score = logits[i, seen]
                    logits[i, seen] = torch.where(score > 0, score / repetition_penalty, score * repetition_penalty)
            # Top-k filter. k is clamped because torch.topk raises when k
            # exceeds the size of the last dimension.
            if top_k > 0:
                k = min(top_k, logits.size(-1))
                logits[logits < torch.topk(logits, k)[0][..., -1, None]] = -float('inf')
            # Top-p (nucleus) filter.
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                mask = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1) > top_p
                # Shift right so the token that crosses the threshold is
                # kept, and never drop the single most likely token.
                mask[..., 1:], mask[..., 0] = mask[..., :-1].clone(), 0
                logits[mask.scatter(1, sorted_indices, mask)] = -float('inf')
            # Sample, or pick the arg-max when sampling is disabled.
            next_token = torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1) if do_sample else torch.argmax(logits, dim=-1, keepdim=True)
            if eos_token_id is not None:
                # Finished rows keep emitting EOS.
                next_token = torch.where(finished.unsqueeze(-1), next_token.new_full((next_token.shape[0], 1), eos_token_id), next_token)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            past_key_values = outputs.past_key_values if use_cache else None
            if streamer:
                streamer.put(next_token.cpu())
            if eos_token_id is not None:
                finished |= next_token.squeeze(-1).eq(eos_token_id)
                if finished.all():
                    break
        if streamer:
            streamer.end()
        return input_ids
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# PretrainDataset 定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import torch
|
|
6
|
+
from torch.utils.data import Dataset
|
|
7
|
+
|
|
8
|
+
class PretrainDataset(Dataset):
    """Pre-training dataset backed by a JSONL file.

    Each line of the file is expected to be a JSON object of the form
    ``{"text": "some text"}``. Blank lines, lines that are not valid JSON,
    lines that are not JSON objects, and entries whose ``text`` is missing,
    not a string, or only whitespace are skipped.

    Each item is ``(input_ids, labels)``, two ``torch.long`` tensors of
    length ``max_length``. ``labels`` is a copy of ``input_ids`` with the
    padding positions set to -100; it is NOT shifted here, the one-position
    shift for next-token prediction is left to the consumer.
    """

    def __init__(self, data_path, tokenizer, max_length=512):
        """Read every usable ``text`` field from ``data_path`` into memory.

        Args:
            data_path: Path to a UTF-8 encoded JSONL file.
            tokenizer: Callable returning an object with ``input_ids`` and
                exposing ``bos_token_id``, ``eos_token_id``, ``pad_token_id``.
            max_length: Fixed length of every returned sequence.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Valid JSON that is not an object (list, number, ...) or a
                # non-string "text" value is skipped rather than crashing.
                text = data.get('text') if isinstance(data, dict) else None
                if isinstance(text, str) and text.strip():
                    self.samples.append(text)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        text = self.samples[index]
        # Reserve two positions for the BOS and EOS markers.
        body_budget = self.max_length - 2
        tokens = self.tokenizer(
            str(text),
            add_special_tokens=False,
            max_length=body_budget,
            truncation=True
        ).input_ids
        # Defensive: enforce the budget even if the tokenizer did not truncate.
        tokens = list(tokens)[:max(body_budget, 0)]

        tokens = [self.tokenizer.bos_token_id] + tokens + [self.tokenizer.eos_token_id]
        real_length = len(tokens)

        # Pad to the fixed length. Tokenizers without a pad token fall back
        # to EOS; this is safe because padding is masked by position below.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id
        input_ids = tokens + [pad_id] * (self.max_length - real_length)
        input_ids = torch.tensor(input_ids, dtype=torch.long)

        # Mask padding by position, not by value: comparing against the pad
        # id would also hide real tokens when pad == EOS or when the pad id
        # occurs inside the text.
        labels = input_ids.clone()
        labels[real_length:] = -100

        return input_ids, labels
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# PoetryDataset 定义
|
|
2
|
+
# 从文档自动提取生成
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
from collections import Counter, defaultdict, deque
|
|
8
|
+
from torch.utils.data import Dataset, DataLoader
|
|
9
|
+
|
|
10
|
+
class PoetryDataset:
    """Classical Chinese poetry dataset for a character-level language model.

    Poems are read from the JSON files of the known collections under
    ``data_dir``, cleaned, filtered by length, and encoded as lists of
    integer character ids. Indexing returns ``(input, target)`` where the
    target is the input shifted by one character.
    """

    # Characters that survive cleaning: CJK ideographs U+4E00-U+9FA5 plus
    # common punctuation in both ASCII and full-width/CJK forms. Written
    # with escapes so the pattern cannot be altered by quote handling or
    # character-width normalisation of this source file.
    _CLEAN_RE = re.compile(
        r'[^\u4e00-\u9fa5'
        r',!?;:"\x27()'
        r'\uff0c\u3002\uff01\uff1f\u3001\uff1b\uff1a'
        r'\u201c\u201d\u2018\u2019\uff08\uff09]'
    )
    # Placeholders used for characters missing from the source text.
    _MISSING_MARKERS = ('\u25a1', '\u25a0')

    def __init__(self, data_dir, min_length=10, max_length=100, vocab_size=4000):
        """Load poems, build the vocabulary and encode every poem.

        Args:
            data_dir: Directory containing the collection sub-directories.
            min_length: Minimum cleaned poem length (characters) to keep.
            max_length: Maximum cleaned poem length (characters) to keep.
            vocab_size: Vocabulary size including ``<PAD>`` and ``<UNK>``.
        """
        self.min_length = min_length
        self.max_length = max_length
        self.vocab_size = vocab_size

        self.poems = self._load_poems(data_dir)
        print(f"加载完成: {len(self.poems)} 首诗词")

        self.char2idx, self.idx2char = self._build_vocab()
        print(f"词汇表大小: {len(self.char2idx)}")

        self.sequences = self._encode_poems()
        print(f"有效序列数: {len(self.sequences)}")

    def _load_poems(self, data_dir):
        """Return the cleaned, valid poems found under ``data_dir``.

        Missing collections are skipped. A file that cannot be read or
        parsed is reported and skipped; a malformed entry only drops that
        entry, not the rest of its file.
        """
        poems = []
        datasets = ['全唐诗', '宋词', '诗经', '楚辞']

        for dataset in datasets:
            dataset_path = os.path.join(data_dir, dataset)
            if not os.path.isdir(dataset_path):
                continue

            # os.listdir order is arbitrary; sorting keeps the poem order,
            # and therefore the vocabulary indices, reproducible.
            json_files = sorted(f for f in os.listdir(dataset_path) if f.endswith('.json'))

            for jf in json_files:
                file_path = os.path.join(dataset_path, jf)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                except (OSError, ValueError) as e:
                    # ValueError covers JSON and UTF-8 decoding errors.
                    print(f"加载 {jf} 失败: {e}")
                    continue

                if not isinstance(data, list):
                    print(f"加载 {jf} 失败: 顶层 JSON 不是列表")
                    continue

                for poem in data:
                    text = self._extract_text(poem)
                    if text and self._is_valid(text):
                        poems.append(text)

        return poems

    def _extract_text(self, poem):
        """Return the cleaned body of one poem record, or None if unusable.

        The body is taken from the first of ``text``, ``paragraphs`` or
        ``content`` that is present; it may be a string or a list of
        strings. Records that are not dicts, have none of these fields,
        hold non-text data, or contain a missing-character marker yield None.
        """
        if not isinstance(poem, dict):
            return None

        for field in ('text', 'paragraphs', 'content'):
            if field in poem:
                raw = poem[field]
                break
        else:
            return None

        if isinstance(raw, (list, tuple)):
            if not all(isinstance(part, str) for part in raw):
                return None
            raw = ''.join(raw)
        elif not isinstance(raw, str):
            return None

        # Must be checked before cleaning: the cleaning pattern removes the
        # markers, which would let incomplete poems through unnoticed.
        if any(marker in raw for marker in self._MISSING_MARKERS):
            return None

        # Keep Chinese characters and common punctuation (used for
        # sentence breaks); drop everything else.
        return self._CLEAN_RE.sub('', raw)

    def _is_valid(self, text):
        """Return True when ``text`` is within the length bounds and has no missing-character marker."""
        if not self.min_length <= len(text) <= self.max_length:
            return False
        return not any(marker in text for marker in self._MISSING_MARKERS)

    def _build_vocab(self):
        """Build ``(char2idx, idx2char)`` from the most frequent characters.

        Ids 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``; the remaining
        ``vocab_size - 2`` slots go to characters in descending frequency.
        """
        char_counter = Counter()
        for poem in self.poems:
            char_counter.update(poem)

        most_common = char_counter.most_common(self.vocab_size - 2)

        char2idx = {'<PAD>': 0, '<UNK>': 1}
        for i, (char, _) in enumerate(most_common, start=2):
            char2idx[char] = i

        idx2char = {idx: char for char, idx in char2idx.items()}
        return char2idx, idx2char

    def _encode_poems(self):
        """Encode every poem as a list of ids; unknown characters map to ``<UNK>``."""
        unk = self.char2idx['<UNK>']
        return [[self.char2idx.get(c, unk) for c in poem] for poem in self.poems]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        # Input drops the last character; target drops the first.
        return seq[:-1], seq[1:]
|
package/shared/svm/kernel_svm.py
CHANGED
package/shared/svm/simple_svm.py
CHANGED
package/shared/tree/ada_boost.py
CHANGED
package/src/commands/data.js
CHANGED
|
@@ -70,6 +70,15 @@ const DATASETS = [
|
|
|
70
70
|
format: 'git',
|
|
71
71
|
targetDir: 'datasets/chinese-poetry',
|
|
72
72
|
source: 'ModelScope (icyfenix)'
|
|
73
|
+
},
|
|
74
|
+
{
|
|
75
|
+
id: 'minimind-pretrain',
|
|
76
|
+
name: 'MiniMind Pretrain (LLM预训练语料)',
|
|
77
|
+
url: 'https://www.modelscope.cn/datasets/icyfenix/minimind_pretrain.git',
|
|
78
|
+
size: '~1.2GB',
|
|
79
|
+
format: 'git',
|
|
80
|
+
targetDir: 'datasets/minimind-pretrain',
|
|
81
|
+
source: 'ModelScope (icyfenix)'
|
|
73
82
|
}
|
|
74
83
|
]
|
|
75
84
|
|
package/version.json
CHANGED