@genai-fi/nanogpt 0.3.2 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.js +22 -22
- package/dist/MLP-KHhikThU.js +83 -0
- package/dist/NanoGPTModel.d.ts +2 -3
- package/dist/NanoGPTModel.js +79 -79
- package/dist/TeachableLLM.js +16 -13
- package/dist/axis_util-DeydwOoC.js +69 -0
- package/dist/{concat-BIZS_td9.js → concat-DS_qH7MI.js} +5 -5
- package/dist/config.js +7 -8
- package/dist/{gather-BPGW8RsB.js → gather-BUmJIS8n.js} +1 -1
- package/dist/{index-pWA4_lUh.js → index-XjBAhiFO.js} +1272 -1174
- package/dist/layers/BaseLayer.d.ts +14 -2
- package/dist/layers/BaseLayer.js +9 -9
- package/dist/layers/CausalSelfAttention.d.ts +4 -8
- package/dist/layers/CausalSelfAttention.js +106 -80
- package/dist/layers/MLP.d.ts +2 -3
- package/dist/layers/MLP.js +5 -62
- package/dist/layers/RMSNorm.d.ts +2 -2
- package/dist/layers/RMSNorm.js +11 -11
- package/dist/layers/RoPECache.js +3 -3
- package/dist/layers/TiedEmbedding.js +7 -6
- package/dist/layers/TransformerBlock.d.ts +2 -6
- package/dist/layers/TransformerBlock.js +9 -12
- package/dist/{sum-C7Mgy9Bw.js → log_sum_exp-DJPkVZZn.js} +32 -54
- package/dist/main.js +22 -19
- package/dist/{mat_mul-D7_a4KJn.js → mat_mul-CKwFEV1Q.js} +1 -1
- package/dist/max-DJvEiCAJ.js +25 -0
- package/dist/moments-CrWRPcR3.js +53 -0
- package/dist/norm-BzY929B_.js +86 -0
- package/dist/{ones-Cog-G2ag.js → ones-BO01zpJG.js} +2 -2
- package/dist/ops/appendCache.js +1 -1
- package/dist/ops/attentionMask.js +1 -1
- package/dist/ops/cpu/appendCache.js +2 -2
- package/dist/ops/cpu/attentionMask.js +2 -2
- package/dist/ops/cpu/fusedSoftmax.d.ts +9 -0
- package/dist/ops/cpu/fusedSoftmax.js +23 -0
- package/dist/ops/cpu/gatherSub.js +3 -3
- package/dist/ops/cpu/mulDropout.d.ts +1 -0
- package/dist/ops/cpu/mulDropout.js +17 -0
- package/dist/ops/cpu/qkv.js +3 -3
- package/dist/ops/cpu/rope.js +5 -5
- package/dist/ops/cpu/scatterSub.js +27 -27
- package/dist/ops/fusedSoftmax.d.ts +2 -0
- package/dist/ops/fusedSoftmax.js +10 -0
- package/dist/ops/gatherSub.js +1 -1
- package/dist/ops/grads/attentionMask.js +1 -1
- package/dist/ops/grads/fusedSoftmax.d.ts +2 -0
- package/dist/ops/grads/fusedSoftmax.js +17 -0
- package/dist/ops/grads/qkv.js +1 -1
- package/dist/ops/grads/rope.js +1 -1
- package/dist/ops/mulDrop.d.ts +2 -0
- package/dist/ops/mulDrop.js +9 -0
- package/dist/ops/node/sparseCrossEntropy.js +1 -1
- package/dist/ops/qkv.js +1 -1
- package/dist/ops/scatterSub.js +1 -1
- package/dist/ops/webgl/appendCache.js +1 -1
- package/dist/ops/webgl/attentionMask.js +1 -1
- package/dist/ops/webgl/fusedSoftmax.d.ts +11 -0
- package/dist/ops/webgl/fusedSoftmax.js +3930 -0
- package/dist/ops/webgl/gatherSub.js +1 -1
- package/dist/ops/webgl/mulDropout.d.ts +1 -0
- package/dist/ops/webgl/mulDropout.js +41 -0
- package/dist/ops/webgl/qkv.js +1 -1
- package/dist/ops/webgl/rope.js +1 -1
- package/dist/ops/webgl/scatterSub.js +1 -1
- package/dist/{random_width-oeUIlUZj.js → random_width-CMHmdbSu.js} +4212 -6630
- package/dist/{range-CcDl05lo.js → range-DQMNzBWs.js} +1 -1
- package/dist/{reshape-C8CR_Bad.js → reshape-DFzh97Sc.js} +1 -1
- package/dist/{sin-BJIrfnj7.js → sin-BYM-U4Ut.js} +1 -1
- package/dist/slice_util-CnVNPQI-.js +90 -0
- package/dist/softmax-4DOn6cPq.js +28 -0
- package/dist/{split-DZbvruEP.js → split-CkbeVdF8.js} +3 -3
- package/dist/{stack-BMm-efee.js → stack-DaIMO5iX.js} +1 -1
- package/dist/sum-C6u3xMi3.js +27 -0
- package/dist/{tensor-DJVbYhh1.js → tensor-Cu1fU7H7.js} +1 -1
- package/dist/{tensor2d-ZuQSh2D-.js → tensor2d-D0CKdG6B.js} +1 -1
- package/dist/tfjs_backend-Bzl2SrRo.js +2460 -0
- package/dist/training/AdamExt.js +1 -1
- package/dist/training/DatasetBuilder.js +3 -3
- package/dist/training/FullTrainer.js +1 -1
- package/dist/training/Trainer.js +13 -12
- package/dist/training/sparseCrossEntropy.js +12 -11
- package/dist/utilities/dummy.js +8 -8
- package/dist/utilities/generate.js +11 -11
- package/dist/utilities/load.js +1 -1
- package/dist/utilities/profile.js +1 -1
- package/dist/utilities/weights.js +2 -2
- package/dist/{variable-Dl_ub3pk.js → variable-BS4AKqNU.js} +1 -1
- package/dist/{zeros-CCy9C3uU.js → zeros-CmJFiC84.js} +1 -1
- package/package.json +1 -1
- package/dist/exports_layers-tbTBcwMM.js +0 -25
- package/dist/layers/LayerNorm.d.ts +0 -13
- package/dist/layers/LayerNorm.js +0 -33
- package/dist/moments-DfcpfwKi.js +0 -132
- package/dist/softmax-Be_lsqUc.js +0 -105
- package/dist/training/LayerTrainer.d.ts +0 -29
- package/dist/training/LayerTrainer.js +0 -95
- package/dist/training/lwSchedule.d.ts +0 -7
- package/dist/training/lwSchedule.js +0 -162
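Among the additions, each new op (`fusedSoftmax`, `mulDropout`) ships as a generic entry point under `ops/`, per-backend kernels under `ops/cpu/` and `ops/webgl/`, and a gradient registration under `ops/grads/`. As a rough illustration of what a fused softmax buys — the backward pass needs only the saved forward output — here is the standard formulation via `tf.customGrad`. This is a sketch only, not the package's actual kernels:

```js
import * as tf from "@tensorflow/tfjs";

// Illustrative only: softmax with a custom gradient. A fused kernel can
// back-propagate from the saved forward output alone, which is why the op
// ships cpu/webgl kernels plus a grads/ registration.
const fusedSoftmax = tf.customGrad((logits, save) => {
  const y = tf.softmax(logits, -1);
  save([y]); // keep the forward output for the backward pass
  return {
    value: y,
    // dx_i = y_i * (dy_i - sum_j(dy_j * y_j)) along the softmax axis
    gradFunc: (dy, saved) => {
      const [yOut] = saved;
      const dot = tf.sum(tf.mul(dy, yOut), -1, true);
      return [tf.mul(yOut, tf.sub(dy, dot))];
    }
  };
});

// usage: const probs = fusedSoftmax(attnScores);
```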
package/dist/training/LayerTrainer.js

```diff
@@ -1,95 +0,0 @@
-import { generateText as u } from "../utilities/generate.js";
-import v from "./Trainer.js";
-import { schedule as w } from "./lwSchedule.js";
-import T from "./Evaluator.js";
-import { a as m } from "../index-pWA4_lUh.js";
-const x = {
-  desiredLoss: 0.01,
-  logInterval: 1,
-  stepsPerLayer: 400,
-  maxPasses: 3,
-  maxSteps: 1e3
-};
-class E extends v {
-  trainingPattern = [];
-  startPass = 0;
-  startLayer = 0;
-  constructor(a, r, e = 3e-4) {
-    if (super(a, r, e), this.trainingPattern = w[a.config.nLayer - 1] || [], a.log.length > 0) {
-      const i = a.log[a.log.length - 1];
-      i.pass !== void 0 && i.layer !== void 0 && (this.startPass = i.pass, this.startLayer = i.layer, console.log(`Resuming training from pass ${this.startPass}, layer ${this.startLayer}`));
-    }
-  }
-  applyTrainingPattern(a) {
-    const r = a < this.trainingPattern.length ? a : this.trainingPattern.length - 1, e = this.trainingPattern[r];
-    this.model.setSkipMask(e.skip), this.model.setTrainableMask(e.trainable), this.resetOptimizer(e.adam), console.log("Applied training pattern:", r, e);
-  }
-  // Train for multiple epochs using Dataset API - FIXED memory leaks
-  async trainOnDataset(a, r, e) {
-    const { desiredLoss: i, logInterval: L, stepsPerLayer: d, onLayerChange: l, onPassComplete: p, onStep: h, prompt: c } = {
-      ...x,
-      ...r
-    }, t = {
-      pass: 0,
-      layerStep: 0,
-      step: 0,
-      stepSinceLayerChange: 0,
-      lastLoss: 1e6,
-      totalSteps: 0,
-      losses: [],
-      validationLosses: [],
-      trainingDuration: 0
-    };
-    this.dummyPass();
-    const S = Date.now();
-    this.startPass = 0, this.startLayer = 0;
-    const g = e ? new T(this.model, e) : void 0, f = await a.iterator();
-    this.applyTrainingPattern(t.layerStep % this.trainingPattern.length);
-    try {
-      for (; !(t.lastLoss < i); ) {
-        const o = await f.next();
-        if (o.done) break;
-        const y = o.value, P = this.trainBatch(t, y);
-        t.stepSinceLayerChange++;
-        const n = {
-          loss: t.lastLoss,
-          step: t.step,
-          time: Date.now() - S,
-          batchSize: y.xs.shape[0],
-          pass: t.pass,
-          layer: t.layerStep % this.model.config.nLayer
-        };
-        if (this.model.log.push(n), t.step % L === 0) {
-          if (await P, g)
-            try {
-              const s = await g.evaluate(5);
-              t.validationLosses.push(s), n.valLoss = s;
-            } catch (s) {
-              console.error("Validation error:", s);
-            }
-          if (h) {
-            if (c) {
-              const s = await u(this.tokenizer, this.model, c, 100, {
-                temperature: 0.8,
-                topK: 10
-              });
-              n.example = s;
-            }
-            await h(n, {
-              duration: t.trainingDuration,
-              totalSamples: t.totalSteps * n.batchSize,
-              samplesPerSecond: t.totalSteps * n.batchSize / (t.trainingDuration / 1e3)
-            });
-          }
-        }
-        t.stepSinceLayerChange >= d && (t.layerStep++, t.layerStep % this.model.config.nLayer === 0 ? (l && await l(t.layerStep, t.pass), p && await p(t.pass), t.pass++) : l && await l(t.layerStep, t.pass), t.stepSinceLayerChange = 0, this.applyTrainingPattern(t.layerStep % this.trainingPattern.length));
-      }
-    } catch (o) {
-      throw console.error("Training error:", o), m(), o;
-    }
-    return m(), { losses: t.losses, validationLosses: t.validationLosses };
-  }
-}
-export {
-  E as default
-};
```
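Deminified for readability, the removed trainer's core rotation is sketched below; the identifier names here are reconstructions, not the original source (only `setSkipMask`, `setTrainableMask`, and `resetOptimizer` appear verbatim in the diff above):

```js
// Reconstructed (deminified) sketch of the removed rotation logic.
function applyTrainingPattern(model, trainer, patterns, index) {
  // Clamp to the last pattern instead of reading past the end.
  const i = index < patterns.length ? index : patterns.length - 1;
  const p = patterns[i];
  model.setSkipMask(p.skip);           // drop masked blocks from the forward pass
  model.setTrainableMask(p.trainable); // freeze every other block's weights
  trainer.resetOptimizer(p.adam);      // fresh Adam state for the new phase
}

// After every `stepsPerLayer` batches the loop advanced the rotation:
//   layerStep++;
//   if (layerStep % nLayer === 0) pass++;   // finished one sweep of all blocks
//   applyTrainingPattern(model, trainer, patterns, layerStep % patterns.length);
// and training continued until lastLoss < desiredLoss or the dataset ended.
```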
package/dist/training/lwSchedule.js

```diff
@@ -1,162 +0,0 @@
-const e = [
-  [
-    {
-      adam: {
-        learningRateFactor: 1,
-        beta1: 0.9,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!1],
-      trainable: [!0]
-    }
-  ],
-  [
-    {
-      adam: {
-        learningRateFactor: 1,
-        beta1: 0.9,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!0, !1],
-      trainable: [!1, !0]
-    },
-    {
-      adam: {
-        learningRateFactor: 1,
-        beta1: 0.9,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!1, !1],
-      trainable: [!0, !1]
-    },
-    {
-      adam: {
-        learningRateFactor: 0.3333333333333333,
-        beta1: 0.95,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!1, !1],
-      trainable: [!0, !0]
-    }
-  ],
-  [],
-  [
-    {
-      adam: {
-        learningRateFactor: 1,
-        beta1: 0.9,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!0, !0, !0, !1],
-      trainable: [!1, !1, !1, !0]
-    },
-    {
-      adam: {
-        learningRateFactor: 1,
-        beta1: 0.9,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!0, !0, !1, !1],
-      trainable: [!1, !1, !0, !1]
-    },
-    {
-      adam: {
-        learningRateFactor: 0.3333333333333333,
-        beta1: 0.95,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!0, !0, !1, !1],
-      trainable: [!1, !1, !1, !0]
-    },
-    {
-      adam: {
-        learningRateFactor: 1,
-        beta1: 0.9,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!0, !1, !1, !1],
-      trainable: [!1, !0, !1, !1]
-    },
-    {
-      adam: {
-        learningRateFactor: 0.3333333333333333,
-        beta1: 0.95,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!0, !1, !1, !1],
-      trainable: [!1, !1, !0, !1]
-    },
-    {
-      adam: {
-        learningRateFactor: 0.16666666666666666,
-        beta1: 0.98,
-        beta2: 0.9999,
-        epsilon: 1e-8
-      },
-      skip: [!0, !1, !1, !1],
-      trainable: [!1, !1, !1, !0]
-    },
-    {
-      adam: {
-        learningRateFactor: 1,
-        beta1: 0.9,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!1, !1, !1, !1],
-      trainable: [!0, !1, !1, !1]
-    },
-    {
-      adam: {
-        learningRateFactor: 0.3333333333333333,
-        beta1: 0.95,
-        beta2: 0.999,
-        epsilon: 1e-8
-      },
-      skip: [!1, !1, !1, !1],
-      trainable: [!1, !0, !1, !1]
-    },
-    {
-      adam: {
-        learningRateFactor: 0.16666666666666666,
-        beta1: 0.98,
-        beta2: 0.9999,
-        epsilon: 1e-8
-      },
-      skip: [!1, !1, !1, !1],
-      trainable: [!1, !1, !0, !1]
-    },
-    {
-      adam: {
-        learningRateFactor: 0.16666666666666666,
-        beta1: 0.98,
-        beta2: 0.9999,
-        epsilon: 1e-8
-      },
-      skip: [!1, !1, !1, !1],
-      trainable: [!1, !1, !1, !0]
-    },
-    {
-      adam: {
-        learningRateFactor: 0.16666666666666666,
-        beta1: 0.98,
-        beta2: 0.9999,
-        epsilon: 1e-8
-      },
-      skip: [!1, !1, !1, !1],
-      trainable: [!0, !0, !0, !0]
-    }
-  ]
-];
-export {
-  e as schedule
-};
```
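In the minified output, `!0` and `!1` are `true` and `false`, and the schedule is indexed by layer count: `LayerTrainer`'s constructor above reads `w[a.config.nLayer - 1]`, so indices 0, 1, and 3 hold schedules for 1-, 2-, and 4-layer models, while the empty array at index 2 leaves 3-layer models without one. Each phase has the following shape (the typedef name is invented for illustration):

```js
/**
 * Shape of one schedule phase (name is hypothetical).
 * @typedef {Object} LayerwisePhase
 * @property {{learningRateFactor: number, beta1: number, beta2: number,
 *             epsilon: number}} adam  Adam settings applied via resetOptimizer
 * @property {boolean[]} skip       per block: bypass it in the forward pass
 * @property {boolean[]} trainable  per block: update its weights this phase
 */

// Example, decoded from the first 4-layer entry above: train only the last
// block while the first three are skipped entirely.
const phase = {
  adam: { learningRateFactor: 1, beta1: 0.9, beta2: 0.999, epsilon: 1e-8 },
  skip: [true, true, true, false],
  trainable: [false, false, false, true]
};
```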