@genai-fi/nanogpt 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.d.ts +1 -0
- package/dist/Generator.js +10 -10
- package/dist/NanoGPTModel.js +80 -59
- package/dist/TeachableLLM.js +3 -3
- package/dist/layers/RoPECache.js +24 -19
- package/dist/main.d.ts +1 -0
- package/dist/main.js +4 -2
- package/package.json +1 -1
package/dist/Generator.d.ts
CHANGED
|
@@ -3,6 +3,7 @@ import { ITokeniser } from './tokeniser/type';
|
|
|
3
3
|
import { default as EE } from 'eventemitter3';
|
|
4
4
|
export interface IGenerateOptions extends GenerateOptions {
|
|
5
5
|
maxLength?: number;
|
|
6
|
+
noCache?: boolean;
|
|
6
7
|
}
|
|
7
8
|
export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
|
|
8
9
|
private readonly model;
|
package/dist/Generator.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { E as u } from "./index-SOhdqzHq.js";
|
|
2
|
-
class k extends u {
|
|
2
|
+
class p extends u {
|
|
3
3
|
constructor(s, e) {
|
|
4
4
|
super(), this.model = s, this.tokeniser = e;
|
|
5
5
|
}
|
|
@@ -14,10 +14,10 @@ class k extends u {
|
|
|
14
14
|
const {
|
|
15
15
|
output: o,
|
|
16
16
|
attention: c,
|
|
17
|
-
probabilities:
|
|
18
|
-
} = this.model.generate(t, void 0, e),
|
|
19
|
-
t = this.model.tf.concat([t, o], 1),
|
|
20
|
-
const r = await this.processResponse(o, c,
|
|
17
|
+
probabilities: h
|
|
18
|
+
} = this.model.generate(t, void 0, e), l = t;
|
|
19
|
+
t = this.model.tf.concat([t, o], 1), l.dispose();
|
|
20
|
+
const r = await this.processResponse(o, c, h);
|
|
21
21
|
if (o.dispose(), r === null)
|
|
22
22
|
break;
|
|
23
23
|
n += r;
|
|
@@ -40,14 +40,14 @@ class k extends u {
|
|
|
40
40
|
for (let o = 0; o < i; o++) {
|
|
41
41
|
const {
|
|
42
42
|
output: c,
|
|
43
|
-
attention:
|
|
44
|
-
probabilities:
|
|
43
|
+
attention: h,
|
|
44
|
+
probabilities: l
|
|
45
45
|
} = this.model.generate(t, a, {
|
|
46
46
|
...e,
|
|
47
47
|
usePadding: !1
|
|
48
48
|
});
|
|
49
49
|
t.dispose(), t = c;
|
|
50
|
-
const r = await this.processResponse(c,
|
|
50
|
+
const r = await this.processResponse(c, h, l);
|
|
51
51
|
if (r === null)
|
|
52
52
|
break;
|
|
53
53
|
n += r;
|
|
@@ -56,10 +56,10 @@ class k extends u {
|
|
|
56
56
|
}
|
|
57
57
|
async generate(s, e) {
|
|
58
58
|
this.emit("start");
|
|
59
|
-
const t = this.model.config.useRope ? this.generateCache(s, e) : this.generateNoCache(s, e);
|
|
59
|
+
const t = this.model.config.useRope && !e?.noCache ? this.generateCache(s, e) : this.generateNoCache(s, e);
|
|
60
60
|
return this.emit("stop"), t;
|
|
61
61
|
}
|
|
62
62
|
}
|
|
63
63
|
export {
|
|
64
|
-
|
|
64
|
+
p as default
|
|
65
65
|
};
|
package/dist/NanoGPTModel.js
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { defaultConfig as
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import
|
|
5
|
-
import
|
|
6
|
-
class F {
|
|
1
|
+
import { defaultConfig as z } from "./config.js";
|
|
2
|
+
import $ from "./layers/TransformerBlock.js";
|
|
3
|
+
import S from "./layers/TiedEmbedding.js";
|
|
4
|
+
import I from "./layers/RoPECache.js";
|
|
5
|
+
import _ from "./layers/RMSNorm.js";
|
|
6
|
+
class M {
|
|
7
7
|
config;
|
|
8
8
|
wte;
|
|
9
9
|
// Token embeddings
|
|
@@ -19,7 +19,7 @@ class F {
|
|
|
19
19
|
log = [];
|
|
20
20
|
// Training log
|
|
21
21
|
constructor(t, e = {}) {
|
|
22
|
-
this.tf = t, this.config = { ...
|
|
22
|
+
this.tf = t, this.config = { ...z, ...e }, this.wte = new S(t, {
|
|
23
23
|
vocabSize: this.config.vocabSize,
|
|
24
24
|
embedDim: this.config.nEmbed,
|
|
25
25
|
name: "token_embedding"
|
|
@@ -28,10 +28,10 @@ class F {
|
|
|
28
28
|
outputDim: this.config.nEmbed,
|
|
29
29
|
name: "positional_embedding",
|
|
30
30
|
embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
|
|
31
|
-
}) : this.ropeCache = new
|
|
32
|
-
for (let
|
|
33
|
-
this.blocks.push(new
|
|
34
|
-
this.lnF = new
|
|
31
|
+
}) : this.ropeCache = new I(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
|
|
32
|
+
for (let o = 0; o < this.config.nLayer; o++)
|
|
33
|
+
this.blocks.push(new $(this.tf, o, this.config, this.ropeCache));
|
|
34
|
+
this.lnF = new _(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
|
|
35
35
|
}
|
|
36
36
|
get variables() {
|
|
37
37
|
return [
|
|
@@ -54,17 +54,17 @@ class F {
|
|
|
54
54
|
this.blocks[e].loadWeights(t);
|
|
55
55
|
this.lnF.setWeights(t.get("final_rms_norm") || []);
|
|
56
56
|
}
|
|
57
|
-
inputPhase(t, e,
|
|
57
|
+
inputPhase(t, e, o = !1) {
|
|
58
58
|
return this.tf.tidy(() => {
|
|
59
|
-
const
|
|
59
|
+
const i = this.wte.embed(t);
|
|
60
60
|
if (this.config.useRope === !1) {
|
|
61
|
-
const [,
|
|
62
|
-
this.tf.add(
|
|
63
|
-
this.tf.scalar(
|
|
64
|
-
),
|
|
65
|
-
return this.drop.apply(
|
|
61
|
+
const [, s] = t.shape, r = this.config.blockSize, l = this.tf.range(0, s, 1, "int32"), n = this.tf.mod(
|
|
62
|
+
this.tf.add(l, this.tf.scalar(e, "int32")),
|
|
63
|
+
this.tf.scalar(r, "int32")
|
|
64
|
+
), h = this.wpe.apply(n), c = i.add(h);
|
|
65
|
+
return this.drop.apply(c, { training: o });
|
|
66
66
|
} else
|
|
67
|
-
return this.drop.apply(
|
|
67
|
+
return this.drop.apply(i, { training: o });
|
|
68
68
|
});
|
|
69
69
|
}
|
|
70
70
|
setSkipMask(t) {
|
|
@@ -95,8 +95,8 @@ class F {
|
|
|
95
95
|
calculateLoss(t, e) {
|
|
96
96
|
try {
|
|
97
97
|
return this.tf.losses.softmaxCrossEntropy(e, t, this.tf.Reduction.MEAN);
|
|
98
|
-
} catch (
|
|
99
|
-
throw console.error("Error computing loss:",
|
|
98
|
+
} catch (o) {
|
|
99
|
+
throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
|
|
100
100
|
}
|
|
101
101
|
}
|
|
102
102
|
// Attention rollout per Abnar & Zuidema (2020)
|
|
@@ -105,67 +105,88 @@ class F {
|
|
|
105
105
|
return this.tf.tidy(() => {
|
|
106
106
|
if (t.length === 0)
|
|
107
107
|
throw new Error("No attentions for rollout");
|
|
108
|
-
const e
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
108
|
+
const [e, o, i] = t[0].shape;
|
|
109
|
+
for (const s of t) {
|
|
110
|
+
const [r, l, n] = s.shape;
|
|
111
|
+
if (r !== e || l !== o || n !== i)
|
|
112
|
+
throw new Error(
|
|
113
|
+
`Inconsistent attention shapes in rollout: expected [${e},${o},${i}] got [${r},${l},${n}]`
|
|
114
|
+
);
|
|
113
115
|
}
|
|
114
|
-
|
|
116
|
+
if (o === i) {
|
|
117
|
+
const s = this.tf.eye(i, i).expandDims(0);
|
|
118
|
+
let r = s.tile([e, 1, 1]);
|
|
119
|
+
for (const l of t) {
|
|
120
|
+
const n = l.add(s);
|
|
121
|
+
r = n.div(n.sum(-1, !0)).matMul(r);
|
|
122
|
+
}
|
|
123
|
+
return r;
|
|
124
|
+
}
|
|
125
|
+
if (o === 1) {
|
|
126
|
+
let s = null;
|
|
127
|
+
const r = this.tf.tensor1d([i - 1], "int32"), l = this.tf.oneHot(r, i).reshape([1, 1, i]).tile([e, 1, 1]);
|
|
128
|
+
r.dispose();
|
|
129
|
+
for (const n of t) {
|
|
130
|
+
let h = n.add(l);
|
|
131
|
+
h = h.div(h.sum(-1, !0)), s == null ? s = h : (s = s.mul(h), s = s.div(s.sum(-1, !0)));
|
|
132
|
+
}
|
|
133
|
+
return s;
|
|
134
|
+
}
|
|
135
|
+
throw new Error(`Unsupported attention shapes for rollout: [B=${e}, Q=${o}, K=${i}]`);
|
|
115
136
|
});
|
|
116
137
|
}
|
|
117
|
-
forward(t, e,
|
|
138
|
+
forward(t, e, o = !1, i = !1, s) {
|
|
118
139
|
return this.validateInput(t), this.tf.tidy(() => {
|
|
119
|
-
const
|
|
120
|
-
let
|
|
121
|
-
const
|
|
122
|
-
if (
|
|
123
|
-
throw console.error("Cache",
|
|
124
|
-
for (let
|
|
125
|
-
const d = this.blocks[
|
|
140
|
+
const r = s?.[0]?.length ?? 0;
|
|
141
|
+
let l = this.inputPhase(t, r, o);
|
|
142
|
+
const n = [];
|
|
143
|
+
if (s && s.length !== this.blocks.length)
|
|
144
|
+
throw console.error("Cache", s), new Error(`Cache length ${s.length} does not match number of blocks ${this.blocks.length}`);
|
|
145
|
+
for (let a = 0; a < this.blocks.length; a++) {
|
|
146
|
+
const d = this.blocks[a], {
|
|
126
147
|
output: g,
|
|
127
|
-
attention:
|
|
148
|
+
attention: m,
|
|
128
149
|
cache: p
|
|
129
|
-
} = d.call(
|
|
130
|
-
|
|
150
|
+
} = d.call(l, o, i, s ? s[a] : void 0);
|
|
151
|
+
l = g, i && m && n.push(m), s && p ? (s[a]?.k.dispose(), s[a]?.v.dispose(), s[a] = p) : p && (p.k.dispose(), p.v.dispose());
|
|
131
152
|
}
|
|
132
|
-
let
|
|
133
|
-
|
|
134
|
-
const
|
|
153
|
+
let h;
|
|
154
|
+
i && n.length > 0 && (h = this.computeAttentionRollout(n)), l = this.lnF.apply(l);
|
|
155
|
+
const c = this.wte.project(l);
|
|
135
156
|
let f;
|
|
136
|
-
return e && (f = this.calculateLoss(
|
|
157
|
+
return e && (f = this.calculateLoss(c, e)), { logits: c, loss: f, attention: i ? h : void 0 };
|
|
137
158
|
});
|
|
138
159
|
}
|
|
139
|
-
generate(t, e,
|
|
140
|
-
const
|
|
160
|
+
generate(t, e, o) {
|
|
161
|
+
const i = o?.temperature ?? 1, s = o?.topK, r = o?.usePadding ?? !1, l = o?.includeAttention ?? !1;
|
|
141
162
|
return this.tf.tidy(() => {
|
|
142
|
-
const
|
|
143
|
-
[0,
|
|
144
|
-
[
|
|
145
|
-
), f =
|
|
163
|
+
const n = t, h = n.shape[1], c = h <= this.config.blockSize ? n : n.slice(
|
|
164
|
+
[0, h - this.config.blockSize],
|
|
165
|
+
[n.shape[0], this.config.blockSize]
|
|
166
|
+
), f = r ? this.config.blockSize - c.shape[1] : 0, a = f > 0 ? this.tf.pad(c, [
|
|
146
167
|
[0, 0],
|
|
147
168
|
[0, f]
|
|
148
|
-
]) :
|
|
149
|
-
let
|
|
150
|
-
if (
|
|
151
|
-
const { values: E, indices:
|
|
152
|
-
|
|
169
|
+
]) : c, { logits: d, attention: g } = this.forward(a, void 0, !1, l, e), m = d.shape[1] - 1 - f, p = d.slice([0, m, 0], [d.shape[0], 1, d.shape[2]]), w = g ? g.slice([0, m, 0], [g.shape[0], 1, g.shape[2]]) : void 0, u = p.div(i);
|
|
170
|
+
let b;
|
|
171
|
+
if (s) {
|
|
172
|
+
const { values: E, indices: v } = this.tf.topk(u, s), y = this.tf.multinomial(E.squeeze([1]), 1);
|
|
173
|
+
b = this.tf.gather(v.squeeze([1]), y, 1);
|
|
153
174
|
} else
|
|
154
|
-
|
|
175
|
+
b = this.tf.multinomial(u.squeeze([1]), 1);
|
|
155
176
|
let k;
|
|
156
|
-
return
|
|
177
|
+
return o?.includeProbabilities && (k = this.tf.softmax(u.squeeze([1]))), b = b.reshape([1, 1]), { output: b, attention: w?.squeeze([1]), probabilities: k };
|
|
157
178
|
});
|
|
158
179
|
}
|
|
159
180
|
getNumParams() {
|
|
160
181
|
const t = this.config.vocabSize * this.config.nEmbed + this.config.blockSize * this.config.nEmbed, e = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // qkv + proj
|
|
161
|
-
2 * this.config.nEmbed),
|
|
162
|
-
this.config.nEmbed * 4 * this.config.nEmbed),
|
|
163
|
-
return t + e +
|
|
182
|
+
2 * this.config.nEmbed), o = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
|
|
183
|
+
this.config.nEmbed * 4 * this.config.nEmbed), i = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
|
|
184
|
+
return t + e + o + i;
|
|
164
185
|
}
|
|
165
186
|
dispose() {
|
|
166
187
|
this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
|
|
167
188
|
}
|
|
168
189
|
}
|
|
169
190
|
export {
|
|
170
|
-
|
|
191
|
+
M as default
|
|
171
192
|
};
|
package/dist/TeachableLLM.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import d from "./
|
|
2
|
-
import
|
|
1
|
+
import { defaultConfig as d } from "./config.js";
|
|
2
|
+
import u from "./NanoGPTModel.js";
|
|
3
3
|
import { saveModel as m } from "./utilities/save.js";
|
|
4
4
|
import { loadModel as l } from "./utilities/load.js";
|
|
5
5
|
import f from "./Generator.js";
|
|
@@ -58,7 +58,7 @@ class a extends c {
|
|
|
58
58
|
}), e;
|
|
59
59
|
}
|
|
60
60
|
static create(t, r = {}) {
|
|
61
|
-
const e = { ...
|
|
61
|
+
const e = { ...d, ...r }, s = new g(e.vocabSize), o = new u(t, e), i = new a(t, s, o);
|
|
62
62
|
return i.setStatus("warmup"), h(o).then(() => {
|
|
63
63
|
i.tokeniser.trained ? i.setStatus("ready") : (i.setStatus("awaitingTokens"), i.tokeniser.once("trainStatus", (n) => {
|
|
64
64
|
n === "trained" && i.setStatus("ready");
|
package/dist/layers/RoPECache.js
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
|
-
class E {
|
|
2
|
-
constructor(
|
|
3
|
-
this.tf =
|
|
4
|
-
const
|
|
5
|
-
if (this.rotaryDim =
|
|
1
|
+
class b {
|
|
2
|
+
constructor(s, r) {
|
|
3
|
+
this.tf = s, this.config = r;
|
|
4
|
+
const o = this.config.nEmbed / this.config.nHead;
|
|
5
|
+
if (this.rotaryDim = o, this.rotaryDim % 2 !== 0)
|
|
6
6
|
throw new Error("rotaryDim must be even");
|
|
7
7
|
this.ropeBase = 1e4;
|
|
8
|
-
const
|
|
9
|
-
this.ropeInvFreq = this.tf.reciprocal(
|
|
8
|
+
const i = this.tf.range(0, this.rotaryDim, 2, "float32"), t = i.div(this.tf.scalar(this.rotaryDim, "float32")), e = this.tf.pow(this.tf.scalar(this.ropeBase, "float32"), t);
|
|
9
|
+
this.ropeInvFreq = this.tf.reciprocal(e), t.dispose(), e.dispose(), i.dispose(), this.config.useRope === !1 ? (this.ropeCos = null, this.ropeSin = null, this.ropeCacheLen = 0) : this.tf.tidy(() => {
|
|
10
|
+
this.ensureRopeCache(this.config.blockSize * 4);
|
|
11
|
+
});
|
|
10
12
|
}
|
|
11
13
|
rotaryDim;
|
|
12
14
|
ropeBase;
|
|
@@ -16,24 +18,27 @@ class E {
|
|
|
16
18
|
ropeSin = null;
|
|
17
19
|
// [cacheLen, rotaryDim/2]
|
|
18
20
|
ropeCacheLen = 0;
|
|
19
|
-
ensureRopeCache(
|
|
20
|
-
if (
|
|
21
|
+
ensureRopeCache(s) {
|
|
22
|
+
if (s <= this.ropeCacheLen) return;
|
|
21
23
|
this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose();
|
|
22
|
-
const
|
|
23
|
-
this.ropeCos = this.tf.keep(this.tf.cos(
|
|
24
|
+
const o = this.tf.range(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
|
|
25
|
+
this.ropeCos = this.tf.keep(this.tf.cos(o).expandDims(-1)), this.ropeSin = this.tf.keep(this.tf.sin(o).expandDims(-1)), this.ropeCacheLen = s;
|
|
24
26
|
}
|
|
25
|
-
applyRoPE(
|
|
26
|
-
const
|
|
27
|
-
if (
|
|
28
|
-
const
|
|
29
|
-
this.ensureRopeCache(
|
|
30
|
-
const n =
|
|
31
|
-
|
|
27
|
+
applyRoPE(s, r, o) {
|
|
28
|
+
const i = s.shape[3], t = this.rotaryDim;
|
|
29
|
+
if (t > i) return [s, r];
|
|
30
|
+
const e = s.shape[2], v = o + e;
|
|
31
|
+
this.ensureRopeCache(v);
|
|
32
|
+
const n = t / 2, p = this.ropeCos.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), a = this.ropeSin.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), h = s.shape[0], c = s.shape[1], f = this.tf.range(0, t, 2, "int32"), l = this.tf.range(1, t, 2, "int32"), d = (u) => {
|
|
33
|
+
const m = u.slice([0, 0, 0, 0], [h, c, e, t]), C = t < i ? u.slice([0, 0, 0, t], [h, c, e, i - t]) : null, D = this.tf.gather(m, f, 3), g = this.tf.gather(m, l, 3), x = D.mul(p).sub(g.mul(a)), k = g.mul(p).add(D.mul(a)), R = this.tf.stack([x, k], -1).reshape([h, c, e, t]);
|
|
34
|
+
return C ? this.tf.concat([R, C], 3) : R;
|
|
35
|
+
}, y = d(s), S = d(r);
|
|
36
|
+
return f.dispose(), l.dispose(), [y, S];
|
|
32
37
|
}
|
|
33
38
|
dispose() {
|
|
34
39
|
this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose(), this.ropeInvFreq.dispose();
|
|
35
40
|
}
|
|
36
41
|
}
|
|
37
42
|
export {
|
|
38
|
-
|
|
43
|
+
b as default
|
|
39
44
|
};
|
package/dist/main.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
export { default as NanoGPT } from './NanoGPTModel';
|
|
2
2
|
export { default as TeachableLLM } from './TeachableLLM';
|
|
3
3
|
export { default as CharTokeniser } from './tokeniser/CharTokeniser';
|
|
4
|
+
export { default as waitForModel } from './utilities/waitForModel';
|
|
4
5
|
export type { ITrainerOptions } from './Trainer';
|
|
5
6
|
export type { IGenerateOptions } from './Generator';
|
|
6
7
|
export type { TrainingLogEntry } from './NanoGPTModel';
|
package/dist/main.js
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import { default as o } from "./NanoGPTModel.js";
|
|
2
|
-
import { default as
|
|
2
|
+
import { default as t } from "./TeachableLLM.js";
|
|
3
3
|
import { default as l } from "./tokeniser/CharTokeniser.js";
|
|
4
|
+
import { default as s } from "./utilities/waitForModel.js";
|
|
4
5
|
export {
|
|
5
6
|
l as CharTokeniser,
|
|
6
7
|
o as NanoGPT,
|
|
7
|
-
|
|
8
|
+
t as TeachableLLM,
|
|
9
|
+
s as waitForModel
|
|
8
10
|
};
|