@genai-fi/nanogpt 0.1.9 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.d.ts +7 -8
- package/dist/Generator.js +55 -52
- package/dist/NanoGPTModel.d.ts +5 -3
- package/dist/NanoGPTModel.js +92 -55
- package/dist/TeachableLLM.js +17 -17
- package/dist/config.d.ts +1 -0
- package/dist/config.js +5 -3
- package/dist/layers/CausalSelfAttention.d.ts +12 -2
- package/dist/layers/CausalSelfAttention.js +73 -40
- package/dist/layers/RMSNorm.d.ts +13 -0
- package/dist/layers/RMSNorm.js +32 -0
- package/dist/layers/RoPECache.d.ts +16 -0
- package/dist/layers/RoPECache.js +44 -0
- package/dist/layers/TransformerBlock.d.ts +5 -2
- package/dist/layers/TransformerBlock.js +14 -10
- package/dist/main.d.ts +1 -0
- package/dist/main.js +4 -2
- package/dist/utilities/generate.js +14 -14
- package/package.json +1 -1
package/dist/Generator.d.ts
CHANGED
|
@@ -1,18 +1,17 @@
|
|
|
1
|
-
import { default as NanoGPT } from './NanoGPTModel';
|
|
1
|
+
import { default as NanoGPT, GenerateOptions } from './NanoGPTModel';
|
|
2
2
|
import { ITokeniser } from './tokeniser/type';
|
|
3
3
|
import { default as EE } from 'eventemitter3';
|
|
4
|
-
export interface IGenerateOptions {
|
|
4
|
+
export interface IGenerateOptions extends GenerateOptions {
|
|
5
5
|
maxLength?: number;
|
|
6
|
-
|
|
7
|
-
topK?: number;
|
|
8
|
-
usePadding?: boolean;
|
|
9
|
-
includeAttention?: boolean;
|
|
10
|
-
includeProbabilities?: boolean;
|
|
6
|
+
noCache?: boolean;
|
|
11
7
|
}
|
|
12
8
|
export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
|
|
13
9
|
private readonly model;
|
|
14
10
|
private readonly tokeniser;
|
|
15
11
|
constructor(model: NanoGPT, tokeniser: ITokeniser);
|
|
16
|
-
private
|
|
12
|
+
private tokenisePrompt;
|
|
13
|
+
private generateNoCache;
|
|
14
|
+
private processResponse;
|
|
15
|
+
private generateCache;
|
|
17
16
|
generate(prompt?: string, options?: IGenerateOptions): Promise<string>;
|
|
18
17
|
}
|
package/dist/Generator.js
CHANGED
|
@@ -1,62 +1,65 @@
|
|
|
1
|
-
import { E as
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
super(), this.model = a, this.tokeniser = t;
|
|
1
|
+
import { E as u } from "./index-SOhdqzHq.js";
|
|
2
|
+
class p extends u {
|
|
3
|
+
constructor(s, e) {
|
|
4
|
+
super(), this.model = s, this.tokeniser = e;
|
|
6
5
|
}
|
|
7
|
-
|
|
8
|
-
const
|
|
9
|
-
|
|
10
|
-
|
|
6
|
+
async tokenisePrompt(s) {
|
|
7
|
+
const e = s ? await this.tokeniser.tokenise([s], !0) : [[this.tokeniser.eosToken]];
|
|
8
|
+
return this.model.tf.tensor2d(e, [1, e[0].length], "int32");
|
|
9
|
+
}
|
|
10
|
+
async generateNoCache(s, e) {
|
|
11
|
+
let t = await this.tokenisePrompt(s), n = s || "";
|
|
12
|
+
const a = e?.maxLength ?? 1e3;
|
|
13
|
+
for (let i = 0; i < a; i++) {
|
|
11
14
|
const {
|
|
12
|
-
output:
|
|
13
|
-
attention:
|
|
14
|
-
probabilities:
|
|
15
|
-
} = this.model.generate(
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
}), p = i;
|
|
22
|
-
if (i = this.model.tf.concat([i, u], 1), n && l) {
|
|
23
|
-
const o = n;
|
|
24
|
-
n = this.model.tf.concat([n, l], 0), o.dispose();
|
|
25
|
-
} else l && (n = l);
|
|
26
|
-
if (s && r) {
|
|
27
|
-
const o = s;
|
|
28
|
-
s = this.model.tf.concat([s, r], 0), o.dispose();
|
|
29
|
-
} else r && (s = r);
|
|
30
|
-
p.dispose(), u.dispose();
|
|
15
|
+
output: o,
|
|
16
|
+
attention: c,
|
|
17
|
+
probabilities: h
|
|
18
|
+
} = this.model.generate(t, void 0, e), l = t;
|
|
19
|
+
t = this.model.tf.concat([t, o], 1), l.dispose();
|
|
20
|
+
const r = await this.processResponse(o, c, h);
|
|
21
|
+
if (o.dispose(), r === null)
|
|
22
|
+
break;
|
|
23
|
+
n += r;
|
|
31
24
|
}
|
|
32
|
-
return
|
|
25
|
+
return t.dispose(), n;
|
|
33
26
|
}
|
|
34
|
-
async
|
|
35
|
-
const
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
27
|
+
async processResponse(s, e, t) {
|
|
28
|
+
const n = (await s.array())[0][0];
|
|
29
|
+
if (n === this.tokeniser.eosToken)
|
|
30
|
+
return null;
|
|
31
|
+
const a = await this.tokeniser.decode([n]);
|
|
32
|
+
let i;
|
|
33
|
+
e && (i = await e.array(), e.dispose());
|
|
34
|
+
let o;
|
|
35
|
+
return t && (o = await t.array(), t.dispose()), this.emit("tokens", [n], a, i, o), a;
|
|
36
|
+
}
|
|
37
|
+
async generateCache(s, e) {
|
|
38
|
+
let t = await this.tokenisePrompt(s), n = s || "";
|
|
39
|
+
const a = new Array(this.model.config.nLayer).fill(void 0), i = e?.maxLength ?? 1e3;
|
|
40
|
+
for (let o = 0; o < i; o++) {
|
|
41
|
+
const {
|
|
42
|
+
output: c,
|
|
43
|
+
attention: h,
|
|
44
|
+
probabilities: l
|
|
45
|
+
} = this.model.generate(t, a, {
|
|
46
|
+
...e,
|
|
47
|
+
usePadding: !1
|
|
48
|
+
});
|
|
49
|
+
t.dispose(), t = c;
|
|
50
|
+
const r = await this.processResponse(c, h, l);
|
|
51
|
+
if (r === null)
|
|
55
52
|
break;
|
|
53
|
+
n += r;
|
|
56
54
|
}
|
|
57
|
-
return
|
|
55
|
+
return t.dispose(), n;
|
|
56
|
+
}
|
|
57
|
+
async generate(s, e) {
|
|
58
|
+
this.emit("start");
|
|
59
|
+
const t = this.model.config.useRope && !e?.noCache ? this.generateCache(s, e) : this.generateNoCache(s, e);
|
|
60
|
+
return this.emit("stop"), t;
|
|
58
61
|
}
|
|
59
62
|
}
|
|
60
63
|
export {
|
|
61
|
-
|
|
64
|
+
p as default
|
|
62
65
|
};
|
package/dist/NanoGPTModel.d.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from './config';
|
|
3
|
+
import { KVCache } from './layers/CausalSelfAttention';
|
|
3
4
|
export interface TrainingLogEntry {
|
|
4
5
|
loss: number;
|
|
5
6
|
valLoss?: number;
|
|
@@ -18,10 +19,11 @@ export interface GenerateOptions {
|
|
|
18
19
|
export default class NanoGPT {
|
|
19
20
|
readonly config: GPTConfig;
|
|
20
21
|
private wte;
|
|
21
|
-
private wpe
|
|
22
|
+
private wpe?;
|
|
22
23
|
private drop;
|
|
23
24
|
private blocks;
|
|
24
25
|
private lnF;
|
|
26
|
+
private ropeCache?;
|
|
25
27
|
readonly tf: typeof TF;
|
|
26
28
|
log: TrainingLogEntry[];
|
|
27
29
|
constructor(tf: typeof TF, config?: Partial<GPTConfig>);
|
|
@@ -35,12 +37,12 @@ export default class NanoGPT {
|
|
|
35
37
|
private validateInput;
|
|
36
38
|
private calculateLoss;
|
|
37
39
|
private computeAttentionRollout;
|
|
38
|
-
forward(idx: TF.Tensor, targets?: TF.Tensor, training?: boolean, includeAttention?: boolean): {
|
|
40
|
+
forward(idx: TF.Tensor, targets?: TF.Tensor, training?: boolean, includeAttention?: boolean, cache?: (KVCache | undefined)[]): {
|
|
39
41
|
logits: TF.Tensor;
|
|
40
42
|
loss?: TF.Tensor;
|
|
41
43
|
attention?: TF.Tensor;
|
|
42
44
|
};
|
|
43
|
-
generate(idx: TF.Tensor, options?: GenerateOptions): {
|
|
45
|
+
generate(idx: TF.Tensor, cache?: (KVCache | undefined)[], options?: GenerateOptions): {
|
|
44
46
|
output: TF.Tensor;
|
|
45
47
|
attention?: TF.Tensor;
|
|
46
48
|
probabilities?: TF.Tensor;
|
package/dist/NanoGPTModel.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import { defaultConfig as z } from "./config.js";
|
|
2
|
-
import
|
|
2
|
+
import $ from "./layers/TransformerBlock.js";
|
|
3
3
|
import S from "./layers/TiedEmbedding.js";
|
|
4
|
-
import
|
|
5
|
-
|
|
4
|
+
import I from "./layers/RoPECache.js";
|
|
5
|
+
import _ from "./layers/RMSNorm.js";
|
|
6
|
+
class M {
|
|
6
7
|
config;
|
|
7
8
|
wte;
|
|
8
9
|
// Token embeddings
|
|
@@ -13,6 +14,7 @@ class $ {
|
|
|
13
14
|
blocks;
|
|
14
15
|
lnF;
|
|
15
16
|
// Final layer norm
|
|
17
|
+
ropeCache;
|
|
16
18
|
tf;
|
|
17
19
|
log = [];
|
|
18
20
|
// Training log
|
|
@@ -21,19 +23,19 @@ class $ {
|
|
|
21
23
|
vocabSize: this.config.vocabSize,
|
|
22
24
|
embedDim: this.config.nEmbed,
|
|
23
25
|
name: "token_embedding"
|
|
24
|
-
}), this.wpe = this.tf.layers.embedding({
|
|
26
|
+
}), this.config.useRope === !1 ? this.wpe = this.tf.layers.embedding({
|
|
25
27
|
inputDim: this.config.blockSize,
|
|
26
28
|
outputDim: this.config.nEmbed,
|
|
27
29
|
name: "positional_embedding",
|
|
28
30
|
embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
|
|
29
|
-
}), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
|
|
30
|
-
for (let
|
|
31
|
-
this.blocks.push(new
|
|
32
|
-
this.lnF = new _(t, [this.config.nEmbed], 1e-
|
|
31
|
+
}) : this.ropeCache = new I(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
|
|
32
|
+
for (let o = 0; o < this.config.nLayer; o++)
|
|
33
|
+
this.blocks.push(new $(this.tf, o, this.config, this.ropeCache));
|
|
34
|
+
this.lnF = new _(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
|
|
33
35
|
}
|
|
34
36
|
get variables() {
|
|
35
37
|
return [
|
|
36
|
-
|
|
38
|
+
//...this.wpe.trainableWeights.map((v) => v.read() as TF.Variable),
|
|
37
39
|
...this.blocks.flatMap((t) => t.variables),
|
|
38
40
|
...this.lnF.trainableWeights.map((t) => t),
|
|
39
41
|
...this.wte.variables
|
|
@@ -41,21 +43,28 @@ class $ {
|
|
|
41
43
|
}
|
|
42
44
|
saveWeights() {
|
|
43
45
|
const t = /* @__PURE__ */ new Map();
|
|
44
|
-
t.set("token_embedding", this.wte.getWeights()), t.set("positional_embedding", this.wpe.getWeights());
|
|
46
|
+
t.set("token_embedding", this.wte.getWeights()), this.wpe && t.set("positional_embedding", this.wpe.getWeights());
|
|
45
47
|
for (let e = 0; e < this.blocks.length; e++)
|
|
46
48
|
this.blocks[e].saveWeights(t);
|
|
47
|
-
return t.set("
|
|
49
|
+
return t.set("final_rms_norm", this.lnF.getWeights()), t;
|
|
48
50
|
}
|
|
49
51
|
loadWeights(t) {
|
|
50
|
-
this.wte.setWeights(t.get("token_embedding") || []), this.wpe.setWeights(t.get("positional_embedding") || []);
|
|
52
|
+
this.wte.setWeights(t.get("token_embedding") || []), this.wpe && this.wpe.setWeights(t.get("positional_embedding") || []);
|
|
51
53
|
for (let e = 0; e < this.blocks.length; e++)
|
|
52
54
|
this.blocks[e].loadWeights(t);
|
|
53
|
-
this.lnF.setWeights(t.get("
|
|
55
|
+
this.lnF.setWeights(t.get("final_rms_norm") || []);
|
|
54
56
|
}
|
|
55
|
-
inputPhase(t, e = !1) {
|
|
57
|
+
inputPhase(t, e, o = !1) {
|
|
56
58
|
return this.tf.tidy(() => {
|
|
57
|
-
const
|
|
58
|
-
|
|
59
|
+
const i = this.wte.embed(t);
|
|
60
|
+
if (this.config.useRope === !1) {
|
|
61
|
+
const [, s] = t.shape, r = this.config.blockSize, l = this.tf.range(0, s, 1, "int32"), n = this.tf.mod(
|
|
62
|
+
this.tf.add(l, this.tf.scalar(e, "int32")),
|
|
63
|
+
this.tf.scalar(r, "int32")
|
|
64
|
+
), h = this.wpe.apply(n), c = i.add(h);
|
|
65
|
+
return this.drop.apply(c, { training: o });
|
|
66
|
+
} else
|
|
67
|
+
return this.drop.apply(i, { training: o });
|
|
59
68
|
});
|
|
60
69
|
}
|
|
61
70
|
setSkipMask(t) {
|
|
@@ -73,7 +82,7 @@ class $ {
|
|
|
73
82
|
set trainable(t) {
|
|
74
83
|
for (const e of this.blocks)
|
|
75
84
|
e.trainable = t;
|
|
76
|
-
this.
|
|
85
|
+
this.lnF.trainable = t;
|
|
77
86
|
}
|
|
78
87
|
validateInput(t) {
|
|
79
88
|
if (t.shape.length !== 2)
|
|
@@ -86,8 +95,8 @@ class $ {
|
|
|
86
95
|
calculateLoss(t, e) {
|
|
87
96
|
try {
|
|
88
97
|
return this.tf.losses.softmaxCrossEntropy(e, t, this.tf.Reduction.MEAN);
|
|
89
|
-
} catch (
|
|
90
|
-
throw console.error("Error computing loss:",
|
|
98
|
+
} catch (o) {
|
|
99
|
+
throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
|
|
91
100
|
}
|
|
92
101
|
}
|
|
93
102
|
// Attention rollout per Abnar & Zuidema (2020)
|
|
@@ -96,60 +105,88 @@ class $ {
|
|
|
96
105
|
return this.tf.tidy(() => {
|
|
97
106
|
if (t.length === 0)
|
|
98
107
|
throw new Error("No attentions for rollout");
|
|
99
|
-
const e
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
108
|
+
const [e, o, i] = t[0].shape;
|
|
109
|
+
for (const s of t) {
|
|
110
|
+
const [r, l, n] = s.shape;
|
|
111
|
+
if (r !== e || l !== o || n !== i)
|
|
112
|
+
throw new Error(
|
|
113
|
+
`Inconsistent attention shapes in rollout: expected [${e},${o},${i}] got [${r},${l},${n}]`
|
|
114
|
+
);
|
|
115
|
+
}
|
|
116
|
+
if (o === i) {
|
|
117
|
+
const s = this.tf.eye(i, i).expandDims(0);
|
|
118
|
+
let r = s.tile([e, 1, 1]);
|
|
119
|
+
for (const l of t) {
|
|
120
|
+
const n = l.add(s);
|
|
121
|
+
r = n.div(n.sum(-1, !0)).matMul(r);
|
|
122
|
+
}
|
|
123
|
+
return r;
|
|
124
|
+
}
|
|
125
|
+
if (o === 1) {
|
|
126
|
+
let s = null;
|
|
127
|
+
const r = this.tf.tensor1d([i - 1], "int32"), l = this.tf.oneHot(r, i).reshape([1, 1, i]).tile([e, 1, 1]);
|
|
128
|
+
r.dispose();
|
|
129
|
+
for (const n of t) {
|
|
130
|
+
let h = n.add(l);
|
|
131
|
+
h = h.div(h.sum(-1, !0)), s == null ? s = h : (s = s.mul(h), s = s.div(s.sum(-1, !0)));
|
|
132
|
+
}
|
|
133
|
+
return s;
|
|
104
134
|
}
|
|
105
|
-
|
|
135
|
+
throw new Error(`Unsupported attention shapes for rollout: [B=${e}, Q=${o}, K=${i}]`);
|
|
106
136
|
});
|
|
107
137
|
}
|
|
108
|
-
forward(t, e,
|
|
138
|
+
forward(t, e, o = !1, i = !1, s) {
|
|
109
139
|
return this.validateInput(t), this.tf.tidy(() => {
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
140
|
+
const r = s?.[0]?.length ?? 0;
|
|
141
|
+
let l = this.inputPhase(t, r, o);
|
|
142
|
+
const n = [];
|
|
143
|
+
if (s && s.length !== this.blocks.length)
|
|
144
|
+
throw console.error("Cache", s), new Error(`Cache length ${s.length} does not match number of blocks ${this.blocks.length}`);
|
|
145
|
+
for (let a = 0; a < this.blocks.length; a++) {
|
|
146
|
+
const d = this.blocks[a], {
|
|
147
|
+
output: g,
|
|
148
|
+
attention: m,
|
|
149
|
+
cache: p
|
|
150
|
+
} = d.call(l, o, i, s ? s[a] : void 0);
|
|
151
|
+
l = g, i && m && n.push(m), s && p ? (s[a]?.k.dispose(), s[a]?.v.dispose(), s[a] = p) : p && (p.k.dispose(), p.v.dispose());
|
|
115
152
|
}
|
|
116
|
-
let
|
|
117
|
-
i &&
|
|
118
|
-
const
|
|
119
|
-
let
|
|
120
|
-
return e && (
|
|
153
|
+
let h;
|
|
154
|
+
i && n.length > 0 && (h = this.computeAttentionRollout(n)), l = this.lnF.apply(l);
|
|
155
|
+
const c = this.wte.project(l);
|
|
156
|
+
let f;
|
|
157
|
+
return e && (f = this.calculateLoss(c, e)), { logits: c, loss: f, attention: i ? h : void 0 };
|
|
121
158
|
});
|
|
122
159
|
}
|
|
123
|
-
generate(t, e) {
|
|
124
|
-
const
|
|
160
|
+
generate(t, e, o) {
|
|
161
|
+
const i = o?.temperature ?? 1, s = o?.topK, r = o?.usePadding ?? !1, l = o?.includeAttention ?? !1;
|
|
125
162
|
return this.tf.tidy(() => {
|
|
126
|
-
const
|
|
127
|
-
[0,
|
|
128
|
-
[
|
|
129
|
-
),
|
|
163
|
+
const n = t, h = n.shape[1], c = h <= this.config.blockSize ? n : n.slice(
|
|
164
|
+
[0, h - this.config.blockSize],
|
|
165
|
+
[n.shape[0], this.config.blockSize]
|
|
166
|
+
), f = r ? this.config.blockSize - c.shape[1] : 0, a = f > 0 ? this.tf.pad(c, [
|
|
130
167
|
[0, 0],
|
|
131
|
-
[0,
|
|
132
|
-
]) :
|
|
133
|
-
let
|
|
134
|
-
if (
|
|
135
|
-
const { values:
|
|
136
|
-
|
|
168
|
+
[0, f]
|
|
169
|
+
]) : c, { logits: d, attention: g } = this.forward(a, void 0, !1, l, e), m = d.shape[1] - 1 - f, p = d.slice([0, m, 0], [d.shape[0], 1, d.shape[2]]), w = g ? g.slice([0, m, 0], [g.shape[0], 1, g.shape[2]]) : void 0, u = p.div(i);
|
|
170
|
+
let b;
|
|
171
|
+
if (s) {
|
|
172
|
+
const { values: E, indices: v } = this.tf.topk(u, s), y = this.tf.multinomial(E.squeeze([1]), 1);
|
|
173
|
+
b = this.tf.gather(v.squeeze([1]), y, 1);
|
|
137
174
|
} else
|
|
138
|
-
|
|
139
|
-
let
|
|
140
|
-
return
|
|
175
|
+
b = this.tf.multinomial(u.squeeze([1]), 1);
|
|
176
|
+
let k;
|
|
177
|
+
return o?.includeProbabilities && (k = this.tf.softmax(u.squeeze([1]))), b = b.reshape([1, 1]), { output: b, attention: w?.squeeze([1]), probabilities: k };
|
|
141
178
|
});
|
|
142
179
|
}
|
|
143
180
|
getNumParams() {
|
|
144
181
|
const t = this.config.vocabSize * this.config.nEmbed + this.config.blockSize * this.config.nEmbed, e = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // qkv + proj
|
|
145
|
-
2 * this.config.nEmbed),
|
|
182
|
+
2 * this.config.nEmbed), o = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
|
|
146
183
|
this.config.nEmbed * 4 * this.config.nEmbed), i = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
|
|
147
|
-
return t + e +
|
|
184
|
+
return t + e + o + i;
|
|
148
185
|
}
|
|
149
186
|
dispose() {
|
|
150
|
-
this.wte.dispose(), this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
|
|
187
|
+
this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
|
|
151
188
|
}
|
|
152
189
|
}
|
|
153
190
|
export {
|
|
154
|
-
|
|
191
|
+
M as default
|
|
155
192
|
};
|
package/dist/TeachableLLM.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import d from "./
|
|
2
|
-
import
|
|
1
|
+
import { defaultConfig as d } from "./config.js";
|
|
2
|
+
import u from "./NanoGPTModel.js";
|
|
3
3
|
import { saveModel as m } from "./utilities/save.js";
|
|
4
4
|
import { loadModel as l } from "./utilities/load.js";
|
|
5
5
|
import f from "./Generator.js";
|
|
@@ -47,25 +47,25 @@ class a extends c {
|
|
|
47
47
|
}
|
|
48
48
|
static loadModel(t, r) {
|
|
49
49
|
const e = new a(t);
|
|
50
|
-
return l(t, r).then(({ model:
|
|
51
|
-
e._model =
|
|
50
|
+
return l(t, r).then(({ model: s, tokeniser: o }) => {
|
|
51
|
+
e._model = s, e._tokeniser = o, e._config = s.config, e.setStatus("warmup"), h(s).then(() => {
|
|
52
52
|
e.setStatus("ready");
|
|
53
|
-
}).catch((
|
|
54
|
-
e.setStatus("error"), e.emit("error",
|
|
53
|
+
}).catch((i) => {
|
|
54
|
+
e.setStatus("error"), e.emit("error", i);
|
|
55
55
|
});
|
|
56
|
-
}).catch((
|
|
57
|
-
e.setStatus("error"), e.emit("error",
|
|
56
|
+
}).catch((s) => {
|
|
57
|
+
e.setStatus("error"), e.emit("error", s);
|
|
58
58
|
}), e;
|
|
59
59
|
}
|
|
60
60
|
static create(t, r = {}) {
|
|
61
|
-
const e = { ...
|
|
62
|
-
return
|
|
63
|
-
|
|
64
|
-
n === "trained" &&
|
|
65
|
-
});
|
|
61
|
+
const e = { ...d, ...r }, s = new g(e.vocabSize), o = new u(t, e), i = new a(t, s, o);
|
|
62
|
+
return i.setStatus("warmup"), h(o).then(() => {
|
|
63
|
+
i.tokeniser.trained ? i.setStatus("ready") : (i.setStatus("awaitingTokens"), i.tokeniser.once("trainStatus", (n) => {
|
|
64
|
+
n === "trained" && i.setStatus("ready");
|
|
65
|
+
}));
|
|
66
66
|
}).catch((n) => {
|
|
67
|
-
|
|
68
|
-
}),
|
|
67
|
+
i.setStatus("error"), i.emit("error", n);
|
|
68
|
+
}), i;
|
|
69
69
|
}
|
|
70
70
|
getNumParams() {
|
|
71
71
|
if (!this._model)
|
|
@@ -78,8 +78,8 @@ class a extends c {
|
|
|
78
78
|
const t = new _(this._model, this._tokeniser);
|
|
79
79
|
return t.on("start", () => this.setStatus("training")), t.on("stop", () => this.setStatus("ready")), t.on("log", async (r) => {
|
|
80
80
|
const e = this.listeners("trainStep");
|
|
81
|
-
for (const
|
|
82
|
-
await
|
|
81
|
+
for (const s of e)
|
|
82
|
+
await s(r);
|
|
83
83
|
}), t;
|
|
84
84
|
}
|
|
85
85
|
train(t, r) {
|
package/dist/config.d.ts
CHANGED
package/dist/config.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
const
|
|
1
|
+
const e = {
|
|
2
2
|
vocabSize: 50304,
|
|
3
3
|
// GPT-2 vocab size
|
|
4
4
|
blockSize: 1024,
|
|
@@ -13,8 +13,10 @@ const a = {
|
|
|
13
13
|
// Dropout probability
|
|
14
14
|
biasInLinear: !1,
|
|
15
15
|
biasInLayerNorm: !1,
|
|
16
|
-
mlpFactor: 4
|
|
16
|
+
mlpFactor: 4,
|
|
17
|
+
useRope: !1
|
|
18
|
+
// Use Rotary Position Embeddings
|
|
17
19
|
};
|
|
18
20
|
export {
|
|
19
|
-
|
|
21
|
+
e as defaultConfig
|
|
20
22
|
};
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
|
+
import { default as RoPECache } from './RoPECache';
|
|
4
|
+
export type KVCache = {
|
|
5
|
+
k: TF.Tensor;
|
|
6
|
+
v: TF.Tensor;
|
|
7
|
+
length: number;
|
|
8
|
+
cumulativeLength: number;
|
|
9
|
+
};
|
|
3
10
|
export default class CausalSelfAttention {
|
|
11
|
+
private readonly ropeCache?;
|
|
4
12
|
private config;
|
|
5
13
|
private cAttn;
|
|
6
14
|
private cProj;
|
|
@@ -12,18 +20,20 @@ export default class CausalSelfAttention {
|
|
|
12
20
|
private divisor;
|
|
13
21
|
private index;
|
|
14
22
|
private _trainable;
|
|
15
|
-
constructor(tf: typeof TF, index: number, config: GPTConfig);
|
|
23
|
+
constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache | undefined);
|
|
16
24
|
get variables(): TF.Variable[];
|
|
17
25
|
get trainable(): boolean;
|
|
18
26
|
set trainable(value: boolean);
|
|
19
27
|
saveWeights(map: Map<string, TF.Tensor[]>): void;
|
|
20
28
|
loadWeights(weights: Map<string, TF.Tensor[]>): void;
|
|
21
29
|
private getAttentionScores;
|
|
30
|
+
private getAttentionScoresWithPast;
|
|
22
31
|
private getQKV;
|
|
23
32
|
private getOutputProjection;
|
|
24
|
-
call(x: TF.Tensor, training?: boolean, includeAttention?: boolean): {
|
|
33
|
+
call(x: TF.Tensor, training?: boolean, includeAttention?: boolean, pastKV?: KVCache): {
|
|
25
34
|
output: TF.Tensor;
|
|
26
35
|
attention?: TF.Tensor;
|
|
36
|
+
presentKV?: KVCache;
|
|
27
37
|
};
|
|
28
38
|
dispose(): void;
|
|
29
39
|
}
|
|
@@ -1,20 +1,9 @@
|
|
|
1
|
-
class
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
cProj;
|
|
5
|
-
attnDropout;
|
|
6
|
-
residDropout;
|
|
7
|
-
bias;
|
|
8
|
-
maskInf;
|
|
9
|
-
tf;
|
|
10
|
-
divisor;
|
|
11
|
-
index;
|
|
12
|
-
_trainable = !0;
|
|
13
|
-
constructor(t, e, s) {
|
|
14
|
-
this.config = s, this.tf = t, this.index = e, this.cAttn = this.tf.layers.dense({
|
|
1
|
+
class S {
|
|
2
|
+
constructor(t, i, s, e) {
|
|
3
|
+
this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
|
|
15
4
|
units: 3 * s.nEmbed,
|
|
16
5
|
useBias: s.biasInLinear,
|
|
17
|
-
name: `block_${
|
|
6
|
+
name: `block_${i}_attn_cAttn`,
|
|
18
7
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
19
8
|
mean: 0,
|
|
20
9
|
stddev: 0.02
|
|
@@ -23,14 +12,27 @@ class m {
|
|
|
23
12
|
}), this.cProj = this.tf.layers.dense({
|
|
24
13
|
units: s.nEmbed,
|
|
25
14
|
useBias: s.biasInLinear,
|
|
26
|
-
name: `block_${
|
|
15
|
+
name: `block_${i}_attn_cProj`,
|
|
27
16
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
28
17
|
mean: 0,
|
|
29
18
|
stddev: 0.02 / Math.sqrt(2 * s.nLayer)
|
|
30
19
|
}),
|
|
31
20
|
biasInitializer: "zeros"
|
|
32
|
-
}), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = this.tf.scalar(1 / Math.sqrt(s.nEmbed / s.nHead))
|
|
21
|
+
}), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = this.tf.scalar(1 / Math.sqrt(s.nEmbed / s.nHead));
|
|
22
|
+
const a = this.tf.zeros([s.blockSize, s.blockSize]), h = this.tf.fill([s.blockSize, s.blockSize], Number.NEGATIVE_INFINITY);
|
|
23
|
+
this.maskInf = this.tf.where(this.bias, a, h);
|
|
33
24
|
}
|
|
25
|
+
config;
|
|
26
|
+
cAttn;
|
|
27
|
+
cProj;
|
|
28
|
+
attnDropout;
|
|
29
|
+
residDropout;
|
|
30
|
+
bias;
|
|
31
|
+
maskInf;
|
|
32
|
+
tf;
|
|
33
|
+
divisor;
|
|
34
|
+
index;
|
|
35
|
+
_trainable = !0;
|
|
34
36
|
get variables() {
|
|
35
37
|
return [
|
|
36
38
|
...this.cAttn.trainableWeights.map((t) => t.read()),
|
|
@@ -49,34 +51,65 @@ class m {
|
|
|
49
51
|
loadWeights(t) {
|
|
50
52
|
this.cAttn.setWeights(t.get(`block_${this.index}_cAttn`) || []), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
|
|
51
53
|
}
|
|
52
|
-
getAttentionScores(t,
|
|
53
|
-
const
|
|
54
|
-
return this.attnDropout.apply(
|
|
54
|
+
getAttentionScores(t, i, s) {
|
|
55
|
+
const e = t.shape[2], h = this.tf.matMul(t, i, !1, !0).mul(this.divisor), n = this.maskInf.slice([0, 0], [e, e]).expandDims(0).expandDims(0), r = h.add(n), o = this.tf.softmax(r, -1);
|
|
56
|
+
return this.attnDropout.apply(o, { training: s });
|
|
57
|
+
}
|
|
58
|
+
// Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
|
|
59
|
+
getAttentionScoresWithPast(t, i, s, e) {
|
|
60
|
+
const a = t.shape[2];
|
|
61
|
+
let n = this.tf.matMul(t, i, !1, !0).mul(this.divisor);
|
|
62
|
+
if (a > 1 && e > 0)
|
|
63
|
+
throw new Error("Cannot use past with T_cur > 1");
|
|
64
|
+
if (a > 1) {
|
|
65
|
+
const o = this.maskInf.slice([0, 0], [a, a]).expandDims(0).expandDims(0);
|
|
66
|
+
n = n.add(o);
|
|
67
|
+
}
|
|
68
|
+
const r = this.tf.softmax(n, -1);
|
|
69
|
+
return this.attnDropout.apply(r, { training: s });
|
|
55
70
|
}
|
|
56
71
|
getQKV(t) {
|
|
57
|
-
const [
|
|
58
|
-
|
|
59
|
-
const
|
|
60
|
-
|
|
61
|
-
const
|
|
62
|
-
|
|
63
|
-
const d = this.tf.reshape(
|
|
64
|
-
i.dispose();
|
|
65
|
-
const u = d.transpose([0, 2, 1, 3]);
|
|
66
|
-
d.dispose();
|
|
67
|
-
const p = this.tf.reshape(n, [e, s, this.config.nHead, h]);
|
|
72
|
+
const [i, s, e] = t.shape, a = this.cAttn.apply(t), [h, n, r] = this.tf.split(a, 3, -1);
|
|
73
|
+
a.dispose();
|
|
74
|
+
const o = e / this.config.nHead, u = this.tf.reshape(h, [i, s, this.config.nHead, o]);
|
|
75
|
+
h.dispose();
|
|
76
|
+
const f = u.transpose([0, 2, 1, 3]);
|
|
77
|
+
u.dispose();
|
|
78
|
+
const d = this.tf.reshape(n, [i, s, this.config.nHead, o]);
|
|
68
79
|
n.dispose();
|
|
69
|
-
const
|
|
70
|
-
|
|
80
|
+
const c = d.transpose([0, 2, 1, 3]);
|
|
81
|
+
d.dispose();
|
|
82
|
+
const l = this.tf.reshape(r, [i, s, this.config.nHead, o]);
|
|
83
|
+
r.dispose();
|
|
84
|
+
const p = l.transpose([0, 2, 1, 3]);
|
|
85
|
+
return l.dispose(), [f, c, p];
|
|
71
86
|
}
|
|
72
|
-
getOutputProjection(t,
|
|
73
|
-
const s = t.shape[0],
|
|
74
|
-
return this.residDropout.apply(
|
|
87
|
+
getOutputProjection(t, i) {
|
|
88
|
+
const s = t.shape[0], e = t.shape[2], a = this.config.nEmbed, h = t.transpose([0, 2, 1, 3]), n = this.tf.reshape(h, [s, e, a]), r = this.cProj.apply(n);
|
|
89
|
+
return this.residDropout.apply(r, { training: i });
|
|
75
90
|
}
|
|
76
|
-
|
|
91
|
+
// Added optional KV cache support (pastKV). Returns presentKV for chaining.
|
|
92
|
+
call(t, i = !1, s = !1, e) {
|
|
93
|
+
if (e && !this.config.useRope)
|
|
94
|
+
throw new Error("Cannot use pastKV without RoPE enabled");
|
|
77
95
|
return this.tf.tidy(() => {
|
|
78
|
-
const [a,
|
|
79
|
-
|
|
96
|
+
const [a, h, n] = this.getQKV(t), r = a.shape[2], o = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(a, h, u) : [a, h];
|
|
97
|
+
let c = d, l = n, p = 0;
|
|
98
|
+
e && (p = e.length, c = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, n], 2));
|
|
99
|
+
const b = c.shape[2];
|
|
100
|
+
if (b > o) {
|
|
101
|
+
const k = b - o, g = c.shape[0], v = c.shape[1], I = c.shape[3];
|
|
102
|
+
c = c.slice([0, 0, k, 0], [g, v, o, I]), l = l.slice([0, 0, k, 0], [g, v, o, I]), p = o - r;
|
|
103
|
+
}
|
|
104
|
+
let m;
|
|
105
|
+
p > 0 ? m = this.getAttentionScoresWithPast(f, c, i, p) : m = this.getAttentionScores(f, c, i);
|
|
106
|
+
const _ = this.tf.matMul(m, l), A = this.getOutputProjection(_, i), P = {
|
|
107
|
+
k: this.tf.keep(c),
|
|
108
|
+
v: this.tf.keep(l),
|
|
109
|
+
length: p + r,
|
|
110
|
+
cumulativeLength: e ? e.cumulativeLength + r : r
|
|
111
|
+
};
|
|
112
|
+
return { output: A, attention: s ? m.mean(1) : void 0, presentKV: P };
|
|
80
113
|
});
|
|
81
114
|
}
|
|
82
115
|
dispose() {
|
|
@@ -84,5 +117,5 @@ class m {
|
|
|
84
117
|
}
|
|
85
118
|
}
|
|
86
119
|
export {
|
|
87
|
-
|
|
120
|
+
S as default
|
|
88
121
|
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { default as TF } from '@tensorflow/tfjs';
|
|
2
|
+
export default class RMSNorm {
|
|
3
|
+
private gamma;
|
|
4
|
+
private epsilon;
|
|
5
|
+
private tf;
|
|
6
|
+
constructor(tf: typeof TF, shape: number[], epsilon?: number, name?: string);
|
|
7
|
+
get trainableWeights(): TF.Variable[];
|
|
8
|
+
set trainable(value: boolean);
|
|
9
|
+
getWeights(): TF.Tensor[];
|
|
10
|
+
setWeights(weights: TF.Tensor[]): void;
|
|
11
|
+
apply(x: TF.Tensor): TF.Tensor;
|
|
12
|
+
dispose(): void;
|
|
13
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
class m {
|
|
2
|
+
gamma;
|
|
3
|
+
epsilon;
|
|
4
|
+
tf;
|
|
5
|
+
constructor(a, s, t = 1e-8, e = "") {
|
|
6
|
+
this.tf = a, this.epsilon = t, this.gamma = a.variable(a.ones(s), !0, `${e}_gamma`, "float32");
|
|
7
|
+
}
|
|
8
|
+
get trainableWeights() {
|
|
9
|
+
return [this.gamma];
|
|
10
|
+
}
|
|
11
|
+
set trainable(a) {
|
|
12
|
+
this.gamma.trainable = a;
|
|
13
|
+
}
|
|
14
|
+
getWeights() {
|
|
15
|
+
return [this.gamma];
|
|
16
|
+
}
|
|
17
|
+
setWeights(a) {
|
|
18
|
+
this.gamma.assign(a[0]);
|
|
19
|
+
}
|
|
20
|
+
apply(a) {
|
|
21
|
+
return this.tf.tidy(() => {
|
|
22
|
+
const t = a.square().mean(-1, !0).add(this.epsilon).rsqrt();
|
|
23
|
+
return a.mul(t).mul(this.gamma);
|
|
24
|
+
});
|
|
25
|
+
}
|
|
26
|
+
dispose() {
|
|
27
|
+
this.gamma.dispose();
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
export {
|
|
31
|
+
m as default
|
|
32
|
+
};
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { default as TF } from '@tensorflow/tfjs';
|
|
2
|
+
import { GPTConfig } from '../config';
|
|
3
|
+
export default class RoPECache {
|
|
4
|
+
private readonly tf;
|
|
5
|
+
private readonly config;
|
|
6
|
+
private rotaryDim;
|
|
7
|
+
private ropeBase;
|
|
8
|
+
private ropeInvFreq;
|
|
9
|
+
private ropeCos;
|
|
10
|
+
private ropeSin;
|
|
11
|
+
private ropeCacheLen;
|
|
12
|
+
constructor(tf: typeof TF, config: GPTConfig);
|
|
13
|
+
private ensureRopeCache;
|
|
14
|
+
applyRoPE(q: TF.Tensor, k: TF.Tensor, pastLen: number): [TF.Tensor, TF.Tensor];
|
|
15
|
+
dispose(): void;
|
|
16
|
+
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
 * Rotary positional embedding (RoPE) cache.
 *
 * Precomputes and lazily grows cos/sin lookup tables so the per-step
 * `applyRoPE` call only slices from kept tensors instead of recomputing
 * trig tables. Tables live outside any `tf.tidy` scope via `tf.keep` and
 * must be released with `dispose()`.
 */
class RoPECache {
    rotaryDim;
    ropeBase;
    ropeInvFreq; // [rotaryDim/2] inverse frequencies, kept for the cache lifetime
    ropeCos = null; // [cacheLen, rotaryDim/2, 1]
    ropeSin = null; // [cacheLen, rotaryDim/2, 1]
    ropeCacheLen = 0;
    /**
     * @param s tfjs namespace
     * @param r GPT config (reads nEmbed, nHead, blockSize, useRope)
     * @throws Error when the per-head dimension (nEmbed / nHead) is odd,
     *         since RoPE rotates pairs of channels.
     */
    constructor(s, r) {
        this.tf = s;
        this.config = r;
        this.rotaryDim = this.config.nEmbed / this.config.nHead;
        if (this.rotaryDim % 2 !== 0)
            throw new Error("rotaryDim must be even");
        this.ropeBase = 1e4;
        // invFreq[j] = 1 / base^(2j / rotaryDim). Computed inside a tidy so the
        // range/scalar/pow intermediates are reclaimed (the original leaked the
        // two scalars); tidy keeps its return value alive.
        this.ropeInvFreq = this.tf.tidy(() => {
            const evenIdx = this.tf.range(0, this.rotaryDim, 2, "float32");
            const frac = evenIdx.div(this.tf.scalar(this.rotaryDim, "float32"));
            return this.tf.reciprocal(this.tf.pow(this.tf.scalar(this.ropeBase, "float32"), frac));
        });
        if (this.config.useRope === false) {
            this.ropeCos = null;
            this.ropeSin = null;
            this.ropeCacheLen = 0;
        } else {
            // Pre-warm well past blockSize so generation rarely regrows the cache.
            this.ensureRopeCache(this.config.blockSize * 4);
        }
    }
    /**
     * Grows the cos/sin tables to cover at least `s` positions. No-op when the
     * current cache already covers the request.
     */
    ensureRopeCache(s) {
        if (s <= this.ropeCacheLen) return;
        if (this.ropeCos) this.ropeCos.dispose();
        if (this.ropeSin) this.ropeSin.dispose();
        // Tidy reclaims the angle matrix and its intermediates (the original
        // leaked them when called outside a surrounding tidy); keep() protects
        // the tables themselves.
        this.tf.tidy(() => {
            const angles = this.tf.range(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
            this.ropeCos = this.tf.keep(this.tf.cos(angles).expandDims(-1));
            this.ropeSin = this.tf.keep(this.tf.sin(angles).expandDims(-1));
        });
        this.ropeCacheLen = s;
    }
    /**
     * Applies the rotary embedding to query and key tensors.
     *
     * @param s query — assumed [batch, nHead, seq, headSize]; TODO confirm against caller
     * @param r key — same assumed layout as the query
     * @param o number of already-cached past positions (offset into the tables)
     * @returns [rotatedQ, rotatedK]; intermediates are NOT tidied here, so the
     *          caller is expected to invoke this inside a tf.tidy.
     */
    applyRoPE(s, r, o) {
        const headSize = s.shape[3];
        const rot = this.rotaryDim;
        if (rot > headSize) return [s, r]; // nothing rotatable; pass through
        const seqLen = s.shape[2];
        const needed = o + seqLen;
        this.ensureRopeCache(needed);
        const half = rot / 2;
        // Slice the table rows for positions [o, o + seqLen) and shape them for broadcasting.
        const cos = this.ropeCos.slice([o, 0, 0], [seqLen, half, 1]).reshape([1, 1, seqLen, half]);
        const sin = this.ropeSin.slice([o, 0, 0], [seqLen, half, 1]).reshape([1, 1, seqLen, half]);
        const batch = s.shape[0];
        const heads = s.shape[1];
        const evenIdx = this.tf.range(0, rot, 2, "int32");
        const oddIdx = this.tf.range(1, rot, 2, "int32");
        // Rotate interleaved (even, odd) channel pairs; any channels beyond
        // rotaryDim pass through unchanged.
        const rotate = (x) => {
            const rotPart = x.slice([0, 0, 0, 0], [batch, heads, seqLen, rot]);
            const passPart = rot < headSize ? x.slice([0, 0, 0, rot], [batch, heads, seqLen, headSize - rot]) : null;
            const xe = this.tf.gather(rotPart, evenIdx, 3);
            const xo = this.tf.gather(rotPart, oddIdx, 3);
            const re = xe.mul(cos).sub(xo.mul(sin));
            const ro = xo.mul(cos).add(xe.mul(sin));
            // Re-interleave the rotated pairs back into channel order.
            const merged = this.tf.stack([re, ro], -1).reshape([batch, heads, seqLen, rot]);
            return passPart ? this.tf.concat([merged, passPart], 3) : merged;
        };
        const qRot = rotate(s);
        const kRot = rotate(r);
        evenIdx.dispose();
        oddIdx.dispose();
        return [qRot, kRot];
    }
    /** Releases all kept tensors; the cache is unusable afterwards. */
    dispose() {
        if (this.ropeCos) this.ropeCos.dispose();
        if (this.ropeSin) this.ropeSin.dispose();
        this.ropeInvFreq.dispose();
    }
}
export {
    RoPECache as default
};
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
|
+
import { KVCache } from './CausalSelfAttention';
|
|
4
|
+
import { default as RoPECache } from './RoPECache';
|
|
3
5
|
export default class Block {
|
|
4
6
|
private ln1;
|
|
5
7
|
private attn;
|
|
@@ -9,16 +11,17 @@ export default class Block {
|
|
|
9
11
|
private index;
|
|
10
12
|
private _trainable;
|
|
11
13
|
skipped: boolean;
|
|
12
|
-
constructor(tf: typeof TF, index: number, config: GPTConfig);
|
|
14
|
+
constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
|
|
13
15
|
get variables(): TF.Variable[];
|
|
14
16
|
get trainable(): boolean;
|
|
15
17
|
set trainable(value: boolean);
|
|
16
18
|
saveWeights(map: Map<string, TF.Tensor[]>): void;
|
|
17
19
|
loadWeights(weights: Map<string, TF.Tensor[]>): void;
|
|
18
20
|
private getMLPOutput;
|
|
19
|
-
call(x: TF.Tensor, training?: boolean, includeAttention?: boolean): {
|
|
21
|
+
call(x: TF.Tensor, training?: boolean, includeAttention?: boolean, cache?: KVCache): {
|
|
20
22
|
output: TF.Tensor;
|
|
21
23
|
attention?: TF.Tensor;
|
|
24
|
+
cache?: KVCache;
|
|
22
25
|
};
|
|
23
26
|
dispose(): void;
|
|
24
27
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
import
|
|
1
|
+
import r from "./CausalSelfAttention.js";
|
|
2
|
+
import o from "./MLP.js";
|
|
3
|
+
import a from "./RMSNorm.js";
|
|
4
4
|
class u {
|
|
5
5
|
ln1;
|
|
6
6
|
attn;
|
|
@@ -10,8 +10,8 @@ class u {
|
|
|
10
10
|
index;
|
|
11
11
|
_trainable = !0;
|
|
12
12
|
skipped = !1;
|
|
13
|
-
constructor(t, i, s) {
|
|
14
|
-
this.tf = t, this.index = i, this.ln1 = new
|
|
13
|
+
constructor(t, i, s, e) {
|
|
14
|
+
this.tf = t, this.index = i, this.ln1 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new r(this.tf, this.index, s, e), this.ln2 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
|
|
15
15
|
}
|
|
16
16
|
get variables() {
|
|
17
17
|
return [
|
|
@@ -28,21 +28,25 @@ class u {
|
|
|
28
28
|
this._trainable = t, this.ln1.trainable = t, this.ln2.trainable = t, this.attn.trainable = t, this.mlp.trainable = t;
|
|
29
29
|
}
|
|
30
30
|
saveWeights(t) {
|
|
31
|
-
this.attn.saveWeights(t), this.mlp.saveWeights(t), t.set(`block_${this.index}
|
|
31
|
+
this.attn.saveWeights(t), this.mlp.saveWeights(t), t.set(`block_${this.index}_rms1`, this.ln1.getWeights()), t.set(`block_${this.index}_rms2`, this.ln2.getWeights());
|
|
32
32
|
}
|
|
33
33
|
loadWeights(t) {
|
|
34
|
-
this.attn.loadWeights(t), this.mlp.loadWeights(t), this.ln1.setWeights(t.get(`block_${this.index}
|
|
34
|
+
this.attn.loadWeights(t), this.mlp.loadWeights(t), this.ln1.setWeights(t.get(`block_${this.index}_rms1`) || []), this.ln2.setWeights(t.get(`block_${this.index}_rms2`) || []);
|
|
35
35
|
}
|
|
36
36
|
getMLPOutput(t, i) {
|
|
37
37
|
const s = this.ln2.apply(t), e = this.mlp.call(s, i);
|
|
38
38
|
return t.add(e);
|
|
39
39
|
}
|
|
40
|
-
call(t, i = !1, s = !1) {
|
|
40
|
+
call(t, i = !1, s = !1, e) {
|
|
41
41
|
return this.tf.tidy(() => {
|
|
42
42
|
if (this.skipped)
|
|
43
43
|
return { output: t };
|
|
44
|
-
const
|
|
45
|
-
return {
|
|
44
|
+
const l = this.ln1.apply(t), n = this.attn.call(l, i, s, e), h = t.add(n.output);
|
|
45
|
+
return {
|
|
46
|
+
output: this.getMLPOutput(h, i),
|
|
47
|
+
attention: n.attention,
|
|
48
|
+
cache: n.presentKV
|
|
49
|
+
};
|
|
46
50
|
});
|
|
47
51
|
}
|
|
48
52
|
dispose() {
|
package/dist/main.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
export { default as NanoGPT } from './NanoGPTModel';
|
|
2
2
|
export { default as TeachableLLM } from './TeachableLLM';
|
|
3
3
|
export { default as CharTokeniser } from './tokeniser/CharTokeniser';
|
|
4
|
+
export { default as waitForModel } from './utilities/waitForModel';
|
|
4
5
|
export type { ITrainerOptions } from './Trainer';
|
|
5
6
|
export type { IGenerateOptions } from './Generator';
|
|
6
7
|
export type { TrainingLogEntry } from './NanoGPTModel';
|
package/dist/main.js
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import { default as o } from "./NanoGPTModel.js";
|
|
2
|
-
import { default as
|
|
2
|
+
import { default as t } from "./TeachableLLM.js";
|
|
3
3
|
import { default as l } from "./tokeniser/CharTokeniser.js";
|
|
4
|
+
import { default as s } from "./utilities/waitForModel.js";
|
|
4
5
|
export {
|
|
5
6
|
l as CharTokeniser,
|
|
6
7
|
o as NanoGPT,
|
|
7
|
-
|
|
8
|
+
t as TeachableLLM,
|
|
9
|
+
s as waitForModel
|
|
8
10
|
};
|
|
@@ -1,20 +1,20 @@
|
|
|
1
|
-
async function
|
|
2
|
-
if (
|
|
1
|
+
/**
 * Autoregressively generates text from a prompt.
 *
 * @param r tokeniser (tokenise / decode / eosToken)
 * @param t model (tf namespace, config, generate)
 * @param a prompt string
 * @param c number of tokens to generate (must be > 0)
 * @param g per-step generate options forwarded to the model
 * @returns decoded text, truncated at the first EOS token if one appears
 * @throws Error on non-positive length or empty prompt
 */
async function generateText(r, t, a, c, g) {
    if (c <= 0)
        throw new Error("Length must be a positive integer");
    if (a.length === 0)
        throw new Error("Prompt cannot be an empty string");
    const tokens = await r.tokenise([a], true);
    // With RoPE the model supports KV caching: one cache slot per layer, and
    // only the newly sampled token is fed back each step.
    const kvCache = t.config.useRope ? new Array(t.config.nLayer).fill(void 0) : void 0;
    const resultTensor = t.tf.tidy(() => {
        let stepInput = t.tf.tensor2d(tokens, [1, tokens[0].length], "int32");
        let full = stepInput; // full sequence accumulated so far (aliases stepInput initially)
        for (let step = 0; step < c; step++) {
            const { output: next } = t.generate(stepInput, kvCache, g);
            const prevInput = stepInput;
            const prevFull = full;
            full = t.tf.concat([full, next], 1);
            stepInput = kvCache ? next : t.tf.concat([stepInput, next], 1);
            // Eagerly free the superseded tensors so long generations do not
            // accumulate inside this tidy. On the first iteration prevInput and
            // prevFull alias the same tensor — guard against double-dispose
            // (the original disposed it twice).
            prevInput.dispose();
            if (prevFull !== prevInput) prevFull.dispose();
            if (!kvCache) next.dispose();
        }
        return full;
    });
    const arr = await resultTensor.array();
    resultTensor.dispose();
    const sequence = arr[0];
    const eosAt = sequence.indexOf(r.eosToken);
    // Drop the EOS token and everything after it.
    if (eosAt !== -1) sequence.splice(eosAt);
    return await r.decode(sequence);
}
export {
    generateText
};
};
|