@genai-fi/nanogpt 0.2.9 → 0.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.d.ts +2 -0
- package/dist/Generator.js +37 -32
- package/dist/NanoGPTModel.d.ts +4 -1
- package/dist/NanoGPTModel.js +33 -25
- package/dist/TeachableLLM.d.ts +4 -0
- package/dist/TeachableLLM.js +31 -16
- package/dist/{complex-Cd8sqiBC.js → complex-x7w5HPOS.js} +6 -6
- package/dist/{index-Dsg28SG6.js → index-CWQLouWz.js} +39 -35
- package/dist/layers/BaseLayer.d.ts +8 -0
- package/dist/layers/BaseLayer.js +18 -0
- package/dist/layers/CausalSelfAttention.d.ts +2 -1
- package/dist/layers/CausalSelfAttention.js +10 -8
- package/dist/layers/MLP.d.ts +2 -1
- package/dist/layers/MLP.js +16 -14
- package/dist/layers/RMSNorm.d.ts +2 -1
- package/dist/layers/RMSNorm.js +13 -11
- package/dist/layers/TiedEmbedding.js +21 -21
- package/dist/layers/TransformerBlock.d.ts +4 -1
- package/dist/layers/TransformerBlock.js +9 -5
- package/dist/{mat_mul-BAYDrXvE.js → mat_mul-4v7St11W.js} +5 -5
- package/dist/ops/attentionMask.js +31 -25
- package/dist/ops/gatherSub.js +2 -2
- package/dist/ops/node/sparseCrossEntropy.js +1 -1
- package/dist/ops/scatterSub.js +8 -8
- package/dist/{stack-1o648CP_.js → stack-CTdK-itU.js} +7 -7
- package/dist/{sum-NWazHI7f.js → sum-CnIf1YOh.js} +3 -3
- package/dist/training/AdamExt.js +1 -1
- package/dist/training/Trainer.js +30 -29
- package/dist/training/sparseCrossEntropy.js +9 -9
- package/dist/utilities/profile.d.ts +10 -0
- package/dist/utilities/profile.js +29 -0
- package/package.json +1 -1
package/dist/Generator.d.ts
CHANGED
|
@@ -8,10 +8,12 @@ export interface IGenerateOptions extends GenerateOptions {
|
|
|
8
8
|
export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
|
|
9
9
|
private readonly model;
|
|
10
10
|
private readonly tokeniser;
|
|
11
|
+
private active;
|
|
11
12
|
constructor(model: NanoGPT, tokeniser: ITokeniser);
|
|
12
13
|
private tokenisePrompt;
|
|
13
14
|
private generateNoCache;
|
|
14
15
|
private processResponse;
|
|
15
16
|
private generateCache;
|
|
16
17
|
generate(prompt?: string, options?: IGenerateOptions): Promise<string>;
|
|
18
|
+
stop(): void;
|
|
17
19
|
}
|
package/dist/Generator.js
CHANGED
|
@@ -1,65 +1,70 @@
|
|
|
1
1
|
import { E as u } from "./index-Dwqa6Zy2.js";
|
|
2
|
-
class
|
|
2
|
+
class f extends u {
|
|
3
3
|
constructor(s, e) {
|
|
4
4
|
super(), this.model = s, this.tokeniser = e;
|
|
5
5
|
}
|
|
6
|
+
active = !1;
|
|
6
7
|
async tokenisePrompt(s) {
|
|
7
8
|
const e = s ? await this.tokeniser.tokenise([s], !0) : [[this.tokeniser.eosToken]];
|
|
8
9
|
return this.model.tf.tensor2d(e, [1, e[0].length], "int32");
|
|
9
10
|
}
|
|
10
11
|
async generateNoCache(s, e) {
|
|
11
|
-
let t = await this.tokenisePrompt(s),
|
|
12
|
-
const
|
|
13
|
-
for (let
|
|
12
|
+
let t = await this.tokenisePrompt(s), i = s || "";
|
|
13
|
+
const o = e?.maxLength ?? 1e3;
|
|
14
|
+
for (let a = 0; a < o && this.active; a++) {
|
|
14
15
|
const {
|
|
15
|
-
output:
|
|
16
|
+
output: n,
|
|
16
17
|
attention: c,
|
|
17
|
-
probabilities:
|
|
18
|
-
} = this.model.generate(t, void 0, e),
|
|
19
|
-
t = this.model.tf.concat([t,
|
|
20
|
-
const r = await this.processResponse(
|
|
21
|
-
if (
|
|
18
|
+
probabilities: l
|
|
19
|
+
} = this.model.generate(t, void 0, e), h = t;
|
|
20
|
+
t = this.model.tf.concat([t, n], 1), h.dispose();
|
|
21
|
+
const r = await this.processResponse(n, c, l);
|
|
22
|
+
if (n.dispose(), r === null)
|
|
22
23
|
break;
|
|
23
|
-
|
|
24
|
+
i += r;
|
|
24
25
|
}
|
|
25
|
-
return t.dispose(),
|
|
26
|
+
return t.dispose(), i;
|
|
26
27
|
}
|
|
27
28
|
async processResponse(s, e, t) {
|
|
28
|
-
const
|
|
29
|
-
if (
|
|
29
|
+
const i = (await s.array())[0][0];
|
|
30
|
+
if (i === this.tokeniser.eosToken)
|
|
30
31
|
return null;
|
|
31
|
-
const
|
|
32
|
-
let
|
|
33
|
-
e && (
|
|
34
|
-
let
|
|
35
|
-
return t && (
|
|
32
|
+
const o = await this.tokeniser.decode([i]);
|
|
33
|
+
let a;
|
|
34
|
+
e && (a = await e.array(), e.dispose());
|
|
35
|
+
let n;
|
|
36
|
+
return t && (n = await t.array(), t.dispose()), this.emit("tokens", [i], o, a, n), o;
|
|
36
37
|
}
|
|
37
38
|
async generateCache(s, e) {
|
|
38
|
-
let t = await this.tokenisePrompt(s),
|
|
39
|
-
const
|
|
40
|
-
for (let
|
|
39
|
+
let t = await this.tokenisePrompt(s), i = s || "";
|
|
40
|
+
const o = new Array(this.model.config.nLayer).fill(void 0), a = e?.maxLength ?? 1e3;
|
|
41
|
+
for (let n = 0; n < a && this.active; n++) {
|
|
41
42
|
const {
|
|
42
43
|
output: c,
|
|
43
|
-
attention:
|
|
44
|
-
probabilities:
|
|
45
|
-
} = this.model.generate(t,
|
|
44
|
+
attention: l,
|
|
45
|
+
probabilities: h
|
|
46
|
+
} = this.model.generate(t, o, {
|
|
46
47
|
...e,
|
|
47
48
|
usePadding: !1
|
|
48
49
|
});
|
|
49
50
|
t.dispose(), t = c;
|
|
50
|
-
const r = await this.processResponse(c,
|
|
51
|
+
const r = await this.processResponse(c, l, h);
|
|
51
52
|
if (r === null)
|
|
52
53
|
break;
|
|
53
|
-
|
|
54
|
+
i += r;
|
|
54
55
|
}
|
|
55
|
-
return t.dispose(),
|
|
56
|
+
return t.dispose(), i;
|
|
56
57
|
}
|
|
57
58
|
async generate(s, e) {
|
|
58
|
-
this.
|
|
59
|
-
|
|
60
|
-
|
|
59
|
+
const t = s && s.length > this.model.config.blockSize ? s.slice(-this.model.config.blockSize) : s;
|
|
60
|
+
this.active = !0, this.emit("start");
|
|
61
|
+
const o = await (this.model.config.useRope && !e?.noCache ? this.generateCache(t, e) : this.generateNoCache(t, e));
|
|
62
|
+
return this.active = !1, this.emit("stop"), o;
|
|
63
|
+
}
|
|
64
|
+
stop() {
|
|
65
|
+
this.active = !1;
|
|
61
66
|
}
|
|
62
67
|
}
|
|
63
68
|
export {
|
|
64
|
-
|
|
69
|
+
f as default
|
|
65
70
|
};
|
package/dist/NanoGPTModel.d.ts
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from './config';
|
|
3
3
|
import { KVCache } from './layers/CausalSelfAttention';
|
|
4
|
+
import { default as MemoryProfiler } from './utilities/profile';
|
|
5
|
+
import { default as BaseLayer } from './layers/BaseLayer';
|
|
4
6
|
export interface TrainingLogEntry {
|
|
5
7
|
loss: number;
|
|
6
8
|
valLoss?: number;
|
|
@@ -16,7 +18,7 @@ export interface GenerateOptions {
|
|
|
16
18
|
includeAttention?: boolean;
|
|
17
19
|
includeProbabilities?: boolean;
|
|
18
20
|
}
|
|
19
|
-
export default class NanoGPT {
|
|
21
|
+
export default class NanoGPT extends BaseLayer {
|
|
20
22
|
readonly config: GPTConfig;
|
|
21
23
|
private wte;
|
|
22
24
|
private wpe?;
|
|
@@ -34,6 +36,7 @@ export default class NanoGPT {
|
|
|
34
36
|
setSkipMask(mask: boolean[]): void;
|
|
35
37
|
setTrainableMask(mask: boolean[]): void;
|
|
36
38
|
set trainable(value: boolean);
|
|
39
|
+
setProfiler(value: MemoryProfiler | undefined): void;
|
|
37
40
|
private validateInput;
|
|
38
41
|
private calculateLoss;
|
|
39
42
|
private computeAttentionRollout;
|
package/dist/NanoGPTModel.js
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
import { defaultConfig as
|
|
1
|
+
import { defaultConfig as v } from "./config.js";
|
|
2
2
|
import z from "./layers/TransformerBlock.js";
|
|
3
3
|
import S from "./layers/TiedEmbedding.js";
|
|
4
|
-
import
|
|
5
|
-
import
|
|
6
|
-
import { estimateParameterCount as
|
|
7
|
-
import { createSoftmaxCrossEntropyWithGrad as
|
|
8
|
-
|
|
4
|
+
import _ from "./layers/RoPECache.js";
|
|
5
|
+
import I from "./layers/RMSNorm.js";
|
|
6
|
+
import { estimateParameterCount as F } from "./utilities/parameters.js";
|
|
7
|
+
import { createSoftmaxCrossEntropyWithGrad as L } from "./training/sparseCrossEntropy.js";
|
|
8
|
+
import P from "./layers/BaseLayer.js";
|
|
9
|
+
class A extends P {
|
|
9
10
|
config;
|
|
10
11
|
wte;
|
|
11
12
|
// Token embeddings
|
|
@@ -21,7 +22,7 @@ class K {
|
|
|
21
22
|
log = [];
|
|
22
23
|
// Training log
|
|
23
24
|
constructor(t, e = {}) {
|
|
24
|
-
this.tf = t, this.config = {
|
|
25
|
+
super(), this.tf = t, this.config = { ...v, ...e }, this.wte = new S(t, {
|
|
25
26
|
vocabSize: this.config.vocabSize,
|
|
26
27
|
embedDim: this.config.nEmbed,
|
|
27
28
|
name: "token_embedding"
|
|
@@ -30,10 +31,10 @@ class K {
|
|
|
30
31
|
outputDim: this.config.nEmbed,
|
|
31
32
|
name: "positional_embedding",
|
|
32
33
|
embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
|
|
33
|
-
}) : this.ropeCache = new
|
|
34
|
+
}) : this.ropeCache = new _(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
|
|
34
35
|
for (let o = 0; o < this.config.nLayer; o++)
|
|
35
36
|
this.blocks.push(new z(this.tf, o, this.config, this.ropeCache));
|
|
36
|
-
this.lnF = new
|
|
37
|
+
this.lnF = new I(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
|
|
37
38
|
}
|
|
38
39
|
get variables() {
|
|
39
40
|
return [
|
|
@@ -86,6 +87,12 @@ class K {
|
|
|
86
87
|
e.trainable = t;
|
|
87
88
|
this.lnF.trainable = t;
|
|
88
89
|
}
|
|
90
|
+
setProfiler(t) {
|
|
91
|
+
this._profiler = t;
|
|
92
|
+
for (const e of this.blocks)
|
|
93
|
+
e.setProfiler(t);
|
|
94
|
+
this.lnF.setProfiler(t);
|
|
95
|
+
}
|
|
89
96
|
validateInput(t) {
|
|
90
97
|
if (t.shape.length !== 2)
|
|
91
98
|
throw new Error(`Invalid input shape: expected [batch_size, sequence_length], got ${t.shape}`);
|
|
@@ -96,7 +103,7 @@ class K {
|
|
|
96
103
|
}
|
|
97
104
|
calculateLoss(t, e) {
|
|
98
105
|
try {
|
|
99
|
-
return
|
|
106
|
+
return L()(t, e).mean();
|
|
100
107
|
} catch (o) {
|
|
101
108
|
throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
|
|
102
109
|
}
|
|
@@ -139,24 +146,25 @@ class K {
|
|
|
139
146
|
}
|
|
140
147
|
forward(t, e, o = !1, i = !1, s) {
|
|
141
148
|
return this.validateInput(t), this.tf.tidy(() => {
|
|
149
|
+
this.startMemory();
|
|
142
150
|
const l = s?.[0]?.length ?? 0;
|
|
143
151
|
let r = this.inputPhase(t, l, o);
|
|
144
152
|
const n = [];
|
|
145
153
|
if (s && s.length !== this.blocks.length)
|
|
146
154
|
throw console.error("Cache", s), new Error(`Cache length ${s.length} does not match number of blocks ${this.blocks.length}`);
|
|
147
155
|
for (let a = 0; a < this.blocks.length; a++) {
|
|
148
|
-
const d = this.blocks[a], {
|
|
149
|
-
output:
|
|
150
|
-
attention:
|
|
156
|
+
const d = r, g = this.blocks[a], {
|
|
157
|
+
output: m,
|
|
158
|
+
attention: b,
|
|
151
159
|
cache: f
|
|
152
|
-
} =
|
|
153
|
-
r =
|
|
160
|
+
} = g.call(r, o, i, s ? s[a] : void 0);
|
|
161
|
+
r = m, d.dispose(), i && b && n.push(b), s && f ? (s[a]?.k.dispose(), s[a]?.v.dispose(), s[a] = f) : f && (f.k.dispose(), f.v.dispose());
|
|
154
162
|
}
|
|
155
163
|
let h;
|
|
156
164
|
i && n.length > 0 && (h = this.computeAttentionRollout(n)), r = this.lnF.apply(r);
|
|
157
165
|
const c = this.wte.project(r);
|
|
158
166
|
let p;
|
|
159
|
-
return e && (p = this.calculateLoss(c, e)), { logits: c, loss: p, attention: i ? h : void 0 };
|
|
167
|
+
return e && (p = this.calculateLoss(c, e)), this.endMemory("Forward"), { logits: c, loss: p, attention: i ? h : void 0 };
|
|
160
168
|
});
|
|
161
169
|
}
|
|
162
170
|
generate(t, e, o) {
|
|
@@ -168,24 +176,24 @@ class K {
|
|
|
168
176
|
), p = l ? this.config.blockSize - c.shape[1] : 0, a = p > 0 ? this.tf.pad(c, [
|
|
169
177
|
[0, 0],
|
|
170
178
|
[0, p]
|
|
171
|
-
]) : c, { logits: d, attention: g } = this.forward(a, void 0, !1, r, e),
|
|
172
|
-
let
|
|
179
|
+
]) : c, { logits: d, attention: g } = this.forward(a, void 0, !1, r, e), m = d.shape[1] - 1 - p, b = d.slice([0, m, 0], [d.shape[0], 1, d.shape[2]]), f = g ? g.slice([0, m, 0], [g.shape[0], 1, g.shape[2]]) : void 0, k = b.div(i);
|
|
180
|
+
let u;
|
|
173
181
|
if (s) {
|
|
174
|
-
const { values:
|
|
175
|
-
|
|
182
|
+
const { values: y, indices: E } = this.tf.topk(k, s), $ = this.tf.multinomial(y.squeeze([1]), 1);
|
|
183
|
+
u = this.tf.gather(E.squeeze([1]), $, 1);
|
|
176
184
|
} else
|
|
177
|
-
|
|
178
|
-
let
|
|
179
|
-
return o?.includeProbabilities && (
|
|
185
|
+
u = this.tf.multinomial(k.squeeze([1]), 1);
|
|
186
|
+
let w;
|
|
187
|
+
return o?.includeProbabilities && (w = this.tf.softmax(k.squeeze([1]))), u = u.reshape([1, 1]), { output: u, attention: f?.squeeze([1]), probabilities: w };
|
|
180
188
|
});
|
|
181
189
|
}
|
|
182
190
|
getNumParams() {
|
|
183
|
-
return
|
|
191
|
+
return F(this.config);
|
|
184
192
|
}
|
|
185
193
|
dispose() {
|
|
186
194
|
this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
|
|
187
195
|
}
|
|
188
196
|
}
|
|
189
197
|
export {
|
|
190
|
-
|
|
198
|
+
A as default
|
|
191
199
|
};
|
package/dist/TeachableLLM.d.ts
CHANGED
|
@@ -6,6 +6,7 @@ import { SaveOptions } from './utilities/save';
|
|
|
6
6
|
import { default as Generator, IGenerateOptions } from './Generator';
|
|
7
7
|
import { default as Trainer, ITrainerOptions } from './Trainer';
|
|
8
8
|
import { default as EE } from 'eventemitter3';
|
|
9
|
+
import { default as MemoryProfiler } from './utilities/profile';
|
|
9
10
|
type TeachableLLMStatus = 'warmup' | 'awaitingTokens' | 'ready' | 'training' | 'loading' | 'busy' | 'error';
|
|
10
11
|
export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
|
|
11
12
|
private _config?;
|
|
@@ -23,6 +24,9 @@ export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
|
|
|
23
24
|
saveModel(options?: SaveOptions): Promise<Blob>;
|
|
24
25
|
static loadModel(tf: typeof TF, data: Blob | Buffer | string): TeachableLLM;
|
|
25
26
|
static create(tf: typeof TF, config?: Partial<GPTConfig>): TeachableLLM;
|
|
27
|
+
getProfiler(): MemoryProfiler | undefined;
|
|
28
|
+
get enableProfiler(): boolean;
|
|
29
|
+
set enableProfiler(value: boolean);
|
|
26
30
|
getNumParams(): number;
|
|
27
31
|
trainer(): Trainer;
|
|
28
32
|
train(text: string[], options?: ITrainerOptions): Promise<void>;
|
package/dist/TeachableLLM.js
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import { defaultConfig as
|
|
2
|
-
import
|
|
3
|
-
import { saveModel as
|
|
4
|
-
import { loadModel as
|
|
5
|
-
import
|
|
1
|
+
import { defaultConfig as h } from "./config.js";
|
|
2
|
+
import d from "./NanoGPTModel.js";
|
|
3
|
+
import { saveModel as m } from "./utilities/save.js";
|
|
4
|
+
import { loadModel as f } from "./utilities/load.js";
|
|
5
|
+
import u from "./Generator.js";
|
|
6
6
|
import _ from "./Trainer.js";
|
|
7
7
|
import { E as c } from "./index-Dwqa6Zy2.js";
|
|
8
|
-
import { dummyPassAsync as
|
|
8
|
+
import { dummyPassAsync as l } from "./utilities/dummy.js";
|
|
9
9
|
import g from "./tokeniser/CharTokeniser.js";
|
|
10
10
|
import "./papaparse.min-C8l2Kvo1.js";
|
|
11
11
|
import "./index-Tf7vU29b.js";
|
|
@@ -13,6 +13,7 @@ import "./jszip.min-CjP2V1VV.js";
|
|
|
13
13
|
import "./ops/scatterSub.js";
|
|
14
14
|
import "./ops/gatherSub.js";
|
|
15
15
|
import "./ops/attentionMask.js";
|
|
16
|
+
import w from "./utilities/profile.js";
|
|
16
17
|
class a extends c {
|
|
17
18
|
_config;
|
|
18
19
|
_model;
|
|
@@ -49,23 +50,23 @@ class a extends c {
|
|
|
49
50
|
saveModel(t) {
|
|
50
51
|
if (!this._model || !this._tokeniser)
|
|
51
52
|
throw new Error("Model or tokeniser is not initialized.");
|
|
52
|
-
return
|
|
53
|
+
return m(this._model, this._tokeniser, t);
|
|
53
54
|
}
|
|
54
55
|
static loadModel(t, r) {
|
|
55
56
|
const e = new a(t);
|
|
56
|
-
return
|
|
57
|
-
e._model =
|
|
57
|
+
return f(t, r).then(({ model: o, tokeniser: s }) => {
|
|
58
|
+
e._model = o, e._tokeniser = s, e._config = o.config, e.setStatus("warmup"), l(o).then(() => {
|
|
58
59
|
e.setStatus("ready");
|
|
59
60
|
}).catch((i) => {
|
|
60
61
|
e.setStatus("error"), e.emit("error", i);
|
|
61
62
|
});
|
|
62
|
-
}).catch((
|
|
63
|
-
e.setStatus("error"), e.emit("error",
|
|
63
|
+
}).catch((o) => {
|
|
64
|
+
e.setStatus("error"), e.emit("error", o);
|
|
64
65
|
}), e;
|
|
65
66
|
}
|
|
66
67
|
static create(t, r = {}) {
|
|
67
|
-
const e = { ...
|
|
68
|
-
return i.setStatus("warmup"),
|
|
68
|
+
const e = { ...h, ...r }, o = new g(e.vocabSize), s = new d(t, e), i = new a(t, o, s);
|
|
69
|
+
return i.setStatus("warmup"), l(s).then(() => {
|
|
69
70
|
i.tokeniser.trained ? i.setStatus("ready") : (i.setStatus("awaitingTokens"), i.tokeniser.once("trainStatus", (n) => {
|
|
70
71
|
n === "trained" && i.setStatus("ready");
|
|
71
72
|
}));
|
|
@@ -73,6 +74,20 @@ class a extends c {
|
|
|
73
74
|
i.setStatus("error"), i.emit("error", n);
|
|
74
75
|
}), i;
|
|
75
76
|
}
|
|
77
|
+
getProfiler() {
|
|
78
|
+
return this._model?.getProfiler();
|
|
79
|
+
}
|
|
80
|
+
get enableProfiler() {
|
|
81
|
+
return !!this._model?.getProfiler();
|
|
82
|
+
}
|
|
83
|
+
set enableProfiler(t) {
|
|
84
|
+
if (t) {
|
|
85
|
+
if (!this._model)
|
|
86
|
+
throw new Error("Model is not initialized.");
|
|
87
|
+
this._model.getProfiler() || this._model.setProfiler(new w());
|
|
88
|
+
} else
|
|
89
|
+
this._model && this._model.setProfiler(void 0);
|
|
90
|
+
}
|
|
76
91
|
getNumParams() {
|
|
77
92
|
if (!this._model)
|
|
78
93
|
throw new Error("Model is not initialized.");
|
|
@@ -84,8 +99,8 @@ class a extends c {
|
|
|
84
99
|
const t = new _(this._model, this._tokeniser);
|
|
85
100
|
return t.on("start", () => this.setStatus("training")), t.on("stop", () => this.setStatus("ready")), t.on("log", async (r) => {
|
|
86
101
|
const e = this.listeners("trainStep");
|
|
87
|
-
for (const
|
|
88
|
-
await
|
|
102
|
+
for (const o of e)
|
|
103
|
+
await o(r);
|
|
89
104
|
}), t;
|
|
90
105
|
}
|
|
91
106
|
train(t, r) {
|
|
@@ -94,7 +109,7 @@ class a extends c {
|
|
|
94
109
|
generator() {
|
|
95
110
|
if (!this._model || !this._tokeniser)
|
|
96
111
|
throw new Error("Model or tokeniser is not initialized.");
|
|
97
|
-
const t = new
|
|
112
|
+
const t = new u(this._model, this._tokeniser);
|
|
98
113
|
return t.on("start", () => {
|
|
99
114
|
this.status === "ready" && this.setStatus("busy");
|
|
100
115
|
}), t.on("stop", () => {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { o as
|
|
1
|
+
import { o as c, d as s, g as n, E as m, C as r } from "./index-CWQLouWz.js";
|
|
2
2
|
/**
|
|
3
3
|
* @license
|
|
4
4
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -15,13 +15,13 @@ import { o as t, c as s, f as n, E as m, C as r } from "./index-Dsg28SG6.js";
|
|
|
15
15
|
* limitations under the License.
|
|
16
16
|
* =============================================================================
|
|
17
17
|
*/
|
|
18
|
-
function l(o,
|
|
19
|
-
const a = s(o, "real", "complex"), e = s(
|
|
18
|
+
function l(o, p) {
|
|
19
|
+
const a = s(o, "real", "complex"), e = s(p, "imag", "complex");
|
|
20
20
|
n(a.shape, e.shape, `real and imag shapes, ${a.shape} and ${e.shape}, must match in call to tf.complex().`);
|
|
21
|
-
const
|
|
22
|
-
return m.runKernel(r,
|
|
21
|
+
const t = { real: a, imag: e };
|
|
22
|
+
return m.runKernel(r, t);
|
|
23
23
|
}
|
|
24
|
-
const i = /* @__PURE__ */
|
|
24
|
+
const i = /* @__PURE__ */ c({ complex_: l });
|
|
25
25
|
export {
|
|
26
26
|
i as c
|
|
27
27
|
};
|
|
@@ -2068,6 +2068,9 @@ function Sn(n, t) {
|
|
|
2068
2068
|
function Ys() {
|
|
2069
2069
|
return g;
|
|
2070
2070
|
}
|
|
2071
|
+
function Qs() {
|
|
2072
|
+
return g.memory();
|
|
2073
|
+
}
|
|
2071
2074
|
function E(n, t) {
|
|
2072
2075
|
return g.tidy(n, t);
|
|
2073
2076
|
}
|
|
@@ -2890,7 +2893,7 @@ function Yn(n, t, e) {
|
|
|
2890
2893
|
* limitations under the License.
|
|
2891
2894
|
* =============================================================================
|
|
2892
2895
|
*/
|
|
2893
|
-
function
|
|
2896
|
+
function Zs(n, t) {
|
|
2894
2897
|
const e = [];
|
|
2895
2898
|
for (let s = 0; s < t.length; s++) {
|
|
2896
2899
|
const r = n[n.length - s - 1], i = t.length - s - 1, o = t[i];
|
|
@@ -3058,7 +3061,7 @@ function ss(n, t) {
|
|
|
3058
3061
|
a[u] != null && (c[l.name] = a[u]);
|
|
3059
3062
|
}), s?.forEach((l) => c[l.name] = null), { value: o, grads: c };
|
|
3060
3063
|
}
|
|
3061
|
-
function
|
|
3064
|
+
function tr(n) {
|
|
3062
3065
|
return g.customGrad(n);
|
|
3063
3066
|
}
|
|
3064
3067
|
/**
|
|
@@ -3841,51 +3844,52 @@ export {
|
|
|
3841
3844
|
ds as A,
|
|
3842
3845
|
Es as B,
|
|
3843
3846
|
As as C,
|
|
3844
|
-
|
|
3847
|
+
C as D,
|
|
3845
3848
|
g as E,
|
|
3846
|
-
|
|
3849
|
+
zs as F,
|
|
3847
3850
|
Ms as G,
|
|
3848
|
-
|
|
3851
|
+
Bs as H,
|
|
3849
3852
|
Fs as I,
|
|
3850
|
-
|
|
3851
|
-
|
|
3853
|
+
$s as J,
|
|
3854
|
+
Cs as K,
|
|
3852
3855
|
Rs as L,
|
|
3853
3856
|
xs as M,
|
|
3854
3857
|
Ns as N,
|
|
3855
|
-
|
|
3858
|
+
Ps as O,
|
|
3856
3859
|
Ds as P,
|
|
3857
|
-
|
|
3860
|
+
Os as Q,
|
|
3858
3861
|
_s as R,
|
|
3859
3862
|
Ws as S,
|
|
3860
|
-
|
|
3861
|
-
|
|
3862
|
-
|
|
3863
|
-
|
|
3863
|
+
Us as T,
|
|
3864
|
+
Vs as U,
|
|
3865
|
+
Ks as V,
|
|
3866
|
+
Zs as W,
|
|
3867
|
+
Qn as X,
|
|
3864
3868
|
qs as _,
|
|
3865
|
-
|
|
3866
|
-
|
|
3867
|
-
|
|
3868
|
-
|
|
3869
|
+
p as a,
|
|
3870
|
+
Z as b,
|
|
3871
|
+
Js as c,
|
|
3872
|
+
I as d,
|
|
3869
3873
|
Ys as e,
|
|
3870
|
-
|
|
3871
|
-
|
|
3872
|
-
|
|
3873
|
-
|
|
3874
|
-
|
|
3875
|
-
|
|
3876
|
-
|
|
3877
|
-
|
|
3878
|
-
|
|
3874
|
+
V as f,
|
|
3875
|
+
Is as g,
|
|
3876
|
+
Xs as h,
|
|
3877
|
+
y as i,
|
|
3878
|
+
Ls as j,
|
|
3879
|
+
$t as k,
|
|
3880
|
+
Dt as l,
|
|
3881
|
+
Qs as m,
|
|
3882
|
+
Zt as n,
|
|
3879
3883
|
F as o,
|
|
3880
|
-
|
|
3881
|
-
|
|
3884
|
+
G as p,
|
|
3885
|
+
De as q,
|
|
3882
3886
|
Hs as r,
|
|
3883
3887
|
K as s,
|
|
3884
|
-
|
|
3885
|
-
|
|
3886
|
-
|
|
3887
|
-
|
|
3888
|
-
|
|
3889
|
-
|
|
3890
|
-
|
|
3888
|
+
Gs as t,
|
|
3889
|
+
vs as u,
|
|
3890
|
+
Ts as v,
|
|
3891
|
+
w,
|
|
3892
|
+
js as x,
|
|
3893
|
+
tr as y,
|
|
3894
|
+
E as z
|
|
3891
3895
|
};
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { default as MemoryProfiler } from '../utilities/profile';
|
|
2
|
+
export default abstract class BaseLayer {
|
|
3
|
+
protected _profiler?: MemoryProfiler;
|
|
4
|
+
getProfiler(): MemoryProfiler | undefined;
|
|
5
|
+
setProfiler(value: MemoryProfiler | undefined): void;
|
|
6
|
+
startMemory(): void;
|
|
7
|
+
endMemory(label: string): void;
|
|
8
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
class t {
|
|
2
|
+
_profiler;
|
|
3
|
+
getProfiler() {
|
|
4
|
+
return this._profiler;
|
|
5
|
+
}
|
|
6
|
+
setProfiler(r) {
|
|
7
|
+
this._profiler = r;
|
|
8
|
+
}
|
|
9
|
+
startMemory() {
|
|
10
|
+
this._profiler?.startMemory();
|
|
11
|
+
}
|
|
12
|
+
endMemory(r) {
|
|
13
|
+
this._profiler?.endMemory(r);
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
export {
|
|
17
|
+
t as default
|
|
18
|
+
};
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
3
|
import { default as RoPECache } from './RoPECache';
|
|
4
|
+
import { default as BaseLayer } from './BaseLayer';
|
|
4
5
|
export type KVCache = {
|
|
5
6
|
k: TF.Tensor;
|
|
6
7
|
v: TF.Tensor;
|
|
7
8
|
length: number;
|
|
8
9
|
cumulativeLength: number;
|
|
9
10
|
};
|
|
10
|
-
export default class CausalSelfAttention {
|
|
11
|
+
export default class CausalSelfAttention extends BaseLayer {
|
|
11
12
|
private readonly ropeCache?;
|
|
12
13
|
private config;
|
|
13
14
|
private cAttn;
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { attentionMask as z } from "../ops/attentionMask.js";
|
|
2
|
-
|
|
2
|
+
import S from "./BaseLayer.js";
|
|
3
|
+
class C extends S {
|
|
3
4
|
constructor(t, i, s, e) {
|
|
4
|
-
this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
|
|
5
|
+
super(), this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
|
|
5
6
|
units: 3 * s.nEmbed,
|
|
6
7
|
useBias: s.biasInLinear,
|
|
7
8
|
name: `block_${i}_attn_cAttn`,
|
|
@@ -94,23 +95,24 @@ class j {
|
|
|
94
95
|
if (e && !this.config.useRope)
|
|
95
96
|
throw new Error("Cannot use pastKV without RoPE enabled");
|
|
96
97
|
return this.tf.tidy(() => {
|
|
98
|
+
this.startMemory();
|
|
97
99
|
const [o, c, r] = this.getQKV(t), h = o.shape[2], a = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(o, c, u) : [o, c];
|
|
98
100
|
let n = d, l = r, p = 0;
|
|
99
101
|
e && (p = e.length, n = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, r], 2));
|
|
100
102
|
const b = n.shape[2];
|
|
101
103
|
if (b > a) {
|
|
102
|
-
const k = b - a, g = n.shape[0],
|
|
103
|
-
n = n.slice([0, 0, k, 0], [g,
|
|
104
|
+
const k = b - a, g = n.shape[0], A = n.shape[1], I = n.shape[3];
|
|
105
|
+
n = n.slice([0, 0, k, 0], [g, A, a, I]), l = l.slice([0, 0, k, 0], [g, A, a, I]), p = a - h;
|
|
104
106
|
}
|
|
105
107
|
let m;
|
|
106
108
|
p > 0 ? m = this.getAttentionScoresWithPast(f, n, i, p) : m = this.getAttentionScores(f, n, i);
|
|
107
|
-
const
|
|
109
|
+
const _ = this.tf.matMul(m, l), v = this.getOutputProjection(_, i), y = {
|
|
108
110
|
k: this.tf.keep(n),
|
|
109
111
|
v: this.tf.keep(l),
|
|
110
112
|
length: p + h,
|
|
111
113
|
cumulativeLength: e ? e.cumulativeLength + h : h
|
|
112
|
-
};
|
|
113
|
-
return { output:
|
|
114
|
+
}, P = s ? m.mean(1) : void 0;
|
|
115
|
+
return this.endMemory("CausalSelfAttention"), { output: v, attention: P, presentKV: y };
|
|
114
116
|
});
|
|
115
117
|
}
|
|
116
118
|
dispose() {
|
|
@@ -118,5 +120,5 @@ class j {
|
|
|
118
120
|
}
|
|
119
121
|
}
|
|
120
122
|
export {
|
|
121
|
-
|
|
123
|
+
C as default
|
|
122
124
|
};
|
package/dist/layers/MLP.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
|
-
|
|
3
|
+
import { default as BaseLayer } from './BaseLayer';
|
|
4
|
+
export default class MLP extends BaseLayer {
|
|
4
5
|
private cFc;
|
|
5
6
|
private cProj;
|
|
6
7
|
private dropout;
|