@genai-fi/nanogpt 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/NanoGPTModel.d.ts +1 -0
- package/dist/NanoGPTModel.js +20 -17
- package/dist/TeachableLLM.d.ts +4 -2
- package/dist/TeachableLLM.js +20 -15
- package/dist/Trainer.d.ts +2 -0
- package/dist/Trainer.js +10 -5
- package/dist/layers/CausalSelfAttention.d.ts +1 -0
- package/dist/layers/CausalSelfAttention.js +19 -16
- package/dist/layers/LayerNorm.d.ts +1 -0
- package/dist/layers/LayerNorm.js +7 -4
- package/dist/layers/MLP.d.ts +1 -0
- package/dist/layers/MLP.js +16 -13
- package/dist/layers/TiedEmbedding.d.ts +1 -0
- package/dist/layers/TiedEmbedding.js +18 -15
- package/dist/layers/TransformerBlock.d.ts +1 -0
- package/dist/layers/TransformerBlock.js +10 -7
- package/dist/tokeniser/CharTokeniser.js +23 -23
- package/dist/training/FullTrainer.js +27 -29
- package/dist/training/Trainer.d.ts +2 -0
- package/dist/training/Trainer.js +31 -27
- package/dist/utilities/save.d.ts +7 -1
- package/dist/utilities/save.js +28 -13
- package/package.json +1 -1
package/dist/NanoGPTModel.d.ts
CHANGED
package/dist/NanoGPTModel.js
CHANGED
|
@@ -54,7 +54,7 @@ class $ {
|
|
|
54
54
|
}
|
|
55
55
|
inputPhase(t, e = !1) {
|
|
56
56
|
return this.tf.tidy(() => {
|
|
57
|
-
const [, s] = t.shape, i = this.wte.embed(t), n = this.tf.range(0, s, 1, "int32"),
|
|
57
|
+
const [, s] = t.shape, i = this.wte.embed(t), n = this.tf.range(0, s, 1, "int32"), h = this.wpe.apply(n), o = i.add(h);
|
|
58
58
|
return this.drop.apply(o, { training: e });
|
|
59
59
|
});
|
|
60
60
|
}
|
|
@@ -98,8 +98,8 @@ class $ {
|
|
|
98
98
|
throw new Error("No attentions for rollout");
|
|
99
99
|
const e = t[0].shape[0], s = t[0].shape[1], i = this.tf.eye(s, s).expandDims(0);
|
|
100
100
|
let n = i.tile([e, 1, 1]);
|
|
101
|
-
for (const
|
|
102
|
-
let o =
|
|
101
|
+
for (const h of t) {
|
|
102
|
+
let o = h.add(i);
|
|
103
103
|
o = o.div(o.sum(-1, !0)), n = o.matMul(n);
|
|
104
104
|
}
|
|
105
105
|
return n;
|
|
@@ -108,36 +108,36 @@ class $ {
|
|
|
108
108
|
forward(t, e, s = !1, i = !1) {
|
|
109
109
|
return this.validateInput(t), this.tf.tidy(() => {
|
|
110
110
|
let n = this.inputPhase(t, s);
|
|
111
|
-
const
|
|
111
|
+
const h = [];
|
|
112
112
|
for (const c of this.blocks) {
|
|
113
|
-
const { output:
|
|
114
|
-
n =
|
|
113
|
+
const { output: d, attention: l } = c.call(n, s, i);
|
|
114
|
+
n = d, i && l && h.push(l);
|
|
115
115
|
}
|
|
116
116
|
let o;
|
|
117
|
-
i &&
|
|
118
|
-
const
|
|
117
|
+
i && h.length > 0 && (o = this.computeAttentionRollout(h)), n = this.lnF.apply(n);
|
|
118
|
+
const a = this.wte.project(n);
|
|
119
119
|
let r;
|
|
120
|
-
return e && (r = this.calculateLoss(
|
|
120
|
+
return e && (r = this.calculateLoss(a, e)), { logits: a, loss: r, attention: i ? o : void 0 };
|
|
121
121
|
});
|
|
122
122
|
}
|
|
123
123
|
generate(t, e) {
|
|
124
|
-
const s = e?.temperature ?? 1, i = e?.topK, n = e?.usePadding ?? !1,
|
|
124
|
+
const s = e?.temperature ?? 1, i = e?.topK, n = e?.usePadding ?? !1, h = e?.includeAttention ?? !1;
|
|
125
125
|
return this.tf.tidy(() => {
|
|
126
|
-
const o = t,
|
|
127
|
-
[0,
|
|
126
|
+
const o = t, a = o.shape[1], r = a <= this.config.blockSize ? o : o.slice(
|
|
127
|
+
[0, a - this.config.blockSize],
|
|
128
128
|
[o.shape[0], this.config.blockSize]
|
|
129
|
-
), c = n ? this.config.blockSize - r.shape[1] : 0,
|
|
129
|
+
), c = n ? this.config.blockSize - r.shape[1] : 0, d = c > 0 ? this.tf.pad(r, [
|
|
130
130
|
[0, 0],
|
|
131
131
|
[0, c]
|
|
132
|
-
]) : r, { logits: l, attention:
|
|
132
|
+
]) : r, { logits: l, attention: p } = this.forward(d, void 0, !1, h), b = l.shape[1] - 1 - c, u = l.slice([0, b, 0], [l.shape[0], 1, l.shape[2]]), k = p ? p.slice([0, b, 0], [p.shape[0], 1, p.shape[2]]) : void 0, g = u.div(s);
|
|
133
133
|
let f;
|
|
134
134
|
if (i) {
|
|
135
|
-
const { values: w, indices: E } = this.tf.topk(
|
|
135
|
+
const { values: w, indices: E } = this.tf.topk(g, i), y = this.tf.multinomial(w.squeeze([1]), 1);
|
|
136
136
|
f = this.tf.gather(E.squeeze([1]), y, 1);
|
|
137
137
|
} else
|
|
138
|
-
f = this.tf.multinomial(
|
|
138
|
+
f = this.tf.multinomial(g.squeeze([1]), 1);
|
|
139
139
|
let m;
|
|
140
|
-
return e?.includeProbabilities && (m = this.tf.softmax(
|
|
140
|
+
return e?.includeProbabilities && (m = this.tf.softmax(g.squeeze([1]))), f = f.reshape([1, 1]), { output: f, attention: k?.squeeze([1]), probabilities: m };
|
|
141
141
|
});
|
|
142
142
|
}
|
|
143
143
|
getNumParams() {
|
|
@@ -146,6 +146,9 @@ class $ {
|
|
|
146
146
|
this.config.nEmbed * 4 * this.config.nEmbed), i = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
|
|
147
147
|
return t + e + s + i;
|
|
148
148
|
}
|
|
149
|
+
dispose() {
|
|
150
|
+
this.wte.dispose(), this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
|
|
151
|
+
}
|
|
149
152
|
}
|
|
150
153
|
export {
|
|
151
154
|
$ as default
|
package/dist/TeachableLLM.d.ts
CHANGED
|
@@ -2,10 +2,11 @@ import { default as TF } from '@tensorflow/tfjs';
|
|
|
2
2
|
import { GPTConfig } from './config';
|
|
3
3
|
import { ITokeniser } from './tokeniser/type';
|
|
4
4
|
import { default as NanoGPT } from './NanoGPTModel';
|
|
5
|
+
import { SaveOptions } from './utilities/save';
|
|
5
6
|
import { default as Generator, IGenerateOptions } from './Generator';
|
|
6
7
|
import { default as Trainer, ITrainerOptions } from './Trainer';
|
|
7
8
|
import { default as EE } from 'eventemitter3';
|
|
8
|
-
type TeachableLLMStatus = 'warmup' | 'ready' | 'training' | 'loading' | 'busy' | 'error';
|
|
9
|
+
type TeachableLLMStatus = 'warmup' | 'awaitingTokens' | 'ready' | 'training' | 'loading' | 'busy' | 'error';
|
|
9
10
|
export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
|
|
10
11
|
private _config?;
|
|
11
12
|
private _model?;
|
|
@@ -19,7 +20,7 @@ export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
|
|
|
19
20
|
get status(): TeachableLLMStatus;
|
|
20
21
|
get ready(): boolean;
|
|
21
22
|
private setStatus;
|
|
22
|
-
saveModel(): Promise<Blob>;
|
|
23
|
+
saveModel(options?: SaveOptions): Promise<Blob>;
|
|
23
24
|
static loadModel(tf: typeof TF, data: Blob | Buffer | string): TeachableLLM;
|
|
24
25
|
static create(tf: typeof TF, config?: Partial<GPTConfig>): TeachableLLM;
|
|
25
26
|
getNumParams(): number;
|
|
@@ -27,5 +28,6 @@ export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
|
|
|
27
28
|
train(text: string[], options?: ITrainerOptions): Promise<void>;
|
|
28
29
|
generator(): Generator;
|
|
29
30
|
generateText(prompt?: string, options?: IGenerateOptions): Promise<string>;
|
|
31
|
+
dispose(): void;
|
|
30
32
|
}
|
|
31
33
|
export {};
|
package/dist/TeachableLLM.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import d from "./NanoGPTModel.js";
|
|
2
|
-
import { defaultConfig as
|
|
3
|
-
import { saveModel as
|
|
2
|
+
import { defaultConfig as u } from "./config.js";
|
|
3
|
+
import { saveModel as m } from "./utilities/save.js";
|
|
4
4
|
import { loadModel as l } from "./utilities/load.js";
|
|
5
5
|
import f from "./Generator.js";
|
|
6
6
|
import _ from "./Trainer.js";
|
|
7
7
|
import { E as c } from "./index-SOhdqzHq.js";
|
|
8
|
-
import { dummyPassAsync as
|
|
8
|
+
import { dummyPassAsync as h } from "./utilities/dummy.js";
|
|
9
9
|
import g from "./tokeniser/CharTokeniser.js";
|
|
10
|
-
class
|
|
10
|
+
class a extends c {
|
|
11
11
|
_config;
|
|
12
12
|
_model;
|
|
13
13
|
tf;
|
|
@@ -35,20 +35,20 @@ class n extends c {
|
|
|
35
35
|
return this._status;
|
|
36
36
|
}
|
|
37
37
|
get ready() {
|
|
38
|
-
return this._status === "ready" && !!this._model && !!this._tokeniser;
|
|
38
|
+
return this._status === "ready" && !!this._model && !!this._tokeniser && this.tokeniser.trained;
|
|
39
39
|
}
|
|
40
40
|
setStatus(t) {
|
|
41
41
|
this._status !== t && (this._status = t, this.emit("status", t));
|
|
42
42
|
}
|
|
43
|
-
saveModel() {
|
|
43
|
+
saveModel(t) {
|
|
44
44
|
if (!this._model || !this._tokeniser)
|
|
45
45
|
throw new Error("Model or tokeniser is not initialized.");
|
|
46
|
-
return
|
|
46
|
+
return m(this._model, this._tokeniser, t);
|
|
47
47
|
}
|
|
48
48
|
static loadModel(t, r) {
|
|
49
|
-
const e = new
|
|
49
|
+
const e = new a(t);
|
|
50
50
|
return l(t, r).then(({ model: i, tokeniser: o }) => {
|
|
51
|
-
e._model = i, e._tokeniser = o, e._config = i.config, e.setStatus("warmup"),
|
|
51
|
+
e._model = i, e._tokeniser = o, e._config = i.config, e.setStatus("warmup"), h(i).then(() => {
|
|
52
52
|
e.setStatus("ready");
|
|
53
53
|
}).catch((s) => {
|
|
54
54
|
e.setStatus("error"), e.emit("error", s);
|
|
@@ -58,11 +58,13 @@ class n extends c {
|
|
|
58
58
|
}), e;
|
|
59
59
|
}
|
|
60
60
|
static create(t, r = {}) {
|
|
61
|
-
const e = { ...
|
|
62
|
-
return s.setStatus("warmup"),
|
|
63
|
-
s.setStatus("
|
|
64
|
-
|
|
65
|
-
|
|
61
|
+
const e = { ...u, ...r }, i = new g(e.vocabSize), o = new d(t, e), s = new a(t, i, o);
|
|
62
|
+
return s.setStatus("warmup"), h(o).then(() => {
|
|
63
|
+
s.setStatus("awaitingTokens"), s.tokeniser.once("trainStatus", (n) => {
|
|
64
|
+
n === "trained" && s.setStatus("ready");
|
|
65
|
+
});
|
|
66
|
+
}).catch((n) => {
|
|
67
|
+
s.setStatus("error"), s.emit("error", n);
|
|
66
68
|
}), s;
|
|
67
69
|
}
|
|
68
70
|
getNumParams() {
|
|
@@ -96,7 +98,10 @@ class n extends c {
|
|
|
96
98
|
generateText(t, r) {
|
|
97
99
|
return this.generator().generate(t, r);
|
|
98
100
|
}
|
|
101
|
+
dispose() {
|
|
102
|
+
this._model?.dispose();
|
|
103
|
+
}
|
|
99
104
|
}
|
|
100
105
|
export {
|
|
101
|
-
|
|
106
|
+
a as default
|
|
102
107
|
};
|
package/dist/Trainer.d.ts
CHANGED
|
@@ -12,7 +12,9 @@ export interface ITrainerOptions {
|
|
|
12
12
|
}
|
|
13
13
|
export default class Trainer extends EE<'start' | 'stop' | 'log'> {
|
|
14
14
|
private trainer;
|
|
15
|
+
private hasTrained;
|
|
15
16
|
constructor(model: NanoGPT, tokeniser: ITokeniser);
|
|
16
17
|
stop(): void;
|
|
18
|
+
reset(): void;
|
|
17
19
|
train(text: string[], options?: ITrainerOptions): Promise<void>;
|
|
18
20
|
}
|
package/dist/Trainer.js
CHANGED
|
@@ -1,11 +1,16 @@
|
|
|
1
1
|
import { E as l } from "./index-SOhdqzHq.js";
|
|
2
|
-
import
|
|
3
|
-
class
|
|
2
|
+
import h from "./training/FullTrainer.js";
|
|
3
|
+
class m extends l {
|
|
4
4
|
trainer;
|
|
5
|
+
hasTrained = !1;
|
|
5
6
|
constructor(a, t) {
|
|
6
|
-
super(), this.trainer = new
|
|
7
|
+
super(), this.trainer = new h(a.tf, a, t, 1e-3);
|
|
7
8
|
}
|
|
8
9
|
stop() {
|
|
10
|
+
this.trainer.stop();
|
|
11
|
+
}
|
|
12
|
+
reset() {
|
|
13
|
+
this.hasTrained = !1, this.trainer.reset();
|
|
9
14
|
}
|
|
10
15
|
async train(a, t) {
|
|
11
16
|
const { trainDataset: e, validationDataset: r } = await this.trainer.createTrainValidationSplit(
|
|
@@ -13,7 +18,7 @@ class d extends l {
|
|
|
13
18
|
t?.batchSize || 32,
|
|
14
19
|
t?.validationSplit || 0.1
|
|
15
20
|
);
|
|
16
|
-
this.trainer.setLearningRate(t?.learningRate || 1e-3), this.emit("start"), await this.trainer.trainOnDataset(
|
|
21
|
+
this.hasTrained || this.trainer.setLearningRate(t?.learningRate || 1e-3), this.hasTrained = !0, this.emit("start"), await this.trainer.trainOnDataset(
|
|
17
22
|
e,
|
|
18
23
|
{
|
|
19
24
|
prompt: t?.prompt,
|
|
@@ -31,5 +36,5 @@ class d extends l {
|
|
|
31
36
|
}
|
|
32
37
|
}
|
|
33
38
|
export {
|
|
34
|
-
|
|
39
|
+
m as default
|
|
35
40
|
};
|
|
@@ -50,35 +50,38 @@ class m {
|
|
|
50
50
|
this.cAttn.setWeights(t.get(`block_${this.index}_cAttn`) || []), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
|
|
51
51
|
}
|
|
52
52
|
getAttentionScores(t, e, s) {
|
|
53
|
-
const a = t.shape[2],
|
|
53
|
+
const a = t.shape[2], o = this.tf.matMul(t, e, !1, !0).mul(this.divisor), i = this.maskInf.slice([0, 0], [a, a]), n = o.add(i), h = this.tf.softmax(n, -1);
|
|
54
54
|
return this.attnDropout.apply(h, { training: s });
|
|
55
55
|
}
|
|
56
56
|
getQKV(t) {
|
|
57
|
-
const [e, s, a] = t.shape, r = this.cAttn.apply(t), [
|
|
57
|
+
const [e, s, a] = t.shape, r = this.cAttn.apply(t), [o, i, n] = this.tf.split(r, 3, -1);
|
|
58
58
|
r.dispose();
|
|
59
|
-
const h = a / this.config.nHead, c = this.tf.reshape(
|
|
60
|
-
|
|
61
|
-
const
|
|
59
|
+
const h = a / this.config.nHead, c = this.tf.reshape(o, [e, s, this.config.nHead, h]);
|
|
60
|
+
o.dispose();
|
|
61
|
+
const l = c.transpose([0, 2, 1, 3]);
|
|
62
62
|
c.dispose();
|
|
63
|
-
const
|
|
63
|
+
const d = this.tf.reshape(i, [e, s, this.config.nHead, h]);
|
|
64
64
|
i.dispose();
|
|
65
|
-
const u =
|
|
66
|
-
|
|
67
|
-
const
|
|
68
|
-
|
|
69
|
-
const b =
|
|
70
|
-
return
|
|
65
|
+
const u = d.transpose([0, 2, 1, 3]);
|
|
66
|
+
d.dispose();
|
|
67
|
+
const p = this.tf.reshape(n, [e, s, this.config.nHead, h]);
|
|
68
|
+
n.dispose();
|
|
69
|
+
const b = p.transpose([0, 2, 1, 3]);
|
|
70
|
+
return p.dispose(), [l, u, b];
|
|
71
71
|
}
|
|
72
72
|
getOutputProjection(t, e) {
|
|
73
|
-
const s = t.shape[0], a = t.shape[2], r = this.config.nEmbed,
|
|
74
|
-
return this.residDropout.apply(
|
|
73
|
+
const s = t.shape[0], a = t.shape[2], r = this.config.nEmbed, o = t.transpose([0, 2, 1, 3]), i = this.tf.reshape(o, [s, a, r]), n = this.cProj.apply(i);
|
|
74
|
+
return this.residDropout.apply(n, { training: e });
|
|
75
75
|
}
|
|
76
76
|
call(t, e = !1, s = !1) {
|
|
77
77
|
return this.tf.tidy(() => {
|
|
78
|
-
const [a, r,
|
|
79
|
-
return { output: this.getOutputProjection(
|
|
78
|
+
const [a, r, o] = this.getQKV(t), i = this.getAttentionScores(a, r, e), n = this.tf.matMul(i, o);
|
|
79
|
+
return { output: this.getOutputProjection(n, e), attention: s ? i.mean(1) : void 0 };
|
|
80
80
|
});
|
|
81
81
|
}
|
|
82
|
+
dispose() {
|
|
83
|
+
this.cAttn.dispose(), this.cProj.dispose(), this.attnDropout.dispose(), this.residDropout.dispose(), this.bias.dispose(), this.maskInf.dispose(), this.divisor.dispose();
|
|
84
|
+
}
|
|
82
85
|
}
|
|
83
86
|
export {
|
|
84
87
|
m as default
|
package/dist/layers/LayerNorm.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
class
|
|
1
|
+
class h {
|
|
2
2
|
gamma;
|
|
3
3
|
//private beta: TF.Variable;
|
|
4
4
|
epsilon;
|
|
@@ -20,11 +20,14 @@ class u {
|
|
|
20
20
|
}
|
|
21
21
|
apply(a) {
|
|
22
22
|
return this.tf.tidy(() => {
|
|
23
|
-
const s = a.mean(-1, !0), t = a.sub(s),
|
|
24
|
-
return t.mul(
|
|
23
|
+
const s = a.mean(-1, !0), t = a.sub(s), i = t.square().mean(-1, !0).add(this.epsilon).rsqrt();
|
|
24
|
+
return t.mul(i).mul(this.gamma);
|
|
25
25
|
});
|
|
26
26
|
}
|
|
27
|
+
dispose() {
|
|
28
|
+
this.gamma.dispose();
|
|
29
|
+
}
|
|
27
30
|
}
|
|
28
31
|
export {
|
|
29
|
-
|
|
32
|
+
h as default
|
|
30
33
|
};
|
package/dist/layers/MLP.d.ts
CHANGED
package/dist/layers/MLP.js
CHANGED
|
@@ -5,27 +5,27 @@ class l {
|
|
|
5
5
|
tf;
|
|
6
6
|
index;
|
|
7
7
|
_trainable = !0;
|
|
8
|
-
constructor(t,
|
|
9
|
-
this.tf = t, this.index =
|
|
10
|
-
units:
|
|
8
|
+
constructor(t, e, i) {
|
|
9
|
+
this.tf = t, this.index = e, this.cFc = this.tf.layers.dense({
|
|
10
|
+
units: i.mlpFactor * i.nEmbed,
|
|
11
11
|
activation: "gelu",
|
|
12
|
-
useBias:
|
|
12
|
+
useBias: i.biasInLinear,
|
|
13
13
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
14
14
|
mean: 0,
|
|
15
15
|
stddev: 0.02
|
|
16
16
|
}),
|
|
17
17
|
biasInitializer: "zeros",
|
|
18
|
-
name: `block_${
|
|
18
|
+
name: `block_${e}_mlp_cFc`
|
|
19
19
|
}), this.cProj = this.tf.layers.dense({
|
|
20
|
-
units:
|
|
21
|
-
useBias:
|
|
20
|
+
units: i.nEmbed,
|
|
21
|
+
useBias: i.biasInLinear,
|
|
22
22
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
23
23
|
mean: 0,
|
|
24
|
-
stddev: 0.02 / Math.sqrt(2 *
|
|
24
|
+
stddev: 0.02 / Math.sqrt(2 * i.nLayer)
|
|
25
25
|
}),
|
|
26
26
|
biasInitializer: "zeros",
|
|
27
|
-
name: `block_${
|
|
28
|
-
}), this.dropout = this.tf.layers.dropout({ rate:
|
|
27
|
+
name: `block_${e}_mlp_cProj`
|
|
28
|
+
}), this.dropout = this.tf.layers.dropout({ rate: i.dropout });
|
|
29
29
|
}
|
|
30
30
|
get variables() {
|
|
31
31
|
return [
|
|
@@ -45,12 +45,15 @@ class l {
|
|
|
45
45
|
loadWeights(t) {
|
|
46
46
|
this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
|
|
47
47
|
}
|
|
48
|
-
call(t,
|
|
48
|
+
call(t, e = !1) {
|
|
49
49
|
return this.tf.tidy(() => {
|
|
50
|
-
const
|
|
51
|
-
return this.dropout.apply(s, { training:
|
|
50
|
+
const i = this.cFc.apply(t), s = this.cProj.apply(i);
|
|
51
|
+
return this.dropout.apply(s, { training: e });
|
|
52
52
|
});
|
|
53
53
|
}
|
|
54
|
+
dispose() {
|
|
55
|
+
this.cFc.dispose(), this.cProj.dispose(), this.dropout.dispose();
|
|
56
|
+
}
|
|
54
57
|
}
|
|
55
58
|
export {
|
|
56
59
|
l as default
|
|
@@ -168,11 +168,11 @@ const we = /* @__PURE__ */ p({ imag_: Ke });
|
|
|
168
168
|
* limitations under the License.
|
|
169
169
|
* =============================================================================
|
|
170
170
|
*/
|
|
171
|
-
function
|
|
171
|
+
function We(t, e = 0.2) {
|
|
172
172
|
const n = { x: a(t, "x", "leakyRelu") }, r = { alpha: e };
|
|
173
173
|
return u.runKernel(ae, n, r);
|
|
174
174
|
}
|
|
175
|
-
const
|
|
175
|
+
const ze = /* @__PURE__ */ p({ leakyRelu_: We });
|
|
176
176
|
/**
|
|
177
177
|
* @license
|
|
178
178
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
@@ -189,11 +189,11 @@ const Ee = /* @__PURE__ */ p({ leakyRelu_: ze });
|
|
|
189
189
|
* limitations under the License.
|
|
190
190
|
* =============================================================================
|
|
191
191
|
*/
|
|
192
|
-
function
|
|
192
|
+
function Ee(t) {
|
|
193
193
|
const s = { x: a(t, "x", "neg") };
|
|
194
194
|
return u.runKernel(ue, s);
|
|
195
195
|
}
|
|
196
|
-
const Oe = /* @__PURE__ */ p({ neg_:
|
|
196
|
+
const Oe = /* @__PURE__ */ p({ neg_: Ee });
|
|
197
197
|
/**
|
|
198
198
|
* @license
|
|
199
199
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -368,7 +368,7 @@ function Ue(t, e, s, n) {
|
|
|
368
368
|
if (e === "prelu")
|
|
369
369
|
return Fe(t, s);
|
|
370
370
|
if (e === "leakyrelu")
|
|
371
|
-
return
|
|
371
|
+
return ze(t, n);
|
|
372
372
|
if (e === "sigmoid")
|
|
373
373
|
return De(t);
|
|
374
374
|
throw new Error(`Unknown fused activation ${e}.`);
|
|
@@ -397,18 +397,18 @@ function Je({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
|
|
|
397
397
|
}
|
|
398
398
|
let o = a(t, "a", "fused matMul"), c = a(e, "b", "fused matMul");
|
|
399
399
|
[o, c] = A(o, c);
|
|
400
|
-
const b = s ? o.shape[o.rank - 2] : o.shape[o.rank - 1], D = n ? c.shape[c.rank - 1] : c.shape[c.rank - 2], w = s ? o.shape[o.rank - 1] : o.shape[o.rank - 2],
|
|
400
|
+
const b = s ? o.shape[o.rank - 2] : o.shape[o.rank - 1], D = n ? c.shape[c.rank - 1] : c.shape[c.rank - 2], w = s ? o.shape[o.rank - 1] : o.shape[o.rank - 2], W = n ? c.shape[c.rank - 2] : c.shape[c.rank - 1], T = o.shape.slice(0, -2), S = c.shape.slice(0, -2), N = q(T), v = q(S);
|
|
401
401
|
B(b === D, () => `Error in fused matMul: inner shapes (${b}) and (${D}) of Tensors with shapes ${o.shape} and ${c.shape} and transposeA=${s} and transposeB=${n} must match.`);
|
|
402
|
-
const O = P(o.shape.slice(0, -2), c.shape.slice(0, -2)).concat([w,
|
|
402
|
+
const O = P(o.shape.slice(0, -2), c.shape.slice(0, -2)).concat([w, W]), R = s ? f(o, [N, b, w]) : f(o, [N, w, b]), F = n ? f(c, [v, W, D]) : f(c, [v, D, W]);
|
|
403
403
|
let y;
|
|
404
404
|
r != null && (y = a(r, "bias", "fused matMul"), [y] = A(y, o), P(O, y.shape));
|
|
405
405
|
let C;
|
|
406
406
|
l != null && (C = a(l, "prelu weights", "fused matMul"));
|
|
407
407
|
const G = (x, K) => {
|
|
408
|
-
const [g, $, k,
|
|
408
|
+
const [g, $, k, z] = K, m = qe(f(x, k.shape), k, i);
|
|
409
409
|
let _, M;
|
|
410
410
|
if (!s && !n ? (_ = d(m, $, !1, !0), M = d(g, m, !0, !1)) : !s && n ? (_ = d(m, $, !1, !1), M = d(m, g, !0, !1)) : s && !n ? (_ = d($, m, !1, !0), M = d(g, m, !1, !1)) : (_ = d($, m, !0, !0), M = d(m, g, !0, !0)), r != null) {
|
|
411
|
-
const Q = Pe(
|
|
411
|
+
const Q = Pe(z, m);
|
|
412
412
|
return [_, M, Q];
|
|
413
413
|
} else
|
|
414
414
|
return [_, M];
|
|
@@ -425,11 +425,11 @@ function Je({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
|
|
|
425
425
|
);
|
|
426
426
|
return $([K, g, k]), { value: f(k, O), gradFunc: G };
|
|
427
427
|
})(R, F) : U((K, g, $, k) => {
|
|
428
|
-
const
|
|
428
|
+
const z = (
|
|
429
429
|
// tslint:disable-next-line: no-unnecessary-type-assertion
|
|
430
430
|
u.runKernel(H, I, j)
|
|
431
431
|
);
|
|
432
|
-
return k([K, g,
|
|
432
|
+
return k([K, g, z, $]), { value: f(z, O), gradFunc: G };
|
|
433
433
|
})(R, F, y);
|
|
434
434
|
}
|
|
435
435
|
const J = /* @__PURE__ */ p({ fusedMatMul_: Je });
|
|
@@ -442,9 +442,9 @@ const J = /* @__PURE__ */ p({ fusedMatMul_: Je });
|
|
|
442
442
|
* https://opensource.org/licenses/MIT.
|
|
443
443
|
* =============================================================================
|
|
444
444
|
*/
|
|
445
|
-
class
|
|
445
|
+
class E extends Error {
|
|
446
446
|
constructor(e) {
|
|
447
|
-
super(e), Object.setPrototypeOf(this,
|
|
447
|
+
super(e), Object.setPrototypeOf(this, E.prototype);
|
|
448
448
|
}
|
|
449
449
|
}
|
|
450
450
|
/**
|
|
@@ -458,11 +458,11 @@ class W extends Error {
|
|
|
458
458
|
*/
|
|
459
459
|
function Qe(t, e, s, n) {
|
|
460
460
|
if (t.rank < 2 || e.rank < 2)
|
|
461
|
-
throw new
|
|
461
|
+
throw new E(`dot requires both inputs to be rank >= 2 but got x shape = ${t.shape} and y shape = ${e.shape}`);
|
|
462
462
|
if (e.rank >= 3) {
|
|
463
463
|
const r = t.shape.slice(-1)[0], i = e.shape.slice(-2)[0];
|
|
464
464
|
if (r !== i)
|
|
465
|
-
throw new
|
|
465
|
+
throw new E(`If rank y >= 3, then the second last dim of y must equal the last dim of x but got x shape = ${t.shape} and y shape = ${e.shape}`);
|
|
466
466
|
}
|
|
467
467
|
if (t.rank === 2 && e.rank === 2)
|
|
468
468
|
return J({
|
|
@@ -526,6 +526,9 @@ class Ye {
|
|
|
526
526
|
embedDim: this.embedDim
|
|
527
527
|
};
|
|
528
528
|
}
|
|
529
|
+
dispose() {
|
|
530
|
+
this.tiedWeights.dispose();
|
|
531
|
+
}
|
|
529
532
|
}
|
|
530
533
|
export {
|
|
531
534
|
Ye as default
|
|
@@ -10,8 +10,8 @@ class u {
|
|
|
10
10
|
index;
|
|
11
11
|
_trainable = !0;
|
|
12
12
|
skipped = !1;
|
|
13
|
-
constructor(t,
|
|
14
|
-
this.tf = t, this.index =
|
|
13
|
+
constructor(t, i, s) {
|
|
14
|
+
this.tf = t, this.index = i, this.ln1 = new l(t, [s.nEmbed], 1e-5, `block_${this.index}_ln1`), this.attn = new h(this.tf, this.index, s), this.ln2 = new l(t, [s.nEmbed], 1e-5, `block_${this.index}_ln2`), this.mlp = new r(this.tf, this.index, s);
|
|
15
15
|
}
|
|
16
16
|
get variables() {
|
|
17
17
|
return [
|
|
@@ -33,18 +33,21 @@ class u {
|
|
|
33
33
|
loadWeights(t) {
|
|
34
34
|
this.attn.loadWeights(t), this.mlp.loadWeights(t), this.ln1.setWeights(t.get(`block_${this.index}_ln1`) || []), this.ln2.setWeights(t.get(`block_${this.index}_ln2`) || []);
|
|
35
35
|
}
|
|
36
|
-
getMLPOutput(t,
|
|
37
|
-
const
|
|
36
|
+
getMLPOutput(t, i) {
|
|
37
|
+
const s = this.ln2.apply(t), e = this.mlp.call(s, i);
|
|
38
38
|
return t.add(e);
|
|
39
39
|
}
|
|
40
|
-
call(t,
|
|
40
|
+
call(t, i = !1, s = !1) {
|
|
41
41
|
return this.tf.tidy(() => {
|
|
42
42
|
if (this.skipped)
|
|
43
43
|
return { output: t };
|
|
44
|
-
const e = this.ln1.apply(t), n = this.attn.call(e,
|
|
45
|
-
return { output: this.getMLPOutput(a,
|
|
44
|
+
const e = this.ln1.apply(t), n = this.attn.call(e, i, s), a = t.add(n.output);
|
|
45
|
+
return { output: this.getMLPOutput(a, i), attention: n.attention };
|
|
46
46
|
});
|
|
47
47
|
}
|
|
48
|
+
dispose() {
|
|
49
|
+
this.ln1.dispose(), this.attn.dispose(), this.ln2.dispose(), this.mlp.dispose();
|
|
50
|
+
}
|
|
48
51
|
}
|
|
49
52
|
export {
|
|
50
53
|
u as default
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { E as
|
|
2
|
-
const
|
|
3
|
-
class l extends
|
|
1
|
+
import { E as r } from "../index-SOhdqzHq.js";
|
|
2
|
+
const h = ["<eos>", "<unk>"];
|
|
3
|
+
class l extends r {
|
|
4
4
|
vocabSize = 0;
|
|
5
5
|
eosToken = 0;
|
|
6
6
|
unkToken = 0;
|
|
@@ -9,7 +9,7 @@ class l extends h {
|
|
|
9
9
|
constructor(s) {
|
|
10
10
|
if (super(), Array.isArray(s))
|
|
11
11
|
if (this.vocab = s, this.vocab.length > 0)
|
|
12
|
-
this.vocabSize = this.vocab.length, this.eosToken = this.vocab.indexOf("<eos>"), this.unkToken = this.vocab.indexOf("<unk>"), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab.forEach((i, o) => {
|
|
12
|
+
this.vocabSize = this.vocab.length, this.eosToken = this.vocab.indexOf("<eos>"), this.unkToken = this.vocab.indexOf("<unk>"), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<pad>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("_")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf(" ")), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab.forEach((i, o) => {
|
|
13
13
|
this.cache.set(i, o);
|
|
14
14
|
});
|
|
15
15
|
else
|
|
@@ -23,29 +23,29 @@ class l extends h {
|
|
|
23
23
|
destroy() {
|
|
24
24
|
}
|
|
25
25
|
async train(s) {
|
|
26
|
-
const i = s.map((
|
|
27
|
-
if (
|
|
28
|
-
const
|
|
26
|
+
const i = s.map((t) => t.split("")).flat(), o = new Set(i), e = Array.from(o), n = this.vocabSize - h.length;
|
|
27
|
+
if (e.length > n) {
|
|
28
|
+
const t = /* @__PURE__ */ new Map();
|
|
29
29
|
i.forEach((a) => {
|
|
30
|
-
|
|
31
|
-
}),
|
|
32
|
-
} else if (
|
|
33
|
-
for (;
|
|
34
|
-
|
|
35
|
-
return
|
|
36
|
-
this.cache.set(
|
|
37
|
-
}), this.vocabSize;
|
|
30
|
+
t.set(a, (t.get(a) || 0) + 1);
|
|
31
|
+
}), e.sort((a, c) => (t.get(a) || 0) - (t.get(c) || 0)), e.splice(0, e.length - n);
|
|
32
|
+
} else if (e.length < n)
|
|
33
|
+
for (; e.length < n; )
|
|
34
|
+
e.push("<pad>");
|
|
35
|
+
return e.sort((t, a) => t.charCodeAt(0) - a.charCodeAt(0)), this.vocab = [...e, ...h], this.eosToken = this.vocab.indexOf("<eos>"), this.unkToken = this.vocab.indexOf("<unk>"), this.vocabSize = this.vocab.length, this.cache.clear(), this.vocab.forEach((t, a) => {
|
|
36
|
+
this.cache.set(t, a);
|
|
37
|
+
}), this.emit("trainStatus", "trained"), this.vocabSize;
|
|
38
38
|
}
|
|
39
39
|
async tokenise(s, i) {
|
|
40
40
|
if (!this.trained)
|
|
41
41
|
throw new Error("Tokeniser not trained");
|
|
42
|
-
return s.map((
|
|
43
|
-
const
|
|
44
|
-
return
|
|
42
|
+
return s.map((e) => i ? e.split("").map((n) => this.cache.get(n) ?? this.unkToken) : e.split("").map((n) => {
|
|
43
|
+
const t = this.cache.get(n);
|
|
44
|
+
return t !== void 0 ? this.vocab[t] : "<unk>";
|
|
45
45
|
}));
|
|
46
46
|
}
|
|
47
47
|
async detokenise(s) {
|
|
48
|
-
return s.map((o) => o.map((
|
|
48
|
+
return s.map((o) => o.map((e) => this.vocab[e]).join(""));
|
|
49
49
|
}
|
|
50
50
|
async encode(s) {
|
|
51
51
|
return (await this.tokenise([s], !0))[0];
|
|
@@ -60,10 +60,10 @@ class l extends h {
|
|
|
60
60
|
return [];
|
|
61
61
|
}
|
|
62
62
|
async createTrainingData(s, i = 5) {
|
|
63
|
-
const o = await this.tokenise(s, !0),
|
|
64
|
-
for (let
|
|
65
|
-
|
|
66
|
-
return [
|
|
63
|
+
const o = await this.tokenise(s, !0), e = [], n = [];
|
|
64
|
+
for (let t = 0; t < o.length - i; t++)
|
|
65
|
+
e.push(...o[t].slice(0, i)), n.push(o[t + 1][0]);
|
|
66
|
+
return [e, n];
|
|
67
67
|
}
|
|
68
68
|
}
|
|
69
69
|
export {
|
|
@@ -1,70 +1,68 @@
|
|
|
1
1
|
import { generateText as L } from "../utilities/generate.js";
|
|
2
2
|
import w from "./Trainer.js";
|
|
3
|
-
import
|
|
4
|
-
const
|
|
3
|
+
import x from "./Evaluator.js";
|
|
4
|
+
const g = {
|
|
5
5
|
desiredLoss: 0.01,
|
|
6
6
|
logInterval: 1,
|
|
7
7
|
maxSteps: 1e3
|
|
8
8
|
};
|
|
9
|
-
class
|
|
9
|
+
class P extends w {
|
|
10
10
|
constructor(r, i, o, n = 3e-4) {
|
|
11
11
|
super(r, i, o, n);
|
|
12
12
|
}
|
|
13
13
|
// Train for multiple epochs using Dataset API - FIXED memory leaks
|
|
14
14
|
async trainOnDataset(r, i, o) {
|
|
15
|
-
const { desiredLoss: n, logInterval:
|
|
16
|
-
...
|
|
15
|
+
const { desiredLoss: n, logInterval: m, onStep: l, prompt: c, maxSteps: d } = {
|
|
16
|
+
...g,
|
|
17
17
|
...i
|
|
18
|
-
},
|
|
19
|
-
pass: 0,
|
|
20
|
-
depth: 1,
|
|
18
|
+
}, t = {
|
|
21
19
|
step: 0,
|
|
22
|
-
stepSinceDepthChange: 0,
|
|
23
20
|
lastLoss: 1e6,
|
|
24
21
|
totalSteps: 0,
|
|
25
22
|
losses: [],
|
|
26
|
-
validationLosses: []
|
|
23
|
+
validationLosses: [],
|
|
24
|
+
...this.lastState || {}
|
|
27
25
|
};
|
|
28
|
-
this.dummyPass(), this.model.trainable = !0;
|
|
26
|
+
this.lastState = t, this.dummyPass(), this.model.trainable = !0;
|
|
29
27
|
const u = Date.now();
|
|
30
28
|
this.running = !0;
|
|
31
|
-
const
|
|
29
|
+
const h = o ? new x(this.model, o) : void 0, f = await r.iterator();
|
|
32
30
|
try {
|
|
33
|
-
for (; this.running && !(
|
|
31
|
+
for (; this.running && !(t.lastLoss < n); ) {
|
|
34
32
|
const e = await f.next();
|
|
35
33
|
if (e.done) break;
|
|
36
|
-
const
|
|
37
|
-
loss:
|
|
38
|
-
step:
|
|
34
|
+
const p = e.value, v = this.trainBatch(t, p), a = {
|
|
35
|
+
loss: t.lastLoss,
|
|
36
|
+
step: t.step,
|
|
39
37
|
time: Date.now() - u,
|
|
40
|
-
batchSize:
|
|
38
|
+
batchSize: p.xs.shape[0]
|
|
41
39
|
};
|
|
42
|
-
if (this.model.log.push(a),
|
|
43
|
-
if (await v,
|
|
40
|
+
if (this.model.log.push(a), t.step % m === 0) {
|
|
41
|
+
if (await v, h)
|
|
44
42
|
try {
|
|
45
|
-
const
|
|
46
|
-
|
|
47
|
-
} catch (
|
|
48
|
-
console.error("Validation error:",
|
|
43
|
+
const s = await h.evaluate(5);
|
|
44
|
+
t.validationLosses.push(s), a.valLoss = s;
|
|
45
|
+
} catch (s) {
|
|
46
|
+
console.error("Validation error:", s);
|
|
49
47
|
}
|
|
50
48
|
if (l) {
|
|
51
|
-
if (
|
|
52
|
-
const
|
|
49
|
+
if (c) {
|
|
50
|
+
const s = await L(this.tokenizer, this.model, c, 100, {
|
|
53
51
|
temperature: 0.8
|
|
54
52
|
});
|
|
55
|
-
a.example =
|
|
53
|
+
a.example = s;
|
|
56
54
|
}
|
|
57
55
|
await l(a);
|
|
58
56
|
}
|
|
59
57
|
}
|
|
60
|
-
|
|
58
|
+
t.step >= d && this.stop();
|
|
61
59
|
}
|
|
62
60
|
} catch (e) {
|
|
63
61
|
throw console.error("Training error:", e), this.tf.dispose(), e;
|
|
64
62
|
}
|
|
65
|
-
return this.tf.dispose(), this.running = !1, { losses:
|
|
63
|
+
return this.tf.dispose(), this.running = !1, { losses: t.losses, validationLosses: t.validationLosses };
|
|
66
64
|
}
|
|
67
65
|
}
|
|
68
66
|
export {
|
|
69
|
-
|
|
67
|
+
P as default
|
|
70
68
|
};
|
|
@@ -31,8 +31,10 @@ export default abstract class GPTTrainer {
|
|
|
31
31
|
protected tf: typeof TF;
|
|
32
32
|
protected learningRate: number;
|
|
33
33
|
protected running: boolean;
|
|
34
|
+
protected lastState?: TrainingState;
|
|
34
35
|
constructor(tf: typeof TF, model: NanoGPT, tokenizer: ITokeniser, learningRate?: number);
|
|
35
36
|
setLearningRate(learningRate: number): void;
|
|
37
|
+
reset(): void;
|
|
36
38
|
stop(): void;
|
|
37
39
|
getOptimizer(): AdamExt;
|
|
38
40
|
resetOptimizer(config?: AdamConfig): void;
|
package/dist/training/Trainer.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { DatasetBuilder as d } from "./DatasetBuilder.js";
|
|
2
|
-
import
|
|
2
|
+
import h from "./AdamExt.js";
|
|
3
3
|
class u {
|
|
4
|
-
constructor(t,
|
|
5
|
-
this.tokenizer =
|
|
4
|
+
constructor(t, e, s, i = 1e-3) {
|
|
5
|
+
this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, s, e.config.blockSize);
|
|
6
6
|
}
|
|
7
7
|
model;
|
|
8
8
|
optimizer;
|
|
@@ -10,9 +10,13 @@ class u {
|
|
|
10
10
|
tf;
|
|
11
11
|
learningRate;
|
|
12
12
|
running = !1;
|
|
13
|
+
lastState;
|
|
13
14
|
setLearningRate(t) {
|
|
14
15
|
this.learningRate = t, this.resetOptimizer({ learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 });
|
|
15
16
|
}
|
|
17
|
+
reset() {
|
|
18
|
+
this.lastState = void 0, this.running = !1;
|
|
19
|
+
}
|
|
16
20
|
stop() {
|
|
17
21
|
this.running = !1;
|
|
18
22
|
}
|
|
@@ -21,7 +25,7 @@ class u {
|
|
|
21
25
|
}
|
|
22
26
|
resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
|
|
23
27
|
this.optimizer && this.optimizer.dispose();
|
|
24
|
-
const
|
|
28
|
+
const e = new h(
|
|
25
29
|
t.learningRateFactor * this.learningRate,
|
|
26
30
|
t.beta1,
|
|
27
31
|
t.beta2,
|
|
@@ -33,53 +37,53 @@ class u {
|
|
|
33
37
|
weightDecay: 0
|
|
34
38
|
}
|
|
35
39
|
);
|
|
36
|
-
this.optimizer =
|
|
40
|
+
this.optimizer = e;
|
|
37
41
|
}
|
|
38
42
|
printGradients(t) {
|
|
39
|
-
Object.keys(t).forEach((
|
|
40
|
-
const
|
|
41
|
-
console.log(`${
|
|
43
|
+
Object.keys(t).forEach((e) => {
|
|
44
|
+
const s = t[e];
|
|
45
|
+
console.log(`${e}:`), console.log(` Shape: ${s.shape}`), console.log(` Mean: ${this.tf.mean(s).dataSync()[0]}`), console.log(` Std: ${this.tf.moments(s).variance.sqrt().dataSync()[0]}`), console.log(` Min: ${this.tf.min(s).dataSync()[0]}`), console.log(` Max: ${this.tf.max(s).dataSync()[0]}`), console.log(` Norm: ${this.tf.norm(s).dataSync()[0]}`);
|
|
42
46
|
});
|
|
43
47
|
}
|
|
44
|
-
trainStep(t,
|
|
48
|
+
trainStep(t, e = !1, s = !1) {
|
|
45
49
|
return this.tf.tidy(() => {
|
|
46
50
|
const { xs: i, ys: a } = t, o = () => {
|
|
47
51
|
const { loss: l, logits: c } = this.model.forward(i, a, !0);
|
|
48
52
|
return c.dispose(), l;
|
|
49
53
|
}, { value: n, grads: r } = this.tf.variableGrads(o);
|
|
50
|
-
return
|
|
54
|
+
return e || (s && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.tf.dispose(r)), n;
|
|
51
55
|
});
|
|
52
56
|
}
|
|
53
57
|
dummyPass() {
|
|
54
|
-
const t = this.tf.zeros([1, this.model.config.blockSize], "int32"),
|
|
58
|
+
const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), e = this.tf.zeros([1, this.model.config.blockSize, this.model.config.vocabSize]);
|
|
55
59
|
try {
|
|
56
|
-
const
|
|
57
|
-
|
|
58
|
-
} catch (
|
|
59
|
-
console.error("Error during dummy pass:",
|
|
60
|
+
const s = this.trainStep({ xs: t, ys: e }, !0);
|
|
61
|
+
s.dataSync(), s.dispose();
|
|
62
|
+
} catch (s) {
|
|
63
|
+
console.error("Error during dummy pass:", s);
|
|
60
64
|
} finally {
|
|
61
|
-
t.dispose(),
|
|
65
|
+
t.dispose(), e.dispose();
|
|
62
66
|
}
|
|
63
67
|
}
|
|
64
|
-
async trainBatch(t,
|
|
68
|
+
async trainBatch(t, e) {
|
|
65
69
|
try {
|
|
66
|
-
const
|
|
67
|
-
return
|
|
68
|
-
} catch (
|
|
69
|
-
throw console.error(`Error processing batch at step ${t.step}:`,
|
|
70
|
+
const s = this.trainStep(e, !1, !1);
|
|
71
|
+
return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
|
|
72
|
+
} catch (s) {
|
|
73
|
+
throw console.error(`Error processing batch at step ${t.step}:`, s), this.tf.dispose(), s;
|
|
70
74
|
}
|
|
71
75
|
}
|
|
72
|
-
async createTrainValidationSplit(t,
|
|
73
|
-
const i = await this.datasetBuilder.createTextDataset(t,
|
|
76
|
+
async createTrainValidationSplit(t, e = 32, s = 0.1) {
|
|
77
|
+
const i = await this.datasetBuilder.createTextDataset(t, e, 0, 1 - s), a = await this.datasetBuilder.createTextDataset(
|
|
74
78
|
t,
|
|
75
|
-
|
|
76
|
-
1 -
|
|
79
|
+
e,
|
|
80
|
+
1 - s,
|
|
77
81
|
1
|
|
78
82
|
);
|
|
79
83
|
return { trainDataset: i, validationDataset: a };
|
|
80
84
|
}
|
|
81
|
-
async createDataset(t,
|
|
82
|
-
return await this.datasetBuilder.createTextDataset(t,
|
|
85
|
+
async createDataset(t, e = 32) {
|
|
86
|
+
return await this.datasetBuilder.createTextDataset(t, e);
|
|
83
87
|
}
|
|
84
88
|
dispose() {
|
|
85
89
|
this.optimizer && this.optimizer.dispose();
|
package/dist/utilities/save.d.ts
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
1
|
import { default as NanoGPT } from '../NanoGPTModel';
|
|
2
2
|
import { ITokeniser } from '../tokeniser/type';
|
|
3
|
-
export
|
|
3
|
+
export interface SaveOptions {
|
|
4
|
+
includeLog?: boolean;
|
|
5
|
+
name?: string;
|
|
6
|
+
metadata?: Record<string, unknown>;
|
|
7
|
+
files?: Record<string, unknown>;
|
|
8
|
+
}
|
|
9
|
+
export declare function saveModel(model: NanoGPT, tokeniser: ITokeniser, options?: SaveOptions): Promise<Blob>;
|
package/dist/utilities/save.js
CHANGED
|
@@ -1,21 +1,36 @@
|
|
|
1
|
-
import { z as
|
|
2
|
-
import { exportWeights as
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
1
|
+
import { z as g } from "../jszip.min-BLbRbbKt.js";
|
|
2
|
+
import { exportWeights as l } from "./weights.js";
|
|
3
|
+
const b = "1.0.0";
|
|
4
|
+
async function p(t, s, i) {
|
|
5
|
+
const o = i?.includeLog ?? !0, c = t.saveWeights(), e = new g(), f = {};
|
|
6
|
+
for (const [n, a] of c) {
|
|
7
|
+
const r = await l(a);
|
|
8
|
+
f[n] = r.spec, e.file(`${n}.bin`, r.data.buffer, { binary: !0 });
|
|
8
9
|
}
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
if (e.file(
|
|
11
|
+
"manifest.json",
|
|
12
|
+
JSON.stringify({
|
|
13
|
+
weightSpec: f,
|
|
14
|
+
config: t.config,
|
|
15
|
+
version: b,
|
|
16
|
+
application: "@genai-fi/nanogpt",
|
|
17
|
+
meta: i?.metadata,
|
|
18
|
+
name: i?.name
|
|
19
|
+
}),
|
|
20
|
+
{
|
|
21
|
+
binary: !1
|
|
22
|
+
}
|
|
23
|
+
), e.file(
|
|
12
24
|
"tokeniser.json",
|
|
13
|
-
JSON.stringify({ vocab:
|
|
25
|
+
JSON.stringify({ vocab: s.getVocab(), merges: await s.getMerges() }),
|
|
14
26
|
{
|
|
15
27
|
binary: !1
|
|
16
28
|
}
|
|
17
|
-
), e.file("log.json", JSON.stringify(
|
|
29
|
+
), o && e.file("log.json", JSON.stringify(t.log), { binary: !1 }), i?.files)
|
|
30
|
+
for (const [n, a] of Object.entries(i.files))
|
|
31
|
+
e.file(n, JSON.stringify(a), { binary: !1 });
|
|
32
|
+
return e.generateAsync({ type: "blob" });
|
|
18
33
|
}
|
|
19
34
|
export {
|
|
20
|
-
|
|
35
|
+
p as saveModel
|
|
21
36
|
};
|