@genai-fi/nanogpt 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.d.ts +1 -0
- package/dist/Generator.js +49 -40
- package/dist/NanoGPTModel.d.ts +2 -0
- package/dist/NanoGPTModel.js +25 -24
- package/dist/config.d.ts +1 -0
- package/dist/config.js +4 -3
- package/dist/layers/MLP.js +3 -3
- package/dist/training/Evaluator.d.ts +8 -0
- package/dist/training/Evaluator.js +22 -0
- package/dist/training/FullTrainer.js +15 -14
- package/dist/training/LayerTrainer.js +33 -35
- package/dist/training/Trainer.d.ts +0 -1
- package/dist/training/Trainer.js +13 -22
- package/dist/utilities/generate.js +12 -10
- package/package.json +1 -1
package/dist/Generator.d.ts
CHANGED
package/dist/Generator.js
CHANGED
@@ -1,53 +1,62 @@
  import { E as m } from "./index-SOhdqzHq.js";
- const …
- class …
- constructor(…
- super(), this.model = …
+ const b = 4;
+ class x extends m {
+ constructor(a, t) {
+ super(), this.model = a, this.tokeniser = t;
  }
- generateBlockOfTokens(…
- const …
- let …
- for (let …
- const { …
- …
- …
- …
- …
- …
- …
- …
- …
- …
- …
+ generateBlockOfTokens(a, t) {
+ const g = t?.temperature ?? 1, c = t?.topK, d = t?.usePadding ?? t?.includeAttention ?? !1, k = t?.includeAttention ?? !1, h = t?.includeProbabilities ?? !1;
+ let i = a, n, s;
+ for (let e = 0; e < b; e++) {
+ const {
+ output: u,
+ attention: l,
+ probabilities: r
+ } = this.model.generate(i, {
+ temperature: g,
+ topK: c,
+ usePadding: d,
+ includeAttention: k,
+ includeProbabilities: h
+ }), p = i;
+ if (i = this.model.tf.concat([i, u], 1), n && l) {
+ const o = n;
+ n = this.model.tf.concat([n, l], 0), o.dispose();
+ } else l && (n = l);
+ if (s && r) {
+ const o = s;
+ s = this.model.tf.concat([s, r], 0), o.dispose();
+ } else r && (s = r);
+ p.dispose(), u.dispose();
  }
- return { output: …
+ return { output: i, attention: n, probabilities: s };
  }
- async generate(…
- const …
- let …
+ async generate(a, t) {
+ const g = a ? await this.tokeniser.tokenise([a], !0) : [[this.tokeniser.eosToken]];
+ let c = this.model.tf.tensor2d(g, [1, g[0].length], "int32");
  this.emit("start");
- let …
+ let d = a || "";
  for (; ; ) {
- const { output: …
- …
- const …
- …
- …
- …
- …
+ const { output: k, attention: h, probabilities: i } = this.generateBlockOfTokens(c, t), n = c;
+ c = k;
+ const s = k.slice([0, n.shape[1]], [1, b]), e = (await s.array())[0];
+ n.dispose(), s.dispose();
+ let u = !1, l = !1;
+ const r = e.indexOf(this.tokeniser.eosToken);
+ r !== -1 && (u = !0, e.splice(r)), e.length + d.length >= (t?.maxLength ?? 1e3) && (l = !0, e.splice(
+ t?.maxLength ? t.maxLength - d.length : e.length
  ));
- const …
- …
- …
- …
- …
- …
- if (n.dispose(), l.dispose(), i || h)
+ const p = await this.tokeniser.decode(e);
+ d += p;
+ let o;
+ h && (o = await h.array(), h.dispose(), o.length > e.length && (o = o.slice(0, e.length)));
+ let f;
+ if (i && (f = await i.array(), i.dispose(), f.length > e.length && (f = f.slice(0, e.length))), this.emit("tokens", e, p, o, f), u || l)
  break;
  }
- return …
+ return c.dispose(), this.emit("stop"), d;
  }
  }
  export {
- …
+ x as default
  };
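Note: the rewritten Generator samples in blocks of four tokens (const b = 4), accumulating the optional attention and probability tensors across each block and disposing intermediates as it goes. A TypeScript usage sketch, assuming the event-emitter base class exposes a Node-style on() listener (the event names come from the emit calls visible above; the class name and listener signature are illustrative, not confirmed by the source):

  // Stream text from a trained model; "tokens" fires after each 4-token block.
  const generator = new Generator(model, tokeniser);
  generator.on("tokens", (ids: number[], text: string, attention?: number[][], probabilities?: number[][]) => {
    process.stdout.write(text); // decoded text for this block
  });
  const fullText = await generator.generate("Once upon a time", {
    temperature: 0.8,
    topK: 10,
    maxLength: 200,             // generation stops once the accumulated text reaches this length
    includeProbabilities: true, // new in 0.1.5: per-token softmax distributions
  });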
package/dist/NanoGPTModel.d.ts
CHANGED
@@ -13,6 +13,7 @@ export interface GenerateOptions {
      topK?: number;
      usePadding?: boolean;
      includeAttention?: boolean;
+     includeProbabilities?: boolean;
  }
  export default class NanoGPT {
      readonly config: GPTConfig;
@@ -42,6 +43,7 @@ export default class NanoGPT {
      generate(idx: TF.Tensor, options?: GenerateOptions): {
          output: TF.Tensor;
          attention?: TF.Tensor;
+         probabilities?: TF.Tensor;
      };
      getNumParams(): number;
  }
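The only declaration changes are the new includeProbabilities flag and the matching probabilities tensor on generate's return type. A short sketch of reading it (per the implementation below, probabilities is the softmax of the temperature-scaled logits at the sampled position, shape [1, vocabSize]):

  const { output, probabilities } = model.generate(idx, {
    temperature: 0.8,
    includeProbabilities: true,
  });
  if (probabilities) {
    const dist = await probabilities.array(); // [[p(token 0), p(token 1), …]]
    probabilities.dispose();
  }
  output.dispose();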
package/dist/NanoGPTModel.js
CHANGED
@@ -1,7 +1,7 @@
- import { defaultConfig as …
- import …
- import …
- import …
+ import { defaultConfig as z } from "./config.js";
+ import v from "./layers/TransformerBlock.js";
+ import S from "./layers/TiedEmbedding.js";
+ import _ from "./layers/LayerNorm.js";
  class $ {
  config;
  wte;
@@ -17,7 +17,7 @@ class $ {
  log = [];
  // Training log
  constructor(t, e = {}) {
- this.tf = t, this.config = { ...…
+ this.tf = t, this.config = { ...z, ...e }, this.wte = new S(t, {
  vocabSize: this.config.vocabSize,
  embedDim: this.config.nEmbed,
  name: "token_embedding"
@@ -28,8 +28,8 @@ class $ {
  embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
  }), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
  for (let s = 0; s < this.config.nLayer; s++)
- this.blocks.push(new …
- this.lnF = new …
+ this.blocks.push(new v(this.tf, s, this.config));
+ this.lnF = new _(t, [this.config.nEmbed], 1e-5, "final_layer_norm");
  }
@@ -54,7 +54,7 @@ class $ {
  }
  inputPhase(t, e = !1) {
  return this.tf.tidy(() => {
- const [, s] = t.shape, i = this.wte.embed(t), n = this.tf.range(0, s, 1, "int32"), …
+ const [, s] = t.shape, i = this.wte.embed(t), n = this.tf.range(0, s, 1, "int32"), a = this.wpe.apply(n), o = i.add(a);
  return this.drop.apply(o, { training: e });
  });
  }
@@ -98,8 +98,8 @@ class $ {
  throw new Error("No attentions for rollout");
  const e = t[0].shape[0], s = t[0].shape[1], i = this.tf.eye(s, s).expandDims(0);
  let n = i.tile([e, 1, 1]);
- for (const …
- let o = …
+ for (const a of t) {
+ let o = a.add(i);
  o = o.div(o.sum(-1, !0)), n = o.matMul(n);
  }
  return n;
@@ -108,35 +108,36 @@ class $ {
  forward(t, e, s = !1, i = !1) {
  return this.validateInput(t), this.tf.tidy(() => {
  let n = this.inputPhase(t, s);
- const …
+ const a = [];
  for (const c of this.blocks) {
- const { output: p, attention: …
- n = p, i && …
+ const { output: p, attention: l } = c.call(n, s, i);
+ n = p, i && l && a.push(l);
  }
  let o;
- i && …
- const …
+ i && a.length > 0 && (o = this.computeAttentionRollout(a)), n = this.lnF.apply(n);
+ const h = this.wte.project(n);
  let r;
- return e && (r = this.calculateLoss(…
+ return e && (r = this.calculateLoss(h, e)), { logits: h, loss: r, attention: i ? o : void 0 };
  });
  }
  generate(t, e) {
- const s = e?.temperature ?? 1, i = e?.topK, n = e?.usePadding ?? !1, …
+ const s = e?.temperature ?? 1, i = e?.topK, n = e?.usePadding ?? !1, a = e?.includeAttention ?? !1;
  return this.tf.tidy(() => {
- const o = t, …
- [0, …
+ const o = t, h = o.shape[1], r = h <= this.config.blockSize ? o : o.slice(
+ [0, h - this.config.blockSize],
  [o.shape[0], this.config.blockSize]
  ), c = n ? this.config.blockSize - r.shape[1] : 0, p = c > 0 ? this.tf.pad(r, [
  [0, 0],
  [0, c]
- ]) : r, { logits: …
+ ]) : r, { logits: l, attention: g } = this.forward(p, void 0, !1, a), b = l.shape[1] - 1 - c, u = l.slice([0, b, 0], [l.shape[0], 1, l.shape[2]]), k = g ? g.slice([0, b, 0], [g.shape[0], 1, g.shape[2]]) : void 0, d = u.div(s);
  let f;
  if (i) {
- const { values: …
- f = this.tf.gather(…
+ const { values: w, indices: E } = this.tf.topk(d, i), y = this.tf.multinomial(w.squeeze([1]), 1);
+ f = this.tf.gather(E.squeeze([1]), y, 1);
  } else
- f = this.tf.multinomial(…
- …
+ f = this.tf.multinomial(d.squeeze([1]), 1);
+ let m;
+ return e?.includeProbabilities && (m = this.tf.softmax(d.squeeze([1]))), f = f.reshape([1, 1]), { output: f, attention: k?.squeeze([1]), probabilities: m };
  });
  }
  getNumParams() {
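In generate, the logits at the last real (non-padded) position are divided by temperature; with topK set, tf.topk keeps the k highest logits, tf.multinomial samples one of those k slots (it treats its input as unnormalised logits), and the slot is mapped back to a vocabulary id. A self-contained sketch of that branch, simplified to synchronous reads (the library code stays on tensors):

  import * as tf from "@tensorflow/tfjs";

  // logits: [1, vocabSize], already divided by temperature
  function sampleTopK(logits: tf.Tensor2D, k: number): number {
    const { values, indices } = tf.topk(logits, k); // both [1, k]
    const pick = tf.multinomial(values, 1);         // [1, 1] slot within the top k
    const slot = pick.dataSync()[0];
    const id = indices.dataSync()[slot];            // vocabulary id of the sampled token
    values.dispose(); indices.dispose(); pick.dispose();
    return id;
  }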
package/dist/config.d.ts
CHANGED
package/dist/config.js
CHANGED
@@ -1,4 +1,4 @@
- const …
+ const a = {
  vocabSize: 50304,
  // GPT-2 vocab size
  blockSize: 1024,
@@ -12,8 +12,9 @@ const e = {
  dropout: 0,
  // Dropout probability
  biasInLinear: !1,
- biasInLayerNorm: !1
+ biasInLayerNorm: !1,
+ mlpFactor: 4
  };
  export {
- …
+ a as defaultConfig
  };
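The new mlpFactor option (default 4) makes the feed-forward expansion ratio configurable: as the MLP.js diff below shows, each block's hidden dense layer is now mlpFactor * nEmbed units instead of a hard-coded 4 * nEmbed. A sketch of overriding it through the constructor merge ({ ...defaultConfig, ...options } in NanoGPTModel.js); the import path is illustrative:

  import * as tf from "@tensorflow/tfjs";
  import NanoGPT from "@genai-fi/nanogpt";

  // Narrower feed-forward: hidden size 2 * nEmbed instead of the default 4 * nEmbed.
  const model = new NanoGPT(tf, { nEmbed: 256, nLayer: 4, mlpFactor: 2 });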
package/dist/layers/MLP.js
CHANGED
@@ -1,4 +1,4 @@
- class …
+ class l {
  cFc;
  cProj;
  dropout;
@@ -7,7 +7,7 @@ class n {
  _trainable = !0;
  constructor(t, i, e) {
  this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
- units: …
+ units: e.mlpFactor * e.nEmbed,
  activation: "gelu",
  useBias: e.biasInLinear,
  kernelInitializer: this.tf.initializers.randomNormal({
@@ -53,5 +53,5 @@ class n {
  }
  }
  export {
- …
+ l as default
  };
package/dist/training/Evaluator.d.ts
ADDED
@@ -0,0 +1,8 @@
+ import { default as NanoGPT } from '../NanoGPTModel';
+ import { default as TF } from '@tensorflow/tfjs';
+ export default class Evaluator {
+     private model;
+     private iterator;
+     constructor(model: NanoGPT, dataset: TF.data.Dataset<TF.TensorContainer>);
+     evaluate(maxBatches?: number): Promise<number>;
+ }
package/dist/training/Evaluator.js
ADDED
@@ -0,0 +1,22 @@
+ class p {
+ constructor(s, t) {
+ this.model = s, this.iterator = t.iterator();
+ }
+ iterator;
+ async evaluate(s = 100) {
+ let t = 0, o = 0;
+ const c = await this.iterator;
+ for (let a = 0; a < s; a++) {
+ const e = await c.next();
+ if (e.done) break;
+ const n = e.value, { xs: r, ys: l } = n, { loss: i, logits: u } = this.model.forward(r, l, !1, !1);
+ u.dispose(), r.dispose(), l.dispose();
+ const d = i.arraySync();
+ i.dispose(), t += d, o++;
+ }
+ return t / o;
+ }
+ }
+ export {
+ p as default
+ };
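Evaluator replaces the base trainer's evaluateOnDataset (removed in Trainer.js below). It holds one persistent iterator over the validation dataset, so successive evaluate calls continue from where the previous one stopped instead of re-reading the same leading batches (the old take(n).forEachAsync restarted from the top each time); each call runs up to maxBatches forward passes with training disabled and returns the mean loss. Usage sketch:

  const evaluator = new Evaluator(model, validationDataset);
  const meanLoss = await evaluator.evaluate(5); // mean loss over up to 5 batches
  console.log(`validation loss: ${meanLoss.toFixed(4)}`);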
package/dist/training/FullTrainer.js
CHANGED
@@ -1,18 +1,19 @@
  import { generateText as L } from "../utilities/generate.js";
  import w from "./Trainer.js";
- …
+ import g from "./Evaluator.js";
+ const x = {
  desiredLoss: 0.01,
  logInterval: 1,
  maxSteps: 1e3
  };
- class …
+ class D extends w {
  constructor(r, i, o, n = 3e-4) {
  super(r, i, o, n);
  }
  // Train for multiple epochs using Dataset API - FIXED memory leaks
  async trainOnDataset(r, i, o) {
- const { desiredLoss: n, logInterval: …
- ...…
+ const { desiredLoss: n, logInterval: d, onStep: l, prompt: p, maxSteps: m } = {
+ ...x,
  ...i
  }, s = {
  pass: 0,
@@ -25,23 +26,23 @@ class S extends w {
  validationLosses: []
  };
  this.dummyPass(), this.model.trainable = !0;
- const …
+ const u = Date.now();
  this.running = !0;
- const …
+ const c = o ? new g(this.model, o) : void 0, f = await r.iterator();
  try {
  for (; this.running && !(s.lastLoss < n); ) {
- const e = await …
+ const e = await f.next();
  if (e.done) break;
- const h = e.value, …
+ const h = e.value, v = this.trainBatch(s, h), a = {
  loss: s.lastLoss,
  step: s.step,
- time: Date.now() - …
+ time: Date.now() - u,
  batchSize: h.xs.shape[0]
  };
- if (this.model.log.push(a), s.step % …
- if (await …
+ if (this.model.log.push(a), s.step % d === 0) {
+ if (await v, c)
  try {
- const t = await …
+ const t = await c.evaluate(5);
  s.validationLosses.push(t), a.valLoss = t;
  } catch (t) {
  console.error("Validation error:", t);
@@ -56,7 +57,7 @@ class S extends w {
  await l(a);
  }
  }
- s.step >= …
+ s.step >= m && this.stop();
  }
  } catch (e) {
  throw console.error("Training error:", e), this.tf.dispose(), e;
@@ -65,5 +66,5 @@ class S extends w {
  }
  }
  export {
- …
+ D as default
  };
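trainOnDataset(dataset, options, validationDataset?) now wires the optional validation dataset into an Evaluator and, every logInterval steps, records the validation loss onto the log entry as valLoss. A sketch of driving it (option names taken from the destructuring above; constructor order tf, model, tokenizer, learningRate per Trainer.js):

  const trainer = new FullTrainer(tf, model, tokenizer);
  await trainer.trainOnDataset(trainDataset, {
    desiredLoss: 0.5,
    logInterval: 10,
    maxSteps: 2000,
    onStep: async (entry) => console.log(entry.step, entry.loss, entry.valLoss),
  }, validationDataset);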
package/dist/training/LayerTrainer.js
CHANGED
@@ -1,32 +1,33 @@
- import { generateText as …
- import …
- import { schedule as …
- …
+ import { generateText as S } from "../utilities/generate.js";
+ import u from "./Trainer.js";
+ import { schedule as v } from "./lwSchedule.js";
+ import w from "./Evaluator.js";
+ const T = {
  desiredLoss: 0.01,
  logInterval: 1,
  stepsPerLayer: 400,
  maxPasses: 3,
  maxSteps: 1e3
  };
- class …
+ class z extends u {
  trainingPattern = [];
  startPass = 0;
  startLayer = 0;
- constructor(r, …
- if (super(r, …
- const i = …
+ constructor(r, s, e, p = 3e-4) {
+ if (super(r, s, e, p), this.trainingPattern = v[s.config.nLayer - 1] || [], s.log.length > 0) {
+ const i = s.log[s.log.length - 1];
  i.pass !== void 0 && i.layer !== void 0 && (this.startPass = i.pass, this.startLayer = i.layer, console.log(`Resuming training from pass ${this.startPass}, layer ${this.startLayer}`));
  }
  }
  applyTrainingPattern(r) {
- const …
- this.model.setSkipMask(e.skip), this.model.setTrainableMask(e.trainable), this.resetOptimizer(e.adam), console.log("Applied training pattern:", …
+ const s = r < this.trainingPattern.length ? r : this.trainingPattern.length - 1, e = this.trainingPattern[s];
+ this.model.setSkipMask(e.skip), this.model.setTrainableMask(e.trainable), this.resetOptimizer(e.adam), console.log("Applied training pattern:", s, e);
  }
  // Train for multiple epochs using Dataset API - FIXED memory leaks
- async trainOnDataset(r, …
- const { desiredLoss: p, logInterval: i, stepsPerLayer: L, onLayerChange: …
- ...…
- ...…
+ async trainOnDataset(r, s, e) {
+ const { desiredLoss: p, logInterval: i, stepsPerLayer: L, onLayerChange: o, onPassComplete: h, onStep: c, prompt: g } = {
+ ...T,
+ ...s
  }, t = {
  pass: 0,
  layerStep: 0,
@@ -38,47 +39,44 @@ class b extends S {
  validationLosses: []
  };
  this.dummyPass();
- const …
+ const f = Date.now();
  this.startPass = 0, this.startLayer = 0;
- const …
+ const y = e ? new w(this.model, e) : void 0, d = await r.iterator();
  this.applyTrainingPattern(t.layerStep % this.trainingPattern.length);
  try {
  for (; !(t.lastLoss < p); ) {
- const n = await …
+ const n = await d.next();
  if (n.done) break;
- const …
+ const m = n.value, P = this.trainBatch(t, m);
  t.stepSinceLayerChange++;
- const …
+ const l = {
  loss: t.lastLoss,
  step: t.step,
- time: Date.now() - …
- batchSize: …
+ time: Date.now() - f,
+ batchSize: m.xs.shape[0],
  pass: t.pass,
  layer: t.layerStep % this.model.config.nLayer
  };
- if (this.model.log.push(…
- if (await P, …
+ if (this.model.log.push(l), t.step % i === 0) {
+ if (await P, y)
  try {
- const …
- t.validationLosses.push(…
- } catch (…
- console.error("Validation error:", …
+ const a = await y.evaluate(5);
+ t.validationLosses.push(a), l.valLoss = a;
+ } catch (a) {
+ console.error("Validation error:", a);
  }
  if (c) {
  if (g) {
- const …
+ const a = await S(this.tokenizer, this.model, g, 100, {
  temperature: 0.8,
  topK: 10
  });
- …
+ l.example = a;
  }
- await c(…
+ await c(l);
  }
  }
- …
- let s;
- e && (s = await this.evaluateOnDataset(e, 5), t.validationLosses.push(s), o.valLoss = s), t.layerStep++, t.layerStep % this.model.config.nLayer === 0 ? (l && await l(t.layerStep, t.pass, s), h && await h(t.pass), t.pass++) : l && await l(t.layerStep, t.pass, s), t.stepSinceLayerChange = 0, this.applyTrainingPattern(t.layerStep % this.trainingPattern.length);
- }
+ t.stepSinceLayerChange >= L && (t.layerStep++, t.layerStep % this.model.config.nLayer === 0 ? (o && await o(t.layerStep, t.pass), h && await h(t.pass), t.pass++) : o && await o(t.layerStep, t.pass), t.stepSinceLayerChange = 0, this.applyTrainingPattern(t.layerStep % this.trainingPattern.length));
  }
  } catch (n) {
  throw console.error("Training error:", n), this.tf.dispose(), n;
@@ -87,5 +85,5 @@ class b extends S {
  }
  }
  export {
- …
+ z as default
  };
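LayerTrainer advances layerStep every stepsPerLayer steps and re-applies the matching schedule entry; the per-interval evaluateOnDataset call inside the layer-change branch is gone, since validation now goes through the shared Evaluator path. From the property accesses above (e.skip, e.trainable, e.adam), each lwSchedule entry appears to look roughly like this; the field types are inferred, not confirmed by the source:

  interface TrainingPattern {
    skip: boolean[];      // per-layer: bypass the block entirely in the forward pass
    trainable: boolean[]; // per-layer: include the block's weights in gradient updates
    adam?: unknown;       // optimizer settings handed to resetOptimizer()
  }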
package/dist/training/Trainer.d.ts
CHANGED
@@ -56,7 +56,6 @@ export default abstract class GPTTrainer {
      losses: number[];
      validationLosses: number[];
  }>;
- evaluateOnDataset(dataset: TF.data.Dataset<TF.TensorContainer>, maxBatches?: number): Promise<number>;
  createTrainValidationSplit(textData: string[], batchSize?: number, validationSplit?: number): Promise<{
      trainDataset: TF.data.Dataset<{
          xs: TF.Tensor;
package/dist/training/Trainer.js
CHANGED
@@ -1,8 +1,8 @@
- import { DatasetBuilder as …
+ import { DatasetBuilder as d } from "./DatasetBuilder.js";
  import p from "./AdamExt.js";
- class …
- constructor(t, e, s, …
- this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = …
+ class g {
+ constructor(t, e, s, i = 1e-3) {
+ this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, s, e.config.blockSize);
  }
  model;
  optimizer;
@@ -43,11 +43,11 @@ class y {
  }
  trainStep(t, e = !1, s = !1) {
  return this.tf.tidy(() => {
- const { xs: …
- const { loss: l, logits: c } = this.model.forward(…
+ const { xs: i, ys: r } = t, o = () => {
+ const { loss: l, logits: c } = this.model.forward(i, r, !0);
  return c.dispose(), l;
- }, { value: n, grads: …
- return e || (s && (console.log("-------"), this.printGradients(…
+ }, { value: n, grads: a } = this.tf.variableGrads(o);
+ return e || (s && (console.log("-------"), this.printGradients(a), console.log("-------")), this.optimizer.applyGradients(a), this.tf.dispose(a)), n;
  });
  }
  dummyPass() {
@@ -64,31 +64,22 @@ class y {
  async trainBatch(t, e) {
  try {
  const s = this.trainStep(e, !1, !1);
- return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((…
+ return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
  } catch (s) {
  throw console.error(`Error processing batch at step ${t.step}:`, s), this.tf.dispose(), s;
  }
  }
- // Evaluate model on validation dataset - FIXED memory leaks
- async evaluateOnDataset(t, e = 100) {
- let s = 0, a = 0;
- return await t.take(e).forEachAsync(async (o) => {
- const { xs: r, ys: n } = o, { loss: i, logits: l } = this.model.forward(r, n, !1), d = i.arraySync();
- i.dispose(), l.dispose(), s += d, a++;
- }), s / a;
- }
- // Create training and validation datasets - FIXED memory leaks
  async createTrainValidationSplit(t, e = 32, s = 0.1) {
- const …
- return { trainDataset: n, validationDataset: …
+ const i = Math.floor(t.length * (1 - s)), r = t.slice(0, i), o = t.slice(i), n = await this.datasetBuilder.createTextDataset(r, e), a = await this.datasetBuilder.createTextDataset(o, e);
+ return { trainDataset: n, validationDataset: a };
  }
  async createDataset(t, e = 32) {
  return await this.datasetBuilder.createTextDataset(t, e);
  }
  dispose() {
- this.optimizer && this.optimizer.dispose()
+ this.optimizer && this.optimizer.dispose();
  }
  }
  export {
- …
+ g as default
  };
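With evaluation moved out, the base trainer keeps dataset construction. createTrainValidationSplit slices the text array at 1 - validationSplit and builds both datasets through DatasetBuilder (signature per Trainer.d.ts):

  const { trainDataset, validationDataset } =
    await trainer.createTrainValidationSplit(textData, 32, 0.1); // 90% train / 10% validation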
package/dist/utilities/generate.js
CHANGED
@@ -1,17 +1,19 @@
- async function w(n, …
+ async function w(n, t, r, s, g) {
  if (s <= 0)
  throw new Error("Length must be a positive integer");
- if (…
+ if (r.length === 0)
  throw new Error("Prompt cannot be an empty string");
- const …
- let …
- for (let …
- const { output: …
- …
+ const i = await n.tokenise([r], !0), a = t.tf.tidy(() => {
+ let e = t.tf.tensor2d(i, [1, i[0].length], "int32");
+ for (let d = 0; d < s; d++) {
+ const { output: p } = t.generate(e, g), f = e;
+ e = t.tf.concat([e, p], 1), f.dispose(), p.dispose();
  }
- return …
- })…
- …
+ return e;
+ }), u = await a.array();
+ a.dispose();
+ const o = u[0], c = o.indexOf(n.eosToken);
+ return c !== -1 && o.splice(c), await n.decode(o);
  }
  export {
  w as generateText