@genai-fi/nanogpt 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ export interface IGenerateOptions {
  topK?: number;
  usePadding?: boolean;
  includeAttention?: boolean;
+ includeProbabilities?: boolean;
  }
  export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
  private readonly model;
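
Both option interfaces in this release gain an includeProbabilities flag, which asks generation to also return the post-softmax distribution behind each sampled token. A minimal sketch of an options object; the values are illustrative, and temperature/maxLength are inferred from the reads in dist/Generator.js below rather than shown in this hunk:

    // Illustrative only: field names come from IGenerateOptions above and
    // the options read in dist/Generator.js below.
    const options: IGenerateOptions = {
      temperature: 0.8,
      topK: 10,
      maxLength: 500,
      includeProbabilities: true, // new in 0.1.5: also return per-token distributions
    };
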
package/dist/Generator.js CHANGED
@@ -1,53 +1,62 @@
  import { E as m } from "./index-SOhdqzHq.js";
- const g = 4;
- class w extends m {
- constructor(o, t) {
- super(), this.model = o, this.tokeniser = t;
+ const b = 4;
+ class x extends m {
+ constructor(a, t) {
+ super(), this.model = a, this.tokeniser = t;
  }
- generateBlockOfTokens(o, t) {
- const c = t?.temperature ?? 1, a = t?.topK, r = t?.usePadding ?? t?.includeAttention ?? !1, d = t?.includeAttention ?? !1;
- let s = o, n;
- for (let l = 0; l < g; l++) {
- const { output: e, attention: i } = this.model.generate(s, {
- temperature: c,
- topK: a,
- usePadding: r,
- includeAttention: d
- }), h = s;
- if (s = this.model.tf.concat([s, e], 1), n && i) {
- const u = n;
- n = this.model.tf.concat([n, i], 0), u.dispose();
- } else i && (n = i);
- h.dispose(), e.dispose();
+ generateBlockOfTokens(a, t) {
+ const g = t?.temperature ?? 1, c = t?.topK, d = t?.usePadding ?? t?.includeAttention ?? !1, k = t?.includeAttention ?? !1, h = t?.includeProbabilities ?? !1;
+ let i = a, n, s;
+ for (let e = 0; e < b; e++) {
+ const {
+ output: u,
+ attention: l,
+ probabilities: r
+ } = this.model.generate(i, {
+ temperature: g,
+ topK: c,
+ usePadding: d,
+ includeAttention: k,
+ includeProbabilities: h
+ }), p = i;
+ if (i = this.model.tf.concat([i, u], 1), n && l) {
+ const o = n;
+ n = this.model.tf.concat([n, l], 0), o.dispose();
+ } else l && (n = l);
+ if (s && r) {
+ const o = s;
+ s = this.model.tf.concat([s, r], 0), o.dispose();
+ } else r && (s = r);
+ p.dispose(), u.dispose();
  }
- return { output: s, attention: n };
+ return { output: i, attention: n, probabilities: s };
  }
- async generate(o, t) {
- const c = o ? await this.tokeniser.tokenise([o], !0) : [[this.tokeniser.eosToken]];
- let a = this.model.tf.tensor2d(c, [1, c[0].length], "int32");
+ async generate(a, t) {
+ const g = a ? await this.tokeniser.tokenise([a], !0) : [[this.tokeniser.eosToken]];
+ let c = this.model.tf.tensor2d(g, [1, g[0].length], "int32");
  this.emit("start");
- let r = o || "";
+ let d = a || "";
  for (; ; ) {
- const { output: d, attention: s } = this.generateBlockOfTokens(a, t), n = a;
- a = d;
- const l = d.slice([0, n.shape[1]], [1, g]), e = (await l.array())[0];
- let i = !1, h = !1;
- const u = e.indexOf(this.tokeniser.eosToken);
- u !== -1 && (i = !0, e.splice(u)), e.length + r.length >= (t?.maxLength ?? 1e3) && (h = !0, e.splice(
- t?.maxLength ? t.maxLength - r.length : e.length
+ const { output: k, attention: h, probabilities: i } = this.generateBlockOfTokens(c, t), n = c;
+ c = k;
+ const s = k.slice([0, n.shape[1]], [1, b]), e = (await s.array())[0];
+ n.dispose(), s.dispose();
+ let u = !1, l = !1;
+ const r = e.indexOf(this.tokeniser.eosToken);
+ r !== -1 && (u = !0, e.splice(r)), e.length + d.length >= (t?.maxLength ?? 1e3) && (l = !0, e.splice(
+ t?.maxLength ? t.maxLength - d.length : e.length
  ));
- const k = await this.tokeniser.decode(e);
- if (r += k, s) {
- let f = await s.array();
- f.length > e.length && (f = f.slice(0, e.length)), this.emit("tokens", e, k, f);
- } else
- this.emit("tokens", e, k);
- if (n.dispose(), l.dispose(), i || h)
+ const p = await this.tokeniser.decode(e);
+ d += p;
+ let o;
+ h && (o = await h.array(), h.dispose(), o.length > e.length && (o = o.slice(0, e.length)));
+ let f;
+ if (i && (f = await i.array(), i.dispose(), f.length > e.length && (f = f.slice(0, e.length))), this.emit("tokens", e, p, o, f), u || l)
  break;
  }
- return a.dispose(), this.emit("stop"), r;
+ return c.dispose(), this.emit("stop"), d;
  }
  }
  export {
- w as default
+ x as default
  };
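
De-minified, the rewritten generator threads a probabilities tensor through generateBlockOfTokens and, alongside the token ids, decoded text, and optional attention rows, now passes a probabilities array on the 'tokens' event. A hedged listener sketch; the Node-style on() is an assumption about the EE base class, which this diff does not show, and the payload types are read off the emit("tokens", ...) call above:

    // Hedged sketch: assumes the EE base exposes a Node-style on().
    generator.on('tokens', (tokens: number[], text: string,
        attention?: number[][], probabilities?: number[][]) => {
      // attention/probabilities are defined only when the matching
      // include* option was passed to generate()
      console.log(text, probabilities?.[0]); // distribution for the first new token
    });
    await generator.generate('Once upon a time', {
      maxLength: 200,
      includeProbabilities: true,
    });
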
@@ -13,6 +13,7 @@ export interface GenerateOptions {
  topK?: number;
  usePadding?: boolean;
  includeAttention?: boolean;
+ includeProbabilities?: boolean;
  }
  export default class NanoGPT {
  readonly config: GPTConfig;
@@ -33,6 +34,7 @@ export default class NanoGPT {
  set trainable(value: boolean);
  private validateInput;
  private calculateLoss;
+ private computeAttentionRollout;
  forward(idx: TF.Tensor, targets?: TF.Tensor, training?: boolean, includeAttention?: boolean): {
  logits: TF.Tensor;
  loss?: TF.Tensor;
@@ -41,6 +43,7 @@ export default class NanoGPT {
  generate(idx: TF.Tensor, options?: GenerateOptions): {
  output: TF.Tensor;
  attention?: TF.Tensor;
+ probabilities?: TF.Tensor;
  };
  getNumParams(): number;
  }
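
At the model level, generate() now optionally returns a probabilities tensor alongside output and attention. A sketch of calling it directly; the idx tensor setup is assumed, and the caller owns (and must dispose) every returned tensor:

    // Sketch: assumes a constructed NanoGPT instance and an int32 idx
    // tensor of shape [1, T].
    const { output, attention, probabilities } = model.generate(idx, {
      temperature: 1,
      includeProbabilities: true,
    });
    const dist = probabilities ? await probabilities.array() : undefined;
    output.dispose();
    attention?.dispose();
    probabilities?.dispose();
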
@@ -1,7 +1,7 @@
- import { defaultConfig as y } from "./config.js";
- import z from "./layers/TransformerBlock.js";
- import v from "./layers/TiedEmbedding.js";
- import S from "./layers/LayerNorm.js";
+ import { defaultConfig as z } from "./config.js";
+ import v from "./layers/TransformerBlock.js";
+ import S from "./layers/TiedEmbedding.js";
+ import _ from "./layers/LayerNorm.js";
  class $ {
  config;
  wte;
@@ -17,7 +17,7 @@ class $ {
  log = [];
  // Training log
  constructor(t, e = {}) {
- this.tf = t, this.config = { ...y, ...e }, this.wte = new v(t, {
+ this.tf = t, this.config = { ...z, ...e }, this.wte = new S(t, {
  vocabSize: this.config.vocabSize,
  embedDim: this.config.nEmbed,
  name: "token_embedding"
@@ -28,8 +28,8 @@ class $ {
  embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
  }), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
  for (let s = 0; s < this.config.nLayer; s++)
- this.blocks.push(new z(this.tf, s, this.config));
- this.lnF = new S(t, [this.config.nEmbed], 1e-5, "final_layer_norm");
+ this.blocks.push(new v(this.tf, s, this.config));
+ this.lnF = new _(t, [this.config.nEmbed], 1e-5, "final_layer_norm");
  }
  get variables() {
  return [
@@ -54,8 +54,8 @@ class $ {
  }
  inputPhase(t, e = !1) {
  return this.tf.tidy(() => {
- const [, s] = t.shape, n = this.wte.embed(t), i = this.tf.range(0, s, 1, "int32"), o = this.wpe.apply(i), h = n.add(o);
- return this.drop.apply(h, { training: e });
+ const [, s] = t.shape, i = this.wte.embed(t), n = this.tf.range(0, s, 1, "int32"), a = this.wpe.apply(n), o = i.add(a);
+ return this.drop.apply(o, { training: e });
  });
  }
  setSkipMask(t) {
@@ -90,44 +90,61 @@ class $ {
  throw console.error("Error computing loss:", s), new Error(`Loss computation failed: ${s}`);
  }
  }
- forward(t, e, s = !1, n = !1) {
+ // Attention rollout per Abnar & Zuidema (2020)
+ // Expects list of (B, T, T) attention matrices already averaged over heads.
+ computeAttentionRollout(t) {
+ return this.tf.tidy(() => {
+ if (t.length === 0)
+ throw new Error("No attentions for rollout");
+ const e = t[0].shape[0], s = t[0].shape[1], i = this.tf.eye(s, s).expandDims(0);
+ let n = i.tile([e, 1, 1]);
+ for (const a of t) {
+ let o = a.add(i);
+ o = o.div(o.sum(-1, !0)), n = o.matMul(n);
+ }
+ return n;
+ });
+ }
+ forward(t, e, s = !1, i = !1) {
  return this.validateInput(t), this.tf.tidy(() => {
- let i = this.inputPhase(t, s), o;
- n && (o = this.tf.zeros([i.shape[0], i.shape[1], i.shape[1]]));
- for (const l of this.blocks) {
- const { output: r, attention: f } = l.call(i, s, n);
- i = r, f && o && (o = o.add(f));
+ let n = this.inputPhase(t, s);
+ const a = [];
+ for (const c of this.blocks) {
+ const { output: p, attention: l } = c.call(n, s, i);
+ n = p, i && l && a.push(l);
  }
- o && (o = o.div(this.blocks.length)), i = this.lnF.apply(i);
- const h = this.wte.project(i);
- let a;
- return e && (a = this.calculateLoss(h, e)), { logits: h, loss: a, attention: n ? o : void 0 };
+ let o;
+ i && a.length > 0 && (o = this.computeAttentionRollout(a)), n = this.lnF.apply(n);
+ const h = this.wte.project(n);
+ let r;
+ return e && (r = this.calculateLoss(h, e)), { logits: h, loss: r, attention: i ? o : void 0 };
  });
  }
  generate(t, e) {
- const s = e?.temperature ?? 1, n = e?.topK, i = e?.usePadding ?? !1, o = e?.includeAttention ?? !1;
+ const s = e?.temperature ?? 1, i = e?.topK, n = e?.usePadding ?? !1, a = e?.includeAttention ?? !1;
  return this.tf.tidy(() => {
- const h = t, a = h.shape[1], l = a <= this.config.blockSize ? h : h.slice(
- [0, a - this.config.blockSize],
- [h.shape[0], this.config.blockSize]
- ), r = i ? this.config.blockSize - l.shape[1] : 0, f = r > 0 ? this.tf.pad(l, [
+ const o = t, h = o.shape[1], r = h <= this.config.blockSize ? o : o.slice(
+ [0, h - this.config.blockSize],
+ [o.shape[0], this.config.blockSize]
+ ), c = n ? this.config.blockSize - r.shape[1] : 0, p = c > 0 ? this.tf.pad(r, [
  [0, 0],
- [0, r]
- ]) : l, { logits: g, attention: p } = this.forward(f, void 0, !1, o), d = g.shape[1] - 1 - r, m = g.slice([0, d, 0], [g.shape[0], 1, g.shape[2]]), u = p ? p.slice([0, d, 0], [p.shape[0], 1, p.shape[2]]) : void 0, b = m.div(s);
- let c;
- if (n) {
- const { values: k, indices: w } = this.tf.topk(b, n), E = this.tf.multinomial(k.squeeze([1]), 1);
- c = this.tf.gather(w.squeeze([1]), E, 1);
+ [0, c]
+ ]) : r, { logits: l, attention: g } = this.forward(p, void 0, !1, a), b = l.shape[1] - 1 - c, u = l.slice([0, b, 0], [l.shape[0], 1, l.shape[2]]), k = g ? g.slice([0, b, 0], [g.shape[0], 1, g.shape[2]]) : void 0, d = u.div(s);
+ let f;
+ if (i) {
+ const { values: w, indices: E } = this.tf.topk(d, i), y = this.tf.multinomial(w.squeeze([1]), 1);
+ f = this.tf.gather(E.squeeze([1]), y, 1);
  } else
- c = this.tf.multinomial(b.squeeze([1]), 1);
- return c = c.reshape([1, 1]), { output: c, attention: u?.squeeze([1]) };
+ f = this.tf.multinomial(d.squeeze([1]), 1);
+ let m;
+ return e?.includeProbabilities && (m = this.tf.softmax(d.squeeze([1]))), f = f.reshape([1, 1]), { output: f, attention: k?.squeeze([1]), probabilities: m };
  });
  }
  getNumParams() {
  const t = this.config.vocabSize * this.config.nEmbed + this.config.blockSize * this.config.nEmbed, e = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // qkv + proj
  2 * this.config.nEmbed), s = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
- this.config.nEmbed * 4 * this.config.nEmbed), n = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
- return t + e + s + n;
+ this.config.nEmbed * 4 * this.config.nEmbed), i = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
+ return t + e + s + i;
  }
  }
  export {
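
The substantive change in this file is attention handling: forward() no longer averages per-block attention maps, it collects them and combines them with the new computeAttentionRollout, following Abnar & Zuidema, "Quantifying Attention Flow in Transformers" (2020). A readable reconstruction of the minified method above, for reference:

    import * as TF from '@tensorflow/tfjs';

    // Readable reconstruction of the minified computeAttentionRollout;
    // `attentions` holds one head-averaged (batch, T, T) matrix per layer.
    function attentionRollout(tf: typeof TF, attentions: TF.Tensor[]): TF.Tensor {
      return tf.tidy(() => {
        if (attentions.length === 0) throw new Error('No attentions for rollout');
        const batch = attentions[0].shape[0];
        const seqLen = attentions[0].shape[1];
        const identity = tf.eye(seqLen, seqLen).expandDims(0); // models the residual path
        let rollout = identity.tile([batch, 1, 1]);
        for (const attn of attentions) {
          let a = attn.add(identity);  // rows of A and of I each sum to 1...
          a = a.div(a.sum(-1, true));  // ...so renormalising gives 0.5 * (A + I)
          rollout = a.matMul(rollout); // compose attention across layers
        }
        return rollout;
      });
    }
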
package/dist/config.d.ts CHANGED
@@ -7,5 +7,6 @@ export interface GPTConfig {
  dropout: number;
  biasInLinear: boolean;
  biasInLayerNorm: boolean;
+ mlpFactor: number;
  }
  export declare const defaultConfig: GPTConfig;
package/dist/config.js CHANGED
@@ -1,4 +1,4 @@
- const e = {
+ const a = {
  vocabSize: 50304,
  // GPT-2 vocab size
  blockSize: 1024,
@@ -12,8 +12,9 @@ const e = {
  dropout: 0,
  // Dropout probability
  biasInLinear: !1,
- biasInLayerNorm: !1
+ biasInLayerNorm: !1,
+ mlpFactor: 4
  };
  export {
- e as defaultConfig
+ a as defaultConfig
  };
@@ -1,4 +1,4 @@
1
- class n {
1
+ class l {
2
2
  cFc;
3
3
  cProj;
4
4
  dropout;
@@ -7,7 +7,7 @@ class n {
7
7
  _trainable = !0;
8
8
  constructor(t, i, e) {
9
9
  this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
10
- units: 4 * e.nEmbed,
10
+ units: e.mlpFactor * e.nEmbed,
11
11
  activation: "gelu",
12
12
  useBias: e.biasInLinear,
13
13
  kernelInitializer: this.tf.initializers.randomNormal({
@@ -53,5 +53,5 @@ class n {
53
53
  }
54
54
  }
55
55
  export {
56
- n as default
56
+ l as default
57
57
  };
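
Together with the config change above, the MLP's hidden width is now mlpFactor * nEmbed instead of the hard-coded 4 * nEmbed; the default of 4 preserves the old behaviour. A hedged override sketch, assuming the constructor merges a partial config over defaultConfig as the minified NanoGPTModel.js above shows ({ ...defaults, ...overrides }):

    // `tf` is the injected tfjs module; field names are from config.d.ts above.
    const model = new NanoGPT(tf, {
      nEmbed: 256,
      mlpFactor: 2, // MLP hidden width becomes 2 * 256 instead of the old fixed 4 * 256
    });
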
@@ -0,0 +1,8 @@
+ import { default as NanoGPT } from '../NanoGPTModel';
+ import { default as TF } from '@tensorflow/tfjs';
+ export default class Evaluator {
+ private model;
+ private iterator;
+ constructor(model: NanoGPT, dataset: TF.data.Dataset<TF.TensorContainer>);
+ evaluate(maxBatches?: number): Promise<number>;
+ }
@@ -0,0 +1,22 @@
+ class p {
+ constructor(s, t) {
+ this.model = s, this.iterator = t.iterator();
+ }
+ iterator;
+ async evaluate(s = 100) {
+ let t = 0, o = 0;
+ const c = await this.iterator;
+ for (let a = 0; a < s; a++) {
+ const e = await c.next();
+ if (e.done) break;
+ const n = e.value, { xs: r, ys: l } = n, { loss: i, logits: u } = this.model.forward(r, l, !1, !1);
+ u.dispose(), r.dispose(), l.dispose();
+ const d = i.arraySync();
+ i.dispose(), t += d, o++;
+ }
+ return t / o;
+ }
+ }
+ export {
+ p as default
+ };
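
This new Evaluator replaces GPTTrainer.evaluateOnDataset (deleted later in this diff): it holds a single dataset iterator for its lifetime rather than re-walking the dataset with take().forEachAsync(), and disposes every batch and forward-pass tensor it touches. A usage sketch per the declaration file above; validationDataset is assumed to yield { xs, ys } batches as built elsewhere in the package:

    const evaluator = new Evaluator(model, validationDataset);
    const valLoss = await evaluator.evaluate(5); // mean loss over at most 5 batches
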
@@ -1,18 +1,19 @@
  import { generateText as L } from "../utilities/generate.js";
  import w from "./Trainer.js";
- const g = {
+ import g from "./Evaluator.js";
+ const x = {
  desiredLoss: 0.01,
  logInterval: 1,
  maxSteps: 1e3
  };
- class S extends w {
+ class D extends w {
  constructor(r, i, o, n = 3e-4) {
  super(r, i, o, n);
  }
  // Train for multiple epochs using Dataset API - FIXED memory leaks
  async trainOnDataset(r, i, o) {
- const { desiredLoss: n, logInterval: c, onStep: l, prompt: p, maxSteps: d } = {
- ...g,
+ const { desiredLoss: n, logInterval: d, onStep: l, prompt: p, maxSteps: m } = {
+ ...x,
  ...i
  }, s = {
  pass: 0,
@@ -25,23 +26,23 @@ class S extends w {
  validationLosses: []
  };
  this.dummyPass(), this.model.trainable = !0;
- const m = Date.now();
+ const u = Date.now();
  this.running = !0;
- const u = await r.iterator();
+ const c = o ? new g(this.model, o) : void 0, f = await r.iterator();
  try {
  for (; this.running && !(s.lastLoss < n); ) {
- const e = await u.next();
+ const e = await f.next();
  if (e.done) break;
- const h = e.value, f = this.trainBatch(s, h), a = {
+ const h = e.value, v = this.trainBatch(s, h), a = {
  loss: s.lastLoss,
  step: s.step,
- time: Date.now() - m,
+ time: Date.now() - u,
  batchSize: h.xs.shape[0]
  };
- if (this.model.log.push(a), s.step % c === 0) {
- if (await f, o)
+ if (this.model.log.push(a), s.step % d === 0) {
+ if (await v, c)
  try {
- const t = await this.evaluateOnDataset(o, 5);
+ const t = await c.evaluate(5);
  s.validationLosses.push(t), a.valLoss = t;
  } catch (t) {
  console.error("Validation error:", t);
@@ -56,7 +57,7 @@ class S extends w {
  await l(a);
  }
  }
- s.step >= d && this.stop();
+ s.step >= m && this.stop();
  }
  } catch (e) {
  throw console.error("Training error:", e), this.tf.dispose(), e;
@@ -65,5 +66,5 @@ class S extends w {
  }
  }
  export {
- S as default
+ D as default
  };
@@ -1,32 +1,33 @@
- import { generateText as d } from "../utilities/generate.js";
- import S from "./Trainer.js";
- import { schedule as u } from "./lwSchedule.js";
- const w = {
+ import { generateText as S } from "../utilities/generate.js";
+ import u from "./Trainer.js";
+ import { schedule as v } from "./lwSchedule.js";
+ import w from "./Evaluator.js";
+ const T = {
  desiredLoss: 0.01,
  logInterval: 1,
  stepsPerLayer: 400,
  maxPasses: 3,
  maxSteps: 1e3
  };
- class b extends S {
+ class z extends u {
  trainingPattern = [];
  startPass = 0;
  startLayer = 0;
- constructor(r, a, e, p = 3e-4) {
- if (super(r, a, e, p), this.trainingPattern = u[a.config.nLayer - 1] || [], a.log.length > 0) {
- const i = a.log[a.log.length - 1];
+ constructor(r, s, e, p = 3e-4) {
+ if (super(r, s, e, p), this.trainingPattern = v[s.config.nLayer - 1] || [], s.log.length > 0) {
+ const i = s.log[s.log.length - 1];
  i.pass !== void 0 && i.layer !== void 0 && (this.startPass = i.pass, this.startLayer = i.layer, console.log(`Resuming training from pass ${this.startPass}, layer ${this.startLayer}`));
  }
  }
  applyTrainingPattern(r) {
- const a = r < this.trainingPattern.length ? r : this.trainingPattern.length - 1, e = this.trainingPattern[a];
- this.model.setSkipMask(e.skip), this.model.setTrainableMask(e.trainable), this.resetOptimizer(e.adam), console.log("Applied training pattern:", a, e);
+ const s = r < this.trainingPattern.length ? r : this.trainingPattern.length - 1, e = this.trainingPattern[s];
+ this.model.setSkipMask(e.skip), this.model.setTrainableMask(e.trainable), this.resetOptimizer(e.adam), console.log("Applied training pattern:", s, e);
  }
  // Train for multiple epochs using Dataset API - FIXED memory leaks
- async trainOnDataset(r, a, e) {
- const { desiredLoss: p, logInterval: i, stepsPerLayer: L, onLayerChange: l, onPassComplete: h, onStep: c, prompt: g } = {
- ...w,
- ...a
+ async trainOnDataset(r, s, e) {
+ const { desiredLoss: p, logInterval: i, stepsPerLayer: L, onLayerChange: o, onPassComplete: h, onStep: c, prompt: g } = {
+ ...T,
+ ...s
  }, t = {
  pass: 0,
  layerStep: 0,
@@ -38,47 +39,44 @@ class b extends S {
  validationLosses: []
  };
  this.dummyPass();
- const m = Date.now();
+ const f = Date.now();
  this.startPass = 0, this.startLayer = 0;
- const f = await r.iterator();
+ const y = e ? new w(this.model, e) : void 0, d = await r.iterator();
  this.applyTrainingPattern(t.layerStep % this.trainingPattern.length);
  try {
  for (; !(t.lastLoss < p); ) {
- const n = await f.next();
+ const n = await d.next();
  if (n.done) break;
- const y = n.value, P = this.trainBatch(t, y);
+ const m = n.value, P = this.trainBatch(t, m);
  t.stepSinceLayerChange++;
- const o = {
+ const l = {
  loss: t.lastLoss,
  step: t.step,
- time: Date.now() - m,
- batchSize: y.xs.shape[0],
+ time: Date.now() - f,
+ batchSize: m.xs.shape[0],
  pass: t.pass,
  layer: t.layerStep % this.model.config.nLayer
  };
- if (this.model.log.push(o), t.step % i === 0) {
- if (await P, e)
+ if (this.model.log.push(l), t.step % i === 0) {
+ if (await P, y)
  try {
- const s = await this.evaluateOnDataset(e, 5);
- t.validationLosses.push(s), o.valLoss = s;
- } catch (s) {
- console.error("Validation error:", s);
+ const a = await y.evaluate(5);
+ t.validationLosses.push(a), l.valLoss = a;
+ } catch (a) {
+ console.error("Validation error:", a);
  }
  if (c) {
  if (g) {
- const s = await d(this.tokenizer, this.model, g, 100, {
+ const a = await S(this.tokenizer, this.model, g, 100, {
  temperature: 0.8,
  topK: 10
  });
- o.example = s;
+ l.example = a;
  }
- await c(o);
+ await c(l);
  }
  }
- if (t.stepSinceLayerChange >= L) {
- let s;
- e && (s = await this.evaluateOnDataset(e, 5), t.validationLosses.push(s), o.valLoss = s), t.layerStep++, t.layerStep % this.model.config.nLayer === 0 ? (l && await l(t.layerStep, t.pass, s), h && await h(t.pass), t.pass++) : l && await l(t.layerStep, t.pass, s), t.stepSinceLayerChange = 0, this.applyTrainingPattern(t.layerStep % this.trainingPattern.length);
- }
+ t.stepSinceLayerChange >= L && (t.layerStep++, t.layerStep % this.model.config.nLayer === 0 ? (o && await o(t.layerStep, t.pass), h && await h(t.pass), t.pass++) : o && await o(t.layerStep, t.pass), t.stepSinceLayerChange = 0, this.applyTrainingPattern(t.layerStep % this.trainingPattern.length));
  }
  } catch (n) {
  throw console.error("Training error:", n), this.tf.dispose(), n;
@@ -87,5 +85,5 @@ class b extends S {
  }
  }
  export {
- b as default
+ z as default
  };
@@ -56,7 +56,6 @@ export default abstract class GPTTrainer {
  losses: number[];
  validationLosses: number[];
  }>;
- evaluateOnDataset(dataset: TF.data.Dataset<TF.TensorContainer>, maxBatches?: number): Promise<number>;
  createTrainValidationSplit(textData: string[], batchSize?: number, validationSplit?: number): Promise<{
  trainDataset: TF.data.Dataset<{
  xs: TF.Tensor;
@@ -1,8 +1,8 @@
- import { DatasetBuilder as h } from "./DatasetBuilder.js";
+ import { DatasetBuilder as d } from "./DatasetBuilder.js";
  import p from "./AdamExt.js";
- class y {
- constructor(t, e, s, a = 1e-3) {
- this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = a, this.resetOptimizer(), this.datasetBuilder = new h(this.tf, s, e.config.blockSize);
+ class g {
+ constructor(t, e, s, i = 1e-3) {
+ this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, s, e.config.blockSize);
  }
  model;
  optimizer;
@@ -43,11 +43,11 @@ class y {
  }
  trainStep(t, e = !1, s = !1) {
  return this.tf.tidy(() => {
- const { xs: a, ys: o } = t, r = () => {
- const { loss: l, logits: c } = this.model.forward(a, o, !0);
+ const { xs: i, ys: r } = t, o = () => {
+ const { loss: l, logits: c } = this.model.forward(i, r, !0);
  return c.dispose(), l;
- }, { value: n, grads: i } = this.tf.variableGrads(r);
- return e || (s && (console.log("-------"), this.printGradients(i), console.log("-------")), this.optimizer.applyGradients(i), this.tf.dispose(i)), n;
+ }, { value: n, grads: a } = this.tf.variableGrads(o);
+ return e || (s && (console.log("-------"), this.printGradients(a), console.log("-------")), this.optimizer.applyGradients(a), this.tf.dispose(a)), n;
  });
  }
  dummyPass() {
@@ -64,31 +64,22 @@ class y {
  async trainBatch(t, e) {
  try {
  const s = this.trainStep(e, !1, !1);
- return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((a) => (t.lastLoss = a, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
+ return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
  } catch (s) {
  throw console.error(`Error processing batch at step ${t.step}:`, s), this.tf.dispose(), s;
  }
  }
- // Evaluate model on validation dataset - FIXED memory leaks
- async evaluateOnDataset(t, e = 100) {
- let s = 0, a = 0;
- return await t.take(e).forEachAsync(async (o) => {
- const { xs: r, ys: n } = o, { loss: i, logits: l } = this.model.forward(r, n, !1), d = i.arraySync();
- i.dispose(), l.dispose(), s += d, a++;
- }), s / a;
- }
- // Create training and validation datasets - FIXED memory leaks
  async createTrainValidationSplit(t, e = 32, s = 0.1) {
- const a = Math.floor(t.length * (1 - s)), o = t.slice(0, a), r = t.slice(a), n = await this.datasetBuilder.createTextDataset(o, e), i = await this.datasetBuilder.createTextDataset(r, e);
- return { trainDataset: n, validationDataset: i };
+ const i = Math.floor(t.length * (1 - s)), r = t.slice(0, i), o = t.slice(i), n = await this.datasetBuilder.createTextDataset(r, e), a = await this.datasetBuilder.createTextDataset(o, e);
+ return { trainDataset: n, validationDataset: a };
  }
  async createDataset(t, e = 32) {
  return await this.datasetBuilder.createTextDataset(t, e);
  }
  dispose() {
- this.optimizer && this.optimizer.dispose(), this.tf.dispose();
+ this.optimizer && this.optimizer.dispose();
  }
  }
  export {
- y as default
+ g as default
  };
@@ -1,17 +1,19 @@
- async function w(n, e, o, s, p) {
+ async function w(n, t, r, s, g) {
  if (s <= 0)
  throw new Error("Length must be a positive integer");
- if (o.length === 0)
+ if (r.length === 0)
  throw new Error("Prompt cannot be an empty string");
- const a = await n.tokenise([o], !0), r = (await e.tf.tidy(() => {
- let t = e.tf.tensor2d(a, [1, a[0].length], "int32");
- for (let c = 0; c < s; c++) {
- const { output: d } = e.generate(t, p), u = t;
- t = e.tf.concat([t, d], 1), u.dispose(), d.dispose();
+ const i = await n.tokenise([r], !0), a = t.tf.tidy(() => {
+ let e = t.tf.tensor2d(i, [1, i[0].length], "int32");
+ for (let d = 0; d < s; d++) {
+ const { output: p } = t.generate(e, g), f = e;
+ e = t.tf.concat([e, p], 1), f.dispose(), p.dispose();
  }
- return t;
- }).array())[0], i = r.indexOf(n.eosToken);
- return i !== -1 && r.splice(i), await n.decode(r);
+ return e;
+ }), u = await a.array();
+ a.dispose();
+ const o = u[0], c = o.indexOf(n.eosToken);
+ return c !== -1 && o.splice(c), await n.decode(o);
  }
  export {
  w as generateText
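
This change fixes a leak in generateText: in 0.1.3 the token tensor returned from tf.tidy was read with .array() but never disposed; 0.1.5 keeps a reference, reads it outside the tidy, and frees it explicitly. The pattern, as a minimal sketch with a hypothetical buildTokens() helper (tf being the injected tfjs module):

    const out = tf.tidy(() => buildTokens()); // tidy frees intermediates, keeps the return value
    const values = await out.array();          // read it back outside the tidy
    out.dispose();                             // then free it explicitly (0.1.3 leaked it)
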
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@genai-fi/nanogpt",
- "version": "0.1.3",
+ "version": "0.1.5",
  "type": "module",
  "main": "dist/main.js",
  "types": "dist/main.d.ts",