@genai-fi/nanogpt 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/Generator.d.ts CHANGED
@@ -3,6 +3,7 @@ import { ITokeniser } from './tokeniser/type';
  import { default as EE } from 'eventemitter3';
  export interface IGenerateOptions extends GenerateOptions {
  maxLength?: number;
+ noCache?: boolean;
  }
  export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
  private readonly model;
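Note: the only typings change in this release is the new optional noCache flag on IGenerateOptions. A minimal sketch of how a caller might set it, assuming the inherited GenerateOptions fields are likewise optional (the ?. accesses in the minified code suggest they are); the option names maxLength and noCache come straight from the hunk above:

    import type { IGenerateOptions } from '@genai-fi/nanogpt';

    // noCache is new in 0.2.1: it opts out of the KV-cache generation path.
    const options: IGenerateOptions = {
      maxLength: 64, // existing option
      noCache: true  // skip generateCache even when RoPE is enabled
    };

The flag takes effect in Generator.generate, shown in the next file.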
package/dist/Generator.js CHANGED
@@ -1,5 +1,5 @@
  import { E as u } from "./index-SOhdqzHq.js";
- class k extends u {
+ class p extends u {
  constructor(s, e) {
  super(), this.model = s, this.tokeniser = e;
  }
@@ -14,10 +14,10 @@ class k extends u {
  const {
  output: o,
  attention: c,
- probabilities: l
- } = this.model.generate(t, void 0, e), h = t;
- t = this.model.tf.concat([t, o], 1), h.dispose();
- const r = await this.processResponse(o, c, l);
+ probabilities: h
+ } = this.model.generate(t, void 0, e), l = t;
+ t = this.model.tf.concat([t, o], 1), l.dispose();
+ const r = await this.processResponse(o, c, h);
  if (o.dispose(), r === null)
  break;
  n += r;
@@ -40,14 +40,14 @@ class k extends u {
  for (let o = 0; o < i; o++) {
  const {
  output: c,
- attention: l,
- probabilities: h
+ attention: h,
+ probabilities: l
  } = this.model.generate(t, a, {
  ...e,
  usePadding: !1
  });
  t.dispose(), t = c;
- const r = await this.processResponse(c, l, h);
+ const r = await this.processResponse(c, h, l);
  if (r === null)
  break;
  n += r;
@@ -56,10 +56,10 @@ class k extends u {
  }
  async generate(s, e) {
  this.emit("start");
- const t = this.model.config.useRope ? this.generateCache(s, e) : this.generateNoCache(s, e);
+ const t = this.model.config.useRope && !e?.noCache ? this.generateCache(s, e) : this.generateNoCache(s, e);
  return this.emit("stop"), t;
  }
  }
  export {
- k as default
+ p as default
  };
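Aside from renamed minified identifiers, the behavioural change in Generator.js is the routing guard in generate: 0.2.0 chose the cached path whenever config.useRope was true; 0.2.1 additionally requires that the caller has not set noCache. A tiny sketch of that predicate, with descriptive names inferred from the minified source:

    import type { IGenerateOptions } from '@genai-fi/nanogpt';

    // Mirrors the new branch in Generator.generate: the KV-cache path is
    // taken only when RoPE is enabled AND the caller did not opt out.
    function chooseGenerationPath(useRope: boolean, options?: IGenerateOptions): 'cache' | 'noCache' {
      return useRope && !options?.noCache ? 'cache' : 'noCache';
    }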
package/dist/NanoGPTModel.js CHANGED
@@ -1,9 +1,9 @@
- import { defaultConfig as v } from "./config.js";
- import S from "./layers/TransformerBlock.js";
- import _ from "./layers/TiedEmbedding.js";
- import L from "./layers/RoPECache.js";
- import I from "./layers/RMSNorm.js";
- class F {
+ import { defaultConfig as z } from "./config.js";
+ import $ from "./layers/TransformerBlock.js";
+ import S from "./layers/TiedEmbedding.js";
+ import I from "./layers/RoPECache.js";
+ import _ from "./layers/RMSNorm.js";
+ class M {
  config;
  wte;
  // Token embeddings
@@ -19,7 +19,7 @@ class F {
  log = [];
  // Training log
  constructor(t, e = {}) {
- this.tf = t, this.config = { ...v, ...e }, this.wte = new _(t, {
+ this.tf = t, this.config = { ...z, ...e }, this.wte = new S(t, {
  vocabSize: this.config.vocabSize,
  embedDim: this.config.nEmbed,
  name: "token_embedding"
@@ -28,10 +28,10 @@ class F {
  outputDim: this.config.nEmbed,
  name: "positional_embedding",
  embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
- }) : this.ropeCache = new L(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
- for (let s = 0; s < this.config.nLayer; s++)
- this.blocks.push(new S(this.tf, s, this.config, this.ropeCache));
- this.lnF = new I(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
+ }) : this.ropeCache = new I(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
+ for (let o = 0; o < this.config.nLayer; o++)
+ this.blocks.push(new $(this.tf, o, this.config, this.ropeCache));
+ this.lnF = new _(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
  }
  get variables() {
  return [
@@ -54,17 +54,17 @@ class F {
  this.blocks[e].loadWeights(t);
  this.lnF.setWeights(t.get("final_rms_norm") || []);
  }
- inputPhase(t, e, s = !1) {
+ inputPhase(t, e, o = !1) {
  return this.tf.tidy(() => {
- const o = this.wte.embed(t);
+ const i = this.wte.embed(t);
  if (this.config.useRope === !1) {
- const [, i] = t.shape, a = this.config.blockSize, n = this.tf.range(0, i, 1, "int32"), h = this.tf.mod(
- this.tf.add(n, this.tf.scalar(e, "int32")),
- this.tf.scalar(a, "int32")
- ), c = this.wpe.apply(h), r = o.add(c);
- return this.drop.apply(r, { training: s });
+ const [, s] = t.shape, r = this.config.blockSize, l = this.tf.range(0, s, 1, "int32"), n = this.tf.mod(
+ this.tf.add(l, this.tf.scalar(e, "int32")),
+ this.tf.scalar(r, "int32")
+ ), h = this.wpe.apply(n), c = i.add(h);
+ return this.drop.apply(c, { training: o });
  } else
- return this.drop.apply(o, { training: s });
+ return this.drop.apply(i, { training: o });
  });
  }
  setSkipMask(t) {
@@ -95,8 +95,8 @@ class F {
  calculateLoss(t, e) {
  try {
  return this.tf.losses.softmaxCrossEntropy(e, t, this.tf.Reduction.MEAN);
- } catch (s) {
- throw console.error("Error computing loss:", s), new Error(`Loss computation failed: ${s}`);
+ } catch (o) {
+ throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
  }
  }
  // Attention rollout per Abnar & Zuidema (2020)
@@ -105,67 +105,88 @@ class F {
  return this.tf.tidy(() => {
  if (t.length === 0)
  throw new Error("No attentions for rollout");
- const e = t[0].shape[0], s = t[0].shape[1], o = this.tf.eye(s, s).expandDims(0);
- let i = o.tile([e, 1, 1]);
- for (const a of t) {
- let n = a.add(o);
- n = n.div(n.sum(-1, !0)), i = n.matMul(i);
+ const [e, o, i] = t[0].shape;
+ for (const s of t) {
+ const [r, l, n] = s.shape;
+ if (r !== e || l !== o || n !== i)
+ throw new Error(
+ `Inconsistent attention shapes in rollout: expected [${e},${o},${i}] got [${r},${l},${n}]`
+ );
  }
- return i;
+ if (o === i) {
+ const s = this.tf.eye(i, i).expandDims(0);
+ let r = s.tile([e, 1, 1]);
+ for (const l of t) {
+ const n = l.add(s);
+ r = n.div(n.sum(-1, !0)).matMul(r);
+ }
+ return r;
+ }
+ if (o === 1) {
+ let s = null;
+ const r = this.tf.tensor1d([i - 1], "int32"), l = this.tf.oneHot(r, i).reshape([1, 1, i]).tile([e, 1, 1]);
+ r.dispose();
+ for (const n of t) {
+ let h = n.add(l);
+ h = h.div(h.sum(-1, !0)), s == null ? s = h : (s = s.mul(h), s = s.div(s.sum(-1, !0)));
+ }
+ return s;
+ }
+ throw new Error(`Unsupported attention shapes for rollout: [B=${e}, Q=${o}, K=${i}]`);
  });
  }
- forward(t, e, s = !1, o = !1, i) {
+ forward(t, e, o = !1, i = !1, s) {
  return this.validateInput(t), this.tf.tidy(() => {
- const a = i?.[0]?.length ?? 0;
- let n = this.inputPhase(t, a, s);
- const h = [];
- if (i && i.length !== this.blocks.length)
- throw console.error("Cache", i), new Error(`Cache length ${i.length} does not match number of blocks ${this.blocks.length}`);
- for (let l = 0; l < this.blocks.length; l++) {
- const d = this.blocks[l], {
+ const r = s?.[0]?.length ?? 0;
+ let l = this.inputPhase(t, r, o);
+ const n = [];
+ if (s && s.length !== this.blocks.length)
+ throw console.error("Cache", s), new Error(`Cache length ${s.length} does not match number of blocks ${this.blocks.length}`);
+ for (let a = 0; a < this.blocks.length; a++) {
+ const d = this.blocks[a], {
  output: g,
- attention: b,
+ attention: m,
  cache: p
- } = d.call(n, s, o, i ? i[l] : void 0);
- n = g, o && b && h.push(b), i && p ? (i[l]?.k.dispose(), i[l]?.v.dispose(), i[l] = p) : p && (p.k.dispose(), p.v.dispose());
+ } = d.call(l, o, i, s ? s[a] : void 0);
+ l = g, i && m && n.push(m), s && p ? (s[a]?.k.dispose(), s[a]?.v.dispose(), s[a] = p) : p && (p.k.dispose(), p.v.dispose());
  }
- let c;
- o && h.length > 0 && (c = this.computeAttentionRollout(h)), n = this.lnF.apply(n);
- const r = this.wte.project(n);
+ let h;
+ i && n.length > 0 && (h = this.computeAttentionRollout(n)), l = this.lnF.apply(l);
+ const c = this.wte.project(l);
  let f;
- return e && (f = this.calculateLoss(r, e)), { logits: r, loss: f, attention: o ? c : void 0 };
+ return e && (f = this.calculateLoss(c, e)), { logits: c, loss: f, attention: i ? h : void 0 };
  });
  }
- generate(t, e, s) {
- const o = s?.temperature ?? 1, i = s?.topK, a = s?.usePadding ?? !1, n = s?.includeAttention ?? !1;
+ generate(t, e, o) {
+ const i = o?.temperature ?? 1, s = o?.topK, r = o?.usePadding ?? !1, l = o?.includeAttention ?? !1;
  return this.tf.tidy(() => {
- const h = t, c = h.shape[1], r = c <= this.config.blockSize ? h : h.slice(
- [0, c - this.config.blockSize],
- [h.shape[0], this.config.blockSize]
- ), f = a ? this.config.blockSize - r.shape[1] : 0, l = f > 0 ? this.tf.pad(r, [
+ const n = t, h = n.shape[1], c = h <= this.config.blockSize ? n : n.slice(
+ [0, h - this.config.blockSize],
+ [n.shape[0], this.config.blockSize]
+ ), f = r ? this.config.blockSize - c.shape[1] : 0, a = f > 0 ? this.tf.pad(c, [
  [0, 0],
  [0, f]
- ]) : r, { logits: d, attention: g } = this.forward(l, void 0, !1, n, e), b = d.shape[1] - 1 - f, p = d.slice([0, b, 0], [d.shape[0], 1, d.shape[2]]), w = g ? g.slice([0, b, 0], [g.shape[0], 1, g.shape[2]]) : void 0, u = p.div(o);
- let m;
- if (i) {
- const { values: E, indices: y } = this.tf.topk(u, i), z = this.tf.multinomial(E.squeeze([1]), 1);
- m = this.tf.gather(y.squeeze([1]), z, 1);
+ ]) : c, { logits: d, attention: g } = this.forward(a, void 0, !1, l, e), m = d.shape[1] - 1 - f, p = d.slice([0, m, 0], [d.shape[0], 1, d.shape[2]]), w = g ? g.slice([0, m, 0], [g.shape[0], 1, g.shape[2]]) : void 0, u = p.div(i);
+ let b;
+ if (s) {
+ const { values: E, indices: v } = this.tf.topk(u, s), y = this.tf.multinomial(E.squeeze([1]), 1);
+ b = this.tf.gather(v.squeeze([1]), y, 1);
  } else
- m = this.tf.multinomial(u.squeeze([1]), 1);
+ b = this.tf.multinomial(u.squeeze([1]), 1);
  let k;
- return s?.includeProbabilities && (k = this.tf.softmax(u.squeeze([1]))), m = m.reshape([1, 1]), { output: m, attention: w?.squeeze([1]), probabilities: k };
+ return o?.includeProbabilities && (k = this.tf.softmax(u.squeeze([1]))), b = b.reshape([1, 1]), { output: b, attention: w?.squeeze([1]), probabilities: k };
  });
  }
  getNumParams() {
  const t = this.config.vocabSize * this.config.nEmbed + this.config.blockSize * this.config.nEmbed, e = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // qkv + proj
- 2 * this.config.nEmbed), s = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
- this.config.nEmbed * 4 * this.config.nEmbed), o = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
- return t + e + s + o;
+ 2 * this.config.nEmbed), o = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
+ this.config.nEmbed * 4 * this.config.nEmbed), i = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
+ return t + e + o + i;
  }
  dispose() {
  this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
  }
  }
  export {
- F as default
+ M as default
  };
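Beyond identifier renames, this file's substantive change is in computeAttentionRollout: it now checks that every per-layer attention tensor has the same [batch, Q, K] shape, keeps the original Abnar & Zuidema (2020) recurrence for square Q === K maps, and adds a Q === 1 branch for cached single-token generation, where the identity term is replaced by a one-hot row at the last key position. A deminified sketch of the square branch, assuming tf is a TensorFlow.js namespace and attentions holds [batch, T, T] tensors (the names here are illustrative, not the library's API):

    import * as tf from '@tensorflow/tfjs';

    // Attention rollout (Abnar & Zuidema, 2020): add the residual identity
    // to each layer's attention map, row-normalise, and compose layers by
    // matrix multiplication.
    function attentionRollout(attentions: tf.Tensor3D[]): tf.Tensor3D {
      return tf.tidy(() => {
        const [batch, , size] = attentions[0].shape;
        const eye = tf.eye(size, size).expandDims(0); // residual term
        let rollout = eye.tile([batch, 1, 1]);
        for (const layer of attentions) {
          const mixed = layer.add(eye);
          const normed = mixed.div(mixed.sum(-1, true)); // rows sum to 1
          rollout = normed.matMul(rollout);
        }
        return rollout as tf.Tensor3D;
      });
    }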
package/dist/TeachableLLM.js CHANGED
@@ -1,5 +1,5 @@
- import d from "./NanoGPTModel.js";
- import { defaultConfig as u } from "./config.js";
+ import { defaultConfig as d } from "./config.js";
+ import u from "./NanoGPTModel.js";
  import { saveModel as m } from "./utilities/save.js";
  import { loadModel as l } from "./utilities/load.js";
  import f from "./Generator.js";
@@ -58,7 +58,7 @@ class a extends c {
  }), e;
  }
  static create(t, r = {}) {
- const e = { ...u, ...r }, s = new g(e.vocabSize), o = new d(t, e), i = new a(t, s, o);
+ const e = { ...d, ...r }, s = new g(e.vocabSize), o = new u(t, e), i = new a(t, s, o);
  return i.setStatus("warmup"), h(o).then(() => {
  i.tokeniser.trained ? i.setStatus("ready") : (i.setStatus("awaitingTokens"), i.tokeniser.once("trainStatus", (n) => {
  n === "trained" && i.setStatus("ready");
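The TeachableLLM.js hunk only reorders two bundled imports and swaps their minified names; behaviour is identical. For orientation, the create factory visible in the hunk builds the tokeniser and model and drives the status lifecycle ('warmup', then 'ready' or 'awaitingTokens' until the tokeniser reports 'trained'). A hedged usage sketch, assuming the first argument is the TensorFlow.js namespace (the model constructor's this.tf = t assignment suggests this) and that config keys match the defaults spread in the hunk:

    import * as tf from '@tensorflow/tfjs';
    import { TeachableLLM } from '@genai-fi/nanogpt';

    // Illustrative only: vocabSize is the one config key visible in the hunk.
    const llm = TeachableLLM.create(tf, { vocabSize: 256 });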
package/dist/layers/RoPECache.js CHANGED
@@ -1,12 +1,14 @@
- class E {
- constructor(t, c) {
- this.tf = t, this.config = c;
- const e = this.config.nEmbed / this.config.nHead;
- if (this.rotaryDim = e, this.rotaryDim % 2 !== 0)
+ class b {
+ constructor(s, r) {
+ this.tf = s, this.config = r;
+ const o = this.config.nEmbed / this.config.nHead;
+ if (this.rotaryDim = o, this.rotaryDim % 2 !== 0)
  throw new Error("rotaryDim must be even");
  this.ropeBase = 1e4;
- const o = this.tf.range(0, this.rotaryDim, 2, "float32").div(this.tf.scalar(this.rotaryDim, "float32")), s = this.tf.pow(this.tf.scalar(this.ropeBase, "float32"), o);
- this.ropeInvFreq = this.tf.reciprocal(s), this.config.useRope === !1 ? (this.ropeCos = null, this.ropeSin = null, this.ropeCacheLen = 0) : this.ensureRopeCache(this.config.blockSize * 4);
+ const i = this.tf.range(0, this.rotaryDim, 2, "float32"), t = i.div(this.tf.scalar(this.rotaryDim, "float32")), e = this.tf.pow(this.tf.scalar(this.ropeBase, "float32"), t);
+ this.ropeInvFreq = this.tf.reciprocal(e), t.dispose(), e.dispose(), i.dispose(), this.config.useRope === !1 ? (this.ropeCos = null, this.ropeSin = null, this.ropeCacheLen = 0) : this.tf.tidy(() => {
+ this.ensureRopeCache(this.config.blockSize * 4);
+ });
  }
  rotaryDim;
  ropeBase;
@@ -16,24 +18,27 @@ class E {
  ropeSin = null;
  // [cacheLen, rotaryDim/2]
  ropeCacheLen = 0;
- ensureRopeCache(t) {
- if (t <= this.ropeCacheLen) return;
+ ensureRopeCache(s) {
+ if (s <= this.ropeCacheLen) return;
  this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose();
- const e = this.tf.range(0, t, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
- this.ropeCos = this.tf.keep(this.tf.cos(e).expandDims(-1)), this.ropeSin = this.tf.keep(this.tf.sin(e).expandDims(-1)), this.ropeCacheLen = t;
+ const o = this.tf.range(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
+ this.ropeCos = this.tf.keep(this.tf.cos(o).expandDims(-1)), this.ropeSin = this.tf.keep(this.tf.sin(o).expandDims(-1)), this.ropeCacheLen = s;
  }
- applyRoPE(t, c, e) {
- const h = t.shape[3], o = this.rotaryDim;
- if (o > h) return [t, c];
- const s = t.shape[2], S = e + s;
- this.ensureRopeCache(S);
- const n = o / 2, g = this.ropeCos.slice([e, 0, 0], [s, n, 1]), v = this.ropeSin.slice([e, 0, 0], [s, n, 1]), l = g.reshape([1, 1, s, n, 1]), f = v.reshape([1, 1, s, n, 1]), p = this.tf.concat([t, c], 0), r = p.shape[0], i = p.shape[1], y = p.slice([0, 0, 0, 0], [r, i, s, o]), u = o < h ? p.slice([0, 0, 0, o], [r, i, s, h - o]) : null, d = y.reshape([r, i, s, n, 2]), m = d.slice([0, 0, 0, 0, 0], [r, i, s, n, 1]), C = d.slice([0, 0, 0, 0, 1], [r, i, s, n, 1]), B = m.mul(l).sub(C.mul(f)), b = C.mul(l).add(m.mul(f)), D = this.tf.concat([B, b], -1).reshape([r, i, s, o]), R = u ? this.tf.concat([D, u], 3) : D, a = r / 2, x = R.slice([0, 0, 0, 0], [a, i, s, h]), P = R.slice([a, 0, 0, 0], [a, i, s, h]);
- return [x, P];
+ applyRoPE(s, r, o) {
+ const i = s.shape[3], t = this.rotaryDim;
+ if (t > i) return [s, r];
+ const e = s.shape[2], v = o + e;
+ this.ensureRopeCache(v);
+ const n = t / 2, p = this.ropeCos.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), a = this.ropeSin.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), h = s.shape[0], c = s.shape[1], f = this.tf.range(0, t, 2, "int32"), l = this.tf.range(1, t, 2, "int32"), d = (u) => {
+ const m = u.slice([0, 0, 0, 0], [h, c, e, t]), C = t < i ? u.slice([0, 0, 0, t], [h, c, e, i - t]) : null, D = this.tf.gather(m, f, 3), g = this.tf.gather(m, l, 3), x = D.mul(p).sub(g.mul(a)), k = g.mul(p).add(D.mul(a)), R = this.tf.stack([x, k], -1).reshape([h, c, e, t]);
+ return C ? this.tf.concat([R, C], 3) : R;
+ }, y = d(s), S = d(r);
+ return f.dispose(), l.dispose(), [y, S];
  }
  dispose() {
  this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose(), this.ropeInvFreq.dispose();
  }
  }
  export {
- E as default
+ b as default
  };
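Three real changes in RoPECache.js: the constructor now disposes its intermediate frequency tensors, the warm-up ensureRopeCache call runs inside tf.tidy, and applyRoPE pairs rotary dimensions as interleaved even/odd indices (via gather) rather than splitting the rotary block into two contiguous halves, rotating q and k independently instead of through a concatenated tensor. The rotation itself is the standard RoPE formula: even' = even * cos - odd * sin and odd' = odd * cos + even * sin. A self-contained sketch of that pairing for a single [seqLen, rotaryDim] slice, assuming tf is TensorFlow.js (all names illustrative):

    import * as tf from '@tensorflow/tfjs';

    // Rotate interleaved pairs (x0, x1), (x2, x3), ... as in the new applyRoPE:
    //   even' = even * cos - odd * sin
    //   odd'  = odd  * cos + even * sin
    // x: [seqLen, rotaryDim]; cos, sin: [seqLen, rotaryDim / 2].
    function ropeRotate(x: tf.Tensor2D, cos: tf.Tensor2D, sin: tf.Tensor2D): tf.Tensor2D {
      return tf.tidy(() => {
        const [seqLen, dim] = x.shape;
        const even = tf.gather(x, tf.range(0, dim, 2, 'int32'), 1);
        const odd = tf.gather(x, tf.range(1, dim, 2, 'int32'), 1);
        const rotEven = even.mul(cos).sub(odd.mul(sin));
        const rotOdd = odd.mul(cos).add(even.mul(sin));
        // Stack pairs on a trailing axis, then flatten to restore interleaving.
        return tf.stack([rotEven, rotOdd], -1).reshape([seqLen, dim]) as tf.Tensor2D;
      });
    }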
package/dist/main.d.ts CHANGED
@@ -1,6 +1,7 @@
  export { default as NanoGPT } from './NanoGPTModel';
  export { default as TeachableLLM } from './TeachableLLM';
  export { default as CharTokeniser } from './tokeniser/CharTokeniser';
+ export { default as waitForModel } from './utilities/waitForModel';
  export type { ITrainerOptions } from './Trainer';
  export type { IGenerateOptions } from './Generator';
  export type { TrainingLogEntry } from './NanoGPTModel';
package/dist/main.js CHANGED
@@ -1,8 +1,10 @@
  import { default as o } from "./NanoGPTModel.js";
- import { default as f } from "./TeachableLLM.js";
+ import { default as t } from "./TeachableLLM.js";
  import { default as l } from "./tokeniser/CharTokeniser.js";
+ import { default as s } from "./utilities/waitForModel.js";
  export {
  l as CharTokeniser,
  o as NanoGPT,
- f as TeachableLLM
+ t as TeachableLLM,
+ s as waitForModel
  };
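Both main.d.ts and main.js now re-export a waitForModel utility from './utilities/waitForModel'. Its signature is not visible in this diff, so the following is only a guess at intent: presumably it resolves once a model instance is usable. Hypothetical usage, with all particulars assumed:

    import * as tf from '@tensorflow/tfjs';
    import { TeachableLLM, waitForModel } from '@genai-fi/nanogpt';

    const llm = TeachableLLM.create(tf, {});
    // Assumption: waitForModel resolves when the model reaches a ready state.
    await waitForModel(llm);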
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@genai-fi/nanogpt",
- "version": "0.2.0",
+ "version": "0.2.1",
  "type": "module",
  "main": "dist/main.js",
  "types": "dist/main.d.ts",