@genai-fi/nanogpt 0.2.9 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/dist/Generator.d.ts +2 -0
  2. package/dist/Generator.js +37 -32
  3. package/dist/NanoGPTModel.d.ts +4 -1
  4. package/dist/NanoGPTModel.js +33 -25
  5. package/dist/TeachableLLM.d.ts +4 -0
  6. package/dist/TeachableLLM.js +32 -15
  7. package/dist/{complex-Cd8sqiBC.js → complex-CJ-qCcLB.js} +6 -6
  8. package/dist/{index-Dsg28SG6.js → index-YPKosni4.js} +59 -51
  9. package/dist/layers/BaseLayer.d.ts +8 -0
  10. package/dist/layers/BaseLayer.js +18 -0
  11. package/dist/layers/CausalSelfAttention.d.ts +4 -1
  12. package/dist/layers/CausalSelfAttention.js +47 -55
  13. package/dist/layers/MLP.d.ts +2 -1
  14. package/dist/layers/MLP.js +16 -14
  15. package/dist/layers/RMSNorm.d.ts +2 -1
  16. package/dist/layers/RMSNorm.js +13 -11
  17. package/dist/layers/RoPECache.d.ts +4 -2
  18. package/dist/layers/RoPECache.js +13 -7
  19. package/dist/layers/TiedEmbedding.js +16 -15
  20. package/dist/layers/TransformerBlock.d.ts +4 -1
  21. package/dist/layers/TransformerBlock.js +9 -5
  22. package/dist/main.js +18 -16
  23. package/dist/{mat_mul-BAYDrXvE.js → mat_mul-Bu7bhLms.js} +5 -5
  24. package/dist/ops/attentionMask.js +31 -25
  25. package/dist/ops/gatherSub.js +2 -2
  26. package/dist/ops/node/sparseCrossEntropy.js +1 -1
  27. package/dist/ops/qkv.d.ts +7 -0
  28. package/dist/ops/qkv.js +127 -0
  29. package/dist/ops/rope.d.ts +8 -0
  30. package/dist/ops/rope.js +153 -0
  31. package/dist/ops/scatterSub.js +14 -14
  32. package/dist/reshape-DmnmKT6r.js +25 -0
  33. package/dist/{stack-1o648CP_.js → stack-BtKpB0Ry.js} +5 -5
  34. package/dist/sum-D7fu15XL.js +27 -0
  35. package/dist/training/AdamExt.js +1 -1
  36. package/dist/training/Trainer.js +30 -29
  37. package/dist/training/sparseCrossEntropy.js +34 -33
  38. package/dist/utilities/profile.d.ts +10 -0
  39. package/dist/utilities/profile.js +29 -0
  40. package/package.json +1 -1
  41. package/dist/sum-NWazHI7f.js +0 -49
package/dist/layers/CausalSelfAttention.js CHANGED
@@ -1,16 +1,10 @@
- import { attentionMask as z } from "../ops/attentionMask.js";
- class j {
+ import { attentionMask as x } from "../ops/attentionMask.js";
+ import j from "./BaseLayer.js";
+ import { qkv as w } from "../ops/qkv.js";
+ import { rope as y } from "../ops/rope.js";
+ class N extends j {
  constructor(t, i, s, e) {
- this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
- units: 3 * s.nEmbed,
- useBias: s.biasInLinear,
- name: `block_${i}_attn_cAttn`,
- kernelInitializer: this.tf.initializers.randomNormal({
- mean: 0,
- stddev: 0.02
- }),
- biasInitializer: "zeros"
- }), this.cProj = this.tf.layers.dense({
+ super(), this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.units = s.nEmbed * 3, this.cProj = this.tf.layers.dense({
  units: s.nEmbed,
  useBias: s.biasInLinear,
  name: `block_${i}_attn_cProj`,
@@ -20,11 +14,11 @@ class j {
  }),
  biasInitializer: "zeros"
  }), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.nEmbed / s.nHead);
- const o = this.tf.zeros([s.blockSize, s.blockSize]), c = this.tf.fill([s.blockSize, s.blockSize], Number.NEGATIVE_INFINITY);
- this.maskInf = this.tf.where(this.bias, o, c);
+ const o = this.tf.zeros([s.blockSize, s.blockSize]), a = this.tf.fill([s.blockSize, s.blockSize], Number.NEGATIVE_INFINITY);
+ this.maskInf = this.tf.where(this.bias, o, a);
  }
  config;
- cAttn;
+ cAttn = null;
  cProj;
  attnDropout;
  residDropout;
@@ -34,26 +28,35 @@ class j {
  divisor;
  index;
  _trainable = !0;
+ units;
+ build() {
+ this.cAttn === null && (this.cAttn = this.tf.variable(
+ this.tf.randomNormal([this.config.nEmbed, this.units], 0, 0.02),
+ !0
+ //`block_${this.index}_attn_cAttn_kernel`
+ ));
+ }
  get variables() {
- return [
- ...this.cAttn.trainableWeights.map((t) => t.read()),
- ...this.cProj.trainableWeights.map((t) => t.read())
- ];
+ if (this.cAttn === null)
+ throw new Error("Layer not built yet");
+ return [this.cAttn, ...this.cProj.trainableWeights.map((t) => t.read())];
  }
  get trainable() {
  return this._trainable;
  }
  set trainable(t) {
- this._trainable = t, this.cAttn.trainable = t, this.cProj.trainable = t;
+ this._trainable = t, this.cAttn && (this.cAttn.trainable = t), this.cProj.trainable = t;
  }
  saveWeights(t) {
- t.set(`block_${this.index}_cAttn`, this.cAttn.getWeights()), t.set(`block_${this.index}_cProj`, this.cProj.getWeights());
+ t.set(`block_${this.index}_cAttn`, this.cAttn ? [this.cAttn.clone()] : []), t.set(`block_${this.index}_cProj`, this.cProj.getWeights());
  }
  loadWeights(t) {
- this.cAttn.setWeights(t.get(`block_${this.index}_cAttn`) || []), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
+ const i = t.get(`block_${this.index}_cAttn`)?.[0];
+ if (!i) throw new Error(`Weights for block_${this.index}_cAttn not found`);
+ this.cAttn ? this.cAttn.assign(i) : this.cAttn = this.tf.variable(i, !0), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
  }
  getAttentionScores(t, i, s) {
- const e = z(t, i, this.maskInf, this.divisor), o = this.tf.softmax(e, -1);
+ const e = x(t, i, this.maskInf, this.divisor), o = this.tf.softmax(e, -1);
  return this.attnDropout.apply(o, { training: s });
  }
  // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
@@ -63,60 +66,49 @@ class j {
  if (o > 1 && e > 0)
  throw new Error("Cannot use past with T_cur > 1");
  if (o > 1) {
- const a = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
- r = r.add(a);
+ const c = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+ r = r.add(c);
  }
  const h = this.tf.softmax(r, -1);
  return this.attnDropout.apply(h, { training: s });
  }
  getQKV(t) {
- const [i, s, e] = t.shape, o = this.cAttn.apply(t), [c, r, h] = this.tf.split(o, 3, -1);
- o.dispose();
- const a = e / this.config.nHead, u = this.tf.reshape(c, [i, s, this.config.nHead, a]);
- c.dispose();
- const f = u.transpose([0, 2, 1, 3]);
- u.dispose();
- const d = this.tf.reshape(r, [i, s, this.config.nHead, a]);
- r.dispose();
- const n = d.transpose([0, 2, 1, 3]);
- d.dispose();
- const l = this.tf.reshape(h, [i, s, this.config.nHead, a]);
- h.dispose();
- const p = l.transpose([0, 2, 1, 3]);
- return l.dispose(), [f, n, p];
+ return w(t, this.cAttn, this.config.nHead);
  }
  getOutputProjection(t, i) {
- const s = t.shape[0], e = t.shape[2], o = this.config.nEmbed, c = t.transpose([0, 2, 1, 3]), r = this.tf.reshape(c, [s, e, o]), h = this.cProj.apply(r);
+ const s = t.shape[0], e = t.shape[2], o = this.config.nEmbed, a = t.transpose([0, 2, 1, 3]), r = this.tf.reshape(a, [s, e, o]), h = this.cProj.apply(r);
  return this.residDropout.apply(h, { training: i });
  }
  // Added optional KV cache support (pastKV). Returns presentKV for chaining.
  call(t, i = !1, s = !1, e) {
  if (e && !this.config.useRope)
  throw new Error("Cannot use pastKV without RoPE enabled");
- return this.tf.tidy(() => {
- const [o, c, r] = this.getQKV(t), h = o.shape[2], a = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(o, c, u) : [o, c];
- let n = d, l = r, p = 0;
- e && (p = e.length, n = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, r], 2));
+ return this.build(), this.tf.tidy(() => {
+ this.startMemory();
+ const [o, a, r] = this.getQKV(t), h = o.shape[2], c = this.config.blockSize, d = e ? e.cumulativeLength : 0, f = this.ropeCache ? y(o, this.ropeCache, d) : o, m = this.ropeCache ? y(a, this.ropeCache, d) : a;
+ this.ropeCache && (o.dispose(), a.dispose());
+ let n = m, l = r, u = 0;
+ e && (u = e.length, n = this.tf.concat([e.k, m], 2), l = this.tf.concat([e.v, r], 2));
  const b = n.shape[2];
- if (b > a) {
- const k = b - a, g = n.shape[0], I = n.shape[1], _ = n.shape[3];
- n = n.slice([0, 0, k, 0], [g, I, a, _]), l = l.slice([0, 0, k, 0], [g, I, a, _]), p = a - h;
+ if (b > c) {
+ const k = b - c, A = n.shape[0], g = n.shape[1], _ = n.shape[3];
+ n = n.slice([0, 0, k, 0], [A, g, c, _]), l = l.slice([0, 0, k, 0], [A, g, c, _]), u = c - h;
  }
- let m;
- p > 0 ? m = this.getAttentionScoresWithPast(f, n, i, p) : m = this.getAttentionScores(f, n, i);
- const v = this.tf.matMul(m, l), A = this.getOutputProjection(v, i), P = {
+ let p;
+ u > 0 ? p = this.getAttentionScoresWithPast(f, n, i, u) : p = this.getAttentionScores(f, n, i);
+ const P = this.tf.matMul(p, l), S = this.getOutputProjection(P, i), v = {
  k: this.tf.keep(n),
  v: this.tf.keep(l),
- length: p + h,
+ length: u + h,
  cumulativeLength: e ? e.cumulativeLength + h : h
- };
- return { output: A, attention: s ? m.mean(1) : void 0, presentKV: P };
+ }, I = s ? p.mean(1) : void 0;
+ return this.endMemory("CausalSelfAttention"), { output: S, attention: I, presentKV: v };
  });
  }
  dispose() {
- this.cAttn.dispose(), this.cProj.dispose(), this.attnDropout.dispose(), this.residDropout.dispose(), this.bias.dispose(), this.maskInf.dispose();
+ this.cAttn?.dispose(), this.cProj.dispose(), this.attnDropout.dispose(), this.residDropout.dispose(), this.bias.dispose(), this.maskInf.dispose();
  }
  }
  export {
- j as default
+ N as default
  };
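Two things change in this file beyond renames: `cAttn` is demoted from a `tf.layers.dense` layer to a lazily built raw variable of shape `[nEmbed, 3 * nEmbed]` (see the new `build()`), and the hand-rolled projection/split/reshape/transpose in `getQKV` collapses into a single call to the new fused `qkv` op (`w(t, this.cAttn, this.config.nHead)`), with RoPE now applied per tensor through the new `rope` op (`y(o, this.ropeCache, d)` for q, likewise for k). As a rough reference, the fused op should be numerically equivalent to this sketch reconstructed from the deleted `getQKV` body (the function name and typings are illustrative, not the package's actual kernel):

```ts
import * as tf from "@tensorflow/tfjs";

// Hypothetical reference for what dist/ops/qkv.js computes, pieced together
// from the old getQKV() this diff deletes. Shapes: x is [B, T, C] with
// C = nEmbed; wQKV is the new cAttn variable, [C, 3C].
function qkvReference(
  x: tf.Tensor3D,
  wQKV: tf.Tensor2D,
  nHead: number
): [tf.Tensor4D, tf.Tensor4D, tf.Tensor4D] {
  return tf.tidy(() => {
    const [B, T, C] = x.shape;
    const hs = C / nHead; // per-head channel count
    // One projection, then split into query/key/value, each [B, T, C].
    const proj = tf.matMul(x.reshape([B * T, C]), wQKV).reshape([B, T, 3 * C]);
    const [q, k, v] = tf.split(proj, 3, -1);
    // [B, T, nHead, hs] -> [B, nHead, T, hs], as the old transpose([0,2,1,3]) did.
    const toHeads = (t: tf.Tensor) =>
      t.reshape([B, T, nHead, hs]).transpose([0, 2, 1, 3]) as tf.Tensor4D;
    return [toHeads(q), toHeads(k), toHeads(v)];
  }) as [tf.Tensor4D, tf.Tensor4D, tf.Tensor4D];
}
```

The KV-cache behaviour is unchanged in spirit: past keys/values are concatenated on the time axis, and once the cache outgrows `blockSize` the oldest positions are sliced off and the with-past score path is taken, per the comment in the diff ("If pastLen > 0 and T_cur == 1, no mask needed").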
package/dist/layers/MLP.d.ts CHANGED
@@ -1,6 +1,7 @@
  import { default as TF } from '@tensorflow/tfjs';
  import { GPTConfig } from '../config';
- export default class MLP {
+ import { default as BaseLayer } from './BaseLayer';
+ export default class MLP extends BaseLayer {
  private cFc;
  private cProj;
  private dropout;
package/dist/layers/MLP.js CHANGED
@@ -1,31 +1,32 @@
- class l {
+ import a from "./BaseLayer.js";
+ class l extends a {
  cFc;
  cProj;
  dropout;
  tf;
  index;
  _trainable = !0;
- constructor(t, e, i) {
- this.tf = t, this.index = e, this.cFc = this.tf.layers.dense({
- units: i.mlpFactor * i.nEmbed,
+ constructor(t, i, e) {
+ super(), this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
+ units: e.mlpFactor * e.nEmbed,
  activation: "gelu",
- useBias: i.biasInLinear,
+ useBias: e.biasInLinear,
  kernelInitializer: this.tf.initializers.randomNormal({
  mean: 0,
  stddev: 0.02
  }),
  biasInitializer: "zeros",
- name: `block_${e}_mlp_cFc`
+ name: `block_${i}_mlp_cFc`
  }), this.cProj = this.tf.layers.dense({
- units: i.nEmbed,
- useBias: i.biasInLinear,
+ units: e.nEmbed,
+ useBias: e.biasInLinear,
  kernelInitializer: this.tf.initializers.randomNormal({
  mean: 0,
- stddev: 0.02 / Math.sqrt(2 * i.nLayer)
+ stddev: 0.02 / Math.sqrt(2 * e.nLayer)
  }),
  biasInitializer: "zeros",
- name: `block_${e}_mlp_cProj`
- }), this.dropout = this.tf.layers.dropout({ rate: i.dropout });
+ name: `block_${i}_mlp_cProj`
+ }), this.dropout = this.tf.layers.dropout({ rate: e.dropout });
  }
  get variables() {
  return [
@@ -45,10 +46,11 @@ class l {
  loadWeights(t) {
  this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
  }
- call(t, e = !1) {
+ call(t, i = !1) {
  return this.tf.tidy(() => {
- const i = this.cFc.apply(t), s = this.cProj.apply(i);
- return this.dropout.apply(s, { training: e });
+ this.startMemory();
+ const e = this.cFc.apply(t), s = this.cProj.apply(e), r = this.dropout.apply(s, { training: i });
+ return this.endMemory("MLP"), r;
  });
  }
  dispose() {
package/dist/layers/RMSNorm.d.ts CHANGED
@@ -1,5 +1,6 @@
  import { default as TF } from '@tensorflow/tfjs';
- export default class RMSNorm {
+ import { default as BaseLayer } from './BaseLayer';
+ export default class RMSNorm extends BaseLayer {
  private gamma;
  private epsilon;
  private tf;
package/dist/layers/RMSNorm.js CHANGED
@@ -1,26 +1,28 @@
- class m {
+ import m from "./BaseLayer.js";
+ class o extends m {
  gamma;
  epsilon;
  tf;
- constructor(a, s, t = 1e-8, e = "") {
- this.tf = a, this.epsilon = t, this.gamma = a.variable(a.ones(s), !0, `${e}_gamma`, "float32");
+ constructor(t, s, a = 1e-8, e = "") {
+ super(), this.tf = t, this.epsilon = a, this.gamma = t.variable(t.ones(s), !0, `${e}_gamma`, "float32");
  }
  get trainableWeights() {
  return [this.gamma];
  }
- set trainable(a) {
- this.gamma.trainable = a;
+ set trainable(t) {
+ this.gamma.trainable = t;
  }
  getWeights() {
  return [this.gamma];
  }
- setWeights(a) {
- this.gamma.assign(a[0]);
+ setWeights(t) {
+ this.gamma.assign(t[0]);
  }
- apply(a) {
+ apply(t) {
  return this.tf.tidy(() => {
- const t = a.square().mean(-1, !0).add(this.epsilon).rsqrt();
- return a.mul(t).mul(this.gamma);
+ this.startMemory();
+ const a = t.square().mean(-1, !0).add(this.epsilon).rsqrt(), r = t.mul(a).mul(this.gamma);
+ return this.endMemory("RMSNorm"), r;
  });
  }
  dispose() {
@@ -28,5 +30,5 @@ class m {
  }
  }
  export {
- m as default
+ o as default
  };
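Only the `BaseLayer` inheritance and the `startMemory()`/`endMemory("RMSNorm")` profiling hooks are new here; the normalisation itself is untouched. For reference, a scalar sketch of what `apply` computes per feature vector (illustrative, not library code):

```ts
// y_i = x_i * rsqrt(mean(x^2) + epsilon) * gamma_i -- RMSNorm, no mean-centring,
// matching the square().mean(-1, true).add(epsilon).rsqrt() chain above.
function rmsNorm(x: number[], gamma: number[], epsilon = 1e-8): number[] {
  const meanSq = x.reduce((acc, v) => acc + v * v, 0) / x.length;
  const inv = 1 / Math.sqrt(meanSq + epsilon);
  return x.map((v, i) => v * inv * gamma[i]);
}
```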
package/dist/layers/RoPECache.d.ts CHANGED
@@ -3,14 +3,16 @@ import { GPTConfig } from '../config';
  export default class RoPECache {
  private readonly tf;
  private readonly config;
- private rotaryDim;
+ readonly rotaryDim: number;
  private ropeBase;
  private ropeInvFreq;
  private ropeCos;
  private ropeSin;
  private ropeCacheLen;
  constructor(tf: typeof TF, config: GPTConfig);
- private ensureRopeCache;
+ ensureRopeCache(needed: number): void;
+ getCos(): TF.Tensor | null;
+ getSin(): TF.Tensor | null;
  applyRoPE(q: TF.Tensor, k: TF.Tensor, pastLen: number): [TF.Tensor, TF.Tensor];
  dispose(): void;
  }
package/dist/layers/RoPECache.js CHANGED
@@ -24,16 +24,22 @@ class b {
  const o = this.tf.range(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
  this.ropeCos = this.tf.keep(this.tf.cos(o).expandDims(-1)), this.ropeSin = this.tf.keep(this.tf.sin(o).expandDims(-1)), this.ropeCacheLen = s;
  }
+ getCos() {
+ return this.ropeCos;
+ }
+ getSin() {
+ return this.ropeSin;
+ }
  applyRoPE(s, r, o) {
  const i = s.shape[3], t = this.rotaryDim;
  if (t > i) return [s, r];
- const e = s.shape[2], v = o + e;
- this.ensureRopeCache(v);
- const n = t / 2, p = this.ropeCos.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), a = this.ropeSin.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), h = s.shape[0], c = s.shape[1], f = this.tf.range(0, t, 2, "int32"), l = this.tf.range(1, t, 2, "int32"), d = (u) => {
- const m = u.slice([0, 0, 0, 0], [h, c, e, t]), C = t < i ? u.slice([0, 0, 0, t], [h, c, e, i - t]) : null, D = this.tf.gather(m, f, 3), g = this.tf.gather(m, l, 3), x = D.mul(p).sub(g.mul(a)), k = g.mul(p).add(D.mul(a)), R = this.tf.stack([x, k], -1).reshape([h, c, e, t]);
- return C ? this.tf.concat([R, C], 3) : R;
- }, y = d(s), S = d(r);
- return f.dispose(), l.dispose(), [y, S];
+ const e = s.shape[2], R = o + e;
+ this.ensureRopeCache(R);
+ const n = t / 2, c = this.ropeCos.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), a = this.ropeSin.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), h = s.shape[0], p = s.shape[1], f = this.tf.range(0, t, 2, "int32"), l = this.tf.range(1, t, 2, "int32"), d = (u) => {
+ const m = u.slice([0, 0, 0, 0], [h, p, e, t]), C = t < i ? u.slice([0, 0, 0, t], [h, p, e, i - t]) : null, g = this.tf.gather(m, f, 3), D = this.tf.gather(m, l, 3), x = g.mul(c).sub(D.mul(a)), k = D.mul(c).add(g.mul(a)), S = this.tf.stack([x, k], -1).reshape([h, p, e, t]);
+ return C ? this.tf.concat([S, C], 3) : S;
+ }, v = d(s), y = d(r);
+ return f.dispose(), l.dispose(), [v, y];
  }
  dispose() {
  this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose(), this.ropeInvFreq.dispose();
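Aside from renames, this file gains public `getCos()`/`getSin()` accessors and a now-public `ensureRopeCache(needed)`, presumably so the standalone `rope` op added in this release can read the cache directly. The gather/stack sequence in `applyRoPE` is the standard rotary-embedding rotation; a scalar sketch of what it does to one head vector (illustrative only):

```ts
// Channel pairs (x[2i], x[2i+1]) are rotated by angle p * invFreq[i], where
// p = pastLen + t is the absolute token position; channels >= rotaryDim are
// passed through unchanged, matching the slice/concat remainder path above.
function ropeRotate(x: number[], p: number, invFreq: number[], rotaryDim: number): number[] {
  const out = x.slice();
  for (let i = 0; i < rotaryDim / 2; i++) {
    const theta = p * invFreq[i];
    const c = Math.cos(theta), s = Math.sin(theta);
    const x0 = x[2 * i], x1 = x[2 * i + 1];
    out[2 * i] = x0 * c - x1 * s;     // even channel
    out[2 * i + 1] = x1 * c + x0 * s; // odd channel
  }
  return out;
}
```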
@@ -1,7 +1,8 @@
1
- import { o as h, c as i, E as o, D as V, F as X, I as Y, H as Z, N as ee, J as te, K as se, O as ne, Q as re, T as ue, h as L, y as ae, U as A, m as ie, V as oe, v as le, d as q, n as C, W as P, x as U, _ as H } from "../index-Dsg28SG6.js";
2
- import { s as ce, r as f } from "../sum-NWazHI7f.js";
3
- import { m } from "../mat_mul-BAYDrXvE.js";
4
- import { c as pe } from "../complex-Cd8sqiBC.js";
1
+ import { o as h, d as i, E as o, K as X, N as Y, O as Z, Q as J, T as ee, U as te, V as se, W as ne, X as re, Y as ue, l as L, I as ae, Z as A, a as ie, _ as oe, D as le, f as q, v as C, $ as P, H as U, a0 as H } from "../index-YPKosni4.js";
2
+ import { r as f } from "../reshape-DmnmKT6r.js";
3
+ import { s as ce } from "../sum-D7fu15XL.js";
4
+ import { m } from "../mat_mul-Bu7bhLms.js";
5
+ import { c as pe } from "../complex-CJ-qCcLB.js";
5
6
  /**
6
7
  * @license
7
8
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -20,7 +21,7 @@ import { c as pe } from "../complex-Cd8sqiBC.js";
20
21
  */
21
22
  function he(t) {
22
23
  const s = { x: i(t, "x", "sigmoid", "float32") };
23
- return o.runKernel(V, s);
24
+ return o.runKernel(X, s);
24
25
  }
25
26
  const fe = /* @__PURE__ */ h({ sigmoid_: he });
26
27
  /**
@@ -41,7 +42,7 @@ const fe = /* @__PURE__ */ h({ sigmoid_: he });
41
42
  */
42
43
  function de(t) {
43
44
  const s = { x: i(t, "x", "elu", "float32") };
44
- return o.runKernel(X, s);
45
+ return o.runKernel(Y, s);
45
46
  }
46
47
  const me = /* @__PURE__ */ h({ elu_: de });
47
48
  /**
@@ -62,7 +63,7 @@ const me = /* @__PURE__ */ h({ elu_: de });
62
63
  */
63
64
  function ge(t) {
64
65
  const s = { input: i(t, "input", "imag") };
65
- return o.runKernel(Y, s);
66
+ return o.runKernel(Z, s);
66
67
  }
67
68
  const $e = /* @__PURE__ */ h({ imag_: ge });
68
69
  /**
@@ -83,7 +84,7 @@ const $e = /* @__PURE__ */ h({ imag_: ge });
83
84
  */
84
85
  function xe(t, e = 0.2) {
85
86
  const n = { x: i(t, "x", "leakyRelu") }, r = { alpha: e };
86
- return o.runKernel(Z, n, r);
87
+ return o.runKernel(J, n, r);
87
88
  }
88
89
  const ke = /* @__PURE__ */ h({ leakyRelu_: xe });
89
90
  /**
@@ -321,8 +322,8 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
321
322
  const [g, $, k, z] = M, d = Ae(f(x, k.shape), k, c);
322
323
  let K, _;
323
324
  if (!s && !n ? (K = m(d, $, !1, !0), _ = m(g, d, !0, !1)) : !s && n ? (K = m(d, $, !1, !1), _ = m(d, g, !0, !1)) : s && !n ? (K = m($, d, !1, !0), _ = m(g, d, !1, !1)) : (K = m($, d, !0, !0), _ = m(d, g, !0, !0)), r != null) {
324
- const Q = Le(z, d);
325
- return [K, _, Q];
325
+ const V = Le(z, d);
326
+ return [K, _, V];
326
327
  } else
327
328
  return [K, _];
328
329
  }, I = {
@@ -345,7 +346,7 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
345
346
  return k([M, g, z, $]), { value: f(z, O), gradFunc: G };
346
347
  })(F, R, S);
347
348
  }
348
- const J = /* @__PURE__ */ h({ fusedMatMul_: Ne });
349
+ const Q = /* @__PURE__ */ h({ fusedMatMul_: Ne });
349
350
  /**
350
351
  * @license
351
352
  * Copyright 2018 Google LLC
@@ -378,7 +379,7 @@ function ve(t, e, s, n) {
378
379
  throw new E(`If rank y >= 3, then the second last dim of y must equal the last dim of x but got x shape = ${t.shape} and y shape = ${e.shape}`);
379
380
  }
380
381
  if (t.rank === 2 && e.rank === 2)
381
- return J({
382
+ return Q({
382
383
  a: t,
383
384
  b: e,
384
385
  transposeA: !1,
@@ -392,7 +393,7 @@ function ve(t, e, s, n) {
392
393
  const l = e.shape.slice(), p = l.pop(), u = l.pop(), a = [...l, p], D = Array.from({ length: e.rank }, (T, y) => y === 0 ? e.rank - 2 : y <= e.rank - 2 ? y - 1 : y);
393
394
  e = f(Re(e, D), [u, -1]);
394
395
  const b = [...r, ...a];
395
- return f(J({
396
+ return f(Q({
396
397
  a: t,
397
398
  b: e,
398
399
  transposeA: !1,
@@ -402,7 +403,7 @@ function ve(t, e, s, n) {
402
403
  }), b);
403
404
  }
404
405
  }
405
- class Pe {
406
+ class Ue {
406
407
  vocabSize;
407
408
  embedDim;
408
409
  tf;
@@ -444,5 +445,5 @@ class Pe {
444
445
  }
445
446
  }
446
447
  export {
447
- Pe as default
448
+ Ue as default
448
449
  };
package/dist/layers/TransformerBlock.d.ts CHANGED
@@ -2,7 +2,9 @@ import { default as TF } from '@tensorflow/tfjs';
  import { GPTConfig } from '../config';
  import { KVCache } from './CausalSelfAttention';
  import { default as RoPECache } from './RoPECache';
- export default class Block {
+ import { default as MemoryProfiler } from '../utilities/profile';
+ import { default as BaseLayer } from './BaseLayer';
+ export default class Block extends BaseLayer {
  private ln1;
  private attn;
  private ln2;
@@ -12,6 +14,7 @@ export default class Block {
  private _trainable;
  skipped: boolean;
  constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
+ setProfiler(value: MemoryProfiler | undefined): void;
  get variables(): TF.Variable[];
  get trainable(): boolean;
  set trainable(value: boolean);
package/dist/layers/TransformerBlock.js CHANGED
@@ -1,7 +1,8 @@
- import r from "./CausalSelfAttention.js";
+ import a from "./CausalSelfAttention.js";
  import o from "./MLP.js";
- import a from "./RMSNorm.js";
- class u {
+ import r from "./RMSNorm.js";
+ import p from "./BaseLayer.js";
+ class f extends p {
  ln1;
  attn;
  ln2;
@@ -11,7 +12,10 @@ class u {
  _trainable = !0;
  skipped = !1;
  constructor(t, i, s, e) {
- this.tf = t, this.index = i, this.ln1 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new r(this.tf, this.index, s, e), this.ln2 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
+ super(), this.tf = t, this.index = i, this.ln1 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new a(this.tf, this.index, s, e), this.ln2 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
+ }
+ setProfiler(t) {
+ this._profiler = t, this.attn.setProfiler(t), this.mlp.setProfiler(t), this.ln1.setProfiler(t), this.ln2.setProfiler(t);
  }
  get variables() {
  return [
@@ -54,5 +58,5 @@ class u {
  }
  }
  export {
- u as default
+ f as default
  };
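`setProfiler` fans the profiler out to every sublayer, and each layer's hot path now brackets its `tidy` body with the `startMemory()`/`endMemory(label)` pair inherited from the new `BaseLayer`. `dist/layers/BaseLayer.js` itself is not shown in this section of the diff, so the following is only a minimal sketch of the contract its call sites imply (the field name, byte accounting, and the profiler's recording method are assumptions):

```ts
import * as tf from "@tensorflow/tfjs";
import type MemoryProfiler from "../utilities/profile";

// Sketch only: a no-op unless a profiler has been attached via setProfiler.
export default class BaseLayer {
  protected _profiler: MemoryProfiler | undefined;
  private bytesBefore = 0;

  setProfiler(value: MemoryProfiler | undefined): void {
    this._profiler = value;
  }

  protected startMemory(): void {
    if (this._profiler) this.bytesBefore = tf.memory().numBytes;
  }

  protected endMemory(label: string): void {
    if (!this._profiler) return;
    const delta = tf.memory().numBytes - this.bytesBefore;
    // Hypothetical recording call -- the real MemoryProfiler API is not
    // visible in this section of the diff:
    // this._profiler.record(label, delta);
    void delta;
  }
}
```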
package/dist/main.js CHANGED
@@ -1,21 +1,23 @@
- import { default as m } from "./NanoGPTModel.js";
- import { default as i } from "./TeachableLLM.js";
- import { default as l } from "./tokeniser/CharTokeniser.js";
- import { default as d } from "./utilities/waitForModel.js";
- import { default as x } from "./data/textLoader.js";
- import { estimateMemoryUsage as T, estimateParameterCount as g, estimateResources as M, estimateTrainingMemoryUsage as C, validateConfig as c } from "./utilities/parameters.js";
+ import { default as s } from "./NanoGPTModel.js";
+ import { default as p } from "./TeachableLLM.js";
+ import { default as d } from "./tokeniser/CharTokeniser.js";
+ import { default as x } from "./utilities/waitForModel.js";
+ import { default as T } from "./data/textLoader.js";
+ import { estimateMemoryUsage as M, estimateParameterCount as C, estimateResources as c, estimateTrainingMemoryUsage as h, validateConfig as y } from "./utilities/parameters.js";
  import "./ops/scatterSub.js";
  import "./ops/gatherSub.js";
  import "./ops/attentionMask.js";
+ import "./ops/qkv.js";
+ import "./ops/rope.js";
  export {
- l as CharTokeniser,
- m as NanoGPT,
- i as TeachableLLM,
- T as estimateMemoryUsage,
- g as estimateParameterCount,
- M as estimateResources,
- C as estimateTrainingMemoryUsage,
- x as loadTextData,
- c as validateConfig,
- d as waitForModel
+ d as CharTokeniser,
+ s as NanoGPT,
+ p as TeachableLLM,
+ M as estimateMemoryUsage,
+ C as estimateParameterCount,
+ c as estimateResources,
+ h as estimateTrainingMemoryUsage,
+ T as loadTextData,
+ y as validateConfig,
+ x as waitForModel
  };
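The public surface is unchanged: the re-export list is identical, only the minified local bindings moved, and the two new op modules (`ops/qkv.js`, `ops/rope.js`) are imported purely for their kernel-registration side effects. Existing consumer imports resolve exactly as before, e.g.:

```ts
// These named exports are the ones listed in the diff above, so imports that
// worked against 0.2.9 keep working against 0.2.11.
import {
  TeachableLLM,
  NanoGPT,
  CharTokeniser,
  estimateParameterCount,
  validateConfig,
} from "@genai-fi/nanogpt";
```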
package/dist/{mat_mul-BAYDrXvE.js → mat_mul-Bu7bhLms.js} RENAMED
@@ -1,4 +1,4 @@
- import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
+ import { o as m, d as s, f as c, E as M, B as f } from "./index-YPKosni4.js";
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -15,13 +15,13 @@ import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
  * limitations under the License.
  * =============================================================================
  */
- function f(e, o, n = !1, l = !1) {
+ function p(e, o, n = !1, l = !1) {
  let a = s(e, "a", "matMul"), t = s(o, "b", "matMul");
- [a, t] = m(a, t);
+ [a, t] = c(a, t);
  const r = { a, b: t }, u = { transposeA: n, transposeB: l };
- return M.runKernel(p, r, u);
+ return M.runKernel(f, r, u);
  }
- const i = /* @__PURE__ */ c({ matMul_: f });
+ const i = /* @__PURE__ */ m({ matMul_: p });
  export {
  i as m
  };
package/dist/ops/attentionMask.js CHANGED
@@ -1,14 +1,14 @@
- import { engine as l } from "@tensorflow/tfjs";
- import { r as u, b as k, s as d } from "../index-Dsg28SG6.js";
- import { m as p } from "../mat_mul-BAYDrXvE.js";
- class f {
+ import { engine as k } from "@tensorflow/tfjs";
+ import { r as m, c as d, s as p } from "../index-YPKosni4.js";
+ import { m as f } from "../mat_mul-Bu7bhLms.js";
+ class h {
  variableNames = ["q", "k", "mask"];
  outputShape;
  userCode;
  // enableShapeUniforms = true;
  customUniforms = [{ name: "divisor", type: "float" }];
- constructor(s, n, e, a) {
- this.outputShape = [s, n, e, e], this.userCode = `
+ constructor(e, n, s, a) {
+ this.outputShape = [e, n, s, s], this.userCode = `
  void main() {
  ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
  int b = coords.x;
@@ -34,49 +34,55 @@ class f {
  `;
  }
  }
- function h(t) {
- const { q: s, k: n, mask: e } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = s.shape[0], i = s.shape[2], c = s.shape[1], m = new f(r, c, i, s.shape[3]);
- return o.runWebGLProgram(m, [s, n, e], "float32", [[a]]);
+ function v(t) {
+ const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = e.shape[0], i = e.shape[2], c = e.shape[1], u = new h(r, c, i, e.shape[3]);
+ return o.runWebGLProgram(u, [e, n, s], "float32", [[a]]);
  }
- const v = {
+ const b = {
  kernelName: "AttentionMask",
  backendName: "webgl",
- kernelFunc: h
+ kernelFunc: v
  };
- u(v);
- function b(t) {
- const { q: s, k: n, mask: e } = t.inputs, { divisor: a } = t.attrs, o = s.shape[2], i = p(s, n, !1, !0).mul(d(a)), c = e.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+ m(b);
+ function l(t) {
+ const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = e.shape[2], i = f(e, n, !1, !0).mul(p(a)), c = s.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
  return i.add(c);
  }
  const M = {
  kernelName: "AttentionMask",
  backendName: "cpu",
- kernelFunc: b
+ kernelFunc: l
  };
- u(M);
- function w(t, s, n, e) {
- return l().runKernel("AttentionMask", { q: t, k: s, mask: n }, { divisor: e });
- }
+ m(M);
  const g = {
+ kernelName: "AttentionMask",
+ backendName: "tensorflow",
+ kernelFunc: l
+ };
+ m(g);
+ function N(t, e, n, s) {
+ return k().runKernel("AttentionMask", { q: t, k: e, mask: n }, { divisor: s });
+ }
+ const A = {
  kernelName: "AttentionMask",
  inputsToSave: ["q", "k"],
  outputsToSave: [],
- gradFunc: (t, s, n) => {
+ gradFunc: (t, e, n) => {
  if (Array.isArray(t))
  throw new Error("Expected dy to be a single Tensor");
- const [e, a] = s, { divisor: o } = n;
+ const [s, a] = e, { divisor: o } = n;
  return {
  q: () => t.matMul(a).mul(o),
- k: () => e.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
+ k: () => s.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
  mask: () => t,
  divisor: () => {
- const r = e.matMul(a, !1, !0);
+ const r = s.matMul(a, !1, !0);
  return t.mul(r).sum();
  }
  };
  }
  };
- k(g);
+ d(A);
  export {
- w as attentionMask
+ N as attentionMask
  };
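The substantive addition in this file is a third registration: the composite kernel that previously served only the `cpu` backend is now also registered for the `"tensorflow"` backend (tfjs-node), next to the existing WebGL shader program. De-minified, the pattern is roughly the following sketch against the public tfjs API (variable names are illustrative; the gradient mirrors the `gradFunc` in the diff, minus its `divisor` term):

```ts
import * as tf from "@tensorflow/tfjs";

const kernelName = "AttentionMask";

// Composite implementation: scaled q @ k^T plus a sliced causal mask, as in
// the cpu/"tensorflow" kernelFunc above.
function attentionMaskComposite(args: {
  inputs: tf.NamedTensorMap;
  attrs?: { divisor: number };
}): tf.Tensor {
  const { q, k, mask } = args.inputs as { q: tf.Tensor4D; k: tf.Tensor4D; mask: tf.Tensor2D };
  const { divisor } = args.attrs!;
  const T = q.shape[2];
  const scores = tf.matMul(q, k, false, true).mul(tf.scalar(divisor));
  return scores.add(mask.slice([0, 0], [T, T]).expandDims(0).expandDims(0));
}

// One kernel name, several backends; 0.2.11 adds the "tensorflow" entry.
for (const backendName of ["cpu", "tensorflow"]) {
  tf.registerKernel({ kernelName, backendName, kernelFunc: attentionMaskComposite as any });
}

// Registering a gradient makes the custom op usable during training.
tf.registerGradient({
  kernelName,
  inputsToSave: ["q", "k"],
  outputsToSave: [],
  gradFunc: (dy, saved, attrs) => {
    const g = dy as tf.Tensor4D;
    const [q, k] = saved as [tf.Tensor4D, tf.Tensor4D];
    const { divisor } = attrs as unknown as { divisor: number };
    return {
      q: () => g.matMul(k).mul(divisor),
      k: () => q.transpose([0, 1, 3, 2]).matMul(g).mul(divisor).transpose([0, 1, 3, 2]),
      mask: () => g,
    };
  },
});
```

Callers then invoke the op through `tf.engine().runKernel("AttentionMask", { q, k, mask }, { divisor })`, which is what the exported `attentionMask` wrapper does.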