@genai-fi/nanogpt 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,8 +23,8 @@ class nt extends y {
  projUnits;
  constructor(t, s) {
  super(s), this.index = t, this.units = s.gpt.nEmbed * 3, this.projUnits = s.gpt.nEmbed, this.bias = M.bandPart(q([s.gpt.blockSize, s.gpt.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.gpt.nEmbed / s.gpt.nHead);
- const e = B([s.gpt.blockSize, s.gpt.blockSize]), n = $([s.gpt.blockSize, s.gpt.blockSize], Number.NEGATIVE_INFINITY);
- this.maskInf = O(this.bias, e, n);
+ const e = B([s.gpt.blockSize, s.gpt.blockSize]), o = $([s.gpt.blockSize, s.gpt.blockSize], Number.NEGATIVE_INFINITY);
+ this.maskInf = O(this.bias, e, o);
  }
  build() {
  this.cAttn === null && (this.cAttn = g(
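For orientation, the mask built in this constructor is the standard additive causal mask: a lower-triangular band of zeros with -Infinity above the diagonal, added to the attention logits so softmax assigns zero weight to future positions. A minimal plain-TypeScript sketch of that shape (the helper name is hypothetical; the real code builds it as a tensor via bandPart and a select):

// Illustrative only: a plain-TypeScript analogue of the additive causal mask
// assembled in the hunk above (lower-triangular zeros, -Infinity elsewhere).
// buildCausalMask is a hypothetical helper, not part of this package's API.
function buildCausalMask(blockSize: number): number[][] {
  const mask: number[][] = [];
  for (let i = 0; i < blockSize; i++) {
    const row: number[] = [];
    for (let j = 0; j < blockSize; j++) {
      // Position j is visible from position i only when j <= i (no lookahead);
      // masked entries get -Infinity so softmax zeroes them out.
      row.push(j <= i ? 0 : Number.NEGATIVE_INFINITY);
    }
    mask.push(row);
  }
  return mask;
}

// buildCausalMask(3) => [[0, -Inf, -Inf], [0, 0, -Inf], [0, 0, 0]]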
@@ -57,87 +57,87 @@ class nt extends y {
  if (!e) throw new Error(`Weights for block_${this.index}_cProj not found`);
  this.cAttn ? this.cAttn.assign(s) : this.cAttn = g(s, !0), this.cProj ? this.cProj.assign(e) : this.cProj = g(e, !0);
  }
- getAttentionScores(t, s, e, n) {
- const o = I(t, s, this.maskInf, this.divisor);
- return S(o, e ? this.config.gpt.dropout : 0, n);
+ getAttentionScores(t, s, e, o) {
+ const n = I(t, s, this.maskInf, this.divisor);
+ return S(n, e ? this.config.gpt.dropout : 0, o);
  }
  // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
- getAttentionScoresWithPast(t, s, e, n, o) {
- const i = t.shape[2];
- let r = C(t, s, !1, !0).mul(this.divisor);
- if (i > 1 && n > 0)
+ getAttentionScoresWithPast(t, s, e) {
+ const o = t.shape[2];
+ let i = C(t, s, !1, !0).mul(this.divisor);
+ if (o > 1 && e > 0)
  throw new Error("Cannot use past with T_cur > 1");
- if (i > 1) {
- const c = this.maskInf.slice([0, 0], [i, i]).expandDims(0).expandDims(0);
- r = r.add(c);
+ if (o > 1) {
+ const r = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+ i = i.add(r);
  }
- return S(r, e ? this.config.gpt.dropout : 0, o);
+ return S(i, 0, 0);
  }
  getQKV(t) {
  return z(t, this.cAttn, this.config.gpt.nHead);
  }
  getOutputProjection(t) {
- const s = t.shape[0], e = t.shape[2], n = this.config.gpt.nEmbed, o = t.transpose([0, 2, 1, 3]), i = D(o, [s, e, n]);
+ const s = t.shape[0], e = t.shape[2], o = this.config.gpt.nEmbed, n = t.transpose([0, 2, 1, 3]), i = D(n, [s, e, o]);
  return N(i, this.cProj);
  }
  updateCache(t, s, e) {
- const n = this.config.gpt.blockSize, o = t.shape[2], i = Math.min(e?.length || 0, n - o), a = e ? E(e.k, t, n) : t, r = e ? E(e.v, s, n) : s;
+ const o = this.config.gpt.blockSize, n = t.shape[2], i = Math.min(e?.length || 0, o - n), r = e ? E(e.k, t, o) : t, a = e ? E(e.v, s, o) : s;
  return {
- k: _(a),
- v: _(r),
- length: i + o,
- cumulativeLength: e ? e.cumulativeLength + o : o
+ k: _(r),
+ v: _(a),
+ length: i + n,
+ cumulativeLength: e ? e.cumulativeLength + n : n
  };
  }
- forward(t, s = !1, e, n = !1, o) {
+ forward(t, s = !1, e, o = !1, n) {
  return x(() => {
  this.startMemory();
- const [i, a, r] = this.getQKV(t), c = o ? o.cumulativeLength : 0, h = this.config.layerConfig.ropeCache, d = h ? P(i, h, c) : i, p = h ? P(a, h, c) : a;
- h && (i.dispose(), a.dispose());
- const f = o ? o.length : 0, l = this.updateCache(p, r, o), m = l.k, b = l.v;
- o && (p.dispose(), r.dispose());
- let u;
- f > 0 ? u = this.getAttentionScoresWithPast(d, m, s, f, e) : u = this.getAttentionScores(d, m, s, e);
- const k = C(u, b), A = this.getOutputProjection(k), w = n ? u.mean(1) : void 0;
+ const [i, r, a] = this.getQKV(t), u = n ? n.cumulativeLength : 0, c = this.config.layerConfig.ropeCache, d = c ? P(i, c, u) : i, h = c ? P(r, c, u) : r;
+ c && (i.dispose(), r.dispose());
+ const f = n ? n.length : 0, l = this.updateCache(h, a, n), m = l.k, b = l.v;
+ n && (h.dispose(), a.dispose());
+ let p;
+ f > 0 ? p = this.getAttentionScoresWithPast(d, m, f) : p = this.getAttentionScores(d, m, s, e);
+ const k = C(p, b), A = this.getOutputProjection(k), w = o ? p.mean(1) : void 0;
  return this.endMemory("CausalSelfAttention"), { output: A, attention: w, presentKV: l };
  });
  }
- call(t, s = !1, e = !1, n) {
- if (n && !this.config.gpt.useRope)
+ call(t, s = !1, e = !1, o) {
+ if (o && !this.config.gpt.useRope)
  throw new Error("Cannot use pastKV without RoPE enabled");
- if (s && n)
+ if (s && o)
  throw new Error("Cannot use pastKV during training");
  if (t.shape.length !== 3)
  throw new Error(`Input tensor must be rank 3 [B, T, C], got shape ${t.shape}`);
  if (t.shape[2] !== this.config.gpt.nEmbed)
  throw new Error(`Input tensor last dimension must be ${this.config.gpt.nEmbed}, got ${t.shape[2]}`);
  this.build();
- const o = Math.random() * 1e9;
+ const n = Math.random() * 1e9;
  if (s && this.config.layerConfig.checkpointAttention) {
- const a = L(
+ const r = L(
  // @ts-expect-error Invalid params
- (r, c, h, d) => {
- const p = this.forward(r, !0, o);
- p.presentKV?.k.dispose(), p.presentKV?.v.dispose(), d([r]);
+ (a, u, c, d) => {
+ const h = this.forward(a, !0, n);
+ h.presentKV?.k.dispose(), h.presentKV?.v.dispose(), d([a]);
  const f = (l, m) => {
- const [b] = m, u = v().state.activeTape;
+ const [b] = m, p = v().state.activeTape;
  v().state.activeTape = [];
  const k = W((A, w, H) => {
- const j = this.forward(A, !0, o);
+ const j = this.forward(A, !0, n);
  return j.presentKV?.k.dispose(), j.presentKV?.v.dispose(), j.output;
- })([b, c, h], l);
- return v().state.activeTape = u, k;
+ })([b, u, c], l);
+ return v().state.activeTape = p, k;
  };
- return { value: p.output, gradFunc: f };
+ return { value: h.output, gradFunc: f };
  }
  )(t, this.cAttn, this.cProj);
  if (this.config.gpt.dropout > 0) {
- const r = U(a, this.config.gpt.dropout);
- return a.dispose(), { output: r };
+ const a = U(r, this.config.gpt.dropout);
+ return r.dispose(), { output: a };
  } else
- return { output: a };
+ return { output: r };
  } else
- return this.forward(t, s, o, e, n);
+ return this.forward(t, s, n, e, o);
  }
  dispose() {
  this.cAttn?.dispose(), this.cProj?.dispose(), this.bias.dispose(), this.maskInf.dispose();
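The comment on getAttentionScoresWithPast states the invariant this path relies on: with a KV cache, decoding proceeds one token at a time, and a single query may attend to every cached position, so no causal mask is needed; a multi-token pass (prefill) has no past and does need the triangular mask. As a standalone rule (hypothetical helper, plain TypeScript, not this package's API) it reduces to:

// Illustrative only: the masking rule described by the comment above,
// written as a standalone predicate.
function needsCausalMask(tCur: number, pastLen: number): boolean {
  // Mixing a multi-token step with cached past is rejected outright,
  // mirroring the "Cannot use past with T_cur > 1" check in the hunk.
  if (tCur > 1 && pastLen > 0) {
    throw new Error("Cannot use past with T_cur > 1");
  }
  // Only a multi-token (prefill) step needs the triangular mask.
  return tCur > 1;
}

// needsCausalMask(128, 0) => true  (prefill: mask the upper triangle)
// needsCausalMask(1, 127) => false (cached decode step: attend to all past keys)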
@@ -3858,8 +3858,8 @@ class Io {
  variableNames = ["logits", "maxLogits"];
  outputShape;
  userCode;
- constructor(e, n, o) {
- this.outputShape = [e, n, o, o], this.userCode = `
+ constructor(e) {
+ this.outputShape = e, this.userCode = `
  void main() {
  ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
  int b = coords.x;
@@ -3881,8 +3881,8 @@ class xo {
  { name: "dropoutRate", type: "float" },
  { name: "seed", type: "float" }
  ];
- constructor(e, n, o) {
- this.outputShape = [e, n, o, o], this.userCode = `
+ constructor(e) {
+ this.outputShape = e, this.userCode = `
  float random(ivec4 coords) {
  float x = float(coords.x * 4096 + coords.y * 256 + coords.z * 16 + coords.w);
  return fract(sin(seed + x) * 43758.5453123);
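The random() helper in this dropout program is the well-known sine-hash shader PRNG: flatten the 4-D output coordinate to a scalar, perturb it with a seed, and take the fractional part of a large sine multiple. A direct TypeScript transcription for reference (illustrative only; the real version runs as GLSL inside the kernel, and the exact comparison against dropoutRate is not shown in this hunk):

// Illustrative only: TypeScript port of fract(sin(seed + x) * 43758.5453123).
function fract(x: number): number {
  return x - Math.floor(x);
}

function shaderHash(seed: number, coords: [number, number, number, number]): number {
  const [x, y, z, w] = coords;
  // Same coordinate mixing as the GLSL source above.
  const flat = x * 4096 + y * 256 + z * 16 + w;
  return fract(Math.sin(seed + flat) * 43758.5453123);
}

// The kernel would compare values like this against a threshold such as
// dropoutRate to decide which elements to zero.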
@@ -3908,16 +3908,16 @@ function So(t) {
  inputs: { x: o },
  backend: a,
  attrs: { reductionIndices: u, keepDims: !1 }
- }), h = vt(c.shape, u), f = o.shape[0], w = o.shape[2], p = o.shape[1], m = new Io(f, p, w), b = a.runWebGLProgram(m, [o, c], "float32"), d = co({ inputs: { x: b }, backend: a, attrs: { axis: u, keepDims: !1 } }), g = st({ inputs: { x: d }, backend: a, attrs: { shape: h } });
+ }), h = vt(c.shape, u), f = new Io(o.shape), w = a.runWebGLProgram(f, [o, c], "float32"), p = co({ inputs: { x: w }, backend: a, attrs: { axis: u, keepDims: !1 } }), m = st({ inputs: { x: p }, backend: a, attrs: { shape: h } });
  if (r !== void 0 && r > 0) {
- const $ = new xo(f, p, w), E = a.runWebGLProgram($, [b, g], "float32", [
+ const d = new xo(o.shape), g = a.runWebGLProgram(d, [w, m], "float32", [
  [r],
  [i ?? Math.random() * 1e4]
  ]);
- return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(b), a.disposeIntermediateTensorInfo(d), a.disposeIntermediateTensorInfo(g), E;
+ return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(w), a.disposeIntermediateTensorInfo(p), a.disposeIntermediateTensorInfo(m), g;
  }
- const I = wo({ inputs: { a: b, b: g }, backend: a });
- return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(b), a.disposeIntermediateTensorInfo(d), a.disposeIntermediateTensorInfo(g), I;
+ const b = wo({ inputs: { a: w, b: m }, backend: a });
+ return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(w), a.disposeIntermediateTensorInfo(p), a.disposeIntermediateTensorInfo(m), b;
  }
  const bo = {
  kernelName: "FusedSoftmax",
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@genai-fi/nanogpt",
- "version": "0.4.0",
+ "version": "0.4.1",
  "type": "module",
  "main": "dist/main.js",
  "types": "dist/main.d.ts",