@genai-fi/nanogpt 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,8 +23,8 @@ class nt extends y {
   projUnits;
   constructor(t, s) {
     super(s), this.index = t, this.units = s.gpt.nEmbed * 3, this.projUnits = s.gpt.nEmbed, this.bias = M.bandPart(q([s.gpt.blockSize, s.gpt.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.gpt.nEmbed / s.gpt.nHead);
-    const e = B([s.gpt.blockSize, s.gpt.blockSize]),
-    this.maskInf = O(this.bias, e,
+    const e = B([s.gpt.blockSize, s.gpt.blockSize]), o = $([s.gpt.blockSize, s.gpt.blockSize], Number.NEGATIVE_INFINITY);
+    this.maskInf = O(this.bias, e, o);
   }
   build() {
     this.cAttn === null && (this.cAttn = g(
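
Note on the hunk above: in 0.4.1 the constructor precomputes an additive attention mask from the lower-triangular bias. Reading the minified helpers as their likely @tensorflow/tfjs counterparts (q as tf.ones, B as tf.zeros, $ as tf.fill, M.bandPart as tf.linalg.bandPart, O as tf.where; these mappings are assumptions, not something the diff confirms), a minimal sketch of the equivalent logic:

    // Sketch only: the helper names above are assumptions inferred from the call shapes.
    import * as tf from '@tensorflow/tfjs';

    function buildCausalMask(blockSize) {
      // Lower-triangular boolean matrix: token t may attend to positions <= t.
      const bias = tf.linalg.bandPart(tf.ones([blockSize, blockSize]), -1, 0).cast('bool');
      // Additive mask: 0 where attention is allowed, -Infinity where it is blocked.
      const maskInf = tf.where(
        bias,
        tf.zeros([blockSize, blockSize]),
        tf.fill([blockSize, blockSize], Number.NEGATIVE_INFINITY)
      );
      return { bias, maskInf };
    }
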
@@ -57,87 +57,87 @@ class nt extends y {
     if (!e) throw new Error(`Weights for block_${this.index}_cProj not found`);
     this.cAttn ? this.cAttn.assign(s) : this.cAttn = g(s, !0), this.cProj ? this.cProj.assign(e) : this.cProj = g(e, !0);
   }
-  getAttentionScores(t, s, e,
-    const
-    return S(
+  getAttentionScores(t, s, e, o) {
+    const n = I(t, s, this.maskInf, this.divisor);
+    return S(n, e ? this.config.gpt.dropout : 0, o);
   }
   // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
-  getAttentionScoresWithPast(t, s, e
-    const
-    let
-    if (
+  getAttentionScoresWithPast(t, s, e) {
+    const o = t.shape[2];
+    let i = C(t, s, !1, !0).mul(this.divisor);
+    if (o > 1 && e > 0)
       throw new Error("Cannot use past with T_cur > 1");
-    if (
-      const
-
+    if (o > 1) {
+      const r = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+      i = i.add(r);
     }
-    return S(
+    return S(i, 0, 0);
   }
   getQKV(t) {
     return z(t, this.cAttn, this.config.gpt.nHead);
   }
   getOutputProjection(t) {
-    const s = t.shape[0], e = t.shape[2],
+    const s = t.shape[0], e = t.shape[2], o = this.config.gpt.nEmbed, n = t.transpose([0, 2, 1, 3]), i = D(n, [s, e, o]);
     return N(i, this.cProj);
   }
   updateCache(t, s, e) {
-    const
+    const o = this.config.gpt.blockSize, n = t.shape[2], i = Math.min(e?.length || 0, o - n), r = e ? E(e.k, t, o) : t, a = e ? E(e.v, s, o) : s;
     return {
-      k: _(
-      v: _(
-      length: i +
-      cumulativeLength: e ? e.cumulativeLength +
+      k: _(r),
+      v: _(a),
+      length: i + n,
+      cumulativeLength: e ? e.cumulativeLength + n : n
     };
   }
-  forward(t, s = !1, e,
+  forward(t, s = !1, e, o = !1, n) {
     return x(() => {
       this.startMemory();
-      const [i,
-
-      const f =
-
-      let
-      f > 0 ?
-      const k = C(
+      const [i, r, a] = this.getQKV(t), u = n ? n.cumulativeLength : 0, c = this.config.layerConfig.ropeCache, d = c ? P(i, c, u) : i, h = c ? P(r, c, u) : r;
+      c && (i.dispose(), r.dispose());
+      const f = n ? n.length : 0, l = this.updateCache(h, a, n), m = l.k, b = l.v;
+      n && (h.dispose(), a.dispose());
+      let p;
+      f > 0 ? p = this.getAttentionScoresWithPast(d, m, f) : p = this.getAttentionScores(d, m, s, e);
+      const k = C(p, b), A = this.getOutputProjection(k), w = o ? p.mean(1) : void 0;
       return this.endMemory("CausalSelfAttention"), { output: A, attention: w, presentKV: l };
     });
   }
-  call(t, s = !1, e = !1,
-    if (
+  call(t, s = !1, e = !1, o) {
+    if (o && !this.config.gpt.useRope)
       throw new Error("Cannot use pastKV without RoPE enabled");
-    if (s &&
+    if (s && o)
       throw new Error("Cannot use pastKV during training");
     if (t.shape.length !== 3)
       throw new Error(`Input tensor must be rank 3 [B, T, C], got shape ${t.shape}`);
     if (t.shape[2] !== this.config.gpt.nEmbed)
      throw new Error(`Input tensor last dimension must be ${this.config.gpt.nEmbed}, got ${t.shape[2]}`);
     this.build();
-    const
+    const n = Math.random() * 1e9;
     if (s && this.config.layerConfig.checkpointAttention) {
-      const
+      const r = L(
         // @ts-expect-error Invalid params
-        (
-          const
-
+        (a, u, c, d) => {
+          const h = this.forward(a, !0, n);
+          h.presentKV?.k.dispose(), h.presentKV?.v.dispose(), d([a]);
           const f = (l, m) => {
-            const [b] = m,
+            const [b] = m, p = v().state.activeTape;
             v().state.activeTape = [];
             const k = W((A, w, H) => {
-              const j = this.forward(A, !0,
+              const j = this.forward(A, !0, n);
               return j.presentKV?.k.dispose(), j.presentKV?.v.dispose(), j.output;
-            })([b,
-            return v().state.activeTape =
+            })([b, u, c], l);
+            return v().state.activeTape = p, k;
           };
-          return { value:
+          return { value: h.output, gradFunc: f };
         }
       )(t, this.cAttn, this.cProj);
       if (this.config.gpt.dropout > 0) {
-        const
-        return
+        const a = U(r, this.config.gpt.dropout);
+        return r.dispose(), { output: a };
       } else
-        return { output:
+        return { output: r };
     } else
-      return this.forward(t, s,
+      return this.forward(t, s, n, e, o);
   }
   dispose() {
     this.cAttn?.dispose(), this.cProj?.dispose(), this.bias.dispose(), this.maskInf.dispose();
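
Note on the hunk above: 0.4.1 threads an optional KV cache through forward() and call(); RoPE offsets come from cumulativeLength, and pastKV is rejected during training or when RoPE is disabled. The commented rule (with a cached past only a single-token query is allowed, so no causal mask is needed) is what getAttentionScoresWithPast implements. A minimal sketch, assuming C maps to tf.matMul and S to a softmax helper; the real minified helpers may differ:

    // Sketch only: mirrors getAttentionScoresWithPast from the hunk above, with assumed tfjs equivalents.
    import * as tf from '@tensorflow/tfjs';

    function attentionScoresWithPast(q, k, maskInf, divisor, pastLen) {
      const tCur = q.shape[2];                                 // q: [B, nH, T_cur, headDim]
      let scores = tf.matMul(q, k, false, true).mul(divisor);  // [B, nH, T_cur, T_total]
      if (tCur > 1 && pastLen > 0)
        throw new Error('Cannot use past with T_cur > 1');
      if (tCur > 1) {
        // Prompt processing without a past: apply the precomputed additive causal mask.
        const mask = maskInf.slice([0, 0], [tCur, tCur]).expandDims(0).expandDims(0);
        scores = scores.add(mask);
      }
      // Single-token decode with a past needs no mask: every cached position is in the past.
      return tf.softmax(scores);
    }
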
@@ -3858,8 +3858,8 @@ class Io {
   variableNames = ["logits", "maxLogits"];
   outputShape;
   userCode;
-  constructor(e
-    this.outputShape =
+  constructor(e) {
+    this.outputShape = e, this.userCode = `
   void main() {
     ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
     int b = coords.x;
@@ -3881,8 +3881,8 @@ class xo {
     { name: "dropoutRate", type: "float" },
     { name: "seed", type: "float" }
   ];
-  constructor(e
-    this.outputShape =
+  constructor(e) {
+    this.outputShape = e, this.userCode = `
   float random(ivec4 coords) {
     float x = float(coords.x * 4096 + coords.y * 256 + coords.z * 16 + coords.w);
     return fract(sin(seed + x) * 43758.5453123);
@@ -3908,16 +3908,16 @@ function So(t) {
     inputs: { x: o },
     backend: a,
     attrs: { reductionIndices: u, keepDims: !1 }
-  }), h = vt(c.shape, u), f =
+  }), h = vt(c.shape, u), f = new Io(o.shape), w = a.runWebGLProgram(f, [o, c], "float32"), p = co({ inputs: { x: w }, backend: a, attrs: { axis: u, keepDims: !1 } }), m = st({ inputs: { x: p }, backend: a, attrs: { shape: h } });
   if (r !== void 0 && r > 0) {
-    const
+    const d = new xo(o.shape), g = a.runWebGLProgram(d, [w, m], "float32", [
       [r],
       [i ?? Math.random() * 1e4]
     ]);
-    return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(
+    return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(w), a.disposeIntermediateTensorInfo(p), a.disposeIntermediateTensorInfo(m), g;
   }
-  const
-  return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(
+  const b = wo({ inputs: { a: w, b: m }, backend: a });
+  return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(w), a.disposeIntermediateTensorInfo(p), a.disposeIntermediateTensorInfo(m), b;
 }
 const bo = {
   kernelName: "FusedSoftmax",
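
Note on the last three hunks: class Io is the WebGL program that computes exp(logits - maxLogits), class xo fuses the normalizing division with a seed-keyed pseudo-random drop, and So() now disposes every intermediate tensor it creates. A reference sketch of what the fused path computes, written with stock tfjs ops rather than the actual shader code:

    // Sketch only: reference semantics for the FusedSoftmax kernel path, using plain tfjs ops.
    import * as tf from '@tensorflow/tfjs';

    function fusedSoftmaxReference(logits, dropoutRate = 0, seed) {
      const max = tf.max(logits, -1, true);        // maxLogits, for numerical stability
      const exps = tf.exp(tf.sub(logits, max));    // what program Io produces
      const sum = tf.sum(exps, -1, true);
      const soft = tf.div(exps, sum);              // the plain division on the no-dropout path
      // Program xo fuses the division with a seed-keyed drop; tf.dropout stands in for it here.
      return dropoutRate > 0 ? tf.dropout(soft, dropoutRate, undefined, seed) : soft;
    }
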