npm - @genai-fi/nanogpt - Versions diffs - 0.4.4 → 0.4.5 - Mend

@genai-fi/nanogpt 0.4.4 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/NanoGPTModel.js +1 -1
package/dist/TeachableLLM.js +7 -4
package/dist/layers/CausalSelfAttention.js +44 -43
package/dist/layers/RMSNorm.d.ts +1 -2
package/dist/layers/RMSNorm.js +9 -9
package/dist/layers/TransformerBlock.js +1 -1
package/dist/main.js +21 -18
package/dist/ops/cpu/normRMS.d.ts +1 -0
package/dist/ops/cpu/normRMS.js +39 -0
package/dist/ops/grads/normRMS.d.ts +2 -0
package/dist/ops/grads/normRMS.js +20 -0
package/dist/ops/normRMS.d.ts +2 -0
package/dist/ops/normRMS.js +10 -0
package/dist/ops/webgl/matMulGelu.d.ts +3 -2
package/dist/ops/webgl/matMulGelu.js +72 -70
package/dist/ops/webgl/normRMS.d.ts +1 -0
package/dist/ops/webgl/normRMS.js +78 -0
package/package.json +1 -1

package/dist/NanoGPTModel.js CHANGED Viewed

@@ -132,7 +132,7 @@ class wt extends B {
     }) : (this.ropeCache = new K(this.config.gpt), this.config.layerConfig.ropeCache = this.ropeCache), this.drop = st({ rate: this.config.gpt.dropout }), this.blocks = [];
     for (let e = 0; e < this.config.gpt.nLayer; e++)
       this.blocks.push(new W(e, this.config));
-    this.lnF = new N(this.config, 1e-8, "final_rms_norm");
+    this.lnF = new N(this.config, "final_rms_norm");
   }
   get checkpointing() {
     return this.config.layerConfig.checkpointAttention === !0 || this.config.layerConfig.checkpointMLP === !0;

package/dist/TeachableLLM.js CHANGED Viewed

@@ -3,8 +3,8 @@ import l from "./NanoGPTModel.js";
 import { saveModel as d } from "./utilities/save.js";
 import { loadModel as f } from "./utilities/load.js";
 import u from "./Generator.js";
-import _ from "./Trainer.js";
-import { E as p } from "./index-Dwqa6Zy2.js";
+import p from "./Trainer.js";
+import { E as _ } from "./index-Dwqa6Zy2.js";
 import { dummyPassAsync as m } from "./utilities/dummy.js";
 import c from "./tokeniser/CharTokeniser.js";
 import g from "./tokeniser/bpe.js";
@@ -37,9 +37,12 @@ import "./ops/grads/matMulGelu.js";
 import "./ops/cpu/gelu.js";
 import "./ops/webgl/gelu.js";
 import "./ops/grads/gelu.js";
+import "./ops/cpu/normRMS.js";
+import "./ops/webgl/normRMS.js";
+import "./ops/grads/normRMS.js";
 import w from "./utilities/profile.js";
 class a {
-  ee = new p();
+  ee = new _();
   _config;
   _model;
   _tokeniser;
@@ -126,7 +129,7 @@ class a {
   trainer() {
     if (!this._model || !this._tokeniser)
       throw new Error("Model or tokeniser is not initialized.");
-    const t = new _(this._model, this._tokeniser);
+    const t = new p(this._model, this._tokeniser);
     return t.on("start", () => this.setStatus("training")), t.on("stop", () => this.setStatus("ready")), t.on("log", async (e) => {
       const i = this.ee.listeners("trainStep");
       for (const o of i)

package/dist/layers/CausalSelfAttention.js CHANGED Viewed

@@ -6,12 +6,12 @@ import { appendCache as E } from "../ops/appendCache.js";
 import { D as z, F as S, t as $, c as L, e as j, H as O } from "../index--6vO-cOz.js";
 import { fusedSoftmax as _ } from "../ops/fusedSoftmax.js";
 import { l as W, w as M, d as x } from "../tfjs_backend-DuKis_xG.js";
-import { o as N } from "../ones-D6kB8bdY.js";
-import { v as A } from "../variable-BJTZ3jOy.js";
-import { z as q } from "../zeros-8xl-W2DC.js";
+import { o as q } from "../ones-D6kB8bdY.js";
+import { v as b } from "../variable-BJTZ3jOy.js";
+import { z as B } from "../zeros-8xl-W2DC.js";
 import { r as C, d as I } from "../dropout-DFEXTPV0.js";
-import { r as B } from "../reshape-z51Eu-re.js";
-import { m as F } from "../mat_mul-BEHRPMh0.js";
+import { r as F } from "../reshape-z51Eu-re.js";
+import { m as H } from "../mat_mul-BEHRPMh0.js";
 class nt extends T {
   cAttn = null;
   cProj = null;
@@ -23,16 +23,16 @@ class nt extends T {
   units;
   projUnits;
   constructor(t, s) {
-    super(s), this.index = t, this.units = s.gpt.nEmbed * 3, this.projUnits = s.gpt.nEmbed, this.bias = W.bandPart(N([s.gpt.blockSize, s.gpt.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.gpt.nEmbed / s.gpt.nHead);
-    const o = q([s.gpt.blockSize, s.gpt.blockSize]), e = z([s.gpt.blockSize, s.gpt.blockSize], Number.NEGATIVE_INFINITY);
-    this.maskInf = M(this.bias, o, e);
+    super(s), this.index = t, this.units = s.gpt.nEmbed * 3, this.projUnits = s.gpt.nEmbed, this.bias = W.bandPart(q([s.gpt.blockSize, s.gpt.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.gpt.nEmbed / s.gpt.nHead);
+    const e = B([s.gpt.blockSize, s.gpt.blockSize]), o = z([s.gpt.blockSize, s.gpt.blockSize], Number.NEGATIVE_INFINITY);
+    this.maskInf = M(this.bias, e, o);
   }
   build() {
-    this.cAttn === null && (this.cAttn = A(
+    this.cAttn === null && (this.cAttn = b(
       C([this.config.gpt.nEmbed, this.units], 0, 0.02),
       !0
       //`block_${this.index}_attn_cAttn_kernel`
-    )), this.cProj === null && (this.cProj = A(
+    )), this.cProj === null && (this.cProj = b(
       C([this.projUnits, this.config.gpt.nEmbed], 0, 0.02),
       !0
       //`block_${this.index}_attn_cProj_kernel`
@@ -53,57 +53,58 @@ class nt extends T {
     t.set(`block_${this.index}_cAttn`, this.cAttn ? [this.cAttn.clone()] : []), t.set(`block_${this.index}_cProj`, this.cProj ? [this.cProj.clone()] : []);
   }
   loadWeights(t) {
-    const s = t.get(`block_${this.index}_cAttn`)?.[0], o = t.get(`block_${this.index}_cProj`)?.[0];
+    const s = t.get(`block_${this.index}_cAttn`)?.[0], e = t.get(`block_${this.index}_cProj`)?.[0];
     if (!s) throw new Error(`Weights for block_${this.index}_cAttn not found`);
-    if (!o) throw new Error(`Weights for block_${this.index}_cProj not found`);
-    this.cAttn ? this.cAttn.assign(s) : this.cAttn = A(s, !0), this.cProj ? this.cProj.assign(o) : this.cProj = A(o, !0);
+    if (!e) throw new Error(`Weights for block_${this.index}_cProj not found`);
+    this.cAttn ? this.cAttn.assign(s) : this.cAttn = b(s, !0), this.cProj ? this.cProj.assign(e) : this.cProj = b(e, !0);
   }
-  getAttentionScores(t, s, o, e) {
+  getAttentionScores(t, s, e, o) {
     const i = P(t, s, this.divisor, this.maskInf);
-    return _(i, o ? this.config.gpt.dropout : 0, e);
+    return _(i, e ? this.config.gpt.dropout : 0, o);
   }
   // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
-  getAttentionScoresWithPast(t, s, o) {
-    const e = P(t, s, this.divisor, void 0, o);
-    return _(e, 0, 0);
+  getAttentionScoresWithPast(t, s, e) {
+    const o = P(t, s, this.divisor, void 0, e);
+    return _(o, 0, 0);
   }
   getQKV(t) {
     return y(t, this.cAttn, this.config.gpt.nHead);
   }
   getOutputProjection(t) {
-    const s = t.shape[0], o = t.shape[2], e = this.config.gpt.nEmbed, i = t.transpose([0, 2, 1, 3]), n = B(i, [s, o, e]);
+    const s = t.shape[0], e = t.shape[2], o = this.config.gpt.nEmbed, i = t.transpose([0, 2, 1, 3]), n = F(i, [s, e, o]);
     return x(n, this.cProj);
   }
-  updateCache(t, s, o, e) {
-    const i = this.config.gpt.blockSize, n = t.shape[2], r = e?.length || 0, a = o ? t : E(t, i, r, e?.k), p = o ? s : E(s, i, r, e?.v);
-    return {
+  updateCache(t, s, e, o) {
+    const i = this.config.gpt.blockSize, n = t.shape[2], r = o?.length || 0, a = e ? t : E(t, i, r, o?.k);
+    e || (t.dispose(), o?.k.dispose());
+    const p = e ? s : E(s, i, r, o?.v);
+    return e || (s.dispose(), o?.v.dispose()), {
       k: S(a),
       v: S(p),
       length: Math.min(r + n, i),
-      cumulativeLength: e ? e.cumulativeLength + n : n
+      cumulativeLength: o ? o.cumulativeLength + n : n
     };
   }
-  forward(t, s = !1, o, e = !1, i) {
+  forward(t, s = !1, e, o = !1, i) {
     return $(() => {
       this.startMemory();
-      const [n, r, a] = this.getQKV(t), p = i ? i.cumulativeLength : 0, c = this.config.layerConfig.ropeCache, u = c ? w(n, c, p) : n, f = c ? w(r, c, p) : r;
+      const [n, r, a] = this.getQKV(t), p = i ? i.cumulativeLength : 0, c = this.config.layerConfig.ropeCache, u = c ? w(n, c, p) : n, A = c ? w(r, c, p) : r;
       c && (n.dispose(), r.dispose());
-      const g = i ? i.length : 0, d = this.updateCache(f, a, s, i), l = d.k, m = d.v;
-      i && (f.dispose(), a.dispose());
+      const f = i ? i.length : 0, d = this.updateCache(A, a, s, i), l = d.k, g = d.v;
       let h;
-      g > 0 ? h = this.getAttentionScoresWithPast(u, l, g) : h = this.getAttentionScores(u, l, s, o), u.dispose(), s && l.dispose();
-      const b = F(h, m);
-      e || h.dispose(), s && m.dispose();
-      const k = this.getOutputProjection(b);
-      b.dispose();
-      const v = e ? h.mean(1) : void 0;
+      f > 0 ? h = this.getAttentionScoresWithPast(u, l, f) : h = this.getAttentionScores(u, l, s, e), u.dispose(), s && l.dispose();
+      const m = H(h, g);
+      o || h.dispose(), s && g.dispose();
+      const k = this.getOutputProjection(m);
+      m.dispose();
+      const v = o ? h.mean(1) : void 0;
       return this.endMemory("CausalSelfAttention"), { output: k, attention: v, presentKV: s ? void 0 : d };
     });
   }
-  call(t, s = !1, o = !1, e) {
-    if (e && !this.config.gpt.useRope)
+  call(t, s = !1, e = !1, o) {
+    if (o && !this.config.gpt.useRope)
       throw new Error("Cannot use pastKV without RoPE enabled");
-    if (s && e)
+    if (s && o)
       throw new Error("Cannot use pastKV during training");
     if (t.shape.length !== 3)
       throw new Error(`Input tensor must be rank 3 [B, T, C], got shape ${t.shape}`);
@@ -115,15 +116,15 @@ class nt extends T {
       const r = L(
         // @ts-expect-error Invalid params
         (a, p, c, u) => {
-          const f = this.forward(a, !0, i);
+          const A = this.forward(a, !0, i);
           u([a]);
-          const g = (d, l) => {
-            const [m] = l, h = j().state.activeTape;
+          const f = (d, l) => {
+            const [g] = l, h = j().state.activeTape;
             j().state.activeTape = [];
-            const b = O((k, v, R) => this.forward(k, !0, i).output)([m, p, c], d);
-            return j().state.activeTape = h, b;
+            const m = O((k, v, R) => this.forward(k, !0, i).output)([g, p, c], d);
+            return j().state.activeTape = h, m;
           };
-          return { value: f.output, gradFunc: g };
+          return { value: A.output, gradFunc: f };
         }
       )(t, this.cAttn, this.cProj);
       if (this.config.gpt.dropout > 0) {
@@ -132,7 +133,7 @@ class nt extends T {
       } else
         return { output: r };
     } else {
-      const n = this.forward(t, s, i, o, e);
+      const n = this.forward(t, s, i, e, o);
       if (this.config.gpt.dropout > 0) {
         const r = I(n.output, this.config.gpt.dropout);
         return n.output.dispose(), { output: r, attention: n.attention, presentKV: n.presentKV };

package/dist/layers/RMSNorm.d.ts CHANGED Viewed

@@ -2,8 +2,7 @@ import { Tensor, Variable } from '@tensorflow/tfjs-core';
 import { default as BaseLayer, GPTLayerConfig } from './BaseLayer';
 export default class RMSNorm extends BaseLayer {
     private gamma;
-    private epsilon;
-    constructor(config: GPTLayerConfig, epsilon?: number, name?: string);
+    constructor(config: GPTLayerConfig, name?: string);
     get trainableWeights(): Variable[];
     set trainable(value: boolean);
     getWeights(): Tensor[];

package/dist/layers/RMSNorm.js CHANGED Viewed

@@ -1,12 +1,12 @@
 import { t as r } from "../index--6vO-cOz.js";
 import m from "./BaseLayer.js";
-import { v as i } from "../variable-BJTZ3jOy.js";
-import { o } from "../ones-D6kB8bdY.js";
-class d extends m {
+import { normRMS as s } from "../ops/normRMS.js";
+import { v as e } from "../variable-BJTZ3jOy.js";
+import { o as i } from "../ones-D6kB8bdY.js";
+class u extends m {
   gamma;
-  epsilon;
-  constructor(t, s = 1e-8, a = "") {
-    super(t), this.epsilon = s, this.gamma = i(o([t.gpt.nEmbed]), !0, `${a}_gamma`, "float32");
+  constructor(t, a = "") {
+    super(t), this.gamma = e(i([t.gpt.nEmbed]), !0, `${a}_gamma`, "float32");
   }
   get trainableWeights() {
     return [this.gamma];
@@ -23,8 +23,8 @@ class d extends m {
   apply(t) {
     return r(() => {
       this.startMemory();
-      const a = t.square().mean(-1, !0).add(this.epsilon).rsqrt(), e = t.mul(a).mul(this.gamma);
-      return this.endMemory("RMSNorm"), e;
+      const a = s(t, this.gamma);
+      return this.endMemory("RMSNorm"), a;
     });
   }
   dispose() {
@@ -32,5 +32,5 @@ class d extends m {
   }
 }
 export {
-  d as default
+  u as default
 };

package/dist/layers/TransformerBlock.js CHANGED Viewed

@@ -12,7 +12,7 @@ class W extends p {
   _trainable = !0;
   skipped = !1;
   constructor(t, s) {
-    super(s), this.index = t, this.ln1 = new a(s, 1e-8, `block_${this.index}_rms1`), this.attn = new h(this.index, s), this.ln2 = new a(s, 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.index, s);
+    super(s), this.index = t, this.ln1 = new a(s, `block_${this.index}_rms1`), this.attn = new h(this.index, s), this.ln2 = new a(s, `block_${this.index}_rms2`), this.mlp = new o(this.index, s);
   }
   get variables() {
     return [

package/dist/main.js CHANGED Viewed

@@ -1,10 +1,10 @@
-import { default as w } from "./NanoGPTModel.js";
-import { default as D } from "./TeachableLLM.js";
-import { default as F } from "./tokeniser/CharTokeniser.js";
-import { default as N } from "./tokeniser/bpe.js";
-import { default as j } from "./utilities/waitForModel.js";
-import { default as z } from "./data/textLoader.js";
-import { estimateMemoryUsage as H, estimateParameterCount as I, estimateResources as J, estimateTrainingMemoryUsage as K, validateConfig as O } from "./utilities/parameters.js";
+import { default as E } from "./NanoGPTModel.js";
+import { default as G } from "./TeachableLLM.js";
+import { default as R } from "./tokeniser/CharTokeniser.js";
+import { default as q } from "./tokeniser/bpe.js";
+import { default as A } from "./utilities/waitForModel.js";
+import { default as I } from "./data/textLoader.js";
+import { estimateMemoryUsage as K, estimateParameterCount as O, estimateResources as Q, estimateTrainingMemoryUsage as S, validateConfig as V } from "./utilities/parameters.js";
 import "./index--6vO-cOz.js";
 import "./ops/cpu/scatterSub.js";
 import "./ops/webgl/scatterSub.js";
@@ -31,16 +31,19 @@ import "./ops/grads/matMulGelu.js";
 import "./ops/cpu/gelu.js";
 import "./ops/webgl/gelu.js";
 import "./ops/grads/gelu.js";
+import "./ops/cpu/normRMS.js";
+import "./ops/webgl/normRMS.js";
+import "./ops/grads/normRMS.js";
 export {
-  N as BPETokeniser,
-  F as CharTokeniser,
-  w as NanoGPT,
-  D as TeachableLLM,
-  H as estimateMemoryUsage,
-  I as estimateParameterCount,
-  J as estimateResources,
-  K as estimateTrainingMemoryUsage,
-  z as loadTextData,
-  O as validateConfig,
-  j as waitForModel
+  q as BPETokeniser,
+  R as CharTokeniser,
+  E as NanoGPT,
+  G as TeachableLLM,
+  K as estimateMemoryUsage,
+  O as estimateParameterCount,
+  Q as estimateResources,
+  S as estimateTrainingMemoryUsage,
+  I as loadTextData,
+  V as validateConfig,
+  A as waitForModel
 };

package/dist/ops/cpu/normRMS.d.ts ADDED Viewed

@@ -0,0 +1 @@
+export {};

package/dist/ops/cpu/normRMS.js ADDED Viewed

@@ -0,0 +1,39 @@
+import { r as o, t as d } from "../../index--6vO-cOz.js";
+function i(t) {
+  const { inputs: e } = t, { x: n, gamma: s } = e, r = n, a = s;
+  return d(() => {
+    const u = r.square().mean(-1, !0).add(1e-8).rsqrt();
+    return r.mul(u).mul(a);
+  });
+}
+const f = {
+  kernelName: "RMSNorm",
+  backendName: "cpu",
+  kernelFunc: i
+};
+o(f);
+const g = {
+  kernelName: "RMSNorm",
+  backendName: "tensorflow",
+  kernelFunc: i
+};
+o(g);
+function N(t) {
+  const { dy: e, x: n, gamma: s } = t.inputs;
+  return d(() => {
+    const r = n.shape[n.shape.length - 1], a = n.square().mean(-1, !0), m = a.add(1e-8).rsqrt(), u = n.mul(m), l = e.mul(u).sum([0, 1]), c = e.mul(s), k = c.mul(n).sum(-1, !0).div(r);
+    return [c.mul(m).sub(n.mul(k).mul(m).div(a.add(1e-8))), l];
+  });
+}
+const S = {
+  kernelName: "RMSNormGrad",
+  backendName: "cpu",
+  kernelFunc: N
+};
+o(S);
+const R = {
+  kernelName: "RMSNormGrad",
+  backendName: "tensorflow",
+  kernelFunc: N
+};
+o(R);

package/dist/ops/grads/normRMS.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+import { GradConfig } from '@tensorflow/tfjs-core';
+export declare const normRMSGradConfig: GradConfig;

package/dist/ops/grads/normRMS.js ADDED Viewed

@@ -0,0 +1,20 @@
+import { g as t, e as g } from "../../index--6vO-cOz.js";
+function s(r, a, n) {
+  return g().runKernel("RMSNormGrad", { dy: r, x: a, gamma: n });
+}
+const u = {
+  kernelName: "RMSNorm",
+  inputsToSave: ["x", "gamma"],
+  outputsToSave: [],
+  gradFunc: (r, a) => {
+    const [n, e] = a, [m, o] = s(r, n, e);
+    return {
+      x: () => m,
+      gamma: () => o
+    };
+  }
+};
+t(u);
+export {
+  u as normRMSGradConfig
+};

package/dist/ops/normRMS.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+import { Tensor } from '@tensorflow/tfjs-core';
+export declare function normRMS(x: Tensor, gamma: Tensor): Tensor;

package/dist/ops/normRMS.js ADDED Viewed

@@ -0,0 +1,10 @@
+import { e as n } from "../index--6vO-cOz.js";
+import "./cpu/normRMS.js";
+import "./webgl/normRMS.js";
+import "./grads/normRMS.js";
+function p(r, o) {
+  return n().runKernel("RMSNorm", { x: r, gamma: o });
+}
+export {
+  p as normRMS
+};

package/dist/ops/webgl/matMulGelu.d.ts CHANGED Viewed

@@ -7,9 +7,10 @@ type BatchMatMulConfig = {
     transposeA: boolean;
     transposeB: boolean;
     backend: MathBackendWebGL;
-    activationSnippet: string;
+    activationSnippet?: string;
+    multiplier?: TensorInfo;
 };
-export declare function batchMatMulGeluImpl({ a, b, transposeA, transposeB, backend, activationSnippet, }: BatchMatMulConfig): TensorInfo;
+export declare function batchMatMulGeluImpl({ a, b, transposeA, transposeB, backend, activationSnippet, multiplier, }: BatchMatMulConfig): TensorInfo;
 export declare function batchMatMulKernel(args: {
     inputs: {
         x: TensorInfo;

package/dist/ops/webgl/matMulGelu.js CHANGED Viewed

@@ -1,7 +1,7 @@
-import { r as G, t as P, e as R, b as I, n as k, O as L, j as F, Q as U } from "../../index--6vO-cOz.js";
-import { r as g } from "../../Reshape-CiAY8ltP.js";
+import { r as C, t as R, e as I, n as G, O as L, j as F, Q as U } from "../../index--6vO-cOz.js";
+import { r as S } from "../../Reshape-CiAY8ltP.js";
 import { u as H } from "../../gpgpu_math-CUzjlO9A.js";
-import { m as z } from "../../mat_mul-BEHRPMh0.js";
+import { m as B } from "../../mat_mul-BEHRPMh0.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -19,39 +19,39 @@ import { m as z } from "../../mat_mul-BEHRPMh0.js";
  * =============================================================================
  */
 class W {
-  constructor(e, s, a, n = !1, c = !1, o = !1, r = null, i = !1, u = !1) {
-    this.variableNames = ["matrixA", "matrixB"], this.packedInputs = !0, this.packedOutput = !0, this.outputShape = a, this.enableShapeUniforms = H(this.outputShape.length);
-    const p = n ? e[1] : e[2], l = Math.ceil(p / 2), b = n ? "i * 2, rc.y" : "rc.y, i * 2", M = c ? "rc.z, i * 2" : "i * 2, rc.z", h = n ? ["a.xxyy", "a.zzww"] : ["a.xxzz", "a.yyww"], d = c ? ["b.xzxz", "b.ywyw"] : ["b.xyxy", "b.zwzw"];
-    let m = "", v = "";
-    r && (i ? m = `vec4 activation(vec4 a) {
+  constructor(e, s, n, a = !1, c = !1, o = !1, r = null, u = !1, l = !1) {
+    this.variableNames = ["matrixA", "matrixB"], this.packedInputs = !0, this.packedOutput = !0, this.outputShape = n, this.enableShapeUniforms = H(this.outputShape.length);
+    const h = a ? e[1] : e[2], p = Math.ceil(h / 2), d = a ? "i * 2, rc.y" : "rc.y, i * 2", $ = c ? "rc.z, i * 2" : "i * 2, rc.z", x = a ? ["a.xxyy", "a.zzww"] : ["a.xxzz", "a.yyww"], m = c ? ["b.xzxz", "b.ywyw"] : ["b.xyxy", "b.zwzw"];
+    let i = "", b = "";
+    r && (u ? i = `vec4 activation(vec4 a) {
           vec4 b = getPreluActivationWeightsAtOutCoords();
           ${r}
-        }` : u ? m = `vec4 activation(vec4 a) {
+        }` : l ? i = `vec4 activation(vec4 a) {
           vec4 b = getLeakyreluAlphaAtOutCoords();
           ${r}
-        }` : m = `vec4 activation(vec4 x) {
+        }` : i = `vec4 activation(vec4 x) {
           ${r}
-        }`, v = "result = activation(result);");
-    const $ = o ? "result += getBiasAtOutCoords();" : "";
-    o && this.variableNames.push("bias"), i && this.variableNames.push("preluActivationWeights"), u && this.variableNames.push("leakyreluAlpha");
-    let f = "rc.x", x = "rc.x";
-    e[0] < s[0] ? f = `imod(rc.x, ${e[0]})` : s[0] < e[0] && (x = `imod(rc.x, ${s[0]})`), this.userCode = `
-      ${m}
+        }`, b = "result = activation(result);");
+    const M = o ? "result += getBiasAtOutCoords();" : "";
+    o && this.variableNames.push("bias"), u && this.variableNames.push("preluActivationWeights"), l && this.variableNames.push("leakyreluAlpha");
+    let f = "rc.x", v = "rc.x";
+    e[0] < s[0] ? f = `imod(rc.x, ${e[0]})` : s[0] < e[0] && (v = `imod(rc.x, ${s[0]})`), this.userCode = `
+      ${i}
       // Don't use uniform for sharedDimensionPacked for performance.
-      const float sharedDimension = ${l}.0;
+      const float sharedDimension = ${p}.0;
       vec4 dot2x2ARowBCol(ivec3 rc) {
         vec4 result = vec4(0);
         int batchA = ${f};
-        int batchB = ${x};
-        for (int i = 0; i < ${l}; i++) {
-          vec4 a = getMatrixA(batchA, ${b});
-          vec4 b = getMatrixB(batchB, ${M});
+        int batchB = ${v};
+        for (int i = 0; i < ${p}; i++) {
+          vec4 a = getMatrixA(batchA, ${d});
+          vec4 b = getMatrixB(batchB, ${$});
           // These swizzled products need to be separately added.
           // See: https://github.com/tensorflow/tfjs/issues/1735
-          result += (${h[0]} * ${d[0]});
-          result += (${h[1]} * ${d[1]});
+          result += (${x[0]} * ${m[0]});
+          result += (${x[1]} * ${m[1]});
         }
         return result;
       }
@@ -60,69 +60,72 @@ class W {
         ivec3 rc = getOutputCoords();
         vec4 result = dot2x2ARowBCol(rc);
-        ${$}
+        ${M}
-        ${v}
+        ${b}
         setOutput(result);
       }
     `;
   }
 }
-const S = 0.7978845608028654, w = 0.044715, j = `
+const g = 0.7978845608028654, w = 0.044715, j = `
     vec4 x3 = x * x * x;
     vec4 inner = x + ${w} * x3;
-    inner = ${S} * inner;
+    inner = ${g} * inner;
     inner = tanh(inner);
     inner = 0.5 * (1.0 + inner);
     vec4 result = x * inner;
     return result;
 `, q = `
-    vec4 x2 = x * x;
-    vec4 x3 = x2 * x;
-    vec4 u  = ${S} * (x + ${w} * x3);
+    vec4 a2 = a * a;
+    vec4 a3 = a2 * a;
+    vec4 u  = ${g} * (a + ${w} * a3);
     vec4 t  = tanh(u);
     vec4 sech2 = 1.0 - t * t;
-    vec4 du_dx = ${S} * (1.0 + 3.0 * ${w} * x2);
-    vec4 dgelu = 0.5 * (1.0 + t) + 0.5 * x * sech2 * du_dx;
-    return dgelu;
+    vec4 du_dx = ${g} * (1.0 + 3.0 * ${w} * a2);
+    vec4 dgelu = 0.5 * (1.0 + t) + 0.5 * a * sech2 * du_dx;
+    return dgelu * b;
 `, se = 1e3;
-function B({
+function O({
   a: t,
   b: e,
   transposeA: s,
-  transposeB: a,
-  backend: n,
-  activationSnippet: c
+  transposeB: n,
+  backend: a,
+  activationSnippet: c,
+  multiplier: o
 }) {
-  const o = t.shape.length, r = e.shape.length, i = s ? t.shape[o - 2] : t.shape[o - 1], u = a ? e.shape[r - 1] : e.shape[r - 2], p = s ? t.shape[o - 1] : t.shape[o - 2], l = a ? e.shape[r - 2] : e.shape[r - 1], b = t.shape.slice(0, -2), M = e.shape.slice(0, -2), h = k(b), d = k(M), v = L(t.shape.slice(0, -2), e.shape.slice(0, -2)).concat([p, l]);
+  const r = t.shape.length, u = e.shape.length, l = s ? t.shape[r - 2] : t.shape[r - 1], h = n ? e.shape[u - 1] : e.shape[u - 2], p = s ? t.shape[r - 1] : t.shape[r - 2], d = n ? e.shape[u - 2] : e.shape[u - 1], $ = t.shape.slice(0, -2), x = e.shape.slice(0, -2), m = G($), i = G(x), M = L(t.shape.slice(0, -2), e.shape.slice(0, -2)).concat([p, d]);
   F(
-    i === u,
-    () => `Error in matMul: inner shapes (${i}) and (${u}) of Tensors with shapes ${t.shape} and ${e.shape} and transposeA=${s} and transposeB=${a} must match.`
+    l === h,
+    () => `Error in matMul: inner shapes (${l}) and (${h}) of Tensors with shapes ${t.shape} and ${e.shape} and transposeA=${s} and transposeB=${n} must match.`
   );
-  const $ = s ? [h, i, p] : [h, p, i], f = a ? [d, l, u] : [d, u, l], x = g({ inputs: { x: t }, backend: n, attrs: { shape: $ } }), A = g({ inputs: { x: e }, backend: n, attrs: { shape: f } }), y = [x, A], C = Math.max(h, d), O = c, E = U(t.dtype, e.dtype), N = new W(
-    $,
+  const f = s ? [m, l, p] : [m, p, l], v = n ? [i, d, h] : [i, h, d], A = S({ inputs: { x: t }, backend: a, attrs: { shape: f } }), y = S({ inputs: { x: e }, backend: a, attrs: { shape: v } }), D = [A, y], E = Math.max(m, i), N = c, T = U(t.dtype, e.dtype), _ = new W(
     f,
-    [C, p, l],
+    v,
+    [E, p, d],
     s,
-    a,
-    !1,
-    O,
+    n,
     !1,
+    N,
+    !!o,
     !1
-  ), T = [x, A], D = n.runWebGLProgram(N, T, E), _ = g({ inputs: { x: D }, backend: n, attrs: { shape: v } });
-  y.push(D);
-  for (const K of y)
-    n.disposeIntermediateTensorInfo(K);
-  return _;
+  ), k = [A, y];
+  o && k.push(o);
+  const z = a.runWebGLProgram(_, k, T), K = S({ inputs: { x: z }, backend: a, attrs: { shape: M } });
+  D.push(z);
+  for (const P of D)
+    a.disposeIntermediateTensorInfo(P);
+  return K;
 }
 function Q(t) {
-  const { inputs: e, backend: s } = t, { x: a, kernel: n } = e;
-  if (a === void 0 || n === void 0)
+  const { inputs: e, backend: s } = t, { x: n, kernel: a } = e;
+  if (n === void 0 || a === void 0)
     throw new Error("BatchMatMul requires two input tensors.");
-  return B({
-    a,
-    b: n,
+  return O({
+    a: n,
+    b: a,
     transposeA: !1,
     transposeB: !1,
     backend: s,
@@ -134,23 +137,22 @@ const J = {
   backendName: "webgl",
   kernelFunc: Q
 };
-G(J);
+C(J);
 function V(t) {
-  const { dy: e, x: s, kernel: a } = t.inputs, n = t.backend;
-  return P(() => {
-    const c = R().makeTensorFromTensorInfo(
-      B({
+  const { dy: e, x: s, kernel: n } = t.inputs, a = t.backend;
+  return R(() => {
+    const c = I().makeTensorFromTensorInfo(
+      O({
         a: s,
-        b: a,
+        b: n,
         transposeA: !1,
         transposeB: !1,
-        backend: n,
-        activationSnippet: q
+        backend: a,
+        activationSnippet: q,
+        multiplier: e
       })
-    ), o = I(e, c);
-    c.dispose();
-    const r = z(o, a, !1, !0), i = z(s, o, !0, !1);
-    return [r, i];
+    ), o = B(c, n, !1, !0), r = B(s, c, !0, !1);
+    return [o, r];
   });
 }
 const X = {
@@ -158,9 +160,9 @@ const X = {
   backendName: "webgl",
   kernelFunc: V
 };
-G(X);
+C(X);
 export {
   se as MATMUL_SHARED_DIM_THRESHOLD,
-  B as batchMatMulGeluImpl,
+  O as batchMatMulGeluImpl,
   Q as batchMatMulKernel
 };

package/dist/ops/webgl/normRMS.d.ts ADDED Viewed

@@ -0,0 +1 @@
+export {};

package/dist/ops/webgl/normRMS.js ADDED Viewed

@@ -0,0 +1,78 @@
+import { r as c, e as h } from "../../index--6vO-cOz.js";
+import { s as q } from "../../sum-DdkDf2MG.js";
+class G {
+  variableNames = ["x", "meanSquare", "gamma"];
+  outputShape;
+  userCode;
+  constructor(e, a, o) {
+    this.outputShape = [e, a, o], this.userCode = `
+        void main() {
+            ivec3 coords = getOutputCoords();
+            float x = getXAtOutCoords();
+            float meanSquare = getMeanSquare(coords.x, coords.y, 0);
+            float gamma = getGammaAtOutCoords();
+            float invRms = inversesqrt(meanSquare + 1e-8);
+            float normalized = x * invRms;
+            float outVal = normalized * gamma;
+            setOutput(outVal);
+        }
+        `;
+  }
+}
+function v(t) {
+  const { x: e, gamma: a } = t.inputs, o = t.backend, r = e.shape[0], s = e.shape[1], n = e.shape[2], m = e.square().mean(-1, !0), u = new G(r, s, n);
+  return o.runWebGLProgram(u, [e, m, a], "float32");
+}
+const x = {
+  kernelName: "RMSNorm",
+  backendName: "webgl",
+  kernelFunc: v
+};
+c(x);
+class y {
+  variableNames = ["x", "meanSquare", "dyGamma", "dyXMean"];
+  outputShape;
+  userCode;
+  constructor(e, a, o) {
+    this.outputShape = [e, a, o], this.userCode = `
+        void main() {
+            ivec3 coords = getOutputCoords();
+            float x = getXAtOutCoords();
+            float meanSquare = getMeanSquare(coords.x, coords.y, 0) + 1e-8;
+            float dyGamma = getDyGammaAtOutCoords();
+            float dyXMean = getDyXMean(coords.x, coords.y, 0) / ${o}.0;
+            float invRms = inversesqrt(meanSquare);
+            float dx = dyGamma * invRms - x * dyXMean * invRms / meanSquare;
+            setOutput(dx);
+        }
+        `;
+  }
+}
+class C {
+  variableNames = ["x", "meanSquare", "dy"];
+  outputShape;
+  userCode;
+  constructor(e, a, o) {
+    this.outputShape = [e, a, o], this.userCode = `
+        void main() {
+            ivec3 coords = getOutputCoords();
+            float x = getXAtOutCoords();
+            float meanSquare = getMeanSquare(coords.x, coords.y, 0) + 1e-8;
+            float dy = getDyAtOutCoords();
+            float invRms = inversesqrt(meanSquare);
+            float dGamma = dy * (x * invRms);
+            setOutput(dGamma);
+        }
+        `;
+  }
+}
+function b(t) {
+  const { dy: e, x: a, gamma: o } = t.inputs, r = t.backend, s = a.shape[0], n = a.shape[1], m = a.shape[2], u = a.square().mean(-1, !0), d = e.mul(o), l = d.mul(a).sum(-1, !0), i = new y(s, n, m), g = r.runWebGLProgram(i, [a, u, d, l], "float32"), p = new C(s, n, m), S = r.runWebGLProgram(p, [a, u, e], "float32"), f = q(h().makeTensorFromTensorInfo(S), [0, 1]);
+  return [g, f];
+}
+const N = {
+  kernelName: "RMSNormGrad",
+  backendName: "webgl",
+  kernelFunc: b
+};
+c(N);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@genai-fi/nanogpt",
-    "version": "0.4.4",
+    "version": "0.4.5",
     "type": "module",
     "main": "dist/main.js",
     "types": "dist/main.d.ts",