@genai-fi/nanogpt 0.2.6 → 0.2.8

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
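At a glance, most of this diff is bundler churn: the shared chunk hash changes (`index-D1SlunD-.js` → `index-DQfEAU9u.js`, likewise the `sum`, `stack`, and `complex` chunks), minified identifiers are reshuffled, and `matMul` is split into its own chunk (`mat_mul-CuHB58-H.js`). The substantive changes are a new fused `AttentionMask` op with WebGL and CPU kernels (`dist/ops/attentionMask.js` plus its `.d.ts`), the attention layer delegating its masked score computation to that op and keeping `divisor` as a plain number, and a gradient-shape fix in the sparse softmax cross-entropy fallback.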
@@ -12,6 +12,7 @@ import "./index-Tf7vU29b.js";
  import "./jszip.min-CjP2V1VV.js";
  import "./ops/scatterSub.js";
  import "./ops/gatherSub.js";
+ import "./ops/attentionMask.js";
  class a extends c {
  _config;
  _model;
@@ -1,4 +1,4 @@
- import { o as t, c as s, b as n, E as m, C as r } from "./index-D1SlunD-.js";
+ import { o as t, c as s, d as n, E as m, C as r } from "./index-DQfEAU9u.js";
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -3858,29 +3858,29 @@ export {
  Qn as V,
  qs as _,
  Z as a,
- Is as b,
+ K as b,
  I as c,
- Js as d,
+ Is as d,
  Xs as e,
- y as f,
- Ls as g,
- Ft as h,
- Nt as i,
- Qt as j,
- U as k,
- Ne as l,
+ Js as f,
+ y as g,
+ Ls as h,
+ Ft as i,
+ Nt as j,
+ Qt as k,
+ U as l,
  p as m,
- Gs as n,
+ Ne as n,
  F as o,
- vs as p,
- Ts as q,
+ Gs as p,
+ vs as q,
  Hs as r,
  j as s,
- w as t,
- js as u,
- Qs as v,
- E as w,
- K as x,
+ Ts as t,
+ w as u,
+ js as v,
+ Qs as w,
+ E as x,
  zs as y,
  C as z
  };
@@ -1,4 +1,5 @@
- class S {
+ import { attentionMask as z } from "../ops/attentionMask.js";
+ class j {
  constructor(t, i, s, e) {
  this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
  units: 3 * s.nEmbed,
@@ -18,9 +19,9 @@ class S {
  stddev: 0.02 / Math.sqrt(2 * s.nLayer)
  }),
  biasInitializer: "zeros"
- }), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = this.tf.scalar(1 / Math.sqrt(s.nEmbed / s.nHead));
- const a = this.tf.zeros([s.blockSize, s.blockSize]), h = this.tf.fill([s.blockSize, s.blockSize], Number.NEGATIVE_INFINITY);
- this.maskInf = this.tf.where(this.bias, a, h);
+ }), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.nEmbed / s.nHead);
+ const o = this.tf.zeros([s.blockSize, s.blockSize]), c = this.tf.fill([s.blockSize, s.blockSize], Number.NEGATIVE_INFINITY);
+ this.maskInf = this.tf.where(this.bias, o, c);
  }
  config;
  cAttn;
@@ -52,70 +53,70 @@ class S {
  this.cAttn.setWeights(t.get(`block_${this.index}_cAttn`) || []), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
  }
  getAttentionScores(t, i, s) {
- const e = t.shape[2], h = this.tf.matMul(t, i, !1, !0).mul(this.divisor), n = this.maskInf.slice([0, 0], [e, e]).expandDims(0).expandDims(0), r = h.add(n), o = this.tf.softmax(r, -1);
+ const e = z(t, i, this.maskInf, this.divisor), o = this.tf.softmax(e, -1);
  return this.attnDropout.apply(o, { training: s });
  }
  // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
  getAttentionScoresWithPast(t, i, s, e) {
- const a = t.shape[2];
- let n = this.tf.matMul(t, i, !1, !0).mul(this.divisor);
- if (a > 1 && e > 0)
+ const o = t.shape[2];
+ let r = this.tf.matMul(t, i, !1, !0).mul(this.divisor);
+ if (o > 1 && e > 0)
  throw new Error("Cannot use past with T_cur > 1");
- if (a > 1) {
- const o = this.maskInf.slice([0, 0], [a, a]).expandDims(0).expandDims(0);
- n = n.add(o);
+ if (o > 1) {
+ const a = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+ r = r.add(a);
  }
- const r = this.tf.softmax(n, -1);
- return this.attnDropout.apply(r, { training: s });
+ const h = this.tf.softmax(r, -1);
+ return this.attnDropout.apply(h, { training: s });
  }
  getQKV(t) {
- const [i, s, e] = t.shape, a = this.cAttn.apply(t), [h, n, r] = this.tf.split(a, 3, -1);
- a.dispose();
- const o = e / this.config.nHead, u = this.tf.reshape(h, [i, s, this.config.nHead, o]);
- h.dispose();
+ const [i, s, e] = t.shape, o = this.cAttn.apply(t), [c, r, h] = this.tf.split(o, 3, -1);
+ o.dispose();
+ const a = e / this.config.nHead, u = this.tf.reshape(c, [i, s, this.config.nHead, a]);
+ c.dispose();
  const f = u.transpose([0, 2, 1, 3]);
  u.dispose();
- const d = this.tf.reshape(n, [i, s, this.config.nHead, o]);
- n.dispose();
- const c = d.transpose([0, 2, 1, 3]);
- d.dispose();
- const l = this.tf.reshape(r, [i, s, this.config.nHead, o]);
+ const d = this.tf.reshape(r, [i, s, this.config.nHead, a]);
  r.dispose();
+ const n = d.transpose([0, 2, 1, 3]);
+ d.dispose();
+ const l = this.tf.reshape(h, [i, s, this.config.nHead, a]);
+ h.dispose();
  const p = l.transpose([0, 2, 1, 3]);
- return l.dispose(), [f, c, p];
+ return l.dispose(), [f, n, p];
  }
  getOutputProjection(t, i) {
- const s = t.shape[0], e = t.shape[2], a = this.config.nEmbed, h = t.transpose([0, 2, 1, 3]), n = this.tf.reshape(h, [s, e, a]), r = this.cProj.apply(n);
- return this.residDropout.apply(r, { training: i });
+ const s = t.shape[0], e = t.shape[2], o = this.config.nEmbed, c = t.transpose([0, 2, 1, 3]), r = this.tf.reshape(c, [s, e, o]), h = this.cProj.apply(r);
+ return this.residDropout.apply(h, { training: i });
  }
  // Added optional KV cache support (pastKV). Returns presentKV for chaining.
  call(t, i = !1, s = !1, e) {
  if (e && !this.config.useRope)
  throw new Error("Cannot use pastKV without RoPE enabled");
  return this.tf.tidy(() => {
- const [a, h, n] = this.getQKV(t), r = a.shape[2], o = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(a, h, u) : [a, h];
- let c = d, l = n, p = 0;
- e && (p = e.length, c = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, n], 2));
- const b = c.shape[2];
- if (b > o) {
- const k = b - o, g = c.shape[0], v = c.shape[1], I = c.shape[3];
- c = c.slice([0, 0, k, 0], [g, v, o, I]), l = l.slice([0, 0, k, 0], [g, v, o, I]), p = o - r;
+ const [o, c, r] = this.getQKV(t), h = o.shape[2], a = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(o, c, u) : [o, c];
+ let n = d, l = r, p = 0;
+ e && (p = e.length, n = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, r], 2));
+ const b = n.shape[2];
+ if (b > a) {
+ const k = b - a, g = n.shape[0], I = n.shape[1], _ = n.shape[3];
+ n = n.slice([0, 0, k, 0], [g, I, a, _]), l = l.slice([0, 0, k, 0], [g, I, a, _]), p = a - h;
  }
  let m;
- p > 0 ? m = this.getAttentionScoresWithPast(f, c, i, p) : m = this.getAttentionScores(f, c, i);
- const _ = this.tf.matMul(m, l), A = this.getOutputProjection(_, i), P = {
- k: this.tf.keep(c),
+ p > 0 ? m = this.getAttentionScoresWithPast(f, n, i, p) : m = this.getAttentionScores(f, n, i);
+ const v = this.tf.matMul(m, l), A = this.getOutputProjection(v, i), P = {
+ k: this.tf.keep(n),
  v: this.tf.keep(l),
- length: p + r,
- cumulativeLength: e ? e.cumulativeLength + r : r
+ length: p + h,
+ cumulativeLength: e ? e.cumulativeLength + h : h
  };
  return { output: A, attention: s ? m.mean(1) : void 0, presentKV: P };
  });
  }
  dispose() {
- this.cAttn.dispose(), this.cProj.dispose(), this.attnDropout.dispose(), this.residDropout.dispose(), this.bias.dispose(), this.maskInf.dispose(), this.divisor.dispose();
+ this.cAttn.dispose(), this.cProj.dispose(), this.attnDropout.dispose(), this.residDropout.dispose(), this.bias.dispose(), this.maskInf.dispose();
  }
  }
  export {
- S as default
+ j as default
  };
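The substantive change in the file above (the causal self-attention layer chunk) is that `getAttentionScores` now delegates the masked, scaled QKᵀ computation to the new fused `attentionMask` op (imported as `z`) instead of composing `matMul`, `mul`, `slice`, `expandDims`, and `add` eagerly, and `divisor` is now stored as a plain number rather than a `tf.scalar`, so `dispose()` no longer needs to release it. A minimal TensorFlow.js sketch of what the fused op computes, mirroring the CPU fallback kernel registered later in this diff (function and variable names here are illustrative, not the package's API):

```ts
import * as tf from '@tensorflow/tfjs';

// Reference (unfused) computation equivalent to attentionMask(q, k, maskInf, divisor).
// q, k: [batch, nHead, T, headDim]; maskInf: [blockSize, blockSize] additive causal
// mask (0 on/below the diagonal, -Infinity above); divisor: 1 / sqrt(headDim).
function attentionMaskReference(
  q: tf.Tensor4D,
  k: tf.Tensor4D,
  maskInf: tf.Tensor2D,
  divisor: number
): tf.Tensor4D {
  const T = q.shape[2];
  // Scaled dot-product scores: (q @ k^T) * divisor -> [batch, nHead, T, T].
  const scores = tf.matMul(q, k, false, true).mul(tf.scalar(divisor));
  // Crop the causal mask to the current sequence length and broadcast over batch and heads.
  const mask = maskInf.slice([0, 0], [T, T]).expandDims(0).expandDims(0);
  return scores.add(mask) as tf.Tensor4D;
}
```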
@@ -1,29 +1,7 @@
- import { o as h, c as u, x as B, E as c, B as V, y as X, D as Y, I as Z, F as ee, N as te, H as se, J as ne, K as re, O as ae, Q as ue, f as L, w as ie, T as A, m as oe, U as le, t as ce, k as C, V as P, v as U, _ as H } from "../index-D1SlunD-.js";
- import { s as pe, r as f } from "../sum-02UQ5Eaq.js";
- import { c as he } from "../complex-D6Bq1XDf.js";
- /**
- * @license
- * Copyright 2020 Google LLC. All Rights Reserved.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * =============================================================================
- */
- function fe(t, e, s = !1, n = !1) {
- let r = u(t, "a", "matMul"), i = u(e, "b", "matMul");
- [r, i] = B(r, i);
- const o = { a: r, b: i }, p = { transposeA: s, transposeB: n };
- return c.runKernel(V, o, p);
- }
- const m = /* @__PURE__ */ h({ matMul_: fe });
+ import { o as h, c as i, E as o, y as V, D as X, I as Y, F as Z, N as ee, H as te, J as se, K as ne, O as re, Q as ue, g as L, x as ae, T as A, m as ie, U as oe, u as le, b as q, l as C, V as P, w as U, _ as H } from "../index-DQfEAU9u.js";
+ import { s as ce, r as f } from "../sum-B-O33dgG.js";
+ import { m } from "../mat_mul-CuHB58-H.js";
+ import { c as pe } from "../complex-CeoYJn2o.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -40,11 +18,11 @@ const m = /* @__PURE__ */ h({ matMul_: fe });
  * limitations under the License.
  * =============================================================================
  */
- function de(t) {
- const s = { x: u(t, "x", "sigmoid", "float32") };
- return c.runKernel(X, s);
+ function he(t) {
+ const s = { x: i(t, "x", "sigmoid", "float32") };
+ return o.runKernel(V, s);
  }
- const me = /* @__PURE__ */ h({ sigmoid_: de });
+ const fe = /* @__PURE__ */ h({ sigmoid_: he });
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -61,11 +39,11 @@ const me = /* @__PURE__ */ h({ sigmoid_: de });
  * limitations under the License.
  * =============================================================================
  */
- function ge(t) {
- const s = { x: u(t, "x", "elu", "float32") };
- return c.runKernel(Y, s);
+ function de(t) {
+ const s = { x: i(t, "x", "elu", "float32") };
+ return o.runKernel(X, s);
  }
- const $e = /* @__PURE__ */ h({ elu_: ge });
+ const me = /* @__PURE__ */ h({ elu_: de });
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -82,11 +60,11 @@ const $e = /* @__PURE__ */ h({ elu_: ge });
  * limitations under the License.
  * =============================================================================
  */
- function xe(t) {
- const s = { input: u(t, "input", "imag") };
- return c.runKernel(Z, s);
+ function ge(t) {
+ const s = { input: i(t, "input", "imag") };
+ return o.runKernel(Y, s);
  }
- const ke = /* @__PURE__ */ h({ imag_: xe });
+ const $e = /* @__PURE__ */ h({ imag_: ge });
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -103,11 +81,11 @@ const ke = /* @__PURE__ */ h({ imag_: xe });
  * limitations under the License.
  * =============================================================================
  */
- function De(t, e = 0.2) {
- const n = { x: u(t, "x", "leakyRelu") }, r = { alpha: e };
- return c.runKernel(ee, n, r);
+ function xe(t, e = 0.2) {
+ const n = { x: i(t, "x", "leakyRelu") }, r = { alpha: e };
+ return o.runKernel(Z, n, r);
  }
- const be = /* @__PURE__ */ h({ leakyRelu_: De });
+ const ke = /* @__PURE__ */ h({ leakyRelu_: xe });
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -124,11 +102,11 @@ const be = /* @__PURE__ */ h({ leakyRelu_: De });
  * limitations under the License.
  * =============================================================================
  */
- function ye(t) {
- const s = { x: u(t, "x", "neg") };
- return c.runKernel(te, s);
+ function De(t) {
+ const s = { x: i(t, "x", "neg") };
+ return o.runKernel(ee, s);
  }
- const Se = /* @__PURE__ */ h({ neg_: ye });
+ const be = /* @__PURE__ */ h({ neg_: De });
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -145,11 +123,11 @@ const Se = /* @__PURE__ */ h({ neg_: ye });
  * limitations under the License.
  * =============================================================================
  */
- function Me(t, e) {
- const s = u(t, "x", "prelu"), n = u(e, "alpha", "prelu"), r = { x: s, alpha: n };
- return c.runKernel(se, r);
+ function ye(t, e) {
+ const s = i(t, "x", "prelu"), n = i(e, "alpha", "prelu"), r = { x: s, alpha: n };
+ return o.runKernel(te, r);
  }
- const Ke = /* @__PURE__ */ h({ prelu_: Me });
+ const Se = /* @__PURE__ */ h({ prelu_: ye });
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -166,11 +144,11 @@ const Ke = /* @__PURE__ */ h({ prelu_: Me });
  * limitations under the License.
  * =============================================================================
  */
- function _e(t) {
- const s = { input: u(t, "input", "real") };
- return c.runKernel(ne, s);
+ function Ke(t) {
+ const s = { input: i(t, "input", "real") };
+ return o.runKernel(se, s);
  }
- const we = /* @__PURE__ */ h({ real_: _e });
+ const _e = /* @__PURE__ */ h({ real_: Ke });
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -187,11 +165,11 @@ const we = /* @__PURE__ */ h({ real_: _e });
  * limitations under the License.
  * =============================================================================
  */
- function We(t) {
- const s = { x: u(t, "x", "relu") };
- return c.runKernel(re, s);
+ function Me(t) {
+ const s = { x: i(t, "x", "relu") };
+ return o.runKernel(ne, s);
  }
- const ze = /* @__PURE__ */ h({ relu_: We });
+ const we = /* @__PURE__ */ h({ relu_: Me });
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -208,11 +186,11 @@ const ze = /* @__PURE__ */ h({ relu_: We });
  * limitations under the License.
  * =============================================================================
  */
- function Ee(t) {
- const s = { x: u(t, "x", "relu6") };
- return c.runKernel(ae, s);
+ function We(t) {
+ const s = { x: i(t, "x", "relu6") };
+ return o.runKernel(re, s);
  }
- const Oe = /* @__PURE__ */ h({ relu6_: Ee });
+ const ze = /* @__PURE__ */ h({ relu6_: We });
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -229,11 +207,11 @@ const Oe = /* @__PURE__ */ h({ relu6_: Ee });
  * limitations under the License.
  * =============================================================================
  */
- function Fe(t, e = 0) {
- const n = { x: u(t, "x", "step") }, r = { alpha: e };
- return c.runKernel(ue, n, r);
+ function Ee(t, e = 0) {
+ const n = { x: i(t, "x", "step") }, r = { alpha: e };
+ return o.runKernel(ue, n, r);
  }
- const Re = /* @__PURE__ */ h({ step_: Fe });
+ const Oe = /* @__PURE__ */ h({ step_: Ee });
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -250,19 +228,19 @@ const Re = /* @__PURE__ */ h({ step_: Fe });
  * limitations under the License.
  * =============================================================================
  */
- function Ae(t, e, s) {
- const n = u(t, "x", "transpose");
- if (e == null && (e = n.shape.map((o, p) => p).reverse()), L(n.rank === e.length, () => `Error in transpose: rank of input ${n.rank} must match length of perm ${e}.`), e.forEach((o) => {
- L(o >= 0 && o < n.rank, () => `All entries in 'perm' must be between 0 and ${n.rank - 1} but got ${e}`);
+ function Fe(t, e, s) {
+ const n = i(t, "x", "transpose");
+ if (e == null && (e = n.shape.map((l, p) => p).reverse()), L(n.rank === e.length, () => `Error in transpose: rank of input ${n.rank} must match length of perm ${e}.`), e.forEach((l) => {
+ L(l >= 0 && l < n.rank, () => `All entries in 'perm' must be between 0 and ${n.rank - 1} but got ${e}`);
  }), n.rank <= 1)
  return n.clone();
- const r = { x: n }, i = { perm: e };
- return n.dtype === "complex64" ? ie(() => {
- let o = we(n), p = ke(n);
- return o = c.runKernel(A, { x: o }, i), p = c.runKernel(A, { x: p }, i), s && (p = Se(p)), he(o, p);
- }) : c.runKernel(A, r, i);
+ const r = { x: n }, c = { perm: e };
+ return n.dtype === "complex64" ? ae(() => {
+ let l = _e(n), p = $e(n);
+ return l = o.runKernel(A, { x: l }, c), p = o.runKernel(A, { x: p }, c), s && (p = be(p)), pe(l, p);
+ }) : o.runKernel(A, r, c);
  }
- const Be = /* @__PURE__ */ h({ transpose_: Ae });
+ const Re = /* @__PURE__ */ h({ transpose_: Fe });
  /**
  * @license
  * Copyright 2019 Google LLC. All Rights Reserved.
@@ -279,36 +257,36 @@ const Be = /* @__PURE__ */ h({ transpose_: Ae });
  * limitations under the License.
  * =============================================================================
  */
- function Le(t, e, s) {
+ function Ae(t, e, s) {
  if (s == null || s === "linear")
  return t;
  if (s === "relu")
- return oe(t, Re(e));
+ return ie(t, Oe(e));
  throw new Error(`Cannot compute gradient for fused activation ${s}.`);
  }
- function Te(t, e) {
+ function Le(t, e) {
  let s = e;
- const n = le(t.shape, e.shape);
- return n.length > 0 && (s = pe(s, n)), f(s, t.shape);
+ const n = oe(t.shape, e.shape);
+ return n.length > 0 && (s = ce(s, n)), f(s, t.shape);
  }
- function Ne(t, e, s, n) {
+ function Te(t, e, s, n) {
  if (e === "linear")
  return t;
  if (e === "relu")
- return ze(t);
+ return we(t);
  if (e === "elu")
- return $e(t);
+ return me(t);
  if (e === "relu6")
- return Oe(t);
+ return ze(t);
  if (e === "prelu")
- return Ke(t, s);
+ return Se(t, s);
  if (e === "leakyrelu")
- return be(t, n);
+ return ke(t, n);
  if (e === "sigmoid")
- return me(t);
+ return fe(t);
  throw new Error(`Unknown fused activation ${e}.`);
  }
- const ve = (t, e) => !(t > 0) || e === "linear";
+ const Be = (t, e) => !(t > 0) || e === "linear";
  /**
  * @license
  * Copyright 2019 Google LLC. All Rights Reserved.
@@ -325,49 +303,49 @@ const ve = (t, e) => !(t > 0) || e === "linear";
  * limitations under the License.
  * =============================================================================
  */
- function Ge({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activation: i = "linear", preluActivationWeights: o, leakyreluAlpha: p = 0.2 }) {
- if (ve(c.state.gradientDepth, i) === !1) {
+ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activation: c = "linear", preluActivationWeights: l, leakyreluAlpha: p = 0.2 }) {
+ if (Be(o.state.gradientDepth, c) === !1) {
  let x = m(t, e, s, n);
- return r != null && (x = ce(x, r)), Ne(x, i, o, p);
+ return r != null && (x = le(x, r)), Te(x, c, l, p);
  }
- let a = u(t, "a", "fused matMul"), l = u(e, "b", "fused matMul");
- [a, l] = B(a, l);
- const D = s ? a.shape[a.rank - 2] : a.shape[a.rank - 1], b = n ? l.shape[l.rank - 1] : l.shape[l.rank - 2], w = s ? a.shape[a.rank - 1] : a.shape[a.rank - 2], W = n ? l.shape[l.rank - 2] : l.shape[l.rank - 1], T = a.shape.slice(0, -2), y = l.shape.slice(0, -2), N = C(T), v = C(y);
- L(D === b, () => `Error in fused matMul: inner shapes (${D}) and (${b}) of Tensors with shapes ${a.shape} and ${l.shape} and transposeA=${s} and transposeB=${n} must match.`);
- const O = P(a.shape.slice(0, -2), l.shape.slice(0, -2)).concat([w, W]), F = s ? f(a, [N, D, w]) : f(a, [N, w, D]), R = n ? f(l, [v, W, b]) : f(l, [v, b, W]);
+ let u = i(t, "a", "fused matMul"), a = i(e, "b", "fused matMul");
+ [u, a] = q(u, a);
+ const D = s ? u.shape[u.rank - 2] : u.shape[u.rank - 1], b = n ? a.shape[a.rank - 1] : a.shape[a.rank - 2], w = s ? u.shape[u.rank - 1] : u.shape[u.rank - 2], W = n ? a.shape[a.rank - 2] : a.shape[a.rank - 1], T = u.shape.slice(0, -2), y = a.shape.slice(0, -2), B = C(T), N = C(y);
+ L(D === b, () => `Error in fused matMul: inner shapes (${D}) and (${b}) of Tensors with shapes ${u.shape} and ${a.shape} and transposeA=${s} and transposeB=${n} must match.`);
+ const O = P(u.shape.slice(0, -2), a.shape.slice(0, -2)).concat([w, W]), F = s ? f(u, [B, D, w]) : f(u, [B, w, D]), R = n ? f(a, [N, W, b]) : f(a, [N, b, W]);
  let S;
- r != null && (S = u(r, "bias", "fused matMul"), [S] = B(S, a), P(O, S.shape));
+ r != null && (S = i(r, "bias", "fused matMul"), [S] = q(S, u), P(O, S.shape));
  let G;
- o != null && (G = u(o, "prelu weights", "fused matMul"));
- const I = (x, _) => {
- const [g, $, k, z] = _, d = Le(f(x, k.shape), k, i);
- let M, K;
- if (!s && !n ? (M = m(d, $, !1, !0), K = m(g, d, !0, !1)) : !s && n ? (M = m(d, $, !1, !1), K = m(d, g, !0, !1)) : s && !n ? (M = m($, d, !1, !0), K = m(g, d, !1, !1)) : (M = m($, d, !0, !0), K = m(d, g, !0, !0)), r != null) {
- const Q = Te(z, d);
- return [M, K, Q];
+ l != null && (G = i(l, "prelu weights", "fused matMul"));
+ const I = (x, M) => {
+ const [g, $, k, z] = M, d = Ae(f(x, k.shape), k, c);
+ let K, _;
+ if (!s && !n ? (K = m(d, $, !1, !0), _ = m(g, d, !0, !1)) : !s && n ? (K = m(d, $, !1, !1), _ = m(d, g, !0, !1)) : s && !n ? (K = m($, d, !1, !0), _ = m(g, d, !1, !1)) : (K = m($, d, !0, !0), _ = m(d, g, !0, !0)), r != null) {
+ const Q = Le(z, d);
+ return [K, _, Q];
  } else
- return [M, K];
- }, j = {
+ return [K, _];
+ }, v = {
  a: F,
  b: R,
  bias: S,
  preluActivationWeights: G
- }, q = { transposeA: s, transposeB: n, activation: i, leakyreluAlpha: p };
- return r == null ? U((_, g, $) => {
+ }, j = { transposeA: s, transposeB: n, activation: c, leakyreluAlpha: p };
+ return r == null ? U((M, g, $) => {
  const k = (
  // tslint:disable-next-line: no-unnecessary-type-assertion
- c.runKernel(H, j, q)
+ o.runKernel(H, v, j)
  );
- return $([_, g, k]), { value: f(k, O), gradFunc: I };
- })(F, R) : U((_, g, $, k) => {
+ return $([M, g, k]), { value: f(k, O), gradFunc: I };
+ })(F, R) : U((M, g, $, k) => {
  const z = (
  // tslint:disable-next-line: no-unnecessary-type-assertion
- c.runKernel(H, j, q)
+ o.runKernel(H, v, j)
  );
- return k([_, g, z, $]), { value: f(z, O), gradFunc: I };
+ return k([M, g, z, $]), { value: f(z, O), gradFunc: I };
  })(F, R, S);
  }
- const J = /* @__PURE__ */ h({ fusedMatMul_: Ge });
+ const J = /* @__PURE__ */ h({ fusedMatMul_: Ne });
  /**
  * @license
  * Copyright 2018 Google LLC
@@ -391,12 +369,12 @@ class E extends Error {
  * https://opensource.org/licenses/MIT.
  * =============================================================================
  */
- function Ie(t, e, s, n) {
+ function Ge(t, e, s, n) {
  if (t.rank < 2 || e.rank < 2)
  throw new E(`dot requires both inputs to be rank >= 2 but got x shape = ${t.shape} and y shape = ${e.shape}`);
  if (e.rank >= 3) {
- const r = t.shape.slice(-1)[0], i = e.shape.slice(-2)[0];
- if (r !== i)
+ const r = t.shape.slice(-1)[0], c = e.shape.slice(-2)[0];
+ if (r !== c)
  throw new E(`If rank y >= 3, then the second last dim of y must equal the last dim of x but got x shape = ${t.shape} and y shape = ${e.shape}`);
  }
  if (t.rank === 2 && e.rank === 2)
@@ -409,11 +387,11 @@ function Ie(t, e, s, n) {
  activation: s
  });
  {
- const r = t.shape.slice(), i = r.pop();
- t = f(t, [-1, i]);
- const o = e.shape.slice(), p = o.pop(), a = o.pop(), l = [...o, p], D = Array.from({ length: e.rank }, (T, y) => y === 0 ? e.rank - 2 : y <= e.rank - 2 ? y - 1 : y);
- e = f(Be(e, D), [a, -1]);
- const b = [...r, ...l];
+ const r = t.shape.slice(), c = r.pop();
+ t = f(t, [-1, c]);
+ const l = e.shape.slice(), p = l.pop(), u = l.pop(), a = [...l, p], D = Array.from({ length: e.rank }, (T, y) => y === 0 ? e.rank - 2 : y <= e.rank - 2 ? y - 1 : y);
+ e = f(Re(e, D), [u, -1]);
+ const b = [...r, ...a];
  return f(J({
  a: t,
  b: e,
@@ -424,7 +402,7 @@ function Ie(t, e, s, n) {
  }), b);
  }
  }
- class Ue {
+ class Pe {
  vocabSize;
  embedDim;
  tf;
@@ -447,7 +425,7 @@ class Ue {
  return this.tf.gather(this.tiedWeights, e, 0);
  }
  project(e) {
- return Ie(e, this.tiedWeights.transpose());
+ return Ge(e, this.tiedWeights.transpose());
  }
  getWeights() {
  return [this.tiedWeights];
@@ -466,5 +444,5 @@ class Ue {
  }
  }
  export {
- Ue as default
+ Pe as default
  };
package/dist/main.js CHANGED
@@ -1,20 +1,21 @@
- import { default as r } from "./NanoGPTModel.js";
- import { default as s } from "./TeachableLLM.js";
- import { default as i } from "./tokeniser/CharTokeniser.js";
+ import { default as m } from "./NanoGPTModel.js";
+ import { default as i } from "./TeachableLLM.js";
+ import { default as l } from "./tokeniser/CharTokeniser.js";
  import { default as d } from "./utilities/waitForModel.js";
- import { default as u } from "./data/textLoader.js";
- import { estimateMemoryUsage as n, estimateParameterCount as T, estimateResources as g, estimateTrainingMemoryUsage as M, validateConfig as C } from "./utilities/parameters.js";
+ import { default as x } from "./data/textLoader.js";
+ import { estimateMemoryUsage as T, estimateParameterCount as g, estimateResources as M, estimateTrainingMemoryUsage as C, validateConfig as c } from "./utilities/parameters.js";
  import "./ops/scatterSub.js";
  import "./ops/gatherSub.js";
+ import "./ops/attentionMask.js";
  export {
- i as CharTokeniser,
- r as NanoGPT,
- s as TeachableLLM,
- n as estimateMemoryUsage,
- T as estimateParameterCount,
- g as estimateResources,
- M as estimateTrainingMemoryUsage,
- u as loadTextData,
- C as validateConfig,
+ l as CharTokeniser,
+ m as NanoGPT,
+ i as TeachableLLM,
+ T as estimateMemoryUsage,
+ g as estimateParameterCount,
+ M as estimateResources,
+ C as estimateTrainingMemoryUsage,
+ x as loadTextData,
+ c as validateConfig,
  d as waitForModel
  };
@@ -0,0 +1,27 @@
+ import { o as c, c as s, b as m, E as M, B as p } from "./index-DQfEAU9u.js";
+ /**
+ * @license
+ * Copyright 2020 Google LLC. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+ function b(e, o, n = !1, l = !1) {
+ let a = s(e, "a", "matMul"), t = s(o, "b", "matMul");
+ [a, t] = m(a, t);
+ const r = { a, b: t }, u = { transposeA: n, transposeB: l };
+ return M.runKernel(p, r, u);
+ }
+ const i = /* @__PURE__ */ c({ matMul_: b });
+ export {
+ i as m
+ };
@@ -0,0 +1,2 @@
+ import { Tensor } from '@tensorflow/tfjs';
+ export declare function attentionMask(q: Tensor, k: Tensor, mask: Tensor, divisor: number): Tensor;
@@ -0,0 +1,62 @@
+ import { engine as d } from "@tensorflow/tfjs";
+ import { r as k, s as u } from "../index-DQfEAU9u.js";
+ import { m as l } from "../mat_mul-CuHB58-H.js";
+ class p {
+ variableNames = ["q", "k", "mask"];
+ outputShape;
+ userCode;
+ // enableShapeUniforms = true;
+ customUniforms = [{ name: "divisor", type: "float" }];
+ constructor(t, e, n, a) {
+ this.outputShape = [t, e, n, n], this.userCode = `
+ void main() {
+ ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
+ int b = coords.x;
+ int h = coords.y;
+ int t1 = coords.z;
+ int t2 = coords.w;
+
+ float sum = 0.0;
+ for (int i = 0; i < ${a}; ++i) {
+ float qv = getQ(b, h, t1, i);
+ float kv = getK(b, h, t2, i); // k is transposed on last two dims
+ sum += qv * kv;
+ }
+
+ // Scale by divisor
+ float scaled = sum * divisor;
+
+ // Add mask
+ float maskVal = getMask(t1, t2); // mask is [T,T]
+
+ setOutput(scaled + maskVal);
+ }
+ `;
+ }
+ }
+ function f(s) {
+ const { q: t, k: e, mask: n } = s.inputs, { divisor: a } = s.attrs, o = s.backend, c = t.shape[0], i = t.shape[2], r = t.shape[1], m = new p(c, r, i, t.shape[3]);
+ return o.runWebGLProgram(m, [t, e, n], "float32", [[a]]);
+ }
+ const h = {
+ kernelName: "AttentionMask",
+ backendName: "webgl",
+ kernelFunc: f
+ };
+ k(h);
+ function b(s) {
+ const { q: t, k: e, mask: n } = s.inputs, { divisor: a } = s.attrs, o = t.shape[2], i = l(t, e, !1, !0).mul(u(a)), r = n.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+ return i.add(r);
+ }
+ const v = {
+ kernelName: "AttentionMask",
+ backendName: "cpu",
+ kernelFunc: b
+ };
+ k(v);
+ function C(s, t, e, n) {
+ return d().runKernel("AttentionMask", { q: s, k: t, mask: e }, { divisor: n });
+ }
+ export {
+ C as attentionMask
+ };
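The new file above registers the `AttentionMask` kernel twice: a WebGL program that fuses QKᵀ, scaling, and mask addition into one shader, and a CPU fallback composed from `matMul`, then exposes `attentionMask`, which dispatches through the TFJS engine. A hedged usage sketch follows; the deep import path is illustrative (within the package the module is loaded for its registration side effect in `main.js` and for this export in the attention layer):

```ts
import * as tf from '@tensorflow/tfjs';
// Illustrative deep import path; adjust to however the build exposes this module.
import { attentionMask } from '@genai-fi/nanogpt/dist/ops/attentionMask.js';

const T = 8, nHead = 4, headDim = 16;
const q = tf.randomNormal([1, nHead, T, headDim]);
const k = tf.randomNormal([1, nHead, T, headDim]);

// Additive causal mask: 0 on/below the diagonal, -Infinity above it,
// built the same way the attention layer's constructor builds maskInf.
const lower = tf.linalg.bandPart(tf.ones([T, T]), -1, 0).cast('bool');
const maskInf = tf.where(lower, tf.zeros([T, T]), tf.fill([T, T], Number.NEGATIVE_INFINITY));

// scores = (q @ k^T) * (1 / sqrt(headDim)) + mask -> shape [1, nHead, T, T].
const scores = attentionMask(q, k, maskInf, 1 / Math.sqrt(headDim));
```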
@@ -1,6 +1,6 @@
  import { engine as l } from "@tensorflow/tfjs";
- import { o as g, c as i, E as b, G as d, r as c, a as h } from "../index-D1SlunD-.js";
- import { r as p, s as f } from "../stack-DB2YLlAs.js";
+ import { o as g, c as i, E as b, G as d, r as c, a as h } from "../index-DQfEAU9u.js";
+ import { r as p, s as f } from "../stack-C9cTkqpq.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -1,4 +1,4 @@
- import { r as o } from "../../index-D1SlunD-.js";
+ import { r as o } from "../../index-DQfEAU9u.js";
  function r(e) {
  const { logits: t, labels: n } = e.inputs;
  return e.backend.executeMultipleOutputs("SparseSoftmaxCrossEntropyWithLogits", [], [t, n], 2);
@@ -1,7 +1,7 @@
  import { engine as $ } from "@tensorflow/tfjs";
- import { i as u, j as S, k as h, E as f, l as E, o as N, c as l, n as y, r as p, a as D, m as x } from "../index-D1SlunD-.js";
- import { c as m } from "../complex-D6Bq1XDf.js";
- import { r as v, s as T } from "../stack-DB2YLlAs.js";
+ import { j as u, k as S, l as p, E as f, n as E, o as N, c as l, p as y, r as h, a as D, m as x } from "../index-DQfEAU9u.js";
+ import { c as m } from "../complex-CeoYJn2o.js";
+ import { r as v, s as T } from "../stack-C9cTkqpq.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -23,7 +23,7 @@ function i(e, t = "float32") {
  const a = i(e, "float32"), o = i(e, "float32");
  return m(a, o);
  }
- const r = S(h(e), t);
+ const r = S(p(e), t);
  return f.makeTensor(r, e, t);
  }
  /**
@@ -47,7 +47,7 @@ function d(e, t = "float32") {
  const a = d(e, "float32"), o = i(e, "float32");
  return m(a, o);
  }
- const r = E(h(e), t);
+ const r = E(p(e), t);
  return f.makeTensor(r, e, t);
  }
  function C(e, t, r) {
@@ -131,7 +131,7 @@ const K = {
  backendName: "webgl",
  kernelFunc: P
  };
- p(K);
+ h(K);
  function A(e) {
  const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b = d([o]), g = I(c, b, [o, s]), k = D(t, g), w = a.reshape([o, 1]);
  return x(k, w);
@@ -141,7 +141,7 @@ const F = {
  backendName: "cpu",
  kernelFunc: A
  };
- p(F);
+ h(F);
  function M(e, t, r) {
  return $().runKernel("EfficientScatterSub", { logits: e, labels: t, dy: r }, {});
  }
@@ -1,4 +1,4 @@
- import { E as e, R as c, o as f, d as u, f as a, P as i } from "./index-D1SlunD-.js";
+ import { E as e, R as c, o as f, f as u, g as a, P as i } from "./index-DQfEAU9u.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -15,7 +15,7 @@ import { E as e, R as c, o as f, d as u, f as a, P as i } from "./index-D1SlunD-
  * limitations under the License.
  * =============================================================================
  */
- function g(n, s, t = 1, r = "float32") {
+ function l(n, s, t = 1, r = "float32") {
  if (t === 0)
  throw new Error("Cannot have a step of zero");
  const o = { start: n, stop: s, step: t, dtype: r };
@@ -45,6 +45,6 @@ function k(n, s = 0) {
  }
  const h = /* @__PURE__ */ f({ stack_: k });
  export {
- g as r,
+ l as r,
  h as s
  };
@@ -1,4 +1,4 @@
- import { o, c as a, E as u, g as p, h as i, S as x } from "./index-D1SlunD-.js";
+ import { o, c as a, E as u, h as i, i as p, S as x } from "./index-DQfEAU9u.js";
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -17,7 +17,7 @@ import { o, c as a, E as u, g as p, h as i, S as x } from "./index-D1SlunD-.js";
  */
  function l(n, t) {
  const s = { x: a(n, "x", "reshape", "string_or_numeric") }, r = { shape: t };
- return u.runKernel(p, s, r);
+ return u.runKernel(i, s, r);
  }
  const h = /* @__PURE__ */ o({ reshape_: l });
  /**
@@ -38,7 +38,7 @@ const h = /* @__PURE__ */ o({ reshape_: l });
  */
  function m(n, t = null, e = !1) {
  let s = a(n, "x", "sum");
- s.dtype === "bool" && (s = i(s, "int32"));
+ s.dtype === "bool" && (s = p(s, "int32"));
  const r = { x: s }, c = { axis: t, keepDims: e };
  return u.runKernel(x, r, c);
  }
@@ -1,4 +1,4 @@
- import { A as r, m as c, s as h, a as g, e as o } from "../index-D1SlunD-.js";
+ import { A as r, m as c, s as h, a as g, e as o } from "../index-DQfEAU9u.js";
  class u extends r {
  constructor(t, e, s, a, i) {
  super(t, e, s, a), this.config = i, this.startLearningRate = t;
@@ -1,7 +1,7 @@
- import { gatherSub as K } from "../ops/gatherSub.js";
- import { scatterSub as _ } from "../ops/scatterSub.js";
- import { o as l, c as d, E as f, M as G, p as z, L as I, q as N, a as E, t as M, u as T, e as m, v as S, w as $, z as g } from "../index-D1SlunD-.js";
- import { s as F, r as b } from "../sum-02UQ5Eaq.js";
+ import { gatherSub as w } from "../ops/gatherSub.js";
+ import { scatterSub as K } from "../ops/scatterSub.js";
+ import { o as l, c as d, E as f, M as _, q as z, L as I, t as N, a as E, u as M, v as T, e as m, w as g, x as $, z as S } from "../index-DQfEAU9u.js";
+ import { s as F, r as b } from "../sum-B-O33dgG.js";
  /**
  * @license
  * Copyright 2017 Google LLC. All Rights Reserved.
@@ -47,9 +47,9 @@ function q(n, s) {
  */
  function A(n, s = null, t = !1) {
  const e = { x: d(n, "x", "max") }, r = { reductionIndices: s, keepDims: t };
- return f.runKernel(G, e, r);
+ return f.runKernel(_, e, r);
  }
- const k = /* @__PURE__ */ l({ max_: A });
+ const L = /* @__PURE__ */ l({ max_: A });
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -109,7 +109,7 @@ const j = /* @__PURE__ */ l({ log_: W });
  * =============================================================================
  */
  function B(n, s = null, t = !1) {
- const a = d(n, "x", "logSumExp"), e = N(s, a.shape), r = k(
+ const a = d(n, "x", "logSumExp"), e = N(s, a.shape), r = L(
  a,
  e,
  !0
@@ -148,30 +148,30 @@ function J(n, s = -1) {
  const Q = /* @__PURE__ */ l({ softmax_: J });
  function R(n, s) {
  return $(() => {
- const t = n.shape[n.shape.length - 1], e = n.shape.slice(0, -1).reduce((h, x) => h * x, 1), r = n.shape.length > 2 ? n.reshape([e, t]) : n, c = s.shape.length > 1 ? s.reshape([e]).cast("int32") : s.cast("int32"), o = k(r, -1, !0), p = E(r, o), u = H(p, -1);
- return K(u, c, p);
+ const t = n.shape[n.shape.length - 1], e = n.shape.slice(0, -1).reduce((h, x) => h * x, 1), r = n.shape.length > 2 ? n.reshape([e, t]) : n, c = s.shape.length > 1 ? s.reshape([e]).cast("int32") : s.cast("int32"), o = L(r, -1, !0), p = E(r, o), u = H(p, -1);
+ return w(u, c, p);
  });
  }
- function Z() {
- return m().backendName === "tensorflow" ? S((s, t, a) => {
+ function ss() {
+ return m().backendName === "tensorflow" ? g((s, t, a) => {
  const e = s.shape.length > 2 ? s.reshape([-1, s.shape[s.shape.length - 1]]) : s, r = t.shape.length > 1 ? t.reshape([-1]).cast("int32") : t.cast("int32"), [c, o] = m().runKernel(
  "NativeSparseSoftmaxCrossEntropy",
  { logits: e, labels: r },
  {}
  );
- return a([o.reshape(s.shape)]), { value: c, gradFunc: (p, u) => [u[0], g(t)] };
- }) : S(
+ return a([o.reshape(s.shape)]), { value: c, gradFunc: (p, u) => [u[0], S(t)] };
+ }) : g(
  // @ts-expect-error Invalid params
  (s, t, a) => {
  const e = s.shape[s.shape.length - 1], c = s.shape.slice(0, -1).reduce((h, x) => h * x, 1), o = s.reshape([c, e]), p = t.reshape([c]).cast("int32"), u = R(o, p);
  return a([o, p]), o.dispose(), p.dispose(), { value: u, gradFunc: (h, x) => $(() => {
- const y = x[0], C = x[1], L = Q(y), v = _(L, C, h), w = g(t);
- return [v, w];
+ const k = x[0], y = x[1], C = Q(k), G = K(C, y, h), v = S(t);
+ return [G.reshape(s.shape), v];
  }) };
  }
  );
  }
  export {
- Z as createSoftmaxCrossEntropyWithGrad,
+ ss as createSoftmaxCrossEntropyWithGrad,
  R as sparseSoftmaxCrossEntropy
  };
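Aside from identifier churn, the one behavioural change visible in this last chunk is in the fallback gradient path of `createSoftmaxCrossEntropyWithGrad`: the logits gradient produced by `scatterSub` is now reshaped back to the caller's original logits shape (`G.reshape(s.shape)`) instead of being returned in its flattened two-dimensional form. A small self-contained illustration of that reshape (tensor names are illustrative stand-ins, not the package's internals):

```ts
import * as tf from '@tensorflow/tfjs';

// The fix in miniature: the gradient is computed on flattened [batch * T, vocab]
// logits, then restored to the caller's rank-3 logits shape before being returned.
const logits = tf.randomNormal([2, 4, 10]);      // [batch, T, vocab]
const flatGrad = tf.randomNormal([8, 10]);       // stand-in for the flattened gradient
const dLogits = flatGrad.reshape(logits.shape);  // [2, 4, 10], as 0.2.8 now returns
```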
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@genai-fi/nanogpt",
- "version": "0.2.6",
+ "version": "0.2.8",
  "type": "module",
  "main": "dist/main.js",
  "types": "dist/main.d.ts",