@genai-fi/nanogpt 0.2.9 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/dist/Generator.d.ts +2 -0
  2. package/dist/Generator.js +37 -32
  3. package/dist/NanoGPTModel.d.ts +4 -1
  4. package/dist/NanoGPTModel.js +33 -25
  5. package/dist/TeachableLLM.d.ts +4 -0
  6. package/dist/TeachableLLM.js +32 -15
  7. package/dist/{complex-Cd8sqiBC.js → complex-CJ-qCcLB.js} +6 -6
  8. package/dist/{index-Dsg28SG6.js → index-YPKosni4.js} +59 -51
  9. package/dist/layers/BaseLayer.d.ts +8 -0
  10. package/dist/layers/BaseLayer.js +18 -0
  11. package/dist/layers/CausalSelfAttention.d.ts +4 -1
  12. package/dist/layers/CausalSelfAttention.js +47 -55
  13. package/dist/layers/MLP.d.ts +2 -1
  14. package/dist/layers/MLP.js +16 -14
  15. package/dist/layers/RMSNorm.d.ts +2 -1
  16. package/dist/layers/RMSNorm.js +13 -11
  17. package/dist/layers/RoPECache.d.ts +4 -2
  18. package/dist/layers/RoPECache.js +13 -7
  19. package/dist/layers/TiedEmbedding.js +16 -15
  20. package/dist/layers/TransformerBlock.d.ts +4 -1
  21. package/dist/layers/TransformerBlock.js +9 -5
  22. package/dist/main.js +18 -16
  23. package/dist/{mat_mul-BAYDrXvE.js → mat_mul-Bu7bhLms.js} +5 -5
  24. package/dist/ops/attentionMask.js +31 -25
  25. package/dist/ops/gatherSub.js +2 -2
  26. package/dist/ops/node/sparseCrossEntropy.js +1 -1
  27. package/dist/ops/qkv.d.ts +7 -0
  28. package/dist/ops/qkv.js +127 -0
  29. package/dist/ops/rope.d.ts +8 -0
  30. package/dist/ops/rope.js +153 -0
  31. package/dist/ops/scatterSub.js +14 -14
  32. package/dist/reshape-DmnmKT6r.js +25 -0
  33. package/dist/{stack-1o648CP_.js → stack-BtKpB0Ry.js} +5 -5
  34. package/dist/sum-D7fu15XL.js +27 -0
  35. package/dist/training/AdamExt.js +1 -1
  36. package/dist/training/Trainer.js +30 -29
  37. package/dist/training/sparseCrossEntropy.js +34 -33
  38. package/dist/utilities/profile.d.ts +10 -0
  39. package/dist/utilities/profile.js +29 -0
  40. package/package.json +1 -1
  41. package/dist/sum-NWazHI7f.js +0 -49
package/dist/layers/CausalSelfAttention.js CHANGED
@@ -1,16 +1,10 @@
- import { attentionMask as z } from "../ops/attentionMask.js";
- class j {
+ import { attentionMask as x } from "../ops/attentionMask.js";
+ import j from "./BaseLayer.js";
+ import { qkv as w } from "../ops/qkv.js";
+ import { rope as y } from "../ops/rope.js";
+ class N extends j {
  constructor(t, i, s, e) {
- this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
- units: 3 * s.nEmbed,
- useBias: s.biasInLinear,
- name: `block_${i}_attn_cAttn`,
- kernelInitializer: this.tf.initializers.randomNormal({
- mean: 0,
- stddev: 0.02
- }),
- biasInitializer: "zeros"
- }), this.cProj = this.tf.layers.dense({
+ super(), this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.units = s.nEmbed * 3, this.cProj = this.tf.layers.dense({
  units: s.nEmbed,
  useBias: s.biasInLinear,
  name: `block_${i}_attn_cProj`,
@@ -20,11 +14,11 @@ class j {
  }),
  biasInitializer: "zeros"
  }), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.nEmbed / s.nHead);
- const o = this.tf.zeros([s.blockSize, s.blockSize]), c = this.tf.fill([s.blockSize, s.blockSize], Number.NEGATIVE_INFINITY);
- this.maskInf = this.tf.where(this.bias, o, c);
+ const o = this.tf.zeros([s.blockSize, s.blockSize]), a = this.tf.fill([s.blockSize, s.blockSize], Number.NEGATIVE_INFINITY);
+ this.maskInf = this.tf.where(this.bias, o, a);
  }
  config;
- cAttn;
+ cAttn = null;
  cProj;
  attnDropout;
  residDropout;
@@ -34,26 +28,35 @@ class j {
  divisor;
  index;
  _trainable = !0;
+ units;
+ build() {
+ this.cAttn === null && (this.cAttn = this.tf.variable(
+ this.tf.randomNormal([this.config.nEmbed, this.units], 0, 0.02),
+ !0
+ //`block_${this.index}_attn_cAttn_kernel`
+ ));
+ }
  get variables() {
- return [
- ...this.cAttn.trainableWeights.map((t) => t.read()),
- ...this.cProj.trainableWeights.map((t) => t.read())
- ];
+ if (this.cAttn === null)
+ throw new Error("Layer not built yet");
+ return [this.cAttn, ...this.cProj.trainableWeights.map((t) => t.read())];
  }
  get trainable() {
  return this._trainable;
  }
  set trainable(t) {
- this._trainable = t, this.cAttn.trainable = t, this.cProj.trainable = t;
+ this._trainable = t, this.cAttn && (this.cAttn.trainable = t), this.cProj.trainable = t;
  }
  saveWeights(t) {
- t.set(`block_${this.index}_cAttn`, this.cAttn.getWeights()), t.set(`block_${this.index}_cProj`, this.cProj.getWeights());
+ t.set(`block_${this.index}_cAttn`, this.cAttn ? [this.cAttn.clone()] : []), t.set(`block_${this.index}_cProj`, this.cProj.getWeights());
  }
  loadWeights(t) {
- this.cAttn.setWeights(t.get(`block_${this.index}_cAttn`) || []), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
+ const i = t.get(`block_${this.index}_cAttn`)?.[0];
+ if (!i) throw new Error(`Weights for block_${this.index}_cAttn not found`);
+ this.cAttn ? this.cAttn.assign(i) : this.cAttn = this.tf.variable(i, !0), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
  }
  getAttentionScores(t, i, s) {
- const e = z(t, i, this.maskInf, this.divisor), o = this.tf.softmax(e, -1);
+ const e = x(t, i, this.maskInf, this.divisor), o = this.tf.softmax(e, -1);
  return this.attnDropout.apply(o, { training: s });
  }
  // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
@@ -63,60 +66,49 @@ class j {
  if (o > 1 && e > 0)
  throw new Error("Cannot use past with T_cur > 1");
  if (o > 1) {
- const a = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
- r = r.add(a);
+ const c = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+ r = r.add(c);
  }
  const h = this.tf.softmax(r, -1);
  return this.attnDropout.apply(h, { training: s });
  }
  getQKV(t) {
- const [i, s, e] = t.shape, o = this.cAttn.apply(t), [c, r, h] = this.tf.split(o, 3, -1);
- o.dispose();
- const a = e / this.config.nHead, u = this.tf.reshape(c, [i, s, this.config.nHead, a]);
- c.dispose();
- const f = u.transpose([0, 2, 1, 3]);
- u.dispose();
- const d = this.tf.reshape(r, [i, s, this.config.nHead, a]);
- r.dispose();
- const n = d.transpose([0, 2, 1, 3]);
- d.dispose();
- const l = this.tf.reshape(h, [i, s, this.config.nHead, a]);
- h.dispose();
- const p = l.transpose([0, 2, 1, 3]);
- return l.dispose(), [f, n, p];
+ return w(t, this.cAttn, this.config.nHead);
  }
  getOutputProjection(t, i) {
- const s = t.shape[0], e = t.shape[2], o = this.config.nEmbed, c = t.transpose([0, 2, 1, 3]), r = this.tf.reshape(c, [s, e, o]), h = this.cProj.apply(r);
+ const s = t.shape[0], e = t.shape[2], o = this.config.nEmbed, a = t.transpose([0, 2, 1, 3]), r = this.tf.reshape(a, [s, e, o]), h = this.cProj.apply(r);
  return this.residDropout.apply(h, { training: i });
  }
  // Added optional KV cache support (pastKV). Returns presentKV for chaining.
  call(t, i = !1, s = !1, e) {
  if (e && !this.config.useRope)
  throw new Error("Cannot use pastKV without RoPE enabled");
- return this.tf.tidy(() => {
- const [o, c, r] = this.getQKV(t), h = o.shape[2], a = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(o, c, u) : [o, c];
- let n = d, l = r, p = 0;
- e && (p = e.length, n = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, r], 2));
+ return this.build(), this.tf.tidy(() => {
+ this.startMemory();
+ const [o, a, r] = this.getQKV(t), h = o.shape[2], c = this.config.blockSize, d = e ? e.cumulativeLength : 0, f = this.ropeCache ? y(o, this.ropeCache, d) : o, m = this.ropeCache ? y(a, this.ropeCache, d) : a;
+ this.ropeCache && (o.dispose(), a.dispose());
+ let n = m, l = r, u = 0;
+ e && (u = e.length, n = this.tf.concat([e.k, m], 2), l = this.tf.concat([e.v, r], 2));
  const b = n.shape[2];
- if (b > a) {
- const k = b - a, g = n.shape[0], I = n.shape[1], _ = n.shape[3];
- n = n.slice([0, 0, k, 0], [g, I, a, _]), l = l.slice([0, 0, k, 0], [g, I, a, _]), p = a - h;
+ if (b > c) {
+ const k = b - c, A = n.shape[0], g = n.shape[1], _ = n.shape[3];
+ n = n.slice([0, 0, k, 0], [A, g, c, _]), l = l.slice([0, 0, k, 0], [A, g, c, _]), u = c - h;
  }
- let m;
- p > 0 ? m = this.getAttentionScoresWithPast(f, n, i, p) : m = this.getAttentionScores(f, n, i);
- const v = this.tf.matMul(m, l), A = this.getOutputProjection(v, i), P = {
+ let p;
+ u > 0 ? p = this.getAttentionScoresWithPast(f, n, i, u) : p = this.getAttentionScores(f, n, i);
+ const P = this.tf.matMul(p, l), S = this.getOutputProjection(P, i), v = {
  k: this.tf.keep(n),
  v: this.tf.keep(l),
- length: p + h,
+ length: u + h,
  cumulativeLength: e ? e.cumulativeLength + h : h
- };
- return { output: A, attention: s ? m.mean(1) : void 0, presentKV: P };
+ }, I = s ? p.mean(1) : void 0;
+ return this.endMemory("CausalSelfAttention"), { output: S, attention: I, presentKV: v };
  });
  }
  dispose() {
- this.cAttn.dispose(), this.cProj.dispose(), this.attnDropout.dispose(), this.residDropout.dispose(), this.bias.dispose(), this.maskInf.dispose();
+ this.cAttn?.dispose(), this.cProj.dispose(), this.attnDropout.dispose(), this.residDropout.dispose(), this.bias.dispose(), this.maskInf.dispose();
  }
  }
  export {
- j as default
+ N as default
  };
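Two things change in this file beyond renames: `cAttn` is demoted from a `tf.layers.dense` layer to a lazily built raw variable of shape `[nEmbed, 3 * nEmbed]` (see the new `build()`), and the hand-rolled projection/split/reshape/transpose in `getQKV` collapses into a single call to the new fused `qkv` op (`w(t, this.cAttn, this.config.nHead)`), with RoPE now applied per tensor through the new `rope` op (`y(o, this.ropeCache, d)` for q, likewise for k). As a rough reference, the fused op should be numerically equivalent to this sketch reconstructed from the deleted `getQKV` body (the function name and typings are illustrative, not the package's actual kernel):

```ts
import * as tf from "@tensorflow/tfjs";

// Hypothetical reference for what dist/ops/qkv.js computes, pieced together
// from the old getQKV() this diff deletes. Shapes: x is [B, T, C] with
// C = nEmbed; wQKV is the new cAttn variable, [C, 3C].
function qkvReference(
  x: tf.Tensor3D,
  wQKV: tf.Tensor2D,
  nHead: number
): [tf.Tensor4D, tf.Tensor4D, tf.Tensor4D] {
  return tf.tidy(() => {
    const [B, T, C] = x.shape;
    const hs = C / nHead; // per-head channel count
    // One projection, then split into query/key/value, each [B, T, C].
    const proj = tf.matMul(x.reshape([B * T, C]), wQKV).reshape([B, T, 3 * C]);
    const [q, k, v] = tf.split(proj, 3, -1);
    // [B, T, nHead, hs] -> [B, nHead, T, hs], as the old transpose([0,2,1,3]) did.
    const toHeads = (t: tf.Tensor) =>
      t.reshape([B, T, nHead, hs]).transpose([0, 2, 1, 3]) as tf.Tensor4D;
    return [toHeads(q), toHeads(k), toHeads(v)];
  }) as [tf.Tensor4D, tf.Tensor4D, tf.Tensor4D];
}
```

The KV-cache behaviour is unchanged in spirit: past keys/values are concatenated on the time axis, and once the cache outgrows `blockSize` the oldest positions are sliced off and the with-past score path is taken, per the comment in the diff ("If pastLen > 0 and T_cur == 1, no mask needed").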
package/dist/layers/MLP.d.ts CHANGED
@@ -1,6 +1,7 @@
  import { default as TF } from '@tensorflow/tfjs';
  import { GPTConfig } from '../config';
- export default class MLP {
+ import { default as BaseLayer } from './BaseLayer';
+ export default class MLP extends BaseLayer {
  private cFc;
  private cProj;
  private dropout;
package/dist/layers/MLP.js CHANGED
@@ -1,31 +1,32 @@
- class l {
+ import a from "./BaseLayer.js";
+ class l extends a {
  cFc;
  cProj;
  dropout;
  tf;
  index;
  _trainable = !0;
- constructor(t, e, i) {
- this.tf = t, this.index = e, this.cFc = this.tf.layers.dense({
- units: i.mlpFactor * i.nEmbed,
+ constructor(t, i, e) {
+ super(), this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
+ units: e.mlpFactor * e.nEmbed,
  activation: "gelu",
- useBias: i.biasInLinear,
+ useBias: e.biasInLinear,
  kernelInitializer: this.tf.initializers.randomNormal({
  mean: 0,
  stddev: 0.02
  }),
  biasInitializer: "zeros",
- name: `block_${e}_mlp_cFc`
+ name: `block_${i}_mlp_cFc`
  }), this.cProj = this.tf.layers.dense({
- units: i.nEmbed,
- useBias: i.biasInLinear,
+ units: e.nEmbed,
+ useBias: e.biasInLinear,
  kernelInitializer: this.tf.initializers.randomNormal({
  mean: 0,
- stddev: 0.02 / Math.sqrt(2 * i.nLayer)
+ stddev: 0.02 / Math.sqrt(2 * e.nLayer)
  }),
  biasInitializer: "zeros",
- name: `block_${e}_mlp_cProj`
- }), this.dropout = this.tf.layers.dropout({ rate: i.dropout });
+ name: `block_${i}_mlp_cProj`
+ }), this.dropout = this.tf.layers.dropout({ rate: e.dropout });
  }
  get variables() {
  return [
@@ -45,10 +46,11 @@ class l {
  loadWeights(t) {
  this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
  }
- call(t, e = !1) {
+ call(t, i = !1) {
  return this.tf.tidy(() => {
- const i = this.cFc.apply(t), s = this.cProj.apply(i);
- return this.dropout.apply(s, { training: e });
+ this.startMemory();
+ const e = this.cFc.apply(t), s = this.cProj.apply(e), r = this.dropout.apply(s, { training: i });
+ return this.endMemory("MLP"), r;
  });
  }
  dispose() {
package/dist/layers/RMSNorm.d.ts CHANGED
@@ -1,5 +1,6 @@
  import { default as TF } from '@tensorflow/tfjs';
- export default class RMSNorm {
+ import { default as BaseLayer } from './BaseLayer';
+ export default class RMSNorm extends BaseLayer {
  private gamma;
  private epsilon;
  private tf;
package/dist/layers/RMSNorm.js CHANGED
@@ -1,26 +1,28 @@
- class m {
+ import m from "./BaseLayer.js";
+ class o extends m {
  gamma;
  epsilon;
  tf;
- constructor(a, s, t = 1e-8, e = "") {
- this.tf = a, this.epsilon = t, this.gamma = a.variable(a.ones(s), !0, `${e}_gamma`, "float32");
+ constructor(t, s, a = 1e-8, e = "") {
+ super(), this.tf = t, this.epsilon = a, this.gamma = t.variable(t.ones(s), !0, `${e}_gamma`, "float32");
  }
  get trainableWeights() {
  return [this.gamma];
  }
- set trainable(a) {
- this.gamma.trainable = a;
+ set trainable(t) {
+ this.gamma.trainable = t;
  }
  getWeights() {
  return [this.gamma];
  }
- setWeights(a) {
- this.gamma.assign(a[0]);
+ setWeights(t) {
+ this.gamma.assign(t[0]);
  }
- apply(a) {
+ apply(t) {
  return this.tf.tidy(() => {
- const t = a.square().mean(-1, !0).add(this.epsilon).rsqrt();
- return a.mul(t).mul(this.gamma);
+ this.startMemory();
+ const a = t.square().mean(-1, !0).add(this.epsilon).rsqrt(), r = t.mul(a).mul(this.gamma);
+ return this.endMemory("RMSNorm"), r;
  });
  }
  dispose() {
@@ -28,5 +30,5 @@ class m {
  }
  }
  export {
- m as default
+ o as default
  };
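Only the `BaseLayer` inheritance and the `startMemory()`/`endMemory("RMSNorm")` profiling hooks are new here; the normalisation itself is untouched. For reference, a scalar sketch of what `apply` computes per feature vector (illustrative, not library code):

```ts
// y_i = x_i * rsqrt(mean(x^2) + epsilon) * gamma_i -- RMSNorm, no mean-centring,
// matching the square().mean(-1, true).add(epsilon).rsqrt() chain above.
function rmsNorm(x: number[], gamma: number[], epsilon = 1e-8): number[] {
  const meanSq = x.reduce((acc, v) => acc + v * v, 0) / x.length;
  const inv = 1 / Math.sqrt(meanSq + epsilon);
  return x.map((v, i) => v * inv * gamma[i]);
}
```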
package/dist/layers/RoPECache.d.ts CHANGED
@@ -3,14 +3,16 @@ import { GPTConfig } from '../config';
  export default class RoPECache {
  private readonly tf;
  private readonly config;
- private rotaryDim;
+ readonly rotaryDim: number;
  private ropeBase;
  private ropeInvFreq;
  private ropeCos;
  private ropeSin;
  private ropeCacheLen;
  constructor(tf: typeof TF, config: GPTConfig);
- private ensureRopeCache;
+ ensureRopeCache(needed: number): void;
+ getCos(): TF.Tensor | null;
+ getSin(): TF.Tensor | null;
  applyRoPE(q: TF.Tensor, k: TF.Tensor, pastLen: number): [TF.Tensor, TF.Tensor];
  dispose(): void;
  }
package/dist/layers/RoPECache.js CHANGED
@@ -24,16 +24,22 @@ class b {
  const o = this.tf.range(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
  this.ropeCos = this.tf.keep(this.tf.cos(o).expandDims(-1)), this.ropeSin = this.tf.keep(this.tf.sin(o).expandDims(-1)), this.ropeCacheLen = s;
  }
+ getCos() {
+ return this.ropeCos;
+ }
+ getSin() {
+ return this.ropeSin;
+ }
  applyRoPE(s, r, o) {
  const i = s.shape[3], t = this.rotaryDim;
  if (t > i) return [s, r];
- const e = s.shape[2], v = o + e;
- this.ensureRopeCache(v);
- const n = t / 2, p = this.ropeCos.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), a = this.ropeSin.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), h = s.shape[0], c = s.shape[1], f = this.tf.range(0, t, 2, "int32"), l = this.tf.range(1, t, 2, "int32"), d = (u) => {
- const m = u.slice([0, 0, 0, 0], [h, c, e, t]), C = t < i ? u.slice([0, 0, 0, t], [h, c, e, i - t]) : null, D = this.tf.gather(m, f, 3), g = this.tf.gather(m, l, 3), x = D.mul(p).sub(g.mul(a)), k = g.mul(p).add(D.mul(a)), R = this.tf.stack([x, k], -1).reshape([h, c, e, t]);
- return C ? this.tf.concat([R, C], 3) : R;
- }, y = d(s), S = d(r);
- return f.dispose(), l.dispose(), [y, S];
+ const e = s.shape[2], R = o + e;
+ this.ensureRopeCache(R);
+ const n = t / 2, c = this.ropeCos.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), a = this.ropeSin.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), h = s.shape[0], p = s.shape[1], f = this.tf.range(0, t, 2, "int32"), l = this.tf.range(1, t, 2, "int32"), d = (u) => {
+ const m = u.slice([0, 0, 0, 0], [h, p, e, t]), C = t < i ? u.slice([0, 0, 0, t], [h, p, e, i - t]) : null, g = this.tf.gather(m, f, 3), D = this.tf.gather(m, l, 3), x = g.mul(c).sub(D.mul(a)), k = D.mul(c).add(g.mul(a)), S = this.tf.stack([x, k], -1).reshape([h, p, e, t]);
+ return C ? this.tf.concat([S, C], 3) : S;
+ }, v = d(s), y = d(r);
+ return f.dispose(), l.dispose(), [v, y];
  }
  dispose() {
  this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose(), this.ropeInvFreq.dispose();
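Aside from renames, this file gains public `getCos()`/`getSin()` accessors and a now-public `ensureRopeCache(needed)`, presumably so the standalone `rope` op added in this release can read the cache directly. The gather/stack sequence in `applyRoPE` is the standard rotary-embedding rotation; a scalar sketch of what it does to one head vector (illustrative only):

```ts
// Channel pairs (x[2i], x[2i+1]) are rotated by angle p * invFreq[i], where
// p = pastLen + t is the absolute token position; channels >= rotaryDim are
// passed through unchanged, matching the slice/concat remainder path above.
function ropeRotate(x: number[], p: number, invFreq: number[], rotaryDim: number): number[] {
  const out = x.slice();
  for (let i = 0; i < rotaryDim / 2; i++) {
    const theta = p * invFreq[i];
    const c = Math.cos(theta), s = Math.sin(theta);
    const x0 = x[2 * i], x1 = x[2 * i + 1];
    out[2 * i] = x0 * c - x1 * s;     // even channel
    out[2 * i + 1] = x1 * c + x0 * s; // odd channel
  }
  return out;
}
```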
@@ -1,7 +1,8 @@
1
- import { o as h, c as i, E as o, D as V, F as X, I as Y, H as Z, N as ee, J as te, K as se, O as ne, Q as re, T as ue, h as L, y as ae, U as A, m as ie, V as oe, v as le, d as q, n as C, W as P, x as U, _ as H } from "../index-Dsg28SG6.js";
2
- import { s as ce, r as f } from "../sum-NWazHI7f.js";
3
- import { m } from "../mat_mul-BAYDrXvE.js";
4
- import { c as pe } from "../complex-Cd8sqiBC.js";
1
+ import { o as h, d as i, E as o, K as X, N as Y, O as Z, Q as J, T as ee, U as te, V as se, W as ne, X as re, Y as ue, l as L, I as ae, Z as A, a as ie, _ as oe, D as le, f as q, v as C, $ as P, H as U, a0 as H } from "../index-YPKosni4.js";
2
+ import { r as f } from "../reshape-DmnmKT6r.js";
3
+ import { s as ce } from "../sum-D7fu15XL.js";
4
+ import { m } from "../mat_mul-Bu7bhLms.js";
5
+ import { c as pe } from "../complex-CJ-qCcLB.js";
5
6
  /**
6
7
  * @license
7
8
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -20,7 +21,7 @@ import { c as pe } from "../complex-Cd8sqiBC.js";
20
21
  */
21
22
  function he(t) {
22
23
  const s = { x: i(t, "x", "sigmoid", "float32") };
23
- return o.runKernel(V, s);
24
+ return o.runKernel(X, s);
24
25
  }
25
26
  const fe = /* @__PURE__ */ h({ sigmoid_: he });
26
27
  /**
@@ -41,7 +42,7 @@ const fe = /* @__PURE__ */ h({ sigmoid_: he });
41
42
  */
42
43
  function de(t) {
43
44
  const s = { x: i(t, "x", "elu", "float32") };
44
- return o.runKernel(X, s);
45
+ return o.runKernel(Y, s);
45
46
  }
46
47
  const me = /* @__PURE__ */ h({ elu_: de });
47
48
  /**
@@ -62,7 +63,7 @@ const me = /* @__PURE__ */ h({ elu_: de });
62
63
  */
63
64
  function ge(t) {
64
65
  const s = { input: i(t, "input", "imag") };
65
- return o.runKernel(Y, s);
66
+ return o.runKernel(Z, s);
66
67
  }
67
68
  const $e = /* @__PURE__ */ h({ imag_: ge });
68
69
  /**
@@ -83,7 +84,7 @@ const $e = /* @__PURE__ */ h({ imag_: ge });
83
84
  */
84
85
  function xe(t, e = 0.2) {
85
86
  const n = { x: i(t, "x", "leakyRelu") }, r = { alpha: e };
86
- return o.runKernel(Z, n, r);
87
+ return o.runKernel(J, n, r);
87
88
  }
88
89
  const ke = /* @__PURE__ */ h({ leakyRelu_: xe });
89
90
  /**
@@ -321,8 +322,8 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
321
322
  const [g, $, k, z] = M, d = Ae(f(x, k.shape), k, c);
322
323
  let K, _;
323
324
  if (!s && !n ? (K = m(d, $, !1, !0), _ = m(g, d, !0, !1)) : !s && n ? (K = m(d, $, !1, !1), _ = m(d, g, !0, !1)) : s && !n ? (K = m($, d, !1, !0), _ = m(g, d, !1, !1)) : (K = m($, d, !0, !0), _ = m(d, g, !0, !0)), r != null) {
324
- const Q = Le(z, d);
325
- return [K, _, Q];
325
+ const V = Le(z, d);
326
+ return [K, _, V];
326
327
  } else
327
328
  return [K, _];
328
329
  }, I = {
@@ -345,7 +346,7 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
345
346
  return k([M, g, z, $]), { value: f(z, O), gradFunc: G };
346
347
  })(F, R, S);
347
348
  }
348
- const J = /* @__PURE__ */ h({ fusedMatMul_: Ne });
349
+ const Q = /* @__PURE__ */ h({ fusedMatMul_: Ne });
349
350
  /**
350
351
  * @license
351
352
  * Copyright 2018 Google LLC
@@ -378,7 +379,7 @@ function ve(t, e, s, n) {
378
379
  throw new E(`If rank y >= 3, then the second last dim of y must equal the last dim of x but got x shape = ${t.shape} and y shape = ${e.shape}`);
379
380
  }
380
381
  if (t.rank === 2 && e.rank === 2)
381
- return J({
382
+ return Q({
382
383
  a: t,
383
384
  b: e,
384
385
  transposeA: !1,
@@ -392,7 +393,7 @@ function ve(t, e, s, n) {
392
393
  const l = e.shape.slice(), p = l.pop(), u = l.pop(), a = [...l, p], D = Array.from({ length: e.rank }, (T, y) => y === 0 ? e.rank - 2 : y <= e.rank - 2 ? y - 1 : y);
393
394
  e = f(Re(e, D), [u, -1]);
394
395
  const b = [...r, ...a];
395
- return f(J({
396
+ return f(Q({
396
397
  a: t,
397
398
  b: e,
398
399
  transposeA: !1,
@@ -402,7 +403,7 @@ function ve(t, e, s, n) {
402
403
  }), b);
403
404
  }
404
405
  }
405
- class Pe {
406
+ class Ue {
406
407
  vocabSize;
407
408
  embedDim;
408
409
  tf;
@@ -444,5 +445,5 @@ class Pe {
444
445
  }
445
446
  }
446
447
  export {
447
- Pe as default
448
+ Ue as default
448
449
  };
package/dist/layers/TransformerBlock.d.ts CHANGED
@@ -2,7 +2,9 @@ import { default as TF } from '@tensorflow/tfjs';
  import { GPTConfig } from '../config';
  import { KVCache } from './CausalSelfAttention';
  import { default as RoPECache } from './RoPECache';
- export default class Block {
+ import { default as MemoryProfiler } from '../utilities/profile';
+ import { default as BaseLayer } from './BaseLayer';
+ export default class Block extends BaseLayer {
  private ln1;
  private attn;
  private ln2;
@@ -12,6 +14,7 @@ export default class Block {
  private _trainable;
  skipped: boolean;
  constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
+ setProfiler(value: MemoryProfiler | undefined): void;
  get variables(): TF.Variable[];
  get trainable(): boolean;
  set trainable(value: boolean);
package/dist/layers/TransformerBlock.js CHANGED
@@ -1,7 +1,8 @@
- import r from "./CausalSelfAttention.js";
+ import a from "./CausalSelfAttention.js";
  import o from "./MLP.js";
- import a from "./RMSNorm.js";
- class u {
+ import r from "./RMSNorm.js";
+ import p from "./BaseLayer.js";
+ class f extends p {
  ln1;
  attn;
  ln2;
@@ -11,7 +12,10 @@ class u {
  _trainable = !0;
  skipped = !1;
  constructor(t, i, s, e) {
- this.tf = t, this.index = i, this.ln1 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new r(this.tf, this.index, s, e), this.ln2 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
+ super(), this.tf = t, this.index = i, this.ln1 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new a(this.tf, this.index, s, e), this.ln2 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
+ }
+ setProfiler(t) {
+ this._profiler = t, this.attn.setProfiler(t), this.mlp.setProfiler(t), this.ln1.setProfiler(t), this.ln2.setProfiler(t);
  }
  get variables() {
  return [
@@ -54,5 +58,5 @@ class u {
  }
  }
  export {
- u as default
+ f as default
  };
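`setProfiler` fans the profiler out to every sublayer, and each layer's hot path now brackets its `tidy` body with the `startMemory()`/`endMemory(label)` pair inherited from the new `BaseLayer`. `dist/layers/BaseLayer.js` itself is not shown in this section of the diff, so the following is only a minimal sketch of the contract its call sites imply (the field name, byte accounting, and the profiler's recording method are assumptions):

```ts
import * as tf from "@tensorflow/tfjs";
import type MemoryProfiler from "../utilities/profile";

// Sketch only: a no-op unless a profiler has been attached via setProfiler.
export default class BaseLayer {
  protected _profiler: MemoryProfiler | undefined;
  private bytesBefore = 0;

  setProfiler(value: MemoryProfiler | undefined): void {
    this._profiler = value;
  }

  protected startMemory(): void {
    if (this._profiler) this.bytesBefore = tf.memory().numBytes;
  }

  protected endMemory(label: string): void {
    if (!this._profiler) return;
    const delta = tf.memory().numBytes - this.bytesBefore;
    // Hypothetical recording call -- the real MemoryProfiler API is not
    // visible in this section of the diff:
    // this._profiler.record(label, delta);
    void delta;
  }
}
```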
package/dist/main.js CHANGED
@@ -1,21 +1,23 @@
- import { default as m } from "./NanoGPTModel.js";
- import { default as i } from "./TeachableLLM.js";
- import { default as l } from "./tokeniser/CharTokeniser.js";
- import { default as d } from "./utilities/waitForModel.js";
- import { default as x } from "./data/textLoader.js";
- import { estimateMemoryUsage as T, estimateParameterCount as g, estimateResources as M, estimateTrainingMemoryUsage as C, validateConfig as c } from "./utilities/parameters.js";
+ import { default as s } from "./NanoGPTModel.js";
+ import { default as p } from "./TeachableLLM.js";
+ import { default as d } from "./tokeniser/CharTokeniser.js";
+ import { default as x } from "./utilities/waitForModel.js";
+ import { default as T } from "./data/textLoader.js";
+ import { estimateMemoryUsage as M, estimateParameterCount as C, estimateResources as c, estimateTrainingMemoryUsage as h, validateConfig as y } from "./utilities/parameters.js";
  import "./ops/scatterSub.js";
  import "./ops/gatherSub.js";
  import "./ops/attentionMask.js";
+ import "./ops/qkv.js";
+ import "./ops/rope.js";
  export {
- l as CharTokeniser,
- m as NanoGPT,
- i as TeachableLLM,
- T as estimateMemoryUsage,
- g as estimateParameterCount,
- M as estimateResources,
- C as estimateTrainingMemoryUsage,
- x as loadTextData,
- c as validateConfig,
- d as waitForModel
+ d as CharTokeniser,
+ s as NanoGPT,
+ p as TeachableLLM,
+ M as estimateMemoryUsage,
+ C as estimateParameterCount,
+ c as estimateResources,
+ h as estimateTrainingMemoryUsage,
+ T as loadTextData,
+ y as validateConfig,
+ x as waitForModel
  };
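The public surface is unchanged: the re-export list is identical, only the minified local bindings moved, and the two new op modules (`ops/qkv.js`, `ops/rope.js`) are imported purely for their kernel-registration side effects. Existing consumer imports resolve exactly as before, e.g.:

```ts
// These named exports are the ones listed in the diff above, so imports that
// worked against 0.2.9 keep working against 0.2.11.
import {
  TeachableLLM,
  NanoGPT,
  CharTokeniser,
  estimateParameterCount,
  validateConfig,
} from "@genai-fi/nanogpt";
```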
package/dist/{mat_mul-BAYDrXvE.js → mat_mul-Bu7bhLms.js} RENAMED
@@ -1,4 +1,4 @@
- import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
+ import { o as m, d as s, f as c, E as M, B as f } from "./index-YPKosni4.js";
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -15,13 +15,13 @@ import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
  * limitations under the License.
  * =============================================================================
  */
- function f(e, o, n = !1, l = !1) {
+ function p(e, o, n = !1, l = !1) {
  let a = s(e, "a", "matMul"), t = s(o, "b", "matMul");
- [a, t] = m(a, t);
+ [a, t] = c(a, t);
  const r = { a, b: t }, u = { transposeA: n, transposeB: l };
- return M.runKernel(p, r, u);
+ return M.runKernel(f, r, u);
  }
- const i = /* @__PURE__ */ c({ matMul_: f });
+ const i = /* @__PURE__ */ m({ matMul_: p });
  export {
  i as m
  };
package/dist/ops/attentionMask.js CHANGED
@@ -1,14 +1,14 @@
- import { engine as l } from "@tensorflow/tfjs";
- import { r as u, b as k, s as d } from "../index-Dsg28SG6.js";
- import { m as p } from "../mat_mul-BAYDrXvE.js";
- class f {
+ import { engine as k } from "@tensorflow/tfjs";
+ import { r as m, c as d, s as p } from "../index-YPKosni4.js";
+ import { m as f } from "../mat_mul-Bu7bhLms.js";
+ class h {
  variableNames = ["q", "k", "mask"];
  outputShape;
  userCode;
  // enableShapeUniforms = true;
  customUniforms = [{ name: "divisor", type: "float" }];
- constructor(s, n, e, a) {
- this.outputShape = [s, n, e, e], this.userCode = `
+ constructor(e, n, s, a) {
+ this.outputShape = [e, n, s, s], this.userCode = `
  void main() {
  ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
  int b = coords.x;
@@ -34,49 +34,55 @@ class f {
  `;
  }
  }
- function h(t) {
- const { q: s, k: n, mask: e } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = s.shape[0], i = s.shape[2], c = s.shape[1], m = new f(r, c, i, s.shape[3]);
- return o.runWebGLProgram(m, [s, n, e], "float32", [[a]]);
+ function v(t) {
+ const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = e.shape[0], i = e.shape[2], c = e.shape[1], u = new h(r, c, i, e.shape[3]);
+ return o.runWebGLProgram(u, [e, n, s], "float32", [[a]]);
  }
- const v = {
+ const b = {
  kernelName: "AttentionMask",
  backendName: "webgl",
- kernelFunc: h
+ kernelFunc: v
  };
- u(v);
- function b(t) {
- const { q: s, k: n, mask: e } = t.inputs, { divisor: a } = t.attrs, o = s.shape[2], i = p(s, n, !1, !0).mul(d(a)), c = e.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+ m(b);
+ function l(t) {
+ const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = e.shape[2], i = f(e, n, !1, !0).mul(p(a)), c = s.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
  return i.add(c);
  }
  const M = {
  kernelName: "AttentionMask",
  backendName: "cpu",
- kernelFunc: b
+ kernelFunc: l
  };
- u(M);
- function w(t, s, n, e) {
- return l().runKernel("AttentionMask", { q: t, k: s, mask: n }, { divisor: e });
- }
+ m(M);
  const g = {
+ kernelName: "AttentionMask",
+ backendName: "tensorflow",
+ kernelFunc: l
+ };
+ m(g);
+ function N(t, e, n, s) {
+ return k().runKernel("AttentionMask", { q: t, k: e, mask: n }, { divisor: s });
+ }
+ const A = {
  kernelName: "AttentionMask",
  inputsToSave: ["q", "k"],
  outputsToSave: [],
- gradFunc: (t, s, n) => {
+ gradFunc: (t, e, n) => {
  if (Array.isArray(t))
  throw new Error("Expected dy to be a single Tensor");
- const [e, a] = s, { divisor: o } = n;
+ const [s, a] = e, { divisor: o } = n;
  return {
  q: () => t.matMul(a).mul(o),
- k: () => e.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
+ k: () => s.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
  mask: () => t,
  divisor: () => {
- const r = e.matMul(a, !1, !0);
+ const r = s.matMul(a, !1, !0);
  return t.mul(r).sum();
  }
  };
  }
  };
- k(g);
+ d(A);
  export {
- w as attentionMask
+ N as attentionMask
  };
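The substantive addition in this file is a third registration: the composite kernel that previously served only the `cpu` backend is now also registered for the `"tensorflow"` backend (tfjs-node), next to the existing WebGL shader program. De-minified, the pattern is roughly the following sketch against the public tfjs API (variable names are illustrative; the gradient mirrors the `gradFunc` in the diff, minus its `divisor` term):

```ts
import * as tf from "@tensorflow/tfjs";

const kernelName = "AttentionMask";

// Composite implementation: scaled q @ k^T plus a sliced causal mask, as in
// the cpu/"tensorflow" kernelFunc above.
function attentionMaskComposite(args: {
  inputs: tf.NamedTensorMap;
  attrs?: { divisor: number };
}): tf.Tensor {
  const { q, k, mask } = args.inputs as { q: tf.Tensor4D; k: tf.Tensor4D; mask: tf.Tensor2D };
  const { divisor } = args.attrs!;
  const T = q.shape[2];
  const scores = tf.matMul(q, k, false, true).mul(tf.scalar(divisor));
  return scores.add(mask.slice([0, 0], [T, T]).expandDims(0).expandDims(0));
}

// One kernel name, several backends; 0.2.11 adds the "tensorflow" entry.
for (const backendName of ["cpu", "tensorflow"]) {
  tf.registerKernel({ kernelName, backendName, kernelFunc: attentionMaskComposite as any });
}

// Registering a gradient makes the custom op usable during training.
tf.registerGradient({
  kernelName,
  inputsToSave: ["q", "k"],
  outputsToSave: [],
  gradFunc: (dy, saved, attrs) => {
    const g = dy as tf.Tensor4D;
    const [q, k] = saved as [tf.Tensor4D, tf.Tensor4D];
    const { divisor } = attrs as unknown as { divisor: number };
    return {
      q: () => g.matMul(k).mul(divisor),
      k: () => q.transpose([0, 1, 3, 2]).matMul(g).mul(divisor).transpose([0, 1, 3, 2]),
      mask: () => g,
    };
  },
});
```

Callers then invoke the op through `tf.engine().runKernel("AttentionMask", { q, k, mask }, { divisor })`, which is what the exported `attentionMask` wrapper does.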