npm - @genai-fi/nanogpt - Versions diffs - 0.2.9 → 0.2.10 - Mend

@genai-fi/nanogpt 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/dist/Generator.d.ts +2 -0
package/dist/Generator.js +37 -32
package/dist/NanoGPTModel.d.ts +4 -1
package/dist/NanoGPTModel.js +33 -25
package/dist/TeachableLLM.d.ts +4 -0
package/dist/TeachableLLM.js +31 -16
package/dist/{complex-Cd8sqiBC.js → complex-x7w5HPOS.js} +6 -6
package/dist/{index-Dsg28SG6.js → index-CWQLouWz.js} +39 -35
package/dist/layers/BaseLayer.d.ts +8 -0
package/dist/layers/BaseLayer.js +18 -0
package/dist/layers/CausalSelfAttention.d.ts +2 -1
package/dist/layers/CausalSelfAttention.js +10 -8
package/dist/layers/MLP.d.ts +2 -1
package/dist/layers/MLP.js +16 -14
package/dist/layers/RMSNorm.d.ts +2 -1
package/dist/layers/RMSNorm.js +13 -11
package/dist/layers/TiedEmbedding.js +21 -21
package/dist/layers/TransformerBlock.d.ts +4 -1
package/dist/layers/TransformerBlock.js +9 -5
package/dist/{mat_mul-BAYDrXvE.js → mat_mul-4v7St11W.js} +5 -5
package/dist/ops/attentionMask.js +31 -25
package/dist/ops/gatherSub.js +2 -2
package/dist/ops/node/sparseCrossEntropy.js +1 -1
package/dist/ops/scatterSub.js +8 -8
package/dist/{stack-1o648CP_.js → stack-CTdK-itU.js} +7 -7
package/dist/{sum-NWazHI7f.js → sum-CnIf1YOh.js} +3 -3
package/dist/training/AdamExt.js +1 -1
package/dist/training/Trainer.js +30 -29
package/dist/training/sparseCrossEntropy.js +9 -9
package/dist/utilities/profile.d.ts +10 -0
package/dist/utilities/profile.js +29 -0
package/package.json +1 -1

package/dist/layers/MLP.js CHANGED Viewed

@@ -1,31 +1,32 @@
-class l {
+import a from "./BaseLayer.js";
+class l extends a {
   cFc;
   cProj;
   dropout;
   tf;
   index;
   _trainable = !0;
-  constructor(t, e, i) {
-    this.tf = t, this.index = e, this.cFc = this.tf.layers.dense({
-      units: i.mlpFactor * i.nEmbed,
+  constructor(t, i, e) {
+    super(), this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
+      units: e.mlpFactor * e.nEmbed,
       activation: "gelu",
-      useBias: i.biasInLinear,
+      useBias: e.biasInLinear,
       kernelInitializer: this.tf.initializers.randomNormal({
         mean: 0,
         stddev: 0.02
       }),
       biasInitializer: "zeros",
-      name: `block_${e}_mlp_cFc`
+      name: `block_${i}_mlp_cFc`
     }), this.cProj = this.tf.layers.dense({
-      units: i.nEmbed,
-      useBias: i.biasInLinear,
+      units: e.nEmbed,
+      useBias: e.biasInLinear,
       kernelInitializer: this.tf.initializers.randomNormal({
         mean: 0,
-        stddev: 0.02 / Math.sqrt(2 * i.nLayer)
+        stddev: 0.02 / Math.sqrt(2 * e.nLayer)
       }),
       biasInitializer: "zeros",
-      name: `block_${e}_mlp_cProj`
-    }), this.dropout = this.tf.layers.dropout({ rate: i.dropout });
+      name: `block_${i}_mlp_cProj`
+    }), this.dropout = this.tf.layers.dropout({ rate: e.dropout });
   }
   get variables() {
     return [
@@ -45,10 +46,11 @@ class l {
   loadWeights(t) {
     this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
   }
-  call(t, e = !1) {
+  call(t, i = !1) {
     return this.tf.tidy(() => {
-      const i = this.cFc.apply(t), s = this.cProj.apply(i);
-      return this.dropout.apply(s, { training: e });
+      this.startMemory();
+      const e = this.cFc.apply(t), s = this.cProj.apply(e), r = this.dropout.apply(s, { training: i });
+      return this.endMemory("MLP"), r;
     });
   }
   dispose() {

package/dist/layers/RMSNorm.d.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import { default as TF } from '@tensorflow/tfjs';
-export default class RMSNorm {
+import { default as BaseLayer } from './BaseLayer';
+export default class RMSNorm extends BaseLayer {
     private gamma;
     private epsilon;
     private tf;

package/dist/layers/RMSNorm.js CHANGED Viewed

@@ -1,26 +1,28 @@
-class m {
+import m from "./BaseLayer.js";
+class o extends m {
   gamma;
   epsilon;
   tf;
-  constructor(a, s, t = 1e-8, e = "") {
-    this.tf = a, this.epsilon = t, this.gamma = a.variable(a.ones(s), !0, `${e}_gamma`, "float32");
+  constructor(t, s, a = 1e-8, e = "") {
+    super(), this.tf = t, this.epsilon = a, this.gamma = t.variable(t.ones(s), !0, `${e}_gamma`, "float32");
   }
   get trainableWeights() {
     return [this.gamma];
   }
-  set trainable(a) {
-    this.gamma.trainable = a;
+  set trainable(t) {
+    this.gamma.trainable = t;
   }
   getWeights() {
     return [this.gamma];
   }
-  setWeights(a) {
-    this.gamma.assign(a[0]);
+  setWeights(t) {
+    this.gamma.assign(t[0]);
   }
-  apply(a) {
+  apply(t) {
     return this.tf.tidy(() => {
-      const t = a.square().mean(-1, !0).add(this.epsilon).rsqrt();
-      return a.mul(t).mul(this.gamma);
+      this.startMemory();
+      const a = t.square().mean(-1, !0).add(this.epsilon).rsqrt(), r = t.mul(a).mul(this.gamma);
+      return this.endMemory("RMSNorm"), r;
     });
   }
   dispose() {
@@ -28,5 +30,5 @@ class m {
   }
 }
 export {
-  m as default
+  o as default
 };

package/dist/layers/TiedEmbedding.js CHANGED Viewed

@@ -1,7 +1,7 @@
-import { o as h, c as i, E as o, D as V, F as X, I as Y, H as Z, N as ee, J as te, K as se, O as ne, Q as re, T as ue, h as L, y as ae, U as A, m as ie, V as oe, v as le, d as q, n as C, W as P, x as U, _ as H } from "../index-Dsg28SG6.js";
-import { s as ce, r as f } from "../sum-NWazHI7f.js";
-import { m } from "../mat_mul-BAYDrXvE.js";
-import { c as pe } from "../complex-Cd8sqiBC.js";
+import { o as h, d as i, E as o, F as V, H as X, I as Y, J as Z, N as ee, K as te, O as se, Q as ne, T as re, U as ue, i as L, z as ae, V as A, a as ie, W as oe, w as le, f as q, p as C, X as P, y as U, _ as H } from "../index-CWQLouWz.js";
+import { s as ce, r as f } from "../sum-CnIf1YOh.js";
+import { m } from "../mat_mul-4v7St11W.js";
+import { c as pe } from "../complex-x7w5HPOS.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -169,7 +169,7 @@ function Me(t) {
   const s = { x: i(t, "x", "relu") };
   return o.runKernel(ne, s);
 }
-const We = /* @__PURE__ */ h({ relu_: Me });
+const we = /* @__PURE__ */ h({ relu_: Me });
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -186,11 +186,11 @@ const We = /* @__PURE__ */ h({ relu_: Me });
  * limitations under the License.
  * =============================================================================
  */
-function we(t) {
+function We(t) {
   const s = { x: i(t, "x", "relu6") };
   return o.runKernel(re, s);
 }
-const ze = /* @__PURE__ */ h({ relu6_: we });
+const ze = /* @__PURE__ */ h({ relu6_: We });
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -273,7 +273,7 @@ function Te(t, e, s, n) {
   if (e === "linear")
     return t;
   if (e === "relu")
-    return We(t);
+    return we(t);
   if (e === "elu")
     return me(t);
   if (e === "relu6")
@@ -310,14 +310,14 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
   }
   let u = i(t, "a", "fused matMul"), a = i(e, "b", "fused matMul");
   [u, a] = q(u, a);
-  const D = s ? u.shape[u.rank - 2] : u.shape[u.rank - 1], b = n ? a.shape[a.rank - 1] : a.shape[a.rank - 2], W = s ? u.shape[u.rank - 1] : u.shape[u.rank - 2], w = n ? a.shape[a.rank - 2] : a.shape[a.rank - 1], T = u.shape.slice(0, -2), y = a.shape.slice(0, -2), B = C(T), N = C(y);
+  const D = s ? u.shape[u.rank - 2] : u.shape[u.rank - 1], b = n ? a.shape[a.rank - 1] : a.shape[a.rank - 2], w = s ? u.shape[u.rank - 1] : u.shape[u.rank - 2], W = n ? a.shape[a.rank - 2] : a.shape[a.rank - 1], T = u.shape.slice(0, -2), y = a.shape.slice(0, -2), B = C(T), N = C(y);
   L(D === b, () => `Error in fused matMul: inner shapes (${D}) and (${b}) of Tensors with shapes ${u.shape} and ${a.shape} and transposeA=${s} and transposeB=${n} must match.`);
-  const O = P(u.shape.slice(0, -2), a.shape.slice(0, -2)).concat([W, w]), F = s ? f(u, [B, D, W]) : f(u, [B, W, D]), R = n ? f(a, [N, w, b]) : f(a, [N, b, w]);
+  const O = P(u.shape.slice(0, -2), a.shape.slice(0, -2)).concat([w, W]), F = s ? f(u, [B, D, w]) : f(u, [B, w, D]), R = n ? f(a, [N, W, b]) : f(a, [N, b, W]);
   let S;
   r != null && (S = i(r, "bias", "fused matMul"), [S] = q(S, u), P(O, S.shape));
-  let v;
-  l != null && (v = i(l, "prelu weights", "fused matMul"));
-  const G = (x, M) => {
+  let G;
+  l != null && (G = i(l, "prelu weights", "fused matMul"));
+  const I = (x, M) => {
     const [g, $, k, z] = M, d = Ae(f(x, k.shape), k, c);
     let K, _;
     if (!s && !n ? (K = m(d, $, !1, !0), _ = m(g, d, !0, !1)) : !s && n ? (K = m(d, $, !1, !1), _ = m(d, g, !0, !1)) : s && !n ? (K = m($, d, !1, !0), _ = m(g, d, !1, !1)) : (K = m($, d, !0, !0), _ = m(d, g, !0, !0)), r != null) {
@@ -325,24 +325,24 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
       return [K, _, Q];
     } else
       return [K, _];
-  }, I = {
+  }, v = {
     a: F,
     b: R,
     bias: S,
-    preluActivationWeights: v
+    preluActivationWeights: G
   }, j = { transposeA: s, transposeB: n, activation: c, leakyreluAlpha: p };
   return r == null ? U((M, g, $) => {
     const k = (
       // tslint:disable-next-line: no-unnecessary-type-assertion
-      o.runKernel(H, I, j)
+      o.runKernel(H, v, j)
     );
-    return $([M, g, k]), { value: f(k, O), gradFunc: G };
+    return $([M, g, k]), { value: f(k, O), gradFunc: I };
   })(F, R) : U((M, g, $, k) => {
     const z = (
       // tslint:disable-next-line: no-unnecessary-type-assertion
-      o.runKernel(H, I, j)
+      o.runKernel(H, v, j)
     );
-    return k([M, g, z, $]), { value: f(z, O), gradFunc: G };
+    return k([M, g, z, $]), { value: f(z, O), gradFunc: I };
   })(F, R, S);
 }
 const J = /* @__PURE__ */ h({ fusedMatMul_: Ne });
@@ -369,7 +369,7 @@ class E extends Error {
  * https://opensource.org/licenses/MIT.
  * =============================================================================
  */
-function ve(t, e, s, n) {
+function Ge(t, e, s, n) {
   if (t.rank < 2 || e.rank < 2)
     throw new E(`dot requires both inputs to be rank >= 2 but got x shape = ${t.shape} and y shape = ${e.shape}`);
   if (e.rank >= 3) {
@@ -425,7 +425,7 @@ class Pe {
     return this.tf.gather(this.tiedWeights, e, 0);
   }
   project(e) {
-    return ve(e, this.tiedWeights.transpose());
+    return Ge(e, this.tiedWeights.transpose());
   }
   getWeights() {
     return [this.tiedWeights];

package/dist/layers/TransformerBlock.d.ts CHANGED Viewed

@@ -2,7 +2,9 @@ import { default as TF } from '@tensorflow/tfjs';
 import { GPTConfig } from '../config';
 import { KVCache } from './CausalSelfAttention';
 import { default as RoPECache } from './RoPECache';
-export default class Block {
+import { default as MemoryProfiler } from '../utilities/profile';
+import { default as BaseLayer } from './BaseLayer';
+export default class Block extends BaseLayer {
     private ln1;
     private attn;
     private ln2;
@@ -12,6 +14,7 @@ export default class Block {
     private _trainable;
     skipped: boolean;
     constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
+    setProfiler(value: MemoryProfiler | undefined): void;
     get variables(): TF.Variable[];
     get trainable(): boolean;
     set trainable(value: boolean);

package/dist/layers/TransformerBlock.js CHANGED Viewed

@@ -1,7 +1,8 @@
-import r from "./CausalSelfAttention.js";
+import a from "./CausalSelfAttention.js";
 import o from "./MLP.js";
-import a from "./RMSNorm.js";
-class u {
+import r from "./RMSNorm.js";
+import p from "./BaseLayer.js";
+class f extends p {
   ln1;
   attn;
   ln2;
@@ -11,7 +12,10 @@ class u {
   _trainable = !0;
   skipped = !1;
   constructor(t, i, s, e) {
-    this.tf = t, this.index = i, this.ln1 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new r(this.tf, this.index, s, e), this.ln2 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
+    super(), this.tf = t, this.index = i, this.ln1 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new a(this.tf, this.index, s, e), this.ln2 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
+  }
+  setProfiler(t) {
+    this._profiler = t, this.attn.setProfiler(t), this.mlp.setProfiler(t), this.ln1.setProfiler(t), this.ln2.setProfiler(t);
   }
   get variables() {
     return [
@@ -54,5 +58,5 @@ class u {
   }
 }
 export {
-  u as default
+  f as default
 };

package/dist/{mat_mul-BAYDrXvE.js → mat_mul-4v7St11W.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
+import { o as m, d as s, f as c, E as M, B as f } from "./index-CWQLouWz.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -15,13 +15,13 @@ import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
  * limitations under the License.
  * =============================================================================
  */
-function f(e, o, n = !1, l = !1) {
+function p(e, o, n = !1, l = !1) {
   let a = s(e, "a", "matMul"), t = s(o, "b", "matMul");
-  [a, t] = m(a, t);
+  [a, t] = c(a, t);
   const r = { a, b: t }, u = { transposeA: n, transposeB: l };
-  return M.runKernel(p, r, u);
+  return M.runKernel(f, r, u);
 }
-const i = /* @__PURE__ */ c({ matMul_: f });
+const i = /* @__PURE__ */ m({ matMul_: p });
 export {
   i as m
 };

package/dist/ops/attentionMask.js CHANGED Viewed

@@ -1,14 +1,14 @@
-import { engine as l } from "@tensorflow/tfjs";
-import { r as u, b as k, s as d } from "../index-Dsg28SG6.js";
-import { m as p } from "../mat_mul-BAYDrXvE.js";
-class f {
+import { engine as k } from "@tensorflow/tfjs";
+import { r as m, c as d, s as p } from "../index-CWQLouWz.js";
+import { m as f } from "../mat_mul-4v7St11W.js";
+class h {
   variableNames = ["q", "k", "mask"];
   outputShape;
   userCode;
   // enableShapeUniforms = true;
   customUniforms = [{ name: "divisor", type: "float" }];
-  constructor(s, n, e, a) {
-    this.outputShape = [s, n, e, e], this.userCode = `
+  constructor(e, n, s, a) {
+    this.outputShape = [e, n, s, s], this.userCode = `
         void main() {
             ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
             int b = coords.x;
@@ -34,49 +34,55 @@ class f {
         `;
   }
 }
-function h(t) {
-  const { q: s, k: n, mask: e } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = s.shape[0], i = s.shape[2], c = s.shape[1], m = new f(r, c, i, s.shape[3]);
-  return o.runWebGLProgram(m, [s, n, e], "float32", [[a]]);
+function v(t) {
+  const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = e.shape[0], i = e.shape[2], c = e.shape[1], u = new h(r, c, i, e.shape[3]);
+  return o.runWebGLProgram(u, [e, n, s], "float32", [[a]]);
 }
-const v = {
+const b = {
   kernelName: "AttentionMask",
   backendName: "webgl",
-  kernelFunc: h
+  kernelFunc: v
 };
-u(v);
-function b(t) {
-  const { q: s, k: n, mask: e } = t.inputs, { divisor: a } = t.attrs, o = s.shape[2], i = p(s, n, !1, !0).mul(d(a)), c = e.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+m(b);
+function l(t) {
+  const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = e.shape[2], i = f(e, n, !1, !0).mul(p(a)), c = s.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
   return i.add(c);
 }
 const M = {
   kernelName: "AttentionMask",
   backendName: "cpu",
-  kernelFunc: b
+  kernelFunc: l
 };
-u(M);
-function w(t, s, n, e) {
-  return l().runKernel("AttentionMask", { q: t, k: s, mask: n }, { divisor: e });
-}
+m(M);
 const g = {
+  kernelName: "AttentionMask",
+  backendName: "tensorflow",
+  kernelFunc: l
+};
+m(g);
+function N(t, e, n, s) {
+  return k().runKernel("AttentionMask", { q: t, k: e, mask: n }, { divisor: s });
+}
+const A = {
   kernelName: "AttentionMask",
   inputsToSave: ["q", "k"],
   outputsToSave: [],
-  gradFunc: (t, s, n) => {
+  gradFunc: (t, e, n) => {
     if (Array.isArray(t))
       throw new Error("Expected dy to be a single Tensor");
-    const [e, a] = s, { divisor: o } = n;
+    const [s, a] = e, { divisor: o } = n;
     return {
       q: () => t.matMul(a).mul(o),
-      k: () => e.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
+      k: () => s.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
       mask: () => t,
       divisor: () => {
-        const r = e.matMul(a, !1, !0);
+        const r = s.matMul(a, !1, !0);
         return t.mul(r).sum();
       }
     };
   }
 };
-k(g);
+d(A);
 export {
-  w as attentionMask
+  N as attentionMask
 };

package/dist/ops/gatherSub.js CHANGED Viewed

@@ -1,6 +1,6 @@
 import { engine as l } from "@tensorflow/tfjs";
-import { o as g, c as i, E as b, G as d, r as c, a as h } from "../index-Dsg28SG6.js";
-import { r as p, s as f } from "../stack-1o648CP_.js";
+import { o as g, d as i, E as b, G as d, r as c, b as h } from "../index-CWQLouWz.js";
+import { r as p, s as f } from "../stack-CTdK-itU.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.

package/dist/ops/node/sparseCrossEntropy.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { r as o } from "../../index-Dsg28SG6.js";
+import { r as o } from "../../index-CWQLouWz.js";
 function r(e) {
   const { logits: t, labels: n } = e.inputs;
   return e.backend.executeMultipleOutputs("SparseSoftmaxCrossEntropyWithLogits", [], [t, n], 2);

package/dist/ops/scatterSub.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { engine as $ } from "@tensorflow/tfjs";
-import { k as u, l as S, n as p, E as f, p as E, o as N, c as l, q as y, r as h, a as D, m as x } from "../index-Dsg28SG6.js";
-import { c as m } from "../complex-Cd8sqiBC.js";
-import { r as v, s as T } from "../stack-1o648CP_.js";
+import { l as u, n as S, p, E as f, q as E, o as N, d as l, t as y, r as h, b as D, a as x } from "../index-CWQLouWz.js";
+import { c as d } from "../complex-x7w5HPOS.js";
+import { r as v, s as T } from "../stack-CTdK-itU.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -21,7 +21,7 @@ import { r as v, s as T } from "../stack-1o648CP_.js";
 function i(e, t = "float32") {
   if (u(e), t === "complex64") {
     const a = i(e, "float32"), o = i(e, "float32");
-    return m(a, o);
+    return d(a, o);
   }
   const r = S(p(e), t);
   return f.makeTensor(r, e, t);
@@ -42,10 +42,10 @@ function i(e, t = "float32") {
  * limitations under the License.
  * =============================================================================
  */
-function d(e, t = "float32") {
+function m(e, t = "float32") {
   if (u(e), t === "complex64") {
-    const a = d(e, "float32"), o = i(e, "float32");
-    return m(a, o);
+    const a = m(e, "float32"), o = i(e, "float32");
+    return d(a, o);
   }
   const r = E(p(e), t);
   return f.makeTensor(r, e, t);
@@ -133,7 +133,7 @@ const K = {
 };
 h(K);
 function A(e) {
-  const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b = d([o]), g = I(c, b, [o, s]), k = D(t, g), w = a.reshape([o, 1]);
+  const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b = m([o]), g = I(c, b, [o, s]), k = D(t, g), w = a.reshape([o, 1]);
   return x(k, w);
 }
 const F = {

package/dist/{stack-1o648CP_.js → stack-CTdK-itU.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { E as e, R as c, o as f, g as u, h as a, P as i } from "./index-Dsg28SG6.js";
+import { E as e, R as c, o as f, h as i, i as a, P as u } from "./index-CWQLouWz.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -15,7 +15,7 @@ import { E as e, R as c, o as f, g as u, h as a, P as i } from "./index-Dsg28SG6
  * limitations under the License.
  * =============================================================================
  */
-function h(n, s, t = 1, r = "float32") {
+function l(n, s, t = 1, r = "float32") {
   if (t === 0)
     throw new Error("Cannot have a step of zero");
   const o = { start: n, stop: s, step: t, dtype: r };
@@ -38,13 +38,13 @@ function h(n, s, t = 1, r = "float32") {
  * =============================================================================
  */
 function k(n, s = 0) {
-  const t = u(n, "tensors", "stack", "string_or_numeric");
+  const t = i(n, "tensors", "stack", "string_or_numeric");
   a(t.length >= 1, () => "Pass at least one tensor to tf.stack"), t.length > 0 && a(s <= t[0].rank, () => "Axis must be <= rank of the tensor");
   const r = t, o = { axis: s };
-  return e.runKernel(i, r, o);
+  return e.runKernel(u, r, o);
 }
-const l = /* @__PURE__ */ f({ stack_: k });
+const g = /* @__PURE__ */ f({ stack_: k });
 export {
-  h as r,
-  l as s
+  l as r,
+  g as s
 };

package/dist/{sum-NWazHI7f.js → sum-CnIf1YOh.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { o, c as a, E as u, i, j as p, S as x } from "./index-Dsg28SG6.js";
+import { o, d as a, E as u, j as p, k as i, S as x } from "./index-CWQLouWz.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -17,7 +17,7 @@ import { o, c as a, E as u, i, j as p, S as x } from "./index-Dsg28SG6.js";
  */
 function l(n, t) {
   const s = { x: a(n, "x", "reshape", "string_or_numeric") }, r = { shape: t };
-  return u.runKernel(i, s, r);
+  return u.runKernel(p, s, r);
 }
 const h = /* @__PURE__ */ o({ reshape_: l });
 /**
@@ -38,7 +38,7 @@ const h = /* @__PURE__ */ o({ reshape_: l });
  */
 function m(n, t = null, e = !1) {
   let s = a(n, "x", "sum");
-  s.dtype === "bool" && (s = p(s, "int32"));
+  s.dtype === "bool" && (s = i(s, "int32"));
   const r = { x: s }, c = { axis: t, keepDims: e };
   return u.runKernel(x, r, c);
 }

package/dist/training/AdamExt.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { A as r, m as c, s as h, a as g, e as o } from "../index-Dsg28SG6.js";
+import { A as r, a as c, s as h, b as g, e as o } from "../index-CWQLouWz.js";
 class u extends r {
   constructor(t, e, s, a, i) {
     super(t, e, s, a), this.config = i, this.startLearningRate = t;

package/dist/training/Trainer.js CHANGED Viewed

@@ -1,8 +1,8 @@
 import { DatasetBuilder as d } from "./DatasetBuilder.js";
-import p from "./AdamExt.js";
-class u {
-  constructor(t, e, s, i = 1e-3) {
-    this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, s, e.config.blockSize);
+import h from "./AdamExt.js";
+class g {
+  constructor(t, s, e, i = 1e-3) {
+    this.tokenizer = e, this.tf = t, this.model = s, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, e, s.config.blockSize);
   }
   model;
   optimizer;
@@ -25,7 +25,7 @@ class u {
   }
   resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
     this.optimizer && this.optimizer.dispose();
-    const e = new p(
+    const s = new h(
       t.learningRateFactor * this.learningRate,
       t.beta1,
       t.beta2,
@@ -37,58 +37,59 @@ class u {
         weightDecay: 0
       }
     );
-    this.optimizer = e;
+    this.optimizer = s;
   }
   printGradients(t) {
-    Object.keys(t).forEach((e) => {
-      const s = t[e];
-      console.log(`${e}:`), console.log(`  Shape: ${s.shape}`), console.log(`  Mean: ${this.tf.mean(s).dataSync()[0]}`), console.log(`  Std: ${this.tf.moments(s).variance.sqrt().dataSync()[0]}`), console.log(`  Min: ${this.tf.min(s).dataSync()[0]}`), console.log(`  Max: ${this.tf.max(s).dataSync()[0]}`), console.log(`  Norm: ${this.tf.norm(s).dataSync()[0]}`);
+    Object.keys(t).forEach((s) => {
+      const e = t[s];
+      console.log(`${s}:`), console.log(`  Shape: ${e.shape}`), console.log(`  Mean: ${this.tf.mean(e).dataSync()[0]}`), console.log(`  Std: ${this.tf.moments(e).variance.sqrt().dataSync()[0]}`), console.log(`  Min: ${this.tf.min(e).dataSync()[0]}`), console.log(`  Max: ${this.tf.max(e).dataSync()[0]}`), console.log(`  Norm: ${this.tf.norm(e).dataSync()[0]}`);
     });
   }
-  trainStep(t, e = !1, s = !1) {
+  trainStep(t, s = !1, e = !1) {
     return this.tf.tidy(() => {
+      this.model.getProfiler()?.startMemory();
       const { xs: i, ys: a } = t, o = () => {
         const { loss: l, logits: c } = this.model.forward(i, a, !0);
         return c.dispose(), l;
       }, { value: n, grads: r } = this.tf.variableGrads(o);
-      return e || (s && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.tf.dispose(r)), n;
+      return s ? this.model.getProfiler()?.endMemory("Training") : (e && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), this.tf.dispose(r)), n;
     });
   }
   dummyPass() {
-    const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), e = this.tf.zeros([1, this.model.config.blockSize], "int32");
+    const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), s = this.tf.zeros([1, this.model.config.blockSize], "int32");
     try {
-      const s = this.trainStep({ xs: t, ys: e }, !0);
-      s.dataSync(), s.dispose();
-    } catch (s) {
-      console.error("Error during dummy pass:", s);
+      const e = this.trainStep({ xs: t, ys: s }, !0);
+      e.dataSync(), e.dispose();
+    } catch (e) {
+      console.error("Error during dummy pass:", e);
     } finally {
-      t.dispose(), e.dispose();
+      t.dispose(), s.dispose();
     }
   }
-  async trainBatch(t, e) {
+  async trainBatch(t, s) {
     try {
-      const s = this.trainStep(e, !1, !1);
-      return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
-    } catch (s) {
-      throw console.error(`Error processing batch at step ${t.step}:`, s), this.tf.dispose(), s;
+      const e = this.trainStep(s, !1, !1);
+      return s.xs.dispose(), s.ys.dispose(), t.step++, t.totalSteps++, e.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), e.dispose(), t.lastLoss));
+    } catch (e) {
+      throw console.error(`Error processing batch at step ${t.step}:`, e), this.tf.dispose(), e;
     }
   }
-  async createTrainValidationSplit(t, e = 32, s = 0.1) {
-    const i = await this.datasetBuilder.createTextDataset(t, e, 0, 1 - s), a = await this.datasetBuilder.createTextDataset(
+  async createTrainValidationSplit(t, s = 32, e = 0.1) {
+    const i = await this.datasetBuilder.createTextDataset(t, s, 0, 1 - e), a = await this.datasetBuilder.createTextDataset(
       t,
-      e,
-      1 - s,
+      s,
+      1 - e,
       1
     );
     return { trainDataset: i, validationDataset: a };
   }
-  async createDataset(t, e = 32) {
-    return await this.datasetBuilder.createTextDataset(t, e);
+  async createDataset(t, s = 32) {
+    return await this.datasetBuilder.createTextDataset(t, s);
   }
   dispose() {
     this.optimizer && this.optimizer.dispose();
   }
 }
 export {
-  u as default
+  g as default
 };