npm - @genai-fi/nanogpt - Versions diffs - 0.5.4 → 0.5.5 - Mend

@genai-fi/nanogpt 0.5.4 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102)

package/dist/Generator.js +5 -5
package/dist/NanoGPTModel.d.ts +2 -0
package/dist/NanoGPTModel.js +8 -8
package/dist/{Reshape-Bt_t7RNz.js → Reshape-Biok_3X1.js} +6 -6
package/dist/TeachableLLM.js +1 -1
package/dist/{TiedEmbedding-DORsPlNL.js → TiedEmbedding-8S8xn8e6.js} +5 -5
package/dist/Trainer.d.ts +1 -0
package/dist/Trainer.js +8 -7
package/dist/{axis_util-CVbf1vmL.js → axis_util-BczFISHz.js} +1 -1
package/dist/{broadcast_to-BBoMQXbL.js → broadcast_to-B7NGsBSh.js} +2 -2
package/dist/{concat-BRRtq4S2.js → concat-DdKPyAtw.js} +1 -1
package/dist/{dataset-ZHEPJmED.js → dataset-iqT4Otvb.js} +7 -7
package/dist/{dropout-lQm_YyX3.js → dropout-B09InSJS.js} +1 -1
package/dist/{gather-BWyutxwi.js → gather-D6MsdXqc.js} +1 -1
package/dist/{gpgpu_math-Df7gzJWH.js → gpgpu_math-BFbOyvk4.js} +1 -1
package/dist/{index-CnHyhpKc.js → index-Du-bmOP8.js} +98 -98
package/dist/{kernel_funcs_utils-Dqo82NH4.js → kernel_funcs_utils-DShm7-0k.js} +33 -33
package/dist/layers/BaseLayer.js +2 -2
package/dist/layers/CausalSelfAttention.js +6 -6
package/dist/layers/MLP.js +5 -5
package/dist/layers/RMSNorm.js +3 -3
package/dist/layers/RoPECache.js +3 -3
package/dist/layers/TiedEmbedding.js +6 -6
package/dist/layers/TransformerBlock.js +1 -1
package/dist/{log_sum_exp-CRH7Np9v.js → log_sum_exp-CxfBtUaG.js} +5 -5
package/dist/main.js +1 -1
package/dist/{mat_mul-DeGU1U_C.js → mat_mul-CbiqIe2d.js} +1 -1
package/dist/{max-CcnEArWK.js → max-0Xnlpv8k.js} +1 -1
package/dist/{norm-BpWsOapl.js → norm-01kY9I2B.js} +5 -5
package/dist/{ones-CDWGzVnm.js → ones-CrutWGas.js} +2 -2
package/dist/ops/appendCache.js +3 -3
package/dist/ops/attentionMask.js +1 -1
package/dist/ops/cpu/appendCache.js +2 -2
package/dist/ops/cpu/attentionMask.js +5 -5
package/dist/ops/cpu/fusedSoftmax.js +2 -2
package/dist/ops/cpu/gatherSub.js +3 -3
package/dist/ops/cpu/gelu.js +1 -1
package/dist/ops/cpu/matMulGelu.js +1 -1
package/dist/ops/cpu/matMulMul.js +1 -1
package/dist/ops/cpu/mulDropout.js +1 -1
package/dist/ops/cpu/normRMS.js +1 -1
package/dist/ops/cpu/qkv.js +3 -3
package/dist/ops/cpu/rope.js +5 -5
package/dist/ops/cpu/scatterSub.js +4 -4
package/dist/ops/fusedSoftmax.js +1 -1
package/dist/ops/gatherSub.js +1 -1
package/dist/ops/gelu.js +1 -1
package/dist/ops/grads/attentionMask.js +1 -1
package/dist/ops/grads/fusedSoftmax.js +2 -2
package/dist/ops/grads/gelu.js +1 -1
package/dist/ops/grads/matMulGelu.js +1 -1
package/dist/ops/grads/normRMS.js +1 -1
package/dist/ops/grads/qkv.js +1 -1
package/dist/ops/grads/rope.js +1 -1
package/dist/ops/matMulGelu.js +1 -1
package/dist/ops/matMulMul.js +1 -1
package/dist/ops/mulDrop.js +1 -1
package/dist/ops/node/sparseCrossEntropy.js +1 -1
package/dist/ops/normRMS.js +1 -1
package/dist/ops/qkv.js +1 -1
package/dist/ops/scatterSub.js +1 -1
package/dist/ops/webgl/appendCache.js +1 -1
package/dist/ops/webgl/attentionMask.js +1 -1
package/dist/ops/webgl/fusedSoftmax.js +96 -96
package/dist/ops/webgl/gatherSub.js +1 -1
package/dist/ops/webgl/gelu.js +2 -2
package/dist/ops/webgl/matMulGelu.js +4 -4
package/dist/ops/webgl/matMulMul.js +1 -1
package/dist/ops/webgl/mulDropout.js +1 -1
package/dist/ops/webgl/normRMS.js +2 -2
package/dist/ops/webgl/qkv.js +1 -1
package/dist/ops/webgl/rope.js +1 -1
package/dist/ops/webgl/scatterSub.js +1 -1
package/dist/{ops-DzQTmLIl.js → ops-CJNniCAV.js} +13 -13
package/dist/{random_width-DI2h9CMs.js → random_width-C-v-35bY.js} +1324 -1279
package/dist/{range-CkOJ7090.js → range-Bvs1hidm.js} +1 -1
package/dist/{reshape-CTIbqjwm.js → reshape-BH7eBpwq.js} +1 -1
package/dist/{sin-HzioENy_.js → sin-CPAZXNjH.js} +1 -1
package/dist/{slice_util-n4wHKmex.js → slice_util-DskXqRZa.js} +1 -1
package/dist/{softmax-DX6qXAbm.js → softmax-DhWoBa7r.js} +1 -1
package/dist/{split-CVwhL8Oe.js → split-BCUhuU7B.js} +1 -1
package/dist/{stack-S2-D2JAQ.js → stack-BV1v7l3S.js} +1 -1
package/dist/{sum-UdfvaNhB.js → sum-Cvq06317.js} +1 -1
package/dist/{tensor-IZex6Bwp.js → tensor-DgTOPY6h.js} +1 -1
package/dist/{tensor2d-CqtBzOKq.js → tensor2d-CRWjDyUe.js} +1 -1
package/dist/{tfjs_backend-DX9yVvwk.js → tfjs_backend-D9Ytje0G.js} +39 -39
package/dist/training/AdamExt.js +1 -1
package/dist/training/DatasetBuilder.js +2 -2
package/dist/training/FullTrainer.js +36 -32
package/dist/training/Trainer.d.ts +7 -4
package/dist/training/Trainer.js +58 -50
package/dist/training/sparseCrossEntropy.js +4 -4
package/dist/utilities/dummy.js +2 -2
package/dist/utilities/generate.js +3 -3
package/dist/utilities/load.js +1 -1
package/dist/utilities/profile.d.ts +1 -0
package/dist/utilities/profile.js +6 -3
package/dist/utilities/weights.js +2 -2
package/dist/{variable-BGvK-VN3.js → variable-DZ3fF0R2.js} +1 -1
package/dist/{zeros-CYMicyqz.js → zeros-BaHhQTWf.js} +1 -1
package/package.json +1 -1
package/dist/moments-DLTE6-1p.js +0 -53

package/dist/{range-CkOJ7090.js → range-Bvs1hidm.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { E as e, R as f } from "./index-CnHyhpKc.js";
+import { E as e, R as f } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.

package/dist/{reshape-CTIbqjwm.js → reshape-BH7eBpwq.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { o, j as t, E as a, w as p } from "./index-CnHyhpKc.js";
+import { o, j as t, E as a, w as p } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.

package/dist/{sin-HzioENy_.js → sin-CPAZXNjH.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { o, j as t, E as c, _ as a, $ as e } from "./index-CnHyhpKc.js";
+import { o, j as t, E as c, _ as a, $ as e } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.

package/dist/{slice_util-n4wHKmex.js → slice_util-DskXqRZa.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { l as s } from "./index-CnHyhpKc.js";
+import { l as s } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2021 Google LLC. All Rights Reserved.

package/dist/{softmax-DX6qXAbm.js → softmax-DhWoBa7r.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { o as r, j as f, E as e, S as i } from "./index-CnHyhpKc.js";
+import { o as r, j as f, E as e, S as i } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.

package/dist/{split-CVwhL8Oe.js → split-BCUhuU7B.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { o as p, j as i, E as a, x as c } from "./index-CnHyhpKc.js";
+import { o as p, j as i, E as a, x as c } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.

package/dist/{stack-S2-D2JAQ.js → stack-BV1v7l3S.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { o as e, k as c, l as n, E as k, P as i } from "./index-CnHyhpKc.js";
+import { o as e, k as c, l as n, E as k, P as i } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.

package/dist/{sum-UdfvaNhB.js → sum-Cvq06317.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { o as e, j as u, D as c, E as l, F as m } from "./index-CnHyhpKc.js";
+import { o as e, j as u, D as c, E as l, F as m } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.

package/dist/{tensor-IZex6Bwp.js → tensor-DgTOPY6h.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { J as t, K as a } from "./index-CnHyhpKc.js";
+import { J as t, K as a } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.

package/dist/{tensor2d-CqtBzOKq.js → tensor2d-CRWjDyUe.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { I as t, J as s, K as a } from "./index-CnHyhpKc.js";
+import { I as t, J as s, K as a } from "./index-Du-bmOP8.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.

package/dist/{tfjs_backend-DX9yVvwk.js → tfjs_backend-D9Ytje0G.js} RENAMED Viewed

@@ -1,11 +1,11 @@
-import { o as h, j as f, E as $, ap as Te, l as _, g as Ee, aq as xe, ar as Ie, as as Le, at as be, au as Ne, av as Ce, aw as Pe, b as H, ax as Fe, a9 as U, u as ae, q as ie, Q as le, c as fe, ay as he, aj as pe, az as je, t as S, D as $e, am as Me, a4 as Be } from "./index-CnHyhpKc.js";
-import { s as C, t as Ke, a as Ue, b as ve } from "./ops-DzQTmLIl.js";
-import { r as Re, d as Ve } from "./dropout-lQm_YyX3.js";
-import { r as u } from "./reshape-CTIbqjwm.js";
-import { g as qe } from "./gather-BWyutxwi.js";
-import { s as Ge } from "./sum-UdfvaNhB.js";
-import { m as A } from "./mat_mul-DeGU1U_C.js";
-import { c as M } from "./concat-BRRtq4S2.js";
+import { o as h, j as f, E as $, ao as Te, l as _, g as Ee, ap as xe, aq as Ie, ar as Le, as as be, at as Ne, au as Ce, av as Pe, b as H, aw as Fe, a8 as U, u as ae, q as ie, Q as le, c as fe, ax as he, ai as pe, ay as je, t as S, D as $e, al as Me, a2 as Be } from "./index-Du-bmOP8.js";
+import { s as C, t as Ke, a as Ue, b as ve } from "./ops-CJNniCAV.js";
+import { r as Re, d as Ve } from "./dropout-B09InSJS.js";
+import { r as u } from "./reshape-BH7eBpwq.js";
+import { g as qe } from "./gather-D6MsdXqc.js";
+import { s as Ge } from "./sum-Cvq06317.js";
+import { m as A } from "./mat_mul-CbiqIe2d.js";
+import { c as M } from "./concat-DdKPyAtw.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -213,11 +213,11 @@ const X = /* @__PURE__ */ h({ slice1d_: dn });
  * limitations under the License.
  * =============================================================================
  */
-function mn(e, n, t) {
+function gn(e, n, t) {
   const r = f(e, "x", "slice2d");
   return _(r.rank === 2, () => `slice2d expects a rank-2 tensor, but got a rank-${r.rank} tensor`), C(r, n, t);
 }
-const we = /* @__PURE__ */ h({ slice2d_: mn });
+const we = /* @__PURE__ */ h({ slice2d_: gn });
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -234,11 +234,11 @@ const we = /* @__PURE__ */ h({ slice2d_: mn });
  * limitations under the License.
  * =============================================================================
  */
-function gn(e, n, t) {
+function mn(e, n, t) {
   const r = f(e, "x", "slice3d");
   return _(r.rank === 3, () => `slice3d expects a rank-3 tensor, but got a rank-${r.rank} tensor`), C(r, n, t);
 }
-const z = /* @__PURE__ */ h({ slice3d_: gn });
+const z = /* @__PURE__ */ h({ slice3d_: mn });
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -350,9 +350,9 @@ function _n({ a: e, b: n, transposeA: t = !1, transposeB: r = !1, bias: s, activ
   }
   let c = f(e, "a", "fused matMul"), a = f(n, "b", "fused matMul");
   [c, a] = ae(c, a);
-  const k = t ? c.shape[c.rank - 2] : c.shape[c.rank - 1], g = r ? a.shape[a.rank - 1] : a.shape[a.rank - 2], E = t ? c.shape[c.rank - 1] : c.shape[c.rank - 2], d = r ? a.shape[a.rank - 2] : a.shape[a.rank - 1], ne = c.shape.slice(0, -2), x = a.shape.slice(0, -2), te = ie(ne), re = ie(x);
-  _(k === g, () => `Error in fused matMul: inner shapes (${k}) and (${g}) of Tensors with shapes ${c.shape} and ${a.shape} and transposeA=${t} and transposeB=${r} must match.`);
-  const R = le(c.shape.slice(0, -2), a.shape.slice(0, -2)).concat([E, d]), V = t ? u(c, [te, k, E]) : u(c, [te, E, k]), q = r ? u(a, [re, d, g]) : u(a, [re, g, d]);
+  const k = t ? c.shape[c.rank - 2] : c.shape[c.rank - 1], m = r ? a.shape[a.rank - 1] : a.shape[a.rank - 2], E = t ? c.shape[c.rank - 1] : c.shape[c.rank - 2], d = r ? a.shape[a.rank - 2] : a.shape[a.rank - 1], ne = c.shape.slice(0, -2), x = a.shape.slice(0, -2), te = ie(ne), re = ie(x);
+  _(k === m, () => `Error in fused matMul: inner shapes (${k}) and (${m}) of Tensors with shapes ${c.shape} and ${a.shape} and transposeA=${t} and transposeB=${r} must match.`);
+  const R = le(c.shape.slice(0, -2), a.shape.slice(0, -2)).concat([E, d]), V = t ? u(c, [te, k, E]) : u(c, [te, E, k]), q = r ? u(a, [re, d, m]) : u(a, [re, m, d]);
   let I;
   s != null && (I = f(s, "bias", "fused matMul"), [I] = ae(I, c), le(R, I.shape));
   let se;
@@ -450,7 +450,7 @@ function Jn(e, n) {
     return t.fill(e), t;
   }
 }
-function me(e, n) {
+function ge(e, n) {
   if (!e)
     throw new ee(n);
 }
@@ -473,7 +473,7 @@ function Qn(e) {
 function Hn(e) {
   return e.length <= 1 || e.indexOf("_") === -1 ? e : e.replace(/[_]+(\w|$)/g, (n, t) => t.toUpperCase());
 }
-let m = {};
+let g = {};
 function Xn(e) {
   if (e == null)
     return null;
@@ -498,8 +498,8 @@ function zn(e, n = {}, t = {}, r = "object", s = !1) {
     let i;
     if (o in t)
       i = t[o];
-    else if (o in m)
-      i = m[o];
+    else if (o in g)
+      i = g[o];
     else if (i = n[o], i == null)
       throw new l(`Unknown ${r}: ${e}. This may be due to one of the following reasons:
 1. The ${r} is defined in Python, in which case it needs to be ported to TensorFlow.js or your JavaScript code.
@@ -512,30 +512,30 @@ function zn(e, n = {}, t = {}, r = "object", s = !1) {
 'className' and 'config' must set.`);
     const i = o.className;
     let p, c;
-    if (i in t ? [p, c] = t[i] : i in m ? [p, c] = m.className : i in n && ([p, c] = n[i]), p == null)
+    if (i in t ? [p, c] = t[i] : i in g ? [p, c] = g.className : i in n && ([p, c] = n[i]), p == null)
       throw new l(`Unknown ${r}: ${i}. This may be due to one of the following reasons:
 1. The ${r} is defined in Python, in which case it needs to be ported to TensorFlow.js or your JavaScript code.
 2. The custom ${r} is defined in JavaScript, but is not registered properly with tf.serialization.registerClass().`);
     if (c != null) {
       const a = {};
-      for (const d of Object.keys(m))
-        a[d] = m[d];
+      for (const d of Object.keys(g))
+        a[d] = g[d];
       for (const d of Object.keys(t))
         a[d] = t[d];
       const k = o.config;
       k.customObjects = a;
-      const g = Object.assign({}, m);
+      const m = Object.assign({}, g);
       for (const d of Object.keys(t))
-        m[d] = t[d];
+        g[d] = t[d];
       W(o.config);
       const E = c(p, o.config, t, s);
-      return m = Object.assign({}, g), E;
+      return g = Object.assign({}, m), E;
     } else {
-      const a = Object.assign({}, m);
-      for (const g of Object.keys(t))
-        m[g] = t[g];
+      const a = Object.assign({}, g);
+      for (const m of Object.keys(t))
+        g[m] = t[m];
       const k = new p(o.config);
-      return m = Object.assign({}, a), k;
+      return g = Object.assign({}, a), k;
     }
   }
 }
@@ -566,7 +566,7 @@ function v(e, n, t) {
     throw new l(`${t} is not a valid ${n}.  Valid values are ${e} or null/undefined.`);
 }
 function rt(e, n, t = 0, r = 1 / 0) {
-  return me(t >= 0), me(r >= t), Array.isArray(e) && e.length >= t && e.length <= r && e.every((s) => typeof s === n);
+  return ge(t >= 0), ge(r >= t), Array.isArray(e) && e.length >= t && e.length <= r && e.every((s) => typeof s === n);
 }
 function Ln(e, n) {
   Array.isArray(e) ? (_(e.length > 0, () => `${n} is unexpectedly an empty array.`), e.forEach((t, r) => Ln(t, `element ${r + 1} of ${n}`))) : _(Number.isInteger(e) && e > 0, () => `Expected ${n} to be a positive integer, but got ${ye(e)}.`);
@@ -606,7 +606,7 @@ function ct(e) {
 function at(e) {
   v(xn, "PoolMode", e);
 }
-const F = [], ge = "/";
+const F = [], me = "/";
 function it(e, n) {
   F.push(e);
   try {
@@ -617,7 +617,7 @@ function it(e, n) {
   }
 }
 function Nn() {
-  return F.length === 0 ? "" : F.join(ge) + ge;
+  return F.length === 0 ? "" : F.join(me) + me;
 }
 function lt(e) {
   if (!Oe(e))
@@ -678,7 +678,7 @@ function dt(e) {
   }
   return n;
 }
-function mt(e, n) {
+function gt(e, n) {
   if (n < e)
     throw new l(`end (${n}) < begin (${e}) is forbidden.`);
   const t = [];
@@ -696,7 +696,7 @@ function mt(e, n) {
  * =============================================================================
  */
 let G;
-function gt() {
+function mt() {
   return G == null && (G = je().epsilon()), G;
 }
 function Y() {
@@ -876,7 +876,7 @@ function Dt(e, n, t, r) {
     e = u(e, [-1, o]);
     const i = n.shape.slice(), p = i.pop(), c = i.pop(), a = [...i, p], k = Array.from({ length: n.rank }, (ne, x) => x === 0 ? n.rank - 2 : x <= n.rank - 2 ? x - 1 : x);
     n = u(ve(n, k), [c, -1]);
-    const g = [...s, ...a];
+    const m = [...s, ...a];
     return u(de({
       a: e,
       b: n,
@@ -884,7 +884,7 @@ function Dt(e, n, t, r) {
       transposeB: !1,
       bias: r ? Q(e.rank, r, Y()) : null,
       activation: t
-    }), g);
+    }), m);
   }
 }
 function Tt(e, n, t) {
@@ -951,7 +951,7 @@ export {
   J as H,
   Pn as I,
   Tt as J,
-  mt as K,
+  gt as K,
   Zn as L,
   It as M,
   j as N,
@@ -1001,10 +1001,10 @@ export {
   _t as r,
   On as s,
   Qn as t,
-  gt as u,
+  mt as u,
   fn as v,
   st as w,
   wt as x,
   Et as y,
-  me as z
+  ge as z
 };

package/dist/training/AdamExt.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { A as r, b as c, f as h, s as g, e as o } from "../index-CnHyhpKc.js";
+import { A as r, b as c, f as h, s as g, e as o } from "../index-Du-bmOP8.js";
 class u extends r {
   constructor(t, e, s, a, i) {
     super(t, e, s, a), this.config = i, this.startLearningRate = t;

package/dist/training/DatasetBuilder.js CHANGED Viewed

@@ -1,5 +1,5 @@
-import { t as u } from "../index-CnHyhpKc.js";
-import { d as z, i as f } from "../dataset-ZHEPJmED.js";
+import { t as u } from "../index-Du-bmOP8.js";
+import { d as z, i as f } from "../dataset-iqT4Otvb.js";
 import "../index-Tf7vU29b.js";
 /**
  * @license

package/dist/training/FullTrainer.js CHANGED Viewed

@@ -1,21 +1,22 @@
-import { generateText as v } from "../utilities/generate.js";
+import { generateText as T } from "../utilities/generate.js";
 import L from "./Trainer.js";
 import x from "./Evaluator.js";
-import { a as h } from "../index-CnHyhpKc.js";
+import { a as h } from "../index-Du-bmOP8.js";
+import y from "../utilities/profile.js";
 const D = {
   desiredLoss: 0.01,
   logInterval: 1,
   maxSteps: 1e3
 };
-class E extends L {
-  constructor(r, i, o = 3e-4) {
-    super(r, i, o);
+class I extends L {
+  constructor(i, e, o = 3e-4) {
+    super(i, e, o);
   }
   // Train for multiple epochs using Dataset API - FIXED memory leaks
-  async trainOnDataset(r, i, o) {
-    const { desiredLoss: u, logInterval: d, onStep: l, prompt: c, maxSteps: g } = {
+  async trainOnDataset(i, e, o) {
+    const { desiredLoss: p, logInterval: g, onStep: l, prompt: c, maxSteps: u } = {
       ...D,
-      ...i
+      ...e
     }, n = Date.now(), t = {
       step: 0,
       lastLoss: 1e6,
@@ -26,52 +27,55 @@ class E extends L {
       trainingDuration: 0,
       ...this.lastState || {}
     };
-    this.lastState = t, this.dummyPass(), this.model.trainable = !0, this.running = !0, t.logStartTime = n;
-    const m = o ? new x(this.model, o) : void 0, S = await r.iterator();
+    this.lastState = t, this.dummyPass(), this.model.trainable = !0, e?.advancedMetrics && (this.model.getProfiler() || (this.model.config.layerConfig.profiler = new y())), this.running = !0, t.logStartTime = n;
+    const m = o ? new x(this.model, o) : void 0, f = await i.iterator();
     try {
-      for (; this.running && !(t.lastLoss < u); ) {
-        const a = await S.next();
-        if (a.done) break;
-        const p = a.value, f = this.trainBatch(t, p), s = {
+      for (; this.running && !(t.lastLoss < p); ) {
+        const r = await f.next();
+        if (r.done) break;
+        const d = r.value, v = this.trainBatch(t, d, e.advancedMetrics || !1), s = {
           loss: t.lastLoss,
           step: t.step,
           time: Date.now() - n,
-          batchSize: p.xs.shape[0]
+          batchSize: d.xs.shape[0],
+          learningRate: e?.advancedMetrics ? this.optimizer.lr : void 0,
+          gradientNorm: e?.advancedMetrics ? t.gradientNorm : void 0
         };
-        if (this.model.log.push(s), t.step % d === 0) {
-          await f;
-          const w = Date.now();
-          if (t.trainingDuration += w - t.logStartTime, m)
+        if (this.model.log.push(s), t.step % g === 0) {
+          await v;
+          const S = Date.now();
+          if (t.trainingDuration += S - t.logStartTime, m)
             try {
-              const e = await m.evaluate(5);
-              t.validationLosses.push(e), s.valLoss = e;
-            } catch (e) {
-              console.error("Validation error:", e);
+              const a = await m.evaluate(5);
+              t.validationLosses.push(a), s.valLoss = a;
+            } catch (a) {
+              console.error("Validation error:", a);
             }
           if (l) {
             if (c) {
-              const T = await v(this.tokenizer, this.model, c, 100, {
+              const w = await T(this.tokenizer, this.model, c, 100, {
                 temperature: 0.8
               });
-              s.example = T;
+              s.example = w;
             }
-            const e = {
+            const a = {
               duration: t.trainingDuration,
               totalSamples: t.totalSteps * s.batchSize,
-              samplesPerSecond: t.totalSteps * s.batchSize / (t.trainingDuration / 1e3)
+              samplesPerSecond: t.totalSteps * s.batchSize / (t.trainingDuration / 1e3),
+              memory: e.advancedMetrics ? this.model.getProfiler()?.getPeakMemory() || 0 : void 0
             };
-            await l(s, e);
+            await l(s, a);
           }
           t.logStartTime = Date.now();
         }
-        t.step >= g && this.stop();
+        t.step >= u && this.stop();
       }
-    } catch (a) {
-      throw console.error("Training error:", a), h(), a;
+    } catch (r) {
+      throw console.error("Training error:", r), h(), r;
     }
     return h(), this.running = !1, { losses: t.losses, validationLosses: t.validationLosses };
   }
 }
 export {
-  E as default
+  I as default
 };

package/dist/training/Trainer.d.ts CHANGED Viewed

@@ -11,11 +11,13 @@ export interface TrainingState {
     totalSteps: number;
     losses: number[];
     validationLosses: number[];
+    gradientNorm?: number;
 }
 export interface TrainingProgress {
     duration: number;
     totalSamples: number;
     samplesPerSecond: number;
+    memory?: number;
 }
 export interface AdamConfig {
     learningRateFactor: number;
@@ -28,6 +30,7 @@ export interface TrainingOptions {
     logInterval: number;
     prompt?: string;
     maxSteps: number;
+    advancedMetrics?: boolean;
     onStep?: (log: TrainingLogEntry, progress: TrainingProgress) => Promise<void> | void;
 }
 export default abstract class GPTTrainer {
@@ -44,16 +47,16 @@ export default abstract class GPTTrainer {
     stop(): void;
     getOptimizer(): AdamExt;
     resetOptimizer(config?: AdamConfig): void;
-    private printGradients;
-    protected trainStep(batch: {
+    private maxGradNorm;
+    protected trainStep(state: Partial<TrainingState>, batch: {
         xs: Tensor;
         ys: Tensor;
-    }, dummy?: boolean, print?: boolean): Scalar;
+    }, dummy?: boolean, calcNorm?: boolean): Scalar;
     protected dummyPass(): void;
     protected trainBatch(state: TrainingState, batch: {
         xs: Tensor;
         ys: Tensor;
-    }): Promise<number>;
+    }, calcNorm?: boolean): Promise<number>;
     abstract trainOnDataset(dataset: Dataset<{
         xs: Tensor;
         ys: Tensor;

package/dist/training/Trainer.js CHANGED Viewed

@@ -1,13 +1,11 @@
-import { DatasetBuilder as h, flattenTokens as d, PAGE_FACTOR as g } from "./DatasetBuilder.js";
-import u from "./AdamExt.js";
-import { t as f, v as y, a as m } from "../index-CnHyhpKc.js";
-import { m as S, n as z } from "../norm-BpWsOapl.js";
-import { m as w, a as T } from "../moments-DLTE6-1p.js";
-import { m as x } from "../max-CcnEArWK.js";
-import { z as p } from "../zeros-CYMicyqz.js";
-class G {
-  constructor(t, s, e = 1e-3) {
-    this.tokenizer = s, this.model = t, this.learningRate = e, this.resetOptimizer(), this.datasetBuilder = new h(s, t.config.gpt.blockSize);
+import { DatasetBuilder as g, flattenTokens as m, PAGE_FACTOR as u } from "./DatasetBuilder.js";
+import f from "./AdamExt.js";
+import { t as y, v as z, a as c } from "../index-Du-bmOP8.js";
+import { n as S } from "../norm-01kY9I2B.js";
+import { z as p } from "../zeros-BaHhQTWf.js";
+class R {
+  constructor(t, e, s = 1e-3) {
+    this.tokenizer = e, this.model = t, this.learningRate = s, this.resetOptimizer(), this.datasetBuilder = new g(e, t.config.gpt.blockSize);
   }
   model;
   optimizer;
@@ -29,7 +27,7 @@ class G {
   }
   resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
     this.optimizer && this.optimizer.dispose();
-    const s = new u(
+    const e = new f(
       t.learningRateFactor * this.learningRate,
       t.beta1,
       t.beta2,
@@ -41,68 +39,78 @@ class G {
         weightDecay: 0
       }
     );
-    this.optimizer = s;
+    this.optimizer = e;
   }
-  printGradients(t) {
-    Object.keys(t).forEach((s) => {
-      const e = t[s];
-      console.log(`${s}:`), console.log(`  Shape: ${e.shape}`), console.log(`  Mean: ${w(e).dataSync()[0]}`), console.log(`  Std: ${T(e).variance.sqrt().dataSync()[0]}`), console.log(`  Min: ${S(e).dataSync()[0]}`), console.log(`  Max: ${x(e).dataSync()[0]}`), console.log(`  Norm: ${z(e).dataSync()[0]}`);
-    });
+  maxGradNorm(t) {
+    let e = 0;
+    return Object.keys(t).forEach((s) => {
+      const a = t[s], r = S(a), i = r.dataSync()[0];
+      r.dispose(), i > e && (e = i);
+    }), e;
   }
-  trainStep(t, s = !1, e = !1) {
-    return f(() => {
+  trainStep(t, e, s = !1, a = !1) {
+    return y(() => {
       this.model.getProfiler()?.startMemory();
-      const { xs: a, ys: i } = t, o = () => {
-        const [l, c] = this.model.forward({ training: !0 }, a, i);
-        return l.dispose(), c;
-      }, { value: n, grads: r } = y(o);
-      return s ? this.model.getProfiler()?.endMemory("Training") : (e && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), m(r)), n;
+      const { xs: r, ys: i } = e, d = () => {
+        const [n, h] = this.model.forward({ training: !0 }, r, i);
+        return n.dispose(), h;
+      }, { value: l, grads: o } = z(d);
+      if (s)
+        this.model.getProfiler()?.endMemory("Training");
+      else {
+        if (a) {
+          const n = this.maxGradNorm(o);
+          t.gradientNorm = n;
+        }
+        this.optimizer.applyGradients(o), this.model.getProfiler()?.endMemory("Training"), c(o);
+      }
+      return l;
     });
   }
   dummyPass() {
-    const t = p([1, this.model.config.gpt.blockSize], "int32"), s = p([1, this.model.config.gpt.blockSize], "int32");
+    const t = p([1, this.model.config.gpt.blockSize], "int32"), e = p([1, this.model.config.gpt.blockSize], "int32");
     try {
-      const e = this.trainStep({ xs: t, ys: s }, !0);
-      e.dataSync(), e.dispose();
-    } catch (e) {
-      console.error("Error during dummy pass:", e);
+      const s = this.trainStep({}, { xs: t, ys: e }, !0);
+      s.dataSync(), s.dispose();
+    } catch (s) {
+      console.error("Error during dummy pass:", s);
     } finally {
-      t.dispose(), s.dispose();
+      t.dispose(), e.dispose();
     }
   }
-  async trainBatch(t, s) {
+  async trainBatch(t, e, s = !1) {
     try {
-      const e = this.trainStep(s, !1, !1);
-      return s.xs.dispose(), s.ys.dispose(), t.step++, t.totalSteps++, e.array().then((a) => (t.lastLoss = a, t.losses.push(t.lastLoss), e.dispose(), t.lastLoss));
-    } catch (e) {
-      throw console.error(`Error processing batch at step ${t.step}:`, e), m(), e;
+      const a = this.trainStep(t, e, !1, s);
+      return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, a.array().then((r) => (t.lastLoss = r, t.losses.push(t.lastLoss), a.dispose(), t.lastLoss));
+    } catch (a) {
+      throw console.error(`Error processing batch at step ${t.step}:`, a), c(), a;
     }
   }
-  async createTrainValidationSplit(t, s = 32, e = 0.1) {
-    const a = await d(t, this.tokenizer), i = /* @__PURE__ */ new Set();
-    if (e > 0) {
-      const r = Math.floor(a.length / (this.datasetBuilder.blockSize * g)), l = Math.max(1, Math.floor(r * e));
-      for (; i.size < l; ) {
-        const c = Math.floor(Math.random() * r);
-        i.add(c);
+  async createTrainValidationSplit(t, e = 32, s = 0.1) {
+    const a = await m(t, this.tokenizer), r = /* @__PURE__ */ new Set();
+    if (s > 0) {
+      const l = Math.floor(a.length / (this.datasetBuilder.blockSize * u)), o = Math.max(1, Math.floor(l * s));
+      for (; r.size < o; ) {
+        const n = Math.floor(Math.random() * l);
+        r.add(n);
       }
     }
-    const o = await this.datasetBuilder.createTextDataset(a, s, i, !1), n = await this.datasetBuilder.createTextDataset(
+    const i = await this.datasetBuilder.createTextDataset(a, e, r, !1), d = await this.datasetBuilder.createTextDataset(
       a,
-      s,
-      i,
+      e,
+      r,
       !0
     );
-    return { trainDataset: o, validationDataset: n };
+    return { trainDataset: i, validationDataset: d };
   }
-  async createDataset(t, s = 32) {
-    const e = await d(t, this.tokenizer);
-    return await this.datasetBuilder.createTextDataset(e, s);
+  async createDataset(t, e = 32) {
+    const s = await m(t, this.tokenizer);
+    return await this.datasetBuilder.createTextDataset(s, e);
   }
   dispose() {
     this.optimizer && this.optimizer.dispose();
   }
 }
 export {
-  G as default
+  R as default
 };

package/dist/training/sparseCrossEntropy.js CHANGED Viewed

@@ -1,9 +1,9 @@
 import { gatherSub as L } from "../ops/gatherSub.js";
 import { scatterSub as y } from "../ops/scatterSub.js";
-import { e as u, c as i, z as S, t as f, s as G } from "../index-CnHyhpKc.js";
-import { s as v } from "../softmax-DX6qXAbm.js";
-import { m as z } from "../max-CcnEArWK.js";
-import { l as k } from "../log_sum_exp-CRH7Np9v.js";
+import { e as u, c as i, z as S, t as f, s as G } from "../index-Du-bmOP8.js";
+import { s as v } from "../softmax-DhWoBa7r.js";
+import { m as z } from "../max-0Xnlpv8k.js";
+import { l as k } from "../log_sum_exp-CxfBtUaG.js";
 function F(a, s) {
   return f(() => {
     const e = a.shape[a.shape.length - 1], o = a.shape.slice(0, -1).reduce((d, c) => d * c, 1), p = a.shape.length > 2 ? a.reshape([o, e]) : a, n = s.shape.length > 1 ? s.reshape([o]).cast("int32") : s.cast("int32"), t = z(p, -1, !0), r = G(p, t), h = k(r, -1);

package/dist/utilities/dummy.js CHANGED Viewed

@@ -1,5 +1,5 @@
-import "../index-CnHyhpKc.js";
-import { z as n } from "../zeros-CYMicyqz.js";
+import "../index-Du-bmOP8.js";
+import { z as n } from "../zeros-BaHhQTWf.js";
 async function c(s) {
   const i = n([1, s.config.gpt.blockSize], "int32"), [t, o] = s.forward({ training: !1 }, i);
   await t.data(), t.dispose(), o && o.dispose(), i.dispose();

package/dist/utilities/generate.js CHANGED Viewed

@@ -1,6 +1,6 @@
-import { t as y } from "../index-CnHyhpKc.js";
-import { t as x } from "../tensor2d-CqtBzOKq.js";
-import { c as f } from "../concat-BRRtq4S2.js";
+import { t as y } from "../index-Du-bmOP8.js";
+import { t as x } from "../tensor2d-CRWjDyUe.js";
+import { c as f } from "../concat-DdKPyAtw.js";
 async function A(o, r, a, c, T) {
   if (c <= 0)
     throw new Error("Length must be a positive integer");

package/dist/utilities/load.js CHANGED Viewed

@@ -3,7 +3,7 @@ import { importWeights as b } from "./weights.js";
 import u from "../tokeniser/CharTokeniser.js";
 import F from "../NanoGPTModel.js";
 import { dummyPassAsync as j } from "./dummy.js";
-import { d as T } from "../index-CnHyhpKc.js";
+import { d as T } from "../index-Du-bmOP8.js";
 import E from "../tokeniser/bpe.js";
 async function A(t) {
   const o = await fetch(t);