npm - @genai-fi/nanogpt - Versions diffs - 0.7.3 → 0.8.1 - Mend

@genai-fi/nanogpt 0.7.3 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (197)

package/dist/Generator.d.ts +25 -2
package/dist/Generator.js +152 -49
package/dist/{RealDiv-Dy0p8Bvo.js → RealDiv-D_q39E3A.js} +13 -13
package/dist/{Reshape-DvudQDvJ.js → Reshape-41YpQqEo.js} +1 -1
package/dist/{Reshape-DH5srBP0.js → Reshape-Bh_jzKzV.js} +5 -5
package/dist/TeachableLLM.d.ts +6 -6
package/dist/TeachableLLM.js +33 -31
package/dist/Trainer.d.ts +13 -2
package/dist/Trainer.js +21 -12
package/dist/{axis_util-BzbKo31C.js → axis_util-Did9235A.js} +3 -3
package/dist/backend.js +2 -2
package/dist/{backend_util-TE7aTPhZ.js → backend_util-yC3YH1jo.js} +58 -58
package/dist/{broadcast_to-CdbwV-Dj.js → broadcast_to-CUvOdOT5.js} +2 -2
package/dist/checks/appendCache.d.ts +1 -0
package/dist/checks/appendCache.js +22 -0
package/dist/checks/attentionMask.d.ts +1 -0
package/dist/checks/attentionMask.js +37 -0
package/dist/checks/check.d.ts +9 -0
package/dist/checks/check.js +20 -0
package/dist/checks/gelu.d.ts +1 -0
package/dist/checks/gelu.js +18 -0
package/dist/checks/index.d.ts +19 -0
package/dist/checks/index.js +21 -0
package/dist/checks/normRMS.d.ts +1 -0
package/dist/checks/normRMS.js +16 -0
package/dist/checks/normRMSGrad.d.ts +1 -0
package/dist/checks/normRMSGrad.js +12 -0
package/dist/checks/qkv.d.ts +1 -0
package/dist/checks/qkv.js +25 -0
package/dist/checks/rope.d.ts +1 -0
package/dist/checks/rope.js +21 -0
package/dist/{concat-CsxrgovM.js → concat-pHiVqR3L.js} +1 -1
package/dist/{dataset-CtdBYwjo.js → dataset-DPPl-iLT.js} +9 -9
package/dist/{dropout-DYs5QFGQ.js → dropout-CcKSfOYE.js} +18 -18
package/dist/exports_initializers-DKk7-bsx.js +16 -0
package/dist/{gather-CMMy2KEG.js → gather-CPg6ZlQA.js} +1 -1
package/dist/{gelu-C-dPj6Ku.js → gelu-BkcmEEyD.js} +1 -1
package/dist/{gpgpu_math-DGNLNL4I.js → gpgpu_math-D_ODOLix.js} +26 -26
package/dist/{index-BoWRt-10.js → index-DdmHGZjq.js} +659 -650
package/dist/{index-CLthM0TO.js → index-evZ57wr4.js} +185 -185
package/dist/{kernel_funcs_utils-BYKWV8Aa.js → kernel_funcs_utils-CDfFpUab.js} +21 -21
package/dist/layers/BaseLayer.d.ts +8 -13
package/dist/layers/BaseLayer.js +25 -13
package/dist/layers/CausalSelfAttention.d.ts +3 -2
package/dist/layers/CausalSelfAttention.js +28 -28
package/dist/layers/MLP.d.ts +3 -2
package/dist/layers/MLP.js +16 -20
package/dist/layers/PositionEmbedding.d.ts +9 -0
package/dist/layers/PositionEmbedding.js +45 -0
package/dist/layers/RMSNorm.d.ts +3 -2
package/dist/layers/RMSNorm.js +6 -6
package/dist/layers/RoPECache.d.ts +1 -1
package/dist/layers/RoPECache.js +4 -4
package/dist/layers/TiedEmbedding.d.ts +3 -2
package/dist/layers/TiedEmbedding.js +29 -7
package/dist/layers/TransformerBlock.d.ts +3 -2
package/dist/layers/TransformerBlock.js +1 -1
package/dist/loader/load.d.ts +2 -2
package/dist/loader/loadHF.d.ts +2 -2
package/dist/loader/loadTransformers.d.ts +4 -2
package/dist/loader/loadTransformers.js +10 -9
package/dist/loader/newZipLoad.d.ts +2 -2
package/dist/loader/oldZipLoad.d.ts +2 -2
package/dist/loader/oldZipLoad.js +44 -51
package/dist/loader/save.d.ts +8 -0
package/dist/loader/save.js +62 -0
package/dist/{log_sum_exp-DbjkV734.js → log_sum_exp-C8yFJfZz.js} +45 -24
package/dist/main.d.ts +6 -4
package/dist/main.js +24 -18
package/dist/{mat_mul-8m8pfdcx.js → mat_mul-Dpy2mMRu.js} +1 -1
package/dist/mod-CbibJi3D.js +27 -0
package/dist/models/NanoGPTV1.d.ts +15 -0
package/dist/models/NanoGPTV1.js +71 -0
package/dist/{config.d.ts → models/config.d.ts} +1 -0
package/dist/{config.js → models/config.js} +1 -0
package/dist/models/factory.d.ts +3 -0
package/dist/models/factory.js +14 -0
package/dist/models/model.d.ts +26 -0
package/dist/models/model.js +70 -0
package/dist/{mulmat_packed_gpu-VSekgsNv.js → mulmat_packed_gpu-q_Gmwyld.js} +1 -1
package/dist/{ones-Dj0SDhHf.js → ones-BAqVh-eA.js} +2 -2
package/dist/ops/adamAdjust.js +1 -1
package/dist/ops/adamMoments.js +1 -1
package/dist/ops/appendCache.js +3 -3
package/dist/ops/attentionMask.js +1 -1
package/dist/ops/cpu/adamAdjust.js +9 -9
package/dist/ops/cpu/adamMoments.js +2 -2
package/dist/ops/cpu/appendCache.js +2 -2
package/dist/ops/cpu/attentionMask.js +5 -5
package/dist/ops/cpu/fusedSoftmax.js +2 -2
package/dist/ops/cpu/gatherSub.js +5 -5
package/dist/ops/cpu/gelu.js +1 -1
package/dist/ops/cpu/matMulGelu.js +2 -2
package/dist/ops/cpu/matMulMul.js +1 -1
package/dist/ops/cpu/mulDropout.js +1 -1
package/dist/ops/cpu/normRMS.js +1 -1
package/dist/ops/cpu/qkv.js +3 -3
package/dist/ops/cpu/rope.js +5 -5
package/dist/ops/cpu/scatterSub.js +7 -7
package/dist/ops/fusedSoftmax.js +1 -1
package/dist/ops/gatherSub.js +1 -1
package/dist/ops/gelu.js +2 -2
package/dist/ops/grads/attentionMask.js +1 -1
package/dist/ops/grads/fusedSoftmax.js +2 -2
package/dist/ops/grads/gelu.js +2 -2
package/dist/ops/grads/matMulGelu.js +1 -1
package/dist/ops/grads/normRMS.js +1 -1
package/dist/ops/grads/qkv.js +1 -1
package/dist/ops/grads/rope.js +1 -1
package/dist/ops/matMulGelu.js +1 -1
package/dist/ops/matMulMul.js +1 -1
package/dist/ops/mulDrop.js +1 -1
package/dist/ops/normRMS.js +1 -1
package/dist/ops/qkv.js +1 -1
package/dist/ops/rope.js +4 -4
package/dist/ops/scatterSub.js +1 -1
package/dist/ops/webgl/adamAdjust.js +2 -2
package/dist/ops/webgl/adamMoments.js +1 -1
package/dist/ops/webgl/appendCache.js +1 -1
package/dist/ops/webgl/attentionMask.js +1 -1
package/dist/ops/webgl/fusedSoftmax.js +4 -4
package/dist/ops/webgl/gatherSub.js +1 -1
package/dist/ops/webgl/gelu.js +2 -2
package/dist/ops/webgl/log.js +3 -3
package/dist/ops/webgl/matMulGelu.js +10 -10
package/dist/ops/webgl/matMulMul.js +1 -1
package/dist/ops/webgl/mulDropout.js +1 -1
package/dist/ops/webgl/normRMS.js +2 -2
package/dist/ops/webgl/qkv.js +1 -1
package/dist/ops/webgl/rope.js +1 -1
package/dist/ops/webgl/scatterSub.js +1 -1
package/dist/ops/webgpu/adamAdjust.js +3 -3
package/dist/ops/webgpu/adamMoments.js +3 -3
package/dist/ops/webgpu/appendCache.js +3 -3
package/dist/ops/webgpu/attentionMask.js +3 -3
package/dist/ops/webgpu/gatherSub.js +3 -3
package/dist/ops/webgpu/gelu.js +3 -3
package/dist/ops/webgpu/normRMS.js +2 -2
package/dist/ops/webgpu/normRMSGrad.js +5 -5
package/dist/ops/webgpu/qkv.js +3 -3
package/dist/ops/webgpu/rope.js +3 -3
package/dist/ops/webgpu/scatterSub.js +3 -3
package/dist/ops/webgpu/utils/reductions.js +4 -4
package/dist/ops-542ai2vG.js +1525 -0
package/dist/{random_width-sZORGo5k.js → random_width-DKGeiFuR.js} +1471 -1538
package/dist/{range-CRuAh-gd.js → range-BcUvLuf5.js} +1 -1
package/dist/{reciprocal-BvGAyKyu.js → reciprocal-DhDWSKiD.js} +1 -1
package/dist/{register_all_kernels-BwDSRN-f.js → register_all_kernels-Do9VvZmo.js} +2488 -2534
package/dist/{max-Ddnnb5xe.js → relu-B1AXs7p5.js} +6 -6
package/dist/{reshape-CdBq1WJ6.js → reshape-WeJkT3ja.js} +1 -1
package/dist/{scatter_nd_util-DUstGbU1.js → scatter_nd_util-B7yDhiQr.js} +1 -1
package/dist/{selu_util-BJEXVvjX.js → selu_util-BgUO9gHY.js} +125 -146
package/dist/{shared-wS99K7_n.js → shared-CZiWmQCI.js} +1 -1
package/dist/{shared-B8ztnyEk.js → shared-V6D_md-c.js} +72 -72
package/dist/{sin-BeA3tsEd.js → sin-CPxad7Am.js} +1 -1
package/dist/{slice-BiOsknYS.js → slice-B7jXtPnp.js} +1 -1
package/dist/{softmax-Bv_6lyMX.js → softmax-BfsyI4As.js} +1 -1
package/dist/{split-B-dikLRw.js → split-BPxr8_8m.js} +1 -1
package/dist/{stack-B17UN2nn.js → stack-BNwLzE43.js} +1 -1
package/dist/{sum-66ew2byf.js → sum-ByFINZgi.js} +3 -3
package/dist/{tensor-JwS7ZYY6.js → tensor-DbqgIV9B.js} +1 -1
package/dist/tensor1d-CtJq5BOv.js +27 -0
package/dist/{tensor2d-wxPAnDQy.js → tensor2d-CObBWBkW.js} +1 -1
package/dist/tensor3d-BOukqWwr.js +30 -0
package/dist/tensor4d-DLtk7Nxh.js +30 -0
package/dist/training/Adam.js +2 -2
package/dist/training/AdamExt.js +1 -1
package/dist/training/DatasetBuilder.js +2 -2
package/dist/training/Evaluator.d.ts +2 -2
package/dist/training/FullTrainer.d.ts +3 -3
package/dist/training/FullTrainer.js +61 -69
package/dist/training/Trainer.d.ts +15 -3
package/dist/training/Trainer.js +39 -47
package/dist/training/sparseCrossEntropy.js +12 -13
package/dist/utilities/arrayClose.d.ts +1 -1
package/dist/utilities/arrayClose.js +16 -7
package/dist/utilities/dummy.d.ts +4 -4
package/dist/utilities/dummy.js +13 -13
package/dist/utilities/multinomialCPU.js +2 -2
package/dist/utilities/parameters.d.ts +1 -1
package/dist/utilities/performance.js +1 -1
package/dist/utilities/profile.js +1 -1
package/dist/utilities/safetensors.js +2 -2
package/dist/utilities/weights.js +2 -2
package/dist/{variable-BuddVFLa.js → variable-DPFOJyRG.js} +1 -1
package/dist/{webgpu_program-PFzf1hAQ.js → webgpu_program-Dhk9R5aG.js} +1 -1
package/dist/{webgpu_util-D____QpY.js → webgpu_util-BqGnZg8t.js} +27 -27
package/dist/{zeros--BdLQ3oG.js → zeros-Dnwix0p4.js} +1 -1
package/package.json +2 -3
package/dist/NanoGPTModel.d.ts +0 -52
package/dist/NanoGPTModel.js +0 -203
package/dist/TiedEmbedding-BxOerUmB.js +0 -43
package/dist/ops-BFGCx8Ri.js +0 -1202
package/dist/utilities/generate.d.ts +0 -3
package/dist/utilities/generate.js +0 -22
package/dist/utilities/save.d.ts +0 -9
package/dist/utilities/save.js +0 -61

package/dist/training/FullTrainer.js CHANGED Viewed

@@ -1,16 +1,15 @@
-import { generateText as v } from "../utilities/generate.js";
-import x from "./Trainer.js";
-import S from "./Evaluator.js";
-import { d as w } from "../index-BoWRt-10.js";
-import y from "../utilities/profile.js";
-const T = {
+import y from "./Trainer.js";
+import v from "./Evaluator.js";
+import { d as S } from "../index-DdmHGZjq.js";
+import w from "../utilities/profile.js";
+const f = {
   desiredLoss: 0.01,
   logInterval: 1,
   maxSteps: 1e3
 };
-class z extends x {
-  constructor(r, t, s = 3e-4) {
-    super(r, t, s);
+class b extends y {
+  constructor(s, t, a = 3e-4) {
+    super(s, t, a);
   }
   createEmptyState() {
     return {
@@ -24,104 +23,97 @@ class z extends x {
       ...this.lastState || {}
     };
   }
-  createLogEntry(r, t, s, h) {
+  createLogEntry(s, t, a, n) {
     return {
-      loss: r.lastLoss,
-      step: r.step,
+      loss: s.lastLoss,
+      step: s.step,
       time: Date.now() - t,
-      batchSize: s,
-      learningRate: h ? this.optimizer.lr : void 0
+      batchSize: a,
+      learningRate: n ? this.optimizer.lr : void 0
     };
   }
-  createProgress(r, t, s) {
+  createProgress(s, t, a) {
     return {
-      duration: r.trainingDuration,
-      totalSamples: r.totalSteps * t.batchSize,
-      samplesPerSecond: r.totalSteps * t.batchSize / (r.trainingDuration / 1e3),
-      memory: s ? this.model.getProfiler()?.getPeakMemory() || 0 : void 0
+      duration: s.trainingDuration,
+      totalSamples: s.totalSteps * t.batchSize,
+      samplesPerSecond: s.totalSteps * t.batchSize / (s.trainingDuration / 1e3),
+      memory: a ? this.model.getProfiler()?.getPeakMemory() || 0 : void 0
     };
   }
-  async stepDataset(r, t, s) {
-    const { logInterval: h, prompt: m } = {
-      ...T,
+  async stepDataset(s, t, a) {
+    const { logInterval: n } = {
+      ...f,
       ...t
-    }, g = Date.now(), a = this.createEmptyState();
-    this.lastState = a, await this.dummyPass(), this.model.trainable = !0, t?.advancedMetrics && (this.model.getProfiler() || (this.model.config.layerConfig.profiler = new y())), this.running = !0, a.logStartTime = g;
-    const p = s ? new S(this.model, s) : void 0, e = await r.iterator();
+    }, l = Date.now(), r = this.createEmptyState();
+    this.lastState = r, await this.dummyPass(), this.model.trainable = !0, t?.advancedMetrics && (this.model.getProfiler() || this.model.setProfiler(new w())), this.running = !0, r.logStartTime = l;
+    const m = a ? new v(this.model, a) : void 0, e = await s.iterator();
     try {
       for (; this.running; ) {
         const i = await e.next();
         if (i.done) break;
-        const u = i.value, o = this.trainBatch(a, u), n = this.createLogEntry(a, g, u.xs.shape[0], t?.advancedMetrics);
-        if (this.model.log.push(n), a.step % h === 0) {
+        const g = i.value, o = this.trainBatch(r, g), c = this.createLogEntry(r, l, g.xs.shape[0], t?.advancedMetrics);
+        if (this.model.trainingState = {
+          steps: r.totalSteps,
+          learningRate: this.optimizer.lr,
+          batchSize: g.xs.shape[0],
+          loss: r.lastLoss
+        }, r.step % n === 0) {
           await o.data();
-          const f = Date.now();
-          if (a.trainingDuration += f - a.logStartTime, p)
+          const u = Date.now();
+          if (r.trainingDuration += u - r.logStartTime, m)
             try {
-              const l = await p.evaluate(5);
-              a.validationLosses.push(l), n.valLoss = l;
-            } catch (l) {
-              console.error("Validation error:", l);
+              const h = await m.evaluate(5);
+              r.validationLosses.push(h), c.valLoss = h;
+            } catch (h) {
+              console.error("Validation error:", h);
             }
-          if (m) {
-            const l = await v(this.tokenizer, this.model, m, 100, {
-              temperature: 0.8
-            });
-            n.example = l;
-          }
-          const c = this.createProgress(a, n, t?.advancedMetrics);
-          return o.dispose(), this.stop(), { log: n, progress: c };
+          const p = this.createProgress(r, c, t?.advancedMetrics);
+          return o.dispose(), this.stop(), { log: c, progress: p };
         }
         o.dispose();
       }
     } catch (i) {
-      throw console.error("Training error:", i), w(), i;
+      throw console.error("Training error:", i), S(), i;
     }
-    throw w(), this.running = !1, new Error("No log returned before training stopped.");
+    throw S(), this.running = !1, new Error("No log returned before training stopped.");
   }
   // Train for multiple epochs using Dataset API - FIXED memory leaks
-  async trainOnDataset(r, t, s) {
-    const { logInterval: h, onStep: m, prompt: g, maxSteps: a } = {
-      ...T,
+  async trainOnDataset(s, t, a) {
+    const { logInterval: n, onStep: l, maxSteps: r } = {
+      ...f,
       ...t
-    }, p = Date.now(), e = this.createEmptyState();
-    this.lastState = e, await this.dummyPass(), this.model.trainable = !0, t?.advancedMetrics && (this.model.getProfiler() || (this.model.config.layerConfig.profiler = new y())), this.running = !0, e.logStartTime = p;
-    const i = s ? new S(this.model, s) : void 0, u = await r.iterator();
+    }, m = Date.now(), e = this.createEmptyState();
+    this.lastState = e, await this.dummyPass(), this.model.trainable = !0, t?.advancedMetrics && (this.model.getProfiler() || this.model.setProfiler(new w())), this.running = !0, e.logStartTime = m;
+    const i = a ? new v(this.model, a) : void 0, g = await s.iterator();
     try {
       for (; this.running; ) {
-        const o = await u.next();
+        const o = await g.next();
         if (o.done) break;
-        const n = o.value, f = this.trainBatch(e, n), c = this.createLogEntry(e, p, n.xs.shape[0], t?.advancedMetrics);
-        if (this.model.log.push(c), e.step % h === 0) {
-          await f.data();
-          const l = Date.now();
-          if (e.trainingDuration += l - e.logStartTime, i)
+        const c = o.value, u = this.trainBatch(e, c), p = this.createLogEntry(e, m, c.xs.shape[0], t?.advancedMetrics);
+        if (e.step % n === 0) {
+          await u.data();
+          const h = Date.now();
+          if (e.trainingDuration += h - e.logStartTime, i)
             try {
               const d = await i.evaluate(5);
-              e.validationLosses.push(d), c.valLoss = d;
+              e.validationLosses.push(d), p.valLoss = d;
             } catch (d) {
               console.error("Validation error:", d);
             }
-          if (m) {
-            if (g) {
-              const L = await v(this.tokenizer, this.model, g, 100, {
-                temperature: 0.8
-              });
-              c.example = L;
-            }
-            const d = this.createProgress(e, c, t?.advancedMetrics);
-            await m(c, d);
+          if (l) {
+            const d = this.createProgress(e, p, t?.advancedMetrics);
+            await l(p, d);
           }
           e.logStartTime = Date.now();
         }
-        f.dispose(), e.step >= a && this.stop();
+        u.dispose(), e.step >= r && this.stop();
       }
     } catch (o) {
-      throw console.error("Training error:", o), w(), o;
+      throw console.error("Training error:", o), S(), o;
     }
-    return w(), this.running = !1, { losses: e.losses, validationLosses: e.validationLosses };
+    return S(), this.running = !1, { losses: e.losses, validationLosses: e.validationLosses };
   }
 }
 export {
-  z as default
+  b as default
 };

package/dist/training/Trainer.d.ts CHANGED Viewed

@@ -1,10 +1,20 @@
 import { ITokeniser } from '../tokeniser/type';
 import { DatasetBuilder } from './DatasetBuilder';
-import { default as NanoGPT, TrainingLogEntry } from '../NanoGPTModel';
 import { default as AdamExt } from './AdamExt';
 import { TensorContainer } from '@tensorflow/tfjs-core/dist/tensor_types';
 import { Scalar, Tensor } from '@tensorflow/tfjs-core';
 import { Dataset } from '@tensorflow/tfjs-data';
+import { default as Model, ModelForwardAttributes } from '../models/model';
+export interface TrainingLogEntry {
+    loss: number;
+    valLoss?: number;
+    step: number;
+    time: number;
+    example?: string;
+    batchSize: number;
+    gradientNorm?: number;
+    learningRate?: number;
+}
 export interface TrainingState {
     step: number;
     lastLoss: number;
@@ -35,13 +45,15 @@ export interface TrainingOptions {
 }
 export default abstract class GPTTrainer {
     protected tokenizer: ITokeniser;
-    protected model: NanoGPT;
+    protected model: Model<ModelForwardAttributes>;
     protected optimizer: AdamExt;
     protected datasetBuilder: DatasetBuilder;
     protected learningRate: number;
     protected running: boolean;
     protected lastState?: TrainingState;
-    constructor(model: NanoGPT, tokenizer: ITokeniser, learningRate?: number);
+    protected _gradientCheckpointing: boolean;
+    constructor(model: Model<ModelForwardAttributes>, tokenizer: ITokeniser, learningRate?: number);
+    setGradientCheckpointing(enabled: boolean): void;
     setLearningRate(learningRate: number): void;
     reset(): void;
     stop(): void;

package/dist/training/Trainer.js CHANGED Viewed

@@ -1,10 +1,10 @@
-import { DatasetBuilder as h, flattenTokens as p, PAGE_FACTOR as g } from "./DatasetBuilder.js";
+import { DatasetBuilder as m, flattenTokens as c, PAGE_FACTOR as g } from "./DatasetBuilder.js";
 import u from "./AdamExt.js";
-import { t as f, v as y, d as c } from "../index-BoWRt-10.js";
-import { z as m } from "../zeros--BdLQ3oG.js";
+import { t as f, v as y, d as p } from "../index-DdmHGZjq.js";
+import { z as h } from "../zeros-Dnwix0p4.js";
 class x {
-  constructor(t, e, a = 1e-3) {
-    this.tokenizer = e, this.model = t, this.learningRate = a, this.resetOptimizer(), this.datasetBuilder = new h(e, t.config.gpt.blockSize);
+  constructor(t, e, i = 1e-3) {
+    this.tokenizer = e, this.model = t, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new m(e, t.config.blockSize);
   }
   model;
   optimizer;
@@ -12,6 +12,10 @@ class x {
   learningRate;
   running = !1;
   lastState;
+  _gradientCheckpointing = !1;
+  setGradientCheckpointing(t) {
+    this._gradientCheckpointing = t;
+  }
   setLearningRate(t) {
     this.learningRate = t, this.resetOptimizer({ learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 });
   }
@@ -40,71 +44,59 @@ class x {
     );
     this.optimizer = e;
   }
-  /*private async maxGradNorm(grads: NamedVariableMap): Promise<number> {
-      let maxNorm = 0;
-      // Print all gradients
-      await Promise.all(
-          Object.keys(grads).map(async (varName) => {
-              const grad = grads[varName];
-              const temp = norm(grad);
-              const gradNorm = (await temp.data())[0];
-              temp.dispose();
-              if (gradNorm > maxNorm) {
-                  maxNorm = gradNorm;
-              }
-          })
-      );
-      return maxNorm;
-  }*/
-  trainStep(t, e, a = !1) {
+  trainStep(t, e, i = !1) {
     return f(() => {
       this.model.getProfiler()?.startMemory();
-      const { xs: s, ys: i } = e, o = () => {
-        const [l, d] = this.model.forward({ training: !0 }, s, i);
+      const { xs: a, ys: s } = e, n = () => {
+        const [l, d] = this.model.forward(
+          { training: !0, checkpointing: this._gradientCheckpointing },
+          a,
+          s
+        );
         return l.dispose(), d;
-      }, { value: n, grads: r } = y(o);
-      return a ? this.model.getProfiler()?.endMemory("Training") : (this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), c(r)), n;
+      }, { value: o, grads: r } = y(n);
+      return i ? this.model.getProfiler()?.endMemory("Training") : (this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), p(r)), o;
     });
   }
   async dummyPass() {
-    const t = m([1, this.model.config.gpt.blockSize], "int32"), e = m([1, this.model.config.gpt.blockSize], "int32");
+    const t = h([1, this.model.config.blockSize], "int32"), e = h([1, this.model.config.blockSize], "int32");
     try {
-      const a = this.trainStep({}, { xs: t, ys: e }, !0);
-      await a.data(), a.dispose();
-    } catch (a) {
-      console.error("Error during dummy pass:", a);
+      const i = this.trainStep({}, { xs: t, ys: e }, !0);
+      await i.data(), i.dispose();
+    } catch (i) {
+      console.error("Error during dummy pass:", i);
     } finally {
       t.dispose(), e.dispose();
     }
   }
   trainBatch(t, e) {
     try {
-      const a = this.trainStep(t, e, !1);
-      return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, a;
-    } catch (a) {
-      throw console.error(`Error processing batch at step ${t.step}:`, a), c(), a;
+      const i = this.trainStep(t, e, !1);
+      return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, i;
+    } catch (i) {
+      throw console.error(`Error processing batch at step ${t.step}:`, i), p(), i;
     }
   }
-  async createTrainValidationSplit(t, e = 32, a = 0.1) {
-    const s = await p(t, this.tokenizer), i = /* @__PURE__ */ new Set();
-    if (a > 0) {
-      const r = Math.floor(s.length / (this.datasetBuilder.blockSize * g)), l = Math.max(1, Math.floor(r * a));
-      for (; i.size < l; ) {
+  async createTrainValidationSplit(t, e = 32, i = 0.1) {
+    const a = await c(t, this.tokenizer), s = /* @__PURE__ */ new Set();
+    if (i > 0) {
+      const r = Math.floor(a.length / (this.datasetBuilder.blockSize * g)), l = Math.max(1, Math.floor(r * i));
+      for (; s.size < l; ) {
         const d = Math.floor(Math.random() * r);
-        i.add(d);
+        s.add(d);
       }
     }
-    const o = await this.datasetBuilder.createTextDataset(s, e, i, !1), n = await this.datasetBuilder.createTextDataset(
-      s,
+    const n = await this.datasetBuilder.createTextDataset(a, e, s, !1), o = await this.datasetBuilder.createTextDataset(
+      a,
       e,
-      i,
+      s,
       !0
     );
-    return { trainDataset: o, validationDataset: n };
+    return { trainDataset: n, validationDataset: o };
   }
   async createDataset(t, e = 32) {
-    const a = await p(t, this.tokenizer);
-    return await this.datasetBuilder.createTextDataset(a, e);
+    const i = await c(t, this.tokenizer);
+    return await this.datasetBuilder.createTextDataset(i, e);
   }
   dispose() {
     this.optimizer && this.optimizer.dispose();

package/dist/training/sparseCrossEntropy.js CHANGED Viewed

@@ -1,28 +1,27 @@
 import { gatherSub as x } from "../ops/gatherSub.js";
 import { scatterSub as L } from "../ops/scatterSub.js";
-import { y, t as u, z as C, c as E } from "../index-BoWRt-10.js";
-import { s as G } from "../softmax-Bv_6lyMX.js";
-import { m as z } from "../max-Ddnnb5xe.js";
-import { l as v } from "../log_sum_exp-DbjkV734.js";
+import { J as C, t as u, L as E, c as G } from "../index-DdmHGZjq.js";
+import { s as y } from "../softmax-BfsyI4As.js";
+import { m as z, l as v } from "../log_sum_exp-C8yFJfZz.js";
 function k(t, s) {
   return u(() => {
-    const n = t.shape[t.shape.length - 1], c = t.shape.slice(0, -1).reduce((o, e) => o * e, 1), h = t.shape.length > 2 ? t.reshape([c, n]) : t, p = s.shape.length > 1 ? s.reshape([c]).cast("int32") : s.cast("int32"), r = z(h, -1, !0), a = E(h, r), m = v(a, -1);
-    return x(m, p, a);
+    const n = t.shape[t.shape.length - 1], c = t.shape.slice(0, -1).reduce((o, e) => o * e, 1), h = t.shape.length > 2 ? t.reshape([c, n]) : t, p = s.shape.length > 1 ? s.reshape([c]).cast("int32") : s.cast("int32"), r = z(h, -1, !0), a = G(h, r), d = v(a, -1);
+    return x(d, p, a);
   });
 }
-function A() {
-  return y(
+function q() {
+  return C(
     // @ts-expect-error Invalid params
-    (s, n, d) => {
-      const c = s.shape[s.shape.length - 1], p = s.shape.slice(0, -1).reduce((o, e) => o * e, 1), r = s.reshape([p, c]), a = n.reshape([p]).cast("int32"), m = k(r, a);
-      return d([r, a]), r.dispose(), a.dispose(), { value: m, gradFunc: (o, e) => u(() => {
-        const S = e[0], f = e[1], b = G(S), l = L(b, f, o), g = C(n);
+    (s, n, m) => {
+      const c = s.shape[s.shape.length - 1], p = s.shape.slice(0, -1).reduce((o, e) => o * e, 1), r = s.reshape([p, c]), a = n.reshape([p]).cast("int32"), d = k(r, a);
+      return m([r, a]), r.dispose(), a.dispose(), { value: d, gradFunc: (o, e) => u(() => {
+        const S = e[0], f = e[1], b = y(S), l = L(b, f, o), g = E(n);
         return [l.reshape(s.shape), g];
       }) };
     }
   );
 }
 export {
-  A as createSoftmaxCrossEntropyWithGrad,
+  q as createSoftmaxCrossEntropyWithGrad,
   k as sparseSoftmaxCrossEntropy
 };

package/dist/utilities/arrayClose.d.ts CHANGED Viewed

	@@ -1 +1 @@
1	- export declare function arraysClose(a: unknown, b: unknown, epsilon?: number): boolean;
1	+ export declare function arraysClose(a: unknown, b: unknown): number;

package/dist/utilities/arrayClose.js CHANGED Viewed

@@ -1,11 +1,20 @@
-function f(r, e, n = 1e-5) {
+function n(r, e) {
+  let t = 0;
   if (Array.isArray(r) && Array.isArray(e)) {
-    if (r.length !== e.length) return !1;
-    for (let t = 0; t < r.length; ++t)
-      if (!f(r[t], e[t], n)) return !1;
-    return !0;
-  } else return typeof r == "number" && typeof e == "number" ? r === -1 / 0 && e === -1 / 0 ? !0 : Math.abs(r - e) < n : !1;
+    if (r.length !== e.length) return Number.POSITIVE_INFINITY;
+    for (let i = 0; i < r.length; ++i)
+      t = Math.max(t, n(r[i], e[i]));
+    return t;
+  } else if (typeof r == "number" && typeof e == "number") {
+    if (isNaN(r) && isNaN(e))
+      return 0;
+    if (!isFinite(r) || !isFinite(e))
+      return r === e ? 0 : Number.POSITIVE_INFINITY;
+    const i = Math.abs(r - e);
+    return t = Math.max(t, i), t;
+  } else
+    return Number.POSITIVE_INFINITY;
 }
 export {
-  f as arraysClose
+  n as arraysClose
 };

package/dist/utilities/dummy.d.ts CHANGED Viewed

@@ -1,9 +1,9 @@
-import { default as NanoGPT } from '../NanoGPTModel';
-export declare function dummyPassAsync(model: NanoGPT): Promise<void>;
+import { default as Model, ModelForwardAttributes } from '../models/model';
+export declare function dummyPassAsync(model: Model<ModelForwardAttributes>): Promise<void>;
 export interface MemoryRequirements {
     perBatch: number;
     tapeSize: number;
     gradients: number;
 }
-export declare function dummyPassTrainAsync(model: NanoGPT): Promise<MemoryRequirements>;
-export declare function dummyPass(model: NanoGPT): void;
+export declare function dummyPassTrainAsync(model: Model<ModelForwardAttributes>): Promise<MemoryRequirements>;
+export declare function dummyPass(model: Model<ModelForwardAttributes>): void;

package/dist/utilities/dummy.js CHANGED Viewed

@@ -1,31 +1,31 @@
-import { m as y, v as P, e as S } from "../index-BoWRt-10.js";
-import { z as i } from "../zeros--BdLQ3oG.js";
+import { m as y, v as P, e as S } from "../index-DdmHGZjq.js";
+import { z as i } from "../zeros-Dnwix0p4.js";
 async function w(s) {
-  const t = i([1, s.config.gpt.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
+  const t = i([1, s.config.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
   await e.data(), e.dispose(), n && n.dispose(), t.dispose();
 }
 async function k(s) {
   const t = y(), e = t.numBytesInGPUAllocated ?? t.numBytesAllocatedInGPU ?? t.numBytes;
   await w(s);
-  const n = i([1, s.config.gpt.blockSize], "int32"), r = i([1, s.config.gpt.blockSize], "int32"), o = {
+  const n = i([1, s.config.blockSize], "int32"), r = i([1, s.config.blockSize], "int32"), o = {
     perBatch: 0,
     tapeSize: 0,
     gradients: s.getNumParams() * 4
   }, f = () => {
-    const [c, l] = s.forward({ training: !0 }, n, r), p = S().state.activeTape;
-    let u = 0;
-    if (p)
-      for (const z of p)
-        u += z.saved?.reduce((B, I) => B + I.size * 4, 0) || 0;
-    return o.tapeSize = u, c.dispose(), l;
-  }, { value: m, grads: d } = P(f), a = y(), g = a.numBytesInGPUAllocated ?? a.numBytesAllocatedInGPU ?? a.numBytes;
-  o.perBatch = g - e - o.gradients, console.log("Dummy training memory requirements:", o), await m.data(), m.dispose();
+    const [c, g] = s.forward({ training: !0 }, n, r), u = S().state.activeTape;
+    let p = 0;
+    if (u)
+      for (const z of u)
+        p += z.saved?.reduce((B, I) => B + I.size * 4, 0) || 0;
+    return o.tapeSize = p, c.dispose(), g;
+  }, { value: m, grads: d } = P(f), a = y(), l = a.numBytesInGPUAllocated ?? a.numBytesAllocatedInGPU ?? a.numBytes;
+  o.perBatch = l - e - o.gradients, console.log("Dummy training memory requirements:", o), await m.data(), m.dispose();
   for (const c in d)
     d[c].dispose();
   return n.dispose(), r.dispose(), o;
 }
 function v(s) {
-  const t = i([1, s.config.gpt.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
+  const t = i([1, s.config.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
   e.dispose(), n && n.dispose(), t.dispose();
 }
 export {

package/dist/utilities/multinomialCPU.js CHANGED Viewed

@@ -1,5 +1,5 @@
-import "../index-BoWRt-10.js";
-import { t as e } from "../tensor2d-wxPAnDQy.js";
+import "../index-DdmHGZjq.js";
+import { t as e } from "../tensor2d-CObBWBkW.js";
 function l(n) {
   let r = 0;
   const i = Math.random();

package/dist/utilities/parameters.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { GPTConfig } from '../config';
+import { GPTConfig } from '../models/config';
 export declare function estimateParameterCount(config: GPTConfig): number;
 export declare function estimateMemoryUsage(config: GPTConfig): number;
 export declare function estimateTrainingMemoryUsage(config: GPTConfig, batchSize: number): number;

package/dist/utilities/performance.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { t as s } from "../index-BoWRt-10.js";
+import { t as s } from "../index-DdmHGZjq.js";
 async function f(e, o = 10, r = !1) {
   for (let t = 0; t < 100; t++) {
     const a = r ? await e() : s(e);

package/dist/utilities/profile.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { m as a } from "../index-BoWRt-10.js";
+import { m as a } from "../index-DdmHGZjq.js";
 const s = 1024 * 1024;
 class l {
   log = /* @__PURE__ */ new Map();

package/dist/utilities/safetensors.js CHANGED Viewed

@@ -1,5 +1,5 @@
-import "../index-BoWRt-10.js";
-import { t as y } from "../tensor-JwS7ZYY6.js";
+import "../index-DdmHGZjq.js";
+import { t as y } from "../tensor-DbqgIV9B.js";
 function l(t) {
   if (t === "float32") return "F32";
   if (t === "int32") return "I32";

package/dist/utilities/weights.js CHANGED Viewed

@@ -1,5 +1,5 @@
-import "../index-BoWRt-10.js";
-import { t as p } from "../tensor-JwS7ZYY6.js";
+import "../index-DdmHGZjq.js";
+import { t as p } from "../tensor-DbqgIV9B.js";
 function h(n) {
   const e = n.reduce((s, o) => s + o.length, 0), a = new Float32Array(e);
   let t = 0;

package/dist/{variable-BuddVFLa.js → variable-DPFOJyRG.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { E as i } from "./index-BoWRt-10.js";
+import { E as i } from "./index-DdmHGZjq.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.

package/dist/{webgpu_program-PFzf1hAQ.js → webgpu_program-Dhk9R5aG.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { aa as k, ab as z, ac as E, a1 as j, l as A } from "./index-BoWRt-10.js";
+import { ae as k, af as z, ag as E, a3 as j, n as A } from "./index-DdmHGZjq.js";
 /**
  * @license
  * Copyright 2019 Google LLC. All Rights Reserved.

package/dist/{webgpu_util-D____QpY.js → webgpu_util-BqGnZg8t.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { l as u } from "./index-BoWRt-10.js";
+import { n as u } from "./index-DdmHGZjq.js";
 /**
  * @license
  * Copyright 2019 Google LLC. All Rights Reserved.
@@ -15,57 +15,57 @@ import { l as u } from "./index-BoWRt-10.js";
  * limitations under the License.
  * =============================================================================
  */
-const e = (r) => {
+const e = (n) => {
   let t = 1;
-  for (let n = 0; n < r.length; n++)
-    t *= r[n];
+  for (let r = 0; r < n.length; r++)
+    t *= n[r];
   return t;
 };
-function m(r, t, n = [1, 1, 1], a = [1, 1, 1]) {
+function m(n, t, r = [1, 1, 1], a = [1, 1, 1]) {
   const [o, i, f] = [
-    Math.ceil(e(r.x.map((c) => t[c])) / (n[0] * a[0])),
-    r.y ? Math.ceil(e(r.y.map((c) => t[c])) / (n[1] * a[1])) : 1,
-    r.z ? Math.ceil(e(r.z.map((c) => t[c])) / (n[2] * a[2])) : 1
+    Math.ceil(e(n.x.map((c) => t[c])) / (r[0] * a[0])),
+    n.y ? Math.ceil(e(n.y.map((c) => t[c])) / (r[1] * a[1])) : 1,
+    n.z ? Math.ceil(e(n.z.map((c) => t[c])) / (r[2] * a[2])) : 1
   ];
   return [o, i, f];
 }
-function d(r, t, n, a = !1) {
+function d(n, t, r, a = !1) {
   const o = [8, 8, 1], i = [4, 4, 1];
-  return a || (r <= 8 && (i[1] = 1), t <= 16 && n <= 16 && (o[0] = 4)), { workgroupSize: o, elementsPerThread: i };
+  return a || (n <= 8 && (i[1] = 1), t <= 16 && r <= 16 && (o[0] = 4)), { workgroupSize: o, elementsPerThread: i };
 }
-function p(r, t, n = !1) {
-  if (n)
+function p(n, t, r = !1) {
+  if (r)
     return [8, 8, 1];
-  const a = e(r.x.map((i) => t[i])), o = e(r.y.map((i) => t[i]));
+  const a = e(n.x.map((i) => t[i])), o = e(n.y.map((i) => t[i]));
   return a <= 4 ? [4, 16, 1] : o <= 4 ? [16, 4, 1] : [16, 16, 1];
 }
-function M(r, t, n = !1) {
-  if (n)
+function M(n, t, r = !1) {
+  if (r)
     return [4, 4, 1];
-  const a = e(r.x.map((i) => t[i])), o = e(r.y.map((i) => t[i]));
+  const a = e(n.x.map((i) => t[i])), o = e(n.y.map((i) => t[i]));
   return a <= 4 ? [1, 2, 1] : o <= 4 ? [2, 1, 1] : [2, 2, 1];
 }
-function h(r) {
-  return { x: r.map((t, n) => n) };
+function h(n) {
+  return { x: n.map((t, r) => r) };
 }
-function x(r) {
-  if (r === "float32" || r === "int32" || r === "bool" || r === "string")
+function x(n) {
+  if (n === "float32" || n === "int32" || n === "bool" || n === "string")
     return 4;
-  if (r === "complex64")
+  if (n === "complex64")
     return 8;
-  throw new Error(`Unknown dtype ${r}`);
+  throw new Error(`Unknown dtype ${n}`);
 }
 function g() {
   return !!(typeof globalThis < "u" && globalThis.navigator && globalThis.navigator.gpu);
 }
-function b(r, t) {
-  Array.isArray(r) || (r = [r]), r.forEach((n) => {
-    n != null && u(n.dtype !== "complex64", () => `${t} does not support complex64 tensors in the WebGPU backend.`);
+function b(n, t) {
+  Array.isArray(n) || (n = [n]), n.forEach((r) => {
+    r != null && u(r.dtype !== "complex64", () => `${t} does not support complex64 tensors in the WebGPU backend.`);
   });
 }
 var s;
-(function(r) {
-  r[r.MatMulReduceProgram = 0] = "MatMulReduceProgram", r[r.MatMulSplitKProgram = 1] = "MatMulSplitKProgram", r[r.MatMulSmallOutputSizeProgram = 2] = "MatMulSmallOutputSizeProgram", r[r.MatMulPackedProgram = 3] = "MatMulPackedProgram", r[r.MatMulMax = 4] = "MatMulMax";
+(function(n) {
+  n[n.MatMulReduceProgram = 0] = "MatMulReduceProgram", n[n.MatMulSplitKProgram = 1] = "MatMulSplitKProgram", n[n.MatMulSmallOutputSizeProgram = 2] = "MatMulSmallOutputSizeProgram", n[n.MatMulPackedProgram = 3] = "MatMulPackedProgram", n[n.MatMulMax = 4] = "MatMulMax";
 })(s || (s = {}));
 export {
   x as G,