npm - @genai-fi/nanogpt - Versions diffs - 0.3.1 → 0.4.0 - Mend

@genai-fi/nanogpt 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

package/dist/Generator.js +22 -22
package/dist/MLP-KHhikThU.js +83 -0
package/dist/NanoGPTModel.d.ts +2 -3
package/dist/NanoGPTModel.js +79 -79
package/dist/TeachableLLM.d.ts +4 -3
package/dist/TeachableLLM.js +16 -13
package/dist/Trainer.js +20 -13
package/dist/axis_util-DeydwOoC.js +69 -0
package/dist/{concat-BIZS_td9.js → concat-DS_qH7MI.js} +5 -5
package/dist/config.js +7 -8
package/dist/{gather-BPGW8RsB.js → gather-BUmJIS8n.js} +1 -1
package/dist/{index-pWA4_lUh.js → index-XjBAhiFO.js} +1272 -1174
package/dist/layers/BaseLayer.d.ts +14 -2
package/dist/layers/BaseLayer.js +9 -9
package/dist/layers/CausalSelfAttention.d.ts +4 -8
package/dist/layers/CausalSelfAttention.js +108 -82
package/dist/layers/MLP.d.ts +2 -3
package/dist/layers/MLP.js +5 -62
package/dist/layers/RMSNorm.d.ts +2 -2
package/dist/layers/RMSNorm.js +11 -11
package/dist/layers/RoPECache.js +3 -3
package/dist/layers/TiedEmbedding.js +7 -6
package/dist/layers/TransformerBlock.d.ts +2 -6
package/dist/layers/TransformerBlock.js +9 -12
package/dist/{sum-C7Mgy9Bw.js → log_sum_exp-DJPkVZZn.js} +32 -54
package/dist/main.js +22 -19
package/dist/{mat_mul-D7_a4KJn.js → mat_mul-CKwFEV1Q.js} +1 -1
package/dist/max-DJvEiCAJ.js +25 -0
package/dist/moments-CrWRPcR3.js +53 -0
package/dist/norm-BzY929B_.js +86 -0
package/dist/{ones-Cog-G2ag.js → ones-BO01zpJG.js} +2 -2
package/dist/ops/appendCache.js +1 -1
package/dist/ops/attentionMask.js +1 -1
package/dist/ops/cpu/appendCache.js +2 -2
package/dist/ops/cpu/attentionMask.js +2 -2
package/dist/ops/cpu/fusedSoftmax.d.ts +9 -0
package/dist/ops/cpu/fusedSoftmax.js +23 -0
package/dist/ops/cpu/gatherSub.js +3 -3
package/dist/ops/cpu/mulDropout.d.ts +1 -0
package/dist/ops/cpu/mulDropout.js +17 -0
package/dist/ops/cpu/qkv.js +3 -3
package/dist/ops/cpu/rope.js +5 -5
package/dist/ops/cpu/scatterSub.js +27 -27
package/dist/ops/fusedSoftmax.d.ts +2 -0
package/dist/ops/fusedSoftmax.js +10 -0
package/dist/ops/gatherSub.js +1 -1
package/dist/ops/grads/attentionMask.js +1 -1
package/dist/ops/grads/fusedSoftmax.d.ts +2 -0
package/dist/ops/grads/fusedSoftmax.js +17 -0
package/dist/ops/grads/qkv.js +1 -1
package/dist/ops/grads/rope.js +1 -1
package/dist/ops/mulDrop.d.ts +2 -0
package/dist/ops/mulDrop.js +9 -0
package/dist/ops/node/sparseCrossEntropy.js +1 -1
package/dist/ops/qkv.js +1 -1
package/dist/ops/scatterSub.js +1 -1
package/dist/ops/webgl/appendCache.js +1 -1
package/dist/ops/webgl/attentionMask.js +1 -1
package/dist/ops/webgl/fusedSoftmax.d.ts +11 -0
package/dist/ops/webgl/fusedSoftmax.js +3930 -0
package/dist/ops/webgl/gatherSub.js +1 -1
package/dist/ops/webgl/mulDropout.d.ts +1 -0
package/dist/ops/webgl/mulDropout.js +41 -0
package/dist/ops/webgl/qkv.js +1 -1
package/dist/ops/webgl/rope.js +1 -1
package/dist/ops/webgl/scatterSub.js +1 -1
package/dist/{random_width-oeUIlUZj.js → random_width-CMHmdbSu.js} +4212 -6630
package/dist/{range-CcDl05lo.js → range-DQMNzBWs.js} +1 -1
package/dist/{reshape-C8CR_Bad.js → reshape-DFzh97Sc.js} +1 -1
package/dist/{sin-BJIrfnj7.js → sin-BYM-U4Ut.js} +1 -1
package/dist/slice_util-CnVNPQI-.js +90 -0
package/dist/softmax-4DOn6cPq.js +28 -0
package/dist/{split-DZbvruEP.js → split-CkbeVdF8.js} +3 -3
package/dist/{stack-BMm-efee.js → stack-DaIMO5iX.js} +1 -1
package/dist/sum-C6u3xMi3.js +27 -0
package/dist/{tensor-DJVbYhh1.js → tensor-Cu1fU7H7.js} +1 -1
package/dist/{tensor2d-ZuQSh2D-.js → tensor2d-D0CKdG6B.js} +1 -1
package/dist/tfjs_backend-Bzl2SrRo.js +2460 -0
package/dist/training/AdamExt.js +1 -1
package/dist/training/DatasetBuilder.js +3 -3
package/dist/training/FullTrainer.js +41 -33
package/dist/training/Trainer.d.ts +6 -1
package/dist/training/Trainer.js +13 -12
package/dist/training/sparseCrossEntropy.js +12 -11
package/dist/utilities/dummy.js +8 -8
package/dist/utilities/generate.js +11 -11
package/dist/utilities/load.js +1 -1
package/dist/utilities/profile.js +1 -1
package/dist/utilities/weights.js +2 -2
package/dist/{variable-Dl_ub3pk.js → variable-BS4AKqNU.js} +1 -1
package/dist/{zeros-CCy9C3uU.js → zeros-CmJFiC84.js} +1 -1
package/package.json +1 -1
package/dist/exports_layers-tbTBcwMM.js +0 -25
package/dist/layers/LayerNorm.d.ts +0 -13
package/dist/layers/LayerNorm.js +0 -33
package/dist/moments-DfcpfwKi.js +0 -132
package/dist/softmax-Be_lsqUc.js +0 -105
package/dist/training/LayerTrainer.d.ts +0 -29
package/dist/training/LayerTrainer.js +0 -90
package/dist/training/lwSchedule.d.ts +0 -7
package/dist/training/lwSchedule.js +0 -162

package/dist/Generator.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { E as u } from "./index-Dwqa6Zy2.js";
-import "./index-pWA4_lUh.js";
-import { t as d } from "./tensor2d-ZuQSh2D-.js";
-import { c as k } from "./concat-BIZS_td9.js";
+import "./index-XjBAhiFO.js";
+import { t as d } from "./tensor2d-D0CKdG6B.js";
+import { c as p } from "./concat-DS_qH7MI.js";
 class w extends u {
   constructor(s, e) {
     super(), this.model = s, this.tokeniser = e;
@@ -12,41 +12,41 @@ class w extends u {
     return d(e, [1, e[0].length], "int32");
   }
   async generateNoCache(s, e) {
-    let t = await this.tokenisePrompt(s), o = s || "";
-    const n = e?.maxLength ?? 1e3;
-    for (let a = 0; a < n && this.active; a++) {
+    let t = await this.tokenisePrompt(s), n = s || "";
+    const o = e?.maxLength ?? 1e3;
+    for (let a = 0; a < o && this.active; a++) {
       const {
         output: i,
         attention: c,
         probabilities: l
       } = this.model.generate(t, void 0, e), h = t;
-      t = k([t, i], 1), h.dispose();
+      t = p([t, i], 1), h.dispose();
       const r = await this.processResponse(i, c, l);
       if (i.dispose(), r === null)
         break;
-      o += r;
+      n += r;
     }
-    return t.dispose(), o;
+    return t.dispose(), n;
   }
   async processResponse(s, e, t) {
-    const o = (await s.array())[0][0];
-    if (o === this.tokeniser.eosToken)
+    const n = (await s.array())[0][0];
+    if (n === this.tokeniser.eosToken)
       return null;
-    const n = await this.tokeniser.decode([o]);
+    const o = await this.tokeniser.decode([n]);
     let a;
     e && (a = await e.array(), e.dispose());
     let i;
-    return t && (i = await t.array(), t.dispose()), this.emit("tokens", [o], n, a, i), n;
+    return t && (i = await t.array(), t.dispose()), this.emit("tokens", [n], o, a, i), o;
   }
   async generateCache(s, e) {
-    let t = await this.tokenisePrompt(s), o = s || "";
-    const n = new Array(this.model.config.nLayer).fill(void 0), a = e?.maxLength ?? 1e3;
+    let t = await this.tokenisePrompt(s), n = s || "";
+    const o = new Array(this.model.config.gpt.nLayer).fill(void 0), a = e?.maxLength ?? 1e3;
     for (let i = 0; i < a && this.active; i++) {
       const {
         output: c,
         attention: l,
         probabilities: h
-      } = this.model.generate(t, n, {
+      } = this.model.generate(t, o, {
         ...e,
         usePadding: !1
       });
@@ -54,17 +54,17 @@ class w extends u {
       const r = await this.processResponse(c, l, h);
       if (r === null)
         break;
-      o += r;
+      n += r;
     }
-    return n.forEach((i) => {
+    return o.forEach((i) => {
       i && (i.k.dispose(), i.v.dispose());
-    }), t.dispose(), o;
+    }), t.dispose(), n;
   }
   async generate(s, e) {
-    const t = s && s.length > this.model.config.blockSize ? s.slice(-this.model.config.blockSize) : s;
+    const t = s && s.length > this.model.config.gpt.blockSize ? s.slice(-this.model.config.gpt.blockSize) : s;
     this.active = !0, this.emit("start");
-    const n = await (this.model.config.useRope && !e?.noCache ? this.generateCache(t, e) : this.generateNoCache(t, e));
-    return this.active = !1, this.emit("stop"), n;
+    const o = await (this.model.config.gpt.useRope && !e?.noCache && !e?.includeAttention ? this.generateCache(t, e) : this.generateNoCache(t, e));
+    return this.active = !1, this.emit("stop"), o;
   }
   stop() {
     this.active = !1;

package/dist/MLP-KHhikThU.js ADDED Viewed

@@ -0,0 +1,83 @@
+import { t as d } from "./index-XjBAhiFO.js";
+import c from "./layers/BaseLayer.js";
+import { E as p, D as l, a as h, r as i } from "./random_width-CMHmdbSu.js";
+/**
+ * @license
+ * Copyright 2018 Google LLC
+ *
+ * Use of this source code is governed by an MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT.
+ * =============================================================================
+ */
+function r(s) {
+  return new h(s);
+}
+function u(s) {
+  return new l(s);
+}
+function g(s) {
+  return new p(s);
+}
+class P extends c {
+  cFc;
+  cProj;
+  dropout;
+  index;
+  _trainable = !0;
+  constructor(t, e) {
+    super(e), this.index = t, this.cFc = r({
+      units: e.gpt.mlpFactor * e.gpt.nEmbed,
+      activation: "gelu",
+      useBias: e.gpt.biasInLinear,
+      kernelInitializer: i({
+        mean: 0,
+        stddev: 0.02
+      }),
+      biasInitializer: "zeros",
+      name: `block_${t}_mlp_cFc`
+    }), this.cProj = r({
+      units: e.gpt.nEmbed,
+      useBias: e.gpt.biasInLinear,
+      kernelInitializer: i({
+        mean: 0,
+        stddev: 0.02 / Math.sqrt(2 * e.gpt.nLayer)
+      }),
+      biasInitializer: "zeros",
+      name: `block_${t}_mlp_cProj`
+    }), this.dropout = u({ rate: e.gpt.dropout });
+  }
+  get variables() {
+    return [
+      ...this.cFc.trainableWeights.map((t) => t.read()),
+      ...this.cProj.trainableWeights.map((t) => t.read())
+    ];
+  }
+  get trainable() {
+    return this._trainable;
+  }
+  set trainable(t) {
+    this._trainable = t, this.cFc.trainable = t, this.cProj.trainable = t;
+  }
+  saveWeights(t) {
+    t.set(`block_${this.index}_mlpHidden`, this.cFc.getWeights()), t.set(`block_${this.index}_mlpOut`, this.cProj.getWeights());
+  }
+  loadWeights(t) {
+    this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
+  }
+  call(t, e = !1) {
+    return d(() => {
+      this.startMemory();
+      const a = this.cFc.apply(t), n = this.cProj.apply(a), o = this.dropout.apply(n, { training: e });
+      return this.endMemory("MLP"), o;
+    });
+  }
+  dispose() {
+    this.cFc.dispose(), this.cProj.dispose(), this.dropout.dispose();
+  }
+}
+export {
+  P as M,
+  u as d,
+  g as e
+};

package/dist/NanoGPTModel.d.ts CHANGED Viewed

@@ -1,6 +1,5 @@
 import { GPTConfig } from './config';
 import { KVCache } from './layers/CausalSelfAttention';
-import { default as MemoryProfiler } from './utilities/profile';
 import { default as BaseLayer } from './layers/BaseLayer';
 import { Tensor, Variable } from '@tensorflow/tfjs-core';
 export interface TrainingLogEntry {
@@ -19,7 +18,6 @@ export interface GenerateOptions {
     includeProbabilities?: boolean;
 }
 export default class NanoGPT extends BaseLayer {
-    readonly config: GPTConfig;
     private wte;
     private wpe?;
     private drop;
@@ -28,6 +26,8 @@ export default class NanoGPT extends BaseLayer {
     private ropeCache?;
     log: TrainingLogEntry[];
     constructor(config?: Partial<GPTConfig>);
+    get checkpointing(): boolean;
+    set checkpointing(value: boolean);
     get variables(): Variable[];
     saveWeights(): Map<string, Tensor[]>;
     loadWeights(weights: Map<string, Tensor[]>): void;
@@ -35,7 +35,6 @@ export default class NanoGPT extends BaseLayer {
     setSkipMask(mask: boolean[]): void;
     setTrainableMask(mask: boolean[]): void;
     set trainable(value: boolean);
-    setProfiler(value: MemoryProfiler | undefined): void;
     private validateInput;
     private calculateLoss;
     private computeAttentionRollout;

package/dist/NanoGPTModel.js CHANGED Viewed

@@ -1,18 +1,19 @@
-import { defaultConfig as F } from "./config.js";
-import L from "./layers/TransformerBlock.js";
-import P from "./layers/TiedEmbedding.js";
-import C from "./layers/RoPECache.js";
+import { defaultConfig as x } from "./config.js";
+import W from "./layers/TransformerBlock.js";
+import F from "./layers/TiedEmbedding.js";
+import P from "./layers/RoPECache.js";
 import q from "./layers/RMSNorm.js";
 import { estimateParameterCount as K } from "./utilities/parameters.js";
 import { createSoftmaxCrossEntropyWithGrad as N } from "./training/sparseCrossEntropy.js";
 import T from "./layers/BaseLayer.js";
-import { r as R, e as D, p as A } from "./random_width-oeUIlUZj.js";
-import { o as y, h as E, p as B, E as z, W as G, X as O, Y as Q, t as w, Z as X, f as _ } from "./index-pWA4_lUh.js";
-import { e as j, a as U } from "./exports_layers-tbTBcwMM.js";
-import { r as S } from "./reshape-C8CR_Bad.js";
-import { r as V } from "./range-CcDl05lo.js";
-import { g as Y } from "./gather-BPGW8RsB.js";
-import { s as Z } from "./softmax-Be_lsqUc.js";
+import { r as R, p as D } from "./random_width-CMHmdbSu.js";
+import { o as y, h as $, p as A, E as v, a6 as B, a7 as G, a8 as O, t as w, a5 as Q, f as C } from "./index-XjBAhiFO.js";
+import { e as j, d as U } from "./MLP-KHhikThU.js";
+import { r as _ } from "./reshape-DFzh97Sc.js";
+import { r as V } from "./range-DQMNzBWs.js";
+import { e as X } from "./tfjs_backend-Bzl2SrRo.js";
+import { g as H } from "./gather-BUmJIS8n.js";
+import { s as J } from "./softmax-4DOn6cPq.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -29,13 +30,13 @@ import { s as Z } from "./softmax-Be_lsqUc.js";
  * limitations under the License.
  * =============================================================================
  */
-function H(m, t) {
-  let e = E(m, "a", "mod"), o = E(t, "b", "mod");
-  [e, o] = B(e, o);
+function Y(f, t) {
+  let e = $(f, "a", "mod"), o = $(t, "b", "mod");
+  [e, o] = A(e, o);
   const i = { a: e, b: o };
-  return z.runKernel(G, i);
+  return v.runKernel(B, i);
 }
-const J = /* @__PURE__ */ y({ mod_: H });
+const Z = /* @__PURE__ */ y({ mod_: Y });
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -52,17 +53,17 @@ const J = /* @__PURE__ */ y({ mod_: H });
  * limitations under the License.
  * =============================================================================
  */
-function tt(m, t, e, o = !1) {
-  const i = E(m, "logits", "multinomial"), s = i.size, r = i.rank;
+function tt(f, t, e, o = !1) {
+  const i = $(f, "logits", "multinomial"), s = i.size, r = i.rank;
   if (s < 2)
     throw new Error(`Error in multinomial: you need at least 2 outcomes, but got ${s}.`);
   if (r > 2)
     throw new Error(`Rank of probabilities must be 1 or 2, but is ${r}`);
   e = e || Math.random();
-  const n = { logits: r === 1 ? S(i, [1, -1]) : i }, h = { numSamples: t, seed: e, normalized: o }, a = z.runKernel(O, n, h);
-  return r === 1 ? S(a, [a.size]) : a;
+  const n = { logits: r === 1 ? _(i, [1, -1]) : i }, h = { numSamples: t, seed: e, normalized: o }, l = v.runKernel(G, n, h);
+  return r === 1 ? _(l, [l.size]) : l;
 }
-const I = /* @__PURE__ */ y({ multinomial_: tt });
+const M = /* @__PURE__ */ y({ multinomial_: tt });
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -79,8 +80,8 @@ const I = /* @__PURE__ */ y({ multinomial_: tt });
  * limitations under the License.
  * =============================================================================
  */
-function et(m, t = 1, e = !0) {
-  const o = E(m, "x", "topk");
+function et(f, t = 1, e = !0) {
+  const o = $(f, "x", "topk");
   if (o.rank === 0)
     throw new Error("topk() expects the input to be of rank 1 or higher");
   const i = o.shape[o.shape.length - 1];
@@ -88,12 +89,11 @@ function et(m, t = 1, e = !0) {
     throw new Error(`'k' passed to topk() must be >= 0 but got ${t}`);
   if (t > i)
     throw new Error(`'k' passed to topk() must be <= the last dimension (${i}) but got ${t}`);
-  const s = { x: o }, r = { k: t, sorted: e }, [l, n] = z.runKernel(Q, s, r);
-  return { values: l, indices: n };
+  const s = { x: o }, r = { k: t, sorted: e }, [a, n] = v.runKernel(O, s, r);
+  return { values: a, indices: n };
 }
 const ot = /* @__PURE__ */ y({ topk_: et });
-class kt extends T {
-  config;
+class wt extends T {
   wte;
   // Token embeddings
   wpe;
@@ -107,19 +107,25 @@ class kt extends T {
   log = [];
   // Training log
   constructor(t = {}) {
-    super(), this.config = { ...F, ...t }, this.wte = new P({
-      vocabSize: this.config.vocabSize,
-      embedDim: this.config.nEmbed,
+    super({ gpt: { ...x, ...t }, layerConfig: {} }), this.wte = new F({
+      vocabSize: this.config.gpt.vocabSize,
+      embedDim: this.config.gpt.nEmbed,
       name: "token_embedding"
-    }), this.config.useRope === !1 ? this.wpe = j({
-      inputDim: this.config.blockSize,
-      outputDim: this.config.nEmbed,
+    }), this.config.gpt.useRope === !1 ? this.wpe = j({
+      inputDim: this.config.gpt.blockSize,
+      outputDim: this.config.gpt.nEmbed,
       name: "positional_embedding",
       embeddingsInitializer: R({ mean: 0, stddev: 0.02 })
-    }) : this.ropeCache = new C(this.config), this.drop = U({ rate: this.config.dropout }), this.blocks = [];
-    for (let e = 0; e < this.config.nLayer; e++)
-      this.blocks.push(new L(e, this.config, this.ropeCache));
-    this.lnF = new q([this.config.nEmbed], 1e-8, "final_rms_norm");
+    }) : (this.ropeCache = new P(this.config.gpt), this.config.layerConfig.ropeCache = this.ropeCache), this.drop = U({ rate: this.config.gpt.dropout }), this.blocks = [];
+    for (let e = 0; e < this.config.gpt.nLayer; e++)
+      this.blocks.push(new W(e, this.config));
+    this.lnF = new q(this.config, 1e-8, "final_rms_norm");
+  }
+  get checkpointing() {
+    return this.config.layerConfig.checkpointAttention === !0 || this.config.layerConfig.checkpointMLP === !0;
+  }
+  set checkpointing(t) {
+    this.config.layerConfig.checkpointAttention = t, this.config.layerConfig.checkpointMLP = t;
   }
   get variables() {
     return [
@@ -145,9 +151,9 @@ class kt extends T {
   inputPhase(t, e, o = !1) {
     return w(() => {
       const i = this.wte.embed(t);
-      if (this.config.useRope === !1) {
-        const [, s] = t.shape, r = this.config.blockSize, l = V(0, s, 1, "int32"), n = J(X(l, _(e, "int32")), _(r, "int32")), h = this.wpe.apply(n), a = i.add(h);
-        return this.drop.apply(a, { training: o });
+      if (this.config.gpt.useRope === !1) {
+        const [, s] = t.shape, r = this.config.gpt.blockSize, a = V(0, s, 1, "int32"), n = Z(Q(a, C(e, "int32")), C(r, "int32")), h = this.wpe.apply(n), l = i.add(h);
+        return this.drop.apply(l, { training: o });
       } else
         return this.drop.apply(i, { training: o });
     });
@@ -169,17 +175,11 @@ class kt extends T {
       e.trainable = t;
     this.lnF.trainable = t;
   }
-  setProfiler(t) {
-    this._profiler = t;
-    for (const e of this.blocks)
-      e.setProfiler(t);
-    this.lnF.setProfiler(t);
-  }
   validateInput(t) {
     if (t.shape.length !== 2)
       throw new Error(`Invalid input shape: expected [batch_size, sequence_length], got ${t.shape}`);
-    if (t.shape[1] > this.config.blockSize)
-      throw new Error(`Input sequence length ${t.shape[1]} isn't block size ${this.config.blockSize}`);
+    if (t.shape[1] > this.config.gpt.blockSize)
+      throw new Error(`Input sequence length ${t.shape[1]} isn't block size ${this.config.gpt.blockSize}`);
     if (t.dtype !== "int32")
       throw new Error(`Input tensor must be of type int32, got ${t.dtype}`);
   }
@@ -198,17 +198,17 @@ class kt extends T {
         throw new Error("No attentions for rollout");
       const [e, o, i] = t[0].shape;
       for (const s of t) {
-        const [r, l, n] = s.shape;
-        if (r !== e || l !== o || n !== i)
+        const [r, a, n] = s.shape;
+        if (r !== e || a !== o || n !== i)
           throw new Error(
-            `Inconsistent attention shapes in rollout: expected [${e},${o},${i}] got [${r},${l},${n}]`
+            `Inconsistent attention shapes in rollout: expected [${e},${o},${i}] got [${r},${a},${n}]`
           );
       }
       if (o === i) {
-        const s = D(i, i).expandDims(0);
+        const s = X(i, i).expandDims(0);
         let r = s.tile([e, 1, 1]);
-        for (const l of t) {
-          const n = l.add(s);
+        for (const a of t) {
+          const n = a.add(s);
           r = n.div(n.sum(-1, !0)).matMul(r);
         }
         return r;
@@ -220,52 +220,52 @@ class kt extends T {
     return this.validateInput(t), w(() => {
       this.startMemory();
       const r = s?.[0]?.length ?? 0;
-      let l = this.inputPhase(t, r, o);
+      let a = this.inputPhase(t, r, o);
       const n = [];
       if (s && s.length !== this.blocks.length)
         throw console.error("Cache", s), new Error(`Cache length ${s.length} does not match number of blocks ${this.blocks.length}`);
-      for (let c = 0; c < this.blocks.length; c++) {
-        const u = l, d = this.blocks[c], {
+      for (let p = 0; p < this.blocks.length; p++) {
+        const m = a, u = this.blocks[p], {
           output: b,
           attention: k,
-          cache: f
-        } = d.call(l, o, i, s ? s[c] : void 0);
-        l = b, u.dispose(), i && k && n.push(k), s && f ? (s[c]?.k.dispose(), s[c]?.v.dispose(), s[c] = f) : f && (f.k.dispose(), f.v.dispose());
+          cache: g
+        } = u.call(a, o, i, s ? s[p] : void 0);
+        a = b, m.dispose(), i && k && n.push(k), s && g ? (s[p]?.k.dispose(), s[p]?.v.dispose(), s[p] = g) : g && (g.k.dispose(), g.v.dispose());
       }
       let h;
-      i && n.length > 0 && (h = this.computeAttentionRollout(n)), l = this.lnF.apply(l);
-      const a = this.wte.project(l);
-      let p;
-      return e && (p = this.calculateLoss(a, e)), this.endMemory("Forward"), { logits: a, loss: p, attention: i ? h : void 0 };
+      i && n.length > 0 && (h = this.computeAttentionRollout(n)), a = this.lnF.apply(a);
+      const l = this.wte.project(a);
+      let c;
+      return e && (c = this.calculateLoss(l, e)), this.endMemory("Forward"), { logits: l, loss: c, attention: i ? h : void 0 };
     });
   }
   generate(t, e, o) {
-    const i = o?.temperature ?? 1, s = o?.topK, r = o?.usePadding ?? !1, l = o?.includeAttention ?? !1;
+    const i = o?.temperature ?? 1, s = o?.topK, r = o?.usePadding ?? !1, a = o?.includeAttention ?? !1;
     return w(() => {
-      const n = t, h = n.shape[1], a = h <= this.config.blockSize ? n : n.slice(
-        [0, h - this.config.blockSize],
-        [n.shape[0], this.config.blockSize]
-      ), p = r ? this.config.blockSize - a.shape[1] : 0, c = p > 0 ? A(a, [
+      const n = t, h = n.shape[1], l = h <= this.config.gpt.blockSize ? n : n.slice(
+        [0, h - this.config.gpt.blockSize],
+        [n.shape[0], this.config.gpt.blockSize]
+      ), c = r ? this.config.gpt.blockSize - l.shape[1] : 0, p = c > 0 ? D(l, [
         [0, 0],
-        [0, p]
-      ]) : a, { logits: u, attention: d } = this.forward(c, void 0, !1, l, e), b = u.shape[1] - 1 - p, k = u.slice([0, b, 0], [u.shape[0], 1, u.shape[2]]), f = d ? d.slice([0, b, 0], [d.shape[0], 1, d.shape[2]]) : void 0, $ = k.div(i);
-      let g;
+        [0, c]
+      ]) : l, { logits: m, attention: u } = this.forward(p, void 0, !1, a, e), b = m.shape[1] - 1 - c, k = m.slice([0, b, 0], [m.shape[0], 1, m.shape[2]]), g = u ? u.slice([0, b, 0], [u.shape[0], 1, u.shape[2]]) : void 0, E = k.div(i);
+      let d;
       if (s) {
-        const { values: M, indices: x } = ot($, s), W = I(M.squeeze([1]), 1);
-        g = Y(x.squeeze([1]), W, 1);
+        const { values: S, indices: I } = ot(E, s), L = M(S.squeeze([1]), 1);
+        d = H(I.squeeze([1]), L, 1);
       } else
-        g = I($.squeeze([1]), 1);
-      let v;
-      return o?.includeProbabilities && (v = Z($.squeeze([1]))), g = g.reshape([1, 1]), { output: g, attention: f?.squeeze([1]), probabilities: v };
+        d = M(E.squeeze([1]), 1);
+      let z;
+      return o?.includeProbabilities && (z = J(E.squeeze([1]))), d = d.reshape([1, 1]), { output: d, attention: g?.squeeze([1]), probabilities: z };
     });
   }
   getNumParams() {
-    return K(this.config);
+    return K(this.config.gpt);
   }
   dispose() {
     this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
   }
 }
 export {
-  kt as default
+  wt as default
 };

package/dist/TeachableLLM.d.ts CHANGED Viewed

@@ -1,10 +1,11 @@
 import { GPTConfig } from './config';
 import { ITokeniser } from './tokeniser/type';
-import { default as NanoGPT } from './NanoGPTModel';
+import { default as NanoGPT, TrainingLogEntry } from './NanoGPTModel';
 import { SaveOptions } from './utilities/save';
 import { default as Generator, IGenerateOptions } from './Generator';
 import { default as Trainer, ITrainerOptions } from './Trainer';
 import { default as MemoryProfiler } from './utilities/profile';
+import { TrainingProgress } from './training/Trainer';
 type TeachableLLMStatus = 'warmup' | 'awaitingTokens' | 'ready' | 'training' | 'loading' | 'busy' | 'error';
 export default class TeachableLLM {
     private ee;
@@ -36,11 +37,11 @@ export default class TeachableLLM {
     dispose(): void;
     on(event: 'status', listener: (status: TeachableLLMStatus) => void): void;
     on(event: 'error', listener: (error: Error) => void): void;
-    on(event: 'trainStep', listener: (step: number) => void): void;
+    on(event: 'trainStep', listener: (step: TrainingLogEntry, progress: TrainingProgress) => void): void;
     on(event: 'loaded', listener: () => void): void;
     off(event: 'status', listener: (status: TeachableLLMStatus) => void): void;
     off(event: 'error', listener: (error: Error) => void): void;
-    off(event: 'trainStep', listener: (step: number) => void): void;
+    off(event: 'trainStep', listener: (step: TrainingLogEntry, progress: TrainingProgress) => void): void;
     off(event: 'loaded', listener: () => void): void;
 }
 export {};

package/dist/TeachableLLM.js CHANGED Viewed

@@ -1,17 +1,17 @@
-import { defaultConfig as d } from "./config.js";
-import h from "./NanoGPTModel.js";
-import { saveModel as l } from "./utilities/save.js";
+import { defaultConfig as h } from "./config.js";
+import l from "./NanoGPTModel.js";
+import { saveModel as d } from "./utilities/save.js";
 import { loadModel as f } from "./utilities/load.js";
 import u from "./Generator.js";
 import _ from "./Trainer.js";
-import { E as c } from "./index-Dwqa6Zy2.js";
+import { E as p } from "./index-Dwqa6Zy2.js";
 import { dummyPassAsync as m } from "./utilities/dummy.js";
-import p from "./tokeniser/CharTokeniser.js";
+import c from "./tokeniser/CharTokeniser.js";
 import g from "./tokeniser/bpe.js";
 import "./papaparse.min-C8l2Kvo1.js";
 import "./index-Tf7vU29b.js";
 import "./jszip.min-CjP2V1VV.js";
-import "./index-pWA4_lUh.js";
+import "./index-XjBAhiFO.js";
 import "./ops/cpu/scatterSub.js";
 import "./ops/webgl/scatterSub.js";
 import "./ops/cpu/gatherSub.js";
@@ -28,9 +28,12 @@ import "./ops/webgl/rope.js";
 import "./ops/grads/rope.js";
 import "./ops/cpu/appendCache.js";
 import "./ops/webgl/appendCache.js";
+import "./ops/cpu/fusedSoftmax.js";
+import "./ops/webgl/fusedSoftmax.js";
+import "./ops/grads/fusedSoftmax.js";
 import w from "./utilities/profile.js";
 class a {
-  ee = new c();
+  ee = new p();
   _config;
   _model;
   _tokeniser;
@@ -47,7 +50,7 @@ class a {
   get config() {
     if (!this._config)
       throw new Error("Model configuration is not initialized.");
-    return this._config;
+    return this._config.gpt;
   }
   get model() {
     if (!this._model)
@@ -71,7 +74,7 @@ class a {
   saveModel(t) {
     if (!this._model || !this._tokeniser)
       throw new Error("Model or tokeniser is not initialized.");
-    return l(this._model, this._tokeniser, t);
+    return d(this._model, this._tokeniser, t);
   }
   static loadModel(t) {
     const e = new a();
@@ -86,7 +89,7 @@ class a {
     }), e;
   }
   static create(t, e = {}) {
-    const i = { ...d, ...e }, o = t === "char" ? new p(i.vocabSize) : new g(i.vocabSize), s = new h(i), r = new a(o, s);
+    const i = { ...h, ...e }, o = t === "char" ? new c(i.vocabSize) : new g(i.vocabSize), s = new l(i), r = new a(o, s);
     return r.setStatus("warmup"), m(s).then(() => {
       r.tokeniser.trained ? (r.setStatus("ready"), r.ee.emit("loaded")) : (r.setStatus("awaitingTokens"), r.ee.emit("loaded"), r.tokeniser.once("trainStatus", (n) => {
         n === "trained" && r.setStatus("ready");
@@ -103,11 +106,11 @@ class a {
   }
   set enableProfiler(t) {
     if (t) {
-      if (!this._model)
+      if (!this._config)
         throw new Error("Model is not initialized.");
-      this._model.getProfiler() || this._model.setProfiler(new w());
+      this._config.layerConfig.profiler || (this._config.layerConfig.profiler = new w());
     } else
-      this._model && this._model.setProfiler(void 0);
+      this._config?.layerConfig.profiler && (this._config.layerConfig.profiler = void 0);
   }
   getNumParams() {
     if (!this._model)

package/dist/Trainer.js CHANGED Viewed

@@ -1,10 +1,10 @@
-import { E as l } from "./index-Dwqa6Zy2.js";
-import h from "./training/FullTrainer.js";
-class c extends l {
+import { E as m } from "./index-Dwqa6Zy2.js";
+import d from "./training/FullTrainer.js";
+class S extends m {
   trainer;
   hasTrained = !1;
   constructor(e, t) {
-    super(), this.trainer = new h(e, t, 1e-3);
+    super(), this.trainer = new d(e, t, 1e-3);
   }
   stop() {
     this.trainer.stop();
@@ -13,28 +13,35 @@ class c extends l {
     this.hasTrained = !1, this.trainer.reset();
   }
   async train(e, t) {
-    const { trainDataset: a, validationDataset: r } = await this.trainer.createTrainValidationSplit(
+    const { trainDataset: s, validationDataset: n } = await this.trainer.createTrainValidationSplit(
       e,
       t?.batchSize || 32,
       t?.validationSplit || 0.1
-    );
+    ), r = e.reduce((i, a) => i + a.length, 0) * (1 - (t?.validationSplit || 0));
     this.hasTrained || this.trainer.setLearningRate(t?.learningRate || 1e-3), this.hasTrained = !0, this.emit("start"), await this.trainer.trainOnDataset(
-      a,
+      s,
       {
         prompt: t?.prompt,
         logInterval: t?.logInterval || 10,
         desiredLoss: t?.desiredLoss || 0.01,
         maxSteps: t?.maxSteps || 1e3,
-        onStep: async (i) => {
-          const s = this.listeners("log");
-          for (const n of s)
-            await n(i);
+        onStep: async (i, a) => {
+          const l = this.listeners("log");
+          for (const h of l)
+            await h(i, {
+              ...a,
+              progress: a.totalSamples / r,
+              remaining: Math.max(
+                0,
+                (r - a.totalSamples) / a.totalSamples * a.duration
+              )
+            });
         }
       },
-      r
+      n
     ), this.emit("stop");
   }
 }
 export {
-  c as default
+  S as default
 };