npm - @genai-fi/nanogpt - Versions diffs - 0.1.7 → 0.1.9 - Mend

@genai-fi/nanogpt 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/dist/NanoGPTModel.d.ts +1 -0
package/dist/NanoGPTModel.js +20 -17
package/dist/TeachableLLM.d.ts +4 -2
package/dist/TeachableLLM.js +20 -15
package/dist/Trainer.d.ts +2 -0
package/dist/Trainer.js +10 -5
package/dist/layers/CausalSelfAttention.d.ts +1 -0
package/dist/layers/CausalSelfAttention.js +19 -16
package/dist/layers/LayerNorm.d.ts +1 -0
package/dist/layers/LayerNorm.js +7 -4
package/dist/layers/MLP.d.ts +1 -0
package/dist/layers/MLP.js +16 -13
package/dist/layers/TiedEmbedding.d.ts +1 -0
package/dist/layers/TiedEmbedding.js +18 -15
package/dist/layers/TransformerBlock.d.ts +1 -0
package/dist/layers/TransformerBlock.js +10 -7
package/dist/tokeniser/CharTokeniser.js +23 -23
package/dist/training/FullTrainer.js +27 -29
package/dist/training/Trainer.d.ts +2 -0
package/dist/training/Trainer.js +31 -27
package/dist/utilities/save.d.ts +7 -1
package/dist/utilities/save.js +28 -13
package/package.json +1 -1

package/dist/NanoGPTModel.d.ts CHANGED Viewed

@@ -46,4 +46,5 @@ export default class NanoGPT {
         probabilities?: TF.Tensor;
     };
     getNumParams(): number;
+    dispose(): void;
 }

package/dist/NanoGPTModel.js CHANGED Viewed

@@ -54,7 +54,7 @@ class $ {
   }
   inputPhase(t, e = !1) {
     return this.tf.tidy(() => {
-      const [, s] = t.shape, i = this.wte.embed(t), n = this.tf.range(0, s, 1, "int32"), a = this.wpe.apply(n), o = i.add(a);
+      const [, s] = t.shape, i = this.wte.embed(t), n = this.tf.range(0, s, 1, "int32"), h = this.wpe.apply(n), o = i.add(h);
       return this.drop.apply(o, { training: e });
     });
   }
@@ -98,8 +98,8 @@ class $ {
         throw new Error("No attentions for rollout");
       const e = t[0].shape[0], s = t[0].shape[1], i = this.tf.eye(s, s).expandDims(0);
       let n = i.tile([e, 1, 1]);
-      for (const a of t) {
-        let o = a.add(i);
+      for (const h of t) {
+        let o = h.add(i);
         o = o.div(o.sum(-1, !0)), n = o.matMul(n);
       }
       return n;
@@ -108,36 +108,36 @@ class $ {
   forward(t, e, s = !1, i = !1) {
     return this.validateInput(t), this.tf.tidy(() => {
       let n = this.inputPhase(t, s);
-      const a = [];
+      const h = [];
       for (const c of this.blocks) {
-        const { output: p, attention: l } = c.call(n, s, i);
-        n = p, i && l && a.push(l);
+        const { output: d, attention: l } = c.call(n, s, i);
+        n = d, i && l && h.push(l);
       }
       let o;
-      i && a.length > 0 && (o = this.computeAttentionRollout(a)), n = this.lnF.apply(n);
-      const h = this.wte.project(n);
+      i && h.length > 0 && (o = this.computeAttentionRollout(h)), n = this.lnF.apply(n);
+      const a = this.wte.project(n);
       let r;
-      return e && (r = this.calculateLoss(h, e)), { logits: h, loss: r, attention: i ? o : void 0 };
+      return e && (r = this.calculateLoss(a, e)), { logits: a, loss: r, attention: i ? o : void 0 };
     });
   }
   generate(t, e) {
-    const s = e?.temperature ?? 1, i = e?.topK, n = e?.usePadding ?? !1, a = e?.includeAttention ?? !1;
+    const s = e?.temperature ?? 1, i = e?.topK, n = e?.usePadding ?? !1, h = e?.includeAttention ?? !1;
     return this.tf.tidy(() => {
-      const o = t, h = o.shape[1], r = h <= this.config.blockSize ? o : o.slice(
-        [0, h - this.config.blockSize],
+      const o = t, a = o.shape[1], r = a <= this.config.blockSize ? o : o.slice(
+        [0, a - this.config.blockSize],
         [o.shape[0], this.config.blockSize]
-      ), c = n ? this.config.blockSize - r.shape[1] : 0, p = c > 0 ? this.tf.pad(r, [
+      ), c = n ? this.config.blockSize - r.shape[1] : 0, d = c > 0 ? this.tf.pad(r, [
         [0, 0],
         [0, c]
-      ]) : r, { logits: l, attention: g } = this.forward(p, void 0, !1, a), b = l.shape[1] - 1 - c, u = l.slice([0, b, 0], [l.shape[0], 1, l.shape[2]]), k = g ? g.slice([0, b, 0], [g.shape[0], 1, g.shape[2]]) : void 0, d = u.div(s);
+      ]) : r, { logits: l, attention: p } = this.forward(d, void 0, !1, h), b = l.shape[1] - 1 - c, u = l.slice([0, b, 0], [l.shape[0], 1, l.shape[2]]), k = p ? p.slice([0, b, 0], [p.shape[0], 1, p.shape[2]]) : void 0, g = u.div(s);
       let f;
       if (i) {
-        const { values: w, indices: E } = this.tf.topk(d, i), y = this.tf.multinomial(w.squeeze([1]), 1);
+        const { values: w, indices: E } = this.tf.topk(g, i), y = this.tf.multinomial(w.squeeze([1]), 1);
         f = this.tf.gather(E.squeeze([1]), y, 1);
       } else
-        f = this.tf.multinomial(d.squeeze([1]), 1);
+        f = this.tf.multinomial(g.squeeze([1]), 1);
       let m;
-      return e?.includeProbabilities && (m = this.tf.softmax(d.squeeze([1]))), f = f.reshape([1, 1]), { output: f, attention: k?.squeeze([1]), probabilities: m };
+      return e?.includeProbabilities && (m = this.tf.softmax(g.squeeze([1]))), f = f.reshape([1, 1]), { output: f, attention: k?.squeeze([1]), probabilities: m };
     });
   }
   getNumParams() {
@@ -146,6 +146,9 @@ class $ {
     this.config.nEmbed * 4 * this.config.nEmbed), i = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
     return t + e + s + i;
   }
+  dispose() {
+    this.wte.dispose(), this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
+  }
 }
 export {
   $ as default

package/dist/TeachableLLM.d.ts CHANGED Viewed

@@ -2,10 +2,11 @@ import { default as TF } from '@tensorflow/tfjs';
 import { GPTConfig } from './config';
 import { ITokeniser } from './tokeniser/type';
 import { default as NanoGPT } from './NanoGPTModel';
+import { SaveOptions } from './utilities/save';
 import { default as Generator, IGenerateOptions } from './Generator';
 import { default as Trainer, ITrainerOptions } from './Trainer';
 import { default as EE } from 'eventemitter3';
-type TeachableLLMStatus = 'warmup' | 'ready' | 'training' | 'loading' | 'busy' | 'error';
+type TeachableLLMStatus = 'warmup' | 'awaitingTokens' | 'ready' | 'training' | 'loading' | 'busy' | 'error';
 export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
     private _config?;
     private _model?;
@@ -19,7 +20,7 @@ export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
     get status(): TeachableLLMStatus;
     get ready(): boolean;
     private setStatus;
-    saveModel(): Promise<Blob>;
+    saveModel(options?: SaveOptions): Promise<Blob>;
     static loadModel(tf: typeof TF, data: Blob | Buffer | string): TeachableLLM;
     static create(tf: typeof TF, config?: Partial<GPTConfig>): TeachableLLM;
     getNumParams(): number;
@@ -27,5 +28,6 @@ export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
     train(text: string[], options?: ITrainerOptions): Promise<void>;
     generator(): Generator;
     generateText(prompt?: string, options?: IGenerateOptions): Promise<string>;
+    dispose(): void;
 }
 export {};

package/dist/TeachableLLM.js CHANGED Viewed

@@ -1,13 +1,13 @@
 import d from "./NanoGPTModel.js";
-import { defaultConfig as m } from "./config.js";
-import { saveModel as u } from "./utilities/save.js";
+import { defaultConfig as u } from "./config.js";
+import { saveModel as m } from "./utilities/save.js";
 import { loadModel as l } from "./utilities/load.js";
 import f from "./Generator.js";
 import _ from "./Trainer.js";
 import { E as c } from "./index-SOhdqzHq.js";
-import { dummyPassAsync as a } from "./utilities/dummy.js";
+import { dummyPassAsync as h } from "./utilities/dummy.js";
 import g from "./tokeniser/CharTokeniser.js";
-class n extends c {
+class a extends c {
   _config;
   _model;
   tf;
@@ -35,20 +35,20 @@ class n extends c {
     return this._status;
   }
   get ready() {
-    return this._status === "ready" && !!this._model && !!this._tokeniser;
+    return this._status === "ready" && !!this._model && !!this._tokeniser && this.tokeniser.trained;
   }
   setStatus(t) {
     this._status !== t && (this._status = t, this.emit("status", t));
   }
-  saveModel() {
+  saveModel(t) {
     if (!this._model || !this._tokeniser)
       throw new Error("Model or tokeniser is not initialized.");
-    return u(this._model, this._tokeniser);
+    return m(this._model, this._tokeniser, t);
   }
   static loadModel(t, r) {
-    const e = new n(t);
+    const e = new a(t);
     return l(t, r).then(({ model: i, tokeniser: o }) => {
-      e._model = i, e._tokeniser = o, e._config = i.config, e.setStatus("warmup"), a(i).then(() => {
+      e._model = i, e._tokeniser = o, e._config = i.config, e.setStatus("warmup"), h(i).then(() => {
         e.setStatus("ready");
       }).catch((s) => {
         e.setStatus("error"), e.emit("error", s);
@@ -58,11 +58,13 @@ class n extends c {
     }), e;
   }
   static create(t, r = {}) {
-    const e = { ...m, ...r }, i = new g(e.vocabSize), o = new d(t, e), s = new n(t, i, o);
-    return s.setStatus("warmup"), a(o).then(() => {
-      s.setStatus("ready");
-    }).catch((h) => {
-      s.setStatus("error"), s.emit("error", h);
+    const e = { ...u, ...r }, i = new g(e.vocabSize), o = new d(t, e), s = new a(t, i, o);
+    return s.setStatus("warmup"), h(o).then(() => {
+      s.setStatus("awaitingTokens"), s.tokeniser.once("trainStatus", (n) => {
+        n === "trained" && s.setStatus("ready");
+      });
+    }).catch((n) => {
+      s.setStatus("error"), s.emit("error", n);
     }), s;
   }
   getNumParams() {
@@ -96,7 +98,10 @@ class n extends c {
   generateText(t, r) {
     return this.generator().generate(t, r);
   }
+  dispose() {
+    this._model?.dispose();
+  }
 }
 export {
-  n as default
+  a as default
 };

package/dist/Trainer.d.ts CHANGED Viewed

@@ -12,7 +12,9 @@ export interface ITrainerOptions {
 }
 export default class Trainer extends EE<'start' | 'stop' | 'log'> {
     private trainer;
+    private hasTrained;
     constructor(model: NanoGPT, tokeniser: ITokeniser);
     stop(): void;
+    reset(): void;
     train(text: string[], options?: ITrainerOptions): Promise<void>;
 }

package/dist/Trainer.js CHANGED Viewed

@@ -1,11 +1,16 @@
 import { E as l } from "./index-SOhdqzHq.js";
-import o from "./training/FullTrainer.js";
-class d extends l {
+import h from "./training/FullTrainer.js";
+class m extends l {
   trainer;
+  hasTrained = !1;
   constructor(a, t) {
-    super(), this.trainer = new o(a.tf, a, t, 1e-3);
+    super(), this.trainer = new h(a.tf, a, t, 1e-3);
   }
   stop() {
+    this.trainer.stop();
+  }
+  reset() {
+    this.hasTrained = !1, this.trainer.reset();
   }
   async train(a, t) {
     const { trainDataset: e, validationDataset: r } = await this.trainer.createTrainValidationSplit(
@@ -13,7 +18,7 @@ class d extends l {
       t?.batchSize || 32,
       t?.validationSplit || 0.1
     );
-    this.trainer.setLearningRate(t?.learningRate || 1e-3), this.emit("start"), await this.trainer.trainOnDataset(
+    this.hasTrained || this.trainer.setLearningRate(t?.learningRate || 1e-3), this.hasTrained = !0, this.emit("start"), await this.trainer.trainOnDataset(
       e,
       {
         prompt: t?.prompt,
@@ -31,5 +36,5 @@ class d extends l {
   }
 }
 export {
-  d as default
+  m as default
 };

package/dist/layers/CausalSelfAttention.d.ts CHANGED Viewed

@@ -25,4 +25,5 @@ export default class CausalSelfAttention {
         output: TF.Tensor;
         attention?: TF.Tensor;
     };
+    dispose(): void;
 }

package/dist/layers/CausalSelfAttention.js CHANGED Viewed

@@ -50,35 +50,38 @@ class m {
     this.cAttn.setWeights(t.get(`block_${this.index}_cAttn`) || []), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
   }
   getAttentionScores(t, e, s) {
-    const a = t.shape[2], n = this.tf.matMul(t, e, !1, !0).mul(this.divisor), i = this.maskInf.slice([0, 0], [a, a]), o = n.add(i), h = this.tf.softmax(o, -1);
+    const a = t.shape[2], o = this.tf.matMul(t, e, !1, !0).mul(this.divisor), i = this.maskInf.slice([0, 0], [a, a]), n = o.add(i), h = this.tf.softmax(n, -1);
     return this.attnDropout.apply(h, { training: s });
   }
   getQKV(t) {
-    const [e, s, a] = t.shape, r = this.cAttn.apply(t), [n, i, o] = this.tf.split(r, 3, -1);
+    const [e, s, a] = t.shape, r = this.cAttn.apply(t), [o, i, n] = this.tf.split(r, 3, -1);
     r.dispose();
-    const h = a / this.config.nHead, c = this.tf.reshape(n, [e, s, this.config.nHead, h]);
-    n.dispose();
-    const p = c.transpose([0, 2, 1, 3]);
+    const h = a / this.config.nHead, c = this.tf.reshape(o, [e, s, this.config.nHead, h]);
+    o.dispose();
+    const l = c.transpose([0, 2, 1, 3]);
     c.dispose();
-    const l = this.tf.reshape(i, [e, s, this.config.nHead, h]);
+    const d = this.tf.reshape(i, [e, s, this.config.nHead, h]);
     i.dispose();
-    const u = l.transpose([0, 2, 1, 3]);
-    l.dispose();
-    const d = this.tf.reshape(o, [e, s, this.config.nHead, h]);
-    o.dispose();
-    const b = d.transpose([0, 2, 1, 3]);
-    return d.dispose(), [p, u, b];
+    const u = d.transpose([0, 2, 1, 3]);
+    d.dispose();
+    const p = this.tf.reshape(n, [e, s, this.config.nHead, h]);
+    n.dispose();
+    const b = p.transpose([0, 2, 1, 3]);
+    return p.dispose(), [l, u, b];
   }
   getOutputProjection(t, e) {
-    const s = t.shape[0], a = t.shape[2], r = this.config.nEmbed, n = t.transpose([0, 2, 1, 3]), i = this.tf.reshape(n, [s, a, r]), o = this.cProj.apply(i);
-    return this.residDropout.apply(o, { training: e });
+    const s = t.shape[0], a = t.shape[2], r = this.config.nEmbed, o = t.transpose([0, 2, 1, 3]), i = this.tf.reshape(o, [s, a, r]), n = this.cProj.apply(i);
+    return this.residDropout.apply(n, { training: e });
   }
   call(t, e = !1, s = !1) {
     return this.tf.tidy(() => {
-      const [a, r, n] = this.getQKV(t), i = this.getAttentionScores(a, r, e), o = this.tf.matMul(i, n);
-      return { output: this.getOutputProjection(o, e), attention: s ? i.mean(1) : void 0 };
+      const [a, r, o] = this.getQKV(t), i = this.getAttentionScores(a, r, e), n = this.tf.matMul(i, o);
+      return { output: this.getOutputProjection(n, e), attention: s ? i.mean(1) : void 0 };
     });
   }
+  dispose() {
+    this.cAttn.dispose(), this.cProj.dispose(), this.attnDropout.dispose(), this.residDropout.dispose(), this.bias.dispose(), this.maskInf.dispose(), this.divisor.dispose();
+  }
 }
 export {
   m as default

package/dist/layers/LayerNorm.d.ts CHANGED Viewed

@@ -9,4 +9,5 @@ export default class LayerNorm {
     getWeights(): TF.Tensor[];
     setWeights(weights: TF.Tensor[]): void;
     apply(x: TF.Tensor): TF.Tensor;
+    dispose(): void;
 }

package/dist/layers/LayerNorm.js CHANGED Viewed

@@ -1,4 +1,4 @@
-class u {
+class h {
   gamma;
   //private beta: TF.Variable;
   epsilon;
@@ -20,11 +20,14 @@ class u {
   }
   apply(a) {
     return this.tf.tidy(() => {
-      const s = a.mean(-1, !0), t = a.sub(s), n = t.square().mean(-1, !0).add(this.epsilon).rsqrt();
-      return t.mul(n).mul(this.gamma);
+      const s = a.mean(-1, !0), t = a.sub(s), i = t.square().mean(-1, !0).add(this.epsilon).rsqrt();
+      return t.mul(i).mul(this.gamma);
     });
   }
+  dispose() {
+    this.gamma.dispose();
+  }
 }
 export {
-  u as default
+  h as default
 };

package/dist/layers/MLP.d.ts CHANGED Viewed

@@ -14,4 +14,5 @@ export default class MLP {
     saveWeights(map: Map<string, TF.Tensor[]>): void;
     loadWeights(weights: Map<string, TF.Tensor[]>): void;
     call(x: TF.Tensor, training?: boolean): TF.Tensor;
+    dispose(): void;
 }

package/dist/layers/MLP.js CHANGED Viewed

@@ -5,27 +5,27 @@ class l {
   tf;
   index;
   _trainable = !0;
-  constructor(t, i, e) {
-    this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
-      units: e.mlpFactor * e.nEmbed,
+  constructor(t, e, i) {
+    this.tf = t, this.index = e, this.cFc = this.tf.layers.dense({
+      units: i.mlpFactor * i.nEmbed,
       activation: "gelu",
-      useBias: e.biasInLinear,
+      useBias: i.biasInLinear,
       kernelInitializer: this.tf.initializers.randomNormal({
         mean: 0,
         stddev: 0.02
       }),
       biasInitializer: "zeros",
-      name: `block_${i}_mlp_cFc`
+      name: `block_${e}_mlp_cFc`
     }), this.cProj = this.tf.layers.dense({
-      units: e.nEmbed,
-      useBias: e.biasInLinear,
+      units: i.nEmbed,
+      useBias: i.biasInLinear,
       kernelInitializer: this.tf.initializers.randomNormal({
         mean: 0,
-        stddev: 0.02 / Math.sqrt(2 * e.nLayer)
+        stddev: 0.02 / Math.sqrt(2 * i.nLayer)
       }),
       biasInitializer: "zeros",
-      name: `block_${i}_mlp_cProj`
-    }), this.dropout = this.tf.layers.dropout({ rate: e.dropout });
+      name: `block_${e}_mlp_cProj`
+    }), this.dropout = this.tf.layers.dropout({ rate: i.dropout });
   }
   get variables() {
     return [
@@ -45,12 +45,15 @@ class l {
   loadWeights(t) {
     this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
   }
-  call(t, i = !1) {
+  call(t, e = !1) {
     return this.tf.tidy(() => {
-      const e = this.cFc.apply(t), s = this.cProj.apply(e);
-      return this.dropout.apply(s, { training: i });
+      const i = this.cFc.apply(t), s = this.cProj.apply(i);
+      return this.dropout.apply(s, { training: e });
     });
   }
+  dispose() {
+    this.cFc.dispose(), this.cProj.dispose(), this.dropout.dispose();
+  }
 }
 export {
   l as default

package/dist/layers/TiedEmbedding.d.ts CHANGED Viewed

@@ -19,4 +19,5 @@ export default class TiedEmbeddingOutputLayer {
         vocabSize: number;
         embedDim: number;
     };
+    dispose(): void;
 }

package/dist/layers/TiedEmbedding.js CHANGED Viewed

@@ -168,11 +168,11 @@ const we = /* @__PURE__ */ p({ imag_: Ke });
  * limitations under the License.
  * =============================================================================
  */
-function ze(t, e = 0.2) {
+function We(t, e = 0.2) {
   const n = { x: a(t, "x", "leakyRelu") }, r = { alpha: e };
   return u.runKernel(ae, n, r);
 }
-const Ee = /* @__PURE__ */ p({ leakyRelu_: ze });
+const ze = /* @__PURE__ */ p({ leakyRelu_: We });
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -189,11 +189,11 @@ const Ee = /* @__PURE__ */ p({ leakyRelu_: ze });
  * limitations under the License.
  * =============================================================================
  */
-function We(t) {
+function Ee(t) {
   const s = { x: a(t, "x", "neg") };
   return u.runKernel(ue, s);
 }
-const Oe = /* @__PURE__ */ p({ neg_: We });
+const Oe = /* @__PURE__ */ p({ neg_: Ee });
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -368,7 +368,7 @@ function Ue(t, e, s, n) {
   if (e === "prelu")
     return Fe(t, s);
   if (e === "leakyrelu")
-    return Ee(t, n);
+    return ze(t, n);
   if (e === "sigmoid")
     return De(t);
   throw new Error(`Unknown fused activation ${e}.`);
@@ -397,18 +397,18 @@ function Je({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
   }
   let o = a(t, "a", "fused matMul"), c = a(e, "b", "fused matMul");
   [o, c] = A(o, c);
-  const b = s ? o.shape[o.rank - 2] : o.shape[o.rank - 1], D = n ? c.shape[c.rank - 1] : c.shape[c.rank - 2], w = s ? o.shape[o.rank - 1] : o.shape[o.rank - 2], z = n ? c.shape[c.rank - 2] : c.shape[c.rank - 1], T = o.shape.slice(0, -2), S = c.shape.slice(0, -2), N = q(T), v = q(S);
+  const b = s ? o.shape[o.rank - 2] : o.shape[o.rank - 1], D = n ? c.shape[c.rank - 1] : c.shape[c.rank - 2], w = s ? o.shape[o.rank - 1] : o.shape[o.rank - 2], W = n ? c.shape[c.rank - 2] : c.shape[c.rank - 1], T = o.shape.slice(0, -2), S = c.shape.slice(0, -2), N = q(T), v = q(S);
   B(b === D, () => `Error in fused matMul: inner shapes (${b}) and (${D}) of Tensors with shapes ${o.shape} and ${c.shape} and transposeA=${s} and transposeB=${n} must match.`);
-  const O = P(o.shape.slice(0, -2), c.shape.slice(0, -2)).concat([w, z]), R = s ? f(o, [N, b, w]) : f(o, [N, w, b]), F = n ? f(c, [v, z, D]) : f(c, [v, D, z]);
+  const O = P(o.shape.slice(0, -2), c.shape.slice(0, -2)).concat([w, W]), R = s ? f(o, [N, b, w]) : f(o, [N, w, b]), F = n ? f(c, [v, W, D]) : f(c, [v, D, W]);
   let y;
   r != null && (y = a(r, "bias", "fused matMul"), [y] = A(y, o), P(O, y.shape));
   let C;
   l != null && (C = a(l, "prelu weights", "fused matMul"));
   const G = (x, K) => {
-    const [g, $, k, E] = K, m = qe(f(x, k.shape), k, i);
+    const [g, $, k, z] = K, m = qe(f(x, k.shape), k, i);
     let _, M;
     if (!s && !n ? (_ = d(m, $, !1, !0), M = d(g, m, !0, !1)) : !s && n ? (_ = d(m, $, !1, !1), M = d(m, g, !0, !1)) : s && !n ? (_ = d($, m, !1, !0), M = d(g, m, !1, !1)) : (_ = d($, m, !0, !0), M = d(m, g, !0, !0)), r != null) {
-      const Q = Pe(E, m);
+      const Q = Pe(z, m);
       return [_, M, Q];
     } else
       return [_, M];
@@ -425,11 +425,11 @@ function Je({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
     );
     return $([K, g, k]), { value: f(k, O), gradFunc: G };
   })(R, F) : U((K, g, $, k) => {
-    const E = (
+    const z = (
       // tslint:disable-next-line: no-unnecessary-type-assertion
       u.runKernel(H, I, j)
     );
-    return k([K, g, E, $]), { value: f(E, O), gradFunc: G };
+    return k([K, g, z, $]), { value: f(z, O), gradFunc: G };
   })(R, F, y);
 }
 const J = /* @__PURE__ */ p({ fusedMatMul_: Je });
@@ -442,9 +442,9 @@ const J = /* @__PURE__ */ p({ fusedMatMul_: Je });
  * https://opensource.org/licenses/MIT.
  * =============================================================================
  */
-class W extends Error {
+class E extends Error {
   constructor(e) {
-    super(e), Object.setPrototypeOf(this, W.prototype);
+    super(e), Object.setPrototypeOf(this, E.prototype);
   }
 }
 /**
@@ -458,11 +458,11 @@ class W extends Error {
  */
 function Qe(t, e, s, n) {
   if (t.rank < 2 || e.rank < 2)
-    throw new W(`dot requires both inputs to be rank >= 2 but got x shape = ${t.shape} and y shape = ${e.shape}`);
+    throw new E(`dot requires both inputs to be rank >= 2 but got x shape = ${t.shape} and y shape = ${e.shape}`);
   if (e.rank >= 3) {
     const r = t.shape.slice(-1)[0], i = e.shape.slice(-2)[0];
     if (r !== i)
-      throw new W(`If rank y >= 3, then the second last dim of y must equal the last dim of x but got x shape = ${t.shape} and  y shape = ${e.shape}`);
+      throw new E(`If rank y >= 3, then the second last dim of y must equal the last dim of x but got x shape = ${t.shape} and  y shape = ${e.shape}`);
   }
   if (t.rank === 2 && e.rank === 2)
     return J({
@@ -526,6 +526,9 @@ class Ye {
       embedDim: this.embedDim
     };
   }
+  dispose() {
+    this.tiedWeights.dispose();
+  }
 }
 export {
   Ye as default

package/dist/layers/TransformerBlock.d.ts CHANGED Viewed

@@ -20,4 +20,5 @@ export default class Block {
         output: TF.Tensor;
         attention?: TF.Tensor;
     };
+    dispose(): void;
 }

package/dist/layers/TransformerBlock.js CHANGED Viewed

@@ -10,8 +10,8 @@ class u {
   index;
   _trainable = !0;
   skipped = !1;
-  constructor(t, s, i) {
-    this.tf = t, this.index = s, this.ln1 = new l(t, [i.nEmbed], 1e-5, `block_${this.index}_ln1`), this.attn = new h(this.tf, this.index, i), this.ln2 = new l(t, [i.nEmbed], 1e-5, `block_${this.index}_ln2`), this.mlp = new r(this.tf, this.index, i);
+  constructor(t, i, s) {
+    this.tf = t, this.index = i, this.ln1 = new l(t, [s.nEmbed], 1e-5, `block_${this.index}_ln1`), this.attn = new h(this.tf, this.index, s), this.ln2 = new l(t, [s.nEmbed], 1e-5, `block_${this.index}_ln2`), this.mlp = new r(this.tf, this.index, s);
   }
   get variables() {
     return [
@@ -33,18 +33,21 @@ class u {
   loadWeights(t) {
     this.attn.loadWeights(t), this.mlp.loadWeights(t), this.ln1.setWeights(t.get(`block_${this.index}_ln1`) || []), this.ln2.setWeights(t.get(`block_${this.index}_ln2`) || []);
   }
-  getMLPOutput(t, s) {
-    const i = this.ln2.apply(t), e = this.mlp.call(i, s);
+  getMLPOutput(t, i) {
+    const s = this.ln2.apply(t), e = this.mlp.call(s, i);
     return t.add(e);
   }
-  call(t, s = !1, i = !1) {
+  call(t, i = !1, s = !1) {
     return this.tf.tidy(() => {
       if (this.skipped)
         return { output: t };
-      const e = this.ln1.apply(t), n = this.attn.call(e, s, i), a = t.add(n.output);
-      return { output: this.getMLPOutput(a, s), attention: n.attention };
+      const e = this.ln1.apply(t), n = this.attn.call(e, i, s), a = t.add(n.output);
+      return { output: this.getMLPOutput(a, i), attention: n.attention };
     });
   }
+  dispose() {
+    this.ln1.dispose(), this.attn.dispose(), this.ln2.dispose(), this.mlp.dispose();
+  }
 }
 export {
   u as default

package/dist/tokeniser/CharTokeniser.js CHANGED Viewed

@@ -1,6 +1,6 @@
-import { E as h } from "../index-SOhdqzHq.js";
-const c = ["<eos>", "<unk>"];
-class l extends h {
+import { E as r } from "../index-SOhdqzHq.js";
+const h = ["<eos>", "<unk>"];
+class l extends r {
   vocabSize = 0;
   eosToken = 0;
   unkToken = 0;
@@ -9,7 +9,7 @@ class l extends h {
   constructor(s) {
     if (super(), Array.isArray(s))
       if (this.vocab = s, this.vocab.length > 0)
-        this.vocabSize = this.vocab.length, this.eosToken = this.vocab.indexOf("<eos>"), this.unkToken = this.vocab.indexOf("<unk>"), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab.forEach((i, o) => {
+        this.vocabSize = this.vocab.length, this.eosToken = this.vocab.indexOf("<eos>"), this.unkToken = this.vocab.indexOf("<unk>"), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<pad>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("_")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf(" ")), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab.forEach((i, o) => {
           this.cache.set(i, o);
         });
       else
@@ -23,29 +23,29 @@ class l extends h {
   destroy() {
   }
   async train(s) {
-    const i = s.map((e) => e.split("")).flat(), o = new Set(i), t = Array.from(o), n = this.vocabSize - c.length;
-    if (t.length > n) {
-      const e = /* @__PURE__ */ new Map();
+    const i = s.map((t) => t.split("")).flat(), o = new Set(i), e = Array.from(o), n = this.vocabSize - h.length;
+    if (e.length > n) {
+      const t = /* @__PURE__ */ new Map();
       i.forEach((a) => {
-        e.set(a, (e.get(a) || 0) + 1);
-      }), t.sort((a, r) => (e.get(a) || 0) - (e.get(r) || 0)), t.splice(0, t.length - n);
-    } else if (t.length < n)
-      for (; t.length < n; )
-        t.push("<pad>");
-    return t.sort((e, a) => e.charCodeAt(0) - a.charCodeAt(0)), this.vocab = [...t, ...c], this.eosToken = this.vocab.indexOf("<eos>"), this.unkToken = this.vocab.indexOf("<unk>"), this.vocabSize = this.vocab.length, this.cache.clear(), this.vocab.forEach((e, a) => {
-      this.cache.set(e, a);
-    }), this.vocabSize;
+        t.set(a, (t.get(a) || 0) + 1);
+      }), e.sort((a, c) => (t.get(a) || 0) - (t.get(c) || 0)), e.splice(0, e.length - n);
+    } else if (e.length < n)
+      for (; e.length < n; )
+        e.push("<pad>");
+    return e.sort((t, a) => t.charCodeAt(0) - a.charCodeAt(0)), this.vocab = [...e, ...h], this.eosToken = this.vocab.indexOf("<eos>"), this.unkToken = this.vocab.indexOf("<unk>"), this.vocabSize = this.vocab.length, this.cache.clear(), this.vocab.forEach((t, a) => {
+      this.cache.set(t, a);
+    }), this.emit("trainStatus", "trained"), this.vocabSize;
   }
   async tokenise(s, i) {
     if (!this.trained)
       throw new Error("Tokeniser not trained");
-    return s.map((t) => i ? t.split("").map((n) => this.cache.get(n) ?? this.unkToken) : t.split("").map((n) => {
-      const e = this.cache.get(n);
-      return e !== void 0 ? this.vocab[e] : "<unk>";
+    return s.map((e) => i ? e.split("").map((n) => this.cache.get(n) ?? this.unkToken) : e.split("").map((n) => {
+      const t = this.cache.get(n);
+      return t !== void 0 ? this.vocab[t] : "<unk>";
     }));
   }
   async detokenise(s) {
-    return s.map((o) => o.map((t) => this.vocab[t]).join(""));
+    return s.map((o) => o.map((e) => this.vocab[e]).join(""));
   }
   async encode(s) {
     return (await this.tokenise([s], !0))[0];
@@ -60,10 +60,10 @@ class l extends h {
     return [];
   }
   async createTrainingData(s, i = 5) {
-    const o = await this.tokenise(s, !0), t = [], n = [];
-    for (let e = 0; e < o.length - i; e++)
-      t.push(...o[e].slice(0, i)), n.push(o[e + 1][0]);
-    return [t, n];
+    const o = await this.tokenise(s, !0), e = [], n = [];
+    for (let t = 0; t < o.length - i; t++)
+      e.push(...o[t].slice(0, i)), n.push(o[t + 1][0]);
+    return [e, n];
   }
 }
 export {

package/dist/training/FullTrainer.js CHANGED Viewed

@@ -1,70 +1,68 @@
 import { generateText as L } from "../utilities/generate.js";
 import w from "./Trainer.js";
-import g from "./Evaluator.js";
-const x = {
+import x from "./Evaluator.js";
+const g = {
   desiredLoss: 0.01,
   logInterval: 1,
   maxSteps: 1e3
 };
-class D extends w {
+class P extends w {
   constructor(r, i, o, n = 3e-4) {
     super(r, i, o, n);
   }
   // Train for multiple epochs using Dataset API - FIXED memory leaks
   async trainOnDataset(r, i, o) {
-    const { desiredLoss: n, logInterval: d, onStep: l, prompt: p, maxSteps: m } = {
-      ...x,
+    const { desiredLoss: n, logInterval: m, onStep: l, prompt: c, maxSteps: d } = {
+      ...g,
       ...i
-    }, s = {
-      pass: 0,
-      depth: 1,
+    }, t = {
       step: 0,
-      stepSinceDepthChange: 0,
       lastLoss: 1e6,
       totalSteps: 0,
       losses: [],
-      validationLosses: []
+      validationLosses: [],
+      ...this.lastState || {}
     };
-    this.dummyPass(), this.model.trainable = !0;
+    this.lastState = t, this.dummyPass(), this.model.trainable = !0;
     const u = Date.now();
     this.running = !0;
-    const c = o ? new g(this.model, o) : void 0, f = await r.iterator();
+    const h = o ? new x(this.model, o) : void 0, f = await r.iterator();
     try {
-      for (; this.running && !(s.lastLoss < n); ) {
+      for (; this.running && !(t.lastLoss < n); ) {
         const e = await f.next();
         if (e.done) break;
-        const h = e.value, v = this.trainBatch(s, h), a = {
-          loss: s.lastLoss,
-          step: s.step,
+        const p = e.value, v = this.trainBatch(t, p), a = {
+          loss: t.lastLoss,
+          step: t.step,
           time: Date.now() - u,
-          batchSize: h.xs.shape[0]
+          batchSize: p.xs.shape[0]
         };
-        if (this.model.log.push(a), s.step % d === 0) {
-          if (await v, c)
+        if (this.model.log.push(a), t.step % m === 0) {
+          if (await v, h)
             try {
-              const t = await c.evaluate(5);
-              s.validationLosses.push(t), a.valLoss = t;
-            } catch (t) {
-              console.error("Validation error:", t);
+              const s = await h.evaluate(5);
+              t.validationLosses.push(s), a.valLoss = s;
+            } catch (s) {
+              console.error("Validation error:", s);
             }
           if (l) {
-            if (p) {
-              const t = await L(this.tokenizer, this.model, p, 100, {
+            if (c) {
+              const s = await L(this.tokenizer, this.model, c, 100, {
                 temperature: 0.8
               });
-              a.example = t;
+              a.example = s;
             }
             await l(a);
           }
         }
-        s.step >= m && this.stop();
+        t.step >= d && this.stop();
       }
     } catch (e) {
       throw console.error("Training error:", e), this.tf.dispose(), e;
     }
-    return this.tf.dispose(), this.running = !1, { losses: s.losses, validationLosses: s.validationLosses };
+    return this.tf.dispose(), this.running = !1, { losses: t.losses, validationLosses: t.validationLosses };
   }
 }
 export {
-  D as default
+  P as default
 };

package/dist/training/Trainer.d.ts CHANGED Viewed

@@ -31,8 +31,10 @@ export default abstract class GPTTrainer {
     protected tf: typeof TF;
     protected learningRate: number;
     protected running: boolean;
+    protected lastState?: TrainingState;
     constructor(tf: typeof TF, model: NanoGPT, tokenizer: ITokeniser, learningRate?: number);
     setLearningRate(learningRate: number): void;
+    reset(): void;
     stop(): void;
     getOptimizer(): AdamExt;
     resetOptimizer(config?: AdamConfig): void;

package/dist/training/Trainer.js CHANGED Viewed

@@ -1,8 +1,8 @@
 import { DatasetBuilder as d } from "./DatasetBuilder.js";
-import p from "./AdamExt.js";
+import h from "./AdamExt.js";
 class u {
-  constructor(t, s, e, i = 1e-3) {
-    this.tokenizer = e, this.tf = t, this.model = s, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, e, s.config.blockSize);
+  constructor(t, e, s, i = 1e-3) {
+    this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, s, e.config.blockSize);
   }
   model;
   optimizer;
@@ -10,9 +10,13 @@ class u {
   tf;
   learningRate;
   running = !1;
+  lastState;
   setLearningRate(t) {
     this.learningRate = t, this.resetOptimizer({ learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 });
   }
+  reset() {
+    this.lastState = void 0, this.running = !1;
+  }
   stop() {
     this.running = !1;
   }
@@ -21,7 +25,7 @@ class u {
   }
   resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
     this.optimizer && this.optimizer.dispose();
-    const s = new p(
+    const e = new h(
       t.learningRateFactor * this.learningRate,
       t.beta1,
       t.beta2,
@@ -33,53 +37,53 @@ class u {
         weightDecay: 0
       }
     );
-    this.optimizer = s;
+    this.optimizer = e;
   }
   printGradients(t) {
-    Object.keys(t).forEach((s) => {
-      const e = t[s];
-      console.log(`${s}:`), console.log(`  Shape: ${e.shape}`), console.log(`  Mean: ${this.tf.mean(e).dataSync()[0]}`), console.log(`  Std: ${this.tf.moments(e).variance.sqrt().dataSync()[0]}`), console.log(`  Min: ${this.tf.min(e).dataSync()[0]}`), console.log(`  Max: ${this.tf.max(e).dataSync()[0]}`), console.log(`  Norm: ${this.tf.norm(e).dataSync()[0]}`);
+    Object.keys(t).forEach((e) => {
+      const s = t[e];
+      console.log(`${e}:`), console.log(`  Shape: ${s.shape}`), console.log(`  Mean: ${this.tf.mean(s).dataSync()[0]}`), console.log(`  Std: ${this.tf.moments(s).variance.sqrt().dataSync()[0]}`), console.log(`  Min: ${this.tf.min(s).dataSync()[0]}`), console.log(`  Max: ${this.tf.max(s).dataSync()[0]}`), console.log(`  Norm: ${this.tf.norm(s).dataSync()[0]}`);
     });
   }
-  trainStep(t, s = !1, e = !1) {
+  trainStep(t, e = !1, s = !1) {
     return this.tf.tidy(() => {
       const { xs: i, ys: a } = t, o = () => {
         const { loss: l, logits: c } = this.model.forward(i, a, !0);
         return c.dispose(), l;
       }, { value: n, grads: r } = this.tf.variableGrads(o);
-      return s || (e && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.tf.dispose(r)), n;
+      return e || (s && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.tf.dispose(r)), n;
     });
   }
   dummyPass() {
-    const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), s = this.tf.zeros([1, this.model.config.blockSize, this.model.config.vocabSize]);
+    const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), e = this.tf.zeros([1, this.model.config.blockSize, this.model.config.vocabSize]);
     try {
-      const e = this.trainStep({ xs: t, ys: s }, !0);
-      e.dataSync(), e.dispose();
-    } catch (e) {
-      console.error("Error during dummy pass:", e);
+      const s = this.trainStep({ xs: t, ys: e }, !0);
+      s.dataSync(), s.dispose();
+    } catch (s) {
+      console.error("Error during dummy pass:", s);
     } finally {
-      t.dispose(), s.dispose();
+      t.dispose(), e.dispose();
     }
   }
-  async trainBatch(t, s) {
+  async trainBatch(t, e) {
     try {
-      const e = this.trainStep(s, !1, !1);
-      return s.xs.dispose(), s.ys.dispose(), t.step++, t.totalSteps++, e.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), e.dispose(), t.lastLoss));
-    } catch (e) {
-      throw console.error(`Error processing batch at step ${t.step}:`, e), this.tf.dispose(), e;
+      const s = this.trainStep(e, !1, !1);
+      return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
+    } catch (s) {
+      throw console.error(`Error processing batch at step ${t.step}:`, s), this.tf.dispose(), s;
     }
   }
-  async createTrainValidationSplit(t, s = 32, e = 0.1) {
-    const i = await this.datasetBuilder.createTextDataset(t, s, 0, 1 - e), a = await this.datasetBuilder.createTextDataset(
+  async createTrainValidationSplit(t, e = 32, s = 0.1) {
+    const i = await this.datasetBuilder.createTextDataset(t, e, 0, 1 - s), a = await this.datasetBuilder.createTextDataset(
       t,
-      s,
-      1 - e,
+      e,
+      1 - s,
       1
     );
     return { trainDataset: i, validationDataset: a };
   }
-  async createDataset(t, s = 32) {
-    return await this.datasetBuilder.createTextDataset(t, s);
+  async createDataset(t, e = 32) {
+    return await this.datasetBuilder.createTextDataset(t, e);
   }
   dispose() {
     this.optimizer && this.optimizer.dispose();

package/dist/utilities/save.d.ts CHANGED Viewed

@@ -1,3 +1,9 @@
 import { default as NanoGPT } from '../NanoGPTModel';
 import { ITokeniser } from '../tokeniser/type';
-export declare function saveModel(model: NanoGPT, tokeniser: ITokeniser): Promise<Blob>;
+export interface SaveOptions {
+    includeLog?: boolean;
+    name?: string;
+    metadata?: Record<string, unknown>;
+    files?: Record<string, unknown>;
+}
+export declare function saveModel(model: NanoGPT, tokeniser: ITokeniser, options?: SaveOptions): Promise<Blob>;

package/dist/utilities/save.js CHANGED Viewed

@@ -1,21 +1,36 @@
-import { z as f } from "../jszip.min-BLbRbbKt.js";
-import { exportWeights as g } from "./weights.js";
-async function l(i, t) {
-  const o = i.saveWeights(), e = new f(), s = {};
-  for (const [n, r] of o) {
-    const a = await g(r);
-    s[n] = a.spec, e.file(`${n}.bin`, a.data.buffer, { binary: !0 });
+import { z as g } from "../jszip.min-BLbRbbKt.js";
+import { exportWeights as l } from "./weights.js";
+const b = "1.0.0";
+async function p(t, s, i) {
+  const o = i?.includeLog ?? !0, c = t.saveWeights(), e = new g(), f = {};
+  for (const [n, a] of c) {
+    const r = await l(a);
+    f[n] = r.spec, e.file(`${n}.bin`, r.data.buffer, { binary: !0 });
   }
-  return e.file("manifest.json", JSON.stringify({ weightSpec: s, config: i.config }), {
-    binary: !1
-  }), e.file(
+  if (e.file(
+    "manifest.json",
+    JSON.stringify({
+      weightSpec: f,
+      config: t.config,
+      version: b,
+      application: "@genai-fi/nanogpt",
+      meta: i?.metadata,
+      name: i?.name
+    }),
+    {
+      binary: !1
+    }
+  ), e.file(
     "tokeniser.json",
-    JSON.stringify({ vocab: t.getVocab(), merges: await t.getMerges() }),
+    JSON.stringify({ vocab: s.getVocab(), merges: await s.getMerges() }),
     {
       binary: !1
     }
-  ), e.file("log.json", JSON.stringify(i.log), { binary: !1 }), e.generateAsync({ type: "blob" });
+  ), o && e.file("log.json", JSON.stringify(t.log), { binary: !1 }), i?.files)
+    for (const [n, a] of Object.entries(i.files))
+      e.file(n, JSON.stringify(a), { binary: !1 });
+  return e.generateAsync({ type: "blob" });
 }
 export {
-  l as saveModel
+  p as saveModel
 };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@genai-fi/nanogpt",
-    "version": "0.1.7",
+    "version": "0.1.9",
     "type": "module",
     "main": "dist/main.js",
     "types": "dist/main.d.ts",