@genai-fi/nanogpt 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ export interface IGenerateOptions {
  topK?: number;
  usePadding?: boolean;
  includeAttention?: boolean;
+ includeProbabilities?: boolean;
  }
  export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
  private readonly model;
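
Both option interfaces in this release gain an includeProbabilities flag, which asks generation to also return the post-softmax distribution behind each sampled token. A minimal sketch of an options object; the values are illustrative, and temperature/maxLength are inferred from the reads in dist/Generator.js below rather than shown in this hunk:

    // Illustrative only: field names come from IGenerateOptions above and
    // the options read in dist/Generator.js below.
    const options: IGenerateOptions = {
      temperature: 0.8,
      topK: 10,
      maxLength: 500,
      includeProbabilities: true, // new in 0.1.5: also return per-token distributions
    };
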
package/dist/Generator.js CHANGED
@@ -1,53 +1,62 @@
  import { E as m } from "./index-SOhdqzHq.js";
- const g = 4;
- class w extends m {
- constructor(o, t) {
- super(), this.model = o, this.tokeniser = t;
+ const b = 4;
+ class x extends m {
+ constructor(a, t) {
+ super(), this.model = a, this.tokeniser = t;
  }
- generateBlockOfTokens(o, t) {
- const c = t?.temperature ?? 1, a = t?.topK, r = t?.usePadding ?? t?.includeAttention ?? !1, d = t?.includeAttention ?? !1;
- let s = o, n;
- for (let l = 0; l < g; l++) {
- const { output: e, attention: i } = this.model.generate(s, {
- temperature: c,
- topK: a,
- usePadding: r,
- includeAttention: d
- }), h = s;
- if (s = this.model.tf.concat([s, e], 1), n && i) {
- const u = n;
- n = this.model.tf.concat([n, i], 0), u.dispose();
- } else i && (n = i);
- h.dispose(), e.dispose();
+ generateBlockOfTokens(a, t) {
+ const g = t?.temperature ?? 1, c = t?.topK, d = t?.usePadding ?? t?.includeAttention ?? !1, k = t?.includeAttention ?? !1, h = t?.includeProbabilities ?? !1;
+ let i = a, n, s;
+ for (let e = 0; e < b; e++) {
+ const {
+ output: u,
+ attention: l,
+ probabilities: r
+ } = this.model.generate(i, {
+ temperature: g,
+ topK: c,
+ usePadding: d,
+ includeAttention: k,
+ includeProbabilities: h
+ }), p = i;
+ if (i = this.model.tf.concat([i, u], 1), n && l) {
+ const o = n;
+ n = this.model.tf.concat([n, l], 0), o.dispose();
+ } else l && (n = l);
+ if (s && r) {
+ const o = s;
+ s = this.model.tf.concat([s, r], 0), o.dispose();
+ } else r && (s = r);
+ p.dispose(), u.dispose();
  }
- return { output: s, attention: n };
+ return { output: i, attention: n, probabilities: s };
  }
- async generate(o, t) {
- const c = o ? await this.tokeniser.tokenise([o], !0) : [[this.tokeniser.eosToken]];
- let a = this.model.tf.tensor2d(c, [1, c[0].length], "int32");
+ async generate(a, t) {
+ const g = a ? await this.tokeniser.tokenise([a], !0) : [[this.tokeniser.eosToken]];
+ let c = this.model.tf.tensor2d(g, [1, g[0].length], "int32");
  this.emit("start");
- let r = o || "";
+ let d = a || "";
  for (; ; ) {
- const { output: d, attention: s } = this.generateBlockOfTokens(a, t), n = a;
- a = d;
- const l = d.slice([0, n.shape[1]], [1, g]), e = (await l.array())[0];
- let i = !1, h = !1;
- const u = e.indexOf(this.tokeniser.eosToken);
- u !== -1 && (i = !0, e.splice(u)), e.length + r.length >= (t?.maxLength ?? 1e3) && (h = !0, e.splice(
- t?.maxLength ? t.maxLength - r.length : e.length
+ const { output: k, attention: h, probabilities: i } = this.generateBlockOfTokens(c, t), n = c;
+ c = k;
+ const s = k.slice([0, n.shape[1]], [1, b]), e = (await s.array())[0];
+ n.dispose(), s.dispose();
+ let u = !1, l = !1;
+ const r = e.indexOf(this.tokeniser.eosToken);
+ r !== -1 && (u = !0, e.splice(r)), e.length + d.length >= (t?.maxLength ?? 1e3) && (l = !0, e.splice(
+ t?.maxLength ? t.maxLength - d.length : e.length
  ));
- const k = await this.tokeniser.decode(e);
- if (r += k, s) {
- let f = await s.array();
- f.length > e.length && (f = f.slice(0, e.length)), this.emit("tokens", e, k, f);
- } else
- this.emit("tokens", e, k);
- if (n.dispose(), l.dispose(), i || h)
+ const p = await this.tokeniser.decode(e);
+ d += p;
+ let o;
+ h && (o = await h.array(), h.dispose(), o.length > e.length && (o = o.slice(0, e.length)));
+ let f;
+ if (i && (f = await i.array(), i.dispose(), f.length > e.length && (f = f.slice(0, e.length))), this.emit("tokens", e, p, o, f), u || l)
  break;
  }
- return a.dispose(), this.emit("stop"), r;
+ return c.dispose(), this.emit("stop"), d;
  }
  }
  export {
- w as default
+ x as default
  };
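
De-minified, the rewritten generator threads a probabilities tensor through generateBlockOfTokens and, alongside the token ids, decoded text, and optional attention rows, now passes a probabilities array on the 'tokens' event. A hedged listener sketch; the Node-style on() is an assumption about the EE base class, which this diff does not show, and the payload types are read off the emit("tokens", ...) call above:

    // Hedged sketch: assumes the EE base exposes a Node-style on().
    generator.on('tokens', (tokens: number[], text: string,
        attention?: number[][], probabilities?: number[][]) => {
      // attention/probabilities are defined only when the matching
      // include* option was passed to generate()
      console.log(text, probabilities?.[0]); // distribution for the first new token
    });
    await generator.generate('Once upon a time', {
      maxLength: 200,
      includeProbabilities: true,
    });
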
@@ -13,6 +13,7 @@ export interface GenerateOptions {
  topK?: number;
  usePadding?: boolean;
  includeAttention?: boolean;
+ includeProbabilities?: boolean;
  }
  export default class NanoGPT {
  readonly config: GPTConfig;
@@ -33,6 +34,7 @@ export default class NanoGPT {
  set trainable(value: boolean);
  private validateInput;
  private calculateLoss;
+ private computeAttentionRollout;
  forward(idx: TF.Tensor, targets?: TF.Tensor, training?: boolean, includeAttention?: boolean): {
  logits: TF.Tensor;
  loss?: TF.Tensor;
@@ -41,6 +43,7 @@ export default class NanoGPT {
  generate(idx: TF.Tensor, options?: GenerateOptions): {
  output: TF.Tensor;
  attention?: TF.Tensor;
+ probabilities?: TF.Tensor;
  };
  getNumParams(): number;
  }
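
At the model level, generate() now optionally returns a probabilities tensor alongside output and attention. A sketch of calling it directly; the idx tensor setup is assumed, and the caller owns (and must dispose) every returned tensor:

    // Sketch: assumes a constructed NanoGPT instance and an int32 idx
    // tensor of shape [1, T].
    const { output, attention, probabilities } = model.generate(idx, {
      temperature: 1,
      includeProbabilities: true,
    });
    const dist = probabilities ? await probabilities.array() : undefined;
    output.dispose();
    attention?.dispose();
    probabilities?.dispose();
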
@@ -1,7 +1,7 @@
- import { defaultConfig as y } from "./config.js";
- import z from "./layers/TransformerBlock.js";
- import v from "./layers/TiedEmbedding.js";
- import S from "./layers/LayerNorm.js";
+ import { defaultConfig as z } from "./config.js";
+ import v from "./layers/TransformerBlock.js";
+ import S from "./layers/TiedEmbedding.js";
+ import _ from "./layers/LayerNorm.js";
  class $ {
  config;
  wte;
@@ -17,7 +17,7 @@ class $ {
  log = [];
  // Training log
  constructor(t, e = {}) {
- this.tf = t, this.config = { ...y, ...e }, this.wte = new v(t, {
+ this.tf = t, this.config = { ...z, ...e }, this.wte = new S(t, {
  vocabSize: this.config.vocabSize,
  embedDim: this.config.nEmbed,
  name: "token_embedding"
@@ -28,8 +28,8 @@ class $ {
  embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
  }), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
  for (let s = 0; s < this.config.nLayer; s++)
- this.blocks.push(new z(this.tf, s, this.config));
- this.lnF = new S(t, [this.config.nEmbed], 1e-5, "final_layer_norm");
+ this.blocks.push(new v(this.tf, s, this.config));
+ this.lnF = new _(t, [this.config.nEmbed], 1e-5, "final_layer_norm");
  }
  get variables() {
  return [
@@ -54,8 +54,8 @@ class $ {
  }
  inputPhase(t, e = !1) {
  return this.tf.tidy(() => {
- const [, s] = t.shape, n = this.wte.embed(t), i = this.tf.range(0, s, 1, "int32"), o = this.wpe.apply(i), h = n.add(o);
- return this.drop.apply(h, { training: e });
+ const [, s] = t.shape, i = this.wte.embed(t), n = this.tf.range(0, s, 1, "int32"), a = this.wpe.apply(n), o = i.add(a);
+ return this.drop.apply(o, { training: e });
  });
  }
  setSkipMask(t) {
@@ -90,44 +90,61 @@ class $ {
  throw console.error("Error computing loss:", s), new Error(`Loss computation failed: ${s}`);
  }
  }
- forward(t, e, s = !1, n = !1) {
+ // Attention rollout per Abnar & Zuidema (2020)
+ // Expects list of (B, T, T) attention matrices already averaged over heads.
+ computeAttentionRollout(t) {
+ return this.tf.tidy(() => {
+ if (t.length === 0)
+ throw new Error("No attentions for rollout");
+ const e = t[0].shape[0], s = t[0].shape[1], i = this.tf.eye(s, s).expandDims(0);
+ let n = i.tile([e, 1, 1]);
+ for (const a of t) {
+ let o = a.add(i);
+ o = o.div(o.sum(-1, !0)), n = o.matMul(n);
+ }
+ return n;
+ });
+ }
+ forward(t, e, s = !1, i = !1) {
  return this.validateInput(t), this.tf.tidy(() => {
- let i = this.inputPhase(t, s), o;
- n && (o = this.tf.zeros([i.shape[0], i.shape[1], i.shape[1]]));
- for (const l of this.blocks) {
- const { output: r, attention: f } = l.call(i, s, n);
- i = r, f && o && (o = o.add(f));
+ let n = this.inputPhase(t, s);
+ const a = [];
+ for (const c of this.blocks) {
+ const { output: p, attention: l } = c.call(n, s, i);
+ n = p, i && l && a.push(l);
  }
- o && (o = o.div(this.blocks.length)), i = this.lnF.apply(i);
- const h = this.wte.project(i);
- let a;
- return e && (a = this.calculateLoss(h, e)), { logits: h, loss: a, attention: n ? o : void 0 };
+ let o;
+ i && a.length > 0 && (o = this.computeAttentionRollout(a)), n = this.lnF.apply(n);
+ const h = this.wte.project(n);
+ let r;
+ return e && (r = this.calculateLoss(h, e)), { logits: h, loss: r, attention: i ? o : void 0 };
  });
  }
  generate(t, e) {
- const s = e?.temperature ?? 1, n = e?.topK, i = e?.usePadding ?? !1, o = e?.includeAttention ?? !1;
+ const s = e?.temperature ?? 1, i = e?.topK, n = e?.usePadding ?? !1, a = e?.includeAttention ?? !1;
  return this.tf.tidy(() => {
- const h = t, a = h.shape[1], l = a <= this.config.blockSize ? h : h.slice(
- [0, a - this.config.blockSize],
- [h.shape[0], this.config.blockSize]
- ), r = i ? this.config.blockSize - l.shape[1] : 0, f = r > 0 ? this.tf.pad(l, [
+ const o = t, h = o.shape[1], r = h <= this.config.blockSize ? o : o.slice(
+ [0, h - this.config.blockSize],
+ [o.shape[0], this.config.blockSize]
+ ), c = n ? this.config.blockSize - r.shape[1] : 0, p = c > 0 ? this.tf.pad(r, [
  [0, 0],
- [0, r]
- ]) : l, { logits: g, attention: p } = this.forward(f, void 0, !1, o), d = g.shape[1] - 1 - r, m = g.slice([0, d, 0], [g.shape[0], 1, g.shape[2]]), u = p ? p.slice([0, d, 0], [p.shape[0], 1, p.shape[2]]) : void 0, b = m.div(s);
- let c;
- if (n) {
- const { values: k, indices: w } = this.tf.topk(b, n), E = this.tf.multinomial(k.squeeze([1]), 1);
- c = this.tf.gather(w.squeeze([1]), E, 1);
+ [0, c]
+ ]) : r, { logits: l, attention: g } = this.forward(p, void 0, !1, a), b = l.shape[1] - 1 - c, u = l.slice([0, b, 0], [l.shape[0], 1, l.shape[2]]), k = g ? g.slice([0, b, 0], [g.shape[0], 1, g.shape[2]]) : void 0, d = u.div(s);
+ let f;
+ if (i) {
+ const { values: w, indices: E } = this.tf.topk(d, i), y = this.tf.multinomial(w.squeeze([1]), 1);
+ f = this.tf.gather(E.squeeze([1]), y, 1);
  } else
- c = this.tf.multinomial(b.squeeze([1]), 1);
- return c = c.reshape([1, 1]), { output: c, attention: u?.squeeze([1]) };
+ f = this.tf.multinomial(d.squeeze([1]), 1);
+ let m;
+ return e?.includeProbabilities && (m = this.tf.softmax(d.squeeze([1]))), f = f.reshape([1, 1]), { output: f, attention: k?.squeeze([1]), probabilities: m };
  });
  }
  getNumParams() {
  const t = this.config.vocabSize * this.config.nEmbed + this.config.blockSize * this.config.nEmbed, e = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // qkv + proj
  2 * this.config.nEmbed), s = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
- this.config.nEmbed * 4 * this.config.nEmbed), n = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
- return t + e + s + n;
+ this.config.nEmbed * 4 * this.config.nEmbed), i = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
+ return t + e + s + i;
  }
  }
  export {
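
The substantive change in this file is attention handling: forward() no longer averages per-block attention maps, it collects them and combines them with the new computeAttentionRollout, following Abnar & Zuidema, "Quantifying Attention Flow in Transformers" (2020). A readable reconstruction of the minified method above, for reference:

    import * as TF from '@tensorflow/tfjs';

    // Readable reconstruction of the minified computeAttentionRollout;
    // `attentions` holds one head-averaged (batch, T, T) matrix per layer.
    function attentionRollout(tf: typeof TF, attentions: TF.Tensor[]): TF.Tensor {
      return tf.tidy(() => {
        if (attentions.length === 0) throw new Error('No attentions for rollout');
        const batch = attentions[0].shape[0];
        const seqLen = attentions[0].shape[1];
        const identity = tf.eye(seqLen, seqLen).expandDims(0); // models the residual path
        let rollout = identity.tile([batch, 1, 1]);
        for (const attn of attentions) {
          let a = attn.add(identity);  // rows of A and of I each sum to 1...
          a = a.div(a.sum(-1, true));  // ...so renormalising gives 0.5 * (A + I)
          rollout = a.matMul(rollout); // compose attention across layers
        }
        return rollout;
      });
    }
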
package/dist/config.d.ts CHANGED
@@ -7,5 +7,6 @@ export interface GPTConfig {
  dropout: number;
  biasInLinear: boolean;
  biasInLayerNorm: boolean;
+ mlpFactor: number;
  }
  export declare const defaultConfig: GPTConfig;
package/dist/config.js CHANGED
@@ -1,4 +1,4 @@
- const e = {
+ const a = {
  vocabSize: 50304,
  // GPT-2 vocab size
  blockSize: 1024,
@@ -12,8 +12,9 @@ const e = {
  dropout: 0,
  // Dropout probability
  biasInLinear: !1,
- biasInLayerNorm: !1
+ biasInLayerNorm: !1,
+ mlpFactor: 4
  };
  export {
- e as defaultConfig
+ a as defaultConfig
  };
@@ -1,4 +1,4 @@
1
- class n {
1
+ class l {
2
2
  cFc;
3
3
  cProj;
4
4
  dropout;
@@ -7,7 +7,7 @@ class n {
7
7
  _trainable = !0;
8
8
  constructor(t, i, e) {
9
9
  this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
10
- units: 4 * e.nEmbed,
10
+ units: e.mlpFactor * e.nEmbed,
11
11
  activation: "gelu",
12
12
  useBias: e.biasInLinear,
13
13
  kernelInitializer: this.tf.initializers.randomNormal({
@@ -53,5 +53,5 @@ class n {
53
53
  }
54
54
  }
55
55
  export {
56
- n as default
56
+ l as default
57
57
  };
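
Together with the config change above, the MLP's hidden width is now mlpFactor * nEmbed instead of the hard-coded 4 * nEmbed; the default of 4 preserves the old behaviour. A hedged override sketch, assuming the constructor merges a partial config over defaultConfig as the minified NanoGPTModel.js above shows ({ ...defaults, ...overrides }):

    // `tf` is the injected tfjs module; field names are from config.d.ts above.
    const model = new NanoGPT(tf, {
      nEmbed: 256,
      mlpFactor: 2, // MLP hidden width becomes 2 * 256 instead of the old fixed 4 * 256
    });
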
@@ -0,0 +1,8 @@
+ import { default as NanoGPT } from '../NanoGPTModel';
+ import { default as TF } from '@tensorflow/tfjs';
+ export default class Evaluator {
+ private model;
+ private iterator;
+ constructor(model: NanoGPT, dataset: TF.data.Dataset<TF.TensorContainer>);
+ evaluate(maxBatches?: number): Promise<number>;
+ }
@@ -0,0 +1,22 @@
+ class p {
+ constructor(s, t) {
+ this.model = s, this.iterator = t.iterator();
+ }
+ iterator;
+ async evaluate(s = 100) {
+ let t = 0, o = 0;
+ const c = await this.iterator;
+ for (let a = 0; a < s; a++) {
+ const e = await c.next();
+ if (e.done) break;
+ const n = e.value, { xs: r, ys: l } = n, { loss: i, logits: u } = this.model.forward(r, l, !1, !1);
+ u.dispose(), r.dispose(), l.dispose();
+ const d = i.arraySync();
+ i.dispose(), t += d, o++;
+ }
+ return t / o;
+ }
+ }
+ export {
+ p as default
+ };
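
This new Evaluator replaces GPTTrainer.evaluateOnDataset (deleted later in this diff): it holds a single dataset iterator for its lifetime rather than re-walking the dataset with take().forEachAsync(), and disposes every batch and forward-pass tensor it touches. A usage sketch per the declaration file above; validationDataset is assumed to yield { xs, ys } batches as built elsewhere in the package:

    const evaluator = new Evaluator(model, validationDataset);
    const valLoss = await evaluator.evaluate(5); // mean loss over at most 5 batches
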
@@ -1,18 +1,19 @@
  import { generateText as L } from "../utilities/generate.js";
  import w from "./Trainer.js";
- const g = {
+ import g from "./Evaluator.js";
+ const x = {
  desiredLoss: 0.01,
  logInterval: 1,
  maxSteps: 1e3
  };
- class S extends w {
+ class D extends w {
  constructor(r, i, o, n = 3e-4) {
  super(r, i, o, n);
  }
  // Train for multiple epochs using Dataset API - FIXED memory leaks
  async trainOnDataset(r, i, o) {
- const { desiredLoss: n, logInterval: c, onStep: l, prompt: p, maxSteps: d } = {
- ...g,
+ const { desiredLoss: n, logInterval: d, onStep: l, prompt: p, maxSteps: m } = {
+ ...x,
  ...i
  }, s = {
  pass: 0,
@@ -25,23 +26,23 @@ class S extends w {
  validationLosses: []
  };
  this.dummyPass(), this.model.trainable = !0;
- const m = Date.now();
+ const u = Date.now();
  this.running = !0;
- const u = await r.iterator();
+ const c = o ? new g(this.model, o) : void 0, f = await r.iterator();
  try {
  for (; this.running && !(s.lastLoss < n); ) {
- const e = await u.next();
+ const e = await f.next();
  if (e.done) break;
- const h = e.value, f = this.trainBatch(s, h), a = {
+ const h = e.value, v = this.trainBatch(s, h), a = {
  loss: s.lastLoss,
  step: s.step,
- time: Date.now() - m,
+ time: Date.now() - u,
  batchSize: h.xs.shape[0]
  };
- if (this.model.log.push(a), s.step % c === 0) {
- if (await f, o)
+ if (this.model.log.push(a), s.step % d === 0) {
+ if (await v, c)
  try {
- const t = await this.evaluateOnDataset(o, 5);
+ const t = await c.evaluate(5);
  s.validationLosses.push(t), a.valLoss = t;
  } catch (t) {
  console.error("Validation error:", t);
@@ -56,7 +57,7 @@ class S extends w {
  await l(a);
  }
  }
- s.step >= d && this.stop();
+ s.step >= m && this.stop();
  }
  } catch (e) {
  throw console.error("Training error:", e), this.tf.dispose(), e;
@@ -65,5 +66,5 @@ class S extends w {
  }
  }
  export {
- S as default
+ D as default
  };
@@ -1,32 +1,33 @@
- import { generateText as d } from "../utilities/generate.js";
- import S from "./Trainer.js";
- import { schedule as u } from "./lwSchedule.js";
- const w = {
+ import { generateText as S } from "../utilities/generate.js";
+ import u from "./Trainer.js";
+ import { schedule as v } from "./lwSchedule.js";
+ import w from "./Evaluator.js";
+ const T = {
  desiredLoss: 0.01,
  logInterval: 1,
  stepsPerLayer: 400,
  maxPasses: 3,
  maxSteps: 1e3
  };
- class b extends S {
+ class z extends u {
  trainingPattern = [];
  startPass = 0;
  startLayer = 0;
- constructor(r, a, e, p = 3e-4) {
- if (super(r, a, e, p), this.trainingPattern = u[a.config.nLayer - 1] || [], a.log.length > 0) {
- const i = a.log[a.log.length - 1];
+ constructor(r, s, e, p = 3e-4) {
+ if (super(r, s, e, p), this.trainingPattern = v[s.config.nLayer - 1] || [], s.log.length > 0) {
+ const i = s.log[s.log.length - 1];
  i.pass !== void 0 && i.layer !== void 0 && (this.startPass = i.pass, this.startLayer = i.layer, console.log(`Resuming training from pass ${this.startPass}, layer ${this.startLayer}`));
  }
  }
  applyTrainingPattern(r) {
- const a = r < this.trainingPattern.length ? r : this.trainingPattern.length - 1, e = this.trainingPattern[a];
- this.model.setSkipMask(e.skip), this.model.setTrainableMask(e.trainable), this.resetOptimizer(e.adam), console.log("Applied training pattern:", a, e);
+ const s = r < this.trainingPattern.length ? r : this.trainingPattern.length - 1, e = this.trainingPattern[s];
+ this.model.setSkipMask(e.skip), this.model.setTrainableMask(e.trainable), this.resetOptimizer(e.adam), console.log("Applied training pattern:", s, e);
  }
  // Train for multiple epochs using Dataset API - FIXED memory leaks
- async trainOnDataset(r, a, e) {
- const { desiredLoss: p, logInterval: i, stepsPerLayer: L, onLayerChange: l, onPassComplete: h, onStep: c, prompt: g } = {
- ...w,
- ...a
+ async trainOnDataset(r, s, e) {
+ const { desiredLoss: p, logInterval: i, stepsPerLayer: L, onLayerChange: o, onPassComplete: h, onStep: c, prompt: g } = {
+ ...T,
+ ...s
  }, t = {
  pass: 0,
  layerStep: 0,
@@ -38,47 +39,44 @@ class b extends S {
  validationLosses: []
  };
  this.dummyPass();
- const m = Date.now();
+ const f = Date.now();
  this.startPass = 0, this.startLayer = 0;
- const f = await r.iterator();
+ const y = e ? new w(this.model, e) : void 0, d = await r.iterator();
  this.applyTrainingPattern(t.layerStep % this.trainingPattern.length);
  try {
  for (; !(t.lastLoss < p); ) {
- const n = await f.next();
+ const n = await d.next();
  if (n.done) break;
- const y = n.value, P = this.trainBatch(t, y);
+ const m = n.value, P = this.trainBatch(t, m);
  t.stepSinceLayerChange++;
- const o = {
+ const l = {
  loss: t.lastLoss,
  step: t.step,
- time: Date.now() - m,
- batchSize: y.xs.shape[0],
+ time: Date.now() - f,
+ batchSize: m.xs.shape[0],
  pass: t.pass,
  layer: t.layerStep % this.model.config.nLayer
  };
- if (this.model.log.push(o), t.step % i === 0) {
- if (await P, e)
+ if (this.model.log.push(l), t.step % i === 0) {
+ if (await P, y)
  try {
- const s = await this.evaluateOnDataset(e, 5);
- t.validationLosses.push(s), o.valLoss = s;
- } catch (s) {
- console.error("Validation error:", s);
+ const a = await y.evaluate(5);
+ t.validationLosses.push(a), l.valLoss = a;
+ } catch (a) {
+ console.error("Validation error:", a);
  }
  if (c) {
  if (g) {
- const s = await d(this.tokenizer, this.model, g, 100, {
+ const a = await S(this.tokenizer, this.model, g, 100, {
  temperature: 0.8,
  topK: 10
  });
- o.example = s;
+ l.example = a;
  }
- await c(o);
+ await c(l);
  }
  }
- if (t.stepSinceLayerChange >= L) {
- let s;
- e && (s = await this.evaluateOnDataset(e, 5), t.validationLosses.push(s), o.valLoss = s), t.layerStep++, t.layerStep % this.model.config.nLayer === 0 ? (l && await l(t.layerStep, t.pass, s), h && await h(t.pass), t.pass++) : l && await l(t.layerStep, t.pass, s), t.stepSinceLayerChange = 0, this.applyTrainingPattern(t.layerStep % this.trainingPattern.length);
- }
+ t.stepSinceLayerChange >= L && (t.layerStep++, t.layerStep % this.model.config.nLayer === 0 ? (o && await o(t.layerStep, t.pass), h && await h(t.pass), t.pass++) : o && await o(t.layerStep, t.pass), t.stepSinceLayerChange = 0, this.applyTrainingPattern(t.layerStep % this.trainingPattern.length));
  }
  } catch (n) {
  throw console.error("Training error:", n), this.tf.dispose(), n;
@@ -87,5 +85,5 @@ class b extends S {
  }
  }
  export {
- b as default
+ z as default
  };
@@ -56,7 +56,6 @@ export default abstract class GPTTrainer {
  losses: number[];
  validationLosses: number[];
  }>;
- evaluateOnDataset(dataset: TF.data.Dataset<TF.TensorContainer>, maxBatches?: number): Promise<number>;
  createTrainValidationSplit(textData: string[], batchSize?: number, validationSplit?: number): Promise<{
  trainDataset: TF.data.Dataset<{
  xs: TF.Tensor;
@@ -1,8 +1,8 @@
- import { DatasetBuilder as h } from "./DatasetBuilder.js";
+ import { DatasetBuilder as d } from "./DatasetBuilder.js";
  import p from "./AdamExt.js";
- class y {
- constructor(t, e, s, a = 1e-3) {
- this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = a, this.resetOptimizer(), this.datasetBuilder = new h(this.tf, s, e.config.blockSize);
+ class g {
+ constructor(t, e, s, i = 1e-3) {
+ this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, s, e.config.blockSize);
  }
  model;
  optimizer;
@@ -43,11 +43,11 @@ class y {
  }
  trainStep(t, e = !1, s = !1) {
  return this.tf.tidy(() => {
- const { xs: a, ys: o } = t, r = () => {
- const { loss: l, logits: c } = this.model.forward(a, o, !0);
+ const { xs: i, ys: r } = t, o = () => {
+ const { loss: l, logits: c } = this.model.forward(i, r, !0);
  return c.dispose(), l;
- }, { value: n, grads: i } = this.tf.variableGrads(r);
- return e || (s && (console.log("-------"), this.printGradients(i), console.log("-------")), this.optimizer.applyGradients(i), this.tf.dispose(i)), n;
+ }, { value: n, grads: a } = this.tf.variableGrads(o);
+ return e || (s && (console.log("-------"), this.printGradients(a), console.log("-------")), this.optimizer.applyGradients(a), this.tf.dispose(a)), n;
  });
  }
  dummyPass() {
@@ -64,31 +64,22 @@ class y {
  async trainBatch(t, e) {
  try {
  const s = this.trainStep(e, !1, !1);
- return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((a) => (t.lastLoss = a, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
+ return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
  } catch (s) {
  throw console.error(`Error processing batch at step ${t.step}:`, s), this.tf.dispose(), s;
  }
  }
- // Evaluate model on validation dataset - FIXED memory leaks
- async evaluateOnDataset(t, e = 100) {
- let s = 0, a = 0;
- return await t.take(e).forEachAsync(async (o) => {
- const { xs: r, ys: n } = o, { loss: i, logits: l } = this.model.forward(r, n, !1), d = i.arraySync();
- i.dispose(), l.dispose(), s += d, a++;
- }), s / a;
- }
- // Create training and validation datasets - FIXED memory leaks
  async createTrainValidationSplit(t, e = 32, s = 0.1) {
- const a = Math.floor(t.length * (1 - s)), o = t.slice(0, a), r = t.slice(a), n = await this.datasetBuilder.createTextDataset(o, e), i = await this.datasetBuilder.createTextDataset(r, e);
- return { trainDataset: n, validationDataset: i };
+ const i = Math.floor(t.length * (1 - s)), r = t.slice(0, i), o = t.slice(i), n = await this.datasetBuilder.createTextDataset(r, e), a = await this.datasetBuilder.createTextDataset(o, e);
+ return { trainDataset: n, validationDataset: a };
  }
  async createDataset(t, e = 32) {
  return await this.datasetBuilder.createTextDataset(t, e);
  }
  dispose() {
- this.optimizer && this.optimizer.dispose(), this.tf.dispose();
+ this.optimizer && this.optimizer.dispose();
  }
  }
  export {
- y as default
+ g as default
  };
@@ -1,17 +1,19 @@
- async function w(n, e, o, s, p) {
+ async function w(n, t, r, s, g) {
  if (s <= 0)
  throw new Error("Length must be a positive integer");
- if (o.length === 0)
+ if (r.length === 0)
  throw new Error("Prompt cannot be an empty string");
- const a = await n.tokenise([o], !0), r = (await e.tf.tidy(() => {
- let t = e.tf.tensor2d(a, [1, a[0].length], "int32");
- for (let c = 0; c < s; c++) {
- const { output: d } = e.generate(t, p), u = t;
- t = e.tf.concat([t, d], 1), u.dispose(), d.dispose();
+ const i = await n.tokenise([r], !0), a = t.tf.tidy(() => {
+ let e = t.tf.tensor2d(i, [1, i[0].length], "int32");
+ for (let d = 0; d < s; d++) {
+ const { output: p } = t.generate(e, g), f = e;
+ e = t.tf.concat([e, p], 1), f.dispose(), p.dispose();
  }
- return t;
- }).array())[0], i = r.indexOf(n.eosToken);
- return i !== -1 && r.splice(i), await n.decode(r);
+ return e;
+ }), u = await a.array();
+ a.dispose();
+ const o = u[0], c = o.indexOf(n.eosToken);
+ return c !== -1 && o.splice(c), await n.decode(o);
  }
  export {
  w as generateText
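
This change fixes a leak in generateText: in 0.1.3 the token tensor returned from tf.tidy was read with .array() but never disposed; 0.1.5 keeps a reference, reads it outside the tidy, and frees it explicitly. The pattern, as a minimal sketch with a hypothetical buildTokens() helper (tf being the injected tfjs module):

    const out = tf.tidy(() => buildTokens()); // tidy frees intermediates, keeps the return value
    const values = await out.array();          // read it back outside the tidy
    out.dispose();                             // then free it explicitly (0.1.3 leaked it)
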
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@genai-fi/nanogpt",
- "version": "0.1.3",
+ "version": "0.1.5",
  "type": "module",
  "main": "dist/main.js",
  "types": "dist/main.d.ts",