@genai-fi/nanogpt 0.1.8 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.d.ts +6 -8
- package/dist/Generator.js +55 -52
- package/dist/NanoGPTModel.d.ts +5 -3
- package/dist/NanoGPTModel.js +69 -53
- package/dist/TeachableLLM.js +15 -15
- package/dist/Trainer.d.ts +2 -0
- package/dist/Trainer.js +10 -5
- package/dist/config.d.ts +1 -0
- package/dist/config.js +5 -3
- package/dist/layers/CausalSelfAttention.d.ts +12 -2
- package/dist/layers/CausalSelfAttention.js +73 -40
- package/dist/layers/RMSNorm.d.ts +13 -0
- package/dist/layers/RMSNorm.js +32 -0
- package/dist/layers/RoPECache.d.ts +16 -0
- package/dist/layers/RoPECache.js +39 -0
- package/dist/layers/TransformerBlock.d.ts +5 -2
- package/dist/layers/TransformerBlock.js +14 -10
- package/dist/training/FullTrainer.js +27 -29
- package/dist/training/Trainer.d.ts +2 -0
- package/dist/training/Trainer.js +31 -27
- package/dist/utilities/generate.js +14 -14
- package/package.json +1 -1
package/dist/Generator.d.ts
CHANGED
|
@@ -1,18 +1,16 @@
|
|
|
1
|
-
import { default as NanoGPT } from './NanoGPTModel';
|
|
1
|
+
import { default as NanoGPT, GenerateOptions } from './NanoGPTModel';
|
|
2
2
|
import { ITokeniser } from './tokeniser/type';
|
|
3
3
|
import { default as EE } from 'eventemitter3';
|
|
4
|
-
export interface IGenerateOptions {
|
|
4
|
+
export interface IGenerateOptions extends GenerateOptions {
|
|
5
5
|
maxLength?: number;
|
|
6
|
-
temperature?: number;
|
|
7
|
-
topK?: number;
|
|
8
|
-
usePadding?: boolean;
|
|
9
|
-
includeAttention?: boolean;
|
|
10
|
-
includeProbabilities?: boolean;
|
|
11
6
|
}
|
|
12
7
|
export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
|
|
13
8
|
private readonly model;
|
|
14
9
|
private readonly tokeniser;
|
|
15
10
|
constructor(model: NanoGPT, tokeniser: ITokeniser);
|
|
16
|
-
private
|
|
11
|
+
private tokenisePrompt;
|
|
12
|
+
private generateNoCache;
|
|
13
|
+
private processResponse;
|
|
14
|
+
private generateCache;
|
|
17
15
|
generate(prompt?: string, options?: IGenerateOptions): Promise<string>;
|
|
18
16
|
}
|
package/dist/Generator.js
CHANGED
|
@@ -1,62 +1,65 @@
|
|
|
1
|
-
import { E as
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
super(), this.model = a, this.tokeniser = t;
|
|
1
|
+
import { E as u } from "./index-SOhdqzHq.js";
|
|
2
|
+
class k extends u {
|
|
3
|
+
constructor(s, e) {
|
|
4
|
+
super(), this.model = s, this.tokeniser = e;
|
|
6
5
|
}
|
|
7
|
-
|
|
8
|
-
const
|
|
9
|
-
|
|
10
|
-
|
|
6
|
+
async tokenisePrompt(s) {
|
|
7
|
+
const e = s ? await this.tokeniser.tokenise([s], !0) : [[this.tokeniser.eosToken]];
|
|
8
|
+
return this.model.tf.tensor2d(e, [1, e[0].length], "int32");
|
|
9
|
+
}
|
|
10
|
+
async generateNoCache(s, e) {
|
|
11
|
+
let t = await this.tokenisePrompt(s), n = s || "";
|
|
12
|
+
const a = e?.maxLength ?? 1e3;
|
|
13
|
+
for (let i = 0; i < a; i++) {
|
|
11
14
|
const {
|
|
12
|
-
output:
|
|
13
|
-
attention:
|
|
14
|
-
probabilities:
|
|
15
|
-
} = this.model.generate(
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
}), p = i;
|
|
22
|
-
if (i = this.model.tf.concat([i, u], 1), n && l) {
|
|
23
|
-
const o = n;
|
|
24
|
-
n = this.model.tf.concat([n, l], 0), o.dispose();
|
|
25
|
-
} else l && (n = l);
|
|
26
|
-
if (s && r) {
|
|
27
|
-
const o = s;
|
|
28
|
-
s = this.model.tf.concat([s, r], 0), o.dispose();
|
|
29
|
-
} else r && (s = r);
|
|
30
|
-
p.dispose(), u.dispose();
|
|
15
|
+
output: o,
|
|
16
|
+
attention: c,
|
|
17
|
+
probabilities: l
|
|
18
|
+
} = this.model.generate(t, void 0, e), h = t;
|
|
19
|
+
t = this.model.tf.concat([t, o], 1), h.dispose();
|
|
20
|
+
const r = await this.processResponse(o, c, l);
|
|
21
|
+
if (o.dispose(), r === null)
|
|
22
|
+
break;
|
|
23
|
+
n += r;
|
|
31
24
|
}
|
|
32
|
-
return
|
|
25
|
+
return t.dispose(), n;
|
|
33
26
|
}
|
|
34
|
-
async
|
|
35
|
-
const
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
27
|
+
async processResponse(s, e, t) {
|
|
28
|
+
const n = (await s.array())[0][0];
|
|
29
|
+
if (n === this.tokeniser.eosToken)
|
|
30
|
+
return null;
|
|
31
|
+
const a = await this.tokeniser.decode([n]);
|
|
32
|
+
let i;
|
|
33
|
+
e && (i = await e.array(), e.dispose());
|
|
34
|
+
let o;
|
|
35
|
+
return t && (o = await t.array(), t.dispose()), this.emit("tokens", [n], a, i, o), a;
|
|
36
|
+
}
|
|
37
|
+
async generateCache(s, e) {
|
|
38
|
+
let t = await this.tokenisePrompt(s), n = s || "";
|
|
39
|
+
const a = new Array(this.model.config.nLayer).fill(void 0), i = e?.maxLength ?? 1e3;
|
|
40
|
+
for (let o = 0; o < i; o++) {
|
|
41
|
+
const {
|
|
42
|
+
output: c,
|
|
43
|
+
attention: l,
|
|
44
|
+
probabilities: h
|
|
45
|
+
} = this.model.generate(t, a, {
|
|
46
|
+
...e,
|
|
47
|
+
usePadding: !1
|
|
48
|
+
});
|
|
49
|
+
t.dispose(), t = c;
|
|
50
|
+
const r = await this.processResponse(c, l, h);
|
|
51
|
+
if (r === null)
|
|
55
52
|
break;
|
|
53
|
+
n += r;
|
|
56
54
|
}
|
|
57
|
-
return
|
|
55
|
+
return t.dispose(), n;
|
|
56
|
+
}
|
|
57
|
+
async generate(s, e) {
|
|
58
|
+
this.emit("start");
|
|
59
|
+
const t = this.model.config.useRope ? this.generateCache(s, e) : this.generateNoCache(s, e);
|
|
60
|
+
return this.emit("stop"), t;
|
|
58
61
|
}
|
|
59
62
|
}
|
|
60
63
|
export {
|
|
61
|
-
|
|
64
|
+
k as default
|
|
62
65
|
};
|
package/dist/NanoGPTModel.d.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from './config';
|
|
3
|
+
import { KVCache } from './layers/CausalSelfAttention';
|
|
3
4
|
export interface TrainingLogEntry {
|
|
4
5
|
loss: number;
|
|
5
6
|
valLoss?: number;
|
|
@@ -18,10 +19,11 @@ export interface GenerateOptions {
|
|
|
18
19
|
export default class NanoGPT {
|
|
19
20
|
readonly config: GPTConfig;
|
|
20
21
|
private wte;
|
|
21
|
-
private wpe
|
|
22
|
+
private wpe?;
|
|
22
23
|
private drop;
|
|
23
24
|
private blocks;
|
|
24
25
|
private lnF;
|
|
26
|
+
private ropeCache?;
|
|
25
27
|
readonly tf: typeof TF;
|
|
26
28
|
log: TrainingLogEntry[];
|
|
27
29
|
constructor(tf: typeof TF, config?: Partial<GPTConfig>);
|
|
@@ -35,12 +37,12 @@ export default class NanoGPT {
|
|
|
35
37
|
private validateInput;
|
|
36
38
|
private calculateLoss;
|
|
37
39
|
private computeAttentionRollout;
|
|
38
|
-
forward(idx: TF.Tensor, targets?: TF.Tensor, training?: boolean, includeAttention?: boolean): {
|
|
40
|
+
forward(idx: TF.Tensor, targets?: TF.Tensor, training?: boolean, includeAttention?: boolean, cache?: (KVCache | undefined)[]): {
|
|
39
41
|
logits: TF.Tensor;
|
|
40
42
|
loss?: TF.Tensor;
|
|
41
43
|
attention?: TF.Tensor;
|
|
42
44
|
};
|
|
43
|
-
generate(idx: TF.Tensor, options?: GenerateOptions): {
|
|
45
|
+
generate(idx: TF.Tensor, cache?: (KVCache | undefined)[], options?: GenerateOptions): {
|
|
44
46
|
output: TF.Tensor;
|
|
45
47
|
attention?: TF.Tensor;
|
|
46
48
|
probabilities?: TF.Tensor;
|
package/dist/NanoGPTModel.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import { defaultConfig as
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import
|
|
5
|
-
|
|
1
|
+
import { defaultConfig as v } from "./config.js";
|
|
2
|
+
import S from "./layers/TransformerBlock.js";
|
|
3
|
+
import _ from "./layers/TiedEmbedding.js";
|
|
4
|
+
import L from "./layers/RoPECache.js";
|
|
5
|
+
import I from "./layers/RMSNorm.js";
|
|
6
|
+
class F {
|
|
6
7
|
config;
|
|
7
8
|
wte;
|
|
8
9
|
// Token embeddings
|
|
@@ -13,27 +14,28 @@ class $ {
|
|
|
13
14
|
blocks;
|
|
14
15
|
lnF;
|
|
15
16
|
// Final layer norm
|
|
17
|
+
ropeCache;
|
|
16
18
|
tf;
|
|
17
19
|
log = [];
|
|
18
20
|
// Training log
|
|
19
21
|
constructor(t, e = {}) {
|
|
20
|
-
this.tf = t, this.config = { ...
|
|
22
|
+
this.tf = t, this.config = { ...v, ...e }, this.wte = new _(t, {
|
|
21
23
|
vocabSize: this.config.vocabSize,
|
|
22
24
|
embedDim: this.config.nEmbed,
|
|
23
25
|
name: "token_embedding"
|
|
24
|
-
}), this.wpe = this.tf.layers.embedding({
|
|
26
|
+
}), this.config.useRope === !1 ? this.wpe = this.tf.layers.embedding({
|
|
25
27
|
inputDim: this.config.blockSize,
|
|
26
28
|
outputDim: this.config.nEmbed,
|
|
27
29
|
name: "positional_embedding",
|
|
28
30
|
embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
|
|
29
|
-
}), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
|
|
31
|
+
}) : this.ropeCache = new L(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
|
|
30
32
|
for (let s = 0; s < this.config.nLayer; s++)
|
|
31
|
-
this.blocks.push(new
|
|
32
|
-
this.lnF = new
|
|
33
|
+
this.blocks.push(new S(this.tf, s, this.config, this.ropeCache));
|
|
34
|
+
this.lnF = new I(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
|
|
33
35
|
}
|
|
34
36
|
get variables() {
|
|
35
37
|
return [
|
|
36
|
-
|
|
38
|
+
//...this.wpe.trainableWeights.map((v) => v.read() as TF.Variable),
|
|
37
39
|
...this.blocks.flatMap((t) => t.variables),
|
|
38
40
|
...this.lnF.trainableWeights.map((t) => t),
|
|
39
41
|
...this.wte.variables
|
|
@@ -41,21 +43,28 @@ class $ {
|
|
|
41
43
|
}
|
|
42
44
|
saveWeights() {
|
|
43
45
|
const t = /* @__PURE__ */ new Map();
|
|
44
|
-
t.set("token_embedding", this.wte.getWeights()), t.set("positional_embedding", this.wpe.getWeights());
|
|
46
|
+
t.set("token_embedding", this.wte.getWeights()), this.wpe && t.set("positional_embedding", this.wpe.getWeights());
|
|
45
47
|
for (let e = 0; e < this.blocks.length; e++)
|
|
46
48
|
this.blocks[e].saveWeights(t);
|
|
47
|
-
return t.set("
|
|
49
|
+
return t.set("final_rms_norm", this.lnF.getWeights()), t;
|
|
48
50
|
}
|
|
49
51
|
loadWeights(t) {
|
|
50
|
-
this.wte.setWeights(t.get("token_embedding") || []), this.wpe.setWeights(t.get("positional_embedding") || []);
|
|
52
|
+
this.wte.setWeights(t.get("token_embedding") || []), this.wpe && this.wpe.setWeights(t.get("positional_embedding") || []);
|
|
51
53
|
for (let e = 0; e < this.blocks.length; e++)
|
|
52
54
|
this.blocks[e].loadWeights(t);
|
|
53
|
-
this.lnF.setWeights(t.get("
|
|
55
|
+
this.lnF.setWeights(t.get("final_rms_norm") || []);
|
|
54
56
|
}
|
|
55
|
-
inputPhase(t, e = !1) {
|
|
57
|
+
inputPhase(t, e, s = !1) {
|
|
56
58
|
return this.tf.tidy(() => {
|
|
57
|
-
const
|
|
58
|
-
|
|
59
|
+
const o = this.wte.embed(t);
|
|
60
|
+
if (this.config.useRope === !1) {
|
|
61
|
+
const [, i] = t.shape, a = this.config.blockSize, n = this.tf.range(0, i, 1, "int32"), h = this.tf.mod(
|
|
62
|
+
this.tf.add(n, this.tf.scalar(e, "int32")),
|
|
63
|
+
this.tf.scalar(a, "int32")
|
|
64
|
+
), c = this.wpe.apply(h), r = o.add(c);
|
|
65
|
+
return this.drop.apply(r, { training: s });
|
|
66
|
+
} else
|
|
67
|
+
return this.drop.apply(o, { training: s });
|
|
59
68
|
});
|
|
60
69
|
}
|
|
61
70
|
setSkipMask(t) {
|
|
@@ -73,7 +82,7 @@ class $ {
|
|
|
73
82
|
set trainable(t) {
|
|
74
83
|
for (const e of this.blocks)
|
|
75
84
|
e.trainable = t;
|
|
76
|
-
this.
|
|
85
|
+
this.lnF.trainable = t;
|
|
77
86
|
}
|
|
78
87
|
validateInput(t) {
|
|
79
88
|
if (t.shape.length !== 2)
|
|
@@ -96,60 +105,67 @@ class $ {
|
|
|
96
105
|
return this.tf.tidy(() => {
|
|
97
106
|
if (t.length === 0)
|
|
98
107
|
throw new Error("No attentions for rollout");
|
|
99
|
-
const e = t[0].shape[0], s = t[0].shape[1],
|
|
100
|
-
let
|
|
101
|
-
for (const
|
|
102
|
-
let
|
|
103
|
-
|
|
108
|
+
const e = t[0].shape[0], s = t[0].shape[1], o = this.tf.eye(s, s).expandDims(0);
|
|
109
|
+
let i = o.tile([e, 1, 1]);
|
|
110
|
+
for (const a of t) {
|
|
111
|
+
let n = a.add(o);
|
|
112
|
+
n = n.div(n.sum(-1, !0)), i = n.matMul(i);
|
|
104
113
|
}
|
|
105
|
-
return
|
|
114
|
+
return i;
|
|
106
115
|
});
|
|
107
116
|
}
|
|
108
|
-
forward(t, e, s = !1,
|
|
117
|
+
forward(t, e, s = !1, o = !1, i) {
|
|
109
118
|
return this.validateInput(t), this.tf.tidy(() => {
|
|
110
|
-
|
|
119
|
+
const a = i?.[0]?.length ?? 0;
|
|
120
|
+
let n = this.inputPhase(t, a, s);
|
|
111
121
|
const h = [];
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
122
|
+
if (i && i.length !== this.blocks.length)
|
|
123
|
+
throw console.error("Cache", i), new Error(`Cache length ${i.length} does not match number of blocks ${this.blocks.length}`);
|
|
124
|
+
for (let l = 0; l < this.blocks.length; l++) {
|
|
125
|
+
const d = this.blocks[l], {
|
|
126
|
+
output: g,
|
|
127
|
+
attention: b,
|
|
128
|
+
cache: p
|
|
129
|
+
} = d.call(n, s, o, i ? i[l] : void 0);
|
|
130
|
+
n = g, o && b && h.push(b), i && p ? (i[l]?.k.dispose(), i[l]?.v.dispose(), i[l] = p) : p && (p.k.dispose(), p.v.dispose());
|
|
115
131
|
}
|
|
116
|
-
let
|
|
117
|
-
|
|
118
|
-
const
|
|
119
|
-
let
|
|
120
|
-
return e && (
|
|
132
|
+
let c;
|
|
133
|
+
o && h.length > 0 && (c = this.computeAttentionRollout(h)), n = this.lnF.apply(n);
|
|
134
|
+
const r = this.wte.project(n);
|
|
135
|
+
let f;
|
|
136
|
+
return e && (f = this.calculateLoss(r, e)), { logits: r, loss: f, attention: o ? c : void 0 };
|
|
121
137
|
});
|
|
122
138
|
}
|
|
123
|
-
generate(t, e) {
|
|
124
|
-
const
|
|
139
|
+
generate(t, e, s) {
|
|
140
|
+
const o = s?.temperature ?? 1, i = s?.topK, a = s?.usePadding ?? !1, n = s?.includeAttention ?? !1;
|
|
125
141
|
return this.tf.tidy(() => {
|
|
126
|
-
const
|
|
127
|
-
[0,
|
|
128
|
-
[
|
|
129
|
-
),
|
|
142
|
+
const h = t, c = h.shape[1], r = c <= this.config.blockSize ? h : h.slice(
|
|
143
|
+
[0, c - this.config.blockSize],
|
|
144
|
+
[h.shape[0], this.config.blockSize]
|
|
145
|
+
), f = a ? this.config.blockSize - r.shape[1] : 0, l = f > 0 ? this.tf.pad(r, [
|
|
130
146
|
[0, 0],
|
|
131
|
-
[0,
|
|
132
|
-
]) : r, { logits:
|
|
133
|
-
let
|
|
147
|
+
[0, f]
|
|
148
|
+
]) : r, { logits: d, attention: g } = this.forward(l, void 0, !1, n, e), b = d.shape[1] - 1 - f, p = d.slice([0, b, 0], [d.shape[0], 1, d.shape[2]]), w = g ? g.slice([0, b, 0], [g.shape[0], 1, g.shape[2]]) : void 0, u = p.div(o);
|
|
149
|
+
let m;
|
|
134
150
|
if (i) {
|
|
135
|
-
const { values:
|
|
136
|
-
|
|
151
|
+
const { values: E, indices: y } = this.tf.topk(u, i), z = this.tf.multinomial(E.squeeze([1]), 1);
|
|
152
|
+
m = this.tf.gather(y.squeeze([1]), z, 1);
|
|
137
153
|
} else
|
|
138
|
-
|
|
139
|
-
let
|
|
140
|
-
return
|
|
154
|
+
m = this.tf.multinomial(u.squeeze([1]), 1);
|
|
155
|
+
let k;
|
|
156
|
+
return s?.includeProbabilities && (k = this.tf.softmax(u.squeeze([1]))), m = m.reshape([1, 1]), { output: m, attention: w?.squeeze([1]), probabilities: k };
|
|
141
157
|
});
|
|
142
158
|
}
|
|
143
159
|
getNumParams() {
|
|
144
160
|
const t = this.config.vocabSize * this.config.nEmbed + this.config.blockSize * this.config.nEmbed, e = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // qkv + proj
|
|
145
161
|
2 * this.config.nEmbed), s = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
|
|
146
|
-
this.config.nEmbed * 4 * this.config.nEmbed),
|
|
147
|
-
return t + e + s +
|
|
162
|
+
this.config.nEmbed * 4 * this.config.nEmbed), o = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
|
|
163
|
+
return t + e + s + o;
|
|
148
164
|
}
|
|
149
165
|
dispose() {
|
|
150
|
-
this.wte.dispose(), this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
|
|
166
|
+
this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
|
|
151
167
|
}
|
|
152
168
|
}
|
|
153
169
|
export {
|
|
154
|
-
|
|
170
|
+
F as default
|
|
155
171
|
};
|
package/dist/TeachableLLM.js
CHANGED
|
@@ -47,25 +47,25 @@ class a extends c {
|
|
|
47
47
|
}
|
|
48
48
|
static loadModel(t, r) {
|
|
49
49
|
const e = new a(t);
|
|
50
|
-
return l(t, r).then(({ model:
|
|
51
|
-
e._model =
|
|
50
|
+
return l(t, r).then(({ model: s, tokeniser: o }) => {
|
|
51
|
+
e._model = s, e._tokeniser = o, e._config = s.config, e.setStatus("warmup"), h(s).then(() => {
|
|
52
52
|
e.setStatus("ready");
|
|
53
|
-
}).catch((
|
|
54
|
-
e.setStatus("error"), e.emit("error",
|
|
53
|
+
}).catch((i) => {
|
|
54
|
+
e.setStatus("error"), e.emit("error", i);
|
|
55
55
|
});
|
|
56
|
-
}).catch((
|
|
57
|
-
e.setStatus("error"), e.emit("error",
|
|
56
|
+
}).catch((s) => {
|
|
57
|
+
e.setStatus("error"), e.emit("error", s);
|
|
58
58
|
}), e;
|
|
59
59
|
}
|
|
60
60
|
static create(t, r = {}) {
|
|
61
|
-
const e = { ...u, ...r },
|
|
62
|
-
return
|
|
63
|
-
|
|
64
|
-
n === "trained" &&
|
|
65
|
-
});
|
|
61
|
+
const e = { ...u, ...r }, s = new g(e.vocabSize), o = new d(t, e), i = new a(t, s, o);
|
|
62
|
+
return i.setStatus("warmup"), h(o).then(() => {
|
|
63
|
+
i.tokeniser.trained ? i.setStatus("ready") : (i.setStatus("awaitingTokens"), i.tokeniser.once("trainStatus", (n) => {
|
|
64
|
+
n === "trained" && i.setStatus("ready");
|
|
65
|
+
}));
|
|
66
66
|
}).catch((n) => {
|
|
67
|
-
|
|
68
|
-
}),
|
|
67
|
+
i.setStatus("error"), i.emit("error", n);
|
|
68
|
+
}), i;
|
|
69
69
|
}
|
|
70
70
|
getNumParams() {
|
|
71
71
|
if (!this._model)
|
|
@@ -78,8 +78,8 @@ class a extends c {
|
|
|
78
78
|
const t = new _(this._model, this._tokeniser);
|
|
79
79
|
return t.on("start", () => this.setStatus("training")), t.on("stop", () => this.setStatus("ready")), t.on("log", async (r) => {
|
|
80
80
|
const e = this.listeners("trainStep");
|
|
81
|
-
for (const
|
|
82
|
-
await
|
|
81
|
+
for (const s of e)
|
|
82
|
+
await s(r);
|
|
83
83
|
}), t;
|
|
84
84
|
}
|
|
85
85
|
train(t, r) {
|
package/dist/Trainer.d.ts
CHANGED
|
@@ -12,7 +12,9 @@ export interface ITrainerOptions {
|
|
|
12
12
|
}
|
|
13
13
|
export default class Trainer extends EE<'start' | 'stop' | 'log'> {
|
|
14
14
|
private trainer;
|
|
15
|
+
private hasTrained;
|
|
15
16
|
constructor(model: NanoGPT, tokeniser: ITokeniser);
|
|
16
17
|
stop(): void;
|
|
18
|
+
reset(): void;
|
|
17
19
|
train(text: string[], options?: ITrainerOptions): Promise<void>;
|
|
18
20
|
}
|
package/dist/Trainer.js
CHANGED
|
@@ -1,11 +1,16 @@
|
|
|
1
1
|
import { E as l } from "./index-SOhdqzHq.js";
|
|
2
|
-
import
|
|
3
|
-
class
|
|
2
|
+
import h from "./training/FullTrainer.js";
|
|
3
|
+
class m extends l {
|
|
4
4
|
trainer;
|
|
5
|
+
hasTrained = !1;
|
|
5
6
|
constructor(a, t) {
|
|
6
|
-
super(), this.trainer = new
|
|
7
|
+
super(), this.trainer = new h(a.tf, a, t, 1e-3);
|
|
7
8
|
}
|
|
8
9
|
stop() {
|
|
10
|
+
this.trainer.stop();
|
|
11
|
+
}
|
|
12
|
+
reset() {
|
|
13
|
+
this.hasTrained = !1, this.trainer.reset();
|
|
9
14
|
}
|
|
10
15
|
async train(a, t) {
|
|
11
16
|
const { trainDataset: e, validationDataset: r } = await this.trainer.createTrainValidationSplit(
|
|
@@ -13,7 +18,7 @@ class d extends l {
|
|
|
13
18
|
t?.batchSize || 32,
|
|
14
19
|
t?.validationSplit || 0.1
|
|
15
20
|
);
|
|
16
|
-
this.trainer.setLearningRate(t?.learningRate || 1e-3), this.emit("start"), await this.trainer.trainOnDataset(
|
|
21
|
+
this.hasTrained || this.trainer.setLearningRate(t?.learningRate || 1e-3), this.hasTrained = !0, this.emit("start"), await this.trainer.trainOnDataset(
|
|
17
22
|
e,
|
|
18
23
|
{
|
|
19
24
|
prompt: t?.prompt,
|
|
@@ -31,5 +36,5 @@ class d extends l {
|
|
|
31
36
|
}
|
|
32
37
|
}
|
|
33
38
|
export {
|
|
34
|
-
|
|
39
|
+
m as default
|
|
35
40
|
};
|
package/dist/config.d.ts
CHANGED
package/dist/config.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
const
|
|
1
|
+
const e = {
|
|
2
2
|
vocabSize: 50304,
|
|
3
3
|
// GPT-2 vocab size
|
|
4
4
|
blockSize: 1024,
|
|
@@ -13,8 +13,10 @@ const a = {
|
|
|
13
13
|
// Dropout probability
|
|
14
14
|
biasInLinear: !1,
|
|
15
15
|
biasInLayerNorm: !1,
|
|
16
|
-
mlpFactor: 4
|
|
16
|
+
mlpFactor: 4,
|
|
17
|
+
useRope: !1
|
|
18
|
+
// Use Rotary Position Embeddings
|
|
17
19
|
};
|
|
18
20
|
export {
|
|
19
|
-
|
|
21
|
+
e as defaultConfig
|
|
20
22
|
};
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
|
+
import { default as RoPECache } from './RoPECache';
|
|
4
|
+
export type KVCache = {
|
|
5
|
+
k: TF.Tensor;
|
|
6
|
+
v: TF.Tensor;
|
|
7
|
+
length: number;
|
|
8
|
+
cumulativeLength: number;
|
|
9
|
+
};
|
|
3
10
|
export default class CausalSelfAttention {
|
|
11
|
+
private readonly ropeCache?;
|
|
4
12
|
private config;
|
|
5
13
|
private cAttn;
|
|
6
14
|
private cProj;
|
|
@@ -12,18 +20,20 @@ export default class CausalSelfAttention {
|
|
|
12
20
|
private divisor;
|
|
13
21
|
private index;
|
|
14
22
|
private _trainable;
|
|
15
|
-
constructor(tf: typeof TF, index: number, config: GPTConfig);
|
|
23
|
+
constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache | undefined);
|
|
16
24
|
get variables(): TF.Variable[];
|
|
17
25
|
get trainable(): boolean;
|
|
18
26
|
set trainable(value: boolean);
|
|
19
27
|
saveWeights(map: Map<string, TF.Tensor[]>): void;
|
|
20
28
|
loadWeights(weights: Map<string, TF.Tensor[]>): void;
|
|
21
29
|
private getAttentionScores;
|
|
30
|
+
private getAttentionScoresWithPast;
|
|
22
31
|
private getQKV;
|
|
23
32
|
private getOutputProjection;
|
|
24
|
-
call(x: TF.Tensor, training?: boolean, includeAttention?: boolean): {
|
|
33
|
+
call(x: TF.Tensor, training?: boolean, includeAttention?: boolean, pastKV?: KVCache): {
|
|
25
34
|
output: TF.Tensor;
|
|
26
35
|
attention?: TF.Tensor;
|
|
36
|
+
presentKV?: KVCache;
|
|
27
37
|
};
|
|
28
38
|
dispose(): void;
|
|
29
39
|
}
|
|
@@ -1,20 +1,9 @@
|
|
|
1
|
-
class
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
cProj;
|
|
5
|
-
attnDropout;
|
|
6
|
-
residDropout;
|
|
7
|
-
bias;
|
|
8
|
-
maskInf;
|
|
9
|
-
tf;
|
|
10
|
-
divisor;
|
|
11
|
-
index;
|
|
12
|
-
_trainable = !0;
|
|
13
|
-
constructor(t, e, s) {
|
|
14
|
-
this.config = s, this.tf = t, this.index = e, this.cAttn = this.tf.layers.dense({
|
|
1
|
+
class S {
|
|
2
|
+
constructor(t, i, s, e) {
|
|
3
|
+
this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
|
|
15
4
|
units: 3 * s.nEmbed,
|
|
16
5
|
useBias: s.biasInLinear,
|
|
17
|
-
name: `block_${
|
|
6
|
+
name: `block_${i}_attn_cAttn`,
|
|
18
7
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
19
8
|
mean: 0,
|
|
20
9
|
stddev: 0.02
|
|
@@ -23,14 +12,27 @@ class m {
|
|
|
23
12
|
}), this.cProj = this.tf.layers.dense({
|
|
24
13
|
units: s.nEmbed,
|
|
25
14
|
useBias: s.biasInLinear,
|
|
26
|
-
name: `block_${
|
|
15
|
+
name: `block_${i}_attn_cProj`,
|
|
27
16
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
28
17
|
mean: 0,
|
|
29
18
|
stddev: 0.02 / Math.sqrt(2 * s.nLayer)
|
|
30
19
|
}),
|
|
31
20
|
biasInitializer: "zeros"
|
|
32
|
-
}), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = this.tf.scalar(1 / Math.sqrt(s.nEmbed / s.nHead))
|
|
21
|
+
}), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = this.tf.scalar(1 / Math.sqrt(s.nEmbed / s.nHead));
|
|
22
|
+
const a = this.tf.zeros([s.blockSize, s.blockSize]), h = this.tf.fill([s.blockSize, s.blockSize], Number.NEGATIVE_INFINITY);
|
|
23
|
+
this.maskInf = this.tf.where(this.bias, a, h);
|
|
33
24
|
}
|
|
25
|
+
config;
|
|
26
|
+
cAttn;
|
|
27
|
+
cProj;
|
|
28
|
+
attnDropout;
|
|
29
|
+
residDropout;
|
|
30
|
+
bias;
|
|
31
|
+
maskInf;
|
|
32
|
+
tf;
|
|
33
|
+
divisor;
|
|
34
|
+
index;
|
|
35
|
+
_trainable = !0;
|
|
34
36
|
get variables() {
|
|
35
37
|
return [
|
|
36
38
|
...this.cAttn.trainableWeights.map((t) => t.read()),
|
|
@@ -49,34 +51,65 @@ class m {
|
|
|
49
51
|
loadWeights(t) {
|
|
50
52
|
this.cAttn.setWeights(t.get(`block_${this.index}_cAttn`) || []), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
|
|
51
53
|
}
|
|
52
|
-
getAttentionScores(t,
|
|
53
|
-
const
|
|
54
|
-
return this.attnDropout.apply(
|
|
54
|
+
getAttentionScores(t, i, s) {
|
|
55
|
+
const e = t.shape[2], h = this.tf.matMul(t, i, !1, !0).mul(this.divisor), n = this.maskInf.slice([0, 0], [e, e]).expandDims(0).expandDims(0), r = h.add(n), o = this.tf.softmax(r, -1);
|
|
56
|
+
return this.attnDropout.apply(o, { training: s });
|
|
57
|
+
}
|
|
58
|
+
// Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
|
|
59
|
+
getAttentionScoresWithPast(t, i, s, e) {
|
|
60
|
+
const a = t.shape[2];
|
|
61
|
+
let n = this.tf.matMul(t, i, !1, !0).mul(this.divisor);
|
|
62
|
+
if (a > 1 && e > 0)
|
|
63
|
+
throw new Error("Cannot use past with T_cur > 1");
|
|
64
|
+
if (a > 1) {
|
|
65
|
+
const o = this.maskInf.slice([0, 0], [a, a]).expandDims(0).expandDims(0);
|
|
66
|
+
n = n.add(o);
|
|
67
|
+
}
|
|
68
|
+
const r = this.tf.softmax(n, -1);
|
|
69
|
+
return this.attnDropout.apply(r, { training: s });
|
|
55
70
|
}
|
|
56
71
|
getQKV(t) {
|
|
57
|
-
const [
|
|
58
|
-
|
|
59
|
-
const
|
|
60
|
-
|
|
61
|
-
const
|
|
62
|
-
|
|
63
|
-
const d = this.tf.reshape(
|
|
64
|
-
i.dispose();
|
|
65
|
-
const u = d.transpose([0, 2, 1, 3]);
|
|
66
|
-
d.dispose();
|
|
67
|
-
const p = this.tf.reshape(n, [e, s, this.config.nHead, h]);
|
|
72
|
+
const [i, s, e] = t.shape, a = this.cAttn.apply(t), [h, n, r] = this.tf.split(a, 3, -1);
|
|
73
|
+
a.dispose();
|
|
74
|
+
const o = e / this.config.nHead, u = this.tf.reshape(h, [i, s, this.config.nHead, o]);
|
|
75
|
+
h.dispose();
|
|
76
|
+
const f = u.transpose([0, 2, 1, 3]);
|
|
77
|
+
u.dispose();
|
|
78
|
+
const d = this.tf.reshape(n, [i, s, this.config.nHead, o]);
|
|
68
79
|
n.dispose();
|
|
69
|
-
const
|
|
70
|
-
|
|
80
|
+
const c = d.transpose([0, 2, 1, 3]);
|
|
81
|
+
d.dispose();
|
|
82
|
+
const l = this.tf.reshape(r, [i, s, this.config.nHead, o]);
|
|
83
|
+
r.dispose();
|
|
84
|
+
const p = l.transpose([0, 2, 1, 3]);
|
|
85
|
+
return l.dispose(), [f, c, p];
|
|
71
86
|
}
|
|
72
|
-
getOutputProjection(t,
|
|
73
|
-
const s = t.shape[0],
|
|
74
|
-
return this.residDropout.apply(
|
|
87
|
+
getOutputProjection(t, i) {
|
|
88
|
+
const s = t.shape[0], e = t.shape[2], a = this.config.nEmbed, h = t.transpose([0, 2, 1, 3]), n = this.tf.reshape(h, [s, e, a]), r = this.cProj.apply(n);
|
|
89
|
+
return this.residDropout.apply(r, { training: i });
|
|
75
90
|
}
|
|
76
|
-
|
|
91
|
+
// Added optional KV cache support (pastKV). Returns presentKV for chaining.
|
|
92
|
+
call(t, i = !1, s = !1, e) {
|
|
93
|
+
if (e && !this.config.useRope)
|
|
94
|
+
throw new Error("Cannot use pastKV without RoPE enabled");
|
|
77
95
|
return this.tf.tidy(() => {
|
|
78
|
-
const [a,
|
|
79
|
-
|
|
96
|
+
const [a, h, n] = this.getQKV(t), r = a.shape[2], o = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(a, h, u) : [a, h];
|
|
97
|
+
let c = d, l = n, p = 0;
|
|
98
|
+
e && (p = e.length, c = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, n], 2));
|
|
99
|
+
const b = c.shape[2];
|
|
100
|
+
if (b > o) {
|
|
101
|
+
const k = b - o, g = c.shape[0], v = c.shape[1], I = c.shape[3];
|
|
102
|
+
c = c.slice([0, 0, k, 0], [g, v, o, I]), l = l.slice([0, 0, k, 0], [g, v, o, I]), p = o - r;
|
|
103
|
+
}
|
|
104
|
+
let m;
|
|
105
|
+
p > 0 ? m = this.getAttentionScoresWithPast(f, c, i, p) : m = this.getAttentionScores(f, c, i);
|
|
106
|
+
const _ = this.tf.matMul(m, l), A = this.getOutputProjection(_, i), P = {
|
|
107
|
+
k: this.tf.keep(c),
|
|
108
|
+
v: this.tf.keep(l),
|
|
109
|
+
length: p + r,
|
|
110
|
+
cumulativeLength: e ? e.cumulativeLength + r : r
|
|
111
|
+
};
|
|
112
|
+
return { output: A, attention: s ? m.mean(1) : void 0, presentKV: P };
|
|
80
113
|
});
|
|
81
114
|
}
|
|
82
115
|
dispose() {
|
|
@@ -84,5 +117,5 @@ class m {
|
|
|
84
117
|
}
|
|
85
118
|
}
|
|
86
119
|
export {
|
|
87
|
-
|
|
120
|
+
S as default
|
|
88
121
|
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { default as TF } from '@tensorflow/tfjs';
|
|
2
|
+
export default class RMSNorm {
|
|
3
|
+
private gamma;
|
|
4
|
+
private epsilon;
|
|
5
|
+
private tf;
|
|
6
|
+
constructor(tf: typeof TF, shape: number[], epsilon?: number, name?: string);
|
|
7
|
+
get trainableWeights(): TF.Variable[];
|
|
8
|
+
set trainable(value: boolean);
|
|
9
|
+
getWeights(): TF.Tensor[];
|
|
10
|
+
setWeights(weights: TF.Tensor[]): void;
|
|
11
|
+
apply(x: TF.Tensor): TF.Tensor;
|
|
12
|
+
dispose(): void;
|
|
13
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
class m {
|
|
2
|
+
gamma;
|
|
3
|
+
epsilon;
|
|
4
|
+
tf;
|
|
5
|
+
constructor(a, s, t = 1e-8, e = "") {
|
|
6
|
+
this.tf = a, this.epsilon = t, this.gamma = a.variable(a.ones(s), !0, `${e}_gamma`, "float32");
|
|
7
|
+
}
|
|
8
|
+
get trainableWeights() {
|
|
9
|
+
return [this.gamma];
|
|
10
|
+
}
|
|
11
|
+
set trainable(a) {
|
|
12
|
+
this.gamma.trainable = a;
|
|
13
|
+
}
|
|
14
|
+
getWeights() {
|
|
15
|
+
return [this.gamma];
|
|
16
|
+
}
|
|
17
|
+
setWeights(a) {
|
|
18
|
+
this.gamma.assign(a[0]);
|
|
19
|
+
}
|
|
20
|
+
apply(a) {
|
|
21
|
+
return this.tf.tidy(() => {
|
|
22
|
+
const t = a.square().mean(-1, !0).add(this.epsilon).rsqrt();
|
|
23
|
+
return a.mul(t).mul(this.gamma);
|
|
24
|
+
});
|
|
25
|
+
}
|
|
26
|
+
dispose() {
|
|
27
|
+
this.gamma.dispose();
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
export {
|
|
31
|
+
m as default
|
|
32
|
+
};
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { default as TF } from '@tensorflow/tfjs';
|
|
2
|
+
import { GPTConfig } from '../config';
|
|
3
|
+
export default class RoPECache {
|
|
4
|
+
private readonly tf;
|
|
5
|
+
private readonly config;
|
|
6
|
+
private rotaryDim;
|
|
7
|
+
private ropeBase;
|
|
8
|
+
private ropeInvFreq;
|
|
9
|
+
private ropeCos;
|
|
10
|
+
private ropeSin;
|
|
11
|
+
private ropeCacheLen;
|
|
12
|
+
constructor(tf: typeof TF, config: GPTConfig);
|
|
13
|
+
private ensureRopeCache;
|
|
14
|
+
applyRoPE(q: TF.Tensor, k: TF.Tensor, pastLen: number): [TF.Tensor, TF.Tensor];
|
|
15
|
+
dispose(): void;
|
|
16
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
class RoPECache {
  // Precomputed inverse frequencies plus grow-only cos/sin lookup tables
  // for rotary position embeddings (RoPE), shared by all attention layers.
  rotaryDim;
  ropeBase;
  ropeInvFreq;
  ropeCos = null;
  // [cacheLen, rotaryDim/2, 1]
  ropeSin = null;
  // [cacheLen, rotaryDim/2, 1]
  ropeCacheLen = 0;
  constructor(t, c) {
    this.tf = t;
    this.config = c;
    // One rotation pair covers two channels of a head, so head size must be even.
    const headDim = this.config.nEmbed / this.config.nHead;
    this.rotaryDim = headDim;
    if (this.rotaryDim % 2 !== 0)
      throw new Error("rotaryDim must be even");
    this.ropeBase = 1e4;
    // invFreq[j] = base^(-2j/rotaryDim): the standard RoPE frequency spectrum.
    const exponents = this.tf
      .range(0, this.rotaryDim, 2, "float32")
      .div(this.tf.scalar(this.rotaryDim, "float32"));
    const wavelengths = this.tf.pow(this.tf.scalar(this.ropeBase, "float32"), exponents);
    this.ropeInvFreq = this.tf.reciprocal(wavelengths);
    if (this.config.useRope === false) {
      this.ropeCos = null;
      this.ropeSin = null;
      this.ropeCacheLen = 0;
    } else {
      // Pre-warm well past blockSize so generation rarely has to regrow the table.
      this.ensureRopeCache(this.config.blockSize * 4);
    }
  }
  ensureRopeCache(t) {
    // Grow-only: rebuild the cos/sin tables when t exceeds the cached length.
    if (t <= this.ropeCacheLen) return;
    if (this.ropeCos) this.ropeCos.dispose();
    if (this.ropeSin) this.ropeSin.dispose();
    // angles[pos][j] = pos * invFreq[j]
    const angles = this.tf
      .range(0, t, 1, "float32")
      .expandDims(1)
      .mul(this.ropeInvFreq.expandDims(0));
    // keep() so the tables survive any enclosing tidy() scope.
    this.ropeCos = this.tf.keep(this.tf.cos(angles).expandDims(-1));
    this.ropeSin = this.tf.keep(this.tf.sin(angles).expandDims(-1));
    this.ropeCacheLen = t;
  }
  applyRoPE(t, c, e) {
    // Rotate query t and key c (assumed [batch, heads, seq, headDim] — shape
    // is read from t.shape) by positions starting at offset e (past tokens).
    const headDim = t.shape[3];
    const rot = this.rotaryDim;
    if (rot > headDim) return [t, c];
    const seqLen = t.shape[2];
    this.ensureRopeCache(e + seqLen);
    const half = rot / 2;
    // Cos/sin rows for positions [e, e + seqLen), broadcastable over batch/heads.
    const cos = this.ropeCos.slice([e, 0, 0], [seqLen, half, 1]).reshape([1, 1, seqLen, half, 1]);
    const sin = this.ropeSin.slice([e, 0, 0], [seqLen, half, 1]).reshape([1, 1, seqLen, half, 1]);
    // Rotate q and k in a single pass by stacking them on the batch axis.
    const stacked = this.tf.concat([t, c], 0);
    const batch = stacked.shape[0];
    const heads = stacked.shape[1];
    const rotPart = stacked.slice([0, 0, 0, 0], [batch, heads, seqLen, rot]);
    const passPart = rot < headDim
      ? stacked.slice([0, 0, 0, rot], [batch, heads, seqLen, headDim - rot])
      : null;
    // View channel pairs as (real, imag) and apply the complex rotation.
    const pairs = stacked.slice([0, 0, 0, 0], [batch, heads, seqLen, rot]).reshape([batch, heads, seqLen, half, 2]);
    const real = pairs.slice([0, 0, 0, 0, 0], [batch, heads, seqLen, half, 1]);
    const imag = pairs.slice([0, 0, 0, 0, 1], [batch, heads, seqLen, half, 1]);
    const rotReal = real.mul(cos).sub(imag.mul(sin));
    const rotImag = imag.mul(cos).add(real.mul(sin));
    const rotated = this.tf.concat([rotReal, rotImag], -1).reshape([batch, heads, seqLen, rot]);
    const merged = passPart ? this.tf.concat([rotated, passPart], 3) : rotated;
    // Split the stacked result back into the rotated q and k halves.
    const perInput = batch / 2;
    const qOut = merged.slice([0, 0, 0, 0], [perInput, heads, seqLen, headDim]);
    const kOut = merged.slice([perInput, 0, 0, 0], [perInput, heads, seqLen, headDim]);
    return [qOut, kOut];
  }
  dispose() {
    if (this.ropeCos) this.ropeCos.dispose();
    if (this.ropeSin) this.ropeSin.dispose();
    this.ropeInvFreq.dispose();
  }
}
export {
  RoPECache as default
};
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
|
+
import { KVCache } from './CausalSelfAttention';
|
|
4
|
+
import { default as RoPECache } from './RoPECache';
|
|
3
5
|
export default class Block {
|
|
4
6
|
private ln1;
|
|
5
7
|
private attn;
|
|
@@ -9,16 +11,17 @@ export default class Block {
|
|
|
9
11
|
private index;
|
|
10
12
|
private _trainable;
|
|
11
13
|
skipped: boolean;
|
|
12
|
-
constructor(tf: typeof TF, index: number, config: GPTConfig);
|
|
14
|
+
constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
|
|
13
15
|
get variables(): TF.Variable[];
|
|
14
16
|
get trainable(): boolean;
|
|
15
17
|
set trainable(value: boolean);
|
|
16
18
|
saveWeights(map: Map<string, TF.Tensor[]>): void;
|
|
17
19
|
loadWeights(weights: Map<string, TF.Tensor[]>): void;
|
|
18
20
|
private getMLPOutput;
|
|
19
|
-
call(x: TF.Tensor, training?: boolean, includeAttention?: boolean): {
|
|
21
|
+
call(x: TF.Tensor, training?: boolean, includeAttention?: boolean, cache?: KVCache): {
|
|
20
22
|
output: TF.Tensor;
|
|
21
23
|
attention?: TF.Tensor;
|
|
24
|
+
cache?: KVCache;
|
|
22
25
|
};
|
|
23
26
|
dispose(): void;
|
|
24
27
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
import
|
|
1
|
+
import r from "./CausalSelfAttention.js";
|
|
2
|
+
import o from "./MLP.js";
|
|
3
|
+
import a from "./RMSNorm.js";
|
|
4
4
|
class u {
|
|
5
5
|
ln1;
|
|
6
6
|
attn;
|
|
@@ -10,8 +10,8 @@ class u {
|
|
|
10
10
|
index;
|
|
11
11
|
_trainable = !0;
|
|
12
12
|
skipped = !1;
|
|
13
|
-
constructor(t, i, s) {
|
|
14
|
-
this.tf = t, this.index = i, this.ln1 = new
|
|
13
|
+
constructor(t, i, s, e) {
|
|
14
|
+
this.tf = t, this.index = i, this.ln1 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new r(this.tf, this.index, s, e), this.ln2 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
|
|
15
15
|
}
|
|
16
16
|
get variables() {
|
|
17
17
|
return [
|
|
@@ -28,21 +28,25 @@ class u {
|
|
|
28
28
|
this._trainable = t, this.ln1.trainable = t, this.ln2.trainable = t, this.attn.trainable = t, this.mlp.trainable = t;
|
|
29
29
|
}
|
|
30
30
|
saveWeights(t) {
|
|
31
|
-
this.attn.saveWeights(t), this.mlp.saveWeights(t), t.set(`block_${this.index}
|
|
31
|
+
this.attn.saveWeights(t), this.mlp.saveWeights(t), t.set(`block_${this.index}_rms1`, this.ln1.getWeights()), t.set(`block_${this.index}_rms2`, this.ln2.getWeights());
|
|
32
32
|
}
|
|
33
33
|
loadWeights(t) {
|
|
34
|
-
this.attn.loadWeights(t), this.mlp.loadWeights(t), this.ln1.setWeights(t.get(`block_${this.index}
|
|
34
|
+
this.attn.loadWeights(t), this.mlp.loadWeights(t), this.ln1.setWeights(t.get(`block_${this.index}_rms1`) || []), this.ln2.setWeights(t.get(`block_${this.index}_rms2`) || []);
|
|
35
35
|
}
|
|
36
36
|
getMLPOutput(t, i) {
|
|
37
37
|
const s = this.ln2.apply(t), e = this.mlp.call(s, i);
|
|
38
38
|
return t.add(e);
|
|
39
39
|
}
|
|
40
|
-
call(t, i = !1, s = !1) {
|
|
40
|
+
call(t, i = !1, s = !1, e) {
|
|
41
41
|
return this.tf.tidy(() => {
|
|
42
42
|
if (this.skipped)
|
|
43
43
|
return { output: t };
|
|
44
|
-
const
|
|
45
|
-
return {
|
|
44
|
+
const l = this.ln1.apply(t), n = this.attn.call(l, i, s, e), h = t.add(n.output);
|
|
45
|
+
return {
|
|
46
|
+
output: this.getMLPOutput(h, i),
|
|
47
|
+
attention: n.attention,
|
|
48
|
+
cache: n.presentKV
|
|
49
|
+
};
|
|
46
50
|
});
|
|
47
51
|
}
|
|
48
52
|
dispose() {
|
|
@@ -1,70 +1,68 @@
|
|
|
1
1
|
import { generateText as L } from "../utilities/generate.js";
|
|
2
2
|
import w from "./Trainer.js";
|
|
3
|
-
import
|
|
4
|
-
const
|
|
3
|
+
import x from "./Evaluator.js";
|
|
4
|
+
// Defaults merged under caller-supplied training options.
const TRAIN_DEFAULTS = {
  desiredLoss: 0.01,
  logInterval: 1,
  maxSteps: 1000
};
class FullTrainer extends w {
  constructor(r, i, o, n = 3e-4) {
    super(r, i, o, n);
  }
  // Train for multiple epochs using Dataset API - FIXED memory leaks
  async trainOnDataset(r, i, o) {
    // r: dataset, i: options, o: optional validation data for an Evaluator.
    const { desiredLoss, logInterval, onStep, prompt, maxSteps } = {
      ...TRAIN_DEFAULTS,
      ...i
    };
    // Resume from this.lastState when present, otherwise start fresh.
    const state = {
      step: 0,
      lastLoss: 1e6,
      totalSteps: 0,
      losses: [],
      validationLosses: [],
      ...this.lastState || {}
    };
    this.lastState = state;
    this.dummyPass();
    this.model.trainable = true;
    const startTime = Date.now();
    this.running = true;
    const evaluator = o ? new x(this.model, o) : void 0;
    const iterator = await r.iterator();
    try {
      while (this.running && !(state.lastLoss < desiredLoss)) {
        const next = await iterator.next();
        if (next.done) break;
        const batch = next.value;
        // Kick off the step; only awaited on logging iterations below.
        const pending = this.trainBatch(state, batch);
        const logEntry = {
          loss: state.lastLoss,
          step: state.step,
          time: Date.now() - startTime,
          batchSize: batch.xs.shape[0]
        };
        this.model.log.push(logEntry);
        if (state.step % logInterval === 0) {
          await pending;
          if (evaluator)
            try {
              const valLoss = await evaluator.evaluate(5);
              state.validationLosses.push(valLoss);
              logEntry.valLoss = valLoss;
            } catch (err) {
              console.error("Validation error:", err);
            }
          if (onStep) {
            if (prompt) {
              // Sample a short continuation so progress is visible to the caller.
              const example = await L(this.tokenizer, this.model, prompt, 100, {
                temperature: 0.8
              });
              logEntry.example = example;
            }
            await onStep(logEntry);
          }
        }
        if (state.step >= maxSteps) this.stop();
      }
    } catch (err) {
      console.error("Training error:", err);
      this.tf.dispose();
      throw err;
    }
    this.tf.dispose();
    this.running = false;
    return { losses: state.losses, validationLosses: state.validationLosses };
  }
}
export {
  FullTrainer as default
};
|
|
@@ -31,8 +31,10 @@ export default abstract class GPTTrainer {
|
|
|
31
31
|
protected tf: typeof TF;
|
|
32
32
|
protected learningRate: number;
|
|
33
33
|
protected running: boolean;
|
|
34
|
+
protected lastState?: TrainingState;
|
|
34
35
|
constructor(tf: typeof TF, model: NanoGPT, tokenizer: ITokeniser, learningRate?: number);
|
|
35
36
|
setLearningRate(learningRate: number): void;
|
|
37
|
+
reset(): void;
|
|
36
38
|
stop(): void;
|
|
37
39
|
getOptimizer(): AdamExt;
|
|
38
40
|
resetOptimizer(config?: AdamConfig): void;
|
package/dist/training/Trainer.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { DatasetBuilder as d } from "./DatasetBuilder.js";
|
|
2
|
-
import
|
|
2
|
+
import h from "./AdamExt.js";
|
|
3
3
|
class u {
|
|
4
|
-
constructor(t,
|
|
5
|
-
this.tokenizer =
|
|
4
|
+
constructor(t, e, s, i = 1e-3) {
|
|
5
|
+
this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, s, e.config.blockSize);
|
|
6
6
|
}
|
|
7
7
|
model;
|
|
8
8
|
optimizer;
|
|
@@ -10,9 +10,13 @@ class u {
|
|
|
10
10
|
tf;
|
|
11
11
|
learningRate;
|
|
12
12
|
running = !1;
|
|
13
|
+
lastState;
|
|
13
14
|
setLearningRate(t) {
|
|
14
15
|
this.learningRate = t, this.resetOptimizer({ learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 });
|
|
15
16
|
}
|
|
17
|
+
reset() {
|
|
18
|
+
this.lastState = void 0, this.running = !1;
|
|
19
|
+
}
|
|
16
20
|
stop() {
|
|
17
21
|
this.running = !1;
|
|
18
22
|
}
|
|
@@ -21,7 +25,7 @@ class u {
|
|
|
21
25
|
}
|
|
22
26
|
resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
|
|
23
27
|
this.optimizer && this.optimizer.dispose();
|
|
24
|
-
const
|
|
28
|
+
const e = new h(
|
|
25
29
|
t.learningRateFactor * this.learningRate,
|
|
26
30
|
t.beta1,
|
|
27
31
|
t.beta2,
|
|
@@ -33,53 +37,53 @@ class u {
|
|
|
33
37
|
weightDecay: 0
|
|
34
38
|
}
|
|
35
39
|
);
|
|
36
|
-
this.optimizer =
|
|
40
|
+
this.optimizer = e;
|
|
37
41
|
}
|
|
38
42
|
printGradients(t) {
|
|
39
|
-
Object.keys(t).forEach((
|
|
40
|
-
const
|
|
41
|
-
console.log(`${
|
|
43
|
+
Object.keys(t).forEach((e) => {
|
|
44
|
+
const s = t[e];
|
|
45
|
+
console.log(`${e}:`), console.log(` Shape: ${s.shape}`), console.log(` Mean: ${this.tf.mean(s).dataSync()[0]}`), console.log(` Std: ${this.tf.moments(s).variance.sqrt().dataSync()[0]}`), console.log(` Min: ${this.tf.min(s).dataSync()[0]}`), console.log(` Max: ${this.tf.max(s).dataSync()[0]}`), console.log(` Norm: ${this.tf.norm(s).dataSync()[0]}`);
|
|
42
46
|
});
|
|
43
47
|
}
|
|
44
|
-
trainStep(t,
|
|
48
|
+
trainStep(t, e = !1, s = !1) {
|
|
45
49
|
return this.tf.tidy(() => {
|
|
46
50
|
const { xs: i, ys: a } = t, o = () => {
|
|
47
51
|
const { loss: l, logits: c } = this.model.forward(i, a, !0);
|
|
48
52
|
return c.dispose(), l;
|
|
49
53
|
}, { value: n, grads: r } = this.tf.variableGrads(o);
|
|
50
|
-
return
|
|
54
|
+
return e || (s && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.tf.dispose(r)), n;
|
|
51
55
|
});
|
|
52
56
|
}
|
|
53
57
|
dummyPass() {
|
|
54
|
-
const t = this.tf.zeros([1, this.model.config.blockSize], "int32"),
|
|
58
|
+
const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), e = this.tf.zeros([1, this.model.config.blockSize, this.model.config.vocabSize]);
|
|
55
59
|
try {
|
|
56
|
-
const
|
|
57
|
-
|
|
58
|
-
} catch (
|
|
59
|
-
console.error("Error during dummy pass:",
|
|
60
|
+
const s = this.trainStep({ xs: t, ys: e }, !0);
|
|
61
|
+
s.dataSync(), s.dispose();
|
|
62
|
+
} catch (s) {
|
|
63
|
+
console.error("Error during dummy pass:", s);
|
|
60
64
|
} finally {
|
|
61
|
-
t.dispose(),
|
|
65
|
+
t.dispose(), e.dispose();
|
|
62
66
|
}
|
|
63
67
|
}
|
|
64
|
-
async trainBatch(t,
|
|
68
|
+
async trainBatch(t, e) {
|
|
65
69
|
try {
|
|
66
|
-
const
|
|
67
|
-
return
|
|
68
|
-
} catch (
|
|
69
|
-
throw console.error(`Error processing batch at step ${t.step}:`,
|
|
70
|
+
const s = this.trainStep(e, !1, !1);
|
|
71
|
+
return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
|
|
72
|
+
} catch (s) {
|
|
73
|
+
throw console.error(`Error processing batch at step ${t.step}:`, s), this.tf.dispose(), s;
|
|
70
74
|
}
|
|
71
75
|
}
|
|
72
|
-
async createTrainValidationSplit(t,
|
|
73
|
-
const i = await this.datasetBuilder.createTextDataset(t,
|
|
76
|
+
async createTrainValidationSplit(t, e = 32, s = 0.1) {
|
|
77
|
+
const i = await this.datasetBuilder.createTextDataset(t, e, 0, 1 - s), a = await this.datasetBuilder.createTextDataset(
|
|
74
78
|
t,
|
|
75
|
-
|
|
76
|
-
1 -
|
|
79
|
+
e,
|
|
80
|
+
1 - s,
|
|
77
81
|
1
|
|
78
82
|
);
|
|
79
83
|
return { trainDataset: i, validationDataset: a };
|
|
80
84
|
}
|
|
81
|
-
async createDataset(t,
|
|
82
|
-
return await this.datasetBuilder.createTextDataset(t,
|
|
85
|
+
async createDataset(t, e = 32) {
|
|
86
|
+
return await this.datasetBuilder.createTextDataset(t, e);
|
|
83
87
|
}
|
|
84
88
|
dispose() {
|
|
85
89
|
this.optimizer && this.optimizer.dispose();
|
|
@@ -1,20 +1,20 @@
|
|
|
1
|
-
async function
|
|
2
|
-
if (
|
|
1
|
+
async function generateText(r, t, a, c, g) {
  // Autoregressively sample c tokens from model t continuing prompt a,
  // using tokeniser r and generate options g. When the model uses RoPE,
  // per-layer KV caches let each step feed only the newest token back in.
  if (c <= 0)
    throw new Error("Length must be a positive integer");
  if (a.length === 0)
    throw new Error("Prompt cannot be an empty string");
  const tokens = await r.tokenise([a], true);
  const kvCaches = t.config.useRope ? new Array(t.config.nLayer).fill(void 0) : void 0;
  const generated = t.tf.tidy(() => {
    let input = t.tf.tensor2d(tokens, [1, tokens[0].length], "int32");
    let full = input;
    for (let step = 0; step < c; step++) {
      const { output } = t.generate(input, kvCaches, g);
      const prevInput = input;
      const prevFull = full;
      full = t.tf.concat([full, output], 1);
      // With a KV cache only the new token is fed back; otherwise regrow the context.
      input = kvCaches ? output : t.tf.concat([input, output], 1);
      prevInput.dispose();
      prevFull.dispose();
      if (!kvCaches) output.dispose();
    }
    return full;
  });
  const arr = await generated.array();
  generated.dispose();
  const ids = arr[0];
  // Truncate at the first end-of-sequence token, if any.
  const eosIndex = ids.indexOf(r.eosToken);
  if (eosIndex !== -1) ids.splice(eosIndex);
  return await r.decode(ids);
}
export {
  generateText
};
|