npm - @genai-fi/nanogpt - Versions diffs - 0.2.9 → 0.2.10 - Mend

@genai-fi/nanogpt 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/dist/Generator.d.ts +2 -0
package/dist/Generator.js +37 -32
package/dist/NanoGPTModel.d.ts +4 -1
package/dist/NanoGPTModel.js +33 -25
package/dist/TeachableLLM.d.ts +4 -0
package/dist/TeachableLLM.js +31 -16
package/dist/{complex-Cd8sqiBC.js → complex-x7w5HPOS.js} +6 -6
package/dist/{index-Dsg28SG6.js → index-CWQLouWz.js} +39 -35
package/dist/layers/BaseLayer.d.ts +8 -0
package/dist/layers/BaseLayer.js +18 -0
package/dist/layers/CausalSelfAttention.d.ts +2 -1
package/dist/layers/CausalSelfAttention.js +10 -8
package/dist/layers/MLP.d.ts +2 -1
package/dist/layers/MLP.js +16 -14
package/dist/layers/RMSNorm.d.ts +2 -1
package/dist/layers/RMSNorm.js +13 -11
package/dist/layers/TiedEmbedding.js +21 -21
package/dist/layers/TransformerBlock.d.ts +4 -1
package/dist/layers/TransformerBlock.js +9 -5
package/dist/{mat_mul-BAYDrXvE.js → mat_mul-4v7St11W.js} +5 -5
package/dist/ops/attentionMask.js +31 -25
package/dist/ops/gatherSub.js +2 -2
package/dist/ops/node/sparseCrossEntropy.js +1 -1
package/dist/ops/scatterSub.js +8 -8
package/dist/{stack-1o648CP_.js → stack-CTdK-itU.js} +7 -7
package/dist/{sum-NWazHI7f.js → sum-CnIf1YOh.js} +3 -3
package/dist/training/AdamExt.js +1 -1
package/dist/training/Trainer.js +30 -29
package/dist/training/sparseCrossEntropy.js +9 -9
package/dist/utilities/profile.d.ts +10 -0
package/dist/utilities/profile.js +29 -0
package/package.json +1 -1

package/dist/Generator.d.ts CHANGED Viewed

@@ -8,10 +8,12 @@ export interface IGenerateOptions extends GenerateOptions {
 export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
     private readonly model;
     private readonly tokeniser;
+    private active;
     constructor(model: NanoGPT, tokeniser: ITokeniser);
     private tokenisePrompt;
     private generateNoCache;
     private processResponse;
     private generateCache;
     generate(prompt?: string, options?: IGenerateOptions): Promise<string>;
+    stop(): void;
 }

package/dist/Generator.js CHANGED Viewed

@@ -1,65 +1,70 @@
 import { E as u } from "./index-Dwqa6Zy2.js";
-class p extends u {
+class f extends u {
   constructor(s, e) {
     super(), this.model = s, this.tokeniser = e;
   }
+  active = !1;
   async tokenisePrompt(s) {
     const e = s ? await this.tokeniser.tokenise([s], !0) : [[this.tokeniser.eosToken]];
     return this.model.tf.tensor2d(e, [1, e[0].length], "int32");
   }
   async generateNoCache(s, e) {
-    let t = await this.tokenisePrompt(s), n = s || "";
-    const a = e?.maxLength ?? 1e3;
-    for (let i = 0; i < a; i++) {
+    let t = await this.tokenisePrompt(s), i = s || "";
+    const o = e?.maxLength ?? 1e3;
+    for (let a = 0; a < o && this.active; a++) {
       const {
-        output: o,
+        output: n,
         attention: c,
-        probabilities: h
-      } = this.model.generate(t, void 0, e), l = t;
-      t = this.model.tf.concat([t, o], 1), l.dispose();
-      const r = await this.processResponse(o, c, h);
-      if (o.dispose(), r === null)
+        probabilities: l
+      } = this.model.generate(t, void 0, e), h = t;
+      t = this.model.tf.concat([t, n], 1), h.dispose();
+      const r = await this.processResponse(n, c, l);
+      if (n.dispose(), r === null)
         break;
-      n += r;
+      i += r;
     }
-    return t.dispose(), n;
+    return t.dispose(), i;
   }
   async processResponse(s, e, t) {
-    const n = (await s.array())[0][0];
-    if (n === this.tokeniser.eosToken)
+    const i = (await s.array())[0][0];
+    if (i === this.tokeniser.eosToken)
       return null;
-    const a = await this.tokeniser.decode([n]);
-    let i;
-    e && (i = await e.array(), e.dispose());
-    let o;
-    return t && (o = await t.array(), t.dispose()), this.emit("tokens", [n], a, i, o), a;
+    const o = await this.tokeniser.decode([i]);
+    let a;
+    e && (a = await e.array(), e.dispose());
+    let n;
+    return t && (n = await t.array(), t.dispose()), this.emit("tokens", [i], o, a, n), o;
   }
   async generateCache(s, e) {
-    let t = await this.tokenisePrompt(s), n = s || "";
-    const a = new Array(this.model.config.nLayer).fill(void 0), i = e?.maxLength ?? 1e3;
-    for (let o = 0; o < i; o++) {
+    let t = await this.tokenisePrompt(s), i = s || "";
+    const o = new Array(this.model.config.nLayer).fill(void 0), a = e?.maxLength ?? 1e3;
+    for (let n = 0; n < a && this.active; n++) {
       const {
         output: c,
-        attention: h,
-        probabilities: l
-      } = this.model.generate(t, a, {
+        attention: l,
+        probabilities: h
+      } = this.model.generate(t, o, {
         ...e,
         usePadding: !1
       });
       t.dispose(), t = c;
-      const r = await this.processResponse(c, h, l);
+      const r = await this.processResponse(c, l, h);
       if (r === null)
         break;
-      n += r;
+      i += r;
     }
-    return t.dispose(), n;
+    return t.dispose(), i;
   }
   async generate(s, e) {
-    this.emit("start");
-    const t = this.model.config.useRope && !e?.noCache ? this.generateCache(s, e) : this.generateNoCache(s, e);
-    return this.emit("stop"), t;
+    const t = s && s.length > this.model.config.blockSize ? s.slice(-this.model.config.blockSize) : s;
+    this.active = !0, this.emit("start");
+    const o = await (this.model.config.useRope && !e?.noCache ? this.generateCache(t, e) : this.generateNoCache(t, e));
+    return this.active = !1, this.emit("stop"), o;
+  }
+  stop() {
+    this.active = !1;
   }
 }
 export {
-  p as default
+  f as default
 };

package/dist/NanoGPTModel.d.ts CHANGED Viewed

@@ -1,6 +1,8 @@
 import { default as TF } from '@tensorflow/tfjs';
 import { GPTConfig } from './config';
 import { KVCache } from './layers/CausalSelfAttention';
+import { default as MemoryProfiler } from './utilities/profile';
+import { default as BaseLayer } from './layers/BaseLayer';
 export interface TrainingLogEntry {
     loss: number;
     valLoss?: number;
@@ -16,7 +18,7 @@ export interface GenerateOptions {
     includeAttention?: boolean;
     includeProbabilities?: boolean;
 }
-export default class NanoGPT {
+export default class NanoGPT extends BaseLayer {
     readonly config: GPTConfig;
     private wte;
     private wpe?;
@@ -34,6 +36,7 @@ export default class NanoGPT {
     setSkipMask(mask: boolean[]): void;
     setTrainableMask(mask: boolean[]): void;
     set trainable(value: boolean);
+    setProfiler(value: MemoryProfiler | undefined): void;
     private validateInput;
     private calculateLoss;
     private computeAttentionRollout;

package/dist/NanoGPTModel.js CHANGED Viewed

@@ -1,11 +1,12 @@
-import { defaultConfig as $ } from "./config.js";
+import { defaultConfig as v } from "./config.js";
 import z from "./layers/TransformerBlock.js";
 import S from "./layers/TiedEmbedding.js";
-import I from "./layers/RoPECache.js";
-import _ from "./layers/RMSNorm.js";
-import { estimateParameterCount as W } from "./utilities/parameters.js";
-import { createSoftmaxCrossEntropyWithGrad as C } from "./training/sparseCrossEntropy.js";
-class K {
+import _ from "./layers/RoPECache.js";
+import I from "./layers/RMSNorm.js";
+import { estimateParameterCount as F } from "./utilities/parameters.js";
+import { createSoftmaxCrossEntropyWithGrad as L } from "./training/sparseCrossEntropy.js";
+import P from "./layers/BaseLayer.js";
+class A extends P {
   config;
   wte;
   // Token embeddings
@@ -21,7 +22,7 @@ class K {
   log = [];
   // Training log
   constructor(t, e = {}) {
-    this.tf = t, this.config = { ...$, ...e }, this.wte = new S(t, {
+    super(), this.tf = t, this.config = { ...v, ...e }, this.wte = new S(t, {
       vocabSize: this.config.vocabSize,
       embedDim: this.config.nEmbed,
       name: "token_embedding"
@@ -30,10 +31,10 @@ class K {
       outputDim: this.config.nEmbed,
       name: "positional_embedding",
       embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
-    }) : this.ropeCache = new I(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
+    }) : this.ropeCache = new _(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
     for (let o = 0; o < this.config.nLayer; o++)
       this.blocks.push(new z(this.tf, o, this.config, this.ropeCache));
-    this.lnF = new _(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
+    this.lnF = new I(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
   }
   get variables() {
     return [
@@ -86,6 +87,12 @@ class K {
       e.trainable = t;
     this.lnF.trainable = t;
   }
+  setProfiler(t) {
+    this._profiler = t;
+    for (const e of this.blocks)
+      e.setProfiler(t);
+    this.lnF.setProfiler(t);
+  }
   validateInput(t) {
     if (t.shape.length !== 2)
       throw new Error(`Invalid input shape: expected [batch_size, sequence_length], got ${t.shape}`);
@@ -96,7 +103,7 @@ class K {
   }
   calculateLoss(t, e) {
     try {
-      return C()(t, e).mean();
+      return L()(t, e).mean();
     } catch (o) {
       throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
     }
@@ -139,24 +146,25 @@ class K {
   }
   forward(t, e, o = !1, i = !1, s) {
     return this.validateInput(t), this.tf.tidy(() => {
+      this.startMemory();
       const l = s?.[0]?.length ?? 0;
       let r = this.inputPhase(t, l, o);
       const n = [];
       if (s && s.length !== this.blocks.length)
         throw console.error("Cache", s), new Error(`Cache length ${s.length} does not match number of blocks ${this.blocks.length}`);
       for (let a = 0; a < this.blocks.length; a++) {
-        const d = this.blocks[a], {
-          output: g,
-          attention: u,
+        const d = r, g = this.blocks[a], {
+          output: m,
+          attention: b,
           cache: f
-        } = d.call(r, o, i, s ? s[a] : void 0);
-        r = g, i && u && n.push(u), s && f ? (s[a]?.k.dispose(), s[a]?.v.dispose(), s[a] = f) : f && (f.k.dispose(), f.v.dispose());
+        } = g.call(r, o, i, s ? s[a] : void 0);
+        r = m, d.dispose(), i && b && n.push(b), s && f ? (s[a]?.k.dispose(), s[a]?.v.dispose(), s[a] = f) : f && (f.k.dispose(), f.v.dispose());
       }
       let h;
       i && n.length > 0 && (h = this.computeAttentionRollout(n)), r = this.lnF.apply(r);
       const c = this.wte.project(r);
       let p;
-      return e && (p = this.calculateLoss(c, e)), { logits: c, loss: p, attention: i ? h : void 0 };
+      return e && (p = this.calculateLoss(c, e)), this.endMemory("Forward"), { logits: c, loss: p, attention: i ? h : void 0 };
     });
   }
   generate(t, e, o) {
@@ -168,24 +176,24 @@ class K {
       ), p = l ? this.config.blockSize - c.shape[1] : 0, a = p > 0 ? this.tf.pad(c, [
         [0, 0],
         [0, p]
-      ]) : c, { logits: d, attention: g } = this.forward(a, void 0, !1, r, e), u = d.shape[1] - 1 - p, f = d.slice([0, u, 0], [d.shape[0], 1, d.shape[2]]), w = g ? g.slice([0, u, 0], [g.shape[0], 1, g.shape[2]]) : void 0, b = f.div(i);
-      let m;
+      ]) : c, { logits: d, attention: g } = this.forward(a, void 0, !1, r, e), m = d.shape[1] - 1 - p, b = d.slice([0, m, 0], [d.shape[0], 1, d.shape[2]]), f = g ? g.slice([0, m, 0], [g.shape[0], 1, g.shape[2]]) : void 0, k = b.div(i);
+      let u;
       if (s) {
-        const { values: v, indices: y } = this.tf.topk(b, s), E = this.tf.multinomial(v.squeeze([1]), 1);
-        m = this.tf.gather(y.squeeze([1]), E, 1);
+        const { values: y, indices: E } = this.tf.topk(k, s), $ = this.tf.multinomial(y.squeeze([1]), 1);
+        u = this.tf.gather(E.squeeze([1]), $, 1);
       } else
-        m = this.tf.multinomial(b.squeeze([1]), 1);
-      let k;
-      return o?.includeProbabilities && (k = this.tf.softmax(b.squeeze([1]))), m = m.reshape([1, 1]), { output: m, attention: w?.squeeze([1]), probabilities: k };
+        u = this.tf.multinomial(k.squeeze([1]), 1);
+      let w;
+      return o?.includeProbabilities && (w = this.tf.softmax(k.squeeze([1]))), u = u.reshape([1, 1]), { output: u, attention: f?.squeeze([1]), probabilities: w };
     });
   }
   getNumParams() {
-    return W(this.config);
+    return F(this.config);
   }
   dispose() {
     this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
   }
 }
 export {
-  K as default
+  A as default
 };

package/dist/TeachableLLM.d.ts CHANGED Viewed

@@ -6,6 +6,7 @@ import { SaveOptions } from './utilities/save';
 import { default as Generator, IGenerateOptions } from './Generator';
 import { default as Trainer, ITrainerOptions } from './Trainer';
 import { default as EE } from 'eventemitter3';
+import { default as MemoryProfiler } from './utilities/profile';
 type TeachableLLMStatus = 'warmup' | 'awaitingTokens' | 'ready' | 'training' | 'loading' | 'busy' | 'error';
 export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
     private _config?;
@@ -23,6 +24,9 @@ export default class TeachableLLM extends EE<'status' | 'error' | 'trainStep'> {
     saveModel(options?: SaveOptions): Promise<Blob>;
     static loadModel(tf: typeof TF, data: Blob | Buffer | string): TeachableLLM;
     static create(tf: typeof TF, config?: Partial<GPTConfig>): TeachableLLM;
+    getProfiler(): MemoryProfiler | undefined;
+    get enableProfiler(): boolean;
+    set enableProfiler(value: boolean);
     getNumParams(): number;
     trainer(): Trainer;
     train(text: string[], options?: ITrainerOptions): Promise<void>;

package/dist/TeachableLLM.js CHANGED Viewed

@@ -1,11 +1,11 @@
-import { defaultConfig as d } from "./config.js";
-import m from "./NanoGPTModel.js";
-import { saveModel as u } from "./utilities/save.js";
-import { loadModel as l } from "./utilities/load.js";
-import f from "./Generator.js";
+import { defaultConfig as h } from "./config.js";
+import d from "./NanoGPTModel.js";
+import { saveModel as m } from "./utilities/save.js";
+import { loadModel as f } from "./utilities/load.js";
+import u from "./Generator.js";
 import _ from "./Trainer.js";
 import { E as c } from "./index-Dwqa6Zy2.js";
-import { dummyPassAsync as h } from "./utilities/dummy.js";
+import { dummyPassAsync as l } from "./utilities/dummy.js";
 import g from "./tokeniser/CharTokeniser.js";
 import "./papaparse.min-C8l2Kvo1.js";
 import "./index-Tf7vU29b.js";
@@ -13,6 +13,7 @@ import "./jszip.min-CjP2V1VV.js";
 import "./ops/scatterSub.js";
 import "./ops/gatherSub.js";
 import "./ops/attentionMask.js";
+import w from "./utilities/profile.js";
 class a extends c {
   _config;
   _model;
@@ -49,23 +50,23 @@ class a extends c {
   saveModel(t) {
     if (!this._model || !this._tokeniser)
       throw new Error("Model or tokeniser is not initialized.");
-    return u(this._model, this._tokeniser, t);
+    return m(this._model, this._tokeniser, t);
   }
   static loadModel(t, r) {
     const e = new a(t);
-    return l(t, r).then(({ model: s, tokeniser: o }) => {
-      e._model = s, e._tokeniser = o, e._config = s.config, e.setStatus("warmup"), h(s).then(() => {
+    return f(t, r).then(({ model: o, tokeniser: s }) => {
+      e._model = o, e._tokeniser = s, e._config = o.config, e.setStatus("warmup"), l(o).then(() => {
         e.setStatus("ready");
       }).catch((i) => {
         e.setStatus("error"), e.emit("error", i);
       });
-    }).catch((s) => {
-      e.setStatus("error"), e.emit("error", s);
+    }).catch((o) => {
+      e.setStatus("error"), e.emit("error", o);
     }), e;
   }
   static create(t, r = {}) {
-    const e = { ...d, ...r }, s = new g(e.vocabSize), o = new m(t, e), i = new a(t, s, o);
-    return i.setStatus("warmup"), h(o).then(() => {
+    const e = { ...h, ...r }, o = new g(e.vocabSize), s = new d(t, e), i = new a(t, o, s);
+    return i.setStatus("warmup"), l(s).then(() => {
       i.tokeniser.trained ? i.setStatus("ready") : (i.setStatus("awaitingTokens"), i.tokeniser.once("trainStatus", (n) => {
         n === "trained" && i.setStatus("ready");
       }));
@@ -73,6 +74,20 @@ class a extends c {
       i.setStatus("error"), i.emit("error", n);
     }), i;
   }
+  getProfiler() {
+    return this._model?.getProfiler();
+  }
+  get enableProfiler() {
+    return !!this._model?.getProfiler();
+  }
+  set enableProfiler(t) {
+    if (t) {
+      if (!this._model)
+        throw new Error("Model is not initialized.");
+      this._model.getProfiler() || this._model.setProfiler(new w());
+    } else
+      this._model && this._model.setProfiler(void 0);
+  }
   getNumParams() {
     if (!this._model)
       throw new Error("Model is not initialized.");
@@ -84,8 +99,8 @@ class a extends c {
     const t = new _(this._model, this._tokeniser);
     return t.on("start", () => this.setStatus("training")), t.on("stop", () => this.setStatus("ready")), t.on("log", async (r) => {
       const e = this.listeners("trainStep");
-      for (const s of e)
-        await s(r);
+      for (const o of e)
+        await o(r);
     }), t;
   }
   train(t, r) {
@@ -94,7 +109,7 @@ class a extends c {
   generator() {
     if (!this._model || !this._tokeniser)
       throw new Error("Model or tokeniser is not initialized.");
-    const t = new f(this._model, this._tokeniser);
+    const t = new u(this._model, this._tokeniser);
     return t.on("start", () => {
       this.status === "ready" && this.setStatus("busy");
     }), t.on("stop", () => {

package/dist/{complex-Cd8sqiBC.js → complex-x7w5HPOS.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { o as t, c as s, f as n, E as m, C as r } from "./index-Dsg28SG6.js";
+import { o as c, d as s, g as n, E as m, C as r } from "./index-CWQLouWz.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -15,13 +15,13 @@ import { o as t, c as s, f as n, E as m, C as r } from "./index-Dsg28SG6.js";
  * limitations under the License.
  * =============================================================================
  */
-function l(o, c) {
-  const a = s(o, "real", "complex"), e = s(c, "imag", "complex");
+function l(o, p) {
+  const a = s(o, "real", "complex"), e = s(p, "imag", "complex");
   n(a.shape, e.shape, `real and imag shapes, ${a.shape} and ${e.shape}, must match in call to tf.complex().`);
-  const p = { real: a, imag: e };
-  return m.runKernel(r, p);
+  const t = { real: a, imag: e };
+  return m.runKernel(r, t);
 }
-const i = /* @__PURE__ */ t({ complex_: l });
+const i = /* @__PURE__ */ c({ complex_: l });
 export {
   i as c
 };

package/dist/{index-Dsg28SG6.js → index-CWQLouWz.js} RENAMED Viewed

@@ -2068,6 +2068,9 @@ function Sn(n, t) {
 function Ys() {
   return g;
 }
+function Qs() {
+  return g.memory();
+}
 function E(n, t) {
   return g.tidy(n, t);
 }
@@ -2890,7 +2893,7 @@ function Yn(n, t, e) {
  * limitations under the License.
  * =============================================================================
  */
-function Qs(n, t) {
+function Zs(n, t) {
   const e = [];
   for (let s = 0; s < t.length; s++) {
     const r = n[n.length - s - 1], i = t.length - s - 1, o = t[i];
@@ -3058,7 +3061,7 @@ function ss(n, t) {
     a[u] != null && (c[l.name] = a[u]);
   }), s?.forEach((l) => c[l.name] = null), { value: o, grads: c };
 }
-function Zs(n) {
+function tr(n) {
   return g.customGrad(n);
 }
 /**
@@ -3841,51 +3844,52 @@ export {
   ds as A,
   Es as B,
   As as C,
-  zs as D,
+  C as D,
   g as E,
-  Bs as F,
+  zs as F,
   Ms as G,
-  $s as H,
+  Bs as H,
   Fs as I,
-  Cs as J,
-  Ps as K,
+  $s as J,
+  Cs as K,
   Rs as L,
   xs as M,
   Ns as N,
-  Os as O,
+  Ps as O,
   Ds as P,
-  Us as Q,
+  Os as Q,
   _s as R,
   Ws as S,
-  Vs as T,
-  Ks as U,
-  Qs as V,
-  Qn as W,
+  Us as T,
+  Vs as U,
+  Ks as V,
+  Zs as W,
+  Qn as X,
   qs as _,
-  Z as a,
-  Js as b,
-  I as c,
-  V as d,
+  p as a,
+  Z as b,
+  Js as c,
+  I as d,
   Ys as e,
-  Is as f,
-  Xs as g,
-  y as h,
-  Ls as i,
-  $t as j,
-  Dt as k,
-  Zt as l,
-  p as m,
-  G as n,
+  V as f,
+  Is as g,
+  Xs as h,
+  y as i,
+  Ls as j,
+  $t as k,
+  Dt as l,
+  Qs as m,
+  Zt as n,
   F as o,
-  De as p,
-  Gs as q,
+  G as p,
+  De as q,
   Hs as r,
   K as s,
-  vs as t,
-  Ts as u,
-  w as v,
-  js as w,
-  Zs as x,
-  E as y,
-  C as z
+  Gs as t,
+  vs as u,
+  Ts as v,
+  w,
+  js as x,
+  tr as y,
+  E as z
 };

package/dist/layers/BaseLayer.d.ts ADDED Viewed

@@ -0,0 +1,8 @@
+import { default as MemoryProfiler } from '../utilities/profile';
+export default abstract class BaseLayer {
+    protected _profiler?: MemoryProfiler;
+    getProfiler(): MemoryProfiler | undefined;
+    setProfiler(value: MemoryProfiler | undefined): void;
+    startMemory(): void;
+    endMemory(label: string): void;
+}

package/dist/layers/BaseLayer.js ADDED Viewed

@@ -0,0 +1,18 @@
+class t {
+  _profiler;
+  getProfiler() {
+    return this._profiler;
+  }
+  setProfiler(r) {
+    this._profiler = r;
+  }
+  startMemory() {
+    this._profiler?.startMemory();
+  }
+  endMemory(r) {
+    this._profiler?.endMemory(r);
+  }
+}
+export {
+  t as default
+};

package/dist/layers/CausalSelfAttention.d.ts CHANGED Viewed

@@ -1,13 +1,14 @@
 import { default as TF } from '@tensorflow/tfjs';
 import { GPTConfig } from '../config';
 import { default as RoPECache } from './RoPECache';
+import { default as BaseLayer } from './BaseLayer';
 export type KVCache = {
     k: TF.Tensor;
     v: TF.Tensor;
     length: number;
     cumulativeLength: number;
 };
-export default class CausalSelfAttention {
+export default class CausalSelfAttention extends BaseLayer {
     private readonly ropeCache?;
     private config;
     private cAttn;

package/dist/layers/CausalSelfAttention.js CHANGED Viewed

@@ -1,7 +1,8 @@
 import { attentionMask as z } from "../ops/attentionMask.js";
-class j {
+import S from "./BaseLayer.js";
+class C extends S {
   constructor(t, i, s, e) {
-    this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
+    super(), this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
       units: 3 * s.nEmbed,
       useBias: s.biasInLinear,
       name: `block_${i}_attn_cAttn`,
@@ -94,23 +95,24 @@ class j {
     if (e && !this.config.useRope)
       throw new Error("Cannot use pastKV without RoPE enabled");
     return this.tf.tidy(() => {
+      this.startMemory();
       const [o, c, r] = this.getQKV(t), h = o.shape[2], a = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(o, c, u) : [o, c];
       let n = d, l = r, p = 0;
       e && (p = e.length, n = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, r], 2));
       const b = n.shape[2];
       if (b > a) {
-        const k = b - a, g = n.shape[0], I = n.shape[1], _ = n.shape[3];
-        n = n.slice([0, 0, k, 0], [g, I, a, _]), l = l.slice([0, 0, k, 0], [g, I, a, _]), p = a - h;
+        const k = b - a, g = n.shape[0], A = n.shape[1], I = n.shape[3];
+        n = n.slice([0, 0, k, 0], [g, A, a, I]), l = l.slice([0, 0, k, 0], [g, A, a, I]), p = a - h;
       }
       let m;
       p > 0 ? m = this.getAttentionScoresWithPast(f, n, i, p) : m = this.getAttentionScores(f, n, i);
-      const v = this.tf.matMul(m, l), A = this.getOutputProjection(v, i), P = {
+      const _ = this.tf.matMul(m, l), v = this.getOutputProjection(_, i), y = {
         k: this.tf.keep(n),
         v: this.tf.keep(l),
         length: p + h,
         cumulativeLength: e ? e.cumulativeLength + h : h
-      };
-      return { output: A, attention: s ? m.mean(1) : void 0, presentKV: P };
+      }, P = s ? m.mean(1) : void 0;
+      return this.endMemory("CausalSelfAttention"), { output: v, attention: P, presentKV: y };
     });
   }
   dispose() {
@@ -118,5 +120,5 @@ class j {
   }
 }
 export {
-  j as default
+  C as default
 };

package/dist/layers/MLP.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 import { default as TF } from '@tensorflow/tfjs';
 import { GPTConfig } from '../config';
-export default class MLP {
+import { default as BaseLayer } from './BaseLayer';
+export default class MLP extends BaseLayer {
     private cFc;
     private cProj;
     private dropout;