@genai-fi/nanogpt 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -16
- package/dist/Generator.d.ts +3 -0
- package/dist/Generator.js +41 -27
- package/dist/NanoGPTModel.d.ts +14 -2
- package/dist/NanoGPTModel.js +49 -43
- package/dist/TeachableLLM.d.ts +8 -2
- package/dist/TeachableLLM.js +39 -26
- package/dist/layers/CausalSelfAttention.d.ts +7 -1
- package/dist/layers/CausalSelfAttention.js +45 -35
- package/dist/layers/TransformerBlock.d.ts +5 -1
- package/dist/layers/TransformerBlock.js +14 -10
- package/dist/tokeniser/CharTokeniser.d.ts +3 -1
- package/dist/tokeniser/CharTokeniser.js +44 -25
- package/dist/tokeniser/NodeTokeniser.d.ts +3 -2
- package/dist/tokeniser/NodeTokeniser.js +5 -5
- package/dist/tokeniser/type.d.ts +1 -1
- package/dist/training/FullTrainer.js +3 -1
- package/dist/training/LayerTrainer.js +8 -5
- package/dist/utilities/dummy.d.ts +3 -0
- package/dist/utilities/dummy.js +12 -0
- package/dist/utilities/generate.d.ts +2 -2
- package/dist/utilities/generate.js +11 -15
- package/dist/utilities/load.js +25 -28
- package/package.json +1 -1
package/README.md
CHANGED
@@ -1,20 +1,28 @@
-#
-TODO: Give a short introduction of your project. Let this section explain the objectives or the motivation behind this project.
+# GenAI NanoGPT
 
-
-TODO: Guide users through getting your code up and running on their own system. In this section you can talk about:
-1. Installation process
-2. Software dependencies
-3. Latest releases
-4. API references
+Developed as a part of the Finnish Generation AI research project. This is an implementation of [NanoGPT](https://github.com/karpathy/nanoGPT) for Tensorflow.js. It allows GPT models to be trained and loaded within a web browser and exposes some XAI functionality.
 
-
-TODO: Describe and show how to build your code and run the tests.
+Work in progress...
 
-#
-TODO: Explain how other users and developers can contribute to make your code better.
+# Install
 
-
-
-
-
+```
+npm install @genai-fi/nanogpt
+```
+
+# Usage
+
+```
+import { TeachableLLM, CharTokeniser } from '@genai-fi/nanogpt';
+import * as tf from '@tensorflow/tfjs';
+
+const tokeniser = new CharTokeniser();
+const model = TeachableLLM.create(tf, tokeniser, {
+    vocabSize: 200,
+    blockSize: 128,
+    nLayer: 4,
+    nHead: 3,
+    nEmbed: 192,
+    dropout: 0.0,
+});
+```

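Read together with the new generation options introduced below, the README's example extends to a minimal end-to-end sketch (the corpus and option values here are illustrative, not from the package docs):

```
// Continuing the README example above: train the model, then sample from it.
const corpus = ['some training text', 'more training text'];
await model.train(corpus);

// topK, usePadding and includeAttention are new IGenerateOptions in 0.1.0.
const text = await model.generateText('Once', {
    maxLength: 200,
    temperature: 0.8,
    topK: 10,
});
console.log(text);
```
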
package/dist/Generator.d.ts
CHANGED
@@ -4,6 +4,9 @@ import { default as EE } from 'eventemitter3';
 export interface IGenerateOptions {
     maxLength?: number;
     temperature?: number;
+    topK?: number;
+    usePadding?: boolean;
+    includeAttention?: boolean;
 }
 export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
     private readonly model;

package/dist/Generator.js
CHANGED
@@ -1,39 +1,53 @@
-import { E as
-const
-class
-constructor(
-super(), this.model =
+import { E as m } from "./index-SOhdqzHq.js";
+const g = 4;
+class w extends m {
+  constructor(o, t) {
+    super(), this.model = o, this.tokeniser = t;
   }
-generateBlockOfTokens(
-const
-let
-for (let
-const i = this.model.generate(
-
+  generateBlockOfTokens(o, t) {
+    const c = t?.temperature ?? 1, a = t?.topK, r = t?.usePadding ?? t?.includeAttention ?? !1, d = t?.includeAttention ?? !1;
+    let s = o, n;
+    for (let l = 0; l < g; l++) {
+      const { output: e, attention: i } = this.model.generate(s, {
+        temperature: c,
+        topK: a,
+        usePadding: r,
+        includeAttention: d
+      }), h = s;
+      if (s = this.model.tf.concat([s, e], 1), n && i) {
+        const u = n;
+        n = this.model.tf.concat([n, i], 0), u.dispose();
+      } else i && (n = i);
+      h.dispose(), e.dispose();
     }
-return
+    return { output: s, attention: n };
   }
-async generate(
-const
-let
+  async generate(o, t) {
+    const c = o ? await this.tokeniser.tokenise([o], !0) : [[this.tokeniser.eosToken]];
+    let a = this.model.tf.tensor2d(c, [1, c[0].length], "int32");
     this.emit("start");
-let
+    let r = o || "";
     for (; ; ) {
-const
-
-const l =
-let
-const
-
-t?.maxLength ? t.maxLength -
+      const { output: d, attention: s } = this.generateBlockOfTokens(a, t), n = a;
+      a = d;
+      const l = d.slice([0, n.shape[1]], [1, g]), e = (await l.array())[0];
+      let i = !1, h = !1;
+      const u = e.indexOf(this.tokeniser.eosToken);
+      u !== -1 && (i = !0, e.splice(u)), e.length + r.length >= (t?.maxLength ?? 1e3) && (h = !0, e.splice(
+        t?.maxLength ? t.maxLength - r.length : e.length
       ));
-const k = await this.tokeniser.decode(
-if (
+      const k = await this.tokeniser.decode(e);
+      if (r += k, s) {
+        let f = await s.array();
+        f.length > e.length && (f = f.slice(0, e.length)), this.emit("tokens", e, k, f);
+      } else
+        this.emit("tokens", e, k);
+      if (n.dispose(), l.dispose(), i || h)
        break;
    }
-return
+    return a.dispose(), this.emit("stop"), r;
  }
 }
 export {
-
+  w as default
 };

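As the rewritten Generator.js shows, generation now proceeds in blocks of four tokens (const g = 4), and each 'tokens' emission carries the new token ids, their decoded text and, when includeAttention is set, an attention array trimmed to the emitted tokens. A hedged sketch of consuming the stream; the callback parameter types are inferred from this diff, not from documentation:

```
const gen = model.generator();
gen.on('tokens', (ids: number[], text: string, attention?: number[][]) => {
    // Fires once per generated block; `attention` is only supplied when
    // generate() was called with includeAttention: true.
    process.stdout.write(text);
});
const full = await gen.generate('Hello', { maxLength: 100, includeAttention: true });
```
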
package/dist/NanoGPTModel.d.ts
CHANGED
@@ -9,6 +9,12 @@ export interface TrainingLogEntry {
     example?: string;
     batchSize: number;
 }
+export interface GenerateOptions {
+    temperature?: number;
+    topK?: number;
+    usePadding?: boolean;
+    includeAttention?: boolean;
+}
 export default class NanoGPT {
     readonly config: GPTConfig;
     private wte;
@@ -26,10 +32,16 @@ export default class NanoGPT {
     setSkipMask(mask: boolean[]): void;
     setTrainableMask(mask: boolean[]): void;
     set trainable(value: boolean);
-
+    private validateInput;
+    private calculateLoss;
+    forward(idx: TF.Tensor, targets?: TF.Tensor, training?: boolean, includeAttention?: boolean): {
         logits: TF.Tensor;
         loss?: TF.Tensor;
+        attention?: TF.Tensor;
+    };
+    generate(idx: TF.Tensor, options?: GenerateOptions): {
+        output: TF.Tensor;
+        attention?: TF.Tensor;
     };
-    generate(idx: TF.Tensor, temperature?: number, topK?: number): TF.Tensor;
     getNumParams(): number;
 }

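A hedged sketch of the new generate() contract: it samples a single token per call and can return the head-averaged attention row for the sampled position. The tf handle on the model is used here the same way Generator.js uses it; tensor disposal mirrors the conventions in this diff:

```
// Input must be int32 of shape [1, seqLen], as validateInput enforces.
const idx = model.tf.tensor2d([[1, 5, 9]], [1, 3], 'int32');
const { output, attention } = model.generate(idx, {
    temperature: 0.9,
    topK: 5,
    includeAttention: true,
});
// output is a [1, 1] tensor holding the sampled token id.
output.dispose();
attention?.dispose();
idx.dispose();
```
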
package/dist/NanoGPTModel.js
CHANGED
@@ -1,8 +1,8 @@
-import { defaultConfig as
-import
-import
-import
-class
+import { defaultConfig as y } from "./config.js";
+import z from "./layers/TransformerBlock.js";
+import v from "./layers/TiedEmbedding.js";
+import S from "./layers/LayerNorm.js";
+class $ {
   config;
   wte;
   // Token embeddings
@@ -17,7 +17,7 @@ class S {
   log = [];
   // Training log
   constructor(t, e = {}) {
-    this.tf = t, this.config = { ...
+    this.tf = t, this.config = { ...y, ...e }, this.wte = new v(t, {
       vocabSize: this.config.vocabSize,
       embedDim: this.config.nEmbed,
       name: "token_embedding"
@@ -28,8 +28,8 @@ class S {
       embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
     }), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
     for (let s = 0; s < this.config.nLayer; s++)
-      this.blocks.push(new
-    this.lnF = new
+      this.blocks.push(new z(this.tf, s, this.config));
+    this.lnF = new S(t, [this.config.nEmbed], 1e-5, "final_layer_norm");
   }
   get variables() {
     return [
@@ -54,8 +54,8 @@ class S {
   }
   inputPhase(t, e = !1) {
     return this.tf.tidy(() => {
-      const [, s] = t.shape,
-      return this.drop.apply(
+      const [, s] = t.shape, n = this.wte.embed(t), i = this.tf.range(0, s, 1, "int32"), o = this.wpe.apply(i), h = n.add(o);
+      return this.drop.apply(h, { training: e });
     });
   }
   setSkipMask(t) {
@@ -75,55 +75,61 @@ class S {
       e.trainable = t;
     this.wpe.trainable = t, this.lnF.trainable = t;
   }
-
+  validateInput(t) {
     if (t.shape.length !== 2)
       throw new Error(`Invalid input shape: expected [batch_size, sequence_length], got ${t.shape}`);
     if (t.shape[1] > this.config.blockSize)
       throw new Error(`Input sequence length ${t.shape[1]} isn't block size ${this.config.blockSize}`);
     if (t.dtype !== "int32")
       throw new Error(`Input tensor must be of type int32, got ${t.dtype}`);
-
-
-
-
-
-
-
-
-
-
-
-
-
-  }
-
-
-
+  }
+  calculateLoss(t, e) {
+    try {
+      return this.tf.losses.softmaxCrossEntropy(e, t, this.tf.Reduction.MEAN);
+    } catch (s) {
+      throw console.error("Error computing loss:", s), new Error(`Loss computation failed: ${s}`);
+    }
+  }
+  forward(t, e, s = !1, n = !1) {
+    return this.validateInput(t), this.tf.tidy(() => {
+      let i = this.inputPhase(t, s), o;
+      n && (o = this.tf.zeros([i.shape[0], i.shape[1], i.shape[1]]));
+      for (const l of this.blocks) {
+        const { output: r, attention: f } = l.call(i, s, n);
+        i = r, f && o && (o = o.add(f));
+      }
+      o && (o = o.div(this.blocks.length)), i = this.lnF.apply(i);
+      const h = this.wte.project(i);
+      let a;
+      return e && (a = this.calculateLoss(h, e)), { logits: h, loss: a, attention: n ? o : void 0 };
     });
   }
-  generate(t, e
+  generate(t, e) {
+    const s = e?.temperature ?? 1, n = e?.topK, i = e?.usePadding ?? !1, o = e?.includeAttention ?? !1;
     return this.tf.tidy(() => {
-      const
-      [0,
-      [
-      ),
-
-
-
-
+      const h = t, a = h.shape[1], l = a <= this.config.blockSize ? h : h.slice(
+        [0, a - this.config.blockSize],
+        [h.shape[0], this.config.blockSize]
+      ), r = i ? this.config.blockSize - l.shape[1] : 0, f = r > 0 ? this.tf.pad(l, [
+        [0, 0],
+        [0, r]
+      ]) : l, { logits: g, attention: p } = this.forward(f, void 0, !1, o), d = g.shape[1] - 1 - r, m = g.slice([0, d, 0], [g.shape[0], 1, g.shape[2]]), u = p ? p.slice([0, d, 0], [p.shape[0], 1, p.shape[2]]) : void 0, b = m.div(s);
+      let c;
+      if (n) {
+        const { values: k, indices: w } = this.tf.topk(b, n), E = this.tf.multinomial(k.squeeze([1]), 1);
+        c = this.tf.gather(w.squeeze([1]), E, 1);
       } else
-
-      return
+        c = this.tf.multinomial(b.squeeze([1]), 1);
+      return c = c.reshape([1, 1]), { output: c, attention: u?.squeeze([1]) };
     });
   }
-  // Get number of parameters
   getNumParams() {
     const t = this.config.vocabSize * this.config.nEmbed + this.config.blockSize * this.config.nEmbed, e = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // qkv + proj
     2 * this.config.nEmbed), s = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
-    this.config.nEmbed * 4 * this.config.nEmbed),
-    return t + e + s +
+    this.config.nEmbed * 4 * this.config.nEmbed), n = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
+    return t + e + s + n;
   }
 }
 export {
-
+  $ as default
 };

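As a sanity check on getNumParams(), plugging the README's example configuration into the formula above gives roughly 1.9M parameters:

```
const cfg = { vocabSize: 200, blockSize: 128, nLayer: 4, nEmbed: 192 };
const emb  = cfg.vocabSize * cfg.nEmbed + cfg.blockSize * cfg.nEmbed;                   // 62,976
const attn = cfg.nLayer * (4 * cfg.nEmbed * cfg.nEmbed + 2 * cfg.nEmbed);               // 591,360
const mlp  = cfg.nLayer * (4 * cfg.nEmbed * cfg.nEmbed + cfg.nEmbed * 4 * cfg.nEmbed);  // 1,179,648
const head = cfg.nEmbed + cfg.vocabSize * cfg.nEmbed;                                   // 38,592
console.log(emb + attn + mlp + head);                                                   // 1,872,576
```
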
package/dist/TeachableLLM.d.ts
CHANGED
@@ -4,18 +4,24 @@ import { ITokeniser } from './tokeniser/type';
 import { default as NanoGPT } from './NanoGPTModel';
 import { default as Generator, IGenerateOptions } from './Generator';
 import { default as Trainer, ITrainerOptions } from './Trainer';
-
+import { default as EE } from 'eventemitter3';
+type TeachableLLMStatus = 'warmup' | 'ready' | 'training' | 'loading' | 'busy' | 'error';
+export default class TeachableLLM extends EE<'status' | 'error'> {
     readonly config: GPTConfig;
     readonly model: NanoGPT;
     readonly tf: typeof TF;
     readonly tokeniser: ITokeniser;
+    private _status;
     constructor(tf: typeof TF, tokeniser: ITokeniser, model: NanoGPT);
+    get status(): TeachableLLMStatus;
+    private setStatus;
     saveModel(): Promise<Blob>;
     static loadModel(tf: typeof TF, data: Blob | Buffer | string): Promise<TeachableLLM>;
-    static create(tf: typeof TF,
+    static create(tf: typeof TF, config?: Partial<GPTConfig>): TeachableLLM;
     getNumParams(): number;
     trainer(): Trainer;
     train(text: string[], options?: ITrainerOptions): Promise<void>;
     generator(): Generator;
     generateText(prompt?: string, options?: IGenerateOptions): Promise<string>;
 }
+export {};

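The status machine added to the type is driven by the implementation that follows: loadModel() resolves while the instance is still in 'warmup' and only flips to 'ready' after an asynchronous dummy forward pass, while trainer() and generator() toggle 'training' and 'busy'. A hedged sketch (zipBlob stands in for a saved model archive):

```
const model = await TeachableLLM.loadModel(tf, zipBlob);
console.log(model.status); // 'warmup' until the dummy pass completes
model.on('status', (s) => console.log('status ->', s));
model.on('error', (err) => console.error('model failed to warm up:', err));
```
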
package/dist/TeachableLLM.js
CHANGED
@@ -1,47 +1,60 @@
-import
-import { defaultConfig as
-import { saveModel as
-import { loadModel as
-import
-import
-import "./
-
+import a from "./NanoGPTModel.js";
+import { defaultConfig as m } from "./config.js";
+import { saveModel as u } from "./utilities/save.js";
+import { loadModel as h } from "./utilities/load.js";
+import c from "./Generator.js";
+import d from "./Trainer.js";
+import { E as l } from "./index-SOhdqzHq.js";
+import { dummyPassAsync as f } from "./utilities/dummy.js";
+import g from "./tokeniser/CharTokeniser.js";
+class n extends l {
   config;
   model;
   tf;
   tokeniser;
-
-
+  _status = "loading";
+  constructor(t, e, r) {
+    super(), this.tf = t, this.config = r.config, this.tokeniser = e, this.model = r;
+  }
+  get status() {
+    return this._status;
+  }
+  setStatus(t) {
+    this._status !== t && (this._status = t, this.emit("status", t));
  }
  saveModel() {
-    return
+    return u(this.model, this.tokeniser);
  }
-  static async loadModel(
-    const { model: r, tokeniser: o } = await
-    return
+  static async loadModel(t, e) {
+    const { model: r, tokeniser: o } = await h(t, e), s = new n(t, o, r);
+    return s.setStatus("warmup"), f(r).then(() => {
+      s.setStatus("ready");
+    }).catch((i) => {
+      s.setStatus("error"), s.emit("error", i);
+    }), s;
  }
-  static create(
-    const
-
-    const n = new s(e, o);
-    return new i(e, t, n);
+  static create(t, e = {}) {
+    const r = { ...m, ...e }, o = new g(r.vocabSize), s = new a(t, r);
+    return new n(t, o, s);
  }
  getNumParams() {
    return this.model.getNumParams();
  }
  trainer() {
-
+    const t = new d(this.model, this.tokeniser);
+    return t.on("start", () => this.setStatus("training")), t.on("stop", () => this.setStatus("ready")), t;
  }
-  train(
-    return this.trainer().train(
+  train(t, e) {
+    return this.trainer().train(t, e);
  }
  generator() {
-
+    const t = new c(this.model, this.tokeniser);
+    return t.on("start", () => this.setStatus("busy")), t.on("stop", () => this.setStatus("ready")), t;
  }
-  generateText(
-    return this.generator().generate(
+  generateText(t, e) {
+    return this.generator().generate(t, e);
  }
 }
 export {
-
+  n as default
 };

package/dist/layers/CausalSelfAttention.d.ts
CHANGED
@@ -18,5 +18,11 @@ export default class CausalSelfAttention {
     set trainable(value: boolean);
     saveWeights(map: Map<string, TF.Tensor[]>): void;
     loadWeights(weights: Map<string, TF.Tensor[]>): void;
-
+    private getAttentionScores;
+    private getQKV;
+    private getOutputProjection;
+    call(x: TF.Tensor, training?: boolean, includeAttention?: boolean): {
+        output: TF.Tensor;
+        attention?: TF.Tensor;
+    };
 }

package/dist/layers/CausalSelfAttention.js
CHANGED
@@ -1,4 +1,4 @@
-class
+class m {
   config;
   cAttn;
   cProj;
@@ -10,10 +10,10 @@ class g {
   divisor;
   index;
   _trainable = !0;
-  constructor(
-    this.config =
-      units: 3 *
-      useBias:
+  constructor(t, e, s) {
+    this.config = s, this.tf = t, this.index = e, this.cAttn = this.tf.layers.dense({
+      units: 3 * s.nEmbed,
+      useBias: s.biasInLinear,
       name: `block_${e}_attn_cAttn`,
       kernelInitializer: this.tf.initializers.randomNormal({
         mean: 0,
@@ -21,55 +21,65 @@ class g {
       }),
       biasInitializer: "zeros"
     }), this.cProj = this.tf.layers.dense({
-      units:
-      useBias:
+      units: s.nEmbed,
+      useBias: s.biasInLinear,
       name: `block_${e}_attn_cProj`,
       kernelInitializer: this.tf.initializers.randomNormal({
         mean: 0,
-        stddev: 0.02 / Math.sqrt(2 *
+        stddev: 0.02 / Math.sqrt(2 * s.nLayer)
       }),
       biasInitializer: "zeros"
-    }), this.attnDropout = this.tf.layers.dropout({ rate:
+    }), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = this.tf.scalar(1 / Math.sqrt(s.nEmbed / s.nHead)), this.maskInf = this.tf.zeros([s.blockSize, s.blockSize]).where(this.bias, -1 / 0);
   }
   get variables() {
     return [
-      ...this.cAttn.trainableWeights.map((
-      ...this.cProj.trainableWeights.map((
+      ...this.cAttn.trainableWeights.map((t) => t.read()),
+      ...this.cProj.trainableWeights.map((t) => t.read())
     ];
   }
   get trainable() {
     return this._trainable;
   }
-  set trainable(
-    this._trainable =
+  set trainable(t) {
+    this._trainable = t, this.cAttn.trainable = t, this.cProj.trainable = t;
  }
-  saveWeights(
-
+  saveWeights(t) {
+    t.set(`block_${this.index}_cAttn`, this.cAttn.getWeights()), t.set(`block_${this.index}_cProj`, this.cProj.getWeights());
  }
-  loadWeights(
-    this.cAttn.setWeights(
+  loadWeights(t) {
+    this.cAttn.setWeights(t.get(`block_${this.index}_cAttn`) || []), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
  }
-
+  getAttentionScores(t, e, s) {
+    const a = t.shape[2], n = this.tf.matMul(t, e, !1, !0).mul(this.divisor), i = this.maskInf.slice([0, 0], [a, a]), o = n.add(i), h = this.tf.softmax(o, -1);
+    return this.attnDropout.apply(h, { training: s });
+  }
+  getQKV(t) {
+    const [e, s, a] = t.shape, r = this.cAttn.apply(t), [n, i, o] = this.tf.split(r, 3, -1);
+    r.dispose();
+    const h = a / this.config.nHead, c = this.tf.reshape(n, [e, s, this.config.nHead, h]);
+    n.dispose();
+    const p = c.transpose([0, 2, 1, 3]);
+    c.dispose();
+    const l = this.tf.reshape(i, [e, s, this.config.nHead, h]);
+    i.dispose();
+    const u = l.transpose([0, 2, 1, 3]);
+    l.dispose();
+    const d = this.tf.reshape(o, [e, s, this.config.nHead, h]);
+    o.dispose();
+    const b = d.transpose([0, 2, 1, 3]);
+    return d.dispose(), [p, u, b];
+  }
+  getOutputProjection(t, e) {
+    const s = t.shape[0], a = t.shape[2], r = this.config.nEmbed, n = t.transpose([0, 2, 1, 3]), i = this.tf.reshape(n, [s, a, r]), o = this.cProj.apply(i);
+    return this.residDropout.apply(o, { training: e });
+  }
+  call(t, e = !1, s = !1) {
    return this.tf.tidy(() => {
-      const [
-
-      const a = n / this.config.nHead, d = this.tf.reshape(o, [t, i, this.config.nHead, a]);
-      o.dispose();
-      const b = d.transpose([0, 2, 1, 3]);
-      d.dispose();
-      const c = this.tf.reshape(h, [t, i, this.config.nHead, a]);
-      h.dispose();
-      const u = c.transpose([0, 2, 1, 3]);
-      c.dispose();
-      const p = this.tf.reshape(l, [t, i, this.config.nHead, a]);
-      l.dispose();
-      const f = p.transpose([0, 2, 1, 3]);
-      p.dispose();
-      const m = this.tf.matMul(b, u, !1, !0).mul(this.divisor), k = this.maskInf.slice([0, 0], [i, i]), _ = m.add(k), y = this.tf.softmax(_, -1), z = this.attnDropout.apply(y, { training: e }), A = this.tf.matMul(z, f).transpose([0, 2, 1, 3]), P = this.tf.reshape(A, [t, i, n]), j = this.cProj.apply(P);
-      return this.residDropout.apply(j, { training: e });
+      const [a, r, n] = this.getQKV(t), i = this.getAttentionScores(a, r, e), o = this.tf.matMul(i, n);
+      return { output: this.getOutputProjection(o, e), attention: s ? i.mean(1) : void 0 };
    });
  }
 }
 export {
-
+  m as default
 };

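The refactor splits the old monolithic call() into getQKV, getAttentionScores and getOutputProjection, and surfaces a head-averaged attention map (i.mean(1)) when includeAttention is requested. For orientation, a self-contained TF.js sketch of the same masked-softmax attention, using the masking idiom from the constructor above with illustrative sizes:

```
import * as tf from '@tensorflow/tfjs';

// q, k, v: [batch, heads, seq, headDim]; sizes are illustrative only.
const [B, H, T, D] = [1, 3, 4, 8];
const q = tf.randomNormal([B, H, T, D]);
const k = tf.randomNormal([B, H, T, D]);
const v = tf.randomNormal([B, H, T, D]);
const scores = tf.matMul(q, k, false, true).mul(1 / Math.sqrt(D));  // [B, H, T, T]
const causal = tf.linalg.bandPart(tf.ones([T, T]), -1, 0).cast('bool');
const masked = scores.add(tf.zeros([T, T]).where(causal, -1 / 0));  // -Inf above the diagonal
const attn = tf.softmax(masked, -1);
const out = tf.matMul(attn, v);  // [B, H, T, D], then merged and projected by cProj
const averaged = attn.mean(1);   // [B, T, T]: what call() returns as `attention`
```
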
package/dist/layers/TransformerBlock.d.ts
CHANGED
@@ -15,5 +15,9 @@ export default class Block {
     set trainable(value: boolean);
     saveWeights(map: Map<string, TF.Tensor[]>): void;
     loadWeights(weights: Map<string, TF.Tensor[]>): void;
-
+    private getMLPOutput;
+    call(x: TF.Tensor, training?: boolean, includeAttention?: boolean): {
+        output: TF.Tensor;
+        attention?: TF.Tensor;
+    };
 }

package/dist/layers/TransformerBlock.js
CHANGED
@@ -1,7 +1,7 @@
-import
-import
-import
-class
+import h from "./CausalSelfAttention.js";
+import r from "./MLP.js";
+import l from "./LayerNorm.js";
+class u {
   ln1;
   attn;
   ln2;
@@ -11,7 +11,7 @@ class _ {
   _trainable = !0;
   skipped = !1;
   constructor(t, s, i) {
-    this.tf = t, this.index = s, this.ln1 = new
+    this.tf = t, this.index = s, this.ln1 = new l(t, [i.nEmbed], 1e-5, `block_${this.index}_ln1`), this.attn = new h(this.tf, this.index, i), this.ln2 = new l(t, [i.nEmbed], 1e-5, `block_${this.index}_ln2`), this.mlp = new r(this.tf, this.index, i);
   }
   get variables() {
     return [
@@ -33,15 +33,19 @@ class _ {
   loadWeights(t) {
     this.attn.loadWeights(t), this.mlp.loadWeights(t), this.ln1.setWeights(t.get(`block_${this.index}_ln1`) || []), this.ln2.setWeights(t.get(`block_${this.index}_ln2`) || []);
   }
-
+  getMLPOutput(t, s) {
+    const i = this.ln2.apply(t), e = this.mlp.call(i, s);
+    return t.add(e);
+  }
+  call(t, s = !1, i = !1) {
    return this.tf.tidy(() => {
      if (this.skipped)
-        return t;
-      const
-      return
+        return { output: t };
+      const e = this.ln1.apply(t), n = this.attn.call(e, s, i), a = t.add(n.output);
+      return { output: this.getMLPOutput(a, s), attention: n.attention };
    });
  }
 }
 export {
-
+  u as default
 };

package/dist/tokeniser/CharTokeniser.d.ts
CHANGED
@@ -3,9 +3,11 @@ import { ITokeniser } from './type';
 export default class CharTokeniser extends EE<'trainStatus'> implements ITokeniser {
     vocabSize: number;
     eosToken: number;
+    unkToken: number;
     vocab: string[];
     private cache;
-    constructor(
+    constructor(vocabSize: number);
+    constructor(vocab: string[]);
     get trained(): boolean;
     destroy(): void;
     train(text: string[]): Promise<number>;

package/dist/tokeniser/CharTokeniser.js
CHANGED
@@ -1,38 +1,57 @@
-import { E as
-
+import { E as h } from "../index-SOhdqzHq.js";
+const c = ["<eos>", "<unk>"];
+class l extends h {
   vocabSize = 0;
   eosToken = 0;
+  unkToken = 0;
   vocab = [];
   cache = /* @__PURE__ */ new Map();
-  constructor(
-
-    this.
-
+  constructor(s) {
+    if (super(), Array.isArray(s))
+      if (this.vocab = s, this.vocab.length > 0)
+        this.vocabSize = this.vocab.length, this.eosToken = this.vocab.indexOf("<eos>"), this.unkToken = this.vocab.indexOf("<unk>"), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab.forEach((i, o) => {
+          this.cache.set(i, o);
+        });
+      else
+        throw new Error("Vocab cannot be empty");
+    else
+      this.vocabSize = s;
  }
  get trained() {
-    return this.
+    return this.vocab.length === this.vocabSize;
  }
  destroy() {
  }
-  async train(
-    const
-
-
+  async train(s) {
+    const i = s.map((e) => e.split("")).flat(), o = new Set(i), t = Array.from(o), n = this.vocabSize - c.length;
+    if (t.length > n) {
+      const e = /* @__PURE__ */ new Map();
+      i.forEach((a) => {
+        e.set(a, (e.get(a) || 0) + 1);
+      }), t.sort((a, r) => (e.get(a) || 0) - (e.get(r) || 0)), t.splice(0, t.length - n);
+    } else if (t.length < n)
+      for (; t.length < n; )
+        t.push("<pad>");
+    return t.sort((e, a) => e.charCodeAt(0) - a.charCodeAt(0)), this.vocab = [...t, ...c], this.eosToken = this.vocab.indexOf("<eos>"), this.unkToken = this.vocab.indexOf("<unk>"), this.vocabSize = this.vocab.length, this.cache.clear(), this.vocab.forEach((e, a) => {
+      this.cache.set(e, a);
    }), this.vocabSize;
  }
-  async tokenise(
+  async tokenise(s, i) {
    if (!this.trained)
      throw new Error("Tokeniser not trained");
-    return
+    return s.map((t) => i ? t.split("").map((n) => this.cache.get(n) ?? this.unkToken) : t.split("").map((n) => {
+      const e = this.cache.get(n);
+      return e !== void 0 ? this.vocab[e] : "<unk>";
+    }));
  }
-  async detokenise(
-    return
+  async detokenise(s) {
+    return s.map((o) => o.map((t) => this.vocab[t]).join(""));
  }
-  async encode(
-    return (await this.tokenise([
+  async encode(s) {
+    return (await this.tokenise([s], !0))[0];
  }
-  async decode(
-    return (await this.detokenise([
+  async decode(s) {
+    return (await this.detokenise([s]))[0];
  }
  getVocab() {
    return this.vocab;
@@ -40,13 +59,13 @@ class h extends r {
  async getMerges() {
    return [];
  }
-  async createTrainingData(
-    const
-    for (let
-
-    return [
+  async createTrainingData(s, i = 5) {
+    const o = await this.tokenise(s, !0), t = [], n = [];
+    for (let e = 0; e < o.length - i; e++)
+      t.push(...o[e].slice(0, i)), n.push(o[e + 1][0]);
+    return [t, n];
  }
 }
 export {
-
+  l as default
 };

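CharTokeniser now has two constructor forms, a target vocabulary size to be filled in by train() or a prebuilt vocab array (the form load.js uses below when restoring a saved model), plus an explicit <unk> fallback for unseen characters. A hedged usage sketch:

```
import { CharTokeniser } from '@genai-fi/nanogpt';

const tok = new CharTokeniser(64);     // untrained until train() builds the vocab
await tok.train(['abba', 'cabbage']);
const ids = await tok.encode('bad!');  // unseen characters map to unkToken
console.log(await tok.decode(ids));    // unknown positions decode as '<unk>'

// Restoring from a saved vocabulary uses the array overload instead:
const restored = new CharTokeniser(tok.getVocab());
```
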
package/dist/tokeniser/NodeTokeniser.d.ts
CHANGED
@@ -4,10 +4,11 @@ export default class NodeTokeniser extends EE<'trainStatus'> implements ITokenis
     vocabSize: number;
     eosToken: number;
     private bpe;
-    constructor(
+    constructor(vocabSize: number);
+    constructor(vocab: string[], merges: [string, string][]);
     get trained(): boolean;
     destroy(): void;
-    train(text: string[]
+    train(text: string[]): Promise<number>;
     tokenise(text: string[], numeric: true): Promise<number[][]>;
     tokenise(text: string[]): Promise<string[][]>;
     detokenise(tokens: number[][]): Promise<string[]>;

package/dist/tokeniser/NodeTokeniser.js
CHANGED
@@ -1,19 +1,19 @@
 import { E as a } from "../index-SOhdqzHq.js";
 import o from "./bpe.js";
-class
+class p extends a {
   vocabSize = 0;
   eosToken = 0;
   bpe = new o();
   constructor(e, t) {
-    super(), e
+    super(), Array.isArray(e) ? (this.bpe = new o(e, t), this.vocabSize = e.length) : this.vocabSize = e;
   }
   get trained() {
     return this.vocabSize > 0;
   }
   destroy() {
   }
-  async train(e
-    return this.bpe.train(e,
+  async train(e) {
+    return this.bpe.train(e, this.vocabSize), this.vocabSize = this.bpe.getVocab().length, this.vocabSize;
   }
   async tokenise(e, t) {
     return t ? this.bpe.tokenise(e, !0) : this.bpe.tokenise(e);
@@ -42,5 +42,5 @@ class b extends a {
   }
 }
 export {
-
+  p as default
 };

package/dist/tokeniser/type.d.ts
CHANGED
@@ -1,6 +1,6 @@
 import { default as EE } from 'eventemitter3';
 export interface ITokeniser extends EE<'trainStatus'> {
-    train(text: string[]
+    train(text: string[]): Promise<number>;
     tokenise(text: string[], numeric?: boolean): Promise<string[][] | number[][]>;
     detokenise(tokens: string[][] | number[][]): Promise<string[]>;
     getVocab(): string[];

package/dist/training/FullTrainer.js
CHANGED
@@ -45,7 +45,9 @@ class S extends T {
     };
     if (this.model.log.push(p), s.step % L === 0 && (await w, h)) {
       if (l) {
-        const v = await g(this.tokenizer, this.model, l, 100,
+        const v = await g(this.tokenizer, this.model, l, 100, {
+          temperature: 0.8
+        });
         p.example = v;
       }
       await h(p);

package/dist/training/LayerTrainer.js
CHANGED
@@ -29,7 +29,7 @@ class D extends T {
       epochs: h,
       stepsPerEpoch: r,
       desiredLoss: c,
-      logInterval:
+      logInterval: P,
       stepsPerLayer: d,
       onLayerChange: n,
       onPassComplete: g,
@@ -61,20 +61,23 @@ class D extends T {
     for (; !(r && s.step >= r || s.lastLoss < c); ) {
       const a = await u.next();
       if (a.done) break;
-      const
+      const m = a.value, w = this.trainBatch(s, m);
       s.stepSinceLayerChange++;
       const l = {
         epoch: s.epoch,
         loss: s.lastLoss,
         step: s.step,
         time: Date.now() - S,
-        batchSize:
+        batchSize: m.xs.shape[0],
         pass: s.pass,
         layer: s.layerStep % this.model.config.nLayer
       };
-      if (this.model.log.push(l), s.step %
+      if (this.model.log.push(l), s.step % P === 0 && (await w, y)) {
         if (L) {
-          const i = await v(this.tokenizer, this.model, L, 100,
+          const i = await v(this.tokenizer, this.model, L, 100, {
+            temperature: 0.8,
+            topK: 10
+          });
           l.example = i;
         }
         await y(l);

package/dist/utilities/dummy.js
ADDED
@@ -0,0 +1,12 @@
+async function f(s) {
+  const o = s.tf.zeros([1, s.config.blockSize], "int32"), { logits: i, loss: t } = s.forward(o, void 0, !1);
+  await i.data(), i.dispose(), t && t.dispose(), o.dispose();
+}
+function c(s) {
+  const o = s.tf.zeros([1, s.config.blockSize], "int32"), { logits: i, loss: t } = s.forward(o, void 0, !1);
+  i.dispose(), t && t.dispose(), o.dispose();
+}
+export {
+  c as dummyPass,
+  f as dummyPassAsync
+};

package/dist/utilities/generate.d.ts
CHANGED
@@ -1,3 +1,3 @@
 import { ITokeniser } from '../tokeniser/type';
-import { default as NanoGPT } from '../NanoGPTModel';
-export declare function generateText(tokeniser: ITokeniser, model: NanoGPT, prompt: string, length: number,
+import { default as NanoGPT, GenerateOptions } from '../NanoGPTModel';
+export declare function generateText(tokeniser: ITokeniser, model: NanoGPT, prompt: string, length: number, options: GenerateOptions): Promise<string>;

package/dist/utilities/generate.js
CHANGED
@@ -1,22 +1,18 @@
-async function
+async function w(n, e, o, s, p) {
   if (s <= 0)
     throw new Error("Length must be a positive integer");
-  if (
-    throw new Error("Temperature must be a positive number");
-  if (r !== void 0 && r <= 0)
-    throw new Error("topK must be a positive integer or undefined");
-  if (i.length === 0)
+  if (o.length === 0)
     throw new Error("Prompt cannot be an empty string");
-  const
-  let
-  for (let
-    const
-
+  const a = await n.tokenise([o], !0), r = (await e.tf.tidy(() => {
+    let t = e.tf.tensor2d(a, [1, a[0].length], "int32");
+    for (let c = 0; c < s; c++) {
+      const { output: d } = e.generate(t, p), u = t;
+      t = e.tf.concat([t, d], 1), u.dispose(), d.dispose();
    }
-    return
-  }).array())[0],
-  return
+    return t;
+  }).array())[0], i = r.indexOf(n.eosToken);
+  return i !== -1 && r.splice(i), await n.decode(r);
 }
 export {
-
+  w as generateText
 };

package/dist/utilities/load.js
CHANGED
@@ -1,47 +1,44 @@
-import { z as
-import { importWeights as
-import
+import { z as F } from "../jszip.min-BLbRbbKt.js";
+import { importWeights as b } from "./weights.js";
+import k from "../tokeniser/CharTokeniser.js";
 import j from "../NanoGPTModel.js";
-
-
-  n.dispose(), s && s.dispose(), a.dispose();
-}
-async function E(o) {
+import { dummyPassAsync as z } from "./dummy.js";
+async function A(o) {
   const e = await fetch(o);
   if (!e.ok)
     throw new Error(`Failed to fetch ${o}: ${e.statusText}`);
   return e.arrayBuffer();
 }
-async function
-  const
-  if (!
+async function T(o, e) {
+  const m = typeof e == "string" ? await A(e) : e, n = await F.loadAsync(m), s = /* @__PURE__ */ new Map(), c = await n.file("manifest.json")?.async("string");
+  if (!c)
    throw new Error("Manifest file not found in the zip archive");
-  const
-  for (const [t, r] of Object.entries(
+  const f = JSON.parse(c);
+  for (const [t, r] of Object.entries(f.weightSpec))
    s.set(t, { spec: r, data: new Float32Array() });
-  const
-  if (!
+  const l = await n.file("tokeniser.json")?.async("string");
+  if (!l)
    throw new Error("Tokeniser file not found in the zip archive");
-  const
+  const g = JSON.parse(l), y = new k(g.vocab), w = /* @__PURE__ */ new Map();
  for (const t of Object.keys(n.files))
    if (t.endsWith(".bin")) {
-      const r = t.replace(".bin", ""), h = await n.file(t).async("arraybuffer"),
-
-      const
-      w.set(r,
+      const r = t.replace(".bin", ""), h = await n.file(t).async("arraybuffer"), d = new Float32Array(h), i = s.get(r) || { spec: [], data: new Float32Array() };
+      i.data = d, s.set(r, i);
+      const u = await b(i, o);
+      w.set(r, u);
    }
-  const
-
-  const
-  if (
+  const a = new j(o, f.config);
+  await z(a), a.loadWeights(w);
+  const p = await n.file("log.json")?.async("string");
+  if (p)
    try {
-      const t = JSON.parse(
-
+      const t = JSON.parse(p);
+      a.log = t;
    } catch (t) {
      throw console.error("Error parsing training log:", t), new Error(`Failed to parse training log: ${t}`);
    }
-  return { model:
+  return { model: a, tokeniser: y };
 }
 export {
-
+  T as loadModel
 };

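Paired with saveModel() on TeachableLLM, the loader above completes a round trip: the zip archive carries manifest.json (config plus weight specs), tokeniser.json, one .bin per weight group and an optional log.json, and a dummy forward pass builds the layers before the saved weights are applied. A hedged sketch:

```
const blob = await model.saveModel();  // zip archive as a Blob
const restored = await TeachableLLM.loadModel(tf, blob);
restored.on('status', (s) => {
    if (s === 'ready') console.log('restored parameters:', restored.getNumParams());
});
```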