npm - @genai-fi/nanogpt - Versions diffs - 0.2.0 → 0.2.1 - Mend

@genai-fi/nanogpt 0.2.0 → 0.2.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/dist/Generator.d.ts +1 -0
package/dist/Generator.js +10 -10
package/dist/NanoGPTModel.js +80 -59
package/dist/TeachableLLM.js +3 -3
package/dist/layers/RoPECache.js +24 -19
package/dist/main.d.ts +1 -0
package/dist/main.js +4 -2
package/package.json +1 -1

package/dist/Generator.d.ts CHANGED Viewed

@@ -3,6 +3,7 @@ import { ITokeniser } from './tokeniser/type';
 import { default as EE } from 'eventemitter3';
 export interface IGenerateOptions extends GenerateOptions {
     maxLength?: number;
+    noCache?: boolean;
 }
 export default class Generator extends EE<'start' | 'stop' | 'tokens'> {
     private readonly model;

package/dist/Generator.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { E as u } from "./index-SOhdqzHq.js";
-class k extends u {
+class p extends u {
   constructor(s, e) {
     super(), this.model = s, this.tokeniser = e;
   }
@@ -14,10 +14,10 @@ class k extends u {
       const {
         output: o,
         attention: c,
-        probabilities: l
-      } = this.model.generate(t, void 0, e), h = t;
-      t = this.model.tf.concat([t, o], 1), h.dispose();
-      const r = await this.processResponse(o, c, l);
+        probabilities: h
+      } = this.model.generate(t, void 0, e), l = t;
+      t = this.model.tf.concat([t, o], 1), l.dispose();
+      const r = await this.processResponse(o, c, h);
       if (o.dispose(), r === null)
         break;
       n += r;
@@ -40,14 +40,14 @@ class k extends u {
     for (let o = 0; o < i; o++) {
       const {
         output: c,
-        attention: l,
-        probabilities: h
+        attention: h,
+        probabilities: l
       } = this.model.generate(t, a, {
         ...e,
         usePadding: !1
       });
       t.dispose(), t = c;
-      const r = await this.processResponse(c, l, h);
+      const r = await this.processResponse(c, h, l);
       if (r === null)
         break;
       n += r;
@@ -56,10 +56,10 @@ class k extends u {
   }
   async generate(s, e) {
     this.emit("start");
-    const t = this.model.config.useRope ? this.generateCache(s, e) : this.generateNoCache(s, e);
+    const t = this.model.config.useRope && !e?.noCache ? this.generateCache(s, e) : this.generateNoCache(s, e);
     return this.emit("stop"), t;
   }
 }
 export {
-  k as default
+  p as default
 };

package/dist/NanoGPTModel.js CHANGED Viewed

@@ -1,9 +1,9 @@
-import { defaultConfig as v } from "./config.js";
-import S from "./layers/TransformerBlock.js";
-import _ from "./layers/TiedEmbedding.js";
-import L from "./layers/RoPECache.js";
-import I from "./layers/RMSNorm.js";
-class F {
+import { defaultConfig as z } from "./config.js";
+import $ from "./layers/TransformerBlock.js";
+import S from "./layers/TiedEmbedding.js";
+import I from "./layers/RoPECache.js";
+import _ from "./layers/RMSNorm.js";
+class M {
   config;
   wte;
   // Token embeddings
@@ -19,7 +19,7 @@ class F {
   log = [];
   // Training log
   constructor(t, e = {}) {
-    this.tf = t, this.config = { ...v, ...e }, this.wte = new _(t, {
+    this.tf = t, this.config = { ...z, ...e }, this.wte = new S(t, {
       vocabSize: this.config.vocabSize,
       embedDim: this.config.nEmbed,
       name: "token_embedding"
@@ -28,10 +28,10 @@ class F {
       outputDim: this.config.nEmbed,
       name: "positional_embedding",
       embeddingsInitializer: this.tf.initializers.randomNormal({ mean: 0, stddev: 0.02 })
-    }) : this.ropeCache = new L(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
-    for (let s = 0; s < this.config.nLayer; s++)
-      this.blocks.push(new S(this.tf, s, this.config, this.ropeCache));
-    this.lnF = new I(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
+    }) : this.ropeCache = new I(t, this.config), this.drop = this.tf.layers.dropout({ rate: this.config.dropout }), this.blocks = [];
+    for (let o = 0; o < this.config.nLayer; o++)
+      this.blocks.push(new $(this.tf, o, this.config, this.ropeCache));
+    this.lnF = new _(t, [this.config.nEmbed], 1e-8, "final_rms_norm");
   }
   get variables() {
     return [
@@ -54,17 +54,17 @@ class F {
       this.blocks[e].loadWeights(t);
     this.lnF.setWeights(t.get("final_rms_norm") || []);
   }
-  inputPhase(t, e, s = !1) {
+  inputPhase(t, e, o = !1) {
     return this.tf.tidy(() => {
-      const o = this.wte.embed(t);
+      const i = this.wte.embed(t);
       if (this.config.useRope === !1) {
-        const [, i] = t.shape, a = this.config.blockSize, n = this.tf.range(0, i, 1, "int32"), h = this.tf.mod(
-          this.tf.add(n, this.tf.scalar(e, "int32")),
-          this.tf.scalar(a, "int32")
-        ), c = this.wpe.apply(h), r = o.add(c);
-        return this.drop.apply(r, { training: s });
+        const [, s] = t.shape, r = this.config.blockSize, l = this.tf.range(0, s, 1, "int32"), n = this.tf.mod(
+          this.tf.add(l, this.tf.scalar(e, "int32")),
+          this.tf.scalar(r, "int32")
+        ), h = this.wpe.apply(n), c = i.add(h);
+        return this.drop.apply(c, { training: o });
       } else
-        return this.drop.apply(o, { training: s });
+        return this.drop.apply(i, { training: o });
     });
   }
   setSkipMask(t) {
@@ -95,8 +95,8 @@ class F {
   calculateLoss(t, e) {
     try {
       return this.tf.losses.softmaxCrossEntropy(e, t, this.tf.Reduction.MEAN);
-    } catch (s) {
-      throw console.error("Error computing loss:", s), new Error(`Loss computation failed: ${s}`);
+    } catch (o) {
+      throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
     }
   }
   // Attention rollout per Abnar & Zuidema (2020)
@@ -105,67 +105,88 @@ class F {
     return this.tf.tidy(() => {
       if (t.length === 0)
         throw new Error("No attentions for rollout");
-      const e = t[0].shape[0], s = t[0].shape[1], o = this.tf.eye(s, s).expandDims(0);
-      let i = o.tile([e, 1, 1]);
-      for (const a of t) {
-        let n = a.add(o);
-        n = n.div(n.sum(-1, !0)), i = n.matMul(i);
+      const [e, o, i] = t[0].shape;
+      for (const s of t) {
+        const [r, l, n] = s.shape;
+        if (r !== e || l !== o || n !== i)
+          throw new Error(
+            `Inconsistent attention shapes in rollout: expected [${e},${o},${i}] got [${r},${l},${n}]`
+          );
       }
-      return i;
+      if (o === i) {
+        const s = this.tf.eye(i, i).expandDims(0);
+        let r = s.tile([e, 1, 1]);
+        for (const l of t) {
+          const n = l.add(s);
+          r = n.div(n.sum(-1, !0)).matMul(r);
+        }
+        return r;
+      }
+      if (o === 1) {
+        let s = null;
+        const r = this.tf.tensor1d([i - 1], "int32"), l = this.tf.oneHot(r, i).reshape([1, 1, i]).tile([e, 1, 1]);
+        r.dispose();
+        for (const n of t) {
+          let h = n.add(l);
+          h = h.div(h.sum(-1, !0)), s == null ? s = h : (s = s.mul(h), s = s.div(s.sum(-1, !0)));
+        }
+        return s;
+      }
+      throw new Error(`Unsupported attention shapes for rollout: [B=${e}, Q=${o}, K=${i}]`);
     });
   }
-  forward(t, e, s = !1, o = !1, i) {
+  forward(t, e, o = !1, i = !1, s) {
     return this.validateInput(t), this.tf.tidy(() => {
-      const a = i?.[0]?.length ?? 0;
-      let n = this.inputPhase(t, a, s);
-      const h = [];
-      if (i && i.length !== this.blocks.length)
-        throw console.error("Cache", i), new Error(`Cache length ${i.length} does not match number of blocks ${this.blocks.length}`);
-      for (let l = 0; l < this.blocks.length; l++) {
-        const d = this.blocks[l], {
+      const r = s?.[0]?.length ?? 0;
+      let l = this.inputPhase(t, r, o);
+      const n = [];
+      if (s && s.length !== this.blocks.length)
+        throw console.error("Cache", s), new Error(`Cache length ${s.length} does not match number of blocks ${this.blocks.length}`);
+      for (let a = 0; a < this.blocks.length; a++) {
+        const d = this.blocks[a], {
           output: g,
-          attention: b,
+          attention: m,
           cache: p
-        } = d.call(n, s, o, i ? i[l] : void 0);
-        n = g, o && b && h.push(b), i && p ? (i[l]?.k.dispose(), i[l]?.v.dispose(), i[l] = p) : p && (p.k.dispose(), p.v.dispose());
+        } = d.call(l, o, i, s ? s[a] : void 0);
+        l = g, i && m && n.push(m), s && p ? (s[a]?.k.dispose(), s[a]?.v.dispose(), s[a] = p) : p && (p.k.dispose(), p.v.dispose());
       }
-      let c;
-      o && h.length > 0 && (c = this.computeAttentionRollout(h)), n = this.lnF.apply(n);
-      const r = this.wte.project(n);
+      let h;
+      i && n.length > 0 && (h = this.computeAttentionRollout(n)), l = this.lnF.apply(l);
+      const c = this.wte.project(l);
       let f;
-      return e && (f = this.calculateLoss(r, e)), { logits: r, loss: f, attention: o ? c : void 0 };
+      return e && (f = this.calculateLoss(c, e)), { logits: c, loss: f, attention: i ? h : void 0 };
     });
   }
-  generate(t, e, s) {
-    const o = s?.temperature ?? 1, i = s?.topK, a = s?.usePadding ?? !1, n = s?.includeAttention ?? !1;
+  generate(t, e, o) {
+    const i = o?.temperature ?? 1, s = o?.topK, r = o?.usePadding ?? !1, l = o?.includeAttention ?? !1;
     return this.tf.tidy(() => {
-      const h = t, c = h.shape[1], r = c <= this.config.blockSize ? h : h.slice(
-        [0, c - this.config.blockSize],
-        [h.shape[0], this.config.blockSize]
-      ), f = a ? this.config.blockSize - r.shape[1] : 0, l = f > 0 ? this.tf.pad(r, [
+      const n = t, h = n.shape[1], c = h <= this.config.blockSize ? n : n.slice(
+        [0, h - this.config.blockSize],
+        [n.shape[0], this.config.blockSize]
+      ), f = r ? this.config.blockSize - c.shape[1] : 0, a = f > 0 ? this.tf.pad(c, [
         [0, 0],
         [0, f]
-      ]) : r, { logits: d, attention: g } = this.forward(l, void 0, !1, n, e), b = d.shape[1] - 1 - f, p = d.slice([0, b, 0], [d.shape[0], 1, d.shape[2]]), w = g ? g.slice([0, b, 0], [g.shape[0], 1, g.shape[2]]) : void 0, u = p.div(o);
-      let m;
-      if (i) {
-        const { values: E, indices: y } = this.tf.topk(u, i), z = this.tf.multinomial(E.squeeze([1]), 1);
-        m = this.tf.gather(y.squeeze([1]), z, 1);
+      ]) : c, { logits: d, attention: g } = this.forward(a, void 0, !1, l, e), m = d.shape[1] - 1 - f, p = d.slice([0, m, 0], [d.shape[0], 1, d.shape[2]]), w = g ? g.slice([0, m, 0], [g.shape[0], 1, g.shape[2]]) : void 0, u = p.div(i);
+      let b;
+      if (s) {
+        const { values: E, indices: v } = this.tf.topk(u, s), y = this.tf.multinomial(E.squeeze([1]), 1);
+        b = this.tf.gather(v.squeeze([1]), y, 1);
       } else
-        m = this.tf.multinomial(u.squeeze([1]), 1);
+        b = this.tf.multinomial(u.squeeze([1]), 1);
       let k;
-      return s?.includeProbabilities && (k = this.tf.softmax(u.squeeze([1]))), m = m.reshape([1, 1]), { output: m, attention: w?.squeeze([1]), probabilities: k };
+      return o?.includeProbabilities && (k = this.tf.softmax(u.squeeze([1]))), b = b.reshape([1, 1]), { output: b, attention: w?.squeeze([1]), probabilities: k };
     });
   }
   getNumParams() {
     const t = this.config.vocabSize * this.config.nEmbed + this.config.blockSize * this.config.nEmbed, e = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // qkv + proj
-    2 * this.config.nEmbed), s = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
-    this.config.nEmbed * 4 * this.config.nEmbed), o = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
-    return t + e + s + o;
+    2 * this.config.nEmbed), o = this.config.nLayer * (4 * this.config.nEmbed * this.config.nEmbed + // fc
+    this.config.nEmbed * 4 * this.config.nEmbed), i = this.config.nEmbed + this.config.vocabSize * this.config.nEmbed;
+    return t + e + o + i;
   }
   dispose() {
     this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
   }
 }
 export {
-  F as default
+  M as default
 };

package/dist/TeachableLLM.js CHANGED Viewed

@@ -1,5 +1,5 @@
-import d from "./NanoGPTModel.js";
-import { defaultConfig as u } from "./config.js";
+import { defaultConfig as d } from "./config.js";
+import u from "./NanoGPTModel.js";
 import { saveModel as m } from "./utilities/save.js";
 import { loadModel as l } from "./utilities/load.js";
 import f from "./Generator.js";
@@ -58,7 +58,7 @@ class a extends c {
     }), e;
   }
   static create(t, r = {}) {
-    const e = { ...u, ...r }, s = new g(e.vocabSize), o = new d(t, e), i = new a(t, s, o);
+    const e = { ...d, ...r }, s = new g(e.vocabSize), o = new u(t, e), i = new a(t, s, o);
     return i.setStatus("warmup"), h(o).then(() => {
       i.tokeniser.trained ? i.setStatus("ready") : (i.setStatus("awaitingTokens"), i.tokeniser.once("trainStatus", (n) => {
         n === "trained" && i.setStatus("ready");

package/dist/layers/RoPECache.js CHANGED Viewed

@@ -1,12 +1,14 @@
-class E {
-  constructor(t, c) {
-    this.tf = t, this.config = c;
-    const e = this.config.nEmbed / this.config.nHead;
-    if (this.rotaryDim = e, this.rotaryDim % 2 !== 0)
+class b {
+  constructor(s, r) {
+    this.tf = s, this.config = r;
+    const o = this.config.nEmbed / this.config.nHead;
+    if (this.rotaryDim = o, this.rotaryDim % 2 !== 0)
       throw new Error("rotaryDim must be even");
     this.ropeBase = 1e4;
-    const o = this.tf.range(0, this.rotaryDim, 2, "float32").div(this.tf.scalar(this.rotaryDim, "float32")), s = this.tf.pow(this.tf.scalar(this.ropeBase, "float32"), o);
-    this.ropeInvFreq = this.tf.reciprocal(s), this.config.useRope === !1 ? (this.ropeCos = null, this.ropeSin = null, this.ropeCacheLen = 0) : this.ensureRopeCache(this.config.blockSize * 4);
+    const i = this.tf.range(0, this.rotaryDim, 2, "float32"), t = i.div(this.tf.scalar(this.rotaryDim, "float32")), e = this.tf.pow(this.tf.scalar(this.ropeBase, "float32"), t);
+    this.ropeInvFreq = this.tf.reciprocal(e), t.dispose(), e.dispose(), i.dispose(), this.config.useRope === !1 ? (this.ropeCos = null, this.ropeSin = null, this.ropeCacheLen = 0) : this.tf.tidy(() => {
+      this.ensureRopeCache(this.config.blockSize * 4);
+    });
   }
   rotaryDim;
   ropeBase;
@@ -16,24 +18,27 @@ class E {
   ropeSin = null;
   // [cacheLen, rotaryDim/2]
   ropeCacheLen = 0;
-  ensureRopeCache(t) {
-    if (t <= this.ropeCacheLen) return;
+  ensureRopeCache(s) {
+    if (s <= this.ropeCacheLen) return;
     this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose();
-    const e = this.tf.range(0, t, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
-    this.ropeCos = this.tf.keep(this.tf.cos(e).expandDims(-1)), this.ropeSin = this.tf.keep(this.tf.sin(e).expandDims(-1)), this.ropeCacheLen = t;
+    const o = this.tf.range(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
+    this.ropeCos = this.tf.keep(this.tf.cos(o).expandDims(-1)), this.ropeSin = this.tf.keep(this.tf.sin(o).expandDims(-1)), this.ropeCacheLen = s;
   }
-  applyRoPE(t, c, e) {
-    const h = t.shape[3], o = this.rotaryDim;
-    if (o > h) return [t, c];
-    const s = t.shape[2], S = e + s;
-    this.ensureRopeCache(S);
-    const n = o / 2, g = this.ropeCos.slice([e, 0, 0], [s, n, 1]), v = this.ropeSin.slice([e, 0, 0], [s, n, 1]), l = g.reshape([1, 1, s, n, 1]), f = v.reshape([1, 1, s, n, 1]), p = this.tf.concat([t, c], 0), r = p.shape[0], i = p.shape[1], y = p.slice([0, 0, 0, 0], [r, i, s, o]), u = o < h ? p.slice([0, 0, 0, o], [r, i, s, h - o]) : null, d = y.reshape([r, i, s, n, 2]), m = d.slice([0, 0, 0, 0, 0], [r, i, s, n, 1]), C = d.slice([0, 0, 0, 0, 1], [r, i, s, n, 1]), B = m.mul(l).sub(C.mul(f)), b = C.mul(l).add(m.mul(f)), D = this.tf.concat([B, b], -1).reshape([r, i, s, o]), R = u ? this.tf.concat([D, u], 3) : D, a = r / 2, x = R.slice([0, 0, 0, 0], [a, i, s, h]), P = R.slice([a, 0, 0, 0], [a, i, s, h]);
-    return [x, P];
+  applyRoPE(s, r, o) {
+    const i = s.shape[3], t = this.rotaryDim;
+    if (t > i) return [s, r];
+    const e = s.shape[2], v = o + e;
+    this.ensureRopeCache(v);
+    const n = t / 2, p = this.ropeCos.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), a = this.ropeSin.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), h = s.shape[0], c = s.shape[1], f = this.tf.range(0, t, 2, "int32"), l = this.tf.range(1, t, 2, "int32"), d = (u) => {
+      const m = u.slice([0, 0, 0, 0], [h, c, e, t]), C = t < i ? u.slice([0, 0, 0, t], [h, c, e, i - t]) : null, D = this.tf.gather(m, f, 3), g = this.tf.gather(m, l, 3), x = D.mul(p).sub(g.mul(a)), k = g.mul(p).add(D.mul(a)), R = this.tf.stack([x, k], -1).reshape([h, c, e, t]);
+      return C ? this.tf.concat([R, C], 3) : R;
+    }, y = d(s), S = d(r);
+    return f.dispose(), l.dispose(), [y, S];
   }
   dispose() {
     this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose(), this.ropeInvFreq.dispose();
   }
 }
 export {
-  E as default
+  b as default
 };

package/dist/main.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 export { default as NanoGPT } from './NanoGPTModel';
 export { default as TeachableLLM } from './TeachableLLM';
 export { default as CharTokeniser } from './tokeniser/CharTokeniser';
+export { default as waitForModel } from './utilities/waitForModel';
 export type { ITrainerOptions } from './Trainer';
 export type { IGenerateOptions } from './Generator';
 export type { TrainingLogEntry } from './NanoGPTModel';

package/dist/main.js CHANGED Viewed

@@ -1,8 +1,10 @@
 import { default as o } from "./NanoGPTModel.js";
-import { default as f } from "./TeachableLLM.js";
+import { default as t } from "./TeachableLLM.js";
 import { default as l } from "./tokeniser/CharTokeniser.js";
+import { default as s } from "./utilities/waitForModel.js";
 export {
   l as CharTokeniser,
   o as NanoGPT,
-  f as TeachableLLM
+  t as TeachableLLM,
+  s as waitForModel
 };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@genai-fi/nanogpt",
-    "version": "0.2.0",
+    "version": "0.2.1",
     "type": "module",
     "main": "dist/main.js",
     "types": "dist/main.d.ts",