npm - @genai-fi/nanogpt - Versions diffs - 0.5.0 → 0.5.1 - Mend

@genai-fi/nanogpt 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/Generator.js +18 -18
package/dist/NanoGPTModel.d.ts +2 -2
package/dist/NanoGPTModel.js +68 -66
package/dist/layers/CausalSelfAttention.d.ts +2 -3
package/dist/layers/CausalSelfAttention.js +20 -20
package/dist/utilities/save.js +5 -5
package/package.json +1 -1

package/dist/Generator.js CHANGED Viewed

@@ -14,17 +14,17 @@ class w extends u {
   async generateNoCache(i, t) {
     let s = await this.tokenisePrompt(i), o = i || "";
     const n = t?.maxLength ?? 1e3;
-    for (let a = 0; a < n && this.active; a++) {
+    for (let r = 0; r < n && this.active; r++) {
       const {
         output: e,
-        attention: c,
-        probabilities: l
+        attention: a,
+        probabilities: c
       } = this.model.generate(s, void 0, t), h = s;
       s = p([s, e], 1), h.dispose();
-      const r = await this.processResponse(e, c, l);
-      if (e.dispose(), r === null)
+      const l = await this.processResponse(e, a, c);
+      if (e.dispose(), l === null)
         break;
-      o += r;
+      o += l;
     }
     return s.dispose(), o;
   }
@@ -33,31 +33,31 @@ class w extends u {
     if (o === this.tokeniser.eosToken)
       return null;
     const n = await this.tokeniser.decode([o]);
-    let a;
-    t && (a = await t.array(), t.dispose());
+    let r;
+    t && (r = await Promise.all(t.map((a) => a.array().then((c) => c))), t.forEach((a) => a.dispose()));
     let e;
-    return s && (e = await s.array(), s.dispose()), this.emit("tokens", [o], n, a, e), n;
+    return s && (e = await s.array(), s.dispose()), this.emit("tokens", [o], n, r, e), n;
   }
   async generateCache(i, t) {
     let s = await this.tokenisePrompt(i), o = i || "";
     const n = new Array(this.model.config.gpt.nLayer);
     for (let e = 0; e < this.model.config.gpt.nLayer; e++)
       n[e] = { k: void 0, v: void 0, length: 0, cumulativeLength: 0 };
-    const a = t?.maxLength ?? 1e3;
-    for (let e = 0; e < a && this.active; e++) {
+    const r = t?.maxLength ?? 1e3;
+    for (let e = 0; e < r && this.active; e++) {
       const {
-        output: c,
-        attention: l,
-        probabilities: h
+        output: a,
+        probabilities: c,
+        attention: h
       } = this.model.generate(s, n, {
         ...t,
         usePadding: !1
       });
-      s.dispose(), s = c;
-      const r = await this.processResponse(c, l, h);
-      if (r === null)
+      s.dispose(), s = a;
+      const l = await this.processResponse(a, h, c);
+      if (l === null)
         break;
-      o += r;
+      o += l;
     }
     return n.forEach((e) => {
       e && (e.k && e.k.dispose(), e.v && e.v.dispose());

package/dist/NanoGPTModel.d.ts CHANGED Viewed

@@ -14,7 +14,7 @@ export interface GenerateOptions {
     temperature?: number;
     topK?: number;
     usePadding?: boolean;
-    attentionScores?: AttentionScores;
+    attentionScores?: boolean;
     includeProbabilities?: boolean;
 }
 export interface ModelForwardAttributes extends ForwardAttributes {
@@ -41,8 +41,8 @@ export default class NanoGPT extends BaseLayer<ModelForwardAttributes> {
     forward(attrs: ModelForwardAttributes, idx: Tensor, targets?: Tensor): Tensor[];
     generate(idx: Tensor, cache?: KVCache[], options?: GenerateOptions): {
         output: Tensor;
-        attention?: Tensor;
         probabilities?: Tensor;
+        attention?: Tensor[];
     };
     getNumParams(): number;
     dispose(): void;

package/dist/NanoGPTModel.js CHANGED Viewed

@@ -1,16 +1,16 @@
 import { defaultConfig as L } from "./config.js";
-import q from "./layers/TransformerBlock.js";
-import { E as O, D as T, T as K, r as P, p as _ } from "./TiedEmbedding-DsDRvLB0.js";
+import v from "./layers/TransformerBlock.js";
+import { E as T, D as q, T as K, r as P, p as _ } from "./TiedEmbedding-DsDRvLB0.js";
 import F from "./layers/RoPECache.js";
 import D from "./layers/RMSNorm.js";
-import { estimateParameterCount as N } from "./utilities/parameters.js";
-import { createSoftmaxCrossEntropyWithGrad as R } from "./training/sparseCrossEntropy.js";
-import { B } from "./BaseLayer-BhrMN8JO.js";
-import { o as k, i as m, q as G, E as w, aa as A, ab as V, ac as j, t as b, a9 as W, f as y, F as H } from "./index-iNhkcAEQ.js";
-import { r as $ } from "./reshape-DxTPgnwL.js";
-import { r as J } from "./range-BsFU-SNG.js";
-import { g as Q } from "./gather-Bxe1Qip8.js";
-import { s as U } from "./softmax-BjsptB07.js";
+import { estimateParameterCount as O } from "./utilities/parameters.js";
+import { createSoftmaxCrossEntropyWithGrad as N } from "./training/sparseCrossEntropy.js";
+import { B as R } from "./BaseLayer-BhrMN8JO.js";
+import { o as E, i as d, q as B, E as y, aa as G, ab as V, ac as j, t as w, a9 as A, f as z, F as W } from "./index-iNhkcAEQ.js";
+import { r as C } from "./reshape-DxTPgnwL.js";
+import { r as H } from "./range-BsFU-SNG.js";
+import { g as J } from "./gather-Bxe1Qip8.js";
+import { s as Q } from "./softmax-BjsptB07.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -27,13 +27,13 @@ import { s as U } from "./softmax-BjsptB07.js";
  * limitations under the License.
  * =============================================================================
  */
-function X(h, t) {
-  let e = m(h, "a", "mod"), o = m(t, "b", "mod");
-  [e, o] = G(e, o);
+function U(h, t) {
+  let e = d(h, "a", "mod"), o = d(t, "b", "mod");
+  [e, o] = B(e, o);
   const n = { a: e, b: o };
-  return w.runKernel(A, n);
+  return y.runKernel(G, n);
 }
-const Y = /* @__PURE__ */ k({ mod_: X });
+const X = /* @__PURE__ */ E({ mod_: U });
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -50,17 +50,17 @@ const Y = /* @__PURE__ */ k({ mod_: X });
  * limitations under the License.
  * =============================================================================
  */
-function Z(h, t, e, o = !1) {
-  const n = m(h, "logits", "multinomial"), s = n.size, i = n.rank;
+function Y(h, t, e, o = !1) {
+  const n = d(h, "logits", "multinomial"), s = n.size, i = n.rank;
   if (s < 2)
     throw new Error(`Error in multinomial: you need at least 2 outcomes, but got ${s}.`);
   if (i > 2)
     throw new Error(`Rank of probabilities must be 1 or 2, but is ${i}`);
   e = e || Math.random();
-  const a = { logits: i === 1 ? $(n, [1, -1]) : n }, c = { numSamples: t, seed: e, normalized: o }, l = w.runKernel(V, a, c);
-  return i === 1 ? $(l, [l.size]) : l;
+  const c = { logits: i === 1 ? C(n, [1, -1]) : n }, l = { numSamples: t, seed: e, normalized: o }, a = y.runKernel(V, c, l);
+  return i === 1 ? C(a, [a.size]) : a;
 }
-const z = /* @__PURE__ */ k({ multinomial_: Z });
+const I = /* @__PURE__ */ E({ multinomial_: Y });
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -77,8 +77,8 @@ const z = /* @__PURE__ */ k({ multinomial_: Z });
  * limitations under the License.
  * =============================================================================
  */
-function tt(h, t = 1, e = !0) {
-  const o = m(h, "x", "topk");
+function Z(h, t = 1, e = !0) {
+  const o = d(h, "x", "topk");
   if (o.rank === 0)
     throw new Error("topk() expects the input to be of rank 1 or higher");
   const n = o.shape[o.shape.length - 1];
@@ -86,10 +86,10 @@ function tt(h, t = 1, e = !0) {
     throw new Error(`'k' passed to topk() must be >= 0 but got ${t}`);
   if (t > n)
     throw new Error(`'k' passed to topk() must be <= the last dimension (${n}) but got ${t}`);
-  const s = { x: o }, i = { k: t, sorted: e }, [r, a] = w.runKernel(j, s, i);
-  return { values: r, indices: a };
+  const s = { x: o }, i = { k: t, sorted: e }, [r, c] = y.runKernel(j, s, i);
+  return { values: r, indices: c };
 }
-const et = /* @__PURE__ */ k({ topk_: tt });
+const tt = /* @__PURE__ */ E({ topk_: Z });
 /**
  * @license
  * Copyright 2018 Google LLC
@@ -99,13 +99,13 @@ const et = /* @__PURE__ */ k({ topk_: tt });
  * https://opensource.org/licenses/MIT.
  * =============================================================================
  */
+function et(h) {
+  return new q(h);
+}
 function ot(h) {
   return new T(h);
 }
-function st(h) {
-  return new O(h);
-}
-class bt extends B {
+class dt extends R {
   wte;
   // Token embeddings
   wpe;
@@ -119,14 +119,14 @@ class bt extends B {
   log = [];
   // Training log
   constructor(t = {}) {
-    super({ gpt: { ...L, ...t }, layerConfig: {} }), this.wte = new K(this.config, "token_embedding", this), this.config.gpt.useRope === !1 ? this.wpe = st({
+    super({ gpt: { ...L, ...t }, layerConfig: {} }), this.wte = new K(this.config, "token_embedding", this), this.config.gpt.useRope === !1 ? this.wpe = ot({
       inputDim: this.config.gpt.blockSize,
       outputDim: this.config.gpt.nEmbed,
       name: "positional_embedding",
       embeddingsInitializer: P({ mean: 0, stddev: 0.02 })
-    }) : (this.ropeCache = new F(this.config.gpt), this.config.layerConfig.ropeCache = this.ropeCache), this.drop = ot({ rate: this.config.gpt.dropout }), this.blocks = [];
+    }) : (this.ropeCache = new F(this.config.gpt), this.config.layerConfig.ropeCache = this.ropeCache), this.drop = et({ rate: this.config.gpt.dropout }), this.blocks = [];
     for (let e = 0; e < this.config.gpt.nLayer; e++)
-      this.blocks.push(new q(e, this.config, this));
+      this.blocks.push(new v(e, this.config, this));
     this.lnF = new D(this.config, "final_rms_norm", this);
   }
   get checkpointing() {
@@ -136,11 +136,11 @@ class bt extends B {
     this.config.layerConfig.checkpointing = t;
   }
   inputPhase(t, e, o = !1) {
-    return b(() => {
+    return w(() => {
       const n = this.wte.embed(t);
       if (this.config.gpt.useRope === !1) {
-        const [, s] = t.shape, i = this.config.gpt.blockSize, r = J(0, s, 1, "int32"), a = Y(W(r, y(e, "int32")), y(i, "int32")), c = this.wpe.apply(a), l = n.add(c);
-        return this.drop.apply(l, { training: o });
+        const [, s] = t.shape, i = this.config.gpt.blockSize, r = H(0, s, 1, "int32"), c = X(A(r, z(e, "int32")), z(i, "int32")), l = this.wpe.apply(c), a = n.add(l);
+        return this.drop.apply(a, { training: o });
       } else
         return this.drop.apply(n, { training: o });
     });
@@ -167,7 +167,7 @@ class bt extends B {
   }
   calculateLoss(t, e) {
     try {
-      return R()(t, e).mean();
+      return N()(t, e).mean();
     } catch (o) {
       throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
     }
@@ -205,7 +205,7 @@ class bt extends B {
           });
       }*/
   forward(t, e, o) {
-    return this.validateInput(e), b(() => {
+    return this.validateInput(e), w(() => {
       this.startMemory();
       const n = t.cache?.[0]?.length ?? 0;
       let s = this.inputPhase(e, n, t.training);
@@ -213,59 +213,61 @@ class bt extends B {
         throw console.error("Cache", t.cache), new Error(
           `Cache length ${t.cache.length} does not match number of blocks ${this.blocks.length}`
         );
-      let i;
       for (let c = 0; c < this.blocks.length; c++) {
-        const l = this.blocks[c], f = Math.random() * 1e9, p = {
+        const l = this.blocks[c], a = Math.random() * 1e9, u = {
           training: t.training,
-          seed: f,
+          seed: a,
           attentionScores: t.attentionScores,
           pastKV: t.cache ? t.cache[c] : void 0
-        }, u = this.config.layerConfig.checkpointing && t.training ? l.callCheckpoint(p, s) : l.call(p, s);
-        s.dispose(), s = u, p.attentionScores?.attentionOut && (i = p.attentionScores.attentionOut);
+        }, p = this.config.layerConfig.checkpointing && t.training ? l.callCheckpoint(u, s) : l.call(u, s);
+        s.dispose(), s = p;
       }
       s = this.lnF.call(t, s);
-      const r = this.wte.project(s);
+      const i = this.wte.project(s);
       s.dispose();
-      let a;
-      return o && (a = this.calculateLoss(r, o)), this.endMemory("Forward"), t.attentionScores && (t.attentionScores.attentionOut = i ? H(i) : void 0), a ? [r, a] : [r];
+      let r;
+      return o && (r = this.calculateLoss(i, o)), this.endMemory("Forward"), r ? [i, r] : [i];
     });
   }
   generate(t, e, o) {
     const n = o?.temperature ?? 1, s = o?.topK, i = o?.usePadding ?? !1;
-    return b(() => {
-      const r = t, a = r.shape[1], c = a <= this.config.gpt.blockSize ? r : r.slice(
-        [0, a - this.config.gpt.blockSize],
+    return w(() => {
+      const r = t, c = r.shape[1], l = c <= this.config.gpt.blockSize ? r : r.slice(
+        [0, c - this.config.gpt.blockSize],
         [r.shape[0], this.config.gpt.blockSize]
-      ), l = i ? this.config.gpt.blockSize - c.shape[1] : 0, f = l > 0 ? _(c, [
+      ), a = i ? this.config.gpt.blockSize - l.shape[1] : 0, u = a > 0 ? _(l, [
         [0, 0],
-        [0, l]
-      ]) : c, p = {
+        [0, a]
+      ]) : l, p = {
         training: !1,
-        attentionScores: o?.attentionScores,
+        attentionScores: o?.attentionScores ? {
+          attentionOut: []
+        } : void 0,
         cache: e
-      }, [u] = this.forward(p, f), E = u.shape[1] - 1 - l, C = u.slice([0, E, 0], [u.shape[0], 1, u.shape[2]]), I = p.attentionScores?.attentionOut ? p.attentionScores.attentionOut.slice(
-        [0, E, 0],
-        [p.attentionScores.attentionOut.shape[0], 1, p.attentionScores.attentionOut.shape[2]]
-      ) : void 0;
-      u.dispose();
-      const d = C.div(n);
-      let g;
+      }, [f] = this.forward(p, u), S = f.shape[1] - 1 - a, M = f.slice([0, S, 0], [f.shape[0], 1, f.shape[2]]);
+      p.attentionScores?.attentionOut && p.attentionScores.attentionOut.forEach((g, k) => {
+        g.shape[1] !== 1 && (p.attentionScores.attentionOut[k] = W(
+          g.slice([0, S, 0], [g.shape[0], 1, g.shape[2]])
+        ), g.dispose());
+      }), f.dispose();
+      const b = M.div(n);
+      let m;
       if (s) {
-        const { values: v, indices: M } = et(d, s), x = z(v.squeeze([1]), 1);
-        g = Q(M.squeeze([1]), x, 1);
+        const { values: g, indices: k } = tt(b, s), x = I(g.squeeze([1]), 1);
+        m = J(k.squeeze([1]), x, 1);
       } else
-        g = z(d.squeeze([1]), 1);
-      let S;
-      return o?.includeProbabilities && (S = U(d.squeeze([1]))), g = g.reshape([1, 1]), { output: g, attention: I?.squeeze([1]), probabilities: S };
+        m = I(b.squeeze([1]), 1);
+      let $;
+      return o?.includeProbabilities && ($ = Q(b.squeeze([1]))), m = m.reshape([1, 1]), { output: m, probabilities: $, attention: p.attentionScores?.attentionOut };
     });
   }
   getNumParams() {
-    return N(this.config.gpt);
+    return O(this.config.gpt);
   }
   dispose() {
     this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
   }
 }
 export {
-  bt as default
+  dt as default
 };

package/dist/layers/CausalSelfAttention.d.ts CHANGED Viewed

@@ -7,9 +7,8 @@ export type KVCache = {
     cumulativeLength: number;
 };
 export interface AttentionScores {
-    head: number;
-    block: number;
-    attentionOut?: Tensor;
+    meanOfHeads?: boolean;
+    attentionOut?: Tensor[];
 }
 interface AttentionForwardAttributes extends ForwardAttributes {
     attentionScores?: AttentionScores;

package/dist/layers/CausalSelfAttention.js CHANGED Viewed

@@ -1,10 +1,10 @@
-import { attentionMask as f } from "../ops/attentionMask.js";
-import { B as O, v as V } from "../BaseLayer-BhrMN8JO.js";
+import { attentionMask as g } from "../ops/attentionMask.js";
+import { B as O, v } from "../BaseLayer-BhrMN8JO.js";
 import { qkv as P } from "../ops/qkv.js";
-import { rope as b } from "../ops/rope.js";
-import { appendCache as v } from "../ops/appendCache.js";
+import { rope as V } from "../ops/rope.js";
+import { appendCache as T } from "../ops/appendCache.js";
 import { F as c, t as C } from "../index-iNhkcAEQ.js";
-import { fusedSoftmax as T } from "../ops/fusedSoftmax.js";
+import { fusedSoftmax as b } from "../ops/fusedSoftmax.js";
 import { d as y } from "../tfjs_backend-NucKez4s.js";
 import { r as k, d as L } from "../dropout-kbDY39Ci.js";
 import { r as N } from "../reshape-DxTPgnwL.js";
@@ -22,14 +22,14 @@ class W extends O {
   build() {
     this.hasVariable(this.ATTN) === !1 && this.setVariable(
       this.ATTN,
-      V(
+      v(
         k([this.config.gpt.nEmbed, this.units], 0, 0.02),
         !0
         //`block_${this.index}_attn_cAttn_kernel`
       )
     ), this.hasVariable(this.PROJ) === !1 && this.setVariable(
       this.PROJ,
-      V(
+      v(
         k([this.projUnits, this.config.gpt.nEmbed], 0, 0.02),
         !0
         //`block_${this.index}_attn_cProj_kernel`
@@ -37,12 +37,12 @@ class W extends O {
     );
   }
   getAttentionScores(t, i, s, o) {
-    const e = f(t, i, this.divisor), n = T(e, s ? this.config.gpt.dropout : 0, o);
+    const e = g(t, i, this.divisor), n = b(e, s ? this.config.gpt.dropout : 0, o);
     return e.dispose(), n;
   }
   // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
   getAttentionScoresWithPast(t, i, s) {
-    const o = f(t, i, this.divisor, s), e = T(o, 0, 0);
+    const o = g(t, i, this.divisor, s), e = b(o, 0, 0);
     return o.dispose(), e;
   }
   getQKV(t) {
@@ -53,9 +53,9 @@ class W extends O {
     return n.dispose(), e.dispose(), p;
   }
   updateCache(t, i, s) {
-    const o = this.config.gpt.blockSize, e = t.shape[2], n = s.length || 0, p = v(t, o, n, s.k);
+    const o = this.config.gpt.blockSize, e = t.shape[2], n = s.length || 0, p = T(t, o, n, s.k);
     t.dispose(), s.k && s.k.dispose();
-    const r = v(i, o, n, s.v);
+    const r = T(i, o, n, s.v);
     i.dispose(), s.v && s.v.dispose();
     const d = Math.min(n + e, o), h = s.cumulativeLength + e;
     s.length = d, s.cumulativeLength = h, s.k = c(p), s.v = c(r);
@@ -63,23 +63,23 @@ class W extends O {
   forward(t, i) {
     return C(() => {
       this.startMemory();
-      const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, p = this.config.layerConfig.ropeCache, r = p ? b(s, p, n) : s, d = p ? b(o, p, n) : o;
+      const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, p = this.config.layerConfig.ropeCache, r = p ? V(s, p, n) : s, d = p ? V(o, p, n) : o;
       p && (s.dispose(), o.dispose());
       const h = t.pastKV ? t.pastKV.length : 0;
       t.pastKV && !t.training && this.updateCache(d, e, t.pastKV);
       const u = t.pastKV?.k ? t.pastKV.k : d, l = t.pastKV?.v ? t.pastKV.v : e;
       let a;
       h > 0 ? a = this.getAttentionScoresWithPast(r, u, h) : a = this.getAttentionScores(r, u, t.training, t.seed || 0), r.dispose(), t.pastKV || u.dispose();
-      const m = R(a, l), g = t.attentionScores !== void 0 && t.attentionScores.block === this.index;
-      g || a.dispose(), t.pastKV || l.dispose();
-      const S = this.getOutputProjection(m);
-      if (m.dispose(), g && t.attentionScores && t.attentionScores.head >= 0 && t.attentionScores.head < this.config.gpt.nHead) {
-        const A = a.shape[0], K = a.shape[2];
-        t.attentionScores.attentionOut = c(
-          a.slice([0, t.attentionScores.head, 0, 0], [-1, 1, -1, -1]).reshape([A, K, -1])
+      const m = R(a, l), f = t.attentionScores !== void 0 && t.attentionScores.attentionOut !== void 0;
+      f || a.dispose(), t.pastKV || l.dispose();
+      const A = this.getOutputProjection(m);
+      if (m.dispose(), f && t.attentionScores && t.attentionScores.attentionOut !== void 0) {
+        const K = a.shape[1], S = a.shape[2];
+        t.attentionScores.attentionOut?.push(
+          c(a.slice([0, 0, 0, 0], [1, -1, -1, -1]).reshape([K, S, -1]))
         );
       }
-      return this.endMemory("CausalSelfAttention"), S;
+      return this.endMemory("CausalSelfAttention"), A;
     });
   }
   dropout(t) {

package/dist/utilities/save.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { j as g } from "../jszip.min-CjP2V1VV.js";
 import { exportWeights as l } from "./weights.js";
-import b from "../tokeniser/CharTokeniser.js";
-const p = "1.0.0";
+import p from "../tokeniser/CharTokeniser.js";
+const b = "1.0.0";
 async function h(t, a, i) {
   const c = i?.includeLog ?? !0, f = /* @__PURE__ */ new Map();
   t.saveWeights(f);
@@ -14,8 +14,8 @@ async function h(t, a, i) {
     "manifest.json",
     JSON.stringify({
       weightSpec: r,
-      config: t.config,
-      version: p,
+      config: t.config.gpt,
+      version: b,
       application: "@genai-fi/nanogpt",
       meta: i?.metadata,
       name: i?.name
@@ -26,7 +26,7 @@ async function h(t, a, i) {
   ), e.file(
     "tokeniser.json",
     JSON.stringify({
-      type: a instanceof b ? "char" : "bpe",
+      type: a instanceof p ? "char" : "bpe",
       vocab: a.getVocab(),
       merges: await a.getMerges()
     }),

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@genai-fi/nanogpt",
-    "version": "0.5.0",
+    "version": "0.5.1",
     "type": "module",
     "main": "dist/main.js",
     "types": "dist/main.d.ts",