@genai-fi/nanogpt 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.js +3 -3
- package/dist/NanoGPTModel.js +84 -74
- package/dist/TeachableLLM.js +1 -1
- package/dist/{random_width-CMHmdbSu.js → TiedEmbedding-CnJ1bx4q.js} +760 -719
- package/dist/{axis_util-DeydwOoC.js → axis_util-BgTGy5w8.js} +1 -1
- package/dist/{concat-DS_qH7MI.js → concat-CuRsVY-K.js} +1 -1
- package/dist/dropout-DfDdklfL.js +193 -0
- package/dist/{gather-BUmJIS8n.js → gather-ZYRWhmXR.js} +1 -1
- package/dist/gelu-CnCt17Lk.js +26 -0
- package/dist/{index-XjBAhiFO.js → index-C4JCoBvj.js} +61 -61
- package/dist/kernel_funcs_utils-CAd1h9X1.js +388 -0
- package/dist/layers/CausalSelfAttention.js +74 -73
- package/dist/layers/MLP.d.ts +3 -1
- package/dist/layers/MLP.js +93 -5
- package/dist/layers/RMSNorm.js +3 -3
- package/dist/layers/RoPECache.js +3 -3
- package/dist/layers/TiedEmbedding.js +6 -46
- package/dist/layers/TransformerBlock.js +2 -2
- package/dist/{log_sum_exp-DJPkVZZn.js → log_sum_exp-BswFnwOb.js} +5 -5
- package/dist/main.js +1 -1
- package/dist/{mat_mul-CKwFEV1Q.js → mat_mul-415y5Qn2.js} +1 -1
- package/dist/{max-DJvEiCAJ.js → max-CP_9O2Yd.js} +1 -1
- package/dist/{moments-CrWRPcR3.js → moments-CjeIaVdp.js} +3 -3
- package/dist/{norm-BzY929B_.js → norm-CZM380I3.js} +5 -5
- package/dist/{ones-BO01zpJG.js → ones-Bf3YR48P.js} +2 -2
- package/dist/ops/appendCache.d.ts +1 -1
- package/dist/ops/appendCache.js +10 -4
- package/dist/ops/attentionMask.d.ts +1 -1
- package/dist/ops/attentionMask.js +4 -4
- package/dist/ops/cpu/appendCache.d.ts +1 -2
- package/dist/ops/cpu/appendCache.js +15 -20
- package/dist/ops/cpu/attentionMask.js +15 -11
- package/dist/ops/cpu/fusedSoftmax.js +2 -2
- package/dist/ops/cpu/gatherSub.js +3 -3
- package/dist/ops/cpu/gelu.d.ts +1 -0
- package/dist/ops/cpu/gelu.js +40 -0
- package/dist/ops/cpu/mulDropout.js +1 -1
- package/dist/ops/cpu/qkv.js +3 -3
- package/dist/ops/cpu/rope.js +5 -5
- package/dist/ops/cpu/scatterSub.js +4 -4
- package/dist/ops/fusedSoftmax.js +1 -1
- package/dist/ops/gatherSub.js +1 -1
- package/dist/ops/gelu.d.ts +3 -0
- package/dist/ops/gelu.js +8 -0
- package/dist/ops/grads/attentionMask.js +1 -1
- package/dist/ops/grads/fusedSoftmax.js +2 -2
- package/dist/ops/grads/gelu.d.ts +2 -0
- package/dist/ops/grads/gelu.js +5 -0
- package/dist/ops/grads/qkv.js +1 -1
- package/dist/ops/grads/rope.js +1 -1
- package/dist/ops/mulDrop.js +1 -1
- package/dist/ops/node/sparseCrossEntropy.js +1 -1
- package/dist/ops/qkv.js +1 -1
- package/dist/ops/scatterSub.js +1 -1
- package/dist/ops/webgl/appendCache.js +14 -13
- package/dist/ops/webgl/attentionMask.js +19 -18
- package/dist/ops/webgl/fusedSoftmax.js +483 -782
- package/dist/ops/webgl/gatherSub.js +1 -1
- package/dist/ops/webgl/gelu.d.ts +2 -0
- package/dist/ops/webgl/gelu.js +50 -0
- package/dist/ops/webgl/mulDropout.js +1 -1
- package/dist/ops/webgl/qkv.js +1 -1
- package/dist/ops/webgl/rope.js +1 -1
- package/dist/ops/webgl/scatterSub.js +1 -1
- package/dist/{range-DQMNzBWs.js → range-9AzeApCc.js} +1 -1
- package/dist/{reshape-DFzh97Sc.js → reshape-Boe4DuIO.js} +1 -1
- package/dist/{sin-BYM-U4Ut.js → sin-KmhiDuMa.js} +1 -1
- package/dist/{slice_util-CnVNPQI-.js → slice_util-19zDNNSn.js} +2 -2
- package/dist/{softmax-4DOn6cPq.js → softmax-Cujsg4ay.js} +1 -1
- package/dist/{split-CkbeVdF8.js → split-DbcNm1-i.js} +1 -1
- package/dist/{stack-DaIMO5iX.js → stack-D1YjmgKN.js} +1 -1
- package/dist/{sum-C6u3xMi3.js → sum-R28pucR5.js} +1 -1
- package/dist/{tensor-Cu1fU7H7.js → tensor-BVeHdl7V.js} +1 -1
- package/dist/{tensor2d-D0CKdG6B.js → tensor2d-DqFGNs_K.js} +1 -1
- package/dist/{tfjs_backend-Bzl2SrRo.js → tfjs_backend-Cug-PH75.js} +826 -1015
- package/dist/training/AdamExt.js +1 -1
- package/dist/training/DatasetBuilder.js +3 -3
- package/dist/training/FullTrainer.js +1 -1
- package/dist/training/Trainer.js +5 -5
- package/dist/training/sparseCrossEntropy.js +4 -4
- package/dist/utilities/dummy.js +2 -2
- package/dist/utilities/generate.js +3 -3
- package/dist/utilities/load.js +1 -1
- package/dist/utilities/profile.js +1 -1
- package/dist/utilities/weights.js +2 -2
- package/dist/{variable-BS4AKqNU.js → variable-LJT9Ld63.js} +1 -1
- package/dist/{zeros-CmJFiC84.js → zeros-dnQxFgAD.js} +1 -1
- package/package.json +1 -1
- package/dist/MLP-KHhikThU.js +0 -83
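
The functional heart of this release is visible in the file list: a dedicated GELU op is registered across the generic dispatcher (ops/gelu.js), the CPU backend (ops/cpu/gelu.js), the WebGL backend (ops/webgl/gelu.js), and the gradient registry (ops/grads/gelu.js); a shared dropout chunk is split out; layers/MLP.js absorbs the deleted MLP-KHhikThU.js chunk; the remaining hash changes are chunk renames from the rebuild. The diff records only file sizes for the new kernels, not their bodies, so the following is only a sketch of the tanh-approximation GELU that such an op typically computes, written against plain @tensorflow/tfjs; the name geluTanh is illustrative, and the package may implement the exact erf form instead.

import * as tf from "@tensorflow/tfjs";

// Tanh-approximation GELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).
// Illustrative only; the real op in this package registers per-backend kernels.
const SQRT_2_OVER_PI = Math.sqrt(2 / Math.PI);

function geluTanh(x: tf.Tensor): tf.Tensor {
  return tf.tidy(() => {
    const cubic = tf.mul(0.044715, tf.pow(x, 3)); // 0.044715 * x^3
    const inner = tf.mul(SQRT_2_OVER_PI, tf.add(x, cubic));
    return tf.mul(0.5, tf.mul(x, tf.add(1, tf.tanh(inner))));
  });
}

That a separate ops/grads/gelu.js is shipped suggests a hand-written backward pass rather than autodiff through a composite of primitive ops, presumably to fuse the kernel on each backend.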

package/dist/Generator.js
CHANGED

@@ -1,7 +1,7 @@
 import { E as u } from "./index-Dwqa6Zy2.js";
-import "./index-XjBAhiFO.js";
-import { t as d } from "./tensor2d-D0CKdG6B.js";
-import { c as p } from "./concat-DS_qH7MI.js";
+import "./index-C4JCoBvj.js";
+import { t as d } from "./tensor2d-DqFGNs_K.js";
+import { c as p } from "./concat-CuRsVY-K.js";
 class w extends u {
   constructor(s, e) {
     super(), this.model = s, this.tokeniser = e;

package/dist/NanoGPTModel.js
CHANGED

@@ -1,19 +1,17 @@
 import { defaultConfig as x } from "./config.js";
 import W from "./layers/TransformerBlock.js";
-import F from "./
-import
-import
-import { estimateParameterCount as
-import { createSoftmaxCrossEntropyWithGrad as
-import
-import {
-import {
-import {
-import {
-import {
-import {
-import { g as H } from "./gather-BUmJIS8n.js";
-import { s as J } from "./softmax-4DOn6cPq.js";
+import { E as F, D as P, T as q, r as T, p as D } from "./TiedEmbedding-CnJ1bx4q.js";
+import K from "./layers/RoPECache.js";
+import N from "./layers/RMSNorm.js";
+import { estimateParameterCount as R } from "./utilities/parameters.js";
+import { createSoftmaxCrossEntropyWithGrad as A } from "./training/sparseCrossEntropy.js";
+import B from "./layers/BaseLayer.js";
+import { o as $, h as E, p as G, E as v, a6 as O, a7 as j, a8 as Q, t as w, a5 as V, f as C } from "./index-C4JCoBvj.js";
+import { r as _ } from "./reshape-Boe4DuIO.js";
+import { r as X } from "./range-9AzeApCc.js";
+import { e as H } from "./tfjs_backend-Cug-PH75.js";
+import { g as J } from "./gather-ZYRWhmXR.js";
+import { s as U } from "./softmax-Cujsg4ay.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -30,13 +28,13 @@ import { s as J } from "./softmax-4DOn6cPq.js";
  * limitations under the License.
  * =============================================================================
  */
-function Y(
-  let e =
-  [e, o] =
+function Y(c, t) {
+  let e = E(c, "a", "mod"), o = E(t, "b", "mod");
+  [e, o] = G(e, o);
   const i = { a: e, b: o };
-  return v.runKernel(
+  return v.runKernel(O, i);
 }
-const Z = /* @__PURE__ */ y({ mod_: Y });
+const Z = /* @__PURE__ */ $({ mod_: Y });
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -53,17 +51,17 @@ const Z = /* @__PURE__ */ y({ mod_: Y });
  * limitations under the License.
  * =============================================================================
  */
-function tt(
-  const i =
+function tt(c, t, e, o = !1) {
+  const i = E(c, "logits", "multinomial"), s = i.size, l = i.rank;
   if (s < 2)
     throw new Error(`Error in multinomial: you need at least 2 outcomes, but got ${s}.`);
-  if (
-    throw new Error(`Rank of probabilities must be 1 or 2, but is ${
+  if (l > 2)
+    throw new Error(`Rank of probabilities must be 1 or 2, but is ${l}`);
   e = e || Math.random();
-  const n = { logits:
-  return
+  const n = { logits: l === 1 ? _(i, [1, -1]) : i }, h = { numSamples: t, seed: e, normalized: o }, a = v.runKernel(j, n, h);
+  return l === 1 ? _(a, [a.size]) : a;
 }
-const M = /* @__PURE__ */ y({ multinomial_: tt });
+const M = /* @__PURE__ */ $({ multinomial_: tt });
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -80,8 +78,8 @@ const M = /* @__PURE__ */ y({ multinomial_: tt });
  * limitations under the License.
  * =============================================================================
  */
-function et(f, t = 1, e = !0) {
-  const o =
+function et(c, t = 1, e = !0) {
+  const o = E(c, "x", "topk");
   if (o.rank === 0)
     throw new Error("topk() expects the input to be of rank 1 or higher");
   const i = o.shape[o.shape.length - 1];
@@ -89,11 +87,26 @@ function et(f, t = 1, e = !0) {
     throw new Error(`'k' passed to topk() must be >= 0 but got ${t}`);
   if (t > i)
     throw new Error(`'k' passed to topk() must be <= the last dimension (${i}) but got ${t}`);
-  const s = { x: o },
-  return { values:
+  const s = { x: o }, l = { k: t, sorted: e }, [r, n] = v.runKernel(Q, s, l);
+  return { values: r, indices: n };
 }
-const ot = /* @__PURE__ */
-class wt extends T {
+const ot = /* @__PURE__ */ $({ topk_: et });
+/**
+ * @license
+ * Copyright 2018 Google LLC
+ *
+ * Use of this source code is governed by an MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT.
+ * =============================================================================
+ */
+function st(c) {
+  return new P(c);
+}
+function it(c) {
+  return new F(c);
+}
+class wt extends B {
   wte;
   // Token embeddings
   wpe;
@@ -107,19 +120,19 @@ class wt extends T {
   log = [];
   // Training log
   constructor(t = {}) {
-    super({ gpt: { ...x, ...t }, layerConfig: {} }), this.wte = new
+    super({ gpt: { ...x, ...t }, layerConfig: {} }), this.wte = new q({
       vocabSize: this.config.gpt.vocabSize,
      embedDim: this.config.gpt.nEmbed,
      name: "token_embedding"
-    }), this.config.gpt.useRope === !1 ? this.wpe =
+    }), this.config.gpt.useRope === !1 ? this.wpe = it({
      inputDim: this.config.gpt.blockSize,
      outputDim: this.config.gpt.nEmbed,
      name: "positional_embedding",
-      embeddingsInitializer:
-    }) : (this.ropeCache = new
+      embeddingsInitializer: T({ mean: 0, stddev: 0.02 })
+    }) : (this.ropeCache = new K(this.config.gpt), this.config.layerConfig.ropeCache = this.ropeCache), this.drop = st({ rate: this.config.gpt.dropout }), this.blocks = [];
     for (let e = 0; e < this.config.gpt.nLayer; e++)
       this.blocks.push(new W(e, this.config));
-    this.lnF = new
+    this.lnF = new N(this.config, 1e-8, "final_rms_norm");
   }
   get checkpointing() {
     return this.config.layerConfig.checkpointAttention === !0 || this.config.layerConfig.checkpointMLP === !0;
@@ -152,8 +165,8 @@ class wt extends T {
     return w(() => {
       const i = this.wte.embed(t);
       if (this.config.gpt.useRope === !1) {
-        const [, s] = t.shape,
-        return this.drop.apply(
+        const [, s] = t.shape, l = this.config.gpt.blockSize, r = X(0, s, 1, "int32"), n = Z(V(r, C(e, "int32")), C(l, "int32")), h = this.wpe.apply(n), a = i.add(h);
+        return this.drop.apply(a, { training: o });
       } else
         return this.drop.apply(i, { training: o });
     });
@@ -185,7 +198,7 @@ class wt extends T {
   }
   calculateLoss(t, e) {
     try {
-      return
+      return A()(t, e).mean();
     } catch (o) {
       throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
     }
@@ -197,70 +210,67 @@ class wt extends T {
       if (t.length === 0)
        throw new Error("No attentions for rollout");
      const [e, o, i] = t[0].shape;
-      for (const
-        const [
-        if (
+      for (const n of t) {
+        const [h, a, p] = n.shape;
+        if (h !== e || a !== o || p !== i)
          throw new Error(
-            `Inconsistent attention shapes in rollout: expected [${e},${o},${i}] got [${
+            `Inconsistent attention shapes in rollout: expected [${e},${o},${i}] got [${h},${a},${p}]`
          );
      }
-
-
-
-
-
-        r = n.div(n.sum(-1, !0)).matMul(r);
-      }
-      return r;
+      const s = t.map((n) => n.slice([0, 0, 0], [e, o, o])), l = H(o, o).expandDims(0);
+      let r = l.tile([e, 1, 1]);
+      for (const n of s) {
+        const h = n.add(l);
+        r = h.div(h.sum(-1, !0)).matMul(r);
      }
-
+      return r;
    });
  }
  forward(t, e, o = !1, i = !1, s) {
    return this.validateInput(t), w(() => {
      this.startMemory();
-      const
-      let
+      const l = s?.[0]?.length ?? 0;
+      let r = this.inputPhase(t, l, o);
      const n = [];
      if (s && s.length !== this.blocks.length)
        throw console.error("Cache", s), new Error(`Cache length ${s.length} does not match number of blocks ${this.blocks.length}`);
-      for (let
-        const
+      for (let g = 0; g < this.blocks.length; g++) {
+        const u = r, m = this.blocks[g], {
          output: b,
          attention: k,
-          cache:
-        } =
-
+          cache: f
+        } = m.call(r, o, i, s ? s[g] : void 0);
+        r = b, u.dispose(), i && k && n.push(k), s && f ? (s[g]?.k.dispose(), s[g]?.v.dispose(), s[g] = f) : f && (f.k.dispose(), f.v.dispose());
      }
      let h;
-      i && n.length > 0 && (h = this.computeAttentionRollout(n)),
-      const
-      let
-      return e && (
+      i && n.length > 0 && (h = this.computeAttentionRollout(n)), r = this.lnF.apply(r);
+      const a = this.wte.project(r);
+      let p;
+      return e && (p = this.calculateLoss(a, e)), this.endMemory("Forward"), { logits: a, loss: p, attention: i ? h : void 0 };
    });
  }
  generate(t, e, o) {
-    const i = o?.temperature ?? 1, s = o?.topK,
+    const i = o?.temperature ?? 1, s = o?.topK, l = o?.usePadding ?? !1, r = o?.includeAttention ?? !1;
    return w(() => {
-      const n = t, h = n.shape[1],
+      const n = t, h = n.shape[1], a = h <= this.config.gpt.blockSize ? n : n.slice(
        [0, h - this.config.gpt.blockSize],
        [n.shape[0], this.config.gpt.blockSize]
-      ),
+      ), p = l ? this.config.gpt.blockSize - a.shape[1] : 0, g = p > 0 ? D(a, [
        [0, 0],
-        [0,
-      ]) :
+        [0, p]
+      ]) : a, { logits: u, attention: m } = this.forward(g, void 0, !1, r, e), b = u.shape[1] - 1 - p, k = u.slice([0, b, 0], [u.shape[0], 1, u.shape[2]]), f = m ? m.slice([0, b, 0], [m.shape[0], 1, m.shape[2]]) : void 0, y = k.div(i);
      let d;
      if (s) {
-        const { values: S, indices: I } = ot(
-        d =
+        const { values: S, indices: I } = ot(y, s), L = M(S.squeeze([1]), 1);
+        d = J(I.squeeze([1]), L, 1);
      } else
-        d = M(
+        d = M(y.squeeze([1]), 1);
      let z;
-      return o?.includeProbabilities && (z =
+      return o?.includeProbabilities && (z = U(y.squeeze([1]))), d = d.reshape([1, 1]), { output: d, attention: f?.squeeze([1]), probabilities: z };
    });
  }
  getNumParams() {
-    return
+    return R(this.config.gpt);
  }
  dispose() {
    this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
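
The most substantive rewrite in NanoGPTModel.js is computeAttentionRollout (new lines 213-226 above): each layer's attention map is sliced square, mixed with the identity matrix, row-normalised, and composed across layers by matrix multiplication, which is the attention-rollout recipe of Abnar & Zuidema (2020) for tracing attention through residual connections. A de-minified sketch follows, assuming the `e` imported from the tfjs_backend chunk is tf.eye (which matches its `H(o, o)` call site); attentionRollout is an illustrative name, not the package's export.

import * as tf from "@tensorflow/tfjs";

// Attention rollout: account for residual connections by averaging each
// layer's attention with the identity, then chaining batched matmuls.
function attentionRollout(attentions: tf.Tensor3D[]): tf.Tensor {
  return tf.tidy(() => {
    const [batch, rows] = attentions[0].shape;
    // Slice each map square ([batch, T, T]); cached generation can leave more key columns than query rows.
    const square = attentions.map((a) => a.slice([0, 0, 0], [batch, rows, rows]));
    const identity = tf.eye(rows, rows).expandDims(0);
    let rollout = identity.tile([batch, 1, 1]);
    for (const att of square) {
      const mixed = att.add(identity); // A + I: keep the residual path
      rollout = mixed.div(mixed.sum(-1, true)).matMul(rollout); // row-normalise, then compose
    }
    return rollout;
  });
}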

package/dist/TeachableLLM.js
CHANGED

@@ -11,7 +11,7 @@ import g from "./tokeniser/bpe.js";
 import "./papaparse.min-C8l2Kvo1.js";
 import "./index-Tf7vU29b.js";
 import "./jszip.min-CjP2V1VV.js";
-import "./index-XjBAhiFO.js";
+import "./index-C4JCoBvj.js";
 import "./ops/cpu/scatterSub.js";
 import "./ops/webgl/scatterSub.js";
 import "./ops/cpu/gatherSub.js";
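
Back in NanoGPTModel.js, the other behavioural change is the sampling tail of generate() (new lines 253-269): logits at the last real position are divided by temperature, optionally restricted to the top-k entries, and one token is drawn with multinomial; under top-k the draw picks a position within the top-k list, which is then gathered back to vocabulary ids. A minimal sketch of that tail, assuming plain @tensorflow/tfjs ops in place of the package's fused kernels; sampleNext and its signature are illustrative, not the package's API.

import * as tf from "@tensorflow/tfjs";

// Temperature + optional top-k sampling of the next token, one per batch row.
function sampleNext(lastLogits: tf.Tensor2D, temperature = 1, topK?: number): tf.Tensor {
  return tf.tidy(() => {
    const scaled = lastLogits.div<tf.Tensor2D>(temperature); // [batch, vocab]
    if (!topK)
      return tf.multinomial(scaled, 1).squeeze([1]); // [batch] token ids
    const { values, indices } = tf.topk(scaled, topK); // both [batch, k]
    const pick = tf.multinomial(values, 1); // [batch, 1] positions within the top-k
    return tf.gather(indices, pick, 1, 1).squeeze([1]); // batched gather back to vocab ids: [batch]
  });
}

Note that the diff also indexes the logits at `u.shape[1] - 1 - p`, i.e. the last unpadded position, so that the new usePadding option (padding the context to a fixed blockSize) still samples from the final real token.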