@genai-fi/nanogpt 0.4.5 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. package/dist/BaseLayer-BhrMN8JO.js +135 -0
  2. package/dist/Generator.js +52 -49
  3. package/dist/NanoGPTModel.d.ts +13 -17
  4. package/dist/NanoGPTModel.js +128 -136
  5. package/dist/{Reshape-CiAY8ltP.js → Reshape-BE5rA4rT.js} +8 -8
  6. package/dist/TeachableLLM.js +1 -1
  7. package/dist/{TiedEmbedding-DznFwzcB.js → TiedEmbedding-DsDRvLB0.js} +751 -768
  8. package/dist/{axis_util-QP0LdI1v.js → axis_util-97KkkyRQ.js} +1 -1
  9. package/dist/broadcast_to-CMlkG8NS.js +44 -0
  10. package/dist/{concat-DvWM7HGZ.js → concat-Cxbo2sOz.js} +3 -3
  11. package/dist/{dropout-DFEXTPV0.js → dropout-kbDY39Ci.js} +1 -1
  12. package/dist/{gather-C5D8PxwA.js → gather-Bxe1Qip8.js} +4 -4
  13. package/dist/{gpgpu_math-CUzjlO9A.js → gpgpu_math-C0zyxKFi.js} +1 -1
  14. package/dist/{index--6vO-cOz.js → index-iNhkcAEQ.js} +82 -82
  15. package/dist/{kernel_funcs_utils-C6YBCuOt.js → kernel_funcs_utils-C4eIk4fE.js} +20 -20
  16. package/dist/layers/BaseLayer.d.ts +28 -4
  17. package/dist/layers/BaseLayer.js +3 -16
  18. package/dist/layers/CausalSelfAttention.d.ts +21 -24
  19. package/dist/layers/CausalSelfAttention.js +73 -128
  20. package/dist/layers/MLP.d.ts +8 -15
  21. package/dist/layers/MLP.js +43 -81
  22. package/dist/layers/RMSNorm.d.ts +5 -10
  23. package/dist/layers/RMSNorm.js +13 -29
  24. package/dist/layers/RoPECache.js +14 -12
  25. package/dist/layers/TiedEmbedding.d.ts +6 -16
  26. package/dist/layers/TiedEmbedding.js +5 -5
  27. package/dist/layers/TransformerBlock.d.ts +12 -16
  28. package/dist/layers/TransformerBlock.js +20 -41
  29. package/dist/{log_sum_exp-CiEy1aUe.js → log_sum_exp-CkumwesB.js} +11 -11
  30. package/dist/main.js +1 -1
  31. package/dist/{mat_mul-BEHRPMh0.js → mat_mul-D0SifYfJ.js} +3 -3
  32. package/dist/{max-BUShNgfh.js → max-CYaAjEEp.js} +3 -3
  33. package/dist/{moments-DYOHXoRV.js → moments-B06NlR_V.js} +6 -6
  34. package/dist/{norm-DSva3hI3.js → norm-D3676xIo.js} +7 -7
  35. package/dist/{ones-D6kB8bdY.js → ones-BIeFnPHR.js} +2 -2
  36. package/dist/ops/appendCache.js +4 -4
  37. package/dist/ops/attentionMask.d.ts +1 -1
  38. package/dist/ops/attentionMask.js +4 -4
  39. package/dist/ops/cpu/appendCache.js +2 -2
  40. package/dist/ops/cpu/attentionMask.js +14 -15
  41. package/dist/ops/cpu/fusedSoftmax.js +2 -2
  42. package/dist/ops/cpu/gatherSub.js +5 -5
  43. package/dist/ops/cpu/gelu.js +1 -1
  44. package/dist/ops/cpu/matMulGelu.js +1 -1
  45. package/dist/ops/cpu/matMulMul.d.ts +1 -0
  46. package/dist/ops/cpu/matMulMul.js +17 -0
  47. package/dist/ops/cpu/mulDropout.js +1 -1
  48. package/dist/ops/cpu/normRMS.js +1 -1
  49. package/dist/ops/cpu/qkv.js +3 -3
  50. package/dist/ops/cpu/rope.js +5 -5
  51. package/dist/ops/cpu/scatterSub.js +8 -8
  52. package/dist/ops/fusedSoftmax.js +1 -1
  53. package/dist/ops/gatherSub.js +1 -1
  54. package/dist/ops/gelu.js +1 -1
  55. package/dist/ops/grads/attentionMask.js +13 -9
  56. package/dist/ops/grads/fusedSoftmax.js +12 -9
  57. package/dist/ops/grads/gelu.js +1 -1
  58. package/dist/ops/grads/matMulGelu.js +1 -1
  59. package/dist/ops/grads/normRMS.js +1 -1
  60. package/dist/ops/grads/qkv.js +19 -9
  61. package/dist/ops/grads/rope.js +1 -1
  62. package/dist/ops/matMulGelu.js +1 -1
  63. package/dist/ops/matMulMul.d.ts +2 -0
  64. package/dist/ops/matMulMul.js +9 -0
  65. package/dist/ops/mulDrop.js +1 -1
  66. package/dist/ops/node/sparseCrossEntropy.js +1 -1
  67. package/dist/ops/normRMS.js +1 -1
  68. package/dist/ops/qkv.js +1 -1
  69. package/dist/ops/scatterSub.js +1 -1
  70. package/dist/ops/webgl/appendCache.js +1 -1
  71. package/dist/ops/webgl/attentionMask.js +13 -12
  72. package/dist/ops/webgl/fusedSoftmax.js +43 -40
  73. package/dist/ops/webgl/gatherSub.js +1 -1
  74. package/dist/ops/webgl/gelu.js +2 -2
  75. package/dist/ops/webgl/matMulGelu.js +17 -17
  76. package/dist/ops/webgl/matMulMul.d.ts +14 -0
  77. package/dist/ops/webgl/matMulMul.js +28 -0
  78. package/dist/ops/webgl/mulDropout.js +1 -1
  79. package/dist/ops/webgl/normRMS.js +29 -21
  80. package/dist/ops/webgl/qkv.js +1 -1
  81. package/dist/ops/webgl/rope.js +1 -1
  82. package/dist/ops/webgl/scatterSub.js +1 -1
  83. package/dist/ops-ObfXLHYQ.js +1269 -0
  84. package/dist/{range-C_vpUjBu.js → range-BsFU-SNG.js} +1 -1
  85. package/dist/{reshape-z51Eu-re.js → reshape-DxTPgnwL.js} +3 -3
  86. package/dist/{sin-H567uayl.js → sin-BOX-JVAj.js} +5 -5
  87. package/dist/slice_util-D-kaD4ZV.js +49 -0
  88. package/dist/{softmax-Dsxflvdl.js → softmax-BjsptB07.js} +2 -2
  89. package/dist/{split-B_k_jwud.js → split-BCbrzthj.js} +4 -4
  90. package/dist/{stack-CmqSdsfs.js → stack--cqr9Dgc.js} +2 -2
  91. package/dist/{sum-DdkDf2MG.js → sum-B_92TaHD.js} +5 -5
  92. package/dist/{tensor-BGYi41cj.js → tensor-CfiPXsW4.js} +1 -1
  93. package/dist/{tensor2d-DUr_htjt.js → tensor2d-tSxWdFMH.js} +1 -1
  94. package/dist/tfjs_backend-NucKez4s.js +1010 -0
  95. package/dist/training/AdamExt.js +1 -1
  96. package/dist/training/DatasetBuilder.js +44 -44
  97. package/dist/training/Evaluator.js +6 -6
  98. package/dist/training/FullTrainer.js +1 -1
  99. package/dist/training/Trainer.js +7 -7
  100. package/dist/training/sparseCrossEntropy.js +4 -4
  101. package/dist/utilities/dummy.js +10 -10
  102. package/dist/utilities/generate.js +3 -3
  103. package/dist/utilities/load.js +1 -1
  104. package/dist/utilities/profile.js +1 -1
  105. package/dist/utilities/save.js +13 -11
  106. package/dist/utilities/weights.js +2 -2
  107. package/dist/{zeros-8xl-W2DC.js → zeros-NMYTayy7.js} +3 -3
  108. package/package.json +1 -1
  109. package/dist/slice_util-BdhYwFY_.js +0 -90
  110. package/dist/tfjs_backend-DuKis_xG.js +0 -2271
  111. package/dist/variable-BJTZ3jOy.js +0 -23
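Most of the churn above is one refactor: per-layer bookkeeping (the `variables` getter, the `trainable` accessors, `saveWeights`/`loadWeights`, `dispose`) moves out of the individual layers into a shared chunk (`BaseLayer-BhrMN8JO.js`), weights are registered by name via `addVariable`/`getVariable` instead of being held as fields, the old `call(x, training, ...)`/`apply(x)` entry points collapse into a single `forward(attrs, x)` that takes a `ForwardAttributes` object, and the gradient-checkpointing `customGrad` paths are dropped. The remaining renames (e.g. `index--6vO-cOz.js` → `index-iNhkcAEQ.js`) are bundler hash changes. Pieced together from the `.d.ts` diffs below, the new layer contract looks roughly like the following sketch — a reconstruction for orientation, not the library's source; the `Map`-backed store, the method bodies, and the `training` field (inferred from `t.training` in the minified code) are assumptions:

    import { Tensor, Variable } from '@tensorflow/tfjs-core';

    // Layers now receive one attributes object instead of positional
    // training/seed/cache arguments (inferred from the declaration diffs).
    export interface ForwardAttributes {
      training?: boolean;
    }

    export abstract class BaseLayer<A extends ForwardAttributes = ForwardAttributes> {
      // Named variable registry replacing per-layer fields (cAttn, cProj, gamma, ...).
      private vars = new Map<string, Variable | null>();

      constructor(protected config: unknown, protected parent?: BaseLayer) {}

      protected addVariable(name: string, v: Variable | null = null): void {
        this.vars.set(name, v); // declare the slot; build() may fill it later
      }
      protected hasVariable(name: string): boolean {
        return this.vars.get(name) != null;
      }
      protected setVariable(name: string, v: Variable): void {
        this.vars.set(name, v);
      }
      protected getVariable(name: string): Variable {
        const v = this.vars.get(name);
        if (!v) throw new Error(`Variable ${name} not built`);
        return v;
      }

      // The single entry point every layer now implements.
      abstract forward(attrs: A, x: Tensor): Tensor;
    }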
package/dist/layers/CausalSelfAttention.js
@@ -1,150 +1,95 @@
- import { attentionMask as P } from "../ops/attentionMask.js";
- import T from "./BaseLayer.js";
- import { qkv as y } from "../ops/qkv.js";
- import { rope as w } from "../ops/rope.js";
- import { appendCache as E } from "../ops/appendCache.js";
- import { D as z, F as S, t as $, c as L, e as j, H as O } from "../index--6vO-cOz.js";
- import { fusedSoftmax as _ } from "../ops/fusedSoftmax.js";
- import { l as W, w as M, d as x } from "../tfjs_backend-DuKis_xG.js";
- import { o as q } from "../ones-D6kB8bdY.js";
- import { v as b } from "../variable-BJTZ3jOy.js";
- import { z as B } from "../zeros-8xl-W2DC.js";
- import { r as C, d as I } from "../dropout-DFEXTPV0.js";
- import { r as F } from "../reshape-z51Eu-re.js";
- import { m as H } from "../mat_mul-BEHRPMh0.js";
- class nt extends T {
- cAttn = null;
- cProj = null;
- bias;
- maskInf;
+ import { attentionMask as g } from "../ops/attentionMask.js";
+ import { B as O, v } from "../BaseLayer-BhrMN8JO.js";
+ import { qkv as P } from "../ops/qkv.js";
+ import { rope as V } from "../ops/rope.js";
+ import { appendCache as T } from "../ops/appendCache.js";
+ import { F as c, t as C } from "../index-iNhkcAEQ.js";
+ import { fusedSoftmax as b } from "../ops/fusedSoftmax.js";
+ import { d as y } from "../tfjs_backend-NucKez4s.js";
+ import { r as k, d as L } from "../dropout-kbDY39Ci.js";
+ import { r as N } from "../reshape-DxTPgnwL.js";
+ import { m as R } from "../mat_mul-D0SifYfJ.js";
+ class W extends O {
  divisor;
  index;
- _trainable = !0;
  units;
  projUnits;
- constructor(t, s) {
- super(s), this.index = t, this.units = s.gpt.nEmbed * 3, this.projUnits = s.gpt.nEmbed, this.bias = W.bandPart(q([s.gpt.blockSize, s.gpt.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.gpt.nEmbed / s.gpt.nHead);
- const e = B([s.gpt.blockSize, s.gpt.blockSize]), o = z([s.gpt.blockSize, s.gpt.blockSize], Number.NEGATIVE_INFINITY);
- this.maskInf = M(this.bias, e, o);
+ ATTN;
+ PROJ;
+ constructor(t, i, s) {
+ super(i, s), this.index = t, this.units = i.gpt.nEmbed * 3, this.projUnits = i.gpt.nEmbed, this.ATTN = `block_${this.index}_cAttn`, this.PROJ = `block_${this.index}_cProj`, this.addVariable(this.ATTN), this.addVariable(this.PROJ), this.divisor = 1 / Math.sqrt(i.gpt.nEmbed / i.gpt.nHead);
  }
  build() {
- this.cAttn === null && (this.cAttn = b(
- C([this.config.gpt.nEmbed, this.units], 0, 0.02),
- !0
- //`block_${this.index}_attn_cAttn_kernel`
- )), this.cProj === null && (this.cProj = b(
- C([this.projUnits, this.config.gpt.nEmbed], 0, 0.02),
- !0
- //`block_${this.index}_attn_cProj_kernel`
- ));
+ this.hasVariable(this.ATTN) === !1 && this.setVariable(
+ this.ATTN,
+ v(
+ k([this.config.gpt.nEmbed, this.units], 0, 0.02),
+ !0
+ //`block_${this.index}_attn_cAttn_kernel`
+ )
+ ), this.hasVariable(this.PROJ) === !1 && this.setVariable(
+ this.PROJ,
+ v(
+ k([this.projUnits, this.config.gpt.nEmbed], 0, 0.02),
+ !0
+ //`block_${this.index}_attn_cProj_kernel`
+ )
+ );
  }
- get variables() {
- if (this.cAttn === null)
- throw new Error("Layer not built yet");
- return [this.cAttn, this.cProj];
- }
- get trainable() {
- return this._trainable;
- }
- set trainable(t) {
- this._trainable = t, this.cAttn && (this.cAttn.trainable = t), this.cProj && (this.cProj.trainable = t);
- }
- saveWeights(t) {
- t.set(`block_${this.index}_cAttn`, this.cAttn ? [this.cAttn.clone()] : []), t.set(`block_${this.index}_cProj`, this.cProj ? [this.cProj.clone()] : []);
- }
- loadWeights(t) {
- const s = t.get(`block_${this.index}_cAttn`)?.[0], e = t.get(`block_${this.index}_cProj`)?.[0];
- if (!s) throw new Error(`Weights for block_${this.index}_cAttn not found`);
- if (!e) throw new Error(`Weights for block_${this.index}_cProj not found`);
- this.cAttn ? this.cAttn.assign(s) : this.cAttn = b(s, !0), this.cProj ? this.cProj.assign(e) : this.cProj = b(e, !0);
- }
- getAttentionScores(t, s, e, o) {
- const i = P(t, s, this.divisor, this.maskInf);
- return _(i, e ? this.config.gpt.dropout : 0, o);
+ getAttentionScores(t, i, s, o) {
+ const e = g(t, i, this.divisor), n = b(e, s ? this.config.gpt.dropout : 0, o);
+ return e.dispose(), n;
  }
  // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
- getAttentionScoresWithPast(t, s, e) {
- const o = P(t, s, this.divisor, void 0, e);
- return _(o, 0, 0);
+ getAttentionScoresWithPast(t, i, s) {
+ const o = g(t, i, this.divisor, s), e = b(o, 0, 0);
+ return o.dispose(), e;
  }
  getQKV(t) {
- return y(t, this.cAttn, this.config.gpt.nHead);
+ return P(t, this.getVariable(this.ATTN), this.config.gpt.nHead);
  }
  getOutputProjection(t) {
- const s = t.shape[0], e = t.shape[2], o = this.config.gpt.nEmbed, i = t.transpose([0, 2, 1, 3]), n = F(i, [s, e, o]);
- return x(n, this.cProj);
+ const i = t.shape[0], s = t.shape[2], o = this.config.gpt.nEmbed, e = t.transpose([0, 2, 1, 3]), n = N(e, [i, s, o]), p = y(n, this.getVariable(this.PROJ));
+ return n.dispose(), e.dispose(), p;
  }
- updateCache(t, s, e, o) {
- const i = this.config.gpt.blockSize, n = t.shape[2], r = o?.length || 0, a = e ? t : E(t, i, r, o?.k);
- e || (t.dispose(), o?.k.dispose());
- const p = e ? s : E(s, i, r, o?.v);
- return e || (s.dispose(), o?.v.dispose()), {
- k: S(a),
- v: S(p),
- length: Math.min(r + n, i),
- cumulativeLength: o ? o.cumulativeLength + n : n
- };
+ updateCache(t, i, s) {
+ const o = this.config.gpt.blockSize, e = t.shape[2], n = s.length || 0, p = T(t, o, n, s.k);
+ t.dispose(), s.k && s.k.dispose();
+ const r = T(i, o, n, s.v);
+ i.dispose(), s.v && s.v.dispose();
+ const d = Math.min(n + e, o), h = s.cumulativeLength + e;
+ s.length = d, s.cumulativeLength = h, s.k = c(p), s.v = c(r);
  }
- forward(t, s = !1, e, o = !1, i) {
- return $(() => {
+ forward(t, i) {
+ return C(() => {
  this.startMemory();
- const [n, r, a] = this.getQKV(t), p = i ? i.cumulativeLength : 0, c = this.config.layerConfig.ropeCache, u = c ? w(n, c, p) : n, A = c ? w(r, c, p) : r;
- c && (n.dispose(), r.dispose());
- const f = i ? i.length : 0, d = this.updateCache(A, a, s, i), l = d.k, g = d.v;
- let h;
- f > 0 ? h = this.getAttentionScoresWithPast(u, l, f) : h = this.getAttentionScores(u, l, s, e), u.dispose(), s && l.dispose();
- const m = H(h, g);
- o || h.dispose(), s && g.dispose();
- const k = this.getOutputProjection(m);
- m.dispose();
- const v = o ? h.mean(1) : void 0;
- return this.endMemory("CausalSelfAttention"), { output: k, attention: v, presentKV: s ? void 0 : d };
+ const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, p = this.config.layerConfig.ropeCache, r = p ? V(s, p, n) : s, d = p ? V(o, p, n) : o;
+ p && (s.dispose(), o.dispose());
+ const h = t.pastKV ? t.pastKV.length : 0;
+ t.pastKV && !t.training && this.updateCache(d, e, t.pastKV);
+ const u = t.pastKV?.k ? t.pastKV.k : d, l = t.pastKV?.v ? t.pastKV.v : e;
+ let a;
+ h > 0 ? a = this.getAttentionScoresWithPast(r, u, h) : a = this.getAttentionScores(r, u, t.training, t.seed || 0), r.dispose(), t.pastKV || u.dispose();
+ const m = R(a, l), f = t.attentionScores !== void 0 && t.attentionScores.attentionOut !== void 0;
+ f || a.dispose(), t.pastKV || l.dispose();
+ const A = this.getOutputProjection(m);
+ if (m.dispose(), f && t.attentionScores && t.attentionScores.attentionOut !== void 0) {
+ const K = a.shape[1], S = a.shape[2];
+ t.attentionScores.attentionOut?.push(
+ c(a.slice([0, 0, 0, 0], [1, -1, -1, -1]).reshape([K, S, -1]))
+ );
+ }
+ return this.endMemory("CausalSelfAttention"), A;
  });
  }
- call(t, s = !1, e = !1, o) {
- if (o && !this.config.gpt.useRope)
- throw new Error("Cannot use pastKV without RoPE enabled");
- if (s && o)
- throw new Error("Cannot use pastKV during training");
- if (t.shape.length !== 3)
- throw new Error(`Input tensor must be rank 3 [B, T, C], got shape ${t.shape}`);
- if (t.shape[2] !== this.config.gpt.nEmbed)
- throw new Error(`Input tensor last dimension must be ${this.config.gpt.nEmbed}, got ${t.shape[2]}`);
- this.build();
- const i = Math.random() * 1e9;
- if (s && this.config.layerConfig.checkpointAttention) {
- const r = L(
- // @ts-expect-error Invalid params
- (a, p, c, u) => {
- const A = this.forward(a, !0, i);
- u([a]);
- const f = (d, l) => {
- const [g] = l, h = j().state.activeTape;
- j().state.activeTape = [];
- const m = O((k, v, R) => this.forward(k, !0, i).output)([g, p, c], d);
- return j().state.activeTape = h, m;
- };
- return { value: A.output, gradFunc: f };
- }
- )(t, this.cAttn, this.cProj);
- if (this.config.gpt.dropout > 0) {
- const a = I(r, this.config.gpt.dropout);
- return r.dispose(), { output: a };
- } else
- return { output: r };
- } else {
- const n = this.forward(t, s, i, e, o);
- if (this.config.gpt.dropout > 0) {
- const r = I(n.output, this.config.gpt.dropout);
- return n.output.dispose(), { output: r, attention: n.attention, presentKV: n.presentKV };
- } else
- return n;
- }
- }
- dispose() {
- this.cAttn?.dispose(), this.cProj?.dispose(), this.bias.dispose(), this.maskInf.dispose();
+ dropout(t) {
+ if (this.config.gpt.dropout > 0) {
+ const i = L(t, this.config.gpt.dropout);
+ return t.dispose(), i;
+ } else
+ return t;
  }
  }
  export {
- nt as default
+ W as default
  };
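Two behavioral notes on this file. First, the precomputed `blockSize`×`blockSize` `bias`/`maskInf` tensors are gone: `attentionMask` is now called without a mask argument (and with the past length passed directly in the with-past case), so causal masking evidently lives inside the op. Second, `updateCache` no longer returns a fresh `{k, v, length, cumulativeLength}` record; it mutates the caller's `KVCache` in place, and `forward` reads `pastKV.k`/`pastKV.v` afterwards (it is also only invoked when not training). A deminified sketch of the new cache update — readable names are mine, the logic follows the minified lines above, and `keep` is presumably `tf.keep`:

    import { Tensor, keep } from '@tensorflow/tfjs-core';

    interface KVCache {
      k?: Tensor;               // [B, nHead, blockSize, headDim]
      v?: Tensor;
      length: number;           // valid entries, capped at blockSize
      cumulativeLength: number; // total tokens seen; used as the RoPE offset
    }

    // appendCache is the package's op that writes T_cur new steps into the
    // fixed-size cache; its exact signature here is an assumption.
    type AppendCache = (x: Tensor, blockSize: number, pastLen: number, prev?: Tensor) => Tensor;

    function updateCache(k: Tensor, v: Tensor, cache: KVCache, blockSize: number, append: AppendCache): void {
      const tCur = k.shape[2];   // new tokens this step (k is [B, nHead, T, headDim])
      const pastLen = cache.length || 0;

      const newK = append(k, blockSize, pastLen, cache.k);
      k.dispose();
      cache.k?.dispose();
      const newV = append(v, blockSize, pastLen, cache.v);
      v.dispose();
      cache.v?.dispose();

      // In-place update: 0.4.5 returned a new cache object here instead.
      cache.length = Math.min(pastLen + tCur, blockSize);
      cache.cumulativeLength += tCur;
      cache.k = keep(newK);      // keep() so an enclosing tidy() doesn't reclaim them
      cache.v = keep(newV);
    }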
package/dist/layers/MLP.d.ts
@@ -1,19 +1,12 @@
- import { Tensor, Variable } from '@tensorflow/tfjs-core';
- import { default as BaseLayer, GPTLayerConfig } from './BaseLayer';
+ import { Tensor } from '@tensorflow/tfjs-core';
+ import { default as BaseLayer, ForwardAttributes, GPTLayerConfig } from './BaseLayer';
  export default class MLP extends BaseLayer {
- private cFc;
- private cProj;
  private index;
- private _trainable;
  private hiddenUnits;
- constructor(index: number, config: GPTLayerConfig);
- private build;
- get variables(): Variable[];
- get trainable(): boolean;
- set trainable(value: boolean);
- saveWeights(map: Map<string, Tensor[]>): void;
- loadWeights(weights: Map<string, Tensor[]>): void;
- forward(x: Tensor): Tensor;
- call(x: Tensor, training?: boolean): Tensor;
- dispose(): void;
+ private MLPHIDDEN;
+ private MLPOUT;
+ constructor(index: number, config: GPTLayerConfig, parent?: BaseLayer);
+ protected build(): void;
+ forward(_: ForwardAttributes, x: Tensor): Tensor;
+ protected dropout(x: Tensor): Tensor;
  }
package/dist/layers/MLP.js
@@ -1,93 +1,55 @@
- import { t as F, c as _, e as h, H as M } from "../index--6vO-cOz.js";
- import v from "./BaseLayer.js";
- import { matMulGelu as x } from "../ops/matMulGelu.js";
- import { v as c } from "../variable-BJTZ3jOy.js";
- import { r as d, d as u } from "../dropout-DFEXTPV0.js";
- import { r as p } from "../reshape-z51Eu-re.js";
- import { m as L } from "../mat_mul-BEHRPMh0.js";
- class G extends v {
- cFc = null;
- cProj = null;
+ import { t as l } from "../index-iNhkcAEQ.js";
+ import { B as u, v as o } from "../BaseLayer-BhrMN8JO.js";
+ import { matMulGelu as M } from "../ops/matMulGelu.js";
+ import { r as h, d as c } from "../dropout-kbDY39Ci.js";
+ import { r as d } from "../reshape-DxTPgnwL.js";
+ import { m as f } from "../mat_mul-D0SifYfJ.js";
+ class O extends u {
  index;
- _trainable = !0;
  hiddenUnits;
- constructor(t, s) {
- super(s), this.index = t, this.hiddenUnits = s.gpt.mlpFactor * s.gpt.nEmbed;
+ MLPHIDDEN;
+ MLPOUT;
+ constructor(i, t, s) {
+ super(t, s), this.index = i, this.hiddenUnits = t.gpt.mlpFactor * t.gpt.nEmbed, this.MLPHIDDEN = `block_${this.index}_mlpHidden`, this.MLPOUT = `block_${this.index}_mlpOut`, this.addVariable(this.MLPHIDDEN), this.addVariable(this.MLPOUT);
  }
  build() {
- this.cFc === null && (this.cFc = c(
- d([this.config.gpt.nEmbed, this.hiddenUnits], 0, 0.02),
- !0
- //`block_${this.index}_attn_cAttn_kernel`
- )), this.cProj === null && (this.cProj = c(
- d(
- [this.hiddenUnits, this.config.gpt.nEmbed],
- 0,
- 0.02 / Math.sqrt(2 * this.config.gpt.nLayer)
- ),
- !0
- //`block_${this.index}_attn_cProj_kernel`
- ));
- }
- get variables() {
- return [this.cFc, this.cProj];
- }
- get trainable() {
- return this._trainable;
- }
- set trainable(t) {
- this._trainable = t, this.cFc && (this.cFc.trainable = t), this.cProj && (this.cProj.trainable = t);
- }
- saveWeights(t) {
- t.set(`block_${this.index}_mlpHidden`, this.cFc ? [this.cFc.clone()] : []), t.set(`block_${this.index}_mlpOut`, this.cProj ? [this.cProj.clone()] : []);
- }
- loadWeights(t) {
- const s = t.get(`block_${this.index}_mlpOut`)?.[0], i = t.get(`block_${this.index}_mlpHidden`)?.[0];
- if (!s || !i)
- throw new Error(`Weights for block ${this.index} not found`);
- this.cFc ? this.cFc.assign(i) : this.cFc = c(i, !0), this.cProj ? this.cProj.assign(s) : this.cProj = c(s, !0);
- }
- forward(t) {
- return F(() => {
+ this.hasVariable(this.MLPHIDDEN) === !1 && this.setVariable(
+ this.MLPHIDDEN,
+ o(
+ h([this.config.gpt.nEmbed, this.hiddenUnits], 0, 0.02),
+ !0
+ //`block_${this.index}_attn_cAttn_kernel`
+ )
+ ), this.hasVariable(this.MLPOUT) === !1 && this.setVariable(
+ this.MLPOUT,
+ o(
+ h(
+ [this.hiddenUnits, this.config.gpt.nEmbed],
+ 0,
+ 0.02 / Math.sqrt(2 * this.config.gpt.nLayer)
+ ),
+ !0
+ //`block_${this.index}_attn_cProj_kernel`
+ )
+ );
+ }
+ forward(i, t) {
+ return l(() => {
  this.startMemory();
- const [s, i, r] = t.shape, o = p(t, [s * i, r]), e = x(o, this.cFc), n = L(e, this.cProj);
- e.dispose();
- const a = p(n, [s, i, r]);
- return this.endMemory("MLP"), a;
+ const [s, r, e] = t.shape, n = d(t, [s * r, e]), a = M(n, this.getVariable(this.MLPHIDDEN)), p = f(a, this.getVariable(this.MLPOUT));
+ a.dispose();
+ const m = d(p, [s, r, e]);
+ return this.endMemory("MLP"), m;
  });
  }
- call(t, s = !1) {
- if (this.build(), s && this.config.layerConfig.checkpointMLP) {
- const r = _(
- // @ts-expect-error Invalid params
- (o, e, n, a) => {
- const l = this.forward(o);
- return a([o]), { value: l, gradFunc: (f, g) => {
- const [m] = g, b = h().state.activeTape;
- h().state.activeTape = [];
- const P = M((j, w, T) => this.forward(j))([m, e, n], f);
- return h().state.activeTape = b, P;
- } };
- }
- )(t, this.cFc, this.cProj);
- if (this.config.gpt.dropout > 0) {
- const o = u(r, this.config.gpt.dropout);
- return r.dispose(), o;
- }
- return r;
- } else {
- const i = this.forward(t);
- if (s && this.config.gpt.dropout > 0) {
- const r = u(i, this.config.gpt.dropout);
- return i.dispose(), r;
- }
- return i;
+ dropout(i) {
+ if (this.config.gpt.dropout > 0) {
+ const t = c(i, this.config.gpt.dropout);
+ return i.dispose(), t;
  }
- }
- dispose() {
- this.cFc?.dispose(), this.cProj?.dispose();
+ return i;
  }
  }
  export {
- G as default
+ O as default
  };
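The MLP's math is untouched by this refactor; what disappears is the `checkpointMLP` `customGrad` path and the per-layer weight plumbing, while dropout moves into the `dropout()` hook (presumably invoked by the base class around `forward`). For reference, the forward pass still computes reshape → matMulGelu → matMul → reshape. An unfused plain-tfjs equivalent — the package fuses the first matmul with GELU in its `matMulGelu` op, and the tanh GELU approximation below is the usual one, assumed rather than read from that op:

    import * as tf from '@tensorflow/tfjs-core';

    // tanh-approximate GELU: 0.5·x·(1 + tanh(√(2/π)·(x + 0.044715·x³)))
    function gelu(x: tf.Tensor): tf.Tensor {
      const k = Math.sqrt(2 / Math.PI);
      const inner = tf.mul(k, tf.add(x, tf.mul(0.044715, tf.pow(x, 3))));
      return tf.mul(tf.mul(0.5, x), tf.add(1, tf.tanh(inner)));
    }

    // [B, T, C] -> flatten -> expand to hidden (mlpFactor·nEmbed) -> project back
    function mlpForward(x: tf.Tensor3D, wHidden: tf.Tensor2D, wOut: tf.Tensor2D): tf.Tensor3D {
      return tf.tidy(() => {
        const [b, t, c] = x.shape;
        const flat = tf.reshape(x, [b * t, c]);
        const hidden = gelu(tf.matMul(flat, wHidden)); // fused as matMulGelu in the package
        const out = tf.matMul(hidden, wOut);
        return tf.reshape(out, [b, t, c]) as tf.Tensor3D;
      });
    }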
package/dist/layers/RMSNorm.d.ts
@@ -1,12 +1,7 @@
- import { Tensor, Variable } from '@tensorflow/tfjs-core';
- import { default as BaseLayer, GPTLayerConfig } from './BaseLayer';
+ import { Tensor } from '@tensorflow/tfjs-core';
+ import { default as BaseLayer, ForwardAttributes, GPTLayerConfig } from './BaseLayer';
  export default class RMSNorm extends BaseLayer {
- private gamma;
- constructor(config: GPTLayerConfig, name?: string);
- get trainableWeights(): Variable[];
- set trainable(value: boolean);
- getWeights(): Tensor[];
- setWeights(weights: Tensor[]): void;
- apply(x: Tensor): Tensor;
- dispose(): void;
+ private GAMMA;
+ constructor(config: GPTLayerConfig, name?: string, parent?: BaseLayer);
+ forward(_: ForwardAttributes, x: Tensor): Tensor;
  }
package/dist/layers/RMSNorm.js
@@ -1,36 +1,20 @@
- import { t as r } from "../index--6vO-cOz.js";
- import m from "./BaseLayer.js";
- import { normRMS as s } from "../ops/normRMS.js";
- import { v as e } from "../variable-BJTZ3jOy.js";
- import { o as i } from "../ones-D6kB8bdY.js";
- class u extends m {
- gamma;
- constructor(t, a = "") {
- super(t), this.gamma = e(i([t.gpt.nEmbed]), !0, `${a}_gamma`, "float32");
+ import { t as e } from "../index-iNhkcAEQ.js";
+ import { B as o, v as a } from "../BaseLayer-BhrMN8JO.js";
+ import { normRMS as i } from "../ops/normRMS.js";
+ import { o as M } from "../ones-BIeFnPHR.js";
+ class l extends o {
+ GAMMA;
+ constructor(r, t = "", s) {
+ super(r, s), this.GAMMA = t, this.addVariable(this.GAMMA, a(M([r.gpt.nEmbed]), !0, this.GAMMA, "float32"));
  }
- get trainableWeights() {
- return [this.gamma];
- }
- set trainable(t) {
- this.gamma.trainable = t;
- }
- getWeights() {
- return [this.gamma];
- }
- setWeights(t) {
- this.gamma.assign(t[0]);
- }
- apply(t) {
- return r(() => {
+ forward(r, t) {
+ return e(() => {
  this.startMemory();
- const a = s(t, this.gamma);
- return this.endMemory("RMSNorm"), a;
+ const s = i(t, this.getVariable(this.GAMMA));
+ return this.endMemory("RMSNorm"), s;
  });
  }
- dispose() {
- this.gamma.dispose();
- }
  }
  export {
- u as default
+ l as default
  };
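The layer now just registers GAMMA and implements `forward`; the `normRMS` op it calls is unchanged. For reference, RMSNorm scales each vector by the reciprocal of its root-mean-square over the channel axis and then applies the learned gain γ. A plain-tfjs rendering of that computation — the ε value is a typical default, not read from the package's op:

    import * as tf from '@tensorflow/tfjs-core';

    // Reference RMSNorm: y = x · 1/√(mean(x²) + ε) · gamma, gamma shaped [nEmbed].
    function rmsNorm(x: tf.Tensor, gamma: tf.Tensor, eps = 1e-6): tf.Tensor {
      return tf.tidy(() => {
        const meanSq = tf.mean(tf.square(x), -1, true); // mean of x² over channels
        const invRms = tf.rsqrt(tf.add(meanSq, eps));   // 1 / sqrt(meanSq + eps)
        return tf.mul(tf.mul(x, invRms), gamma);
      });
    }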
package/dist/layers/RoPECache.js
@@ -1,6 +1,6 @@
- import { o as h, h as c, E as f, T as l, f as n, U as m, t as u, F as p } from "../index--6vO-cOz.js";
- import { c as d, s as C } from "../sin-H567uayl.js";
- import { r as a } from "../range-C_vpUjBu.js";
+ import { o as c, i as f, E as l, Q as m, f as n, U as u, t as p, F as a } from "../index-iNhkcAEQ.js";
+ import { c as d, s as C } from "../sin-BOX-JVAj.js";
+ import { r as h } from "../range-BsFU-SNG.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -18,10 +18,10 @@ import { r as a } from "../range-C_vpUjBu.js";
  * =============================================================================
  */
  function x(r) {
- const s = { x: c(r, "x", "reciprocal") };
- return f.runKernel(l, s);
+ const s = { x: f(r, "x", "reciprocal") };
+ return l.runKernel(m, s);
  }
- const S = /* @__PURE__ */ h({ reciprocal_: x });
+ const S = /* @__PURE__ */ c({ reciprocal_: x });
  class y {
  constructor(o) {
  this.config = o;
@@ -29,8 +29,8 @@ class y {
  if (this.rotaryDim = s, this.rotaryDim % 2 !== 0)
  throw new Error("rotaryDim must be even");
  this.ropeBase = 1e4;
- const i = a(0, this.rotaryDim, 2, "float32"), e = i.div(n(this.rotaryDim, "float32")), t = m(n(this.ropeBase, "float32"), e);
- this.ropeInvFreq = S(t), e.dispose(), t.dispose(), i.dispose(), this.config.useRope === !1 ? (this.ropeCos = null, this.ropeSin = null, this.ropeCacheLen = 0) : u(() => {
+ const i = h(0, this.rotaryDim, 2, "float32"), e = i.div(n(this.rotaryDim, "float32")), t = u(n(this.ropeBase, "float32"), e);
+ this.ropeInvFreq = S(t), e.dispose(), t.dispose(), i.dispose(), this.config.useRope === !1 ? (this.ropeCos = null, this.ropeSin = null, this.ropeCacheLen = 0) : p(() => {
  this.ensureRopeCache(this.config.blockSize * 4);
  });
  }
@@ -43,10 +43,12 @@ class y {
  // [cacheLen, rotaryDim/2]
  ropeCacheLen = 0;
  ensureRopeCache(o) {
- if (o <= this.ropeCacheLen) return;
- this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose();
- const s = Math.max(o, this.ropeCacheLen + this.config.blockSize * 4), e = a(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
- this.ropeCos = p(d(e).expandDims(-1)), this.ropeSin = p(C(e).expandDims(-1)), this.ropeCacheLen = s;
+ p(() => {
+ if (o <= this.ropeCacheLen) return;
+ this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose();
+ const s = Math.max(o, this.ropeCacheLen + this.config.blockSize * 4), e = h(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
+ this.ropeCos = a(d(e).expandDims(-1)), this.ropeSin = a(C(e).expandDims(-1)), this.ropeCacheLen = s;
+ });
  }
  getCos() {
  return this.ropeCos;
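The functional change in this file is small: `ensureRopeCache` is now wrapped in a tidy with `keep()` on the two tables, so intermediates from a cache rebuild are reclaimed instead of leaking. The tables themselves are built exactly as before; in plain tfjs, the computation in the minified lines above is roughly:

    import * as tf from '@tensorflow/tfjs-core';

    // RoPE tables: invFreq[i] = base^(-2i/rotaryDim) for i = 0..rotaryDim/2-1,
    // angles[p][i] = p · invFreq[i]; cos/sin are kept past the tidy.
    function buildRopeCache(rotaryDim: number, cacheLen: number, base = 1e4) {
      return tf.tidy(() => {
        const idx = tf.range(0, rotaryDim, 2, 'float32');                // [rotaryDim/2]
        const invFreq = tf.reciprocal(tf.pow(base, tf.div(idx, rotaryDim)));
        const pos = tf.range(0, cacheLen, 1, 'float32');                 // [cacheLen]
        const angles = tf.mul(pos.expandDims(1), invFreq.expandDims(0)); // [cacheLen, rotaryDim/2]
        return {
          cos: tf.keep(tf.cos(angles).expandDims(-1)),
          sin: tf.keep(tf.sin(angles).expandDims(-1)),
        };
      });
    }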
package/dist/layers/TiedEmbedding.d.ts
@@ -1,22 +1,12 @@
- import { Tensor, Variable } from '@tensorflow/tfjs-core';
- export default class TiedEmbeddingOutputLayer {
+ import { Tensor } from '@tensorflow/tfjs-core';
+ import { default as BaseLayer, ForwardAttributes, GPTLayerConfig } from './BaseLayer';
+ export default class TiedEmbeddingOutputLayer extends BaseLayer {
  private vocabSize;
  private embedDim;
- private tiedWeights;
  private initializer;
- constructor(config: {
- vocabSize: number;
- embedDim: number;
- name?: string;
- }, name?: string);
- get variables(): Variable[];
+ private WEIGHTS;
+ constructor(config: GPTLayerConfig, name: string, parent?: BaseLayer);
  embed(inputs: Tensor): Tensor;
  project(inputs: Tensor): Tensor;
- getWeights(): Tensor[];
- setWeights(weights: Tensor[]): void;
- getConfig(): {
- vocabSize: number;
- embedDim: number;
- };
- dispose(): void;
+ forward(_: ForwardAttributes, x: Tensor): Tensor;
  }
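TiedEmbeddingOutputLayer now extends BaseLayer and registers its matrix under a `WEIGHTS` name, but the tying itself is unchanged: one `[vocabSize, embedDim]` matrix serves both the token lookup (`embed`) and the output projection (`project`). In plain tfjs terms — a sketch of the concept, not the layer's code:

    import * as tf from '@tensorflow/tfjs-core';

    // One shared matrix W: rows are token embeddings on the way in, and the
    // transpose maps hidden states back to vocab logits on the way out.
    function embed(W: tf.Tensor2D, tokenIds: tf.Tensor): tf.Tensor {
      return tf.gather(W, tokenIds.cast('int32'));   // [..., embedDim]
    }

    function project(W: tf.Tensor2D, hidden: tf.Tensor2D): tf.Tensor2D {
      return tf.matMul(hidden, W, false, true);      // hidden · Wᵀ -> [N, vocabSize]
    }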
package/dist/layers/TiedEmbedding.js
@@ -1,8 +1,8 @@
- import { T as a } from "../TiedEmbedding-DznFwzcB.js";
- import "../index--6vO-cOz.js";
- import "../tfjs_backend-DuKis_xG.js";
- import "../variable-BJTZ3jOy.js";
- import "../gather-C5D8PxwA.js";
+ import { T as a } from "../TiedEmbedding-DsDRvLB0.js";
+ import "../index-iNhkcAEQ.js";
+ import "../tfjs_backend-NucKez4s.js";
+ import "../BaseLayer-BhrMN8JO.js";
+ import "../gather-Bxe1Qip8.js";
  export {
  a as default
  };
package/dist/layers/TransformerBlock.d.ts
@@ -1,25 +1,21 @@
- import { KVCache } from './CausalSelfAttention';
- import { default as BaseLayer, GPTLayerConfig } from './BaseLayer';
- import { Tensor, Variable } from '@tensorflow/tfjs-core';
- export default class Block extends BaseLayer {
+ import { AttentionScores, KVCache } from './CausalSelfAttention';
+ import { default as BaseLayer, ForwardAttributes, GPTLayerConfig } from './BaseLayer';
+ import { Tensor } from '@tensorflow/tfjs-core';
+ interface BlockAttributes extends ForwardAttributes {
+ pastKV?: KVCache;
+ seed?: number;
+ attentionScores?: AttentionScores;
+ }
+ export default class Block extends BaseLayer<BlockAttributes> {
  private ln1;
  private attn;
  private ln2;
  private mlp;
  private index;
- private _trainable;
  skipped: boolean;
- constructor(index: number, config: GPTLayerConfig);
- get variables(): Variable[];
- get trainable(): boolean;
- set trainable(value: boolean);
- saveWeights(map: Map<string, Tensor[]>): void;
- loadWeights(weights: Map<string, Tensor[]>): void;
+ constructor(index: number, config: GPTLayerConfig, parent?: BaseLayer);
  private getMLPOutput;
- call(x: Tensor, training?: boolean, includeAttention?: boolean, cache?: KVCache): {
- output: Tensor;
- attention?: Tensor;
- cache?: KVCache;
- };
+ forward(attrs: BlockAttributes, x: Tensor): Tensor;
  dispose(): void;
  }
+ export {};
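For callers, BlockAttributes replaces the old positional signature: KV state is mutated in place through `pastKV`, and attention maps are pushed into `attentionScores.attentionOut` when the caller supplies one. A hedged before/after — types reconstructed from this declaration diff, with `AttentionScores` assumed to collect per-block tensors (which matches the `.push()` in the attention layer's diff):

    import { Tensor } from '@tensorflow/tfjs-core';

    interface KVCache { k?: Tensor; v?: Tensor; length: number; cumulativeLength: number; }
    interface AttentionScores { attentionOut?: Tensor[]; }
    interface BlockAttributes { training?: boolean; pastKV?: KVCache; seed?: number; attentionScores?: AttentionScores; }

    declare const block: { forward(attrs: BlockAttributes, x: Tensor): Tensor };
    declare const x: Tensor; // [B, T, nEmbed]

    // 0.4.5: const { output, attention, cache } = block.call(x, false, true, kvCache);
    // 0.5.1: one attrs object in, one Tensor out; results travel via the attrs.
    const kvCache: KVCache = { length: 0, cumulativeLength: 0 };
    const scores: AttentionScores = { attentionOut: [] };
    const output = block.forward({ training: false, pastKV: kvCache, attentionScores: scores }, x);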