@genai-fi/nanogpt 0.2.8 → 0.2.10

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,8 @@
+ import { default as MemoryProfiler } from '../utilities/profile';
+ export default abstract class BaseLayer {
+ protected _profiler?: MemoryProfiler;
+ getProfiler(): MemoryProfiler | undefined;
+ setProfiler(value: MemoryProfiler | undefined): void;
+ startMemory(): void;
+ endMemory(label: string): void;
+ }
@@ -0,0 +1,18 @@
+ class t {
+ _profiler;
+ getProfiler() {
+ return this._profiler;
+ }
+ setProfiler(r) {
+ this._profiler = r;
+ }
+ startMemory() {
+ this._profiler?.startMemory();
+ }
+ endMemory(r) {
+ this._profiler?.endMemory(r);
+ }
+ }
+ export {
+ t as default
+ };
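The two hunks above introduce the new `BaseLayer` base class (declaration plus compiled output): every layer now carries an optional `MemoryProfiler` and the `startMemory()`/`endMemory(label)` bracket used throughout the hunks below. The profiler itself (`../utilities/profile`) is not included in this diff; a minimal sketch of a compatible implementation, assuming it diffs `tf.memory()` counters, could be:

```ts
import * as tf from '@tensorflow/tfjs';

// Hypothetical stand-in for '../utilities/profile'; the real class is not
// part of this diff. BaseLayer only needs these two methods.
class MemoryProfiler {
  private startBytes = 0;
  private startTensors = 0;

  // Snapshot the allocator state at the start of a profiled region.
  startMemory(): void {
    const m = tf.memory();
    this.startBytes = m.numBytes;
    this.startTensors = m.numTensors;
  }

  // Report how many bytes/tensors the labelled region left behind.
  endMemory(label: string): void {
    const m = tf.memory();
    console.log(`${label}: +${m.numBytes - this.startBytes} bytes, ` +
      `+${m.numTensors - this.startTensors} tensors`);
  }
}
```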
@@ -1,13 +1,14 @@
  import { default as TF } from '@tensorflow/tfjs';
  import { GPTConfig } from '../config';
  import { default as RoPECache } from './RoPECache';
+ import { default as BaseLayer } from './BaseLayer';
  export type KVCache = {
  k: TF.Tensor;
  v: TF.Tensor;
  length: number;
  cumulativeLength: number;
  };
- export default class CausalSelfAttention {
+ export default class CausalSelfAttention extends BaseLayer {
  private readonly ropeCache?;
  private config;
  private cAttn;
@@ -1,7 +1,8 @@
  import { attentionMask as z } from "../ops/attentionMask.js";
- class j {
+ import S from "./BaseLayer.js";
+ class C extends S {
  constructor(t, i, s, e) {
- this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
+ super(), this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
  units: 3 * s.nEmbed,
  useBias: s.biasInLinear,
  name: `block_${i}_attn_cAttn`,
@@ -94,23 +95,24 @@ class j {
  if (e && !this.config.useRope)
  throw new Error("Cannot use pastKV without RoPE enabled");
  return this.tf.tidy(() => {
+ this.startMemory();
  const [o, c, r] = this.getQKV(t), h = o.shape[2], a = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(o, c, u) : [o, c];
  let n = d, l = r, p = 0;
  e && (p = e.length, n = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, r], 2));
  const b = n.shape[2];
  if (b > a) {
- const k = b - a, g = n.shape[0], I = n.shape[1], _ = n.shape[3];
- n = n.slice([0, 0, k, 0], [g, I, a, _]), l = l.slice([0, 0, k, 0], [g, I, a, _]), p = a - h;
+ const k = b - a, g = n.shape[0], A = n.shape[1], I = n.shape[3];
+ n = n.slice([0, 0, k, 0], [g, A, a, I]), l = l.slice([0, 0, k, 0], [g, A, a, I]), p = a - h;
  }
  let m;
  p > 0 ? m = this.getAttentionScoresWithPast(f, n, i, p) : m = this.getAttentionScores(f, n, i);
- const v = this.tf.matMul(m, l), A = this.getOutputProjection(v, i), P = {
+ const _ = this.tf.matMul(m, l), v = this.getOutputProjection(_, i), y = {
  k: this.tf.keep(n),
  v: this.tf.keep(l),
  length: p + h,
  cumulativeLength: e ? e.cumulativeLength + h : h
- };
- return { output: A, attention: s ? m.mean(1) : void 0, presentKV: P };
+ }, P = s ? m.mean(1) : void 0;
+ return this.endMemory("CausalSelfAttention"), { output: v, attention: P, presentKV: y };
  });
  }
  dispose() {
@@ -118,5 +120,5 @@ class j {
  }
  }
  export {
- j as default
+ C as default
  };
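Aside from reshuffled minifier variable names, the functional change above is the `startMemory()`/`endMemory("CausalSelfAttention")` bracket inside `tf.tidy`. The cache-trimming branch (`if (b > a)`) is unchanged; deminified, it keeps only the newest `blockSize` timesteps of the KV cache. A readable sketch with invented names:

```ts
import * as tf from '@tensorflow/tfjs';

// Readable rendering of the minified cache-trimming branch above.
// k and v have shape [batch, nHeads, seq, headDim]; drop the oldest
// timesteps so at most `blockSize` remain (a sliding attention window).
function trimKV(k: tf.Tensor4D, v: tf.Tensor4D,
                blockSize: number): [tf.Tensor4D, tf.Tensor4D] {
  const [batch, nHeads, seq, headDim] = k.shape;
  if (seq <= blockSize) return [k, v];
  const drop = seq - blockSize;
  return [
    k.slice([0, 0, drop, 0], [batch, nHeads, blockSize, headDim]),
    v.slice([0, 0, drop, 0], [batch, nHeads, blockSize, headDim]),
  ];
}
```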
@@ -1,6 +1,7 @@
  import { default as TF } from '@tensorflow/tfjs';
  import { GPTConfig } from '../config';
- export default class MLP {
+ import { default as BaseLayer } from './BaseLayer';
+ export default class MLP extends BaseLayer {
  private cFc;
  private cProj;
  private dropout;
@@ -1,31 +1,32 @@
- class l {
+ import a from "./BaseLayer.js";
+ class l extends a {
  cFc;
  cProj;
  dropout;
  tf;
  index;
  _trainable = !0;
- constructor(t, e, i) {
- this.tf = t, this.index = e, this.cFc = this.tf.layers.dense({
- units: i.mlpFactor * i.nEmbed,
+ constructor(t, i, e) {
+ super(), this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
+ units: e.mlpFactor * e.nEmbed,
  activation: "gelu",
- useBias: i.biasInLinear,
+ useBias: e.biasInLinear,
  kernelInitializer: this.tf.initializers.randomNormal({
  mean: 0,
  stddev: 0.02
  }),
  biasInitializer: "zeros",
- name: `block_${e}_mlp_cFc`
+ name: `block_${i}_mlp_cFc`
  }), this.cProj = this.tf.layers.dense({
- units: i.nEmbed,
- useBias: i.biasInLinear,
+ units: e.nEmbed,
+ useBias: e.biasInLinear,
  kernelInitializer: this.tf.initializers.randomNormal({
  mean: 0,
- stddev: 0.02 / Math.sqrt(2 * i.nLayer)
+ stddev: 0.02 / Math.sqrt(2 * e.nLayer)
  }),
  biasInitializer: "zeros",
- name: `block_${e}_mlp_cProj`
- }), this.dropout = this.tf.layers.dropout({ rate: i.dropout });
+ name: `block_${i}_mlp_cProj`
+ }), this.dropout = this.tf.layers.dropout({ rate: e.dropout });
  }
  get variables() {
  return [
@@ -45,10 +46,11 @@ class l {
  loadWeights(t) {
  this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
  }
- call(t, e = !1) {
+ call(t, i = !1) {
  return this.tf.tidy(() => {
- const i = this.cFc.apply(t), s = this.cProj.apply(i);
- return this.dropout.apply(s, { training: e });
+ this.startMemory();
+ const e = this.cFc.apply(t), s = this.cProj.apply(e), r = this.dropout.apply(s, { training: i });
+ return this.endMemory("MLP"), r;
  });
  }
  dispose() {
@@ -1,5 +1,6 @@
  import { default as TF } from '@tensorflow/tfjs';
- export default class RMSNorm {
+ import { default as BaseLayer } from './BaseLayer';
+ export default class RMSNorm extends BaseLayer {
  private gamma;
  private epsilon;
  private tf;
@@ -1,26 +1,28 @@
- class m {
+ import m from "./BaseLayer.js";
+ class o extends m {
  gamma;
  epsilon;
  tf;
- constructor(a, s, t = 1e-8, e = "") {
- this.tf = a, this.epsilon = t, this.gamma = a.variable(a.ones(s), !0, `${e}_gamma`, "float32");
+ constructor(t, s, a = 1e-8, e = "") {
+ super(), this.tf = t, this.epsilon = a, this.gamma = t.variable(t.ones(s), !0, `${e}_gamma`, "float32");
  }
  get trainableWeights() {
  return [this.gamma];
  }
- set trainable(a) {
- this.gamma.trainable = a;
+ set trainable(t) {
+ this.gamma.trainable = t;
  }
  getWeights() {
  return [this.gamma];
  }
- setWeights(a) {
- this.gamma.assign(a[0]);
+ setWeights(t) {
+ this.gamma.assign(t[0]);
  }
- apply(a) {
+ apply(t) {
  return this.tf.tidy(() => {
- const t = a.square().mean(-1, !0).add(this.epsilon).rsqrt();
- return a.mul(t).mul(this.gamma);
+ this.startMemory();
+ const a = t.square().mean(-1, !0).add(this.epsilon).rsqrt(), r = t.mul(a).mul(this.gamma);
+ return this.endMemory("RMSNorm"), r;
  });
  }
  dispose() {
@@ -28,5 +30,5 @@ class m {
  }
  }
  export {
- m as default
+ o as default
  };
@@ -1,7 +1,7 @@
- import { o as h, c as i, E as o, y as V, D as X, I as Y, F as Z, N as ee, H as te, J as se, K as ne, O as re, Q as ue, g as L, x as ae, T as A, m as ie, U as oe, u as le, b as q, l as C, V as P, w as U, _ as H } from "../index-DQfEAU9u.js";
- import { s as ce, r as f } from "../sum-B-O33dgG.js";
- import { m } from "../mat_mul-CuHB58-H.js";
- import { c as pe } from "../complex-CeoYJn2o.js";
+ import { o as h, d as i, E as o, F as V, H as X, I as Y, J as Z, N as ee, K as te, O as se, Q as ne, T as re, U as ue, i as L, z as ae, V as A, a as ie, W as oe, w as le, f as q, p as C, X as P, y as U, _ as H } from "../index-CWQLouWz.js";
+ import { s as ce, r as f } from "../sum-CnIf1YOh.js";
+ import { m } from "../mat_mul-4v7St11W.js";
+ import { c as pe } from "../complex-x7w5HPOS.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -2,7 +2,9 @@ import { default as TF } from '@tensorflow/tfjs';
  import { GPTConfig } from '../config';
  import { KVCache } from './CausalSelfAttention';
  import { default as RoPECache } from './RoPECache';
- export default class Block {
+ import { default as MemoryProfiler } from '../utilities/profile';
+ import { default as BaseLayer } from './BaseLayer';
+ export default class Block extends BaseLayer {
  private ln1;
  private attn;
  private ln2;
@@ -12,6 +14,7 @@ export default class Block {
  private _trainable;
  skipped: boolean;
  constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
+ setProfiler(value: MemoryProfiler | undefined): void;
  get variables(): TF.Variable[];
  get trainable(): boolean;
  set trainable(value: boolean);
@@ -1,7 +1,8 @@
- import r from "./CausalSelfAttention.js";
+ import a from "./CausalSelfAttention.js";
  import o from "./MLP.js";
- import a from "./RMSNorm.js";
- class u {
+ import r from "./RMSNorm.js";
+ import p from "./BaseLayer.js";
+ class f extends p {
  ln1;
  attn;
  ln2;
@@ -11,7 +12,10 @@ class u {
  _trainable = !0;
  skipped = !1;
  constructor(t, i, s, e) {
- this.tf = t, this.index = i, this.ln1 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new r(this.tf, this.index, s, e), this.ln2 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
+ super(), this.tf = t, this.index = i, this.ln1 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new a(this.tf, this.index, s, e), this.ln2 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
+ }
+ setProfiler(t) {
+ this._profiler = t, this.attn.setProfiler(t), this.mlp.setProfiler(t), this.ln1.setProfiler(t), this.ln2.setProfiler(t);
  }
  get variables() {
  return [
@@ -54,5 +58,5 @@ class u {
  }
  }
  export {
- u as default
+ f as default
  };
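`Block.setProfiler` overrides the `BaseLayer` setter so that one call fans out to the attention, MLP, and both RMSNorm sublayers. Hypothetical wiring (how the blocks are exposed is not shown in this diff):

```ts
// `blocks` stands in for however the model exposes its transformer blocks;
// MemoryProfiler is the sketch above.
declare const blocks: Array<{ setProfiler(p: MemoryProfiler | undefined): void }>;

const profiler = new MemoryProfiler();
for (const block of blocks) {
  block.setProfiler(profiler); // cascades to attn, mlp, ln1 and ln2
}
// Each forward pass now logs per-layer memory deltas labelled
// "CausalSelfAttention", "MLP" and "RMSNorm".
```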
@@ -1,4 +1,4 @@
- import { o as c, c as s, b as m, E as M, B as p } from "./index-DQfEAU9u.js";
+ import { o as m, d as s, f as c, E as M, B as f } from "./index-CWQLouWz.js";
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -15,13 +15,13 @@ import { o as c, c as s, b as m, E as M, B as p } from "./index-DQfEAU9u.js";
  * limitations under the License.
  * =============================================================================
  */
- function b(e, o, n = !1, l = !1) {
+ function p(e, o, n = !1, l = !1) {
  let a = s(e, "a", "matMul"), t = s(o, "b", "matMul");
- [a, t] = m(a, t);
+ [a, t] = c(a, t);
  const r = { a, b: t }, u = { transposeA: n, transposeB: l };
- return M.runKernel(p, r, u);
+ return M.runKernel(f, r, u);
  }
- const i = /* @__PURE__ */ c({ matMul_: b });
+ const i = /* @__PURE__ */ m({ matMul_: p });
  export {
  i as m
  };
@@ -1,14 +1,14 @@
- import { engine as d } from "@tensorflow/tfjs";
- import { r as k, s as u } from "../index-DQfEAU9u.js";
- import { m as l } from "../mat_mul-CuHB58-H.js";
- class p {
+ import { engine as k } from "@tensorflow/tfjs";
+ import { r as m, c as d, s as p } from "../index-CWQLouWz.js";
+ import { m as f } from "../mat_mul-4v7St11W.js";
+ class h {
  variableNames = ["q", "k", "mask"];
  outputShape;
  userCode;
  // enableShapeUniforms = true;
  customUniforms = [{ name: "divisor", type: "float" }];
- constructor(t, e, n, a) {
- this.outputShape = [t, e, n, n], this.userCode = `
+ constructor(e, n, s, a) {
+ this.outputShape = [e, n, s, s], this.userCode = `
  void main() {
  ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
  int b = coords.x;
@@ -34,29 +34,55 @@ class p {
  `;
  }
  }
- function f(s) {
- const { q: t, k: e, mask: n } = s.inputs, { divisor: a } = s.attrs, o = s.backend, c = t.shape[0], i = t.shape[2], r = t.shape[1], m = new p(c, r, i, t.shape[3]);
- return o.runWebGLProgram(m, [t, e, n], "float32", [[a]]);
+ function v(t) {
+ const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = e.shape[0], i = e.shape[2], c = e.shape[1], u = new h(r, c, i, e.shape[3]);
+ return o.runWebGLProgram(u, [e, n, s], "float32", [[a]]);
  }
- const h = {
+ const b = {
  kernelName: "AttentionMask",
  backendName: "webgl",
- kernelFunc: f
+ kernelFunc: v
  };
- k(h);
- function b(s) {
- const { q: t, k: e, mask: n } = s.inputs, { divisor: a } = s.attrs, o = t.shape[2], i = l(t, e, !1, !0).mul(u(a)), r = n.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
- return i.add(r);
+ m(b);
+ function l(t) {
+ const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = e.shape[2], i = f(e, n, !1, !0).mul(p(a)), c = s.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+ return i.add(c);
  }
- const v = {
+ const M = {
  kernelName: "AttentionMask",
  backendName: "cpu",
- kernelFunc: b
+ kernelFunc: l
  };
- k(v);
- function C(s, t, e, n) {
- return d().runKernel("AttentionMask", { q: s, k: t, mask: e }, { divisor: n });
+ m(M);
+ const g = {
+ kernelName: "AttentionMask",
+ backendName: "tensorflow",
+ kernelFunc: l
+ };
+ m(g);
+ function N(t, e, n, s) {
+ return k().runKernel("AttentionMask", { q: t, k: e, mask: n }, { divisor: s });
  }
+ const A = {
+ kernelName: "AttentionMask",
+ inputsToSave: ["q", "k"],
+ outputsToSave: [],
+ gradFunc: (t, e, n) => {
+ if (Array.isArray(t))
+ throw new Error("Expected dy to be a single Tensor");
+ const [s, a] = e, { divisor: o } = n;
+ return {
+ q: () => t.matMul(a).mul(o),
+ k: () => s.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
+ mask: () => t,
+ divisor: () => {
+ const r = s.matMul(a, !1, !0);
+ return t.mul(r).sum();
+ }
+ };
+ }
+ };
+ d(A);
  export {
- C as attentionMask
+ N as attentionMask
  };
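Besides the new "tensorflow" (Node) backend registration, which reuses the CPU `kernelFunc`, 0.2.10 registers a gradient for the custom `AttentionMask` kernel, whose forward pass is `out = (q · kᵀ) · divisor + mask`. A readable rendering of that gradient with descriptive names (the package registers it through its bundled core; `tf.registerGradient` is shown here purely for illustration):

```ts
import * as tf from '@tensorflow/tfjs';

// Deminified rendering of the gradient config `A` above.
// Forward: out = matMul(q, k, false, true) * divisor + mask
const attentionMaskGrad: tf.GradConfig = {
  kernelName: 'AttentionMask',
  inputsToSave: ['q', 'k'],
  outputsToSave: [],
  gradFunc: (dy, saved, attrs) => {
    if (Array.isArray(dy)) throw new Error('Expected dy to be a single Tensor');
    const [q, k] = saved;
    const { divisor } = attrs as unknown as { divisor: number };
    return {
      // d(out)/dq: dy · k, scaled by the divisor
      q: () => dy.matMul(k).mul(divisor),
      // d(out)/dk: (qᵀ · dy) scaled, transposed back over the last two axes
      k: () => q.transpose([0, 1, 3, 2]).matMul(dy).mul(divisor)
                .transpose([0, 1, 3, 2]),
      // mask is added elementwise, so its gradient is dy unchanged
      mask: () => dy,
      // d(out)/d(divisor): sum of dy ⊙ (q · kᵀ)
      divisor: () => dy.mul(q.matMul(k, false, true)).sum(),
    };
  },
};
tf.registerGradient(attentionMaskGrad);
```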
@@ -1,6 +1,6 @@
  import { engine as l } from "@tensorflow/tfjs";
- import { o as g, c as i, E as b, G as d, r as c, a as h } from "../index-DQfEAU9u.js";
- import { r as p, s as f } from "../stack-C9cTkqpq.js";
+ import { o as g, d as i, E as b, G as d, r as c, b as h } from "../index-CWQLouWz.js";
+ import { r as p, s as f } from "../stack-CTdK-itU.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -1,4 +1,4 @@
- import { r as o } from "../../index-DQfEAU9u.js";
+ import { r as o } from "../../index-CWQLouWz.js";
  function r(e) {
  const { logits: t, labels: n } = e.inputs;
  return e.backend.executeMultipleOutputs("SparseSoftmaxCrossEntropyWithLogits", [], [t, n], 2);
@@ -1,7 +1,7 @@
  import { engine as $ } from "@tensorflow/tfjs";
- import { j as u, k as S, l as p, E as f, n as E, o as N, c as l, p as y, r as h, a as D, m as x } from "../index-DQfEAU9u.js";
- import { c as m } from "../complex-CeoYJn2o.js";
- import { r as v, s as T } from "../stack-C9cTkqpq.js";
+ import { l as u, n as S, p, E as f, q as E, o as N, d as l, t as y, r as h, b as D, a as x } from "../index-CWQLouWz.js";
+ import { c as d } from "../complex-x7w5HPOS.js";
+ import { r as v, s as T } from "../stack-CTdK-itU.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -21,7 +21,7 @@ import { r as v, s as T } from "../stack-C9cTkqpq.js";
  function i(e, t = "float32") {
  if (u(e), t === "complex64") {
  const a = i(e, "float32"), o = i(e, "float32");
- return m(a, o);
+ return d(a, o);
  }
  const r = S(p(e), t);
  return f.makeTensor(r, e, t);
@@ -42,10 +42,10 @@ function i(e, t = "float32") {
  * limitations under the License.
  * =============================================================================
  */
- function d(e, t = "float32") {
+ function m(e, t = "float32") {
  if (u(e), t === "complex64") {
- const a = d(e, "float32"), o = i(e, "float32");
- return m(a, o);
+ const a = m(e, "float32"), o = i(e, "float32");
+ return d(a, o);
  }
  const r = E(p(e), t);
  return f.makeTensor(r, e, t);
@@ -133,7 +133,7 @@ const K = {
  };
  h(K);
  function A(e) {
- const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b = d([o]), g = I(c, b, [o, s]), k = D(t, g), w = a.reshape([o, 1]);
+ const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b = m([o]), g = I(c, b, [o, s]), k = D(t, g), w = a.reshape([o, 1]);
  return x(k, w);
  }
  const F = {
@@ -142,9 +142,9 @@ const F = {
  kernelFunc: A
  };
  h(F);
- function M(e, t, r) {
+ function R(e, t, r) {
  return $().runKernel("EfficientScatterSub", { logits: e, labels: t, dy: r }, {});
  }
  export {
- M as scatterSub
+ R as scatterSub
  };
@@ -1,4 +1,4 @@
- import { E as e, R as c, o as f, f as u, g as a, P as i } from "./index-DQfEAU9u.js";
+ import { E as e, R as c, o as f, h as i, i as a, P as u } from "./index-CWQLouWz.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -38,13 +38,13 @@ function l(n, s, t = 1, r = "float32") {
  * =============================================================================
  */
  function k(n, s = 0) {
- const t = u(n, "tensors", "stack", "string_or_numeric");
+ const t = i(n, "tensors", "stack", "string_or_numeric");
  a(t.length >= 1, () => "Pass at least one tensor to tf.stack"), t.length > 0 && a(s <= t[0].rank, () => "Axis must be <= rank of the tensor");
  const r = t, o = { axis: s };
- return e.runKernel(i, r, o);
+ return e.runKernel(u, r, o);
  }
- const h = /* @__PURE__ */ f({ stack_: k });
+ const g = /* @__PURE__ */ f({ stack_: k });
  export {
  l as r,
- h as s
+ g as s
  };
@@ -1,4 +1,4 @@
- import { o, c as a, E as u, h as i, i as p, S as x } from "./index-DQfEAU9u.js";
+ import { o, d as a, E as u, j as p, k as i, S as x } from "./index-CWQLouWz.js";
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -17,7 +17,7 @@ import { o, c as a, E as u, h as i, i as p, S as x } from "./index-DQfEAU9u.js";
  */
  function l(n, t) {
  const s = { x: a(n, "x", "reshape", "string_or_numeric") }, r = { shape: t };
- return u.runKernel(i, s, r);
+ return u.runKernel(p, s, r);
  }
  const h = /* @__PURE__ */ o({ reshape_: l });
  /**
@@ -38,7 +38,7 @@ const h = /* @__PURE__ */ o({ reshape_: l });
  */
  function m(n, t = null, e = !1) {
  let s = a(n, "x", "sum");
- s.dtype === "bool" && (s = p(s, "int32"));
+ s.dtype === "bool" && (s = i(s, "int32"));
  const r = { x: s }, c = { axis: t, keepDims: e };
  return u.runKernel(x, r, c);
  }
@@ -1,4 +1,4 @@
- import { A as r, m as c, s as h, a as g, e as o } from "../index-DQfEAU9u.js";
+ import { A as r, a as c, s as h, b as g, e as o } from "../index-CWQLouWz.js";
  class u extends r {
  constructor(t, e, s, a, i) {
  super(t, e, s, a), this.config = i, this.startLearningRate = t;
@@ -1,8 +1,8 @@
  import { DatasetBuilder as d } from "./DatasetBuilder.js";
- import p from "./AdamExt.js";
- class u {
- constructor(t, e, s, i = 1e-3) {
- this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, s, e.config.blockSize);
+ import h from "./AdamExt.js";
+ class g {
+ constructor(t, s, e, i = 1e-3) {
+ this.tokenizer = e, this.tf = t, this.model = s, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, e, s.config.blockSize);
  }
  model;
  optimizer;
@@ -25,7 +25,7 @@ class u {
  }
  resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
  this.optimizer && this.optimizer.dispose();
- const e = new p(
+ const s = new h(
  t.learningRateFactor * this.learningRate,
  t.beta1,
  t.beta2,
@@ -37,58 +37,59 @@ class u {
  weightDecay: 0
  }
  );
- this.optimizer = e;
+ this.optimizer = s;
  }
  printGradients(t) {
- Object.keys(t).forEach((e) => {
- const s = t[e];
- console.log(`${e}:`), console.log(` Shape: ${s.shape}`), console.log(` Mean: ${this.tf.mean(s).dataSync()[0]}`), console.log(` Std: ${this.tf.moments(s).variance.sqrt().dataSync()[0]}`), console.log(` Min: ${this.tf.min(s).dataSync()[0]}`), console.log(` Max: ${this.tf.max(s).dataSync()[0]}`), console.log(` Norm: ${this.tf.norm(s).dataSync()[0]}`);
+ Object.keys(t).forEach((s) => {
+ const e = t[s];
+ console.log(`${s}:`), console.log(` Shape: ${e.shape}`), console.log(` Mean: ${this.tf.mean(e).dataSync()[0]}`), console.log(` Std: ${this.tf.moments(e).variance.sqrt().dataSync()[0]}`), console.log(` Min: ${this.tf.min(e).dataSync()[0]}`), console.log(` Max: ${this.tf.max(e).dataSync()[0]}`), console.log(` Norm: ${this.tf.norm(e).dataSync()[0]}`);
  });
  }
- trainStep(t, e = !1, s = !1) {
+ trainStep(t, s = !1, e = !1) {
  return this.tf.tidy(() => {
+ this.model.getProfiler()?.startMemory();
  const { xs: i, ys: a } = t, o = () => {
  const { loss: l, logits: c } = this.model.forward(i, a, !0);
  return c.dispose(), l;
  }, { value: n, grads: r } = this.tf.variableGrads(o);
- return e || (s && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.tf.dispose(r)), n;
+ return s ? this.model.getProfiler()?.endMemory("Training") : (e && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), this.tf.dispose(r)), n;
  });
  }
  dummyPass() {
- const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), e = this.tf.zeros([1, this.model.config.blockSize], "int32");
+ const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), s = this.tf.zeros([1, this.model.config.blockSize], "int32");
  try {
- const s = this.trainStep({ xs: t, ys: e }, !0);
- s.dataSync(), s.dispose();
- } catch (s) {
- console.error("Error during dummy pass:", s);
+ const e = this.trainStep({ xs: t, ys: s }, !0);
+ e.dataSync(), e.dispose();
+ } catch (e) {
+ console.error("Error during dummy pass:", e);
  } finally {
- t.dispose(), e.dispose();
+ t.dispose(), s.dispose();
  }
  }
- async trainBatch(t, e) {
+ async trainBatch(t, s) {
  try {
- const s = this.trainStep(e, !1, !1);
- return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
- } catch (s) {
- throw console.error(`Error processing batch at step ${t.step}:`, s), this.tf.dispose(), s;
+ const e = this.trainStep(s, !1, !1);
+ return s.xs.dispose(), s.ys.dispose(), t.step++, t.totalSteps++, e.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), e.dispose(), t.lastLoss));
+ } catch (e) {
+ throw console.error(`Error processing batch at step ${t.step}:`, e), this.tf.dispose(), e;
  }
  }
- async createTrainValidationSplit(t, e = 32, s = 0.1) {
- const i = await this.datasetBuilder.createTextDataset(t, e, 0, 1 - s), a = await this.datasetBuilder.createTextDataset(
+ async createTrainValidationSplit(t, s = 32, e = 0.1) {
+ const i = await this.datasetBuilder.createTextDataset(t, s, 0, 1 - e), a = await this.datasetBuilder.createTextDataset(
  t,
- e,
- 1 - s,
+ s,
+ 1 - e,
  1
  );
  return { trainDataset: i, validationDataset: a };
  }
- async createDataset(t, e = 32) {
- return await this.datasetBuilder.createTextDataset(t, e);
+ async createDataset(t, s = 32) {
+ return await this.datasetBuilder.createTextDataset(t, s);
  }
  dispose() {
  this.optimizer && this.optimizer.dispose();
  }
  }
  export {
- u as default
+ g as default
  };
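Finally, the trainer (minified `u`, renamed `g`) threads the model's profiler through each optimization step: `trainStep` now opens with `startMemory()` and closes with `endMemory("Training")`, and a dummy pass profiles the step while skipping `applyGradients`. A hypothetical end-to-end usage, with every name outside this diff declared as an assumption:

```ts
import * as tf from '@tensorflow/tfjs';

// All of these are assumptions: the diff shows the trainer only in minified
// form, and model/tokenizer construction is not part of it.
declare const Trainer: any;
declare const MemoryProfiler: any;
declare const model: any;
declare const tokenizer: any;

model.setProfiler(new MemoryProfiler());         // cascades through the blocks
const trainer = new Trainer(tf, model, tokenizer, 1e-3);
trainer.dummyPass();                             // trainStep(batch, true): profiled,
                                                 // but applyGradients is skipped
// Subsequent real steps log a "Training" memory delta around each update.
```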