@genai-fi/nanogpt 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,31 +1,32 @@
1
- class l {
1
+ import a from "./BaseLayer.js";
2
+ class l extends a {
2
3
  cFc;
3
4
  cProj;
4
5
  dropout;
5
6
  tf;
6
7
  index;
7
8
  _trainable = !0;
8
- constructor(t, e, i) {
9
- this.tf = t, this.index = e, this.cFc = this.tf.layers.dense({
10
- units: i.mlpFactor * i.nEmbed,
9
+ constructor(t, i, e) {
10
+ super(), this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
11
+ units: e.mlpFactor * e.nEmbed,
11
12
  activation: "gelu",
12
- useBias: i.biasInLinear,
13
+ useBias: e.biasInLinear,
13
14
  kernelInitializer: this.tf.initializers.randomNormal({
14
15
  mean: 0,
15
16
  stddev: 0.02
16
17
  }),
17
18
  biasInitializer: "zeros",
18
- name: `block_${e}_mlp_cFc`
19
+ name: `block_${i}_mlp_cFc`
19
20
  }), this.cProj = this.tf.layers.dense({
20
- units: i.nEmbed,
21
- useBias: i.biasInLinear,
21
+ units: e.nEmbed,
22
+ useBias: e.biasInLinear,
22
23
  kernelInitializer: this.tf.initializers.randomNormal({
23
24
  mean: 0,
24
- stddev: 0.02 / Math.sqrt(2 * i.nLayer)
25
+ stddev: 0.02 / Math.sqrt(2 * e.nLayer)
25
26
  }),
26
27
  biasInitializer: "zeros",
27
- name: `block_${e}_mlp_cProj`
28
- }), this.dropout = this.tf.layers.dropout({ rate: i.dropout });
28
+ name: `block_${i}_mlp_cProj`
29
+ }), this.dropout = this.tf.layers.dropout({ rate: e.dropout });
29
30
  }
30
31
  get variables() {
31
32
  return [
@@ -45,10 +46,11 @@ class l {
45
46
  loadWeights(t) {
46
47
  this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
47
48
  }
48
- call(t, e = !1) {
49
+ call(t, i = !1) {
49
50
  return this.tf.tidy(() => {
50
- const i = this.cFc.apply(t), s = this.cProj.apply(i);
51
- return this.dropout.apply(s, { training: e });
51
+ this.startMemory();
52
+ const e = this.cFc.apply(t), s = this.cProj.apply(e), r = this.dropout.apply(s, { training: i });
53
+ return this.endMemory("MLP"), r;
52
54
  });
53
55
  }
54
56
  dispose() {
@@ -1,5 +1,6 @@
1
1
  import { default as TF } from '@tensorflow/tfjs';
2
- export default class RMSNorm {
2
+ import { default as BaseLayer } from './BaseLayer';
3
+ export default class RMSNorm extends BaseLayer {
3
4
  private gamma;
4
5
  private epsilon;
5
6
  private tf;
@@ -1,26 +1,28 @@
1
- class m {
1
+ import m from "./BaseLayer.js";
2
+ class o extends m {
2
3
  gamma;
3
4
  epsilon;
4
5
  tf;
5
- constructor(a, s, t = 1e-8, e = "") {
6
- this.tf = a, this.epsilon = t, this.gamma = a.variable(a.ones(s), !0, `${e}_gamma`, "float32");
6
+ constructor(t, s, a = 1e-8, e = "") {
7
+ super(), this.tf = t, this.epsilon = a, this.gamma = t.variable(t.ones(s), !0, `${e}_gamma`, "float32");
7
8
  }
8
9
  get trainableWeights() {
9
10
  return [this.gamma];
10
11
  }
11
- set trainable(a) {
12
- this.gamma.trainable = a;
12
+ set trainable(t) {
13
+ this.gamma.trainable = t;
13
14
  }
14
15
  getWeights() {
15
16
  return [this.gamma];
16
17
  }
17
- setWeights(a) {
18
- this.gamma.assign(a[0]);
18
+ setWeights(t) {
19
+ this.gamma.assign(t[0]);
19
20
  }
20
- apply(a) {
21
+ apply(t) {
21
22
  return this.tf.tidy(() => {
22
- const t = a.square().mean(-1, !0).add(this.epsilon).rsqrt();
23
- return a.mul(t).mul(this.gamma);
23
+ this.startMemory();
24
+ const a = t.square().mean(-1, !0).add(this.epsilon).rsqrt(), r = t.mul(a).mul(this.gamma);
25
+ return this.endMemory("RMSNorm"), r;
24
26
  });
25
27
  }
26
28
  dispose() {
@@ -28,5 +30,5 @@ class m {
28
30
  }
29
31
  }
30
32
  export {
31
- m as default
33
+ o as default
32
34
  };
@@ -1,7 +1,7 @@
1
- import { o as h, c as i, E as o, D as V, F as X, I as Y, H as Z, N as ee, J as te, K as se, O as ne, Q as re, T as ue, h as L, y as ae, U as A, m as ie, V as oe, v as le, d as q, n as C, W as P, x as U, _ as H } from "../index-Dsg28SG6.js";
2
- import { s as ce, r as f } from "../sum-NWazHI7f.js";
3
- import { m } from "../mat_mul-BAYDrXvE.js";
4
- import { c as pe } from "../complex-Cd8sqiBC.js";
1
+ import { o as h, d as i, E as o, F as V, H as X, I as Y, J as Z, N as ee, K as te, O as se, Q as ne, T as re, U as ue, i as L, z as ae, V as A, a as ie, W as oe, w as le, f as q, p as C, X as P, y as U, _ as H } from "../index-CWQLouWz.js";
2
+ import { s as ce, r as f } from "../sum-CnIf1YOh.js";
3
+ import { m } from "../mat_mul-4v7St11W.js";
4
+ import { c as pe } from "../complex-x7w5HPOS.js";
5
5
  /**
6
6
  * @license
7
7
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -169,7 +169,7 @@ function Me(t) {
169
169
  const s = { x: i(t, "x", "relu") };
170
170
  return o.runKernel(ne, s);
171
171
  }
172
- const We = /* @__PURE__ */ h({ relu_: Me });
172
+ const we = /* @__PURE__ */ h({ relu_: Me });
173
173
  /**
174
174
  * @license
175
175
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -186,11 +186,11 @@ const We = /* @__PURE__ */ h({ relu_: Me });
186
186
  * limitations under the License.
187
187
  * =============================================================================
188
188
  */
189
- function we(t) {
189
+ function We(t) {
190
190
  const s = { x: i(t, "x", "relu6") };
191
191
  return o.runKernel(re, s);
192
192
  }
193
- const ze = /* @__PURE__ */ h({ relu6_: we });
193
+ const ze = /* @__PURE__ */ h({ relu6_: We });
194
194
  /**
195
195
  * @license
196
196
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -273,7 +273,7 @@ function Te(t, e, s, n) {
273
273
  if (e === "linear")
274
274
  return t;
275
275
  if (e === "relu")
276
- return We(t);
276
+ return we(t);
277
277
  if (e === "elu")
278
278
  return me(t);
279
279
  if (e === "relu6")
@@ -310,14 +310,14 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
310
310
  }
311
311
  let u = i(t, "a", "fused matMul"), a = i(e, "b", "fused matMul");
312
312
  [u, a] = q(u, a);
313
- const D = s ? u.shape[u.rank - 2] : u.shape[u.rank - 1], b = n ? a.shape[a.rank - 1] : a.shape[a.rank - 2], W = s ? u.shape[u.rank - 1] : u.shape[u.rank - 2], w = n ? a.shape[a.rank - 2] : a.shape[a.rank - 1], T = u.shape.slice(0, -2), y = a.shape.slice(0, -2), B = C(T), N = C(y);
313
+ const D = s ? u.shape[u.rank - 2] : u.shape[u.rank - 1], b = n ? a.shape[a.rank - 1] : a.shape[a.rank - 2], w = s ? u.shape[u.rank - 1] : u.shape[u.rank - 2], W = n ? a.shape[a.rank - 2] : a.shape[a.rank - 1], T = u.shape.slice(0, -2), y = a.shape.slice(0, -2), B = C(T), N = C(y);
314
314
  L(D === b, () => `Error in fused matMul: inner shapes (${D}) and (${b}) of Tensors with shapes ${u.shape} and ${a.shape} and transposeA=${s} and transposeB=${n} must match.`);
315
- const O = P(u.shape.slice(0, -2), a.shape.slice(0, -2)).concat([W, w]), F = s ? f(u, [B, D, W]) : f(u, [B, W, D]), R = n ? f(a, [N, w, b]) : f(a, [N, b, w]);
315
+ const O = P(u.shape.slice(0, -2), a.shape.slice(0, -2)).concat([w, W]), F = s ? f(u, [B, D, w]) : f(u, [B, w, D]), R = n ? f(a, [N, W, b]) : f(a, [N, b, W]);
316
316
  let S;
317
317
  r != null && (S = i(r, "bias", "fused matMul"), [S] = q(S, u), P(O, S.shape));
318
- let v;
319
- l != null && (v = i(l, "prelu weights", "fused matMul"));
320
- const G = (x, M) => {
318
+ let G;
319
+ l != null && (G = i(l, "prelu weights", "fused matMul"));
320
+ const I = (x, M) => {
321
321
  const [g, $, k, z] = M, d = Ae(f(x, k.shape), k, c);
322
322
  let K, _;
323
323
  if (!s && !n ? (K = m(d, $, !1, !0), _ = m(g, d, !0, !1)) : !s && n ? (K = m(d, $, !1, !1), _ = m(d, g, !0, !1)) : s && !n ? (K = m($, d, !1, !0), _ = m(g, d, !1, !1)) : (K = m($, d, !0, !0), _ = m(d, g, !0, !0)), r != null) {
@@ -325,24 +325,24 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
325
325
  return [K, _, Q];
326
326
  } else
327
327
  return [K, _];
328
- }, I = {
328
+ }, v = {
329
329
  a: F,
330
330
  b: R,
331
331
  bias: S,
332
- preluActivationWeights: v
332
+ preluActivationWeights: G
333
333
  }, j = { transposeA: s, transposeB: n, activation: c, leakyreluAlpha: p };
334
334
  return r == null ? U((M, g, $) => {
335
335
  const k = (
336
336
  // tslint:disable-next-line: no-unnecessary-type-assertion
337
- o.runKernel(H, I, j)
337
+ o.runKernel(H, v, j)
338
338
  );
339
- return $([M, g, k]), { value: f(k, O), gradFunc: G };
339
+ return $([M, g, k]), { value: f(k, O), gradFunc: I };
340
340
  })(F, R) : U((M, g, $, k) => {
341
341
  const z = (
342
342
  // tslint:disable-next-line: no-unnecessary-type-assertion
343
- o.runKernel(H, I, j)
343
+ o.runKernel(H, v, j)
344
344
  );
345
- return k([M, g, z, $]), { value: f(z, O), gradFunc: G };
345
+ return k([M, g, z, $]), { value: f(z, O), gradFunc: I };
346
346
  })(F, R, S);
347
347
  }
348
348
  const J = /* @__PURE__ */ h({ fusedMatMul_: Ne });
@@ -369,7 +369,7 @@ class E extends Error {
369
369
  * https://opensource.org/licenses/MIT.
370
370
  * =============================================================================
371
371
  */
372
- function ve(t, e, s, n) {
372
+ function Ge(t, e, s, n) {
373
373
  if (t.rank < 2 || e.rank < 2)
374
374
  throw new E(`dot requires both inputs to be rank >= 2 but got x shape = ${t.shape} and y shape = ${e.shape}`);
375
375
  if (e.rank >= 3) {
@@ -425,7 +425,7 @@ class Pe {
425
425
  return this.tf.gather(this.tiedWeights, e, 0);
426
426
  }
427
427
  project(e) {
428
- return ve(e, this.tiedWeights.transpose());
428
+ return Ge(e, this.tiedWeights.transpose());
429
429
  }
430
430
  getWeights() {
431
431
  return [this.tiedWeights];
@@ -2,7 +2,9 @@ import { default as TF } from '@tensorflow/tfjs';
2
2
  import { GPTConfig } from '../config';
3
3
  import { KVCache } from './CausalSelfAttention';
4
4
  import { default as RoPECache } from './RoPECache';
5
- export default class Block {
5
+ import { default as MemoryProfiler } from '../utilities/profile';
6
+ import { default as BaseLayer } from './BaseLayer';
7
+ export default class Block extends BaseLayer {
6
8
  private ln1;
7
9
  private attn;
8
10
  private ln2;
@@ -12,6 +14,7 @@ export default class Block {
12
14
  private _trainable;
13
15
  skipped: boolean;
14
16
  constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
17
+ setProfiler(value: MemoryProfiler | undefined): void;
15
18
  get variables(): TF.Variable[];
16
19
  get trainable(): boolean;
17
20
  set trainable(value: boolean);
@@ -1,7 +1,8 @@
1
- import r from "./CausalSelfAttention.js";
1
+ import a from "./CausalSelfAttention.js";
2
2
  import o from "./MLP.js";
3
- import a from "./RMSNorm.js";
4
- class u {
3
+ import r from "./RMSNorm.js";
4
+ import p from "./BaseLayer.js";
5
+ class f extends p {
5
6
  ln1;
6
7
  attn;
7
8
  ln2;
@@ -11,7 +12,10 @@ class u {
11
12
  _trainable = !0;
12
13
  skipped = !1;
13
14
  constructor(t, i, s, e) {
14
- this.tf = t, this.index = i, this.ln1 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new r(this.tf, this.index, s, e), this.ln2 = new a(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
15
+ super(), this.tf = t, this.index = i, this.ln1 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new a(this.tf, this.index, s, e), this.ln2 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
16
+ }
17
+ setProfiler(t) {
18
+ this._profiler = t, this.attn.setProfiler(t), this.mlp.setProfiler(t), this.ln1.setProfiler(t), this.ln2.setProfiler(t);
15
19
  }
16
20
  get variables() {
17
21
  return [
@@ -54,5 +58,5 @@ class u {
54
58
  }
55
59
  }
56
60
  export {
57
- u as default
61
+ f as default
58
62
  };
@@ -1,4 +1,4 @@
1
- import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
1
+ import { o as m, d as s, f as c, E as M, B as f } from "./index-CWQLouWz.js";
2
2
  /**
3
3
  * @license
4
4
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -15,13 +15,13 @@ import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
15
15
  * limitations under the License.
16
16
  * =============================================================================
17
17
  */
18
- function f(e, o, n = !1, l = !1) {
18
+ function p(e, o, n = !1, l = !1) {
19
19
  let a = s(e, "a", "matMul"), t = s(o, "b", "matMul");
20
- [a, t] = m(a, t);
20
+ [a, t] = c(a, t);
21
21
  const r = { a, b: t }, u = { transposeA: n, transposeB: l };
22
- return M.runKernel(p, r, u);
22
+ return M.runKernel(f, r, u);
23
23
  }
24
- const i = /* @__PURE__ */ c({ matMul_: f });
24
+ const i = /* @__PURE__ */ m({ matMul_: p });
25
25
  export {
26
26
  i as m
27
27
  };
@@ -1,14 +1,14 @@
1
- import { engine as l } from "@tensorflow/tfjs";
2
- import { r as u, b as k, s as d } from "../index-Dsg28SG6.js";
3
- import { m as p } from "../mat_mul-BAYDrXvE.js";
4
- class f {
1
+ import { engine as k } from "@tensorflow/tfjs";
2
+ import { r as m, c as d, s as p } from "../index-CWQLouWz.js";
3
+ import { m as f } from "../mat_mul-4v7St11W.js";
4
+ class h {
5
5
  variableNames = ["q", "k", "mask"];
6
6
  outputShape;
7
7
  userCode;
8
8
  // enableShapeUniforms = true;
9
9
  customUniforms = [{ name: "divisor", type: "float" }];
10
- constructor(s, n, e, a) {
11
- this.outputShape = [s, n, e, e], this.userCode = `
10
+ constructor(e, n, s, a) {
11
+ this.outputShape = [e, n, s, s], this.userCode = `
12
12
  void main() {
13
13
  ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
14
14
  int b = coords.x;
@@ -34,49 +34,55 @@ class f {
34
34
  `;
35
35
  }
36
36
  }
37
- function h(t) {
38
- const { q: s, k: n, mask: e } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = s.shape[0], i = s.shape[2], c = s.shape[1], m = new f(r, c, i, s.shape[3]);
39
- return o.runWebGLProgram(m, [s, n, e], "float32", [[a]]);
37
+ function v(t) {
38
+ const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = e.shape[0], i = e.shape[2], c = e.shape[1], u = new h(r, c, i, e.shape[3]);
39
+ return o.runWebGLProgram(u, [e, n, s], "float32", [[a]]);
40
40
  }
41
- const v = {
41
+ const b = {
42
42
  kernelName: "AttentionMask",
43
43
  backendName: "webgl",
44
- kernelFunc: h
44
+ kernelFunc: v
45
45
  };
46
- u(v);
47
- function b(t) {
48
- const { q: s, k: n, mask: e } = t.inputs, { divisor: a } = t.attrs, o = s.shape[2], i = p(s, n, !1, !0).mul(d(a)), c = e.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
46
+ m(b);
47
+ function l(t) {
48
+ const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = e.shape[2], i = f(e, n, !1, !0).mul(p(a)), c = s.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
49
49
  return i.add(c);
50
50
  }
51
51
  const M = {
52
52
  kernelName: "AttentionMask",
53
53
  backendName: "cpu",
54
- kernelFunc: b
54
+ kernelFunc: l
55
55
  };
56
- u(M);
57
- function w(t, s, n, e) {
58
- return l().runKernel("AttentionMask", { q: t, k: s, mask: n }, { divisor: e });
59
- }
56
+ m(M);
60
57
  const g = {
58
+ kernelName: "AttentionMask",
59
+ backendName: "tensorflow",
60
+ kernelFunc: l
61
+ };
62
+ m(g);
63
+ function N(t, e, n, s) {
64
+ return k().runKernel("AttentionMask", { q: t, k: e, mask: n }, { divisor: s });
65
+ }
66
+ const A = {
61
67
  kernelName: "AttentionMask",
62
68
  inputsToSave: ["q", "k"],
63
69
  outputsToSave: [],
64
- gradFunc: (t, s, n) => {
70
+ gradFunc: (t, e, n) => {
65
71
  if (Array.isArray(t))
66
72
  throw new Error("Expected dy to be a single Tensor");
67
- const [e, a] = s, { divisor: o } = n;
73
+ const [s, a] = e, { divisor: o } = n;
68
74
  return {
69
75
  q: () => t.matMul(a).mul(o),
70
- k: () => e.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
76
+ k: () => s.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
71
77
  mask: () => t,
72
78
  divisor: () => {
73
- const r = e.matMul(a, !1, !0);
79
+ const r = s.matMul(a, !1, !0);
74
80
  return t.mul(r).sum();
75
81
  }
76
82
  };
77
83
  }
78
84
  };
79
- k(g);
85
+ d(A);
80
86
  export {
81
- w as attentionMask
87
+ N as attentionMask
82
88
  };
@@ -1,6 +1,6 @@
1
1
  import { engine as l } from "@tensorflow/tfjs";
2
- import { o as g, c as i, E as b, G as d, r as c, a as h } from "../index-Dsg28SG6.js";
3
- import { r as p, s as f } from "../stack-1o648CP_.js";
2
+ import { o as g, d as i, E as b, G as d, r as c, b as h } from "../index-CWQLouWz.js";
3
+ import { r as p, s as f } from "../stack-CTdK-itU.js";
4
4
  /**
5
5
  * @license
6
6
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -1,4 +1,4 @@
1
- import { r as o } from "../../index-Dsg28SG6.js";
1
+ import { r as o } from "../../index-CWQLouWz.js";
2
2
  function r(e) {
3
3
  const { logits: t, labels: n } = e.inputs;
4
4
  return e.backend.executeMultipleOutputs("SparseSoftmaxCrossEntropyWithLogits", [], [t, n], 2);
@@ -1,7 +1,7 @@
1
1
  import { engine as $ } from "@tensorflow/tfjs";
2
- import { k as u, l as S, n as p, E as f, p as E, o as N, c as l, q as y, r as h, a as D, m as x } from "../index-Dsg28SG6.js";
3
- import { c as m } from "../complex-Cd8sqiBC.js";
4
- import { r as v, s as T } from "../stack-1o648CP_.js";
2
+ import { l as u, n as S, p, E as f, q as E, o as N, d as l, t as y, r as h, b as D, a as x } from "../index-CWQLouWz.js";
3
+ import { c as d } from "../complex-x7w5HPOS.js";
4
+ import { r as v, s as T } from "../stack-CTdK-itU.js";
5
5
  /**
6
6
  * @license
7
7
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -21,7 +21,7 @@ import { r as v, s as T } from "../stack-1o648CP_.js";
21
21
  function i(e, t = "float32") {
22
22
  if (u(e), t === "complex64") {
23
23
  const a = i(e, "float32"), o = i(e, "float32");
24
- return m(a, o);
24
+ return d(a, o);
25
25
  }
26
26
  const r = S(p(e), t);
27
27
  return f.makeTensor(r, e, t);
@@ -42,10 +42,10 @@ function i(e, t = "float32") {
42
42
  * limitations under the License.
43
43
  * =============================================================================
44
44
  */
45
- function d(e, t = "float32") {
45
+ function m(e, t = "float32") {
46
46
  if (u(e), t === "complex64") {
47
- const a = d(e, "float32"), o = i(e, "float32");
48
- return m(a, o);
47
+ const a = m(e, "float32"), o = i(e, "float32");
48
+ return d(a, o);
49
49
  }
50
50
  const r = E(p(e), t);
51
51
  return f.makeTensor(r, e, t);
@@ -133,7 +133,7 @@ const K = {
133
133
  };
134
134
  h(K);
135
135
  function A(e) {
136
- const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b = d([o]), g = I(c, b, [o, s]), k = D(t, g), w = a.reshape([o, 1]);
136
+ const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b = m([o]), g = I(c, b, [o, s]), k = D(t, g), w = a.reshape([o, 1]);
137
137
  return x(k, w);
138
138
  }
139
139
  const F = {
@@ -1,4 +1,4 @@
1
- import { E as e, R as c, o as f, g as u, h as a, P as i } from "./index-Dsg28SG6.js";
1
+ import { E as e, R as c, o as f, h as i, i as a, P as u } from "./index-CWQLouWz.js";
2
2
  /**
3
3
  * @license
4
4
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -15,7 +15,7 @@ import { E as e, R as c, o as f, g as u, h as a, P as i } from "./index-Dsg28SG6
15
15
  * limitations under the License.
16
16
  * =============================================================================
17
17
  */
18
- function h(n, s, t = 1, r = "float32") {
18
+ function l(n, s, t = 1, r = "float32") {
19
19
  if (t === 0)
20
20
  throw new Error("Cannot have a step of zero");
21
21
  const o = { start: n, stop: s, step: t, dtype: r };
@@ -38,13 +38,13 @@ function h(n, s, t = 1, r = "float32") {
38
38
  * =============================================================================
39
39
  */
40
40
  function k(n, s = 0) {
41
- const t = u(n, "tensors", "stack", "string_or_numeric");
41
+ const t = i(n, "tensors", "stack", "string_or_numeric");
42
42
  a(t.length >= 1, () => "Pass at least one tensor to tf.stack"), t.length > 0 && a(s <= t[0].rank, () => "Axis must be <= rank of the tensor");
43
43
  const r = t, o = { axis: s };
44
- return e.runKernel(i, r, o);
44
+ return e.runKernel(u, r, o);
45
45
  }
46
- const l = /* @__PURE__ */ f({ stack_: k });
46
+ const g = /* @__PURE__ */ f({ stack_: k });
47
47
  export {
48
- h as r,
49
- l as s
48
+ l as r,
49
+ g as s
50
50
  };
@@ -1,4 +1,4 @@
1
- import { o, c as a, E as u, i, j as p, S as x } from "./index-Dsg28SG6.js";
1
+ import { o, d as a, E as u, j as p, k as i, S as x } from "./index-CWQLouWz.js";
2
2
  /**
3
3
  * @license
4
4
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -17,7 +17,7 @@ import { o, c as a, E as u, i, j as p, S as x } from "./index-Dsg28SG6.js";
17
17
  */
18
18
  function l(n, t) {
19
19
  const s = { x: a(n, "x", "reshape", "string_or_numeric") }, r = { shape: t };
20
- return u.runKernel(i, s, r);
20
+ return u.runKernel(p, s, r);
21
21
  }
22
22
  const h = /* @__PURE__ */ o({ reshape_: l });
23
23
  /**
@@ -38,7 +38,7 @@ const h = /* @__PURE__ */ o({ reshape_: l });
38
38
  */
39
39
  function m(n, t = null, e = !1) {
40
40
  let s = a(n, "x", "sum");
41
- s.dtype === "bool" && (s = p(s, "int32"));
41
+ s.dtype === "bool" && (s = i(s, "int32"));
42
42
  const r = { x: s }, c = { axis: t, keepDims: e };
43
43
  return u.runKernel(x, r, c);
44
44
  }
@@ -1,4 +1,4 @@
1
- import { A as r, m as c, s as h, a as g, e as o } from "../index-Dsg28SG6.js";
1
+ import { A as r, a as c, s as h, b as g, e as o } from "../index-CWQLouWz.js";
2
2
  class u extends r {
3
3
  constructor(t, e, s, a, i) {
4
4
  super(t, e, s, a), this.config = i, this.startLearningRate = t;
@@ -1,8 +1,8 @@
1
1
  import { DatasetBuilder as d } from "./DatasetBuilder.js";
2
- import p from "./AdamExt.js";
3
- class u {
4
- constructor(t, e, s, i = 1e-3) {
5
- this.tokenizer = s, this.tf = t, this.model = e, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, s, e.config.blockSize);
2
+ import h from "./AdamExt.js";
3
+ class g {
4
+ constructor(t, s, e, i = 1e-3) {
5
+ this.tokenizer = e, this.tf = t, this.model = s, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, e, s.config.blockSize);
6
6
  }
7
7
  model;
8
8
  optimizer;
@@ -25,7 +25,7 @@ class u {
25
25
  }
26
26
  resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
27
27
  this.optimizer && this.optimizer.dispose();
28
- const e = new p(
28
+ const s = new h(
29
29
  t.learningRateFactor * this.learningRate,
30
30
  t.beta1,
31
31
  t.beta2,
@@ -37,58 +37,59 @@ class u {
37
37
  weightDecay: 0
38
38
  }
39
39
  );
40
- this.optimizer = e;
40
+ this.optimizer = s;
41
41
  }
42
42
  printGradients(t) {
43
- Object.keys(t).forEach((e) => {
44
- const s = t[e];
45
- console.log(`${e}:`), console.log(` Shape: ${s.shape}`), console.log(` Mean: ${this.tf.mean(s).dataSync()[0]}`), console.log(` Std: ${this.tf.moments(s).variance.sqrt().dataSync()[0]}`), console.log(` Min: ${this.tf.min(s).dataSync()[0]}`), console.log(` Max: ${this.tf.max(s).dataSync()[0]}`), console.log(` Norm: ${this.tf.norm(s).dataSync()[0]}`);
43
+ Object.keys(t).forEach((s) => {
44
+ const e = t[s];
45
+ console.log(`${s}:`), console.log(` Shape: ${e.shape}`), console.log(` Mean: ${this.tf.mean(e).dataSync()[0]}`), console.log(` Std: ${this.tf.moments(e).variance.sqrt().dataSync()[0]}`), console.log(` Min: ${this.tf.min(e).dataSync()[0]}`), console.log(` Max: ${this.tf.max(e).dataSync()[0]}`), console.log(` Norm: ${this.tf.norm(e).dataSync()[0]}`);
46
46
  });
47
47
  }
48
- trainStep(t, e = !1, s = !1) {
48
+ trainStep(t, s = !1, e = !1) {
49
49
  return this.tf.tidy(() => {
50
+ this.model.getProfiler()?.startMemory();
50
51
  const { xs: i, ys: a } = t, o = () => {
51
52
  const { loss: l, logits: c } = this.model.forward(i, a, !0);
52
53
  return c.dispose(), l;
53
54
  }, { value: n, grads: r } = this.tf.variableGrads(o);
54
- return e || (s && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.tf.dispose(r)), n;
55
+ return s ? this.model.getProfiler()?.endMemory("Training") : (e && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), this.tf.dispose(r)), n;
55
56
  });
56
57
  }
57
58
  dummyPass() {
58
- const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), e = this.tf.zeros([1, this.model.config.blockSize], "int32");
59
+ const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), s = this.tf.zeros([1, this.model.config.blockSize], "int32");
59
60
  try {
60
- const s = this.trainStep({ xs: t, ys: e }, !0);
61
- s.dataSync(), s.dispose();
62
- } catch (s) {
63
- console.error("Error during dummy pass:", s);
61
+ const e = this.trainStep({ xs: t, ys: s }, !0);
62
+ e.dataSync(), e.dispose();
63
+ } catch (e) {
64
+ console.error("Error during dummy pass:", e);
64
65
  } finally {
65
- t.dispose(), e.dispose();
66
+ t.dispose(), s.dispose();
66
67
  }
67
68
  }
68
- async trainBatch(t, e) {
69
+ async trainBatch(t, s) {
69
70
  try {
70
- const s = this.trainStep(e, !1, !1);
71
- return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, s.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), s.dispose(), t.lastLoss));
72
- } catch (s) {
73
- throw console.error(`Error processing batch at step ${t.step}:`, s), this.tf.dispose(), s;
71
+ const e = this.trainStep(s, !1, !1);
72
+ return s.xs.dispose(), s.ys.dispose(), t.step++, t.totalSteps++, e.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), e.dispose(), t.lastLoss));
73
+ } catch (e) {
74
+ throw console.error(`Error processing batch at step ${t.step}:`, e), this.tf.dispose(), e;
74
75
  }
75
76
  }
76
- async createTrainValidationSplit(t, e = 32, s = 0.1) {
77
- const i = await this.datasetBuilder.createTextDataset(t, e, 0, 1 - s), a = await this.datasetBuilder.createTextDataset(
77
+ async createTrainValidationSplit(t, s = 32, e = 0.1) {
78
+ const i = await this.datasetBuilder.createTextDataset(t, s, 0, 1 - e), a = await this.datasetBuilder.createTextDataset(
78
79
  t,
79
- e,
80
- 1 - s,
80
+ s,
81
+ 1 - e,
81
82
  1
82
83
  );
83
84
  return { trainDataset: i, validationDataset: a };
84
85
  }
85
- async createDataset(t, e = 32) {
86
- return await this.datasetBuilder.createTextDataset(t, e);
86
+ async createDataset(t, s = 32) {
87
+ return await this.datasetBuilder.createTextDataset(t, s);
87
88
  }
88
89
  dispose() {
89
90
  this.optimizer && this.optimizer.dispose();
90
91
  }
91
92
  }
92
93
  export {
93
- u as default
94
+ g as default
94
95
  };