@genai-fi/nanogpt 0.7.0 → 0.7.2

This diff shows the changes between publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.
Files changed (152)
  1. package/dist/Generator.js +13 -9
  2. package/dist/NanoGPTModel.js +10 -10
  3. package/dist/{RealDiv-C4hOvYOZ.js → RealDiv-Dy0p8Bvo.js} +11 -11
  4. package/dist/{Reshape-BLijOA8h.js → Reshape-DH5srBP0.js} +2 -2
  5. package/dist/Reshape-DvudQDvJ.js +30 -0
  6. package/dist/TeachableLLM.js +9 -5
  7. package/dist/{TiedEmbedding-BLltddza.js → TiedEmbedding-BxOerUmB.js} +4 -4
  8. package/dist/{axis_util-DaAl5MER.js → axis_util-BzbKo31C.js} +1 -1
  9. package/dist/backend.js +2 -2
  10. package/dist/{backend_util-DWiwsi2N.js → backend_util-TE7aTPhZ.js} +40 -40
  11. package/dist/{broadcast_to-C4v-j9yA.js → broadcast_to-CdbwV-Dj.js} +2 -2
  12. package/dist/{concat-CsHeR4zV.js → concat-CsxrgovM.js} +1 -1
  13. package/dist/{dataset-JDyjG3QR.js → dataset-CtdBYwjo.js} +7 -7
  14. package/dist/{dropout-hpDwECTe.js → dropout-DYs5QFGQ.js} +11 -11
  15. package/dist/{gather-D0_gPiBz.js → gather-CMMy2KEG.js} +4 -4
  16. package/dist/{gelu-uyHP1x1f.js → gelu-C-dPj6Ku.js} +1 -1
  17. package/dist/{gpgpu_math-DJm3ZTAf.js → gpgpu_math-DGNLNL4I.js} +2 -2
  18. package/dist/{index-C0dhsYom.js → index-BoWRt-10.js} +126 -126
  19. package/dist/{index-BPPzKVdR.js → index-CLthM0TO.js} +1083 -1106
  20. package/dist/{kernel_funcs_utils-CwRTFqrc.js → kernel_funcs_utils-BYKWV8Aa.js} +3 -3
  21. package/dist/layers/BaseLayer.js +2 -2
  22. package/dist/layers/CausalSelfAttention.js +8 -8
  23. package/dist/layers/MLP.js +5 -5
  24. package/dist/layers/RMSNorm.js +3 -3
  25. package/dist/layers/RoPECache.js +4 -4
  26. package/dist/layers/TiedEmbedding.js +5 -5
  27. package/dist/layers/TransformerBlock.js +1 -1
  28. package/dist/loader/loadTransformers.js +1 -1
  29. package/dist/loader/oldZipLoad.js +11 -7
  30. package/dist/{log_sum_exp-D086OgZJ.js → log_sum_exp-DbjkV734.js} +8 -8
  31. package/dist/main.d.ts +11 -0
  32. package/dist/main.js +44 -27
  33. package/dist/{mat_mul-1nwdPkQ_.js → mat_mul-8m8pfdcx.js} +1 -1
  34. package/dist/{max-BQc2Aj-I.js → max-Ddnnb5xe.js} +3 -3
  35. package/dist/{mulmat_packed_gpu-Gzf3I9UV.js → mulmat_packed_gpu-VSekgsNv.js} +1 -1
  36. package/dist/{ones-D63HpSF_.js → ones-Dj0SDhHf.js} +2 -2
  37. package/dist/ops/adamAdjust.d.ts +2 -0
  38. package/dist/ops/adamAdjust.js +9 -0
  39. package/dist/ops/adamMoments.d.ts +2 -0
  40. package/dist/ops/adamMoments.js +9 -0
  41. package/dist/ops/appendCache.js +3 -3
  42. package/dist/ops/attentionMask.js +1 -1
  43. package/dist/ops/cpu/adamAdjust.d.ts +1 -0
  44. package/dist/ops/cpu/adamAdjust.js +18 -0
  45. package/dist/ops/cpu/adamMoments.d.ts +1 -0
  46. package/dist/ops/cpu/adamMoments.js +16 -0
  47. package/dist/ops/cpu/appendCache.js +2 -2
  48. package/dist/ops/cpu/attentionMask.js +5 -5
  49. package/dist/ops/cpu/fusedSoftmax.js +2 -2
  50. package/dist/ops/cpu/gatherSub.js +3 -3
  51. package/dist/ops/cpu/gelu.js +1 -1
  52. package/dist/ops/cpu/matMulGelu.js +2 -2
  53. package/dist/ops/cpu/matMulMul.js +1 -1
  54. package/dist/ops/cpu/mulDropout.js +1 -1
  55. package/dist/ops/cpu/normRMS.js +1 -1
  56. package/dist/ops/cpu/qkv.js +3 -3
  57. package/dist/ops/cpu/rope.js +5 -5
  58. package/dist/ops/cpu/scatterSub.js +11 -11
  59. package/dist/ops/fusedSoftmax.js +1 -1
  60. package/dist/ops/gatherSub.js +1 -1
  61. package/dist/ops/gelu.js +2 -2
  62. package/dist/ops/grads/attentionMask.js +1 -1
  63. package/dist/ops/grads/fusedSoftmax.js +2 -2
  64. package/dist/ops/grads/gelu.js +2 -2
  65. package/dist/ops/grads/matMulGelu.js +1 -1
  66. package/dist/ops/grads/normRMS.js +1 -1
  67. package/dist/ops/grads/qkv.js +1 -1
  68. package/dist/ops/grads/rope.js +1 -1
  69. package/dist/ops/matMulGelu.js +1 -1
  70. package/dist/ops/matMulMul.js +1 -1
  71. package/dist/ops/mulDrop.js +1 -1
  72. package/dist/ops/normRMS.js +1 -1
  73. package/dist/ops/qkv.js +1 -1
  74. package/dist/ops/rope.js +4 -4
  75. package/dist/ops/scatterSub.js +1 -1
  76. package/dist/ops/webgl/adamAdjust.d.ts +1 -0
  77. package/dist/ops/webgl/adamAdjust.js +50 -0
  78. package/dist/ops/webgl/adamMoments.d.ts +1 -0
  79. package/dist/ops/webgl/adamMoments.js +40 -0
  80. package/dist/ops/webgl/appendCache.js +1 -1
  81. package/dist/ops/webgl/attentionMask.js +1 -1
  82. package/dist/ops/webgl/fusedSoftmax.js +4 -4
  83. package/dist/ops/webgl/gatherSub.js +8 -8
  84. package/dist/ops/webgl/gelu.js +2 -2
  85. package/dist/ops/webgl/log.js +3 -3
  86. package/dist/ops/webgl/matMulGelu.js +4 -4
  87. package/dist/ops/webgl/matMulMul.js +1 -1
  88. package/dist/ops/webgl/mulDropout.js +1 -1
  89. package/dist/ops/webgl/normRMS.js +2 -2
  90. package/dist/ops/webgl/qkv.js +1 -1
  91. package/dist/ops/webgl/rope.js +1 -1
  92. package/dist/ops/webgl/scatterSub.js +1 -1
  93. package/dist/ops/webgpu/adamAdjust.d.ts +1 -0
  94. package/dist/ops/webgpu/adamAdjust.js +54 -0
  95. package/dist/ops/webgpu/adamMoments.d.ts +1 -0
  96. package/dist/ops/webgpu/adamMoments.js +58 -0
  97. package/dist/ops/webgpu/appendCache.js +22 -18
  98. package/dist/ops/webgpu/attentionMask.js +24 -17
  99. package/dist/ops/webgpu/gatherSub.js +17 -15
  100. package/dist/ops/webgpu/gelu.js +7 -6
  101. package/dist/ops/webgpu/index.js +3 -0
  102. package/dist/ops/webgpu/normRMS.js +35 -101
  103. package/dist/ops/webgpu/normRMSGrad.d.ts +1 -0
  104. package/dist/ops/webgpu/normRMSGrad.js +133 -0
  105. package/dist/ops/webgpu/qkv.js +21 -16
  106. package/dist/ops/webgpu/rope.js +37 -23
  107. package/dist/ops/webgpu/scatterSub.js +16 -13
  108. package/dist/ops/webgpu/utils/reductions.d.ts +9 -0
  109. package/dist/ops/webgpu/utils/reductions.js +68 -0
  110. package/dist/{ops-CIQLNshk.js → ops-BFGCx8Ri.js} +195 -219
  111. package/dist/{random_width-DkYP8W8N.js → random_width-sZORGo5k.js} +22 -21
  112. package/dist/{range-CYzpQY53.js → range-CRuAh-gd.js} +1 -1
  113. package/dist/{reciprocal-_A9yv27J.js → reciprocal-BvGAyKyu.js} +1 -1
  114. package/dist/{register_all_kernels-guvSxp7M.js → register_all_kernels-BwDSRN-f.js} +30 -29
  115. package/dist/{reshape-BMUzc1UY.js → reshape-CdBq1WJ6.js} +3 -3
  116. package/dist/{scatter_nd_util-IRBqKz_b.js → scatter_nd_util-DUstGbU1.js} +1 -1
  117. package/dist/{selu_util-Dt_iuXaq.js → selu_util-BJEXVvjX.js} +41 -41
  118. package/dist/{shared-CDu9S76h.js → shared-B8ztnyEk.js} +6 -6
  119. package/dist/{shared-BNa2q6jD.js → shared-wS99K7_n.js} +1 -1
  120. package/dist/{sin-Cocju-BY.js → sin-BeA3tsEd.js} +6 -6
  121. package/dist/slice-BiOsknYS.js +28 -0
  122. package/dist/{softmax-GPNK3o-U.js → softmax-Bv_6lyMX.js} +3 -3
  123. package/dist/{split-CHzJjxDv.js → split-B-dikLRw.js} +1 -1
  124. package/dist/{stack-Dpgg_1W1.js → stack-B17UN2nn.js} +1 -1
  125. package/dist/{sum-B8wEpKsg.js → sum-66ew2byf.js} +3 -3
  126. package/dist/{tensor-RvZVNmg0.js → tensor-JwS7ZYY6.js} +1 -1
  127. package/dist/{tensor2d-B_kyod7_.js → tensor2d-wxPAnDQy.js} +1 -1
  128. package/dist/training/Adam.d.ts +22 -0
  129. package/dist/training/Adam.js +93 -0
  130. package/dist/training/AdamExt.d.ts +1 -1
  131. package/dist/training/AdamExt.js +13 -12
  132. package/dist/training/DatasetBuilder.js +35 -32
  133. package/dist/training/FullTrainer.js +22 -22
  134. package/dist/training/Trainer.d.ts +1 -1
  135. package/dist/training/Trainer.js +32 -32
  136. package/dist/training/sparseCrossEntropy.d.ts +0 -4
  137. package/dist/training/sparseCrossEntropy.js +7 -7
  138. package/dist/utilities/arrayClose.d.ts +1 -0
  139. package/dist/utilities/arrayClose.js +11 -0
  140. package/dist/utilities/dummy.js +2 -2
  141. package/dist/utilities/generate.js +3 -3
  142. package/dist/utilities/multinomialCPU.js +2 -2
  143. package/dist/utilities/performance.d.ts +1 -1
  144. package/dist/utilities/performance.js +11 -11
  145. package/dist/utilities/profile.js +1 -1
  146. package/dist/utilities/safetensors.js +2 -2
  147. package/dist/utilities/weights.js +2 -2
  148. package/dist/{variable-DXEUOwew.js → variable-BuddVFLa.js} +1 -1
  149. package/dist/{webgpu_util-g13LvDIv.js → webgpu_program-PFzf1hAQ.js} +138 -215
  150. package/dist/webgpu_util-D____QpY.js +80 -0
  151. package/dist/{zeros-DCPCdFGq.js → zeros--BdLQ3oG.js} +4 -4
  152. package/package.json +1 -1
package/dist/training/Adam.js
@@ -0,0 +1,93 @@
+ import { adamAdjust as b } from "../ops/adamAdjust.js";
+ import { adamMoments as d } from "../ops/adamMoments.js";
+ import { O as g, e as h, t as o, d as B } from "../index-BoWRt-10.js";
+ import { z as M } from "../zeros--BdLQ3oG.js";
+ /**
+ * @license
+ * Copyright 2018 Google LLC. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+ class R extends g {
+ constructor(t, a, e, s = null) {
+ super(), this.learningRate = t, this.beta1 = a, this.beta2 = e, this.epsilon = s, this.accBeta1 = a, this.accBeta2 = e, s === null && (this.epsilon = h().backend.epsilon());
+ }
+ /** @nocollapse */
+ static get className() {
+ return "Adam";
+ }
+ accBeta1 = 0;
+ accBeta2 = 0;
+ accumulatedMoments = [];
+ applyGradients(t) {
+ const a = Array.isArray(t) ? t.map((e) => e.name) : Object.keys(t);
+ o(() => {
+ const e = 1 - this.accBeta1, s = 1 - this.accBeta2;
+ a.forEach((n, i) => {
+ const c = h().registeredVariables[n], u = !1;
+ this.accumulatedMoments[i] == null && (this.accumulatedMoments[i] = {
+ originalName: `${n}/m`,
+ variable: o(() => M([...c.shape, 2]).variable(u))
+ });
+ const r = Array.isArray(t) ? t[i].tensor : t[n];
+ if (r == null)
+ return;
+ const m = this.accumulatedMoments[i].variable, l = d(m, r, this.beta1, this.beta2);
+ m.assign(l);
+ const p = b(
+ l,
+ c,
+ e,
+ s,
+ this.epsilon ?? 1e-8,
+ this.learningRate
+ );
+ c.assign(p);
+ }), this.accBeta1 = this.accBeta1 * this.beta1, this.accBeta2 = this.accBeta2 * this.beta2;
+ }), this.incrementIterations();
+ }
+ dispose() {
+ this.accumulatedMoments != null && B(this.accumulatedMoments.map((t) => t.variable));
+ }
+ async getWeights() {
+ const t = [...this.accumulatedMoments];
+ return [await this.saveIterations()].concat(
+ t.map((a) => ({ name: a.originalName, tensor: a.variable }))
+ );
+ }
+ async setWeights(t) {
+ t = await this.extractIterations(t), o(() => {
+ this.accBeta1 = Math.pow(this.beta1, this.iterations_ + 1), this.accBeta2 = Math.pow(this.beta2, this.iterations_ + 1);
+ });
+ const a = t.length / 2, e = !1;
+ this.accumulatedMoments = t.slice(0, a).map((s) => ({
+ originalName: s.name,
+ variable: s.tensor.variable(e)
+ }));
+ }
+ getConfig() {
+ return {
+ learningRate: this.learningRate,
+ beta1: this.beta1,
+ beta2: this.beta2,
+ epsilon: this.epsilon
+ };
+ }
+ /** @nocollapse */
+ static fromConfig(t, a) {
+ return new t(a.learningRate, a.beta1, a.beta2, a.epsilon);
+ }
+ }
+ export {
+ R as AdamOptimizer
+ };
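Note: the new training/Adam.js reimplements the Adam optimizer (apparently extending tfjs-core's base Optimizer) on top of the package's fused adamMoments/adamAdjust ops, packing both moment estimates into a single [...shape, 2] variable per weight. As a reference for what the fused pair appears to compute, here is a minimal unfused Adam step in plain TypeScript; the function and array form are illustrative, not the package API.

// Reference (unfused) Adam step. adamMoments seemingly produces the new
// (m, v) pair stored in one [...shape, 2] tensor; adamAdjust then applies
// the update given the bias corrections 1 - beta1^t and 1 - beta2^t, which
// the class tracks incrementally as accBeta1/accBeta2.
function adamStep(
  w: Float32Array, g: Float32Array, // weights and their gradients
  m: Float32Array, v: Float32Array, // first/second moment estimates
  t: number,                        // 1-based step count
  lr = 1e-3, beta1 = 0.9, beta2 = 0.99, eps = 1e-8,
): void {
  const mCorr = 1 - Math.pow(beta1, t); // == 1 - accBeta1 after t steps
  const vCorr = 1 - Math.pow(beta2, t); // == 1 - accBeta2 after t steps
  for (let i = 0; i < w.length; i++) {
    m[i] = beta1 * m[i] + (1 - beta1) * g[i];        // adamMoments part
    v[i] = beta2 * v[i] + (1 - beta2) * g[i] * g[i];
    const mHat = m[i] / mCorr;                       // adamAdjust part
    const vHat = v[i] / vCorr;
    w[i] -= (lr * mHat) / (Math.sqrt(vHat) + eps);
  }
}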
package/dist/training/AdamExt.d.ts
@@ -1,5 +1,5 @@
- import { AdamOptimizer } from '@tensorflow/tfjs-core';
  import { NamedTensor, NamedVariableMap } from '@tensorflow/tfjs-core/dist/tensor_types';
+ import { AdamOptimizer } from './Adam';
  interface AdamExtConfig {
  warmupSteps: number;
  decaySteps: number;
package/dist/training/AdamExt.js
@@ -1,7 +1,8 @@
- import { A as r, a as c, b as h, c as g, e as o } from "../index-C0dhsYom.js";
- class u extends r {
- constructor(t, e, s, a, i) {
- super(t, e, s, a), this.config = i, this.startLearningRate = t;
+ import { a as r, b as c, c as h, e as o } from "../index-BoWRt-10.js";
+ import { AdamOptimizer as g } from "./Adam.js";
+ class y extends g {
+ constructor(t, e, s, i, a) {
+ super(t, e, s, i), this.config = a, this.startLearningRate = t;
  }
  step = 0;
  startLearningRate;
@@ -23,21 +24,21 @@ class u extends r {
  }
  decayVariable(t, e, s) {
  if (t && t.shape.length >= 2) {
- const a = c(t, h(s * e));
- t.assign(g(t, a)), a.dispose();
+ const i = r(t, c(s * e));
+ t.assign(h(t, i)), i.dispose();
  }
  }
  applyWeightDecay(t) {
- const e = this.config.weightDecay, s = this.learningRate, a = o().registeredVariables;
- Array.isArray(t) ? t.forEach(({ name: i }) => {
- const n = a[i];
+ const e = this.config.weightDecay, s = this.learningRate, i = o().registeredVariables;
+ Array.isArray(t) ? t.forEach(({ name: a }) => {
+ const n = i[a];
  this.decayVariable(n, e, s);
- }) : Object.keys(t).forEach((i) => {
- const n = a[i];
+ }) : Object.keys(t).forEach((a) => {
+ const n = i[a];
  this.decayVariable(n, e, s);
  });
  }
  }
  export {
- u as default
+ y as default
  };
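Note: AdamExt now extends the package's own AdamOptimizer rather than tfjs-core's. Reading the minified decayVariable (mul, scalar, sub), it appears to implement decoupled, AdamW-style weight decay, applied only to rank >= 2 tensors (weight matrices, not biases or norm gains). A one-line sketch of that rule, under those assumptions:

// Decoupled weight decay as decayVariable appears to apply it:
// w <- w - lr * weightDecay * w, skipped for tensors of rank < 2.
function decayWeights(w: Float32Array, lr: number, weightDecay: number): void {
  const k = lr * weightDecay;
  for (let i = 0; i < w.length; i++) w[i] -= k * w[i];
}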
package/dist/training/DatasetBuilder.js
@@ -1,5 +1,5 @@
- import { t as u } from "../index-C0dhsYom.js";
- import { d as z, i as f } from "../dataset-JDyjG3QR.js";
+ import { t as g } from "../index-BoWRt-10.js";
+ import { d as u, i as d } from "../dataset-CtdBYwjo.js";
  import "../index-Tf7vU29b.js";
  /**
  * @license
@@ -18,57 +18,60 @@ import "../index-Tf7vU29b.js";
  *
  * =============================================================================
  */
- function S(c) {
- return z(async () => {
- const t = await c();
- return f(() => t.next());
+ function z(r) {
+ return u(async () => {
+ const t = await r();
+ return d(() => t.next());
  });
  }
- const p = 8;
- async function y(c, t) {
- const s = await Promise.all(c.map((n) => t.encode(n))), i = t.eosToken >= 0;
- return s.map((n) => i ? [...n, t.eosToken] : n).flat();
+ const S = 8;
+ async function y(r, t) {
+ const s = await Promise.all(r.map((e) => t.encode(e))), o = t.eosToken >= 0, a = s.map((e) => o ? [...e, t.eosToken] : e).flat();
+ for (const e of a)
+ if (e < 0 || e >= t.vocabSize)
+ throw new Error(`Invalid token index ${e} found in tokenised data`);
+ return a;
  }
  class w {
  tokenizer;
  blockSize;
  pageSize;
  constructor(t, s = 128) {
- this.tokenizer = t, this.blockSize = s, this.pageSize = s * p;
+ this.tokenizer = t, this.blockSize = s, this.pageSize = s * S;
  }
  // Create dataset from text files
- async createTextDataset(t, s = 32, i, r) {
+ async createTextDataset(t, s = 32, o, a) {
  if (t.length < this.blockSize + 1)
  throw new Error(`Not enough tokens (${t.length}) for block size ${this.blockSize}`);
- if (i && i.size > t.length / this.pageSize / 2)
+ if (o && o.size > t.length / this.pageSize / 2)
  throw new Error("Too many masked pages - would leave insufficient training data");
- const n = (function* () {
- if (i && r) {
- const e = Array.from(i);
+ const e = (function* () {
+ if (o && a) {
+ const i = Array.from(o);
  for (; ; ) {
- const a = Math.floor(Math.random() * e.length), l = Math.floor(Math.random() * this.pageSize), o = e[a] * this.pageSize + l;
- if (o + this.blockSize + 1 > t.length)
+ const c = Math.floor(Math.random() * i.length), l = Math.floor(Math.random() * this.pageSize), n = i[c] * this.pageSize + l;
+ if (n + this.blockSize + 1 > t.length)
  continue;
- const h = t.slice(o, o + this.blockSize), g = t.slice(o + 1, o + this.blockSize + 1);
- yield { xs: h, ys: g };
+ const h = t.slice(n, n + this.blockSize), f = t.slice(n + 1, n + this.blockSize + 1);
+ yield { xs: h, ys: f };
  }
  } else
  for (; ; ) {
- const e = Math.floor(Math.random() * (t.length - this.blockSize - 1));
- if (i) {
- const o = Math.floor(e / this.pageSize), h = i.has(o);
- if (h && !r || !h && r)
+ const i = Math.floor(Math.random() * (t.length - this.blockSize - 1));
+ if (o) {
+ const n = Math.floor(i / this.pageSize), h = o.has(n);
+ if (h && !a || !h && a)
  continue;
  }
- const a = t.slice(e, e + this.blockSize), l = t.slice(e + 1, e + this.blockSize + 1);
- yield { xs: a, ys: l };
+ const c = t.slice(i, i + this.blockSize), l = t.slice(i + 1, i + this.blockSize + 1);
+ yield { xs: c, ys: l };
  }
  }).bind(this);
- return S(n).batch(s).map((e) => {
- const a = e;
- return u(() => ({
- xs: a.xs.cast("int32"),
- ys: a.ys.cast("int32")
+ return z(e).batch(s).map((i) => {
+ const c = i;
+ return g(() => ({
+ xs: c.xs.cast("int32"),
+ ys: c.ys.cast("int32")
  // this.tf.oneHot(batchData.ys.cast('int32'), this.tokenizer.vocabSize),
  }));
  }).prefetch(2);
@@ -76,6 +79,6 @@ class w {
  }
  export {
  w as DatasetBuilder,
- p as PAGE_FACTOR,
+ S as PAGE_FACTOR,
  y as flattenTokens
  };
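Note: flattenTokens now rejects any encoded token outside the tokenizer's vocabulary, and the generator samples random block windows against a page mask: a "page" is PAGE_FACTOR (8) blocks of tokens, and masked pages are reserved for validation. A simplified TypeScript sketch of the sampling rule (the real masked branch draws a random masked page directly rather than rejection-sampling, as here):

// Simplified page-masked window sampler, per the generator above.
const PAGE_FACTOR = 8;

function* sampleWindows(
  tokens: number[],
  blockSize: number,
  masked: Set<number>, // page indices held out for validation
  fromMasked: boolean, // true: yield only validation windows
): Generator<{ xs: number[]; ys: number[] }> {
  const pageSize = blockSize * PAGE_FACTOR;
  for (;;) {
    const start = Math.floor(Math.random() * (tokens.length - blockSize - 1));
    // Skip windows that start in the wrong split.
    if (masked.has(Math.floor(start / pageSize)) !== fromMasked) continue;
    yield {
      xs: tokens.slice(start, start + blockSize),         // input window
      ys: tokens.slice(start + 1, start + blockSize + 1), // shifted targets
    };
  }
}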
package/dist/training/FullTrainer.js
@@ -1,21 +1,21 @@
- import { generateText as T } from "../utilities/generate.js";
- import L from "./Trainer.js";
- import x from "./Evaluator.js";
- import { d as h } from "../index-C0dhsYom.js";
- import y from "../utilities/profile.js";
- const D = {
+ import { generateText as w } from "../utilities/generate.js";
+ import T from "./Trainer.js";
+ import L from "./Evaluator.js";
+ import { d as h } from "../index-BoWRt-10.js";
+ import x from "../utilities/profile.js";
+ const y = {
  desiredLoss: 0.01,
  logInterval: 1,
  maxSteps: 1e3
  };
- class I extends L {
+ class E extends T {
  constructor(i, e, r = 3e-4) {
  super(i, e, r);
  }
  // Train for multiple epochs using Dataset API - FIXED memory leaks
  async trainOnDataset(i, e, r) {
- const { desiredLoss: p, logInterval: g, onStep: l, prompt: m, maxSteps: u } = {
- ...D,
+ const { logInterval: g, onStep: l, prompt: c, maxSteps: u } = {
+ ...y,
  ...e
  }, n = Date.now(), t = {
  step: 0,
@@ -27,13 +27,13 @@ class I extends L {
  trainingDuration: 0,
  ...this.lastState || {}
  };
- this.lastState = t, await this.dummyPass(), this.model.trainable = !0, e?.advancedMetrics && (this.model.getProfiler() || (this.model.config.layerConfig.profiler = new y())), this.running = !0, t.logStartTime = n;
- const c = r ? new x(this.model, r) : void 0, f = await i.iterator();
+ this.lastState = t, await this.dummyPass(), this.model.trainable = !0, e?.advancedMetrics && (this.model.getProfiler() || (this.model.config.layerConfig.profiler = new x())), this.running = !0, t.logStartTime = n;
+ const m = r ? new L(this.model, r) : void 0, f = await i.iterator();
  try {
- for (; this.running && !(t.lastLoss < p); ) {
+ for (; this.running; ) {
  const o = await f.next();
  if (o.done) break;
- const d = o.value, S = this.trainBatch(t, d), s = {
+ const d = o.value, p = this.trainBatch(t, d), s = {
  loss: t.lastLoss,
  step: t.step,
  time: Date.now() - n,
@@ -42,21 +42,21 @@
  //gradientNorm: options?.advancedMetrics ? await state.gradientNorm : undefined,
  };
  if (this.model.log.push(s), t.step % g === 0) {
- await S;
- const v = Date.now();
- if (t.trainingDuration += v - t.logStartTime, c)
+ await p.data();
+ const S = Date.now();
+ if (t.trainingDuration += S - t.logStartTime, m)
  try {
- const a = await c.evaluate(5);
+ const a = await m.evaluate(5);
  t.validationLosses.push(a), s.valLoss = a;
  } catch (a) {
  console.error("Validation error:", a);
  }
  if (l) {
- if (m) {
- const w = await T(this.tokenizer, this.model, m, 100, {
+ if (c) {
+ const v = await w(this.tokenizer, this.model, c, 100, {
  temperature: 0.8
  });
- s.example = w;
+ s.example = v;
  }
  const a = {
  duration: t.trainingDuration,
@@ -68,7 +68,7 @@
  }
  t.logStartTime = Date.now();
  }
- t.step >= u && this.stop();
+ p.dispose(), t.step >= u && this.stop();
  }
  } catch (o) {
  throw console.error("Training error:", o), h(), o;
@@ -77,5 +77,5 @@
  }
  }
  export {
- I as default
+ E as default
  };
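Note: the loop drops the desiredLoss early-exit and no longer awaits a loss number on every step. trainBatch now returns the loss Scalar itself (see the Trainer.d.ts change below), the loop reads it back with await p.data() only on logging steps, and disposes it each iteration, so the GPU pipeline is not forced to synchronise per step. A hedged sketch of that deferred-readback pattern, with illustrative names:

// Deferred-readback training loop: the loss stays on-device, and the only
// GPU sync point (await loss.data()) happens at log intervals.
import { Scalar } from "@tensorflow/tfjs-core";

async function trainingLoop<B>(
  next: () => Promise<IteratorResult<B>>, // dataset iterator
  trainBatch: (batch: B) => Scalar,       // returns the loss tensor, no await
  logInterval: number,
  maxSteps: number,
): Promise<void> {
  for (let step = 1; step <= maxSteps; step++) {
    const r = await next();
    if (r.done) break;
    const loss = trainBatch(r.value);
    if (step % logInterval === 0) {
      console.log(`step ${step}: loss ${(await loss.data())[0]}`); // sync point
    }
    loss.dispose(); // dispose every step, matching the new p.dispose() call
  }
}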
package/dist/training/Trainer.d.ts
@@ -55,7 +55,7 @@ export default abstract class GPTTrainer {
  protected trainBatch(state: TrainingState, batch: {
  xs: Tensor;
  ys: Tensor;
- }): Promise<number>;
+ }): Scalar;
  abstract trainOnDataset(dataset: Dataset<{
  xs: Tensor;
  ys: Tensor;
package/dist/training/Trainer.js
@@ -1,10 +1,10 @@
- import { DatasetBuilder as m, flattenTokens as p, PAGE_FACTOR as u } from "./DatasetBuilder.js";
- import g from "./AdamExt.js";
- import { t as f, v as y, d as c } from "../index-C0dhsYom.js";
- import { z as h } from "../zeros-DCPCdFGq.js";
+ import { DatasetBuilder as h, flattenTokens as p, PAGE_FACTOR as g } from "./DatasetBuilder.js";
+ import u from "./AdamExt.js";
+ import { t as f, v as y, d as c } from "../index-BoWRt-10.js";
+ import { z as m } from "../zeros--BdLQ3oG.js";
  class x {
- constructor(t, s, e = 1e-3) {
- this.tokenizer = s, this.model = t, this.learningRate = e, this.resetOptimizer(), this.datasetBuilder = new m(s, t.config.gpt.blockSize);
+ constructor(t, e, a = 1e-3) {
+ this.tokenizer = e, this.model = t, this.learningRate = a, this.resetOptimizer(), this.datasetBuilder = new h(e, t.config.gpt.blockSize);
  }
  model;
  optimizer;
@@ -26,7 +26,7 @@
  }
  resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
  this.optimizer && this.optimizer.dispose();
- const s = new g(
+ const e = new u(
  t.learningRateFactor * this.learningRate,
  t.beta1,
  t.beta2,
@@ -38,7 +38,7 @@
  weightDecay: 0
  }
  );
- this.optimizer = s;
+ this.optimizer = e;
  }
  /*private async maxGradNorm(grads: NamedVariableMap): Promise<number> {
  let maxNorm = 0;
@@ -56,55 +56,55 @@
  );
  return maxNorm;
  }*/
- trainStep(t, s, e = !1) {
+ trainStep(t, e, a = !1) {
  return f(() => {
  this.model.getProfiler()?.startMemory();
- const { xs: a, ys: i } = s, o = () => {
- const [l, d] = this.model.forward({ training: !0 }, a, i);
+ const { xs: s, ys: i } = e, o = () => {
+ const [l, d] = this.model.forward({ training: !0 }, s, i);
  return l.dispose(), d;
  }, { value: n, grads: r } = y(o);
- return e ? this.model.getProfiler()?.endMemory("Training") : (this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), c(r)), n;
+ return a ? this.model.getProfiler()?.endMemory("Training") : (this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), c(r)), n;
  });
  }
  async dummyPass() {
- const t = h([1, this.model.config.gpt.blockSize], "int32"), s = h([1, this.model.config.gpt.blockSize], "int32");
+ const t = m([1, this.model.config.gpt.blockSize], "int32"), e = m([1, this.model.config.gpt.blockSize], "int32");
  try {
- const e = this.trainStep({}, { xs: t, ys: s }, !0);
- await e.data(), e.dispose();
- } catch (e) {
- console.error("Error during dummy pass:", e);
+ const a = this.trainStep({}, { xs: t, ys: e }, !0);
+ await a.data(), a.dispose();
+ } catch (a) {
+ console.error("Error during dummy pass:", a);
  } finally {
- t.dispose(), s.dispose();
+ t.dispose(), e.dispose();
  }
  }
- async trainBatch(t, s) {
+ trainBatch(t, e) {
  try {
- const e = this.trainStep(t, s, !1);
- return s.xs.dispose(), s.ys.dispose(), t.step++, t.totalSteps++, e.array().then((a) => (t.lastLoss = a, t.losses.push(t.lastLoss), e.dispose(), t.lastLoss));
- } catch (e) {
- throw console.error(`Error processing batch at step ${t.step}:`, e), c(), e;
+ const a = this.trainStep(t, e, !1);
+ return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, a;
+ } catch (a) {
+ throw console.error(`Error processing batch at step ${t.step}:`, a), c(), a;
  }
  }
- async createTrainValidationSplit(t, s = 32, e = 0.1) {
- const a = await p(t, this.tokenizer), i = /* @__PURE__ */ new Set();
- if (e > 0) {
- const r = Math.floor(a.length / (this.datasetBuilder.blockSize * u)), l = Math.max(1, Math.floor(r * e));
+ async createTrainValidationSplit(t, e = 32, a = 0.1) {
+ const s = await p(t, this.tokenizer), i = /* @__PURE__ */ new Set();
+ if (a > 0) {
+ const r = Math.floor(s.length / (this.datasetBuilder.blockSize * g)), l = Math.max(1, Math.floor(r * a));
  for (; i.size < l; ) {
  const d = Math.floor(Math.random() * r);
  i.add(d);
  }
  }
- const o = await this.datasetBuilder.createTextDataset(a, s, i, !1), n = await this.datasetBuilder.createTextDataset(
- a,
+ const o = await this.datasetBuilder.createTextDataset(s, e, i, !1), n = await this.datasetBuilder.createTextDataset(
  s,
+ e,
  i,
  !0
  );
  return { trainDataset: o, validationDataset: n };
  }
- async createDataset(t, s = 32) {
- const e = await p(t, this.tokenizer);
- return await this.datasetBuilder.createTextDataset(e, s);
+ async createDataset(t, e = 32) {
+ const a = await p(t, this.tokenizer);
+ return await this.datasetBuilder.createTextDataset(a, e);
  }
  dispose() {
  this.optimizer && this.optimizer.dispose();
package/dist/training/sparseCrossEntropy.d.ts
@@ -4,8 +4,4 @@ import * as tf from '@tensorflow/tfjs-core';
  * This version handles potential numerical issues better
  */
  export declare function sparseSoftmaxCrossEntropy(logits: tf.Tensor, labels: tf.Tensor): tf.Tensor;
- /**
- * Custom gradient implementation for sparse cross-entropy
- * This ensures proper backpropagation
- */
  export declare function createSoftmaxCrossEntropyWithGrad(): (...args: tf.Tensor[]) => tf.Tensor<tf.Rank>;
package/dist/training/sparseCrossEntropy.js
@@ -1,22 +1,22 @@
  import { gatherSub as x } from "../ops/gatherSub.js";
  import { scatterSub as L } from "../ops/scatterSub.js";
- import { q as C, t as u, z as E, c as G } from "../index-C0dhsYom.js";
- import { s as y } from "../softmax-GPNK3o-U.js";
- import { m as z } from "../max-BQc2Aj-I.js";
- import { l as v } from "../log_sum_exp-D086OgZJ.js";
+ import { y, t as u, z as C, c as E } from "../index-BoWRt-10.js";
+ import { s as G } from "../softmax-Bv_6lyMX.js";
+ import { m as z } from "../max-Ddnnb5xe.js";
+ import { l as v } from "../log_sum_exp-DbjkV734.js";
  function k(t, s) {
  return u(() => {
- const n = t.shape[t.shape.length - 1], c = t.shape.slice(0, -1).reduce((o, e) => o * e, 1), h = t.shape.length > 2 ? t.reshape([c, n]) : t, p = s.shape.length > 1 ? s.reshape([c]).cast("int32") : s.cast("int32"), r = z(h, -1, !0), a = G(h, r), m = v(a, -1);
+ const n = t.shape[t.shape.length - 1], c = t.shape.slice(0, -1).reduce((o, e) => o * e, 1), h = t.shape.length > 2 ? t.reshape([c, n]) : t, p = s.shape.length > 1 ? s.reshape([c]).cast("int32") : s.cast("int32"), r = z(h, -1, !0), a = E(h, r), m = v(a, -1);
  return x(m, p, a);
  });
  }
  function A() {
- return C(
+ return y(
  // @ts-expect-error Invalid params
  (s, n, d) => {
  const c = s.shape[s.shape.length - 1], p = s.shape.slice(0, -1).reduce((o, e) => o * e, 1), r = s.reshape([p, c]), a = n.reshape([p]).cast("int32"), m = k(r, a);
  return d([r, a]), r.dispose(), a.dispose(), { value: m, gradFunc: (o, e) => u(() => {
- const S = e[0], f = e[1], b = y(S), l = L(b, f, o), g = E(n);
+ const S = e[0], f = e[1], b = G(S), l = L(b, f, o), g = C(n);
  return [l.reshape(s.shape), g];
  }) };
  }
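Note: the rewrite keeps the same max-shifted formulation. Per row, the minified k computes loss = logSumExp(logits - max) - (logits - max)[label], with gatherSub fusing the final subtraction, and the custom gradient is softmax minus a one-hot at the label, fused by scatterSub. Written out per row in plain TypeScript for reference:

// Numerically stable sparse softmax cross-entropy for one row of logits.
function sparseCrossEntropyRow(logits: number[], label: number): number {
  const max = Math.max(...logits); // stabilise exp()
  const shifted = logits.map((z) => z - max);
  const lse = Math.log(shifted.reduce((s, z) => s + Math.exp(z), 0));
  return lse - shifted[label]; // == -log(softmax(logits)[label])
}

// Matching gradient: softmax(logits) with 1 subtracted at the label index.
function sparseCrossEntropyGradRow(logits: number[], label: number): number[] {
  const max = Math.max(...logits);
  const exps = logits.map((z) => Math.exp(z - max));
  const sum = exps.reduce((s, e) => s + e, 0);
  return exps.map((e, i) => e / sum - (i === label ? 1 : 0));
}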
package/dist/utilities/arrayClose.d.ts
@@ -0,0 +1 @@
+ export declare function arraysClose(a: unknown, b: unknown, epsilon?: number): boolean;
package/dist/utilities/arrayClose.js
@@ -0,0 +1,11 @@
+ function f(r, e, n = 1e-5) {
+ if (Array.isArray(r) && Array.isArray(e)) {
+ if (r.length !== e.length) return !1;
+ for (let t = 0; t < r.length; ++t)
+ if (!f(r[t], e[t], n)) return !1;
+ return !0;
+ } else return typeof r == "number" && typeof e == "number" ? r === -1 / 0 && e === -1 / 0 ? !0 : Math.abs(r - e) < n : !1;
+ }
+ export {
+ f as arraysClose
+ };
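Note: the new arraysClose helper recursively compares nested numeric arrays element-wise within an epsilon (default 1e-5), and treats two -Infinity values as equal, which is useful when checking masked attention logits. Illustrative usage; the deep import path is an assumption and the helper may only be intended for internal tests:

import { arraysClose } from "@genai-fi/nanogpt/dist/utilities/arrayClose.js"; // path assumed

arraysClose([[1, 2], [3, -Infinity]], [[1.000001, 2], [3, -Infinity]]); // true: |diff| < 1e-5, -Inf matches -Inf
arraysClose([1, 2], [1, 2, 3]); // false: lengths differ
arraysClose([1, 2], 3);         // false: array never equals a plain number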
package/dist/utilities/dummy.js
@@ -1,5 +1,5 @@
- import { m as y, v as P, e as S } from "../index-C0dhsYom.js";
- import { z as i } from "../zeros-DCPCdFGq.js";
+ import { m as y, v as P, e as S } from "../index-BoWRt-10.js";
+ import { z as i } from "../zeros--BdLQ3oG.js";
  async function w(s) {
  const t = i([1, s.config.gpt.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
  await e.data(), e.dispose(), n && n.dispose(), t.dispose();
package/dist/utilities/generate.js
@@ -1,6 +1,6 @@
- import "../index-C0dhsYom.js";
- import { t as m } from "../tensor2d-B_kyod7_.js";
- import { c as u } from "../concat-CsHeR4zV.js";
+ import "../index-BoWRt-10.js";
+ import { t as m } from "../tensor2d-wxPAnDQy.js";
+ import { c as u } from "../concat-CsxrgovM.js";
  async function v(o, r, a, c, f) {
  if (c <= 0)
  throw new Error("Length must be a positive integer");
package/dist/utilities/multinomialCPU.js
@@ -1,5 +1,5 @@
- import "../index-C0dhsYom.js";
- import { t as e } from "../tensor2d-B_kyod7_.js";
+ import "../index-BoWRt-10.js";
+ import { t as e } from "../tensor2d-wxPAnDQy.js";
  function l(n) {
  let r = 0;
  const i = Math.random();
package/dist/utilities/performance.d.ts
@@ -1,2 +1,2 @@
  import { Tensor } from '@tensorflow/tfjs-core';
- export default function performanceTest(fn: () => Tensor, iterations?: number): Promise<number>;
+ export default function performanceTest(fn: () => Tensor, iterations?: number, allowPromise?: boolean): Promise<number>;
package/dist/utilities/performance.js
@@ -1,16 +1,16 @@
- import { t as r } from "../index-C0dhsYom.js";
- async function d(s, o = 10) {
- for (let e = 0; e < 10; e++) {
- const t = s();
- await t.data(), t.dispose();
+ import { t as s } from "../index-BoWRt-10.js";
+ async function f(e, o = 10, r = !1) {
+ for (let t = 0; t < 100; t++) {
+ const a = r ? await e() : s(e);
+ t === 99 && await a.data(), a.dispose();
  }
- const a = performance.now();
- for (let e = 0; e < o; e++) {
- const t = r(s);
- e === o - 1 && await t.data(), t.dispose();
+ const n = performance.now();
+ for (let t = 0; t < o; t++) {
+ const a = r ? await e() : s(e);
+ t === o - 1 && await a.data(), a.dispose();
  }
- return (performance.now() - n) / o;
+ return (performance.now() - n) / o;
  }
  export {
- d as default
+ f as default
  };
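Note: performanceTest now warms up for 100 iterations (previously 10) and takes an allowPromise flag, so asynchronous producers can be timed by awaiting fn() directly instead of running it inside tidy. A usage sketch; the deep import path is an assumption:

// Hypothetical usage of the updated benchmark helper (default export of
// dist/utilities/performance.js).
import performanceTest from "@genai-fi/nanogpt/dist/utilities/performance.js";
import * as tf from "@tensorflow/tfjs-core";

async function benchMatMul(): Promise<void> {
  const a = tf.randomNormal([256, 256]);
  const b = tf.randomNormal([256, 256]);
  // Synchronous kernel: runs inside tidy, with 100 warmup iterations first.
  const ms = await performanceTest(() => tf.matMul(a, b), 20);
  console.log(`matMul: ${ms.toFixed(2)} ms/iteration`);
  // For an async fn, pass allowPromise = true so the helper awaits fn()
  // itself rather than wrapping it in tidy (cast needed: the .d.ts still
  // types fn as () => Tensor).
  a.dispose();
  b.dispose();
}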
package/dist/utilities/profile.js
@@ -1,4 +1,4 @@
- import { m as a } from "../index-C0dhsYom.js";
+ import { m as a } from "../index-BoWRt-10.js";
  const s = 1024 * 1024;
  class l {
  log = /* @__PURE__ */ new Map();
package/dist/utilities/safetensors.js
@@ -1,5 +1,5 @@
- import "../index-C0dhsYom.js";
- import { t as y } from "../tensor-RvZVNmg0.js";
+ import "../index-BoWRt-10.js";
+ import { t as y } from "../tensor-JwS7ZYY6.js";
  function l(t) {
  if (t === "float32") return "F32";
  if (t === "int32") return "I32";
package/dist/utilities/weights.js
@@ -1,5 +1,5 @@
- import "../index-C0dhsYom.js";
- import { t as p } from "../tensor-RvZVNmg0.js";
+ import "../index-BoWRt-10.js";
+ import { t as p } from "../tensor-JwS7ZYY6.js";
  function h(n) {
  const e = n.reduce((s, o) => s + o.length, 0), a = new Float32Array(e);
  let t = 0;
package/dist/variable-BuddVFLa.js
@@ -1,4 +1,4 @@
- import { E as i } from "./index-C0dhsYom.js";
+ import { E as i } from "./index-BoWRt-10.js";
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.