@genai-fi/nanogpt 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/dist/Generator.d.ts +36 -4
  2. package/dist/Generator.js +183 -69
  3. package/dist/{RealDiv-Dy0p8Bvo.js → RealDiv-N8TpOMYv.js} +14 -14
  4. package/dist/{Reshape-DvudQDvJ.js → Reshape-B-lWQRnF.js} +1 -1
  5. package/dist/{Reshape-DH5srBP0.js → Reshape-Bo8HzP8V.js} +5 -5
  6. package/dist/TeachableLLM.d.ts +6 -6
  7. package/dist/TeachableLLM.js +51 -50
  8. package/dist/Trainer.d.ts +19 -3
  9. package/dist/Trainer.js +71 -28
  10. package/dist/{axis_util-BzbKo31C.js → axis_util-DubwyOhW.js} +3 -3
  11. package/dist/backend.js +2 -2
  12. package/dist/{backend_util-TE7aTPhZ.js → backend_util-BJ-_jSeK.js} +46 -46
  13. package/dist/{broadcast_to-CdbwV-Dj.js → broadcast_to-BYfCp5iL.js} +2 -2
  14. package/dist/{concat-CsxrgovM.js → concat-BmDqqFsa.js} +1 -1
  15. package/dist/{dataset-CtdBYwjo.js → dataset-CJmEGu6D.js} +5 -5
  16. package/dist/{dropout-DYs5QFGQ.js → dropout-sx0sjVAT.js} +8 -8
  17. package/dist/exports_initializers-DAKM8UO9.js +16 -0
  18. package/dist/{gather-CMMy2KEG.js → gather-C1siEkdp.js} +1 -1
  19. package/dist/{gelu-C-dPj6Ku.js → gelu-Bd3UBBxg.js} +1 -1
  20. package/dist/{gpgpu_math-DGNLNL4I.js → gpgpu_math-TFLxaLkw.js} +26 -26
  21. package/dist/{index-CLthM0TO.js → index-BaPo_0H8.js} +185 -185
  22. package/dist/{index-BoWRt-10.js → index-CUQrfsw_.js} +266 -265
  23. package/dist/{kernel_funcs_utils-BYKWV8Aa.js → kernel_funcs_utils-P9aFa232.js} +9 -9
  24. package/dist/layers/BaseLayer.d.ts +8 -13
  25. package/dist/layers/BaseLayer.js +25 -13
  26. package/dist/layers/CausalSelfAttention.d.ts +3 -2
  27. package/dist/layers/CausalSelfAttention.js +28 -28
  28. package/dist/layers/MLP.d.ts +3 -2
  29. package/dist/layers/MLP.js +16 -20
  30. package/dist/layers/PositionEmbedding.d.ts +9 -0
  31. package/dist/layers/PositionEmbedding.js +45 -0
  32. package/dist/layers/RMSNorm.d.ts +3 -2
  33. package/dist/layers/RMSNorm.js +6 -6
  34. package/dist/layers/RoPECache.d.ts +1 -1
  35. package/dist/layers/RoPECache.js +4 -4
  36. package/dist/layers/TiedEmbedding.d.ts +3 -2
  37. package/dist/layers/TiedEmbedding.js +29 -7
  38. package/dist/layers/TransformerBlock.d.ts +3 -2
  39. package/dist/layers/TransformerBlock.js +1 -1
  40. package/dist/loader/load.d.ts +2 -2
  41. package/dist/loader/loadHF.d.ts +2 -2
  42. package/dist/loader/loadTransformers.d.ts +4 -2
  43. package/dist/loader/loadTransformers.js +10 -9
  44. package/dist/loader/newZipLoad.d.ts +2 -2
  45. package/dist/loader/oldZipLoad.d.ts +2 -2
  46. package/dist/loader/oldZipLoad.js +42 -51
  47. package/dist/loader/save.d.ts +8 -0
  48. package/dist/loader/save.js +62 -0
  49. package/dist/{log_sum_exp-DbjkV734.js → log_sum_exp-C142qZqY.js} +14 -14
  50. package/dist/main.d.ts +5 -4
  51. package/dist/main.js +22 -18
  52. package/dist/{mat_mul-8m8pfdcx.js → mat_mul-DMkduNJu.js} +1 -1
  53. package/dist/{max-Ddnnb5xe.js → max-B3JOcNGb.js} +1 -1
  54. package/dist/mod-uUuj4gSb.js +27 -0
  55. package/dist/models/NanoGPTV1.d.ts +15 -0
  56. package/dist/models/NanoGPTV1.js +71 -0
  57. package/dist/{config.d.ts → models/config.d.ts} +1 -0
  58. package/dist/{config.js → models/config.js} +1 -0
  59. package/dist/models/factory.d.ts +3 -0
  60. package/dist/models/factory.js +14 -0
  61. package/dist/models/model.d.ts +26 -0
  62. package/dist/models/model.js +68 -0
  63. package/dist/{mulmat_packed_gpu-VSekgsNv.js → mulmat_packed_gpu-Cm2gw-c8.js} +1 -1
  64. package/dist/{ones-Dj0SDhHf.js → ones-ZdgQGBCP.js} +2 -2
  65. package/dist/ops/adamAdjust.js +1 -1
  66. package/dist/ops/adamMoments.js +1 -1
  67. package/dist/ops/appendCache.js +3 -3
  68. package/dist/ops/attentionMask.js +1 -1
  69. package/dist/ops/cpu/adamAdjust.js +9 -9
  70. package/dist/ops/cpu/adamMoments.js +2 -2
  71. package/dist/ops/cpu/appendCache.js +2 -2
  72. package/dist/ops/cpu/attentionMask.js +5 -5
  73. package/dist/ops/cpu/fusedSoftmax.js +2 -2
  74. package/dist/ops/cpu/gatherSub.js +3 -3
  75. package/dist/ops/cpu/gelu.js +1 -1
  76. package/dist/ops/cpu/matMulGelu.js +2 -2
  77. package/dist/ops/cpu/matMulMul.js +1 -1
  78. package/dist/ops/cpu/mulDropout.js +1 -1
  79. package/dist/ops/cpu/normRMS.js +1 -1
  80. package/dist/ops/cpu/qkv.js +3 -3
  81. package/dist/ops/cpu/rope.js +5 -5
  82. package/dist/ops/cpu/scatterSub.js +11 -11
  83. package/dist/ops/fusedSoftmax.js +1 -1
  84. package/dist/ops/gatherSub.js +1 -1
  85. package/dist/ops/gelu.js +2 -2
  86. package/dist/ops/grads/attentionMask.js +1 -1
  87. package/dist/ops/grads/fusedSoftmax.js +2 -2
  88. package/dist/ops/grads/gelu.js +2 -2
  89. package/dist/ops/grads/matMulGelu.js +1 -1
  90. package/dist/ops/grads/normRMS.js +1 -1
  91. package/dist/ops/grads/qkv.js +1 -1
  92. package/dist/ops/grads/rope.js +1 -1
  93. package/dist/ops/matMulGelu.js +1 -1
  94. package/dist/ops/matMulMul.js +1 -1
  95. package/dist/ops/mulDrop.js +1 -1
  96. package/dist/ops/normRMS.js +1 -1
  97. package/dist/ops/qkv.js +1 -1
  98. package/dist/ops/rope.js +4 -4
  99. package/dist/ops/scatterSub.js +1 -1
  100. package/dist/ops/webgl/adamAdjust.js +2 -2
  101. package/dist/ops/webgl/adamMoments.js +1 -1
  102. package/dist/ops/webgl/appendCache.js +1 -1
  103. package/dist/ops/webgl/attentionMask.js +1 -1
  104. package/dist/ops/webgl/fusedSoftmax.js +4 -4
  105. package/dist/ops/webgl/gatherSub.js +1 -1
  106. package/dist/ops/webgl/gelu.js +2 -2
  107. package/dist/ops/webgl/log.js +3 -3
  108. package/dist/ops/webgl/matMulGelu.js +10 -10
  109. package/dist/ops/webgl/matMulMul.js +1 -1
  110. package/dist/ops/webgl/mulDropout.js +1 -1
  111. package/dist/ops/webgl/normRMS.js +2 -2
  112. package/dist/ops/webgl/qkv.js +1 -1
  113. package/dist/ops/webgl/rope.js +1 -1
  114. package/dist/ops/webgl/scatterSub.js +1 -1
  115. package/dist/ops/webgpu/adamAdjust.js +3 -3
  116. package/dist/ops/webgpu/adamMoments.js +3 -3
  117. package/dist/ops/webgpu/appendCache.js +3 -3
  118. package/dist/ops/webgpu/attentionMask.js +3 -3
  119. package/dist/ops/webgpu/gatherSub.js +3 -3
  120. package/dist/ops/webgpu/gelu.js +3 -3
  121. package/dist/ops/webgpu/normRMS.js +2 -2
  122. package/dist/ops/webgpu/normRMSGrad.js +5 -5
  123. package/dist/ops/webgpu/qkv.js +3 -3
  124. package/dist/ops/webgpu/rope.js +3 -3
  125. package/dist/ops/webgpu/scatterSub.js +3 -3
  126. package/dist/ops/webgpu/utils/reductions.js +4 -4
  127. package/dist/{ops-BFGCx8Ri.js → ops-C_1K_-35.js} +103 -103
  128. package/dist/{random_width-sZORGo5k.js → random_width-D8Pwy_na.js} +136 -136
  129. package/dist/{range-CRuAh-gd.js → range-LVHrSLdi.js} +1 -1
  130. package/dist/{reciprocal-BvGAyKyu.js → reciprocal-CaR9e67G.js} +1 -1
  131. package/dist/{register_all_kernels-BwDSRN-f.js → register_all_kernels-DUshvVWP.js} +2026 -2049
  132. package/dist/{reshape-CdBq1WJ6.js → reshape-DEfQGSin.js} +1 -1
  133. package/dist/{scatter_nd_util-DUstGbU1.js → scatter_nd_util-CUPPNLaA.js} +1 -1
  134. package/dist/{selu_util-BJEXVvjX.js → selu_util-8vv5JxQV.js} +3 -3
  135. package/dist/{shared-B8ztnyEk.js → shared-CkNorDcU.js} +83 -83
  136. package/dist/{shared-wS99K7_n.js → shared-D1elLckx.js} +1 -1
  137. package/dist/{sin-BeA3tsEd.js → sin-D2CKKmyR.js} +1 -1
  138. package/dist/{slice-BiOsknYS.js → slice-BnyE-M_7.js} +1 -1
  139. package/dist/{softmax-Bv_6lyMX.js → softmax-DLoZWYBx.js} +1 -1
  140. package/dist/{split-B-dikLRw.js → split-By_n4TKP.js} +1 -1
  141. package/dist/{stack-B17UN2nn.js → stack-DkdFLq37.js} +1 -1
  142. package/dist/{sum-66ew2byf.js → sum-l_0SqM4h.js} +3 -3
  143. package/dist/{tensor-JwS7ZYY6.js → tensor-BAQdLqoU.js} +1 -1
  144. package/dist/{tensor2d-wxPAnDQy.js → tensor2d-BHy261cI.js} +1 -1
  145. package/dist/training/Adam.js +2 -2
  146. package/dist/training/AdamExt.js +1 -1
  147. package/dist/training/DatasetBuilder.js +2 -2
  148. package/dist/training/Evaluator.d.ts +2 -2
  149. package/dist/training/FullTrainer.d.ts +16 -3
  150. package/dist/training/FullTrainer.js +91 -53
  151. package/dist/training/Trainer.d.ts +25 -3
  152. package/dist/training/Trainer.js +39 -47
  153. package/dist/training/sparseCrossEntropy.js +9 -9
  154. package/dist/utilities/dummy.d.ts +4 -4
  155. package/dist/utilities/dummy.js +13 -13
  156. package/dist/utilities/multinomialCPU.js +2 -2
  157. package/dist/utilities/parameters.d.ts +1 -1
  158. package/dist/utilities/performance.js +1 -1
  159. package/dist/utilities/profile.js +1 -1
  160. package/dist/utilities/safetensors.js +2 -2
  161. package/dist/utilities/weights.js +2 -2
  162. package/dist/{variable-BuddVFLa.js → variable-C9hihzDB.js} +1 -1
  163. package/dist/{webgpu_program-PFzf1hAQ.js → webgpu_program-dFEVbDPL.js} +1 -1
  164. package/dist/{webgpu_util-D____QpY.js → webgpu_util-DLImlSc6.js} +27 -27
  165. package/dist/{zeros--BdLQ3oG.js → zeros-VZ72lWXM.js} +1 -1
  166. package/package.json +2 -3
  167. package/dist/NanoGPTModel.d.ts +0 -52
  168. package/dist/NanoGPTModel.js +0 -203
  169. package/dist/TiedEmbedding-BxOerUmB.js +0 -43
  170. package/dist/utilities/generate.d.ts +0 -3
  171. package/dist/utilities/generate.js +0 -22
  172. package/dist/utilities/save.d.ts +0 -9
  173. package/dist/utilities/save.js +0 -61
@@ -1,10 +1,23 @@
1
1
  import { ITokeniser } from '../tokeniser/type';
2
- import { default as NanoGPT } from '../NanoGPTModel';
3
- import { default as GPTTrainer, TrainingOptions } from './Trainer';
2
+ import { default as GPTTrainer, TrainingLogEntry, TrainingOptions, TrainingProgress } from './Trainer';
4
3
  import { Tensor } from '@tensorflow/tfjs-core';
5
4
  import { Dataset } from '@tensorflow/tfjs-data';
5
+ import { default as Model, ModelForwardAttributes } from '../models/model';
6
6
  export default class FullTrainer extends GPTTrainer {
7
- constructor(model: NanoGPT, tokenizer: ITokeniser, learningRate?: number);
7
+ constructor(model: Model<ModelForwardAttributes>, tokenizer: ITokeniser, learningRate?: number);
8
+ private createEmptyState;
9
+ private createLogEntry;
10
+ private createProgress;
11
+ stepDataset(dataset: Dataset<{
12
+ xs: Tensor;
13
+ ys: Tensor;
14
+ }>, options: Partial<TrainingOptions>, validationDataset?: Dataset<{
15
+ xs: Tensor;
16
+ ys: Tensor;
17
+ }>): Promise<{
18
+ log: TrainingLogEntry;
19
+ progress: TrainingProgress;
20
+ }>;
8
21
  trainOnDataset(dataset: Dataset<{
9
22
  xs: Tensor;
10
23
  ys: Tensor;
@@ -1,81 +1,119 @@
1
- import { generateText as w } from "../utilities/generate.js";
2
- import T from "./Trainer.js";
3
- import L from "./Evaluator.js";
4
- import { d as h } from "../index-BoWRt-10.js";
5
- import x from "../utilities/profile.js";
6
- const y = {
1
+ import y from "./Trainer.js";
2
+ import v from "./Evaluator.js";
3
+ import { d as S } from "../index-CUQrfsw_.js";
4
+ import w from "../utilities/profile.js";
5
+ const f = {
7
6
  desiredLoss: 0.01,
8
7
  logInterval: 1,
9
8
  maxSteps: 1e3
10
9
  };
11
- class E extends T {
12
- constructor(i, e, r = 3e-4) {
13
- super(i, e, r);
10
+ class b extends y {
11
+ constructor(s, t, a = 3e-4) {
12
+ super(s, t, a);
14
13
  }
15
- // Train for multiple epochs using Dataset API - FIXED memory leaks
16
- async trainOnDataset(i, e, r) {
17
- const { logInterval: g, onStep: l, prompt: c, maxSteps: u } = {
18
- ...y,
19
- ...e
20
- }, n = Date.now(), t = {
14
+ createEmptyState() {
15
+ return {
21
16
  step: 0,
22
17
  lastLoss: 1e6,
23
18
  totalSteps: 0,
24
19
  losses: [],
25
20
  validationLosses: [],
26
- logStartTime: n,
21
+ logStartTime: 0,
27
22
  trainingDuration: 0,
28
23
  ...this.lastState || {}
29
24
  };
30
- this.lastState = t, await this.dummyPass(), this.model.trainable = !0, e?.advancedMetrics && (this.model.getProfiler() || (this.model.config.layerConfig.profiler = new x())), this.running = !0, t.logStartTime = n;
31
- const m = r ? new L(this.model, r) : void 0, f = await i.iterator();
25
+ }
26
+ createLogEntry(s, t, a, n) {
27
+ return {
28
+ loss: s.lastLoss,
29
+ step: s.step,
30
+ time: Date.now() - t,
31
+ batchSize: a,
32
+ learningRate: n ? this.optimizer.lr : void 0
33
+ };
34
+ }
35
+ createProgress(s, t, a) {
36
+ return {
37
+ duration: s.trainingDuration,
38
+ totalSamples: s.totalSteps * t.batchSize,
39
+ samplesPerSecond: s.totalSteps * t.batchSize / (s.trainingDuration / 1e3),
40
+ memory: a ? this.model.getProfiler()?.getPeakMemory() || 0 : void 0
41
+ };
42
+ }
43
+ async stepDataset(s, t, a) {
44
+ const { logInterval: n } = {
45
+ ...f,
46
+ ...t
47
+ }, l = Date.now(), r = this.createEmptyState();
48
+ this.lastState = r, await this.dummyPass(), this.model.trainable = !0, t?.advancedMetrics && (this.model.getProfiler() || this.model.setProfiler(new w())), this.running = !0, r.logStartTime = l;
49
+ const m = a ? new v(this.model, a) : void 0, e = await s.iterator();
50
+ try {
51
+ for (; this.running; ) {
52
+ const i = await e.next();
53
+ if (i.done) break;
54
+ const g = i.value, o = this.trainBatch(r, g), c = this.createLogEntry(r, l, g.xs.shape[0], t?.advancedMetrics);
55
+ if (this.model.trainingState = {
56
+ steps: r.totalSteps,
57
+ learningRate: this.optimizer.lr,
58
+ batchSize: g.xs.shape[0],
59
+ loss: r.lastLoss
60
+ }, r.step % n === 0) {
61
+ await o.data();
62
+ const u = Date.now();
63
+ if (r.trainingDuration += u - r.logStartTime, m)
64
+ try {
65
+ const h = await m.evaluate(5);
66
+ r.validationLosses.push(h), c.valLoss = h;
67
+ } catch (h) {
68
+ console.error("Validation error:", h);
69
+ }
70
+ const p = this.createProgress(r, c, t?.advancedMetrics);
71
+ return o.dispose(), this.stop(), { log: c, progress: p };
72
+ }
73
+ o.dispose();
74
+ }
75
+ } catch (i) {
76
+ throw console.error("Training error:", i), S(), i;
77
+ }
78
+ throw S(), this.running = !1, new Error("No log returned before training stopped.");
79
+ }
80
+ // Train for multiple epochs using Dataset API - FIXED memory leaks
81
+ async trainOnDataset(s, t, a) {
82
+ const { logInterval: n, onStep: l, maxSteps: r } = {
83
+ ...f,
84
+ ...t
85
+ }, m = Date.now(), e = this.createEmptyState();
86
+ this.lastState = e, await this.dummyPass(), this.model.trainable = !0, t?.advancedMetrics && (this.model.getProfiler() || this.model.setProfiler(new w())), this.running = !0, e.logStartTime = m;
87
+ const i = a ? new v(this.model, a) : void 0, g = await s.iterator();
32
88
  try {
33
89
  for (; this.running; ) {
34
- const o = await f.next();
90
+ const o = await g.next();
35
91
  if (o.done) break;
36
- const d = o.value, p = this.trainBatch(t, d), s = {
37
- loss: t.lastLoss,
38
- step: t.step,
39
- time: Date.now() - n,
40
- batchSize: d.xs.shape[0],
41
- learningRate: e?.advancedMetrics ? this.optimizer.lr : void 0
42
- //gradientNorm: options?.advancedMetrics ? await state.gradientNorm : undefined,
43
- };
44
- if (this.model.log.push(s), t.step % g === 0) {
45
- await p.data();
46
- const S = Date.now();
47
- if (t.trainingDuration += S - t.logStartTime, m)
92
+ const c = o.value, u = this.trainBatch(e, c), p = this.createLogEntry(e, m, c.xs.shape[0], t?.advancedMetrics);
93
+ if (e.step % n === 0) {
94
+ await u.data();
95
+ const h = Date.now();
96
+ if (e.trainingDuration += h - e.logStartTime, i)
48
97
  try {
49
- const a = await m.evaluate(5);
50
- t.validationLosses.push(a), s.valLoss = a;
51
- } catch (a) {
52
- console.error("Validation error:", a);
98
+ const d = await i.evaluate(5);
99
+ e.validationLosses.push(d), p.valLoss = d;
100
+ } catch (d) {
101
+ console.error("Validation error:", d);
53
102
  }
54
103
  if (l) {
55
- if (c) {
56
- const v = await w(this.tokenizer, this.model, c, 100, {
57
- temperature: 0.8
58
- });
59
- s.example = v;
60
- }
61
- const a = {
62
- duration: t.trainingDuration,
63
- totalSamples: t.totalSteps * s.batchSize,
64
- samplesPerSecond: t.totalSteps * s.batchSize / (t.trainingDuration / 1e3),
65
- memory: e.advancedMetrics ? this.model.getProfiler()?.getPeakMemory() || 0 : void 0
66
- };
67
- await l(s, a);
104
+ const d = this.createProgress(e, p, t?.advancedMetrics);
105
+ await l(p, d);
68
106
  }
69
- t.logStartTime = Date.now();
107
+ e.logStartTime = Date.now();
70
108
  }
71
- p.dispose(), t.step >= u && this.stop();
109
+ u.dispose(), e.step >= r && this.stop();
72
110
  }
73
111
  } catch (o) {
74
- throw console.error("Training error:", o), h(), o;
112
+ throw console.error("Training error:", o), S(), o;
75
113
  }
76
- return h(), this.running = !1, { losses: t.losses, validationLosses: t.validationLosses };
114
+ return S(), this.running = !1, { losses: e.losses, validationLosses: e.validationLosses };
77
115
  }
78
116
  }
79
117
  export {
80
- E as default
118
+ b as default
81
119
  };
@@ -1,10 +1,20 @@
1
1
  import { ITokeniser } from '../tokeniser/type';
2
2
  import { DatasetBuilder } from './DatasetBuilder';
3
- import { default as NanoGPT, TrainingLogEntry } from '../NanoGPTModel';
4
3
  import { default as AdamExt } from './AdamExt';
5
4
  import { TensorContainer } from '@tensorflow/tfjs-core/dist/tensor_types';
6
5
  import { Scalar, Tensor } from '@tensorflow/tfjs-core';
7
6
  import { Dataset } from '@tensorflow/tfjs-data';
7
+ import { default as Model, ModelForwardAttributes } from '../models/model';
8
+ export interface TrainingLogEntry {
9
+ loss: number;
10
+ valLoss?: number;
11
+ step: number;
12
+ time: number;
13
+ example?: string;
14
+ batchSize: number;
15
+ gradientNorm?: number;
16
+ learningRate?: number;
17
+ }
8
18
  export interface TrainingState {
9
19
  step: number;
10
20
  lastLoss: number;
@@ -35,13 +45,15 @@ export interface TrainingOptions {
35
45
  }
36
46
  export default abstract class GPTTrainer {
37
47
  protected tokenizer: ITokeniser;
38
- protected model: NanoGPT;
48
+ protected model: Model<ModelForwardAttributes>;
39
49
  protected optimizer: AdamExt;
40
50
  protected datasetBuilder: DatasetBuilder;
41
51
  protected learningRate: number;
42
52
  protected running: boolean;
43
53
  protected lastState?: TrainingState;
44
- constructor(model: NanoGPT, tokenizer: ITokeniser, learningRate?: number);
54
+ protected _gradientCheckpointing: boolean;
55
+ constructor(model: Model<ModelForwardAttributes>, tokenizer: ITokeniser, learningRate?: number);
56
+ setGradientCheckpointing(enabled: boolean): void;
45
57
  setLearningRate(learningRate: number): void;
46
58
  reset(): void;
47
59
  stop(): void;
@@ -66,6 +78,16 @@ export default abstract class GPTTrainer {
66
78
  losses: number[];
67
79
  validationLosses: number[];
68
80
  }>;
81
+ abstract stepDataset(dataset: Dataset<{
82
+ xs: Tensor;
83
+ ys: Tensor;
84
+ }>, options: Partial<TrainingOptions>, validationDataset?: Dataset<{
85
+ xs: Tensor;
86
+ ys: Tensor;
87
+ }>): Promise<{
88
+ log: TrainingLogEntry;
89
+ progress: TrainingProgress;
90
+ }>;
69
91
  createTrainValidationSplit(textData: string[], batchSize?: number, validationSplit?: number): Promise<{
70
92
  trainDataset: Dataset<{
71
93
  xs: Tensor;
@@ -1,10 +1,10 @@
1
- import { DatasetBuilder as h, flattenTokens as p, PAGE_FACTOR as g } from "./DatasetBuilder.js";
1
+ import { DatasetBuilder as m, flattenTokens as c, PAGE_FACTOR as g } from "./DatasetBuilder.js";
2
2
  import u from "./AdamExt.js";
3
- import { t as f, v as y, d as c } from "../index-BoWRt-10.js";
4
- import { z as m } from "../zeros--BdLQ3oG.js";
3
+ import { t as f, v as y, d as p } from "../index-CUQrfsw_.js";
4
+ import { z as h } from "../zeros-VZ72lWXM.js";
5
5
  class x {
6
- constructor(t, e, a = 1e-3) {
7
- this.tokenizer = e, this.model = t, this.learningRate = a, this.resetOptimizer(), this.datasetBuilder = new h(e, t.config.gpt.blockSize);
6
+ constructor(t, e, i = 1e-3) {
7
+ this.tokenizer = e, this.model = t, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new m(e, t.config.blockSize);
8
8
  }
9
9
  model;
10
10
  optimizer;
@@ -12,6 +12,10 @@ class x {
12
12
  learningRate;
13
13
  running = !1;
14
14
  lastState;
15
+ _gradientCheckpointing = !1;
16
+ setGradientCheckpointing(t) {
17
+ this._gradientCheckpointing = t;
18
+ }
15
19
  setLearningRate(t) {
16
20
  this.learningRate = t, this.resetOptimizer({ learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 });
17
21
  }
@@ -40,71 +44,59 @@ class x {
40
44
  );
41
45
  this.optimizer = e;
42
46
  }
43
- /*private async maxGradNorm(grads: NamedVariableMap): Promise<number> {
44
- let maxNorm = 0;
45
- // Print all gradients
46
- await Promise.all(
47
- Object.keys(grads).map(async (varName) => {
48
- const grad = grads[varName];
49
- const temp = norm(grad);
50
- const gradNorm = (await temp.data())[0];
51
- temp.dispose();
52
- if (gradNorm > maxNorm) {
53
- maxNorm = gradNorm;
54
- }
55
- })
56
- );
57
- return maxNorm;
58
- }*/
59
- trainStep(t, e, a = !1) {
47
+ trainStep(t, e, i = !1) {
60
48
  return f(() => {
61
49
  this.model.getProfiler()?.startMemory();
62
- const { xs: s, ys: i } = e, o = () => {
63
- const [l, d] = this.model.forward({ training: !0 }, s, i);
50
+ const { xs: a, ys: s } = e, n = () => {
51
+ const [l, d] = this.model.forward(
52
+ { training: !0, checkpointing: this._gradientCheckpointing },
53
+ a,
54
+ s
55
+ );
64
56
  return l.dispose(), d;
65
- }, { value: n, grads: r } = y(o);
66
- return a ? this.model.getProfiler()?.endMemory("Training") : (this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), c(r)), n;
57
+ }, { value: o, grads: r } = y(n);
58
+ return i ? this.model.getProfiler()?.endMemory("Training") : (this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), p(r)), o;
67
59
  });
68
60
  }
69
61
  async dummyPass() {
70
- const t = m([1, this.model.config.gpt.blockSize], "int32"), e = m([1, this.model.config.gpt.blockSize], "int32");
62
+ const t = h([1, this.model.config.blockSize], "int32"), e = h([1, this.model.config.blockSize], "int32");
71
63
  try {
72
- const a = this.trainStep({}, { xs: t, ys: e }, !0);
73
- await a.data(), a.dispose();
74
- } catch (a) {
75
- console.error("Error during dummy pass:", a);
64
+ const i = this.trainStep({}, { xs: t, ys: e }, !0);
65
+ await i.data(), i.dispose();
66
+ } catch (i) {
67
+ console.error("Error during dummy pass:", i);
76
68
  } finally {
77
69
  t.dispose(), e.dispose();
78
70
  }
79
71
  }
80
72
  trainBatch(t, e) {
81
73
  try {
82
- const a = this.trainStep(t, e, !1);
83
- return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, a;
84
- } catch (a) {
85
- throw console.error(`Error processing batch at step ${t.step}:`, a), c(), a;
74
+ const i = this.trainStep(t, e, !1);
75
+ return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, i;
76
+ } catch (i) {
77
+ throw console.error(`Error processing batch at step ${t.step}:`, i), p(), i;
86
78
  }
87
79
  }
88
- async createTrainValidationSplit(t, e = 32, a = 0.1) {
89
- const s = await p(t, this.tokenizer), i = /* @__PURE__ */ new Set();
90
- if (a > 0) {
91
- const r = Math.floor(s.length / (this.datasetBuilder.blockSize * g)), l = Math.max(1, Math.floor(r * a));
92
- for (; i.size < l; ) {
80
+ async createTrainValidationSplit(t, e = 32, i = 0.1) {
81
+ const a = await c(t, this.tokenizer), s = /* @__PURE__ */ new Set();
82
+ if (i > 0) {
83
+ const r = Math.floor(a.length / (this.datasetBuilder.blockSize * g)), l = Math.max(1, Math.floor(r * i));
84
+ for (; s.size < l; ) {
93
85
  const d = Math.floor(Math.random() * r);
94
- i.add(d);
86
+ s.add(d);
95
87
  }
96
88
  }
97
- const o = await this.datasetBuilder.createTextDataset(s, e, i, !1), n = await this.datasetBuilder.createTextDataset(
98
- s,
89
+ const n = await this.datasetBuilder.createTextDataset(a, e, s, !1), o = await this.datasetBuilder.createTextDataset(
90
+ a,
99
91
  e,
100
- i,
92
+ s,
101
93
  !0
102
94
  );
103
- return { trainDataset: o, validationDataset: n };
95
+ return { trainDataset: n, validationDataset: o };
104
96
  }
105
97
  async createDataset(t, e = 32) {
106
- const a = await p(t, this.tokenizer);
107
- return await this.datasetBuilder.createTextDataset(a, e);
98
+ const i = await c(t, this.tokenizer);
99
+ return await this.datasetBuilder.createTextDataset(i, e);
108
100
  }
109
101
  dispose() {
110
102
  this.optimizer && this.optimizer.dispose();
@@ -1,28 +1,28 @@
1
1
  import { gatherSub as x } from "../ops/gatherSub.js";
2
2
  import { scatterSub as L } from "../ops/scatterSub.js";
3
- import { y, t as u, z as C, c as E } from "../index-BoWRt-10.js";
4
- import { s as G } from "../softmax-Bv_6lyMX.js";
5
- import { m as z } from "../max-Ddnnb5xe.js";
6
- import { l as v } from "../log_sum_exp-DbjkV734.js";
3
+ import { I as C, t as u, K as E, c as G } from "../index-CUQrfsw_.js";
4
+ import { s as y } from "../softmax-DLoZWYBx.js";
5
+ import { m as z } from "../max-B3JOcNGb.js";
6
+ import { l as v } from "../log_sum_exp-C142qZqY.js";
7
7
  function k(t, s) {
8
8
  return u(() => {
9
- const n = t.shape[t.shape.length - 1], c = t.shape.slice(0, -1).reduce((o, e) => o * e, 1), h = t.shape.length > 2 ? t.reshape([c, n]) : t, p = s.shape.length > 1 ? s.reshape([c]).cast("int32") : s.cast("int32"), r = z(h, -1, !0), a = E(h, r), m = v(a, -1);
9
+ const n = t.shape[t.shape.length - 1], c = t.shape.slice(0, -1).reduce((o, e) => o * e, 1), h = t.shape.length > 2 ? t.reshape([c, n]) : t, p = s.shape.length > 1 ? s.reshape([c]).cast("int32") : s.cast("int32"), r = z(h, -1, !0), a = G(h, r), m = v(a, -1);
10
10
  return x(m, p, a);
11
11
  });
12
12
  }
13
- function A() {
14
- return y(
13
+ function q() {
14
+ return C(
15
15
  // @ts-expect-error Invalid params
16
16
  (s, n, d) => {
17
17
  const c = s.shape[s.shape.length - 1], p = s.shape.slice(0, -1).reduce((o, e) => o * e, 1), r = s.reshape([p, c]), a = n.reshape([p]).cast("int32"), m = k(r, a);
18
18
  return d([r, a]), r.dispose(), a.dispose(), { value: m, gradFunc: (o, e) => u(() => {
19
- const S = e[0], f = e[1], b = G(S), l = L(b, f, o), g = C(n);
19
+ const S = e[0], f = e[1], b = y(S), l = L(b, f, o), g = E(n);
20
20
  return [l.reshape(s.shape), g];
21
21
  }) };
22
22
  }
23
23
  );
24
24
  }
25
25
  export {
26
- A as createSoftmaxCrossEntropyWithGrad,
26
+ q as createSoftmaxCrossEntropyWithGrad,
27
27
  k as sparseSoftmaxCrossEntropy
28
28
  };
@@ -1,9 +1,9 @@
1
- import { default as NanoGPT } from '../NanoGPTModel';
2
- export declare function dummyPassAsync(model: NanoGPT): Promise<void>;
1
+ import { default as Model, ModelForwardAttributes } from '../models/model';
2
+ export declare function dummyPassAsync(model: Model<ModelForwardAttributes>): Promise<void>;
3
3
  export interface MemoryRequirements {
4
4
  perBatch: number;
5
5
  tapeSize: number;
6
6
  gradients: number;
7
7
  }
8
- export declare function dummyPassTrainAsync(model: NanoGPT): Promise<MemoryRequirements>;
9
- export declare function dummyPass(model: NanoGPT): void;
8
+ export declare function dummyPassTrainAsync(model: Model<ModelForwardAttributes>): Promise<MemoryRequirements>;
9
+ export declare function dummyPass(model: Model<ModelForwardAttributes>): void;
@@ -1,31 +1,31 @@
1
- import { m as y, v as P, e as S } from "../index-BoWRt-10.js";
2
- import { z as i } from "../zeros--BdLQ3oG.js";
1
+ import { m as y, v as P, e as S } from "../index-CUQrfsw_.js";
2
+ import { z as i } from "../zeros-VZ72lWXM.js";
3
3
  async function w(s) {
4
- const t = i([1, s.config.gpt.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
4
+ const t = i([1, s.config.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
5
5
  await e.data(), e.dispose(), n && n.dispose(), t.dispose();
6
6
  }
7
7
  async function k(s) {
8
8
  const t = y(), e = t.numBytesInGPUAllocated ?? t.numBytesAllocatedInGPU ?? t.numBytes;
9
9
  await w(s);
10
- const n = i([1, s.config.gpt.blockSize], "int32"), r = i([1, s.config.gpt.blockSize], "int32"), o = {
10
+ const n = i([1, s.config.blockSize], "int32"), r = i([1, s.config.blockSize], "int32"), o = {
11
11
  perBatch: 0,
12
12
  tapeSize: 0,
13
13
  gradients: s.getNumParams() * 4
14
14
  }, f = () => {
15
- const [c, l] = s.forward({ training: !0 }, n, r), p = S().state.activeTape;
16
- let u = 0;
17
- if (p)
18
- for (const z of p)
19
- u += z.saved?.reduce((B, I) => B + I.size * 4, 0) || 0;
20
- return o.tapeSize = u, c.dispose(), l;
21
- }, { value: m, grads: d } = P(f), a = y(), g = a.numBytesInGPUAllocated ?? a.numBytesAllocatedInGPU ?? a.numBytes;
22
- o.perBatch = g - e - o.gradients, console.log("Dummy training memory requirements:", o), await m.data(), m.dispose();
15
+ const [c, g] = s.forward({ training: !0 }, n, r), u = S().state.activeTape;
16
+ let p = 0;
17
+ if (u)
18
+ for (const z of u)
19
+ p += z.saved?.reduce((B, I) => B + I.size * 4, 0) || 0;
20
+ return o.tapeSize = p, c.dispose(), g;
21
+ }, { value: m, grads: d } = P(f), a = y(), l = a.numBytesInGPUAllocated ?? a.numBytesAllocatedInGPU ?? a.numBytes;
22
+ o.perBatch = l - e - o.gradients, console.log("Dummy training memory requirements:", o), await m.data(), m.dispose();
23
23
  for (const c in d)
24
24
  d[c].dispose();
25
25
  return n.dispose(), r.dispose(), o;
26
26
  }
27
27
  function v(s) {
28
- const t = i([1, s.config.gpt.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
28
+ const t = i([1, s.config.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
29
29
  e.dispose(), n && n.dispose(), t.dispose();
30
30
  }
31
31
  export {
@@ -1,5 +1,5 @@
1
- import "../index-BoWRt-10.js";
2
- import { t as e } from "../tensor2d-wxPAnDQy.js";
1
+ import "../index-CUQrfsw_.js";
2
+ import { t as e } from "../tensor2d-BHy261cI.js";
3
3
  function l(n) {
4
4
  let r = 0;
5
5
  const i = Math.random();
@@ -1,4 +1,4 @@
1
- import { GPTConfig } from '../config';
1
+ import { GPTConfig } from '../models/config';
2
2
  export declare function estimateParameterCount(config: GPTConfig): number;
3
3
  export declare function estimateMemoryUsage(config: GPTConfig): number;
4
4
  export declare function estimateTrainingMemoryUsage(config: GPTConfig, batchSize: number): number;
@@ -1,4 +1,4 @@
1
- import { t as s } from "../index-BoWRt-10.js";
1
+ import { t as s } from "../index-CUQrfsw_.js";
2
2
  async function f(e, o = 10, r = !1) {
3
3
  for (let t = 0; t < 100; t++) {
4
4
  const a = r ? await e() : s(e);
@@ -1,4 +1,4 @@
1
- import { m as a } from "../index-BoWRt-10.js";
1
+ import { m as a } from "../index-CUQrfsw_.js";
2
2
  const s = 1024 * 1024;
3
3
  class l {
4
4
  log = /* @__PURE__ */ new Map();
@@ -1,5 +1,5 @@
1
- import "../index-BoWRt-10.js";
2
- import { t as y } from "../tensor-JwS7ZYY6.js";
1
+ import "../index-CUQrfsw_.js";
2
+ import { t as y } from "../tensor-BAQdLqoU.js";
3
3
  function l(t) {
4
4
  if (t === "float32") return "F32";
5
5
  if (t === "int32") return "I32";
@@ -1,5 +1,5 @@
1
- import "../index-BoWRt-10.js";
2
- import { t as p } from "../tensor-JwS7ZYY6.js";
1
+ import "../index-CUQrfsw_.js";
2
+ import { t as p } from "../tensor-BAQdLqoU.js";
3
3
  function h(n) {
4
4
  const e = n.reduce((s, o) => s + o.length, 0), a = new Float32Array(e);
5
5
  let t = 0;
@@ -1,4 +1,4 @@
1
- import { E as i } from "./index-BoWRt-10.js";
1
+ import { E as i } from "./index-CUQrfsw_.js";
2
2
  /**
3
3
  * @license
4
4
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -1,4 +1,4 @@
1
- import { aa as k, ab as z, ac as E, a1 as j, l as A } from "./index-BoWRt-10.js";
1
+ import { ad as k, ae as z, af as E, a2 as j, n as A } from "./index-CUQrfsw_.js";
2
2
  /**
3
3
  * @license
4
4
  * Copyright 2019 Google LLC. All Rights Reserved.
@@ -1,4 +1,4 @@
1
- import { l as u } from "./index-BoWRt-10.js";
1
+ import { n as u } from "./index-CUQrfsw_.js";
2
2
  /**
3
3
  * @license
4
4
  * Copyright 2019 Google LLC. All Rights Reserved.
@@ -15,57 +15,57 @@ import { l as u } from "./index-BoWRt-10.js";
15
15
  * limitations under the License.
16
16
  * =============================================================================
17
17
  */
18
- const e = (r) => {
18
+ const e = (n) => {
19
19
  let t = 1;
20
- for (let n = 0; n < r.length; n++)
21
- t *= r[n];
20
+ for (let r = 0; r < n.length; r++)
21
+ t *= n[r];
22
22
  return t;
23
23
  };
24
- function m(r, t, n = [1, 1, 1], a = [1, 1, 1]) {
24
+ function m(n, t, r = [1, 1, 1], a = [1, 1, 1]) {
25
25
  const [o, i, f] = [
26
- Math.ceil(e(r.x.map((c) => t[c])) / (n[0] * a[0])),
27
- r.y ? Math.ceil(e(r.y.map((c) => t[c])) / (n[1] * a[1])) : 1,
28
- r.z ? Math.ceil(e(r.z.map((c) => t[c])) / (n[2] * a[2])) : 1
26
+ Math.ceil(e(n.x.map((c) => t[c])) / (r[0] * a[0])),
27
+ n.y ? Math.ceil(e(n.y.map((c) => t[c])) / (r[1] * a[1])) : 1,
28
+ n.z ? Math.ceil(e(n.z.map((c) => t[c])) / (r[2] * a[2])) : 1
29
29
  ];
30
30
  return [o, i, f];
31
31
  }
32
- function d(r, t, n, a = !1) {
32
+ function d(n, t, r, a = !1) {
33
33
  const o = [8, 8, 1], i = [4, 4, 1];
34
- return a || (r <= 8 && (i[1] = 1), t <= 16 && n <= 16 && (o[0] = 4)), { workgroupSize: o, elementsPerThread: i };
34
+ return a || (n <= 8 && (i[1] = 1), t <= 16 && r <= 16 && (o[0] = 4)), { workgroupSize: o, elementsPerThread: i };
35
35
  }
36
- function p(r, t, n = !1) {
37
- if (n)
36
+ function p(n, t, r = !1) {
37
+ if (r)
38
38
  return [8, 8, 1];
39
- const a = e(r.x.map((i) => t[i])), o = e(r.y.map((i) => t[i]));
39
+ const a = e(n.x.map((i) => t[i])), o = e(n.y.map((i) => t[i]));
40
40
  return a <= 4 ? [4, 16, 1] : o <= 4 ? [16, 4, 1] : [16, 16, 1];
41
41
  }
42
- function M(r, t, n = !1) {
43
- if (n)
42
+ function M(n, t, r = !1) {
43
+ if (r)
44
44
  return [4, 4, 1];
45
- const a = e(r.x.map((i) => t[i])), o = e(r.y.map((i) => t[i]));
45
+ const a = e(n.x.map((i) => t[i])), o = e(n.y.map((i) => t[i]));
46
46
  return a <= 4 ? [1, 2, 1] : o <= 4 ? [2, 1, 1] : [2, 2, 1];
47
47
  }
48
- function h(r) {
49
- return { x: r.map((t, n) => n) };
48
+ function h(n) {
49
+ return { x: n.map((t, r) => r) };
50
50
  }
51
- function x(r) {
52
- if (r === "float32" || r === "int32" || r === "bool" || r === "string")
51
+ function x(n) {
52
+ if (n === "float32" || n === "int32" || n === "bool" || n === "string")
53
53
  return 4;
54
- if (r === "complex64")
54
+ if (n === "complex64")
55
55
  return 8;
56
- throw new Error(`Unknown dtype ${r}`);
56
+ throw new Error(`Unknown dtype ${n}`);
57
57
  }
58
58
  function g() {
59
59
  return !!(typeof globalThis < "u" && globalThis.navigator && globalThis.navigator.gpu);
60
60
  }
61
- function b(r, t) {
62
- Array.isArray(r) || (r = [r]), r.forEach((n) => {
63
- n != null && u(n.dtype !== "complex64", () => `${t} does not support complex64 tensors in the WebGPU backend.`);
61
+ function b(n, t) {
62
+ Array.isArray(n) || (n = [n]), n.forEach((r) => {
63
+ r != null && u(r.dtype !== "complex64", () => `${t} does not support complex64 tensors in the WebGPU backend.`);
64
64
  });
65
65
  }
66
66
  var s;
67
- (function(r) {
68
- r[r.MatMulReduceProgram = 0] = "MatMulReduceProgram", r[r.MatMulSplitKProgram = 1] = "MatMulSplitKProgram", r[r.MatMulSmallOutputSizeProgram = 2] = "MatMulSmallOutputSizeProgram", r[r.MatMulPackedProgram = 3] = "MatMulPackedProgram", r[r.MatMulMax = 4] = "MatMulMax";
67
+ (function(n) {
68
+ n[n.MatMulReduceProgram = 0] = "MatMulReduceProgram", n[n.MatMulSplitKProgram = 1] = "MatMulSplitKProgram", n[n.MatMulSmallOutputSizeProgram = 2] = "MatMulSmallOutputSizeProgram", n[n.MatMulPackedProgram = 3] = "MatMulPackedProgram", n[n.MatMulMax = 4] = "MatMulMax";
69
69
  })(s || (s = {}));
70
70
  export {
71
71
  x as G,
@@ -1,4 +1,4 @@
1
- import { B as m, C as r, a2 as l, E as c, a6 as i, F as p, a7 as u, j as f } from "./index-BoWRt-10.js";
1
+ import { B as m, C as r, a3 as l, E as c, a9 as i, L as p, aa as u, j as f } from "./index-CUQrfsw_.js";
2
2
  /**
3
3
  * @license
4
4
  * Copyright 2020 Google LLC. All Rights Reserved.