@genai-fi/nanogpt 0.7.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/dist/Generator.d.ts +25 -2
  2. package/dist/Generator.js +150 -49
  3. package/dist/{RealDiv-Dy0p8Bvo.js → RealDiv-N8TpOMYv.js} +14 -14
  4. package/dist/{Reshape-DvudQDvJ.js → Reshape-B-lWQRnF.js} +1 -1
  5. package/dist/{Reshape-DH5srBP0.js → Reshape-Bo8HzP8V.js} +5 -5
  6. package/dist/TeachableLLM.d.ts +6 -6
  7. package/dist/TeachableLLM.js +31 -31
  8. package/dist/Trainer.d.ts +13 -2
  9. package/dist/Trainer.js +21 -12
  10. package/dist/{axis_util-BzbKo31C.js → axis_util-DubwyOhW.js} +3 -3
  11. package/dist/backend.js +2 -2
  12. package/dist/{backend_util-TE7aTPhZ.js → backend_util-BJ-_jSeK.js} +46 -46
  13. package/dist/{broadcast_to-CdbwV-Dj.js → broadcast_to-BYfCp5iL.js} +2 -2
  14. package/dist/{concat-CsxrgovM.js → concat-BmDqqFsa.js} +1 -1
  15. package/dist/{dataset-CtdBYwjo.js → dataset-CJmEGu6D.js} +5 -5
  16. package/dist/{dropout-DYs5QFGQ.js → dropout-sx0sjVAT.js} +8 -8
  17. package/dist/exports_initializers-DAKM8UO9.js +16 -0
  18. package/dist/{gather-CMMy2KEG.js → gather-C1siEkdp.js} +1 -1
  19. package/dist/{gelu-C-dPj6Ku.js → gelu-Bd3UBBxg.js} +1 -1
  20. package/dist/{gpgpu_math-DGNLNL4I.js → gpgpu_math-TFLxaLkw.js} +26 -26
  21. package/dist/{index-CLthM0TO.js → index-BaPo_0H8.js} +185 -185
  22. package/dist/{index-BoWRt-10.js → index-CUQrfsw_.js} +266 -265
  23. package/dist/{kernel_funcs_utils-BYKWV8Aa.js → kernel_funcs_utils-P9aFa232.js} +9 -9
  24. package/dist/layers/BaseLayer.d.ts +8 -13
  25. package/dist/layers/BaseLayer.js +25 -13
  26. package/dist/layers/CausalSelfAttention.d.ts +3 -2
  27. package/dist/layers/CausalSelfAttention.js +28 -28
  28. package/dist/layers/MLP.d.ts +3 -2
  29. package/dist/layers/MLP.js +16 -20
  30. package/dist/layers/PositionEmbedding.d.ts +9 -0
  31. package/dist/layers/PositionEmbedding.js +45 -0
  32. package/dist/layers/RMSNorm.d.ts +3 -2
  33. package/dist/layers/RMSNorm.js +6 -6
  34. package/dist/layers/RoPECache.d.ts +1 -1
  35. package/dist/layers/RoPECache.js +4 -4
  36. package/dist/layers/TiedEmbedding.d.ts +3 -2
  37. package/dist/layers/TiedEmbedding.js +29 -7
  38. package/dist/layers/TransformerBlock.d.ts +3 -2
  39. package/dist/layers/TransformerBlock.js +1 -1
  40. package/dist/loader/load.d.ts +2 -2
  41. package/dist/loader/loadHF.d.ts +2 -2
  42. package/dist/loader/loadTransformers.d.ts +4 -2
  43. package/dist/loader/loadTransformers.js +10 -9
  44. package/dist/loader/newZipLoad.d.ts +2 -2
  45. package/dist/loader/oldZipLoad.d.ts +2 -2
  46. package/dist/loader/oldZipLoad.js +42 -51
  47. package/dist/loader/save.d.ts +8 -0
  48. package/dist/loader/save.js +62 -0
  49. package/dist/{log_sum_exp-DbjkV734.js → log_sum_exp-C142qZqY.js} +14 -14
  50. package/dist/main.d.ts +5 -4
  51. package/dist/main.js +22 -18
  52. package/dist/{mat_mul-8m8pfdcx.js → mat_mul-DMkduNJu.js} +1 -1
  53. package/dist/{max-Ddnnb5xe.js → max-B3JOcNGb.js} +1 -1
  54. package/dist/mod-uUuj4gSb.js +27 -0
  55. package/dist/models/NanoGPTV1.d.ts +15 -0
  56. package/dist/models/NanoGPTV1.js +71 -0
  57. package/dist/{config.d.ts → models/config.d.ts} +1 -0
  58. package/dist/{config.js → models/config.js} +1 -0
  59. package/dist/models/factory.d.ts +3 -0
  60. package/dist/models/factory.js +14 -0
  61. package/dist/models/model.d.ts +26 -0
  62. package/dist/models/model.js +68 -0
  63. package/dist/{mulmat_packed_gpu-VSekgsNv.js → mulmat_packed_gpu-Cm2gw-c8.js} +1 -1
  64. package/dist/{ones-Dj0SDhHf.js → ones-ZdgQGBCP.js} +2 -2
  65. package/dist/ops/adamAdjust.js +1 -1
  66. package/dist/ops/adamMoments.js +1 -1
  67. package/dist/ops/appendCache.js +3 -3
  68. package/dist/ops/attentionMask.js +1 -1
  69. package/dist/ops/cpu/adamAdjust.js +9 -9
  70. package/dist/ops/cpu/adamMoments.js +2 -2
  71. package/dist/ops/cpu/appendCache.js +2 -2
  72. package/dist/ops/cpu/attentionMask.js +5 -5
  73. package/dist/ops/cpu/fusedSoftmax.js +2 -2
  74. package/dist/ops/cpu/gatherSub.js +3 -3
  75. package/dist/ops/cpu/gelu.js +1 -1
  76. package/dist/ops/cpu/matMulGelu.js +2 -2
  77. package/dist/ops/cpu/matMulMul.js +1 -1
  78. package/dist/ops/cpu/mulDropout.js +1 -1
  79. package/dist/ops/cpu/normRMS.js +1 -1
  80. package/dist/ops/cpu/qkv.js +3 -3
  81. package/dist/ops/cpu/rope.js +5 -5
  82. package/dist/ops/cpu/scatterSub.js +11 -11
  83. package/dist/ops/fusedSoftmax.js +1 -1
  84. package/dist/ops/gatherSub.js +1 -1
  85. package/dist/ops/gelu.js +2 -2
  86. package/dist/ops/grads/attentionMask.js +1 -1
  87. package/dist/ops/grads/fusedSoftmax.js +2 -2
  88. package/dist/ops/grads/gelu.js +2 -2
  89. package/dist/ops/grads/matMulGelu.js +1 -1
  90. package/dist/ops/grads/normRMS.js +1 -1
  91. package/dist/ops/grads/qkv.js +1 -1
  92. package/dist/ops/grads/rope.js +1 -1
  93. package/dist/ops/matMulGelu.js +1 -1
  94. package/dist/ops/matMulMul.js +1 -1
  95. package/dist/ops/mulDrop.js +1 -1
  96. package/dist/ops/normRMS.js +1 -1
  97. package/dist/ops/qkv.js +1 -1
  98. package/dist/ops/rope.js +4 -4
  99. package/dist/ops/scatterSub.js +1 -1
  100. package/dist/ops/webgl/adamAdjust.js +2 -2
  101. package/dist/ops/webgl/adamMoments.js +1 -1
  102. package/dist/ops/webgl/appendCache.js +1 -1
  103. package/dist/ops/webgl/attentionMask.js +1 -1
  104. package/dist/ops/webgl/fusedSoftmax.js +4 -4
  105. package/dist/ops/webgl/gatherSub.js +1 -1
  106. package/dist/ops/webgl/gelu.js +2 -2
  107. package/dist/ops/webgl/log.js +3 -3
  108. package/dist/ops/webgl/matMulGelu.js +10 -10
  109. package/dist/ops/webgl/matMulMul.js +1 -1
  110. package/dist/ops/webgl/mulDropout.js +1 -1
  111. package/dist/ops/webgl/normRMS.js +2 -2
  112. package/dist/ops/webgl/qkv.js +1 -1
  113. package/dist/ops/webgl/rope.js +1 -1
  114. package/dist/ops/webgl/scatterSub.js +1 -1
  115. package/dist/ops/webgpu/adamAdjust.js +3 -3
  116. package/dist/ops/webgpu/adamMoments.js +3 -3
  117. package/dist/ops/webgpu/appendCache.js +3 -3
  118. package/dist/ops/webgpu/attentionMask.js +3 -3
  119. package/dist/ops/webgpu/gatherSub.js +3 -3
  120. package/dist/ops/webgpu/gelu.js +3 -3
  121. package/dist/ops/webgpu/normRMS.js +2 -2
  122. package/dist/ops/webgpu/normRMSGrad.js +5 -5
  123. package/dist/ops/webgpu/qkv.js +3 -3
  124. package/dist/ops/webgpu/rope.js +3 -3
  125. package/dist/ops/webgpu/scatterSub.js +3 -3
  126. package/dist/ops/webgpu/utils/reductions.js +4 -4
  127. package/dist/{ops-BFGCx8Ri.js → ops-C_1K_-35.js} +103 -103
  128. package/dist/{random_width-sZORGo5k.js → random_width-D8Pwy_na.js} +136 -136
  129. package/dist/{range-CRuAh-gd.js → range-LVHrSLdi.js} +1 -1
  130. package/dist/{reciprocal-BvGAyKyu.js → reciprocal-CaR9e67G.js} +1 -1
  131. package/dist/{register_all_kernels-BwDSRN-f.js → register_all_kernels-DUshvVWP.js} +2026 -2049
  132. package/dist/{reshape-CdBq1WJ6.js → reshape-DEfQGSin.js} +1 -1
  133. package/dist/{scatter_nd_util-DUstGbU1.js → scatter_nd_util-CUPPNLaA.js} +1 -1
  134. package/dist/{selu_util-BJEXVvjX.js → selu_util-8vv5JxQV.js} +3 -3
  135. package/dist/{shared-B8ztnyEk.js → shared-CkNorDcU.js} +83 -83
  136. package/dist/{shared-wS99K7_n.js → shared-D1elLckx.js} +1 -1
  137. package/dist/{sin-BeA3tsEd.js → sin-D2CKKmyR.js} +1 -1
  138. package/dist/{slice-BiOsknYS.js → slice-BnyE-M_7.js} +1 -1
  139. package/dist/{softmax-Bv_6lyMX.js → softmax-DLoZWYBx.js} +1 -1
  140. package/dist/{split-B-dikLRw.js → split-By_n4TKP.js} +1 -1
  141. package/dist/{stack-B17UN2nn.js → stack-DkdFLq37.js} +1 -1
  142. package/dist/{sum-66ew2byf.js → sum-l_0SqM4h.js} +3 -3
  143. package/dist/{tensor-JwS7ZYY6.js → tensor-BAQdLqoU.js} +1 -1
  144. package/dist/{tensor2d-wxPAnDQy.js → tensor2d-BHy261cI.js} +1 -1
  145. package/dist/training/Adam.js +2 -2
  146. package/dist/training/AdamExt.js +1 -1
  147. package/dist/training/DatasetBuilder.js +2 -2
  148. package/dist/training/Evaluator.d.ts +2 -2
  149. package/dist/training/FullTrainer.d.ts +3 -3
  150. package/dist/training/FullTrainer.js +61 -69
  151. package/dist/training/Trainer.d.ts +15 -3
  152. package/dist/training/Trainer.js +39 -47
  153. package/dist/training/sparseCrossEntropy.js +9 -9
  154. package/dist/utilities/dummy.d.ts +4 -4
  155. package/dist/utilities/dummy.js +13 -13
  156. package/dist/utilities/multinomialCPU.js +2 -2
  157. package/dist/utilities/parameters.d.ts +1 -1
  158. package/dist/utilities/performance.js +1 -1
  159. package/dist/utilities/profile.js +1 -1
  160. package/dist/utilities/safetensors.js +2 -2
  161. package/dist/utilities/weights.js +2 -2
  162. package/dist/{variable-BuddVFLa.js → variable-C9hihzDB.js} +1 -1
  163. package/dist/{webgpu_program-PFzf1hAQ.js → webgpu_program-dFEVbDPL.js} +1 -1
  164. package/dist/{webgpu_util-D____QpY.js → webgpu_util-DLImlSc6.js} +27 -27
  165. package/dist/{zeros--BdLQ3oG.js → zeros-VZ72lWXM.js} +1 -1
  166. package/package.json +2 -3
  167. package/dist/NanoGPTModel.d.ts +0 -52
  168. package/dist/NanoGPTModel.js +0 -203
  169. package/dist/TiedEmbedding-BxOerUmB.js +0 -43
  170. package/dist/utilities/generate.d.ts +0 -3
  171. package/dist/utilities/generate.js +0 -22
  172. package/dist/utilities/save.d.ts +0 -9
  173. package/dist/utilities/save.js +0 -61
@@ -1,22 +1,28 @@
1
- import { T as g, y as p, e as o, A as v } from "../index-BoWRt-10.js";
2
- import { v as _ } from "../variable-BuddVFLa.js";
3
- class M {
1
+ import { T as p, I as g, e as o, J as v } from "../index-CUQrfsw_.js";
2
+ import { v as _ } from "../variable-C9hihzDB.js";
3
+ class T {
4
4
  parent;
5
5
  config;
6
6
  _variables = /* @__PURE__ */ new Map();
7
7
  _trainable = !0;
8
8
  children = [];
9
+ profiler;
9
10
  constructor(t, r) {
10
11
  this.config = t, this.parent = r, this.parent && this.parent.children.push(this);
11
12
  }
12
13
  getProfiler() {
13
- return this.config.layerConfig.profiler;
14
+ return this.profiler;
15
+ }
16
+ setProfiler(t) {
17
+ this.profiler = t || void 0, this.children.forEach((r) => {
18
+ r.setProfiler(t);
19
+ });
14
20
  }
15
21
  startMemory() {
16
- this.config.layerConfig.profiler?.startMemory();
22
+ this.profiler?.startMemory();
17
23
  }
18
24
  endMemory(t) {
19
- this.config.layerConfig.profiler?.endMemory(t);
25
+ this.profiler?.endMemory(t);
20
26
  }
21
27
  addVariable(t, r) {
22
28
  this._variables.set(t, r || null);
@@ -41,11 +47,17 @@ class M {
41
47
  r.trainable = t;
42
48
  });
43
49
  }
44
- getVariable(t) {
45
- const r = this._variables.get(t);
46
- if (!r)
50
+ getVariable(t, r = !1) {
51
+ const e = this._variables.get(t);
52
+ if (!e && r)
53
+ for (const i of this.children) {
54
+ const s = i.getVariable(t, !0);
55
+ if (s)
56
+ return s;
57
+ }
58
+ if (!e)
47
59
  throw new Error(`Variable ${t} not found`);
48
- return r;
60
+ return e;
49
61
  }
50
62
  hasVariable(t) {
51
63
  return this._variables.get(t) !== null;
@@ -85,7 +97,7 @@ class M {
85
97
  call(t, ...r) {
86
98
  this.build();
87
99
  const e = this.forward(t, ...r);
88
- if (t.training && e instanceof g) {
100
+ if (t.training && e instanceof p) {
89
101
  const i = this.dropout(e);
90
102
  return i !== e && e.dispose(), i;
91
103
  } else
@@ -95,7 +107,7 @@ class M {
95
107
  return this.build(), this.checkpointingFn(t, ...r);
96
108
  }
97
109
  checkpointingFn(t, ...r) {
98
- const e = this.trainableVariables, s = p((...a) => {
110
+ const e = this.trainableVariables, s = g((...a) => {
99
111
  const l = a[a.length - 1], n = a.slice(0, r.length), h = this.forward(t, ...n);
100
112
  return l(n), { value: h, gradFunc: (c, f) => {
101
113
  const u = o().state.activeTape;
@@ -112,5 +124,5 @@ class M {
112
124
  }
113
125
  }
114
126
  export {
115
- M as default
127
+ T as default
116
128
  };
@@ -1,5 +1,6 @@
1
- import { default as BaseLayer, ForwardAttributes, GPTLayerConfig } from './BaseLayer';
1
+ import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
2
2
  import { Tensor } from '@tensorflow/tfjs-core';
3
+ import { GPTConfig } from '../models/config';
3
4
  export type KVCache = {
4
5
  k?: Tensor;
5
6
  v?: Tensor;
@@ -22,7 +23,7 @@ export default class CausalSelfAttention extends BaseLayer<AttentionForwardAttri
22
23
  private projUnits;
23
24
  private ATTN;
24
25
  private PROJ;
25
- constructor(index: number, config: GPTLayerConfig, parent?: BaseLayer);
26
+ constructor(index: number, config: GPTConfig, parent?: BaseLayer);
26
27
  protected build(): void;
27
28
  private getAttentionScores;
28
29
  private getAttentionScoresWithPast;
@@ -3,14 +3,14 @@ import O from "./BaseLayer.js";
3
3
  import { qkv as P } from "../ops/qkv.js";
4
4
  import { rope as v } from "../ops/rope.js";
5
5
  import { appendCache as V } from "../ops/appendCache.js";
6
- import { w as c, t as C } from "../index-BoWRt-10.js";
6
+ import { k as c, t as C } from "../index-CUQrfsw_.js";
7
7
  import { fusedSoftmax as T } from "../ops/fusedSoftmax.js";
8
- import { d as y } from "../random_width-sZORGo5k.js";
9
- import { v as b } from "../variable-BuddVFLa.js";
10
- import { r as k, d as L } from "../dropout-DYs5QFGQ.js";
11
- import { r as N } from "../reshape-CdBq1WJ6.js";
12
- import { m as R } from "../mat_mul-8m8pfdcx.js";
13
- class W extends O {
8
+ import { d as L } from "../random_width-D8Pwy_na.js";
9
+ import { v as b } from "../variable-C9hihzDB.js";
10
+ import { r as k, d as y } from "../dropout-sx0sjVAT.js";
11
+ import { r as N } from "../reshape-DEfQGSin.js";
12
+ import { m as R } from "../mat_mul-DMkduNJu.js";
13
+ class $ extends O {
14
14
  divisor;
15
15
  index;
16
16
  units;
@@ -18,27 +18,27 @@ class W extends O {
18
18
  ATTN;
19
19
  PROJ;
20
20
  constructor(t, i, s) {
21
- super(i, s), this.index = t, this.units = i.gpt.nEmbed * 3, this.projUnits = i.gpt.nEmbed, this.ATTN = `block_${this.index}_cAttn`, this.PROJ = `block_${this.index}_cProj`, this.addVariable(this.ATTN), this.addVariable(this.PROJ), this.divisor = 1 / Math.sqrt(i.gpt.nEmbed / i.gpt.nHead);
21
+ super(i, s), this.index = t, this.units = i.nEmbed * 3, this.projUnits = i.nEmbed, this.ATTN = `block_${this.index}_cAttn`, this.PROJ = `block_${this.index}_cProj`, this.addVariable(this.ATTN), this.addVariable(this.PROJ), this.divisor = 1 / Math.sqrt(i.nEmbed / i.nHead);
22
22
  }
23
23
  build() {
24
24
  this.hasVariable(this.ATTN) === !1 && this.setVariable(
25
25
  this.ATTN,
26
26
  b(
27
- k([this.config.gpt.nEmbed, this.units], 0, 0.02),
27
+ k([this.config.nEmbed, this.units], 0, 0.02),
28
28
  !0
29
29
  //`block_${this.index}_attn_cAttn_kernel`
30
30
  )
31
31
  ), this.hasVariable(this.PROJ) === !1 && this.setVariable(
32
32
  this.PROJ,
33
33
  b(
34
- k([this.projUnits, this.config.gpt.nEmbed], 0, 0.02),
34
+ k([this.projUnits, this.config.nEmbed], 0, 0.02),
35
35
  !0
36
36
  //`block_${this.index}_attn_cProj_kernel`
37
37
  )
38
38
  );
39
39
  }
40
40
  getAttentionScores(t, i, s, o) {
41
- const e = g(t, i, this.divisor), n = T(e, s ? this.config.gpt.dropout : 0, o);
41
+ const e = g(t, i, this.divisor), n = T(e, s ? this.config.dropout : 0, o);
42
42
  return e.dispose(), n;
43
43
  }
44
44
  // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
@@ -47,50 +47,50 @@ class W extends O {
47
47
  return o.dispose(), e;
48
48
  }
49
49
  getQKV(t) {
50
- return P(t, this.getVariable(this.ATTN), this.config.gpt.nHead);
50
+ return P(t, this.getVariable(this.ATTN), this.config.nHead);
51
51
  }
52
52
  getOutputProjection(t) {
53
- const i = t.shape[0], s = t.shape[2], o = this.config.gpt.nEmbed, e = t.transpose([0, 2, 1, 3]), n = N(e, [i, s, o]), p = y(n, this.getVariable(this.PROJ));
54
- return n.dispose(), e.dispose(), p;
53
+ const i = t.shape[0], s = t.shape[2], o = this.config.nEmbed, e = t.transpose([0, 2, 1, 3]), n = N(e, [i, s, o]), r = L(n, this.getVariable(this.PROJ));
54
+ return n.dispose(), e.dispose(), r;
55
55
  }
56
56
  updateCache(t, i, s) {
57
- const o = this.config.gpt.blockSize, e = t.shape[2], n = s.length || 0, p = V(t, o, n, s.k);
57
+ const o = this.config.blockSize, e = t.shape[2], n = s.length || 0, r = V(t, o, n, s.k);
58
58
  t.dispose(), s.k && s.k.dispose();
59
- const a = V(i, o, n, s.v);
59
+ const p = V(i, o, n, s.v);
60
60
  i.dispose(), s.v && s.v.dispose();
61
61
  const d = Math.min(n + e, o), h = s.cumulativeLength + e;
62
- s.length = d, s.cumulativeLength = h, s.k = c(p), s.v = c(a);
62
+ s.length = d, s.cumulativeLength = h, s.k = c(r), s.v = c(p);
63
63
  }
64
64
  forward(t, i) {
65
65
  return C(() => {
66
66
  this.startMemory();
67
- const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, p = this.config.layerConfig.ropeCache, a = p ? v(s, p, n) : s, d = p ? v(o, p, n) : o;
68
- p && (s.dispose(), o.dispose());
67
+ const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, r = t.ropeCache, p = r ? v(s, r, n) : s, d = r ? v(o, r, n) : o;
68
+ r && (s.dispose(), o.dispose());
69
69
  const h = t.pastKV ? t.pastKV.length : 0;
70
70
  t.pastKV && !t.training && this.updateCache(d, e, t.pastKV);
71
71
  const u = t.pastKV?.k ? t.pastKV.k : d, m = t.pastKV?.v ? t.pastKV.v : e;
72
- let r;
73
- h > 0 ? r = this.getAttentionScoresWithPast(a, u, h) : r = this.getAttentionScores(a, u, t.training, t.seed || 0), a.dispose(), t.pastKV || u.dispose();
74
- const l = R(r, m), f = t.attentionScores !== void 0 && t.attentionScores.attentionOut !== void 0;
75
- f || r.dispose(), t.pastKV || m.dispose();
72
+ let a;
73
+ h > 0 ? a = this.getAttentionScoresWithPast(p, u, h) : a = this.getAttentionScores(p, u, t.training, t.seed || 0), p.dispose(), t.pastKV || u.dispose();
74
+ const l = R(a, m), f = t.attentionScores !== void 0 && t.attentionScores.attentionOut !== void 0;
75
+ f || a.dispose(), t.pastKV || m.dispose();
76
76
  const A = this.getOutputProjection(l);
77
77
  if (l.dispose(), f && t.attentionScores && t.attentionScores.attentionOut !== void 0) {
78
- const K = r.shape[1], S = r.shape[2];
78
+ const K = a.shape[1], S = a.shape[2];
79
79
  t.attentionScores.attentionOut?.push(
80
- c(r.slice([0, 0, 0, 0], [1, -1, -1, -1]).reshape([K, S, -1]))
80
+ c(a.slice([0, 0, 0, 0], [1, -1, -1, -1]).reshape([K, S, -1]))
81
81
  );
82
82
  }
83
83
  return this.endMemory("CausalSelfAttention"), A;
84
84
  });
85
85
  }
86
86
  dropout(t) {
87
- if (this.config.gpt.dropout > 0) {
88
- const i = L(t, this.config.gpt.dropout);
87
+ if (this.config.dropout > 0) {
88
+ const i = y(t, this.config.dropout);
89
89
  return t.dispose(), i;
90
90
  } else
91
91
  return t;
92
92
  }
93
93
  }
94
94
  export {
95
- W as default
95
+ $ as default
96
96
  };
@@ -1,11 +1,12 @@
1
1
  import { Tensor } from '@tensorflow/tfjs-core';
2
- import { default as BaseLayer, ForwardAttributes, GPTLayerConfig } from './BaseLayer';
2
+ import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
3
+ import { GPTConfig } from '../main';
3
4
  export default class MLP extends BaseLayer {
4
5
  private index;
5
6
  private hiddenUnits;
6
7
  private MLPHIDDEN;
7
8
  private MLPOUT;
8
- constructor(index: number, config: GPTLayerConfig, parent?: BaseLayer);
9
+ constructor(index: number, config: GPTConfig, parent?: BaseLayer);
9
10
  protected build(): void;
10
11
  forward(_: ForwardAttributes, x: Tensor): Tensor;
11
12
  protected dropout(x: Tensor): Tensor;
@@ -1,56 +1,52 @@
1
- import { t as l } from "../index-BoWRt-10.js";
1
+ import { t as p } from "../index-CUQrfsw_.js";
2
2
  import u from "./BaseLayer.js";
3
3
  import { matMulGelu as M } from "../ops/matMulGelu.js";
4
- import { v as o } from "../variable-BuddVFLa.js";
5
- import { r as h, d as f } from "../dropout-DYs5QFGQ.js";
6
- import { r as d } from "../reshape-CdBq1WJ6.js";
7
- import { m as c } from "../mat_mul-8m8pfdcx.js";
8
- class V extends u {
4
+ import { v as o } from "../variable-C9hihzDB.js";
5
+ import { r as h, d as f } from "../dropout-sx0sjVAT.js";
6
+ import { r as d } from "../reshape-DEfQGSin.js";
7
+ import { m as c } from "../mat_mul-DMkduNJu.js";
8
+ class H extends u {
9
9
  index;
10
10
  hiddenUnits;
11
11
  MLPHIDDEN;
12
12
  MLPOUT;
13
13
  constructor(i, t, s) {
14
- super(t, s), this.index = i, this.hiddenUnits = t.gpt.mlpFactor * t.gpt.nEmbed, this.MLPHIDDEN = `block_${this.index}_mlpHidden`, this.MLPOUT = `block_${this.index}_mlpOut`, this.addVariable(this.MLPHIDDEN), this.addVariable(this.MLPOUT);
14
+ super(t, s), this.index = i, this.hiddenUnits = t.mlpFactor * t.nEmbed, this.MLPHIDDEN = `block_${this.index}_mlpHidden`, this.MLPOUT = `block_${this.index}_mlpOut`, this.addVariable(this.MLPHIDDEN), this.addVariable(this.MLPOUT);
15
15
  }
16
16
  build() {
17
17
  this.hasVariable(this.MLPHIDDEN) === !1 && this.setVariable(
18
18
  this.MLPHIDDEN,
19
19
  o(
20
- h([this.config.gpt.nEmbed, this.hiddenUnits], 0, 0.02),
20
+ h([this.config.nEmbed, this.hiddenUnits], 0, 0.02),
21
21
  !0
22
22
  //`block_${this.index}_attn_cAttn_kernel`
23
23
  )
24
24
  ), this.hasVariable(this.MLPOUT) === !1 && this.setVariable(
25
25
  this.MLPOUT,
26
26
  o(
27
- h(
28
- [this.hiddenUnits, this.config.gpt.nEmbed],
29
- 0,
30
- 0.02 / Math.sqrt(2 * this.config.gpt.nLayer)
31
- ),
27
+ h([this.hiddenUnits, this.config.nEmbed], 0, 0.02 / Math.sqrt(2 * this.config.nLayer)),
32
28
  !0
33
29
  //`block_${this.index}_attn_cProj_kernel`
34
30
  )
35
31
  );
36
32
  }
37
33
  forward(i, t) {
38
- return l(() => {
34
+ return p(() => {
39
35
  this.startMemory();
40
- const [s, r, e] = t.shape, n = d(t, [s * r, e]), a = M(n, this.getVariable(this.MLPHIDDEN)), p = c(a, this.getVariable(this.MLPOUT));
36
+ const [s, r, e] = t.shape, n = d(t, [s * r, e]), a = M(n, this.getVariable(this.MLPHIDDEN)), m = c(a, this.getVariable(this.MLPOUT));
41
37
  a.dispose();
42
- const m = d(p, [s, r, e]);
43
- return this.endMemory("MLP"), m;
38
+ const l = d(m, [s, r, e]);
39
+ return this.endMemory("MLP"), l;
44
40
  });
45
41
  }
46
42
  dropout(i) {
47
- if (this.config.gpt.dropout > 0) {
48
- const t = f(i, this.config.gpt.dropout);
43
+ if (this.config.dropout > 0) {
44
+ const t = f(i, this.config.dropout);
49
45
  return i.dispose(), t;
50
46
  }
51
47
  return i;
52
48
  }
53
49
  }
54
50
  export {
55
- V as default
51
+ H as default
56
52
  };
@@ -0,0 +1,9 @@
1
+ import { Tensor } from '@tensorflow/tfjs-core';
2
+ import { default as BaseLayer } from './BaseLayer';
3
+ import { GPTConfig, ModelForwardAttributes } from '../main';
4
+ export default class PositionEmbedding extends BaseLayer {
5
+ private wpe?;
6
+ private drop;
7
+ constructor(config: GPTConfig, name?: string, parent?: BaseLayer);
8
+ forward(attrs: ModelForwardAttributes, x: Tensor): Tensor;
9
+ }
@@ -0,0 +1,45 @@
1
+ import { t as c, a8 as u, b as i } from "../index-CUQrfsw_.js";
2
+ import f from "./BaseLayer.js";
3
+ import { E as g, D as h } from "../random_width-D8Pwy_na.js";
4
+ import { r as b } from "../exports_initializers-DAKM8UO9.js";
5
+ import { m as l } from "../mod-uUuj4gSb.js";
6
+ import { r as w } from "../range-LVHrSLdi.js";
7
+ /**
8
+ * @license
9
+ * Copyright 2018 Google LLC
10
+ *
11
+ * Use of this source code is governed by an MIT-style
12
+ * license that can be found in the LICENSE file or at
13
+ * https://opensource.org/licenses/MIT.
14
+ * =============================================================================
15
+ */
16
+ function E(t) {
17
+ return new h(t);
18
+ }
19
+ function x(t) {
20
+ return new g(t);
21
+ }
22
+ class q extends f {
23
+ wpe;
24
+ // Position embeddings
25
+ drop;
26
+ // Dropout
27
+ constructor(o, n = "", r) {
28
+ super(o, r), this.wpe = x({
29
+ inputDim: this.config.blockSize,
30
+ outputDim: this.config.nEmbed,
31
+ name: n,
32
+ embeddingsInitializer: b({ mean: 0, stddev: 0.02 })
33
+ }), this.drop = E({ rate: this.config.dropout });
34
+ }
35
+ forward(o, n) {
36
+ const r = o.cache?.[0]?.length ?? 0;
37
+ return c(() => {
38
+ const [, s] = n.shape, e = this.config.blockSize, a = w(0, s, 1, "int32"), m = l(u(a, i(r, "int32")), i(e, "int32")), d = this.wpe.apply(m), p = n.add(d);
39
+ return this.drop.apply(p, { training: o.training });
40
+ });
41
+ }
42
+ }
43
+ export {
44
+ q as default
45
+ };
@@ -1,7 +1,8 @@
1
1
  import { Tensor } from '@tensorflow/tfjs-core';
2
- import { default as BaseLayer, ForwardAttributes, GPTLayerConfig } from './BaseLayer';
2
+ import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
3
+ import { GPTConfig } from '../main';
3
4
  export default class RMSNorm extends BaseLayer {
4
5
  private GAMMA;
5
- constructor(config: GPTLayerConfig, name?: string, parent?: BaseLayer);
6
+ constructor(config: GPTConfig, name?: string, parent?: BaseLayer);
6
7
  forward(_: ForwardAttributes, x: Tensor): Tensor;
7
8
  }
@@ -1,12 +1,12 @@
1
- import { t as s } from "../index-BoWRt-10.js";
1
+ import { t as s } from "../index-CUQrfsw_.js";
2
2
  import e from "./BaseLayer.js";
3
3
  import { normRMS as a } from "../ops/normRMS.js";
4
- import { v as i } from "../variable-BuddVFLa.js";
5
- import { o as m } from "../ones-Dj0SDhHf.js";
6
- class f extends e {
4
+ import { v as i } from "../variable-C9hihzDB.js";
5
+ import { o as m } from "../ones-ZdgQGBCP.js";
6
+ class l extends e {
7
7
  GAMMA;
8
8
  constructor(r, t = "", o) {
9
- super(r, o), this.GAMMA = t, this.addVariable(this.GAMMA, i(m([r.gpt.nEmbed]), !0, this.GAMMA, "float32"));
9
+ super(r, o), this.GAMMA = t, this.addVariable(this.GAMMA, i(m([r.nEmbed]), !0, this.GAMMA, "float32"));
10
10
  }
11
11
  forward(r, t) {
12
12
  return s(() => {
@@ -17,5 +17,5 @@ class f extends e {
17
17
  }
18
18
  }
19
19
  export {
20
- f as default
20
+ l as default
21
21
  };
@@ -1,5 +1,5 @@
1
1
  import { Tensor } from '@tensorflow/tfjs-core';
2
- import { GPTConfig } from '../config';
2
+ import { GPTConfig } from '../models/config';
3
3
  export default class RoPECache {
4
4
  private readonly config;
5
5
  readonly rotaryDim: number;
@@ -1,7 +1,7 @@
1
- import { b as t, x as h, t as n, w as p } from "../index-BoWRt-10.js";
2
- import { r as c } from "../reciprocal-BvGAyKyu.js";
3
- import { c as f, s as m } from "../sin-BeA3tsEd.js";
4
- import { r as a } from "../range-CRuAh-gd.js";
1
+ import { b as t, x as h, t as n, k as p } from "../index-CUQrfsw_.js";
2
+ import { r as c } from "../reciprocal-CaR9e67G.js";
3
+ import { c as f, s as m } from "../sin-D2CKKmyR.js";
4
+ import { r as a } from "../range-LVHrSLdi.js";
5
5
  class D {
6
6
  constructor(o) {
7
7
  this.config = o;
@@ -1,11 +1,12 @@
1
1
  import { Tensor } from '@tensorflow/tfjs-core';
2
- import { default as BaseLayer, ForwardAttributes, GPTLayerConfig } from './BaseLayer';
2
+ import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
3
+ import { GPTConfig } from '../models/config';
3
4
  export default class TiedEmbeddingOutputLayer extends BaseLayer {
4
5
  private vocabSize;
5
6
  private embedDim;
6
7
  private initializer;
7
8
  private WEIGHTS;
8
- constructor(config: GPTLayerConfig, name: string, parent?: BaseLayer);
9
+ constructor(config: GPTConfig, name: string, parent?: BaseLayer);
9
10
  embed(inputs: Tensor): Tensor;
10
11
  project(inputs: Tensor): Tensor;
11
12
  forward(_: ForwardAttributes, x: Tensor): Tensor;
@@ -1,9 +1,31 @@
1
- import "../random_width-sZORGo5k.js";
2
- import "../index-BoWRt-10.js";
3
- import { T as e } from "../TiedEmbedding-BxOerUmB.js";
4
- import "./BaseLayer.js";
5
- import "../variable-BuddVFLa.js";
6
- import "../gather-CMMy2KEG.js";
1
+ import { d as r } from "../random_width-D8Pwy_na.js";
2
+ import "../index-CUQrfsw_.js";
3
+ import { r as a } from "../exports_initializers-DAKM8UO9.js";
4
+ import s from "./BaseLayer.js";
5
+ import { v as m } from "../variable-C9hihzDB.js";
6
+ import { g as o } from "../gather-C1siEkdp.js";
7
+ class S extends s {
8
+ vocabSize;
9
+ embedDim;
10
+ initializer;
11
+ WEIGHTS;
12
+ constructor(i, e, t) {
13
+ super(i, t), this.WEIGHTS = e, this.vocabSize = i.vocabSize, this.embedDim = i.nEmbed, this.initializer = a({
14
+ mean: 0,
15
+ stddev: 0.02
16
+ }), this.addVariable(this.WEIGHTS, m(this.initializer.apply([this.vocabSize, this.embedDim]), !0));
17
+ }
18
+ embed(i) {
19
+ return o(this.getVariable(this.WEIGHTS), i, 0);
20
+ }
21
+ project(i) {
22
+ return r(i, this.getVariable(this.WEIGHTS).transpose());
23
+ }
24
+ // Dummy, should not be used.
25
+ forward(i, e) {
26
+ return this.project(e);
27
+ }
28
+ }
7
29
  export {
8
- e as default
30
+ S as default
9
31
  };
@@ -1,6 +1,7 @@
1
1
  import { AttentionScores, KVCache } from './CausalSelfAttention';
2
- import { default as BaseLayer, ForwardAttributes, GPTLayerConfig } from './BaseLayer';
2
+ import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
3
3
  import { Tensor } from '@tensorflow/tfjs-core';
4
+ import { GPTConfig } from '../models/config';
4
5
  interface BlockAttributes extends ForwardAttributes {
5
6
  pastKV?: KVCache;
6
7
  seed?: number;
@@ -13,7 +14,7 @@ export default class Block extends BaseLayer<BlockAttributes> {
13
14
  private mlp;
14
15
  private index;
15
16
  skipped: boolean;
16
- constructor(index: number, config: GPTLayerConfig, parent?: BaseLayer);
17
+ constructor(index: number, config: GPTConfig, parent?: BaseLayer);
17
18
  private getMLPOutput;
18
19
  forward(attrs: BlockAttributes, x: Tensor): Tensor;
19
20
  dispose(): void;
@@ -2,7 +2,7 @@ import l from "./CausalSelfAttention.js";
2
2
  import r from "./MLP.js";
3
3
  import o from "./RMSNorm.js";
4
4
  import d from "./BaseLayer.js";
5
- import { t as p } from "../index-BoWRt-10.js";
5
+ import { t as p } from "../index-CUQrfsw_.js";
6
6
  class k extends d {
7
7
  ln1;
8
8
  attn;
@@ -1,5 +1,5 @@
1
- import { default as NanoGPT } from '../NanoGPTModel';
2
1
  import { ITokeniser } from '../tokeniser/type';
2
+ import { default as Model, ModelForwardAttributes } from '../models/model';
3
3
  export declare const VERSION = 2;
4
4
  export interface Metadata {
5
5
  version: string;
@@ -7,7 +7,7 @@ export interface Metadata {
7
7
  name?: string;
8
8
  }
9
9
  export declare function loadModel(data: Blob | Buffer | string): Promise<{
10
- model: NanoGPT;
10
+ model: Model<ModelForwardAttributes>;
11
11
  tokeniser: ITokeniser;
12
12
  name?: string;
13
13
  }>;
@@ -1,7 +1,7 @@
1
- import { default as NanoGPT } from '../NanoGPTModel';
2
1
  import { ITokeniser } from '../main';
2
+ import { default as Model, ModelForwardAttributes } from '../models/model';
3
3
  export default function loadHuggingFace(name: string): Promise<{
4
- model: NanoGPT;
4
+ model: Model<ModelForwardAttributes>;
5
5
  tokeniser: ITokeniser;
6
6
  name?: string;
7
7
  }>;
@@ -1,5 +1,6 @@
1
1
  import { ITokeniser } from '../tokeniser/type';
2
- import { default as NanoGPT } from '../NanoGPTModel';
2
+ import { default as Model, ModelForwardAttributes } from '../models/model';
3
+ import { TrainingState } from '../training/Trainer';
3
4
  export interface TransformersConfig {
4
5
  model_type: string;
5
6
  vocab_size: number;
@@ -22,10 +23,11 @@ export interface TransformersMetadata {
22
23
  name?: string;
23
24
  version: number;
24
25
  application: string;
26
+ training?: TrainingState;
25
27
  [key: string]: unknown;
26
28
  }
27
29
  export default function loadTransformers(config: TransformersConfig, tokeniser: TransformersTokeniser, metadata: TransformersMetadata, weightData: ArrayBuffer): Promise<{
28
- model: NanoGPT;
30
+ model: Model<ModelForwardAttributes>;
29
31
  tokeniser: ITokeniser;
30
32
  name?: string;
31
33
  }>;
@@ -1,11 +1,12 @@
1
- import b from "../NanoGPTModel.js";
2
- import c from "../tokeniser/CharTokeniser.js";
3
- import l from "../tokeniser/bpe.js";
4
- import { load_safetensors as u } from "../utilities/safetensors.js";
5
- import { a0 as y } from "../index-BoWRt-10.js";
6
- import { dummyPassAsync as h } from "../utilities/dummy.js";
1
+ import l from "../tokeniser/CharTokeniser.js";
2
+ import c from "../tokeniser/bpe.js";
3
+ import { load_safetensors as b } from "../utilities/safetensors.js";
4
+ import { a1 as y } from "../index-CUQrfsw_.js";
5
+ import { dummyPassAsync as u } from "../utilities/dummy.js";
6
+ import _ from "../models/factory.js";
7
7
  async function L(e, a, r, t) {
8
8
  const n = {
9
+ modelType: e.model_type || "GenAI_NanoGPT_v1",
9
10
  vocabSize: e.vocab_size,
10
11
  blockSize: e.block_size,
11
12
  nLayer: e.num_hidden_layers,
@@ -16,12 +17,12 @@ async function L(e, a, r, t) {
16
17
  biasInLayerNorm: e.biasInLayerNorm,
17
18
  mlpFactor: e.mlpFactor,
18
19
  useRope: e.useRope
19
- }, m = (a.type ?? "char") === "char" ? new c(a.vocab) : new l(a.vocab, a.merges), i = await u(t), s = /* @__PURE__ */ new Map();
20
+ }, m = (a.type ?? "char") === "char" ? new l(a.vocab) : new c(a.vocab, a.merges), i = await b(t), s = /* @__PURE__ */ new Map();
20
21
  for (const [p, d] of Object.entries(i))
21
22
  s.set(p, [d]);
22
23
  y();
23
- const o = new b(n);
24
- return await h(o), o.loadWeights(s), { model: o, tokeniser: m, name: r.name };
24
+ const o = _(n);
25
+ return await u(o), o.loadWeights(s), { model: o, tokeniser: m, name: r.name };
25
26
  }
26
27
  export {
27
28
  L as default
@@ -1,8 +1,8 @@
1
1
  import { ITokeniser } from '../main';
2
- import { default as NanoGPT } from '../NanoGPTModel';
3
2
  import { default as zip } from 'jszip';
3
+ import { default as Model, ModelForwardAttributes } from '../models/model';
4
4
  export default function loadZipFile(zipFile: zip): Promise<{
5
- model: NanoGPT;
5
+ model: Model<ModelForwardAttributes>;
6
6
  tokeniser: ITokeniser;
7
7
  name?: string;
8
8
  }>;
@@ -1,7 +1,7 @@
1
1
  import { default as zip } from 'jszip';
2
2
  import { ITokeniser } from '../main';
3
- import { default as NanoGPT } from '../NanoGPTModel';
3
+ import { default as Model, ModelForwardAttributes } from '../models/model';
4
4
  export default function loadOldModel(zipFile: zip): Promise<{
5
- model: NanoGPT;
5
+ model: Model<ModelForwardAttributes>;
6
6
  tokeniser: ITokeniser;
7
7
  }>;