@genai-fi/nanogpt 0.3.2 → 0.4.1

This diff shows the published contents of the two package versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (98)
  1. package/dist/Generator.js +22 -22
  2. package/dist/MLP-KHhikThU.js +83 -0
  3. package/dist/NanoGPTModel.d.ts +2 -3
  4. package/dist/NanoGPTModel.js +79 -79
  5. package/dist/TeachableLLM.js +16 -13
  6. package/dist/axis_util-DeydwOoC.js +69 -0
  7. package/dist/{concat-BIZS_td9.js → concat-DS_qH7MI.js} +5 -5
  8. package/dist/config.js +7 -8
  9. package/dist/{gather-BPGW8RsB.js → gather-BUmJIS8n.js} +1 -1
  10. package/dist/{index-pWA4_lUh.js → index-XjBAhiFO.js} +1272 -1174
  11. package/dist/layers/BaseLayer.d.ts +14 -2
  12. package/dist/layers/BaseLayer.js +9 -9
  13. package/dist/layers/CausalSelfAttention.d.ts +4 -8
  14. package/dist/layers/CausalSelfAttention.js +106 -80
  15. package/dist/layers/MLP.d.ts +2 -3
  16. package/dist/layers/MLP.js +5 -62
  17. package/dist/layers/RMSNorm.d.ts +2 -2
  18. package/dist/layers/RMSNorm.js +11 -11
  19. package/dist/layers/RoPECache.js +3 -3
  20. package/dist/layers/TiedEmbedding.js +7 -6
  21. package/dist/layers/TransformerBlock.d.ts +2 -6
  22. package/dist/layers/TransformerBlock.js +9 -12
  23. package/dist/{sum-C7Mgy9Bw.js → log_sum_exp-DJPkVZZn.js} +32 -54
  24. package/dist/main.js +22 -19
  25. package/dist/{mat_mul-D7_a4KJn.js → mat_mul-CKwFEV1Q.js} +1 -1
  26. package/dist/max-DJvEiCAJ.js +25 -0
  27. package/dist/moments-CrWRPcR3.js +53 -0
  28. package/dist/norm-BzY929B_.js +86 -0
  29. package/dist/{ones-Cog-G2ag.js → ones-BO01zpJG.js} +2 -2
  30. package/dist/ops/appendCache.js +1 -1
  31. package/dist/ops/attentionMask.js +1 -1
  32. package/dist/ops/cpu/appendCache.js +2 -2
  33. package/dist/ops/cpu/attentionMask.js +2 -2
  34. package/dist/ops/cpu/fusedSoftmax.d.ts +9 -0
  35. package/dist/ops/cpu/fusedSoftmax.js +23 -0
  36. package/dist/ops/cpu/gatherSub.js +3 -3
  37. package/dist/ops/cpu/mulDropout.d.ts +1 -0
  38. package/dist/ops/cpu/mulDropout.js +17 -0
  39. package/dist/ops/cpu/qkv.js +3 -3
  40. package/dist/ops/cpu/rope.js +5 -5
  41. package/dist/ops/cpu/scatterSub.js +27 -27
  42. package/dist/ops/fusedSoftmax.d.ts +2 -0
  43. package/dist/ops/fusedSoftmax.js +10 -0
  44. package/dist/ops/gatherSub.js +1 -1
  45. package/dist/ops/grads/attentionMask.js +1 -1
  46. package/dist/ops/grads/fusedSoftmax.d.ts +2 -0
  47. package/dist/ops/grads/fusedSoftmax.js +17 -0
  48. package/dist/ops/grads/qkv.js +1 -1
  49. package/dist/ops/grads/rope.js +1 -1
  50. package/dist/ops/mulDrop.d.ts +2 -0
  51. package/dist/ops/mulDrop.js +9 -0
  52. package/dist/ops/node/sparseCrossEntropy.js +1 -1
  53. package/dist/ops/qkv.js +1 -1
  54. package/dist/ops/scatterSub.js +1 -1
  55. package/dist/ops/webgl/appendCache.js +1 -1
  56. package/dist/ops/webgl/attentionMask.js +1 -1
  57. package/dist/ops/webgl/fusedSoftmax.d.ts +11 -0
  58. package/dist/ops/webgl/fusedSoftmax.js +3930 -0
  59. package/dist/ops/webgl/gatherSub.js +1 -1
  60. package/dist/ops/webgl/mulDropout.d.ts +1 -0
  61. package/dist/ops/webgl/mulDropout.js +41 -0
  62. package/dist/ops/webgl/qkv.js +1 -1
  63. package/dist/ops/webgl/rope.js +1 -1
  64. package/dist/ops/webgl/scatterSub.js +1 -1
  65. package/dist/{random_width-oeUIlUZj.js → random_width-CMHmdbSu.js} +4212 -6630
  66. package/dist/{range-CcDl05lo.js → range-DQMNzBWs.js} +1 -1
  67. package/dist/{reshape-C8CR_Bad.js → reshape-DFzh97Sc.js} +1 -1
  68. package/dist/{sin-BJIrfnj7.js → sin-BYM-U4Ut.js} +1 -1
  69. package/dist/slice_util-CnVNPQI-.js +90 -0
  70. package/dist/softmax-4DOn6cPq.js +28 -0
  71. package/dist/{split-DZbvruEP.js → split-CkbeVdF8.js} +3 -3
  72. package/dist/{stack-BMm-efee.js → stack-DaIMO5iX.js} +1 -1
  73. package/dist/sum-C6u3xMi3.js +27 -0
  74. package/dist/{tensor-DJVbYhh1.js → tensor-Cu1fU7H7.js} +1 -1
  75. package/dist/{tensor2d-ZuQSh2D-.js → tensor2d-D0CKdG6B.js} +1 -1
  76. package/dist/tfjs_backend-Bzl2SrRo.js +2460 -0
  77. package/dist/training/AdamExt.js +1 -1
  78. package/dist/training/DatasetBuilder.js +3 -3
  79. package/dist/training/FullTrainer.js +1 -1
  80. package/dist/training/Trainer.js +13 -12
  81. package/dist/training/sparseCrossEntropy.js +12 -11
  82. package/dist/utilities/dummy.js +8 -8
  83. package/dist/utilities/generate.js +11 -11
  84. package/dist/utilities/load.js +1 -1
  85. package/dist/utilities/profile.js +1 -1
  86. package/dist/utilities/weights.js +2 -2
  87. package/dist/{variable-Dl_ub3pk.js → variable-BS4AKqNU.js} +1 -1
  88. package/dist/{zeros-CCy9C3uU.js → zeros-CmJFiC84.js} +1 -1
  89. package/package.json +1 -1
  90. package/dist/exports_layers-tbTBcwMM.js +0 -25
  91. package/dist/layers/LayerNorm.d.ts +0 -13
  92. package/dist/layers/LayerNorm.js +0 -33
  93. package/dist/moments-DfcpfwKi.js +0 -132
  94. package/dist/softmax-Be_lsqUc.js +0 -105
  95. package/dist/training/LayerTrainer.d.ts +0 -29
  96. package/dist/training/LayerTrainer.js +0 -95
  97. package/dist/training/lwSchedule.d.ts +0 -7
  98. package/dist/training/lwSchedule.js +0 -162
package/dist/training/LayerTrainer.js
@@ -1,95 +0,0 @@
- import { generateText as u } from "../utilities/generate.js";
- import v from "./Trainer.js";
- import { schedule as w } from "./lwSchedule.js";
- import T from "./Evaluator.js";
- import { a as m } from "../index-pWA4_lUh.js";
- const x = {
-   desiredLoss: 0.01,
-   logInterval: 1,
-   stepsPerLayer: 400,
-   maxPasses: 3,
-   maxSteps: 1e3
- };
- class E extends v {
-   trainingPattern = [];
-   startPass = 0;
-   startLayer = 0;
-   constructor(a, r, e = 3e-4) {
-     if (super(a, r, e), this.trainingPattern = w[a.config.nLayer - 1] || [], a.log.length > 0) {
-       const i = a.log[a.log.length - 1];
-       i.pass !== void 0 && i.layer !== void 0 && (this.startPass = i.pass, this.startLayer = i.layer, console.log(`Resuming training from pass ${this.startPass}, layer ${this.startLayer}`));
-     }
-   }
-   applyTrainingPattern(a) {
-     const r = a < this.trainingPattern.length ? a : this.trainingPattern.length - 1, e = this.trainingPattern[r];
-     this.model.setSkipMask(e.skip), this.model.setTrainableMask(e.trainable), this.resetOptimizer(e.adam), console.log("Applied training pattern:", r, e);
-   }
-   // Train for multiple epochs using Dataset API - FIXED memory leaks
-   async trainOnDataset(a, r, e) {
-     const { desiredLoss: i, logInterval: L, stepsPerLayer: d, onLayerChange: l, onPassComplete: p, onStep: h, prompt: c } = {
-       ...x,
-       ...r
-     }, t = {
-       pass: 0,
-       layerStep: 0,
-       step: 0,
-       stepSinceLayerChange: 0,
-       lastLoss: 1e6,
-       totalSteps: 0,
-       losses: [],
-       validationLosses: [],
-       trainingDuration: 0
-     };
-     this.dummyPass();
-     const S = Date.now();
-     this.startPass = 0, this.startLayer = 0;
-     const g = e ? new T(this.model, e) : void 0, f = await a.iterator();
-     this.applyTrainingPattern(t.layerStep % this.trainingPattern.length);
-     try {
-       for (; !(t.lastLoss < i); ) {
-         const o = await f.next();
-         if (o.done) break;
-         const y = o.value, P = this.trainBatch(t, y);
-         t.stepSinceLayerChange++;
-         const n = {
-           loss: t.lastLoss,
-           step: t.step,
-           time: Date.now() - S,
-           batchSize: y.xs.shape[0],
-           pass: t.pass,
-           layer: t.layerStep % this.model.config.nLayer
-         };
-         if (this.model.log.push(n), t.step % L === 0) {
-           if (await P, g)
-             try {
-               const s = await g.evaluate(5);
-               t.validationLosses.push(s), n.valLoss = s;
-             } catch (s) {
-               console.error("Validation error:", s);
-             }
-           if (h) {
-             if (c) {
-               const s = await u(this.tokenizer, this.model, c, 100, {
-                 temperature: 0.8,
-                 topK: 10
-               });
-               n.example = s;
-             }
-             await h(n, {
-               duration: t.trainingDuration,
-               totalSamples: t.totalSteps * n.batchSize,
-               samplesPerSecond: t.totalSteps * n.batchSize / (t.trainingDuration / 1e3)
-             });
-           }
-         }
-         t.stepSinceLayerChange >= d && (t.layerStep++, t.layerStep % this.model.config.nLayer === 0 ? (l && await l(t.layerStep, t.pass), p && await p(t.pass), t.pass++) : l && await l(t.layerStep, t.pass), t.stepSinceLayerChange = 0, this.applyTrainingPattern(t.layerStep % this.trainingPattern.length));
-       }
-     } catch (o) {
-       throw console.error("Training error:", o), m(), o;
-     }
-     return m(), { losses: t.losses, validationLosses: t.validationLosses };
-   }
- }
- export {
-   E as default
- };
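
The removed LayerTrainer implemented layer-wise training: it selected a pattern list by model depth (schedule[nLayer - 1]), re-applied the next pattern every stepsPerLayer steps (default 400), and counted one pass each time layerStep wrapped around nLayer. A minimal de-minified sketch of the mask-switching step follows; the TrainingPattern and LayerwiseModel types are illustrative stand-ins inferred from the minified calls, not exports of the package.

interface TrainingPattern {
  adam: { learningRateFactor: number; beta1: number; beta2: number; epsilon: number };
  skip: boolean[];       // per layer: bypass this transformer block entirely
  trainable: boolean[];  // per layer: let gradients update this block
}

interface LayerwiseModel {
  setSkipMask(skip: boolean[]): void;
  setTrainableMask(trainable: boolean[]): void;
}

// Mirrors applyTrainingPattern in the minified source: clamp the index to
// the last pattern, apply both masks, then rebuild the optimizer state.
function applyTrainingPattern(
  model: LayerwiseModel,
  patterns: TrainingPattern[],
  layerStep: number,
  resetOptimizer: (adam: TrainingPattern["adam"]) => void
): void {
  const index = Math.min(layerStep, patterns.length - 1);
  const pattern = patterns[index];
  model.setSkipMask(pattern.skip);
  model.setTrainableMask(pattern.trainable);
  resetOptimizer(pattern.adam); // fresh Adam moments whenever masks change
}

Resetting the optimizer on every pattern change is presumably deliberate: Adam moment estimates accumulated while a layer was frozen would be stale once that layer becomes trainable again.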
package/dist/training/lwSchedule.d.ts
@@ -1,7 +0,0 @@
- import { AdamConfig } from './Trainer';
- export interface LWSchedule {
-   adam: AdamConfig;
-   skip: boolean[];
-   trainable: boolean[];
- }
- export declare const schedule: LWSchedule[][];
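
Written out readably, a single schedule step looks like the literal below (the first step of the two-layer schedule in lwSchedule.js, which follows); !0 and !1 in the minified bundle are simply true and false. The AdamConfig alias here lists only the four fields the schedule data uses; the real type is imported from ./Trainer.

type AdamConfig = {
  learningRateFactor: number;
  beta1: number;
  beta2: number;
  epsilon: number;
};

interface LWSchedule {
  adam: AdamConfig;
  skip: boolean[];      // one flag per transformer layer
  trainable: boolean[]; // one flag per transformer layer
}

const firstTwoLayerStep: LWSchedule = {
  adam: { learningRateFactor: 1, beta1: 0.9, beta2: 0.999, epsilon: 1e-8 },
  skip: [true, false],      // run only layer 1's block
  trainable: [false, true]  // update only layer 1's weights
};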
package/dist/training/lwSchedule.js
@@ -1,162 +0,0 @@
- const e = [
-   [
-     {
-       adam: {
-         learningRateFactor: 1,
-         beta1: 0.9,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!1],
-       trainable: [!0]
-     }
-   ],
-   [
-     {
-       adam: {
-         learningRateFactor: 1,
-         beta1: 0.9,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!0, !1],
-       trainable: [!1, !0]
-     },
-     {
-       adam: {
-         learningRateFactor: 1,
-         beta1: 0.9,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!1, !1],
-       trainable: [!0, !1]
-     },
-     {
-       adam: {
-         learningRateFactor: 0.3333333333333333,
-         beta1: 0.95,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!1, !1],
-       trainable: [!0, !0]
-     }
-   ],
-   [],
-   [
-     {
-       adam: {
-         learningRateFactor: 1,
-         beta1: 0.9,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!0, !0, !0, !1],
-       trainable: [!1, !1, !1, !0]
-     },
-     {
-       adam: {
-         learningRateFactor: 1,
-         beta1: 0.9,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!0, !0, !1, !1],
-       trainable: [!1, !1, !0, !1]
-     },
-     {
-       adam: {
-         learningRateFactor: 0.3333333333333333,
-         beta1: 0.95,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!0, !0, !1, !1],
-       trainable: [!1, !1, !1, !0]
-     },
-     {
-       adam: {
-         learningRateFactor: 1,
-         beta1: 0.9,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!0, !1, !1, !1],
-       trainable: [!1, !0, !1, !1]
-     },
-     {
-       adam: {
-         learningRateFactor: 0.3333333333333333,
-         beta1: 0.95,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!0, !1, !1, !1],
-       trainable: [!1, !1, !0, !1]
-     },
-     {
-       adam: {
-         learningRateFactor: 0.16666666666666666,
-         beta1: 0.98,
-         beta2: 0.9999,
-         epsilon: 1e-8
-       },
-       skip: [!0, !1, !1, !1],
-       trainable: [!1, !1, !1, !0]
-     },
-     {
-       adam: {
-         learningRateFactor: 1,
-         beta1: 0.9,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!1, !1, !1, !1],
-       trainable: [!0, !1, !1, !1]
-     },
-     {
-       adam: {
-         learningRateFactor: 0.3333333333333333,
-         beta1: 0.95,
-         beta2: 0.999,
-         epsilon: 1e-8
-       },
-       skip: [!1, !1, !1, !1],
-       trainable: [!1, !0, !1, !1]
-     },
-     {
-       adam: {
-         learningRateFactor: 0.16666666666666666,
-         beta1: 0.98,
-         beta2: 0.9999,
-         epsilon: 1e-8
-       },
-       skip: [!1, !1, !1, !1],
-       trainable: [!1, !1, !0, !1]
-     },
-     {
-       adam: {
-         learningRateFactor: 0.16666666666666666,
-         beta1: 0.98,
-         beta2: 0.9999,
-         epsilon: 1e-8
-       },
-       skip: [!1, !1, !1, !1],
-       trainable: [!1, !1, !1, !0]
-     },
-     {
-       adam: {
-         learningRateFactor: 0.16666666666666666,
-         beta1: 0.98,
-         beta2: 0.9999,
-         epsilon: 1e-8
-       },
-       skip: [!1, !1, !1, !1],
-       trainable: [!0, !0, !0, !0]
-     }
-   ]
- ];
- export {
-   e as schedule
- };
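
Reading the data back: schedule is indexed by nLayer - 1 (the LayerTrainer constructor reads w[a.config.nLayer - 1]), so index 0 covers 1-layer models, index 1 covers 2-layer models, the empty array at index 2 leaves 3-layer models without a schedule, and index 3 holds eleven steps for 4-layer models. As training revisits layers, the learning-rate factor steps down from 1 to 1/3 to 1/6 while beta1 and beta2 rise (0.9 → 0.95 → 0.98 and 0.999 → 0.9999), so later updates are smaller and smoother; the final step unfreezes all four layers at the lowest rate. A hypothetical helper, not part of the package, that renders one step readably against the schedule export shown above:

import { schedule } from "./lwSchedule.js"; // module removed in 0.4.1

function describeStep(nLayer: number, stepIndex: number): string {
  const steps = schedule[nLayer - 1] ?? [];
  if (steps.length === 0) return "no schedule for this depth";
  const s = steps[stepIndex % steps.length];
  const active: number[] = [];   // layers whose blocks run
  const trained: number[] = [];  // layers receiving gradient updates
  s.skip.forEach((skip, i) => { if (!skip) active.push(i); });
  s.trainable.forEach((t, i) => { if (t) trained.push(i); });
  return `runs layers [${active}], trains layers [${trained}], lr factor ${s.adam.learningRateFactor}`;
}

// For a 4-layer model, step 0 runs and trains only the last block:
console.log(describeStep(4, 0)); // runs layers [3], trains layers [3], lr factor 1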