@genai-fi/nanogpt 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.js +13 -9
- package/dist/NanoGPTModel.js +10 -10
- package/dist/{RealDiv-C4hOvYOZ.js → RealDiv-CVYNbZxu.js} +11 -11
- package/dist/{Reshape-BLijOA8h.js → Reshape-CEsEp0AI.js} +2 -2
- package/dist/Reshape-Do18N3gO.js +30 -0
- package/dist/TeachableLLM.js +9 -5
- package/dist/{TiedEmbedding-BLltddza.js → TiedEmbedding-ccLBFiZi.js} +4 -4
- package/dist/{axis_util-DaAl5MER.js → axis_util-5DTW2tFV.js} +1 -1
- package/dist/backend.js +2 -2
- package/dist/{backend_util-DWiwsi2N.js → backend_util-C9Ut8n0Q.js} +40 -40
- package/dist/{broadcast_to-C4v-j9yA.js → broadcast_to-Ba9h_8DO.js} +2 -2
- package/dist/{concat-CsHeR4zV.js → concat-CbXTetof.js} +1 -1
- package/dist/{dataset-JDyjG3QR.js → dataset-U3PrjwgU.js} +7 -7
- package/dist/{dropout-hpDwECTe.js → dropout-DPfPgWWe.js} +11 -11
- package/dist/{gather-D0_gPiBz.js → gather-Bbh8DHhM.js} +4 -4
- package/dist/{gelu-uyHP1x1f.js → gelu-BFwVnd1r.js} +1 -1
- package/dist/{gpgpu_math-DJm3ZTAf.js → gpgpu_math-DffelNS-.js} +2 -2
- package/dist/{index-BPPzKVdR.js → index-DYD_yPa-.js} +1083 -1106
- package/dist/{index-C0dhsYom.js → index-UdZhlibC.js} +126 -126
- package/dist/{kernel_funcs_utils-CwRTFqrc.js → kernel_funcs_utils-CXDy3EN7.js} +3 -3
- package/dist/layers/BaseLayer.js +2 -2
- package/dist/layers/CausalSelfAttention.js +8 -8
- package/dist/layers/MLP.js +5 -5
- package/dist/layers/RMSNorm.js +3 -3
- package/dist/layers/RoPECache.js +4 -4
- package/dist/layers/TiedEmbedding.js +5 -5
- package/dist/layers/TransformerBlock.js +1 -1
- package/dist/loader/loadTransformers.js +1 -1
- package/dist/loader/oldZipLoad.js +11 -7
- package/dist/{log_sum_exp-D086OgZJ.js → log_sum_exp-BnmCkHWl.js} +8 -8
- package/dist/main.d.ts +11 -0
- package/dist/main.js +44 -27
- package/dist/{mat_mul-1nwdPkQ_.js → mat_mul-dwmZz69e.js} +1 -1
- package/dist/{max-BQc2Aj-I.js → max-ByjEGoFx.js} +3 -3
- package/dist/{mulmat_packed_gpu-Gzf3I9UV.js → mulmat_packed_gpu-IGPBp6h9.js} +1 -1
- package/dist/{ones-D63HpSF_.js → ones-C8Mfln6-.js} +2 -2
- package/dist/ops/adamAdjust.d.ts +2 -0
- package/dist/ops/adamAdjust.js +9 -0
- package/dist/ops/adamMoments.d.ts +2 -0
- package/dist/ops/adamMoments.js +9 -0
- package/dist/ops/appendCache.js +3 -3
- package/dist/ops/attentionMask.js +1 -1
- package/dist/ops/cpu/adamAdjust.d.ts +1 -0
- package/dist/ops/cpu/adamAdjust.js +18 -0
- package/dist/ops/cpu/adamMoments.d.ts +1 -0
- package/dist/ops/cpu/adamMoments.js +16 -0
- package/dist/ops/cpu/appendCache.js +2 -2
- package/dist/ops/cpu/attentionMask.js +5 -5
- package/dist/ops/cpu/fusedSoftmax.js +2 -2
- package/dist/ops/cpu/gatherSub.js +3 -3
- package/dist/ops/cpu/gelu.js +1 -1
- package/dist/ops/cpu/matMulGelu.js +2 -2
- package/dist/ops/cpu/matMulMul.js +1 -1
- package/dist/ops/cpu/mulDropout.js +1 -1
- package/dist/ops/cpu/normRMS.js +1 -1
- package/dist/ops/cpu/qkv.js +3 -3
- package/dist/ops/cpu/rope.js +5 -5
- package/dist/ops/cpu/scatterSub.js +11 -11
- package/dist/ops/fusedSoftmax.js +1 -1
- package/dist/ops/gatherSub.js +1 -1
- package/dist/ops/gelu.js +2 -2
- package/dist/ops/grads/attentionMask.js +1 -1
- package/dist/ops/grads/fusedSoftmax.js +2 -2
- package/dist/ops/grads/gelu.js +2 -2
- package/dist/ops/grads/matMulGelu.js +1 -1
- package/dist/ops/grads/normRMS.js +1 -1
- package/dist/ops/grads/qkv.js +1 -1
- package/dist/ops/grads/rope.js +1 -1
- package/dist/ops/matMulGelu.js +1 -1
- package/dist/ops/matMulMul.js +1 -1
- package/dist/ops/mulDrop.js +1 -1
- package/dist/ops/normRMS.js +1 -1
- package/dist/ops/qkv.js +1 -1
- package/dist/ops/rope.js +4 -4
- package/dist/ops/scatterSub.js +1 -1
- package/dist/ops/webgl/adamAdjust.d.ts +1 -0
- package/dist/ops/webgl/adamAdjust.js +50 -0
- package/dist/ops/webgl/adamMoments.d.ts +1 -0
- package/dist/ops/webgl/adamMoments.js +38 -0
- package/dist/ops/webgl/appendCache.js +1 -1
- package/dist/ops/webgl/attentionMask.js +1 -1
- package/dist/ops/webgl/fusedSoftmax.js +4 -4
- package/dist/ops/webgl/gatherSub.js +8 -8
- package/dist/ops/webgl/gelu.js +2 -2
- package/dist/ops/webgl/log.js +3 -3
- package/dist/ops/webgl/matMulGelu.js +4 -4
- package/dist/ops/webgl/matMulMul.js +1 -1
- package/dist/ops/webgl/mulDropout.js +1 -1
- package/dist/ops/webgl/normRMS.js +2 -2
- package/dist/ops/webgl/qkv.js +1 -1
- package/dist/ops/webgl/rope.js +1 -1
- package/dist/ops/webgl/scatterSub.js +1 -1
- package/dist/ops/webgpu/adamAdjust.d.ts +1 -0
- package/dist/ops/webgpu/adamAdjust.js +52 -0
- package/dist/ops/webgpu/adamMoments.d.ts +1 -0
- package/dist/ops/webgpu/adamMoments.js +51 -0
- package/dist/ops/webgpu/appendCache.js +13 -12
- package/dist/ops/webgpu/attentionMask.js +11 -10
- package/dist/ops/webgpu/gatherSub.js +26 -11
- package/dist/ops/webgpu/gelu.js +7 -6
- package/dist/ops/webgpu/index.js +3 -0
- package/dist/ops/webgpu/normRMS.js +27 -101
- package/dist/ops/webgpu/normRMSGrad.d.ts +1 -0
- package/dist/ops/webgpu/normRMSGrad.js +128 -0
- package/dist/ops/webgpu/qkv.js +9 -8
- package/dist/ops/webgpu/rope.js +8 -7
- package/dist/ops/webgpu/scatterSub.js +8 -7
- package/dist/ops/webgpu/utils/reductions.d.ts +9 -0
- package/dist/ops/webgpu/utils/reductions.js +68 -0
- package/dist/{ops-CIQLNshk.js → ops-aRTXR2Sr.js} +195 -219
- package/dist/{random_width-DkYP8W8N.js → random_width-DbSpgl4o.js} +22 -21
- package/dist/{range-CYzpQY53.js → range-D9CZhVlR.js} +1 -1
- package/dist/{reciprocal-_A9yv27J.js → reciprocal-CGB48wZB.js} +1 -1
- package/dist/{register_all_kernels-guvSxp7M.js → register_all_kernels-DnbAyBXt.js} +30 -29
- package/dist/{reshape-BMUzc1UY.js → reshape-BR0eoLYN.js} +3 -3
- package/dist/{scatter_nd_util-IRBqKz_b.js → scatter_nd_util-OjyAxku2.js} +1 -1
- package/dist/{selu_util-Dt_iuXaq.js → selu_util-Ce6pu9IM.js} +41 -41
- package/dist/{shared-CDu9S76h.js → shared-Czipaeb6.js} +6 -6
- package/dist/{shared-BNa2q6jD.js → shared-DS5waSIY.js} +1 -1
- package/dist/{sin-Cocju-BY.js → sin-CiBxrDqX.js} +6 -6
- package/dist/slice-BHbDHObE.js +28 -0
- package/dist/{softmax-GPNK3o-U.js → softmax-JMEIUo2J.js} +3 -3
- package/dist/{split-CHzJjxDv.js → split-CRU0PjVV.js} +1 -1
- package/dist/{stack-Dpgg_1W1.js → stack-ikk2Y8_P.js} +1 -1
- package/dist/{sum-B8wEpKsg.js → sum-NLYbiDag.js} +3 -3
- package/dist/{tensor-RvZVNmg0.js → tensor-Do9PKbIE.js} +1 -1
- package/dist/{tensor2d-B_kyod7_.js → tensor2d-CWHxHpLh.js} +1 -1
- package/dist/training/Adam.d.ts +22 -0
- package/dist/training/Adam.js +93 -0
- package/dist/training/AdamExt.d.ts +1 -1
- package/dist/training/AdamExt.js +13 -12
- package/dist/training/DatasetBuilder.js +2 -2
- package/dist/training/FullTrainer.js +22 -22
- package/dist/training/Trainer.d.ts +1 -1
- package/dist/training/Trainer.js +32 -32
- package/dist/training/sparseCrossEntropy.d.ts +0 -4
- package/dist/training/sparseCrossEntropy.js +7 -7
- package/dist/utilities/arrayClose.d.ts +1 -0
- package/dist/utilities/arrayClose.js +11 -0
- package/dist/utilities/dummy.js +2 -2
- package/dist/utilities/generate.js +3 -3
- package/dist/utilities/multinomialCPU.js +2 -2
- package/dist/utilities/performance.d.ts +1 -1
- package/dist/utilities/performance.js +11 -11
- package/dist/utilities/profile.js +1 -1
- package/dist/utilities/safetensors.js +2 -2
- package/dist/utilities/weights.js +2 -2
- package/dist/{variable-DXEUOwew.js → variable-BTBkayv_.js} +1 -1
- package/dist/{webgpu_util-g13LvDIv.js → webgpu_program-WaoMq-WD.js} +138 -215
- package/dist/webgpu_util-DhSeP4b6.js +80 -0
- package/dist/{zeros-DCPCdFGq.js → zeros-DnPT2nD4.js} +4 -4
- package/package.json +1 -1
package/dist/training/Adam.js
ADDED
```diff
@@ -0,0 +1,93 @@
+import { adamAdjust as b } from "../ops/adamAdjust.js";
+import { adamMoments as d } from "../ops/adamMoments.js";
+import { O as g, e as h, t as o, d as B } from "../index-UdZhlibC.js";
+import { z as M } from "../zeros-DnPT2nD4.js";
+/**
+ * @license
+ * Copyright 2018 Google LLC. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+class R extends g {
+  constructor(t, a, e, s = null) {
+    super(), this.learningRate = t, this.beta1 = a, this.beta2 = e, this.epsilon = s, this.accBeta1 = a, this.accBeta2 = e, s === null && (this.epsilon = h().backend.epsilon());
+  }
+  /** @nocollapse */
+  static get className() {
+    return "Adam";
+  }
+  accBeta1 = 0;
+  accBeta2 = 0;
+  accumulatedMoments = [];
+  applyGradients(t) {
+    const a = Array.isArray(t) ? t.map((e) => e.name) : Object.keys(t);
+    o(() => {
+      const e = 1 - this.accBeta1, s = 1 - this.accBeta2;
+      a.forEach((n, i) => {
+        const c = h().registeredVariables[n], u = !1;
+        this.accumulatedMoments[i] == null && (this.accumulatedMoments[i] = {
+          originalName: `${n}/m`,
+          variable: o(() => M([...c.shape, 2]).variable(u))
+        });
+        const r = Array.isArray(t) ? t[i].tensor : t[n];
+        if (r == null)
+          return;
+        const m = this.accumulatedMoments[i].variable, l = d(m, r, this.beta1, this.beta2);
+        m.assign(l);
+        const p = b(
+          l,
+          c,
+          e,
+          s,
+          this.epsilon ?? 1e-8,
+          this.learningRate
+        );
+        c.assign(p);
+      }), this.accBeta1 = this.accBeta1 * this.beta1, this.accBeta2 = this.accBeta2 * this.beta2;
+    }), this.incrementIterations();
+  }
+  dispose() {
+    this.accumulatedMoments != null && B(this.accumulatedMoments.map((t) => t.variable));
+  }
+  async getWeights() {
+    const t = [...this.accumulatedMoments];
+    return [await this.saveIterations()].concat(
+      t.map((a) => ({ name: a.originalName, tensor: a.variable }))
+    );
+  }
+  async setWeights(t) {
+    t = await this.extractIterations(t), o(() => {
+      this.accBeta1 = Math.pow(this.beta1, this.iterations_ + 1), this.accBeta2 = Math.pow(this.beta2, this.iterations_ + 1);
+    });
+    const a = t.length / 2, e = !1;
+    this.accumulatedMoments = t.slice(0, a).map((s) => ({
+      originalName: s.name,
+      variable: s.tensor.variable(e)
+    }));
+  }
+  getConfig() {
+    return {
+      learningRate: this.learningRate,
+      beta1: this.beta1,
+      beta2: this.beta2,
+      epsilon: this.epsilon
+    };
+  }
+  /** @nocollapse */
+  static fromConfig(t, a) {
+    return new t(a.learningRate, a.beta1, a.beta2, a.epsilon);
+  }
+}
+export {
+  R as AdamOptimizer
+};
```
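The new optimizer keeps both Adam moments in a single `[...shape, 2]` tensor per variable and defers the arithmetic to the fused `adamMoments`/`adamAdjust` ops registered per backend (the cpu/webgl/webgpu entries in the file list above). The kernels' internals are not part of this diff; assuming they implement the standard Adam update, the reference semantics would be:

```ts
// Reference semantics assumed for the fused kernels (standard Adam).
// Names here are illustrative, not the package's actual op signatures.
interface Moments { m: number; v: number }

// adamMoments(moments, grad, beta1, beta2): exponential moving averages.
function adamMomentsRef(prev: Moments, grad: number, beta1: number, beta2: number): Moments {
  return {
    m: beta1 * prev.m + (1 - beta1) * grad,        // first moment (mean)
    v: beta2 * prev.v + (1 - beta2) * grad * grad, // second moment (uncentred variance)
  };
}

// adamAdjust(moments, param, 1-beta1^t, 1-beta2^t, epsilon, lr): bias-corrected step.
function adamAdjustRef(
  mom: Moments,
  param: number,
  oneMinusAccBeta1: number,
  oneMinusAccBeta2: number,
  epsilon: number,
  lr: number,
): number {
  const mHat = mom.m / oneMinusAccBeta1;
  const vHat = mom.v / oneMinusAccBeta2;
  return param - (lr * mHat) / (Math.sqrt(vHat) + epsilon);
}
```

Passing `1 - accBeta1` and `1 - accBeta2` (computed once per step in `applyGradients`) matches the usual bias-correction terms, and storing m and v interleaved in one tensor halves the number of variables the optimizer tracks.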
package/dist/training/AdamExt.js
CHANGED
```diff
@@ -1,7 +1,8 @@
-import {
-
-
-
+import { a as r, b as c, c as h, e as o } from "../index-UdZhlibC.js";
+import { AdamOptimizer as g } from "./Adam.js";
+class y extends g {
+  constructor(t, e, s, i, a) {
+    super(t, e, s, i), this.config = a, this.startLearningRate = t;
   }
   step = 0;
   startLearningRate;
@@ -23,21 +24,21 @@ class u extends r {
   }
   decayVariable(t, e, s) {
     if (t && t.shape.length >= 2) {
-      const
-      t.assign(
+      const i = r(t, c(s * e));
+      t.assign(h(t, i)), i.dispose();
     }
   }
   applyWeightDecay(t) {
-    const e = this.config.weightDecay, s = this.learningRate,
-    Array.isArray(t) ? t.forEach(({ name:
-      const n = a
+    const e = this.config.weightDecay, s = this.learningRate, i = o().registeredVariables;
+    Array.isArray(t) ? t.forEach(({ name: a }) => {
+      const n = i[a];
       this.decayVariable(n, e, s);
-    }) : Object.keys(t).forEach((
-      const n = a
+    }) : Object.keys(t).forEach((a) => {
+      const n = i[a];
       this.decayVariable(n, e, s);
     });
   }
 }
 export {
-
+  y as default
 };
```
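Reading the minified aliases (`r`, `c`, `h` imported from the core bundle, plausibly `mul`, `scalar` and `sub`), `decayVariable` now applies decoupled, AdamW-style weight decay directly to the variable, and only for tensors of rank ≥ 2, so biases and norm gains are left alone. A reference version under that assumption:

```ts
// Reference for decayVariable, assuming r = mul, c = scalar, h = sub in the
// minified bundle: param -= param * (learningRate * weightDecay).
function decayVariableRef(param: Float32Array, learningRate: number, weightDecay: number): void {
  const k = learningRate * weightDecay;
  for (let j = 0; j < param.length; j++) {
    param[j] -= param[j] * k; // decoupled decay: no interaction with the Adam moments
  }
}
```

The added `i.dispose()` frees the intermediate product explicitly, one of several tensor-leak fixes in this release.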
package/dist/training/FullTrainer.js
CHANGED
```diff
@@ -1,21 +1,21 @@
-import { generateText as
-import
-import
-import { d as h } from "../index-
-import
-const
+import { generateText as w } from "../utilities/generate.js";
+import T from "./Trainer.js";
+import L from "./Evaluator.js";
+import { d as h } from "../index-UdZhlibC.js";
+import x from "../utilities/profile.js";
+const y = {
   desiredLoss: 0.01,
   logInterval: 1,
   maxSteps: 1e3
 };
-class
+class E extends T {
   constructor(i, e, r = 3e-4) {
     super(i, e, r);
   }
   // Train for multiple epochs using Dataset API - FIXED memory leaks
   async trainOnDataset(i, e, r) {
-    const {
-    ...
+    const { logInterval: g, onStep: l, prompt: c, maxSteps: u } = {
+      ...y,
       ...e
     }, n = Date.now(), t = {
       step: 0,
@@ -27,13 +27,13 @@ class I extends L {
       trainingDuration: 0,
       ...this.lastState || {}
     };
-    this.lastState = t, await this.dummyPass(), this.model.trainable = !0, e?.advancedMetrics && (this.model.getProfiler() || (this.model.config.layerConfig.profiler = new
-    const
+    this.lastState = t, await this.dummyPass(), this.model.trainable = !0, e?.advancedMetrics && (this.model.getProfiler() || (this.model.config.layerConfig.profiler = new x())), this.running = !0, t.logStartTime = n;
+    const m = r ? new L(this.model, r) : void 0, f = await i.iterator();
     try {
-      for (; this.running
+      for (; this.running; ) {
         const o = await f.next();
         if (o.done) break;
-        const d = o.value,
+        const d = o.value, p = this.trainBatch(t, d), s = {
           loss: t.lastLoss,
           step: t.step,
           time: Date.now() - n,
@@ -42,21 +42,21 @@ class I extends L {
           //gradientNorm: options?.advancedMetrics ? await state.gradientNorm : undefined,
         };
         if (this.model.log.push(s), t.step % g === 0) {
-          await
-          const
-          if (t.trainingDuration +=
+          await p.data();
+          const S = Date.now();
+          if (t.trainingDuration += S - t.logStartTime, m)
             try {
-              const a = await
+              const a = await m.evaluate(5);
               t.validationLosses.push(a), s.valLoss = a;
             } catch (a) {
               console.error("Validation error:", a);
             }
           if (l) {
-            if (
-            const
+            if (c) {
+              const v = await w(this.tokenizer, this.model, c, 100, {
                 temperature: 0.8
               });
-              s.example =
+              s.example = v;
             }
             const a = {
               duration: t.trainingDuration,
@@ -68,7 +68,7 @@ class I extends L {
           }
           t.logStartTime = Date.now();
         }
-        t.step >= u && this.stop();
+        p.dispose(), t.step >= u && this.stop();
       }
     } catch (o) {
       throw console.error("Training error:", o), h(), o;
@@ -77,5 +77,5 @@ class I extends L {
   }
 }
 export {
-
+  E as default
 };
```
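The `FullTrainer` changes are chiefly tensor-lifecycle fixes: the loss returned by `trainBatch` is awaited only on log steps and is now explicitly disposed (`p.dispose()`), and the evaluator/profiler are created lazily. A sketch of how the class appears to be driven, with option names taken from this diff and everything else (import path, model/tokenizer/dataset types, the exact `onStep` signature) assumed:

```ts
// Hypothetical driver for the API visible in this diff; not the package's documented usage.
import FullTrainer from "@genai-fi/nanogpt/dist/training/FullTrainer.js";

async function train(model: any, tokenizer: any, dataset: any, validationData?: unknown) {
  // Constructor order (model, tokenizer, learningRate) follows Trainer's constructor below.
  const trainer = new FullTrainer(model, tokenizer, 3e-4);
  await trainer.trainOnDataset(
    dataset,
    {
      maxSteps: 1000,   // defaults in the diff: desiredLoss 0.01, logInterval 1, maxSteps 1e3
      logInterval: 10,
      prompt: "Once upon a time", // triggers a 100-token sample at each log step
      onStep: (stats: any) => console.log(stats), // receives step, loss, valLoss, example, ...
    },
    validationData, // when present, an Evaluator reports valLoss every log interval
  );
}
```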
package/dist/training/Trainer.js
CHANGED
```diff
@@ -1,10 +1,10 @@
-import { DatasetBuilder as
-import
-import { t as f, v as y, d as c } from "../index-
-import { z as
+import { DatasetBuilder as h, flattenTokens as p, PAGE_FACTOR as g } from "./DatasetBuilder.js";
+import u from "./AdamExt.js";
+import { t as f, v as y, d as c } from "../index-UdZhlibC.js";
+import { z as m } from "../zeros-DnPT2nD4.js";
 class x {
-  constructor(t,
-    this.tokenizer =
+  constructor(t, e, a = 1e-3) {
+    this.tokenizer = e, this.model = t, this.learningRate = a, this.resetOptimizer(), this.datasetBuilder = new h(e, t.config.gpt.blockSize);
   }
   model;
   optimizer;
@@ -26,7 +26,7 @@ class x {
   }
   resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
     this.optimizer && this.optimizer.dispose();
-    const
+    const e = new u(
       t.learningRateFactor * this.learningRate,
       t.beta1,
       t.beta2,
@@ -38,7 +38,7 @@ class x {
         weightDecay: 0
       }
     );
-    this.optimizer =
+    this.optimizer = e;
   }
   /*private async maxGradNorm(grads: NamedVariableMap): Promise<number> {
     let maxNorm = 0;
@@ -56,55 +56,55 @@ class x {
     );
     return maxNorm;
   }*/
-  trainStep(t,
+  trainStep(t, e, a = !1) {
     return f(() => {
       this.model.getProfiler()?.startMemory();
-      const { xs:
-      const [l, d] = this.model.forward({ training: !0 },
+      const { xs: s, ys: i } = e, o = () => {
+        const [l, d] = this.model.forward({ training: !0 }, s, i);
         return l.dispose(), d;
       }, { value: n, grads: r } = y(o);
-      return
+      return a ? this.model.getProfiler()?.endMemory("Training") : (this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), c(r)), n;
     });
   }
   async dummyPass() {
-    const t =
+    const t = m([1, this.model.config.gpt.blockSize], "int32"), e = m([1, this.model.config.gpt.blockSize], "int32");
     try {
-      const
-      await
-    } catch (
-      console.error("Error during dummy pass:",
+      const a = this.trainStep({}, { xs: t, ys: e }, !0);
+      await a.data(), a.dispose();
+    } catch (a) {
+      console.error("Error during dummy pass:", a);
     } finally {
-      t.dispose(),
+      t.dispose(), e.dispose();
     }
   }
-
+  trainBatch(t, e) {
     try {
-      const
-      return
-    } catch (
-      throw console.error(`Error processing batch at step ${t.step}:`,
+      const a = this.trainStep(t, e, !1);
+      return e.xs.dispose(), e.ys.dispose(), t.step++, t.totalSteps++, a;
+    } catch (a) {
+      throw console.error(`Error processing batch at step ${t.step}:`, a), c(), a;
     }
   }
-  async createTrainValidationSplit(t,
-    const
-    if (
-      const r = Math.floor(
+  async createTrainValidationSplit(t, e = 32, a = 0.1) {
+    const s = await p(t, this.tokenizer), i = /* @__PURE__ */ new Set();
+    if (a > 0) {
+      const r = Math.floor(s.length / (this.datasetBuilder.blockSize * g)), l = Math.max(1, Math.floor(r * a));
       for (; i.size < l; ) {
         const d = Math.floor(Math.random() * r);
         i.add(d);
       }
     }
-    const o = await this.datasetBuilder.createTextDataset(
-      a,
+    const o = await this.datasetBuilder.createTextDataset(s, e, i, !1), n = await this.datasetBuilder.createTextDataset(
       s,
+      e,
       i,
       !0
     );
     return { trainDataset: o, validationDataset: n };
   }
-  async createDataset(t,
-    const
-    return await this.datasetBuilder.createTextDataset(
+  async createDataset(t, e = 32) {
+    const a = await p(t, this.tokenizer);
+    return await this.datasetBuilder.createTextDataset(a, e);
   }
   dispose() {
     this.optimizer && this.optimizer.dispose();
```
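`trainStep` gains a third flag (here `a`) that computes gradients without applying them; `dummyPass` uses it to push zero-filled int32 batches through the model once, so backend kernels are compiled before real training runs. In plain tfjs terms the warm-up amounts to something like:

```ts
// Spirit of dummyPass: one throwaway training step on zero tensors so that
// backend kernels/shaders compile before real steps are run or timed. Sketch only.
import * as tf from "@tensorflow/tfjs-core";

async function warmup(step: (xs: tf.Tensor, ys: tf.Tensor) => tf.Tensor, blockSize: number) {
  const xs = tf.zeros([1, blockSize], "int32");
  const ys = tf.zeros([1, blockSize], "int32");
  try {
    const loss = step(xs, ys); // gradients are computed but not applied (skip flag)
    await loss.data();         // forces execution of the whole graph
    loss.dispose();
  } catch (err) {
    console.error("Error during dummy pass:", err);
  } finally {
    xs.dispose();
    ys.dispose();
  }
}
```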
package/dist/training/sparseCrossEntropy.d.ts
CHANGED
```diff
@@ -4,8 +4,4 @@ import * as tf from '@tensorflow/tfjs-core';
  * This version handles potential numerical issues better
  */
 export declare function sparseSoftmaxCrossEntropy(logits: tf.Tensor, labels: tf.Tensor): tf.Tensor;
-/**
- * Custom gradient implementation for sparse cross-entropy
- * This ensures proper backpropagation
- */
 export declare function createSoftmaxCrossEntropyWithGrad(): (...args: tf.Tensor[]) => tf.Tensor<tf.Rank>;
```
package/dist/training/sparseCrossEntropy.js
CHANGED
```diff
@@ -1,22 +1,22 @@
 import { gatherSub as x } from "../ops/gatherSub.js";
 import { scatterSub as L } from "../ops/scatterSub.js";
-import {
-import { s as
-import { m as z } from "../max-
-import { l as v } from "../log_sum_exp-
+import { y, t as u, z as C, c as E } from "../index-UdZhlibC.js";
+import { s as G } from "../softmax-JMEIUo2J.js";
+import { m as z } from "../max-ByjEGoFx.js";
+import { l as v } from "../log_sum_exp-BnmCkHWl.js";
 function k(t, s) {
   return u(() => {
-    const n = t.shape[t.shape.length - 1], c = t.shape.slice(0, -1).reduce((o, e) => o * e, 1), h = t.shape.length > 2 ? t.reshape([c, n]) : t, p = s.shape.length > 1 ? s.reshape([c]).cast("int32") : s.cast("int32"), r = z(h, -1, !0), a =
+    const n = t.shape[t.shape.length - 1], c = t.shape.slice(0, -1).reduce((o, e) => o * e, 1), h = t.shape.length > 2 ? t.reshape([c, n]) : t, p = s.shape.length > 1 ? s.reshape([c]).cast("int32") : s.cast("int32"), r = z(h, -1, !0), a = E(h, r), m = v(a, -1);
     return x(m, p, a);
   });
 }
 function A() {
-  return
+  return y(
     // @ts-expect-error Invalid params
     (s, n, d) => {
       const c = s.shape[s.shape.length - 1], p = s.shape.slice(0, -1).reduce((o, e) => o * e, 1), r = s.reshape([p, c]), a = n.reshape([p]).cast("int32"), m = k(r, a);
       return d([r, a]), r.dispose(), a.dispose(), { value: m, gradFunc: (o, e) => u(() => {
-        const S = e[0], f = e[1], b =
+        const S = e[0], f = e[1], b = G(S), l = L(b, f, o), g = C(n);
         return [l.reshape(s.shape), g];
       }) };
     }
```
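The custom gradient uses `scatterSub` to realise the textbook identity for sparse softmax cross-entropy: the gradient with respect to the logits is softmax(logits) minus the one-hot labels, scaled by the upstream gradient. A standalone tfjs-core rendering of that backward rule (my construction, not the package's code):

```ts
import * as tf from "@tensorflow/tfjs-core";

// d(loss)/d(logits) for sparse softmax cross-entropy:
// softmax(logits) with 1 subtracted at each label index, times the upstream gradient dy.
function sparseCrossEntropyGradRef(
  logits: tf.Tensor2D,   // [batch, vocab]
  labels: tf.Tensor1D,   // [batch], int32 class indices
  dy: tf.Tensor1D,       // [batch], upstream gradient of the per-example losses
): tf.Tensor2D {
  return tf.tidy(() => {
    const probs = tf.softmax(logits);                  // [batch, vocab]
    const oneHot = tf.oneHot(labels, logits.shape[1]); // [batch, vocab]
    return tf.mul(tf.sub(probs, oneHot), dy.reshape([-1, 1])) as tf.Tensor2D;
  });
}
```

The fused `scatterSub` op in the diff performs the same subtract-at-index without materialising the one-hot tensor.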
package/dist/utilities/arrayClose.d.ts
ADDED
```diff
@@ -0,0 +1 @@
+export declare function arraysClose(a: unknown, b: unknown, epsilon?: number): boolean;
```
package/dist/utilities/arrayClose.js
ADDED
```diff
@@ -0,0 +1,11 @@
+function f(r, e, n = 1e-5) {
+  if (Array.isArray(r) && Array.isArray(e)) {
+    if (r.length !== e.length) return !1;
+    for (let t = 0; t < r.length; ++t)
+      if (!f(r[t], e[t], n)) return !1;
+    return !0;
+  } else return typeof r == "number" && typeof e == "number" ? r === -1 / 0 && e === -1 / 0 ? !0 : Math.abs(r - e) < n : !1;
+}
+export {
+  f as arraysClose
+};
```
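`arraysClose` is a small recursive epsilon-comparison for nested numeric arrays, with an explicit case so that two `-Infinity` entries compare equal (handy when checking masked attention logits). Usage, assuming the dist path is importable as published:

```ts
// Usage sketch; the deep-import path mirrors the dist layout in this diff.
import { arraysClose } from "@genai-fi/nanogpt/dist/utilities/arrayClose.js";

arraysClose([[1, 2], [3, -Infinity]], [[1.000001, 2], [3, -Infinity]]); // true (default epsilon 1e-5)
arraysClose([1, 2], [1, 2, 3]); // false: length mismatch
arraysClose(1, [1]);            // false: mixed shapes are not coerced
```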
package/dist/utilities/dummy.js
CHANGED
```diff
@@ -1,5 +1,5 @@
-import { m as y, v as P, e as S } from "../index-
-import { z as i } from "../zeros-
+import { m as y, v as P, e as S } from "../index-UdZhlibC.js";
+import { z as i } from "../zeros-DnPT2nD4.js";
 async function w(s) {
   const t = i([1, s.config.gpt.blockSize], "int32"), [e, n] = s.forward({ training: !1 }, t);
   await e.data(), e.dispose(), n && n.dispose(), t.dispose();
```
package/dist/utilities/generate.js
CHANGED
```diff
@@ -1,6 +1,6 @@
-import "../index-
-import { t as m } from "../tensor2d-
-import { c as u } from "../concat-
+import "../index-UdZhlibC.js";
+import { t as m } from "../tensor2d-CWHxHpLh.js";
+import { c as u } from "../concat-CbXTetof.js";
 async function v(o, r, a, c, f) {
   if (c <= 0)
     throw new Error("Length must be a positive integer");
```
package/dist/utilities/performance.d.ts
CHANGED
```diff
@@ -1,2 +1,2 @@
 import { Tensor } from '@tensorflow/tfjs-core';
-export default function performanceTest(fn: () => Tensor, iterations?: number): Promise<number>;
+export default function performanceTest(fn: () => Tensor, iterations?: number, allowPromise?: boolean): Promise<number>;
```
package/dist/utilities/performance.js
CHANGED
```diff
@@ -1,16 +1,16 @@
-import { t as
-async function
-for (let
-const
-await
+import { t as s } from "../index-UdZhlibC.js";
+async function f(e, o = 10, r = !1) {
+  for (let t = 0; t < 100; t++) {
+    const a = r ? await e() : s(e);
+    t === 99 && await a.data(), a.dispose();
   }
-  const
-  for (let
-  const
-
+  const n = performance.now();
+  for (let t = 0; t < o; t++) {
+    const a = r ? await e() : s(e);
+    t === o - 1 && await a.data(), a.dispose();
   }
-  return (performance.now() -
+  return (performance.now() - n) / o;
 }
 export {
-
+  f as default
 };
```
```diff
@@ -1,5 +1,5 @@
-import "../index-
-import { t as p } from "../tensor-
+import "../index-UdZhlibC.js";
+import { t as p } from "../tensor-Do9PKbIE.js";
 function h(n) {
   const e = n.reduce((s, o) => s + o.length, 0), a = new Float32Array(e);
   let t = 0;
```
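`performanceTest` picks up an `allowPromise` flag: when set, the callback may be async and is awaited directly rather than run through `tidy`. The harness does 100 warm-up runs, then times `iterations` runs (default 10) and returns the mean milliseconds per run. A hedged usage sketch, with the import path taken from the dist layout in this diff:

```ts
import * as tf from "@tensorflow/tfjs-core";
// Dist path as published; adjust to however the package actually exposes it.
import performanceTest from "@genai-fi/nanogpt/dist/utilities/performance.js";

async function benchmark() {
  const a = tf.randomNormal([256, 256]);
  const b = tf.randomNormal([256, 256]);
  // Synchronous callback: executed inside tidy by the harness.
  const ms = await performanceTest(() => tf.matMul(a, b), 20);
  // Async callback: the new allowPromise flag awaits the promise instead.
  const msAsync = await performanceTest(async () => tf.matMul(a, b), 20, true);
  console.log(`matMul: ${ms.toFixed(3)} ms (sync), ${msAsync.toFixed(3)} ms (async)`);
  a.dispose();
  b.dispose();
}
```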
|