@genai-fi/nanogpt 0.2.9 → 0.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.d.ts +2 -0
- package/dist/Generator.js +37 -32
- package/dist/NanoGPTModel.d.ts +4 -1
- package/dist/NanoGPTModel.js +33 -25
- package/dist/TeachableLLM.d.ts +4 -0
- package/dist/TeachableLLM.js +31 -16
- package/dist/{complex-Cd8sqiBC.js → complex-x7w5HPOS.js} +6 -6
- package/dist/{index-Dsg28SG6.js → index-CWQLouWz.js} +39 -35
- package/dist/layers/BaseLayer.d.ts +8 -0
- package/dist/layers/BaseLayer.js +18 -0
- package/dist/layers/CausalSelfAttention.d.ts +2 -1
- package/dist/layers/CausalSelfAttention.js +10 -8
- package/dist/layers/MLP.d.ts +2 -1
- package/dist/layers/MLP.js +16 -14
- package/dist/layers/RMSNorm.d.ts +2 -1
- package/dist/layers/RMSNorm.js +13 -11
- package/dist/layers/TiedEmbedding.js +21 -21
- package/dist/layers/TransformerBlock.d.ts +4 -1
- package/dist/layers/TransformerBlock.js +9 -5
- package/dist/{mat_mul-BAYDrXvE.js → mat_mul-4v7St11W.js} +5 -5
- package/dist/ops/attentionMask.js +31 -25
- package/dist/ops/gatherSub.js +2 -2
- package/dist/ops/node/sparseCrossEntropy.js +1 -1
- package/dist/ops/scatterSub.js +8 -8
- package/dist/{stack-1o648CP_.js → stack-CTdK-itU.js} +7 -7
- package/dist/{sum-NWazHI7f.js → sum-CnIf1YOh.js} +3 -3
- package/dist/training/AdamExt.js +1 -1
- package/dist/training/Trainer.js +30 -29
- package/dist/training/sparseCrossEntropy.js +9 -9
- package/dist/utilities/profile.d.ts +10 -0
- package/dist/utilities/profile.js +29 -0
- package/package.json +1 -1
package/dist/layers/MLP.js
CHANGED
|
@@ -1,31 +1,32 @@
|
|
|
1
|
-
|
|
1
|
+
import a from "./BaseLayer.js";
|
|
2
|
+
class l extends a {
|
|
2
3
|
cFc;
|
|
3
4
|
cProj;
|
|
4
5
|
dropout;
|
|
5
6
|
tf;
|
|
6
7
|
index;
|
|
7
8
|
_trainable = !0;
|
|
8
|
-
constructor(t,
|
|
9
|
-
this.tf = t, this.index =
|
|
10
|
-
units:
|
|
9
|
+
constructor(t, i, e) {
|
|
10
|
+
super(), this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
|
|
11
|
+
units: e.mlpFactor * e.nEmbed,
|
|
11
12
|
activation: "gelu",
|
|
12
|
-
useBias:
|
|
13
|
+
useBias: e.biasInLinear,
|
|
13
14
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
14
15
|
mean: 0,
|
|
15
16
|
stddev: 0.02
|
|
16
17
|
}),
|
|
17
18
|
biasInitializer: "zeros",
|
|
18
|
-
name: `block_${
|
|
19
|
+
name: `block_${i}_mlp_cFc`
|
|
19
20
|
}), this.cProj = this.tf.layers.dense({
|
|
20
|
-
units:
|
|
21
|
-
useBias:
|
|
21
|
+
units: e.nEmbed,
|
|
22
|
+
useBias: e.biasInLinear,
|
|
22
23
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
23
24
|
mean: 0,
|
|
24
|
-
stddev: 0.02 / Math.sqrt(2 *
|
|
25
|
+
stddev: 0.02 / Math.sqrt(2 * e.nLayer)
|
|
25
26
|
}),
|
|
26
27
|
biasInitializer: "zeros",
|
|
27
|
-
name: `block_${
|
|
28
|
-
}), this.dropout = this.tf.layers.dropout({ rate:
|
|
28
|
+
name: `block_${i}_mlp_cProj`
|
|
29
|
+
}), this.dropout = this.tf.layers.dropout({ rate: e.dropout });
|
|
29
30
|
}
|
|
30
31
|
get variables() {
|
|
31
32
|
return [
|
|
@@ -45,10 +46,11 @@ class l {
|
|
|
45
46
|
loadWeights(t) {
|
|
46
47
|
this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
|
|
47
48
|
}
|
|
48
|
-
call(t,
|
|
49
|
+
call(t, i = !1) {
|
|
49
50
|
return this.tf.tidy(() => {
|
|
50
|
-
|
|
51
|
-
|
|
51
|
+
this.startMemory();
|
|
52
|
+
const e = this.cFc.apply(t), s = this.cProj.apply(e), r = this.dropout.apply(s, { training: i });
|
|
53
|
+
return this.endMemory("MLP"), r;
|
|
52
54
|
});
|
|
53
55
|
}
|
|
54
56
|
dispose() {
|
package/dist/layers/RMSNorm.d.ts
CHANGED
package/dist/layers/RMSNorm.js
CHANGED
|
@@ -1,26 +1,28 @@
|
|
|
1
|
-
|
|
1
|
+
import m from "./BaseLayer.js";
|
|
2
|
+
class o extends m {
|
|
2
3
|
gamma;
|
|
3
4
|
epsilon;
|
|
4
5
|
tf;
|
|
5
|
-
constructor(
|
|
6
|
-
this.tf =
|
|
6
|
+
constructor(t, s, a = 1e-8, e = "") {
|
|
7
|
+
super(), this.tf = t, this.epsilon = a, this.gamma = t.variable(t.ones(s), !0, `${e}_gamma`, "float32");
|
|
7
8
|
}
|
|
8
9
|
get trainableWeights() {
|
|
9
10
|
return [this.gamma];
|
|
10
11
|
}
|
|
11
|
-
set trainable(
|
|
12
|
-
this.gamma.trainable =
|
|
12
|
+
set trainable(t) {
|
|
13
|
+
this.gamma.trainable = t;
|
|
13
14
|
}
|
|
14
15
|
getWeights() {
|
|
15
16
|
return [this.gamma];
|
|
16
17
|
}
|
|
17
|
-
setWeights(
|
|
18
|
-
this.gamma.assign(
|
|
18
|
+
setWeights(t) {
|
|
19
|
+
this.gamma.assign(t[0]);
|
|
19
20
|
}
|
|
20
|
-
apply(
|
|
21
|
+
apply(t) {
|
|
21
22
|
return this.tf.tidy(() => {
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
this.startMemory();
|
|
24
|
+
const a = t.square().mean(-1, !0).add(this.epsilon).rsqrt(), r = t.mul(a).mul(this.gamma);
|
|
25
|
+
return this.endMemory("RMSNorm"), r;
|
|
24
26
|
});
|
|
25
27
|
}
|
|
26
28
|
dispose() {
|
|
@@ -28,5 +30,5 @@ class m {
|
|
|
28
30
|
}
|
|
29
31
|
}
|
|
30
32
|
export {
|
|
31
|
-
|
|
33
|
+
o as default
|
|
32
34
|
};
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { o as h,
|
|
2
|
-
import { s as ce, r as f } from "../sum-
|
|
3
|
-
import { m } from "../mat_mul-
|
|
4
|
-
import { c as pe } from "../complex-
|
|
1
|
+
import { o as h, d as i, E as o, F as V, H as X, I as Y, J as Z, N as ee, K as te, O as se, Q as ne, T as re, U as ue, i as L, z as ae, V as A, a as ie, W as oe, w as le, f as q, p as C, X as P, y as U, _ as H } from "../index-CWQLouWz.js";
|
|
2
|
+
import { s as ce, r as f } from "../sum-CnIf1YOh.js";
|
|
3
|
+
import { m } from "../mat_mul-4v7St11W.js";
|
|
4
|
+
import { c as pe } from "../complex-x7w5HPOS.js";
|
|
5
5
|
/**
|
|
6
6
|
* @license
|
|
7
7
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
@@ -169,7 +169,7 @@ function Me(t) {
|
|
|
169
169
|
const s = { x: i(t, "x", "relu") };
|
|
170
170
|
return o.runKernel(ne, s);
|
|
171
171
|
}
|
|
172
|
-
const
|
|
172
|
+
const we = /* @__PURE__ */ h({ relu_: Me });
|
|
173
173
|
/**
|
|
174
174
|
* @license
|
|
175
175
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -186,11 +186,11 @@ const We = /* @__PURE__ */ h({ relu_: Me });
|
|
|
186
186
|
* limitations under the License.
|
|
187
187
|
* =============================================================================
|
|
188
188
|
*/
|
|
189
|
-
function
|
|
189
|
+
function We(t) {
|
|
190
190
|
const s = { x: i(t, "x", "relu6") };
|
|
191
191
|
return o.runKernel(re, s);
|
|
192
192
|
}
|
|
193
|
-
const ze = /* @__PURE__ */ h({ relu6_:
|
|
193
|
+
const ze = /* @__PURE__ */ h({ relu6_: We });
|
|
194
194
|
/**
|
|
195
195
|
* @license
|
|
196
196
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
@@ -273,7 +273,7 @@ function Te(t, e, s, n) {
|
|
|
273
273
|
if (e === "linear")
|
|
274
274
|
return t;
|
|
275
275
|
if (e === "relu")
|
|
276
|
-
return
|
|
276
|
+
return we(t);
|
|
277
277
|
if (e === "elu")
|
|
278
278
|
return me(t);
|
|
279
279
|
if (e === "relu6")
|
|
@@ -310,14 +310,14 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
|
|
|
310
310
|
}
|
|
311
311
|
let u = i(t, "a", "fused matMul"), a = i(e, "b", "fused matMul");
|
|
312
312
|
[u, a] = q(u, a);
|
|
313
|
-
const D = s ? u.shape[u.rank - 2] : u.shape[u.rank - 1], b = n ? a.shape[a.rank - 1] : a.shape[a.rank - 2],
|
|
313
|
+
const D = s ? u.shape[u.rank - 2] : u.shape[u.rank - 1], b = n ? a.shape[a.rank - 1] : a.shape[a.rank - 2], w = s ? u.shape[u.rank - 1] : u.shape[u.rank - 2], W = n ? a.shape[a.rank - 2] : a.shape[a.rank - 1], T = u.shape.slice(0, -2), y = a.shape.slice(0, -2), B = C(T), N = C(y);
|
|
314
314
|
L(D === b, () => `Error in fused matMul: inner shapes (${D}) and (${b}) of Tensors with shapes ${u.shape} and ${a.shape} and transposeA=${s} and transposeB=${n} must match.`);
|
|
315
|
-
const O = P(u.shape.slice(0, -2), a.shape.slice(0, -2)).concat([
|
|
315
|
+
const O = P(u.shape.slice(0, -2), a.shape.slice(0, -2)).concat([w, W]), F = s ? f(u, [B, D, w]) : f(u, [B, w, D]), R = n ? f(a, [N, W, b]) : f(a, [N, b, W]);
|
|
316
316
|
let S;
|
|
317
317
|
r != null && (S = i(r, "bias", "fused matMul"), [S] = q(S, u), P(O, S.shape));
|
|
318
|
-
let
|
|
319
|
-
l != null && (
|
|
320
|
-
const
|
|
318
|
+
let G;
|
|
319
|
+
l != null && (G = i(l, "prelu weights", "fused matMul"));
|
|
320
|
+
const I = (x, M) => {
|
|
321
321
|
const [g, $, k, z] = M, d = Ae(f(x, k.shape), k, c);
|
|
322
322
|
let K, _;
|
|
323
323
|
if (!s && !n ? (K = m(d, $, !1, !0), _ = m(g, d, !0, !1)) : !s && n ? (K = m(d, $, !1, !1), _ = m(d, g, !0, !1)) : s && !n ? (K = m($, d, !1, !0), _ = m(g, d, !1, !1)) : (K = m($, d, !0, !0), _ = m(d, g, !0, !0)), r != null) {
|
|
@@ -325,24 +325,24 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
|
|
|
325
325
|
return [K, _, Q];
|
|
326
326
|
} else
|
|
327
327
|
return [K, _];
|
|
328
|
-
},
|
|
328
|
+
}, v = {
|
|
329
329
|
a: F,
|
|
330
330
|
b: R,
|
|
331
331
|
bias: S,
|
|
332
|
-
preluActivationWeights:
|
|
332
|
+
preluActivationWeights: G
|
|
333
333
|
}, j = { transposeA: s, transposeB: n, activation: c, leakyreluAlpha: p };
|
|
334
334
|
return r == null ? U((M, g, $) => {
|
|
335
335
|
const k = (
|
|
336
336
|
// tslint:disable-next-line: no-unnecessary-type-assertion
|
|
337
|
-
o.runKernel(H,
|
|
337
|
+
o.runKernel(H, v, j)
|
|
338
338
|
);
|
|
339
|
-
return $([M, g, k]), { value: f(k, O), gradFunc:
|
|
339
|
+
return $([M, g, k]), { value: f(k, O), gradFunc: I };
|
|
340
340
|
})(F, R) : U((M, g, $, k) => {
|
|
341
341
|
const z = (
|
|
342
342
|
// tslint:disable-next-line: no-unnecessary-type-assertion
|
|
343
|
-
o.runKernel(H,
|
|
343
|
+
o.runKernel(H, v, j)
|
|
344
344
|
);
|
|
345
|
-
return k([M, g, z, $]), { value: f(z, O), gradFunc:
|
|
345
|
+
return k([M, g, z, $]), { value: f(z, O), gradFunc: I };
|
|
346
346
|
})(F, R, S);
|
|
347
347
|
}
|
|
348
348
|
const J = /* @__PURE__ */ h({ fusedMatMul_: Ne });
|
|
@@ -369,7 +369,7 @@ class E extends Error {
|
|
|
369
369
|
* https://opensource.org/licenses/MIT.
|
|
370
370
|
* =============================================================================
|
|
371
371
|
*/
|
|
372
|
-
function
|
|
372
|
+
function Ge(t, e, s, n) {
|
|
373
373
|
if (t.rank < 2 || e.rank < 2)
|
|
374
374
|
throw new E(`dot requires both inputs to be rank >= 2 but got x shape = ${t.shape} and y shape = ${e.shape}`);
|
|
375
375
|
if (e.rank >= 3) {
|
|
@@ -425,7 +425,7 @@ class Pe {
|
|
|
425
425
|
return this.tf.gather(this.tiedWeights, e, 0);
|
|
426
426
|
}
|
|
427
427
|
project(e) {
|
|
428
|
-
return
|
|
428
|
+
return Ge(e, this.tiedWeights.transpose());
|
|
429
429
|
}
|
|
430
430
|
getWeights() {
|
|
431
431
|
return [this.tiedWeights];
|
|
@@ -2,7 +2,9 @@ import { default as TF } from '@tensorflow/tfjs';
|
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
3
|
import { KVCache } from './CausalSelfAttention';
|
|
4
4
|
import { default as RoPECache } from './RoPECache';
|
|
5
|
-
|
|
5
|
+
import { default as MemoryProfiler } from '../utilities/profile';
|
|
6
|
+
import { default as BaseLayer } from './BaseLayer';
|
|
7
|
+
export default class Block extends BaseLayer {
|
|
6
8
|
private ln1;
|
|
7
9
|
private attn;
|
|
8
10
|
private ln2;
|
|
@@ -12,6 +14,7 @@ export default class Block {
|
|
|
12
14
|
private _trainable;
|
|
13
15
|
skipped: boolean;
|
|
14
16
|
constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
|
|
17
|
+
setProfiler(value: MemoryProfiler | undefined): void;
|
|
15
18
|
get variables(): TF.Variable[];
|
|
16
19
|
get trainable(): boolean;
|
|
17
20
|
set trainable(value: boolean);
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import
|
|
1
|
+
import a from "./CausalSelfAttention.js";
|
|
2
2
|
import o from "./MLP.js";
|
|
3
|
-
import
|
|
4
|
-
|
|
3
|
+
import r from "./RMSNorm.js";
|
|
4
|
+
import p from "./BaseLayer.js";
|
|
5
|
+
class f extends p {
|
|
5
6
|
ln1;
|
|
6
7
|
attn;
|
|
7
8
|
ln2;
|
|
@@ -11,7 +12,10 @@ class u {
|
|
|
11
12
|
_trainable = !0;
|
|
12
13
|
skipped = !1;
|
|
13
14
|
constructor(t, i, s, e) {
|
|
14
|
-
this.tf = t, this.index = i, this.ln1 = new
|
|
15
|
+
super(), this.tf = t, this.index = i, this.ln1 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new a(this.tf, this.index, s, e), this.ln2 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
|
|
16
|
+
}
|
|
17
|
+
setProfiler(t) {
|
|
18
|
+
this._profiler = t, this.attn.setProfiler(t), this.mlp.setProfiler(t), this.ln1.setProfiler(t), this.ln2.setProfiler(t);
|
|
15
19
|
}
|
|
16
20
|
get variables() {
|
|
17
21
|
return [
|
|
@@ -54,5 +58,5 @@ class u {
|
|
|
54
58
|
}
|
|
55
59
|
}
|
|
56
60
|
export {
|
|
57
|
-
|
|
61
|
+
f as default
|
|
58
62
|
};
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { o as
|
|
1
|
+
import { o as m, d as s, f as c, E as M, B as f } from "./index-CWQLouWz.js";
|
|
2
2
|
/**
|
|
3
3
|
* @license
|
|
4
4
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -15,13 +15,13 @@ import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
|
|
|
15
15
|
* limitations under the License.
|
|
16
16
|
* =============================================================================
|
|
17
17
|
*/
|
|
18
|
-
function
|
|
18
|
+
function p(e, o, n = !1, l = !1) {
|
|
19
19
|
let a = s(e, "a", "matMul"), t = s(o, "b", "matMul");
|
|
20
|
-
[a, t] =
|
|
20
|
+
[a, t] = c(a, t);
|
|
21
21
|
const r = { a, b: t }, u = { transposeA: n, transposeB: l };
|
|
22
|
-
return M.runKernel(
|
|
22
|
+
return M.runKernel(f, r, u);
|
|
23
23
|
}
|
|
24
|
-
const i = /* @__PURE__ */
|
|
24
|
+
const i = /* @__PURE__ */ m({ matMul_: p });
|
|
25
25
|
export {
|
|
26
26
|
i as m
|
|
27
27
|
};
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
import { engine as
|
|
2
|
-
import { r as
|
|
3
|
-
import { m as
|
|
4
|
-
class
|
|
1
|
+
import { engine as k } from "@tensorflow/tfjs";
|
|
2
|
+
import { r as m, c as d, s as p } from "../index-CWQLouWz.js";
|
|
3
|
+
import { m as f } from "../mat_mul-4v7St11W.js";
|
|
4
|
+
class h {
|
|
5
5
|
variableNames = ["q", "k", "mask"];
|
|
6
6
|
outputShape;
|
|
7
7
|
userCode;
|
|
8
8
|
// enableShapeUniforms = true;
|
|
9
9
|
customUniforms = [{ name: "divisor", type: "float" }];
|
|
10
|
-
constructor(
|
|
11
|
-
this.outputShape = [
|
|
10
|
+
constructor(e, n, s, a) {
|
|
11
|
+
this.outputShape = [e, n, s, s], this.userCode = `
|
|
12
12
|
void main() {
|
|
13
13
|
ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
|
|
14
14
|
int b = coords.x;
|
|
@@ -34,49 +34,55 @@ class f {
|
|
|
34
34
|
`;
|
|
35
35
|
}
|
|
36
36
|
}
|
|
37
|
-
function
|
|
38
|
-
const { q:
|
|
39
|
-
return o.runWebGLProgram(
|
|
37
|
+
function v(t) {
|
|
38
|
+
const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = e.shape[0], i = e.shape[2], c = e.shape[1], u = new h(r, c, i, e.shape[3]);
|
|
39
|
+
return o.runWebGLProgram(u, [e, n, s], "float32", [[a]]);
|
|
40
40
|
}
|
|
41
|
-
const
|
|
41
|
+
const b = {
|
|
42
42
|
kernelName: "AttentionMask",
|
|
43
43
|
backendName: "webgl",
|
|
44
|
-
kernelFunc:
|
|
44
|
+
kernelFunc: v
|
|
45
45
|
};
|
|
46
|
-
|
|
47
|
-
function
|
|
48
|
-
const { q:
|
|
46
|
+
m(b);
|
|
47
|
+
function l(t) {
|
|
48
|
+
const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = e.shape[2], i = f(e, n, !1, !0).mul(p(a)), c = s.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
|
|
49
49
|
return i.add(c);
|
|
50
50
|
}
|
|
51
51
|
const M = {
|
|
52
52
|
kernelName: "AttentionMask",
|
|
53
53
|
backendName: "cpu",
|
|
54
|
-
kernelFunc:
|
|
54
|
+
kernelFunc: l
|
|
55
55
|
};
|
|
56
|
-
|
|
57
|
-
function w(t, s, n, e) {
|
|
58
|
-
return l().runKernel("AttentionMask", { q: t, k: s, mask: n }, { divisor: e });
|
|
59
|
-
}
|
|
56
|
+
m(M);
|
|
60
57
|
const g = {
|
|
58
|
+
kernelName: "AttentionMask",
|
|
59
|
+
backendName: "tensorflow",
|
|
60
|
+
kernelFunc: l
|
|
61
|
+
};
|
|
62
|
+
m(g);
|
|
63
|
+
function N(t, e, n, s) {
|
|
64
|
+
return k().runKernel("AttentionMask", { q: t, k: e, mask: n }, { divisor: s });
|
|
65
|
+
}
|
|
66
|
+
const A = {
|
|
61
67
|
kernelName: "AttentionMask",
|
|
62
68
|
inputsToSave: ["q", "k"],
|
|
63
69
|
outputsToSave: [],
|
|
64
|
-
gradFunc: (t,
|
|
70
|
+
gradFunc: (t, e, n) => {
|
|
65
71
|
if (Array.isArray(t))
|
|
66
72
|
throw new Error("Expected dy to be a single Tensor");
|
|
67
|
-
const [
|
|
73
|
+
const [s, a] = e, { divisor: o } = n;
|
|
68
74
|
return {
|
|
69
75
|
q: () => t.matMul(a).mul(o),
|
|
70
|
-
k: () =>
|
|
76
|
+
k: () => s.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
|
|
71
77
|
mask: () => t,
|
|
72
78
|
divisor: () => {
|
|
73
|
-
const r =
|
|
79
|
+
const r = s.matMul(a, !1, !0);
|
|
74
80
|
return t.mul(r).sum();
|
|
75
81
|
}
|
|
76
82
|
};
|
|
77
83
|
}
|
|
78
84
|
};
|
|
79
|
-
|
|
85
|
+
d(A);
|
|
80
86
|
export {
|
|
81
|
-
|
|
87
|
+
N as attentionMask
|
|
82
88
|
};
|
package/dist/ops/gatherSub.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { engine as l } from "@tensorflow/tfjs";
|
|
2
|
-
import { o as g,
|
|
3
|
-
import { r as p, s as f } from "../stack-
|
|
2
|
+
import { o as g, d as i, E as b, G as d, r as c, b as h } from "../index-CWQLouWz.js";
|
|
3
|
+
import { r as p, s as f } from "../stack-CTdK-itU.js";
|
|
4
4
|
/**
|
|
5
5
|
* @license
|
|
6
6
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
package/dist/ops/scatterSub.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { engine as $ } from "@tensorflow/tfjs";
|
|
2
|
-
import {
|
|
3
|
-
import { c as
|
|
4
|
-
import { r as v, s as T } from "../stack-
|
|
2
|
+
import { l as u, n as S, p, E as f, q as E, o as N, d as l, t as y, r as h, b as D, a as x } from "../index-CWQLouWz.js";
|
|
3
|
+
import { c as d } from "../complex-x7w5HPOS.js";
|
|
4
|
+
import { r as v, s as T } from "../stack-CTdK-itU.js";
|
|
5
5
|
/**
|
|
6
6
|
* @license
|
|
7
7
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
@@ -21,7 +21,7 @@ import { r as v, s as T } from "../stack-1o648CP_.js";
|
|
|
21
21
|
function i(e, t = "float32") {
|
|
22
22
|
if (u(e), t === "complex64") {
|
|
23
23
|
const a = i(e, "float32"), o = i(e, "float32");
|
|
24
|
-
return
|
|
24
|
+
return d(a, o);
|
|
25
25
|
}
|
|
26
26
|
const r = S(p(e), t);
|
|
27
27
|
return f.makeTensor(r, e, t);
|
|
@@ -42,10 +42,10 @@ function i(e, t = "float32") {
|
|
|
42
42
|
* limitations under the License.
|
|
43
43
|
* =============================================================================
|
|
44
44
|
*/
|
|
45
|
-
function
|
|
45
|
+
function m(e, t = "float32") {
|
|
46
46
|
if (u(e), t === "complex64") {
|
|
47
|
-
const a =
|
|
48
|
-
return
|
|
47
|
+
const a = m(e, "float32"), o = i(e, "float32");
|
|
48
|
+
return d(a, o);
|
|
49
49
|
}
|
|
50
50
|
const r = E(p(e), t);
|
|
51
51
|
return f.makeTensor(r, e, t);
|
|
@@ -133,7 +133,7 @@ const K = {
|
|
|
133
133
|
};
|
|
134
134
|
h(K);
|
|
135
135
|
function A(e) {
|
|
136
|
-
const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b =
|
|
136
|
+
const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b = m([o]), g = I(c, b, [o, s]), k = D(t, g), w = a.reshape([o, 1]);
|
|
137
137
|
return x(k, w);
|
|
138
138
|
}
|
|
139
139
|
const F = {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { E as e, R as c, o as f,
|
|
1
|
+
import { E as e, R as c, o as f, h as i, i as a, P as u } from "./index-CWQLouWz.js";
|
|
2
2
|
/**
|
|
3
3
|
* @license
|
|
4
4
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
@@ -15,7 +15,7 @@ import { E as e, R as c, o as f, g as u, h as a, P as i } from "./index-Dsg28SG6
|
|
|
15
15
|
* limitations under the License.
|
|
16
16
|
* =============================================================================
|
|
17
17
|
*/
|
|
18
|
-
function
|
|
18
|
+
function l(n, s, t = 1, r = "float32") {
|
|
19
19
|
if (t === 0)
|
|
20
20
|
throw new Error("Cannot have a step of zero");
|
|
21
21
|
const o = { start: n, stop: s, step: t, dtype: r };
|
|
@@ -38,13 +38,13 @@ function h(n, s, t = 1, r = "float32") {
|
|
|
38
38
|
* =============================================================================
|
|
39
39
|
*/
|
|
40
40
|
function k(n, s = 0) {
|
|
41
|
-
const t =
|
|
41
|
+
const t = i(n, "tensors", "stack", "string_or_numeric");
|
|
42
42
|
a(t.length >= 1, () => "Pass at least one tensor to tf.stack"), t.length > 0 && a(s <= t[0].rank, () => "Axis must be <= rank of the tensor");
|
|
43
43
|
const r = t, o = { axis: s };
|
|
44
|
-
return e.runKernel(
|
|
44
|
+
return e.runKernel(u, r, o);
|
|
45
45
|
}
|
|
46
|
-
const
|
|
46
|
+
const g = /* @__PURE__ */ f({ stack_: k });
|
|
47
47
|
export {
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
l as r,
|
|
49
|
+
g as s
|
|
50
50
|
};
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { o,
|
|
1
|
+
import { o, d as a, E as u, j as p, k as i, S as x } from "./index-CWQLouWz.js";
|
|
2
2
|
/**
|
|
3
3
|
* @license
|
|
4
4
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -17,7 +17,7 @@ import { o, c as a, E as u, i, j as p, S as x } from "./index-Dsg28SG6.js";
|
|
|
17
17
|
*/
|
|
18
18
|
function l(n, t) {
|
|
19
19
|
const s = { x: a(n, "x", "reshape", "string_or_numeric") }, r = { shape: t };
|
|
20
|
-
return u.runKernel(
|
|
20
|
+
return u.runKernel(p, s, r);
|
|
21
21
|
}
|
|
22
22
|
const h = /* @__PURE__ */ o({ reshape_: l });
|
|
23
23
|
/**
|
|
@@ -38,7 +38,7 @@ const h = /* @__PURE__ */ o({ reshape_: l });
|
|
|
38
38
|
*/
|
|
39
39
|
function m(n, t = null, e = !1) {
|
|
40
40
|
let s = a(n, "x", "sum");
|
|
41
|
-
s.dtype === "bool" && (s =
|
|
41
|
+
s.dtype === "bool" && (s = i(s, "int32"));
|
|
42
42
|
const r = { x: s }, c = { axis: t, keepDims: e };
|
|
43
43
|
return u.runKernel(x, r, c);
|
|
44
44
|
}
|
package/dist/training/AdamExt.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as r,
|
|
1
|
+
import { A as r, a as c, s as h, b as g, e as o } from "../index-CWQLouWz.js";
|
|
2
2
|
class u extends r {
|
|
3
3
|
constructor(t, e, s, a, i) {
|
|
4
4
|
super(t, e, s, a), this.config = i, this.startLearningRate = t;
|
package/dist/training/Trainer.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { DatasetBuilder as d } from "./DatasetBuilder.js";
|
|
2
|
-
import
|
|
3
|
-
class
|
|
4
|
-
constructor(t,
|
|
5
|
-
this.tokenizer =
|
|
2
|
+
import h from "./AdamExt.js";
|
|
3
|
+
class g {
|
|
4
|
+
constructor(t, s, e, i = 1e-3) {
|
|
5
|
+
this.tokenizer = e, this.tf = t, this.model = s, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, e, s.config.blockSize);
|
|
6
6
|
}
|
|
7
7
|
model;
|
|
8
8
|
optimizer;
|
|
@@ -25,7 +25,7 @@ class u {
|
|
|
25
25
|
}
|
|
26
26
|
resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
|
|
27
27
|
this.optimizer && this.optimizer.dispose();
|
|
28
|
-
const
|
|
28
|
+
const s = new h(
|
|
29
29
|
t.learningRateFactor * this.learningRate,
|
|
30
30
|
t.beta1,
|
|
31
31
|
t.beta2,
|
|
@@ -37,58 +37,59 @@ class u {
|
|
|
37
37
|
weightDecay: 0
|
|
38
38
|
}
|
|
39
39
|
);
|
|
40
|
-
this.optimizer =
|
|
40
|
+
this.optimizer = s;
|
|
41
41
|
}
|
|
42
42
|
printGradients(t) {
|
|
43
|
-
Object.keys(t).forEach((
|
|
44
|
-
const
|
|
45
|
-
console.log(`${
|
|
43
|
+
Object.keys(t).forEach((s) => {
|
|
44
|
+
const e = t[s];
|
|
45
|
+
console.log(`${s}:`), console.log(` Shape: ${e.shape}`), console.log(` Mean: ${this.tf.mean(e).dataSync()[0]}`), console.log(` Std: ${this.tf.moments(e).variance.sqrt().dataSync()[0]}`), console.log(` Min: ${this.tf.min(e).dataSync()[0]}`), console.log(` Max: ${this.tf.max(e).dataSync()[0]}`), console.log(` Norm: ${this.tf.norm(e).dataSync()[0]}`);
|
|
46
46
|
});
|
|
47
47
|
}
|
|
48
|
-
trainStep(t,
|
|
48
|
+
trainStep(t, s = !1, e = !1) {
|
|
49
49
|
return this.tf.tidy(() => {
|
|
50
|
+
this.model.getProfiler()?.startMemory();
|
|
50
51
|
const { xs: i, ys: a } = t, o = () => {
|
|
51
52
|
const { loss: l, logits: c } = this.model.forward(i, a, !0);
|
|
52
53
|
return c.dispose(), l;
|
|
53
54
|
}, { value: n, grads: r } = this.tf.variableGrads(o);
|
|
54
|
-
return
|
|
55
|
+
return s ? this.model.getProfiler()?.endMemory("Training") : (e && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), this.tf.dispose(r)), n;
|
|
55
56
|
});
|
|
56
57
|
}
|
|
57
58
|
dummyPass() {
|
|
58
|
-
const t = this.tf.zeros([1, this.model.config.blockSize], "int32"),
|
|
59
|
+
const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), s = this.tf.zeros([1, this.model.config.blockSize], "int32");
|
|
59
60
|
try {
|
|
60
|
-
const
|
|
61
|
-
|
|
62
|
-
} catch (
|
|
63
|
-
console.error("Error during dummy pass:",
|
|
61
|
+
const e = this.trainStep({ xs: t, ys: s }, !0);
|
|
62
|
+
e.dataSync(), e.dispose();
|
|
63
|
+
} catch (e) {
|
|
64
|
+
console.error("Error during dummy pass:", e);
|
|
64
65
|
} finally {
|
|
65
|
-
t.dispose(),
|
|
66
|
+
t.dispose(), s.dispose();
|
|
66
67
|
}
|
|
67
68
|
}
|
|
68
|
-
async trainBatch(t,
|
|
69
|
+
async trainBatch(t, s) {
|
|
69
70
|
try {
|
|
70
|
-
const
|
|
71
|
-
return
|
|
72
|
-
} catch (
|
|
73
|
-
throw console.error(`Error processing batch at step ${t.step}:`,
|
|
71
|
+
const e = this.trainStep(s, !1, !1);
|
|
72
|
+
return s.xs.dispose(), s.ys.dispose(), t.step++, t.totalSteps++, e.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), e.dispose(), t.lastLoss));
|
|
73
|
+
} catch (e) {
|
|
74
|
+
throw console.error(`Error processing batch at step ${t.step}:`, e), this.tf.dispose(), e;
|
|
74
75
|
}
|
|
75
76
|
}
|
|
76
|
-
async createTrainValidationSplit(t,
|
|
77
|
-
const i = await this.datasetBuilder.createTextDataset(t,
|
|
77
|
+
async createTrainValidationSplit(t, s = 32, e = 0.1) {
|
|
78
|
+
const i = await this.datasetBuilder.createTextDataset(t, s, 0, 1 - e), a = await this.datasetBuilder.createTextDataset(
|
|
78
79
|
t,
|
|
79
|
-
|
|
80
|
-
1 -
|
|
80
|
+
s,
|
|
81
|
+
1 - e,
|
|
81
82
|
1
|
|
82
83
|
);
|
|
83
84
|
return { trainDataset: i, validationDataset: a };
|
|
84
85
|
}
|
|
85
|
-
async createDataset(t,
|
|
86
|
-
return await this.datasetBuilder.createTextDataset(t,
|
|
86
|
+
async createDataset(t, s = 32) {
|
|
87
|
+
return await this.datasetBuilder.createTextDataset(t, s);
|
|
87
88
|
}
|
|
88
89
|
dispose() {
|
|
89
90
|
this.optimizer && this.optimizer.dispose();
|
|
90
91
|
}
|
|
91
92
|
}
|
|
92
93
|
export {
|
|
93
|
-
|
|
94
|
+
g as default
|
|
94
95
|
};
|