@genai-fi/nanogpt 0.2.8 → 0.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.d.ts +2 -0
- package/dist/Generator.js +37 -32
- package/dist/NanoGPTModel.d.ts +4 -1
- package/dist/NanoGPTModel.js +33 -25
- package/dist/TeachableLLM.d.ts +4 -0
- package/dist/TeachableLLM.js +31 -16
- package/dist/{complex-CeoYJn2o.js → complex-x7w5HPOS.js} +6 -6
- package/dist/{index-DQfEAU9u.js → index-CWQLouWz.js} +312 -303
- package/dist/layers/BaseLayer.d.ts +8 -0
- package/dist/layers/BaseLayer.js +18 -0
- package/dist/layers/CausalSelfAttention.d.ts +2 -1
- package/dist/layers/CausalSelfAttention.js +10 -8
- package/dist/layers/MLP.d.ts +2 -1
- package/dist/layers/MLP.js +16 -14
- package/dist/layers/RMSNorm.d.ts +2 -1
- package/dist/layers/RMSNorm.js +13 -11
- package/dist/layers/TiedEmbedding.js +4 -4
- package/dist/layers/TransformerBlock.d.ts +4 -1
- package/dist/layers/TransformerBlock.js +9 -5
- package/dist/{mat_mul-CuHB58-H.js → mat_mul-4v7St11W.js} +5 -5
- package/dist/ops/attentionMask.js +47 -21
- package/dist/ops/gatherSub.js +2 -2
- package/dist/ops/node/sparseCrossEntropy.js +1 -1
- package/dist/ops/scatterSub.js +10 -10
- package/dist/{stack-C9cTkqpq.js → stack-CTdK-itU.js} +5 -5
- package/dist/{sum-B-O33dgG.js → sum-CnIf1YOh.js} +3 -3
- package/dist/training/AdamExt.js +1 -1
- package/dist/training/Trainer.js +30 -29
- package/dist/training/sparseCrossEntropy.js +12 -12
- package/dist/utilities/profile.d.ts +10 -0
- package/dist/utilities/profile.js +29 -0
- package/package.json +1 -1
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { default as MemoryProfiler } from '../utilities/profile';
|
|
2
|
+
export default abstract class BaseLayer {
|
|
3
|
+
protected _profiler?: MemoryProfiler;
|
|
4
|
+
getProfiler(): MemoryProfiler | undefined;
|
|
5
|
+
setProfiler(value: MemoryProfiler | undefined): void;
|
|
6
|
+
startMemory(): void;
|
|
7
|
+
endMemory(label: string): void;
|
|
8
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
class t {
|
|
2
|
+
_profiler;
|
|
3
|
+
getProfiler() {
|
|
4
|
+
return this._profiler;
|
|
5
|
+
}
|
|
6
|
+
setProfiler(r) {
|
|
7
|
+
this._profiler = r;
|
|
8
|
+
}
|
|
9
|
+
startMemory() {
|
|
10
|
+
this._profiler?.startMemory();
|
|
11
|
+
}
|
|
12
|
+
endMemory(r) {
|
|
13
|
+
this._profiler?.endMemory(r);
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
export {
|
|
17
|
+
t as default
|
|
18
|
+
};
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
3
|
import { default as RoPECache } from './RoPECache';
|
|
4
|
+
import { default as BaseLayer } from './BaseLayer';
|
|
4
5
|
export type KVCache = {
|
|
5
6
|
k: TF.Tensor;
|
|
6
7
|
v: TF.Tensor;
|
|
7
8
|
length: number;
|
|
8
9
|
cumulativeLength: number;
|
|
9
10
|
};
|
|
10
|
-
export default class CausalSelfAttention {
|
|
11
|
+
export default class CausalSelfAttention extends BaseLayer {
|
|
11
12
|
private readonly ropeCache?;
|
|
12
13
|
private config;
|
|
13
14
|
private cAttn;
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { attentionMask as z } from "../ops/attentionMask.js";
|
|
2
|
-
|
|
2
|
+
import S from "./BaseLayer.js";
|
|
3
|
+
class C extends S {
|
|
3
4
|
constructor(t, i, s, e) {
|
|
4
|
-
this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
|
|
5
|
+
super(), this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.cAttn = this.tf.layers.dense({
|
|
5
6
|
units: 3 * s.nEmbed,
|
|
6
7
|
useBias: s.biasInLinear,
|
|
7
8
|
name: `block_${i}_attn_cAttn`,
|
|
@@ -94,23 +95,24 @@ class j {
|
|
|
94
95
|
if (e && !this.config.useRope)
|
|
95
96
|
throw new Error("Cannot use pastKV without RoPE enabled");
|
|
96
97
|
return this.tf.tidy(() => {
|
|
98
|
+
this.startMemory();
|
|
97
99
|
const [o, c, r] = this.getQKV(t), h = o.shape[2], a = this.config.blockSize, u = e ? e.cumulativeLength : 0, [f, d] = this.ropeCache ? this.ropeCache.applyRoPE(o, c, u) : [o, c];
|
|
98
100
|
let n = d, l = r, p = 0;
|
|
99
101
|
e && (p = e.length, n = this.tf.concat([e.k, d], 2), l = this.tf.concat([e.v, r], 2));
|
|
100
102
|
const b = n.shape[2];
|
|
101
103
|
if (b > a) {
|
|
102
|
-
const k = b - a, g = n.shape[0],
|
|
103
|
-
n = n.slice([0, 0, k, 0], [g,
|
|
104
|
+
const k = b - a, g = n.shape[0], A = n.shape[1], I = n.shape[3];
|
|
105
|
+
n = n.slice([0, 0, k, 0], [g, A, a, I]), l = l.slice([0, 0, k, 0], [g, A, a, I]), p = a - h;
|
|
104
106
|
}
|
|
105
107
|
let m;
|
|
106
108
|
p > 0 ? m = this.getAttentionScoresWithPast(f, n, i, p) : m = this.getAttentionScores(f, n, i);
|
|
107
|
-
const
|
|
109
|
+
const _ = this.tf.matMul(m, l), v = this.getOutputProjection(_, i), y = {
|
|
108
110
|
k: this.tf.keep(n),
|
|
109
111
|
v: this.tf.keep(l),
|
|
110
112
|
length: p + h,
|
|
111
113
|
cumulativeLength: e ? e.cumulativeLength + h : h
|
|
112
|
-
};
|
|
113
|
-
return { output:
|
|
114
|
+
}, P = s ? m.mean(1) : void 0;
|
|
115
|
+
return this.endMemory("CausalSelfAttention"), { output: v, attention: P, presentKV: y };
|
|
114
116
|
});
|
|
115
117
|
}
|
|
116
118
|
dispose() {
|
|
@@ -118,5 +120,5 @@ class j {
|
|
|
118
120
|
}
|
|
119
121
|
}
|
|
120
122
|
export {
|
|
121
|
-
|
|
123
|
+
C as default
|
|
122
124
|
};
|
package/dist/layers/MLP.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { default as TF } from '@tensorflow/tfjs';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
|
-
|
|
3
|
+
import { default as BaseLayer } from './BaseLayer';
|
|
4
|
+
export default class MLP extends BaseLayer {
|
|
4
5
|
private cFc;
|
|
5
6
|
private cProj;
|
|
6
7
|
private dropout;
|
package/dist/layers/MLP.js
CHANGED
|
@@ -1,31 +1,32 @@
|
|
|
1
|
-
|
|
1
|
+
import a from "./BaseLayer.js";
|
|
2
|
+
class l extends a {
|
|
2
3
|
cFc;
|
|
3
4
|
cProj;
|
|
4
5
|
dropout;
|
|
5
6
|
tf;
|
|
6
7
|
index;
|
|
7
8
|
_trainable = !0;
|
|
8
|
-
constructor(t,
|
|
9
|
-
this.tf = t, this.index =
|
|
10
|
-
units:
|
|
9
|
+
constructor(t, i, e) {
|
|
10
|
+
super(), this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
|
|
11
|
+
units: e.mlpFactor * e.nEmbed,
|
|
11
12
|
activation: "gelu",
|
|
12
|
-
useBias:
|
|
13
|
+
useBias: e.biasInLinear,
|
|
13
14
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
14
15
|
mean: 0,
|
|
15
16
|
stddev: 0.02
|
|
16
17
|
}),
|
|
17
18
|
biasInitializer: "zeros",
|
|
18
|
-
name: `block_${
|
|
19
|
+
name: `block_${i}_mlp_cFc`
|
|
19
20
|
}), this.cProj = this.tf.layers.dense({
|
|
20
|
-
units:
|
|
21
|
-
useBias:
|
|
21
|
+
units: e.nEmbed,
|
|
22
|
+
useBias: e.biasInLinear,
|
|
22
23
|
kernelInitializer: this.tf.initializers.randomNormal({
|
|
23
24
|
mean: 0,
|
|
24
|
-
stddev: 0.02 / Math.sqrt(2 *
|
|
25
|
+
stddev: 0.02 / Math.sqrt(2 * e.nLayer)
|
|
25
26
|
}),
|
|
26
27
|
biasInitializer: "zeros",
|
|
27
|
-
name: `block_${
|
|
28
|
-
}), this.dropout = this.tf.layers.dropout({ rate:
|
|
28
|
+
name: `block_${i}_mlp_cProj`
|
|
29
|
+
}), this.dropout = this.tf.layers.dropout({ rate: e.dropout });
|
|
29
30
|
}
|
|
30
31
|
get variables() {
|
|
31
32
|
return [
|
|
@@ -45,10 +46,11 @@ class l {
|
|
|
45
46
|
loadWeights(t) {
|
|
46
47
|
this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
|
|
47
48
|
}
|
|
48
|
-
call(t,
|
|
49
|
+
call(t, i = !1) {
|
|
49
50
|
return this.tf.tidy(() => {
|
|
50
|
-
|
|
51
|
-
|
|
51
|
+
this.startMemory();
|
|
52
|
+
const e = this.cFc.apply(t), s = this.cProj.apply(e), r = this.dropout.apply(s, { training: i });
|
|
53
|
+
return this.endMemory("MLP"), r;
|
|
52
54
|
});
|
|
53
55
|
}
|
|
54
56
|
dispose() {
|
package/dist/layers/RMSNorm.d.ts
CHANGED
package/dist/layers/RMSNorm.js
CHANGED
|
@@ -1,26 +1,28 @@
|
|
|
1
|
-
|
|
1
|
+
import m from "./BaseLayer.js";
|
|
2
|
+
class o extends m {
|
|
2
3
|
gamma;
|
|
3
4
|
epsilon;
|
|
4
5
|
tf;
|
|
5
|
-
constructor(
|
|
6
|
-
this.tf =
|
|
6
|
+
constructor(t, s, a = 1e-8, e = "") {
|
|
7
|
+
super(), this.tf = t, this.epsilon = a, this.gamma = t.variable(t.ones(s), !0, `${e}_gamma`, "float32");
|
|
7
8
|
}
|
|
8
9
|
get trainableWeights() {
|
|
9
10
|
return [this.gamma];
|
|
10
11
|
}
|
|
11
|
-
set trainable(
|
|
12
|
-
this.gamma.trainable =
|
|
12
|
+
set trainable(t) {
|
|
13
|
+
this.gamma.trainable = t;
|
|
13
14
|
}
|
|
14
15
|
getWeights() {
|
|
15
16
|
return [this.gamma];
|
|
16
17
|
}
|
|
17
|
-
setWeights(
|
|
18
|
-
this.gamma.assign(
|
|
18
|
+
setWeights(t) {
|
|
19
|
+
this.gamma.assign(t[0]);
|
|
19
20
|
}
|
|
20
|
-
apply(
|
|
21
|
+
apply(t) {
|
|
21
22
|
return this.tf.tidy(() => {
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
this.startMemory();
|
|
24
|
+
const a = t.square().mean(-1, !0).add(this.epsilon).rsqrt(), r = t.mul(a).mul(this.gamma);
|
|
25
|
+
return this.endMemory("RMSNorm"), r;
|
|
24
26
|
});
|
|
25
27
|
}
|
|
26
28
|
dispose() {
|
|
@@ -28,5 +30,5 @@ class m {
|
|
|
28
30
|
}
|
|
29
31
|
}
|
|
30
32
|
export {
|
|
31
|
-
|
|
33
|
+
o as default
|
|
32
34
|
};
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { o as h,
|
|
2
|
-
import { s as ce, r as f } from "../sum-
|
|
3
|
-
import { m } from "../mat_mul-
|
|
4
|
-
import { c as pe } from "../complex-
|
|
1
|
+
import { o as h, d as i, E as o, F as V, H as X, I as Y, J as Z, N as ee, K as te, O as se, Q as ne, T as re, U as ue, i as L, z as ae, V as A, a as ie, W as oe, w as le, f as q, p as C, X as P, y as U, _ as H } from "../index-CWQLouWz.js";
|
|
2
|
+
import { s as ce, r as f } from "../sum-CnIf1YOh.js";
|
|
3
|
+
import { m } from "../mat_mul-4v7St11W.js";
|
|
4
|
+
import { c as pe } from "../complex-x7w5HPOS.js";
|
|
5
5
|
/**
|
|
6
6
|
* @license
|
|
7
7
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
@@ -2,7 +2,9 @@ import { default as TF } from '@tensorflow/tfjs';
|
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
3
|
import { KVCache } from './CausalSelfAttention';
|
|
4
4
|
import { default as RoPECache } from './RoPECache';
|
|
5
|
-
|
|
5
|
+
import { default as MemoryProfiler } from '../utilities/profile';
|
|
6
|
+
import { default as BaseLayer } from './BaseLayer';
|
|
7
|
+
export default class Block extends BaseLayer {
|
|
6
8
|
private ln1;
|
|
7
9
|
private attn;
|
|
8
10
|
private ln2;
|
|
@@ -12,6 +14,7 @@ export default class Block {
|
|
|
12
14
|
private _trainable;
|
|
13
15
|
skipped: boolean;
|
|
14
16
|
constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
|
|
17
|
+
setProfiler(value: MemoryProfiler | undefined): void;
|
|
15
18
|
get variables(): TF.Variable[];
|
|
16
19
|
get trainable(): boolean;
|
|
17
20
|
set trainable(value: boolean);
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import
|
|
1
|
+
import a from "./CausalSelfAttention.js";
|
|
2
2
|
import o from "./MLP.js";
|
|
3
|
-
import
|
|
4
|
-
|
|
3
|
+
import r from "./RMSNorm.js";
|
|
4
|
+
import p from "./BaseLayer.js";
|
|
5
|
+
class f extends p {
|
|
5
6
|
ln1;
|
|
6
7
|
attn;
|
|
7
8
|
ln2;
|
|
@@ -11,7 +12,10 @@ class u {
|
|
|
11
12
|
_trainable = !0;
|
|
12
13
|
skipped = !1;
|
|
13
14
|
constructor(t, i, s, e) {
|
|
14
|
-
this.tf = t, this.index = i, this.ln1 = new
|
|
15
|
+
super(), this.tf = t, this.index = i, this.ln1 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new a(this.tf, this.index, s, e), this.ln2 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
|
|
16
|
+
}
|
|
17
|
+
setProfiler(t) {
|
|
18
|
+
this._profiler = t, this.attn.setProfiler(t), this.mlp.setProfiler(t), this.ln1.setProfiler(t), this.ln2.setProfiler(t);
|
|
15
19
|
}
|
|
16
20
|
get variables() {
|
|
17
21
|
return [
|
|
@@ -54,5 +58,5 @@ class u {
|
|
|
54
58
|
}
|
|
55
59
|
}
|
|
56
60
|
export {
|
|
57
|
-
|
|
61
|
+
f as default
|
|
58
62
|
};
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { o as
|
|
1
|
+
import { o as m, d as s, f as c, E as M, B as f } from "./index-CWQLouWz.js";
|
|
2
2
|
/**
|
|
3
3
|
* @license
|
|
4
4
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -15,13 +15,13 @@ import { o as c, c as s, b as m, E as M, B as p } from "./index-DQfEAU9u.js";
|
|
|
15
15
|
* limitations under the License.
|
|
16
16
|
* =============================================================================
|
|
17
17
|
*/
|
|
18
|
-
function
|
|
18
|
+
function p(e, o, n = !1, l = !1) {
|
|
19
19
|
let a = s(e, "a", "matMul"), t = s(o, "b", "matMul");
|
|
20
|
-
[a, t] =
|
|
20
|
+
[a, t] = c(a, t);
|
|
21
21
|
const r = { a, b: t }, u = { transposeA: n, transposeB: l };
|
|
22
|
-
return M.runKernel(
|
|
22
|
+
return M.runKernel(f, r, u);
|
|
23
23
|
}
|
|
24
|
-
const i = /* @__PURE__ */
|
|
24
|
+
const i = /* @__PURE__ */ m({ matMul_: p });
|
|
25
25
|
export {
|
|
26
26
|
i as m
|
|
27
27
|
};
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
import { engine as
|
|
2
|
-
import { r as
|
|
3
|
-
import { m as
|
|
4
|
-
class
|
|
1
|
+
import { engine as k } from "@tensorflow/tfjs";
|
|
2
|
+
import { r as m, c as d, s as p } from "../index-CWQLouWz.js";
|
|
3
|
+
import { m as f } from "../mat_mul-4v7St11W.js";
|
|
4
|
+
class h {
|
|
5
5
|
variableNames = ["q", "k", "mask"];
|
|
6
6
|
outputShape;
|
|
7
7
|
userCode;
|
|
8
8
|
// enableShapeUniforms = true;
|
|
9
9
|
customUniforms = [{ name: "divisor", type: "float" }];
|
|
10
|
-
constructor(
|
|
11
|
-
this.outputShape = [
|
|
10
|
+
constructor(e, n, s, a) {
|
|
11
|
+
this.outputShape = [e, n, s, s], this.userCode = `
|
|
12
12
|
void main() {
|
|
13
13
|
ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
|
|
14
14
|
int b = coords.x;
|
|
@@ -34,29 +34,55 @@ class p {
|
|
|
34
34
|
`;
|
|
35
35
|
}
|
|
36
36
|
}
|
|
37
|
-
function
|
|
38
|
-
const { q:
|
|
39
|
-
return o.runWebGLProgram(
|
|
37
|
+
function v(t) {
|
|
38
|
+
const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = e.shape[0], i = e.shape[2], c = e.shape[1], u = new h(r, c, i, e.shape[3]);
|
|
39
|
+
return o.runWebGLProgram(u, [e, n, s], "float32", [[a]]);
|
|
40
40
|
}
|
|
41
|
-
const
|
|
41
|
+
const b = {
|
|
42
42
|
kernelName: "AttentionMask",
|
|
43
43
|
backendName: "webgl",
|
|
44
|
-
kernelFunc:
|
|
44
|
+
kernelFunc: v
|
|
45
45
|
};
|
|
46
|
-
|
|
47
|
-
function
|
|
48
|
-
const { q:
|
|
49
|
-
return i.add(
|
|
46
|
+
m(b);
|
|
47
|
+
function l(t) {
|
|
48
|
+
const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = e.shape[2], i = f(e, n, !1, !0).mul(p(a)), c = s.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
|
|
49
|
+
return i.add(c);
|
|
50
50
|
}
|
|
51
|
-
const
|
|
51
|
+
const M = {
|
|
52
52
|
kernelName: "AttentionMask",
|
|
53
53
|
backendName: "cpu",
|
|
54
|
-
kernelFunc:
|
|
54
|
+
kernelFunc: l
|
|
55
55
|
};
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
56
|
+
m(M);
|
|
57
|
+
const g = {
|
|
58
|
+
kernelName: "AttentionMask",
|
|
59
|
+
backendName: "tensorflow",
|
|
60
|
+
kernelFunc: l
|
|
61
|
+
};
|
|
62
|
+
m(g);
|
|
63
|
+
function N(t, e, n, s) {
|
|
64
|
+
return k().runKernel("AttentionMask", { q: t, k: e, mask: n }, { divisor: s });
|
|
59
65
|
}
|
|
66
|
+
const A = {
|
|
67
|
+
kernelName: "AttentionMask",
|
|
68
|
+
inputsToSave: ["q", "k"],
|
|
69
|
+
outputsToSave: [],
|
|
70
|
+
gradFunc: (t, e, n) => {
|
|
71
|
+
if (Array.isArray(t))
|
|
72
|
+
throw new Error("Expected dy to be a single Tensor");
|
|
73
|
+
const [s, a] = e, { divisor: o } = n;
|
|
74
|
+
return {
|
|
75
|
+
q: () => t.matMul(a).mul(o),
|
|
76
|
+
k: () => s.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
|
|
77
|
+
mask: () => t,
|
|
78
|
+
divisor: () => {
|
|
79
|
+
const r = s.matMul(a, !1, !0);
|
|
80
|
+
return t.mul(r).sum();
|
|
81
|
+
}
|
|
82
|
+
};
|
|
83
|
+
}
|
|
84
|
+
};
|
|
85
|
+
d(A);
|
|
60
86
|
export {
|
|
61
|
-
|
|
87
|
+
N as attentionMask
|
|
62
88
|
};
|
package/dist/ops/gatherSub.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { engine as l } from "@tensorflow/tfjs";
|
|
2
|
-
import { o as g,
|
|
3
|
-
import { r as p, s as f } from "../stack-
|
|
2
|
+
import { o as g, d as i, E as b, G as d, r as c, b as h } from "../index-CWQLouWz.js";
|
|
3
|
+
import { r as p, s as f } from "../stack-CTdK-itU.js";
|
|
4
4
|
/**
|
|
5
5
|
* @license
|
|
6
6
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
package/dist/ops/scatterSub.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { engine as $ } from "@tensorflow/tfjs";
|
|
2
|
-
import {
|
|
3
|
-
import { c as
|
|
4
|
-
import { r as v, s as T } from "../stack-
|
|
2
|
+
import { l as u, n as S, p, E as f, q as E, o as N, d as l, t as y, r as h, b as D, a as x } from "../index-CWQLouWz.js";
|
|
3
|
+
import { c as d } from "../complex-x7w5HPOS.js";
|
|
4
|
+
import { r as v, s as T } from "../stack-CTdK-itU.js";
|
|
5
5
|
/**
|
|
6
6
|
* @license
|
|
7
7
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
@@ -21,7 +21,7 @@ import { r as v, s as T } from "../stack-C9cTkqpq.js";
|
|
|
21
21
|
function i(e, t = "float32") {
|
|
22
22
|
if (u(e), t === "complex64") {
|
|
23
23
|
const a = i(e, "float32"), o = i(e, "float32");
|
|
24
|
-
return
|
|
24
|
+
return d(a, o);
|
|
25
25
|
}
|
|
26
26
|
const r = S(p(e), t);
|
|
27
27
|
return f.makeTensor(r, e, t);
|
|
@@ -42,10 +42,10 @@ function i(e, t = "float32") {
|
|
|
42
42
|
* limitations under the License.
|
|
43
43
|
* =============================================================================
|
|
44
44
|
*/
|
|
45
|
-
function
|
|
45
|
+
function m(e, t = "float32") {
|
|
46
46
|
if (u(e), t === "complex64") {
|
|
47
|
-
const a =
|
|
48
|
-
return
|
|
47
|
+
const a = m(e, "float32"), o = i(e, "float32");
|
|
48
|
+
return d(a, o);
|
|
49
49
|
}
|
|
50
50
|
const r = E(p(e), t);
|
|
51
51
|
return f.makeTensor(r, e, t);
|
|
@@ -133,7 +133,7 @@ const K = {
|
|
|
133
133
|
};
|
|
134
134
|
h(K);
|
|
135
135
|
function A(e) {
|
|
136
|
-
const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b =
|
|
136
|
+
const { logits: t, labels: r, dy: a } = e.inputs, o = r.shape[0], s = t.shape[1], n = v(0, o, 1, "int32"), c = T([n, r], 1), b = m([o]), g = I(c, b, [o, s]), k = D(t, g), w = a.reshape([o, 1]);
|
|
137
137
|
return x(k, w);
|
|
138
138
|
}
|
|
139
139
|
const F = {
|
|
@@ -142,9 +142,9 @@ const F = {
|
|
|
142
142
|
kernelFunc: A
|
|
143
143
|
};
|
|
144
144
|
h(F);
|
|
145
|
-
function
|
|
145
|
+
function R(e, t, r) {
|
|
146
146
|
return $().runKernel("EfficientScatterSub", { logits: e, labels: t, dy: r }, {});
|
|
147
147
|
}
|
|
148
148
|
export {
|
|
149
|
-
|
|
149
|
+
R as scatterSub
|
|
150
150
|
};
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { E as e, R as c, o as f,
|
|
1
|
+
import { E as e, R as c, o as f, h as i, i as a, P as u } from "./index-CWQLouWz.js";
|
|
2
2
|
/**
|
|
3
3
|
* @license
|
|
4
4
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
@@ -38,13 +38,13 @@ function l(n, s, t = 1, r = "float32") {
|
|
|
38
38
|
* =============================================================================
|
|
39
39
|
*/
|
|
40
40
|
function k(n, s = 0) {
|
|
41
|
-
const t =
|
|
41
|
+
const t = i(n, "tensors", "stack", "string_or_numeric");
|
|
42
42
|
a(t.length >= 1, () => "Pass at least one tensor to tf.stack"), t.length > 0 && a(s <= t[0].rank, () => "Axis must be <= rank of the tensor");
|
|
43
43
|
const r = t, o = { axis: s };
|
|
44
|
-
return e.runKernel(
|
|
44
|
+
return e.runKernel(u, r, o);
|
|
45
45
|
}
|
|
46
|
-
const
|
|
46
|
+
const g = /* @__PURE__ */ f({ stack_: k });
|
|
47
47
|
export {
|
|
48
48
|
l as r,
|
|
49
|
-
|
|
49
|
+
g as s
|
|
50
50
|
};
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { o,
|
|
1
|
+
import { o, d as a, E as u, j as p, k as i, S as x } from "./index-CWQLouWz.js";
|
|
2
2
|
/**
|
|
3
3
|
* @license
|
|
4
4
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -17,7 +17,7 @@ import { o, c as a, E as u, h as i, i as p, S as x } from "./index-DQfEAU9u.js";
|
|
|
17
17
|
*/
|
|
18
18
|
function l(n, t) {
|
|
19
19
|
const s = { x: a(n, "x", "reshape", "string_or_numeric") }, r = { shape: t };
|
|
20
|
-
return u.runKernel(
|
|
20
|
+
return u.runKernel(p, s, r);
|
|
21
21
|
}
|
|
22
22
|
const h = /* @__PURE__ */ o({ reshape_: l });
|
|
23
23
|
/**
|
|
@@ -38,7 +38,7 @@ const h = /* @__PURE__ */ o({ reshape_: l });
|
|
|
38
38
|
*/
|
|
39
39
|
function m(n, t = null, e = !1) {
|
|
40
40
|
let s = a(n, "x", "sum");
|
|
41
|
-
s.dtype === "bool" && (s =
|
|
41
|
+
s.dtype === "bool" && (s = i(s, "int32"));
|
|
42
42
|
const r = { x: s }, c = { axis: t, keepDims: e };
|
|
43
43
|
return u.runKernel(x, r, c);
|
|
44
44
|
}
|
package/dist/training/AdamExt.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as r,
|
|
1
|
+
import { A as r, a as c, s as h, b as g, e as o } from "../index-CWQLouWz.js";
|
|
2
2
|
class u extends r {
|
|
3
3
|
constructor(t, e, s, a, i) {
|
|
4
4
|
super(t, e, s, a), this.config = i, this.startLearningRate = t;
|
package/dist/training/Trainer.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { DatasetBuilder as d } from "./DatasetBuilder.js";
|
|
2
|
-
import
|
|
3
|
-
class
|
|
4
|
-
constructor(t,
|
|
5
|
-
this.tokenizer =
|
|
2
|
+
import h from "./AdamExt.js";
|
|
3
|
+
class g {
|
|
4
|
+
constructor(t, s, e, i = 1e-3) {
|
|
5
|
+
this.tokenizer = e, this.tf = t, this.model = s, this.learningRate = i, this.resetOptimizer(), this.datasetBuilder = new d(this.tf, e, s.config.blockSize);
|
|
6
6
|
}
|
|
7
7
|
model;
|
|
8
8
|
optimizer;
|
|
@@ -25,7 +25,7 @@ class u {
|
|
|
25
25
|
}
|
|
26
26
|
resetOptimizer(t = { learningRateFactor: 1, beta1: 0.9, beta2: 0.99, epsilon: 1e-8 }) {
|
|
27
27
|
this.optimizer && this.optimizer.dispose();
|
|
28
|
-
const
|
|
28
|
+
const s = new h(
|
|
29
29
|
t.learningRateFactor * this.learningRate,
|
|
30
30
|
t.beta1,
|
|
31
31
|
t.beta2,
|
|
@@ -37,58 +37,59 @@ class u {
|
|
|
37
37
|
weightDecay: 0
|
|
38
38
|
}
|
|
39
39
|
);
|
|
40
|
-
this.optimizer =
|
|
40
|
+
this.optimizer = s;
|
|
41
41
|
}
|
|
42
42
|
printGradients(t) {
|
|
43
|
-
Object.keys(t).forEach((
|
|
44
|
-
const
|
|
45
|
-
console.log(`${
|
|
43
|
+
Object.keys(t).forEach((s) => {
|
|
44
|
+
const e = t[s];
|
|
45
|
+
console.log(`${s}:`), console.log(` Shape: ${e.shape}`), console.log(` Mean: ${this.tf.mean(e).dataSync()[0]}`), console.log(` Std: ${this.tf.moments(e).variance.sqrt().dataSync()[0]}`), console.log(` Min: ${this.tf.min(e).dataSync()[0]}`), console.log(` Max: ${this.tf.max(e).dataSync()[0]}`), console.log(` Norm: ${this.tf.norm(e).dataSync()[0]}`);
|
|
46
46
|
});
|
|
47
47
|
}
|
|
48
|
-
trainStep(t,
|
|
48
|
+
trainStep(t, s = !1, e = !1) {
|
|
49
49
|
return this.tf.tidy(() => {
|
|
50
|
+
this.model.getProfiler()?.startMemory();
|
|
50
51
|
const { xs: i, ys: a } = t, o = () => {
|
|
51
52
|
const { loss: l, logits: c } = this.model.forward(i, a, !0);
|
|
52
53
|
return c.dispose(), l;
|
|
53
54
|
}, { value: n, grads: r } = this.tf.variableGrads(o);
|
|
54
|
-
return
|
|
55
|
+
return s ? this.model.getProfiler()?.endMemory("Training") : (e && (console.log("-------"), this.printGradients(r), console.log("-------")), this.optimizer.applyGradients(r), this.model.getProfiler()?.endMemory("Training"), this.tf.dispose(r)), n;
|
|
55
56
|
});
|
|
56
57
|
}
|
|
57
58
|
dummyPass() {
|
|
58
|
-
const t = this.tf.zeros([1, this.model.config.blockSize], "int32"),
|
|
59
|
+
const t = this.tf.zeros([1, this.model.config.blockSize], "int32"), s = this.tf.zeros([1, this.model.config.blockSize], "int32");
|
|
59
60
|
try {
|
|
60
|
-
const
|
|
61
|
-
|
|
62
|
-
} catch (
|
|
63
|
-
console.error("Error during dummy pass:",
|
|
61
|
+
const e = this.trainStep({ xs: t, ys: s }, !0);
|
|
62
|
+
e.dataSync(), e.dispose();
|
|
63
|
+
} catch (e) {
|
|
64
|
+
console.error("Error during dummy pass:", e);
|
|
64
65
|
} finally {
|
|
65
|
-
t.dispose(),
|
|
66
|
+
t.dispose(), s.dispose();
|
|
66
67
|
}
|
|
67
68
|
}
|
|
68
|
-
async trainBatch(t,
|
|
69
|
+
async trainBatch(t, s) {
|
|
69
70
|
try {
|
|
70
|
-
const
|
|
71
|
-
return
|
|
72
|
-
} catch (
|
|
73
|
-
throw console.error(`Error processing batch at step ${t.step}:`,
|
|
71
|
+
const e = this.trainStep(s, !1, !1);
|
|
72
|
+
return s.xs.dispose(), s.ys.dispose(), t.step++, t.totalSteps++, e.array().then((i) => (t.lastLoss = i, t.losses.push(t.lastLoss), e.dispose(), t.lastLoss));
|
|
73
|
+
} catch (e) {
|
|
74
|
+
throw console.error(`Error processing batch at step ${t.step}:`, e), this.tf.dispose(), e;
|
|
74
75
|
}
|
|
75
76
|
}
|
|
76
|
-
async createTrainValidationSplit(t,
|
|
77
|
-
const i = await this.datasetBuilder.createTextDataset(t,
|
|
77
|
+
async createTrainValidationSplit(t, s = 32, e = 0.1) {
|
|
78
|
+
const i = await this.datasetBuilder.createTextDataset(t, s, 0, 1 - e), a = await this.datasetBuilder.createTextDataset(
|
|
78
79
|
t,
|
|
79
|
-
|
|
80
|
-
1 -
|
|
80
|
+
s,
|
|
81
|
+
1 - e,
|
|
81
82
|
1
|
|
82
83
|
);
|
|
83
84
|
return { trainDataset: i, validationDataset: a };
|
|
84
85
|
}
|
|
85
|
-
async createDataset(t,
|
|
86
|
-
return await this.datasetBuilder.createTextDataset(t,
|
|
86
|
+
async createDataset(t, s = 32) {
|
|
87
|
+
return await this.datasetBuilder.createTextDataset(t, s);
|
|
87
88
|
}
|
|
88
89
|
dispose() {
|
|
89
90
|
this.optimizer && this.optimizer.dispose();
|
|
90
91
|
}
|
|
91
92
|
}
|
|
92
93
|
export {
|
|
93
|
-
|
|
94
|
+
g as default
|
|
94
95
|
};
|