@genai-fi/nanogpt 0.2.9 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.d.ts +2 -0
- package/dist/Generator.js +37 -32
- package/dist/NanoGPTModel.d.ts +4 -1
- package/dist/NanoGPTModel.js +33 -25
- package/dist/TeachableLLM.d.ts +4 -0
- package/dist/TeachableLLM.js +32 -15
- package/dist/{complex-Cd8sqiBC.js → complex-CJ-qCcLB.js} +6 -6
- package/dist/{index-Dsg28SG6.js → index-YPKosni4.js} +59 -51
- package/dist/layers/BaseLayer.d.ts +8 -0
- package/dist/layers/BaseLayer.js +18 -0
- package/dist/layers/CausalSelfAttention.d.ts +4 -1
- package/dist/layers/CausalSelfAttention.js +47 -55
- package/dist/layers/MLP.d.ts +2 -1
- package/dist/layers/MLP.js +16 -14
- package/dist/layers/RMSNorm.d.ts +2 -1
- package/dist/layers/RMSNorm.js +13 -11
- package/dist/layers/RoPECache.d.ts +4 -2
- package/dist/layers/RoPECache.js +13 -7
- package/dist/layers/TiedEmbedding.js +16 -15
- package/dist/layers/TransformerBlock.d.ts +4 -1
- package/dist/layers/TransformerBlock.js +9 -5
- package/dist/main.js +18 -16
- package/dist/{mat_mul-BAYDrXvE.js → mat_mul-Bu7bhLms.js} +5 -5
- package/dist/ops/attentionMask.js +31 -25
- package/dist/ops/gatherSub.js +2 -2
- package/dist/ops/node/sparseCrossEntropy.js +1 -1
- package/dist/ops/qkv.d.ts +7 -0
- package/dist/ops/qkv.js +127 -0
- package/dist/ops/rope.d.ts +8 -0
- package/dist/ops/rope.js +153 -0
- package/dist/ops/scatterSub.js +14 -14
- package/dist/reshape-DmnmKT6r.js +25 -0
- package/dist/{stack-1o648CP_.js → stack-BtKpB0Ry.js} +5 -5
- package/dist/sum-D7fu15XL.js +27 -0
- package/dist/training/AdamExt.js +1 -1
- package/dist/training/Trainer.js +30 -29
- package/dist/training/sparseCrossEntropy.js +34 -33
- package/dist/utilities/profile.d.ts +10 -0
- package/dist/utilities/profile.js +29 -0
- package/package.json +1 -1
- package/dist/sum-NWazHI7f.js +0 -49
package/dist/layers/CausalSelfAttention.js
CHANGED

@@ -1,16 +1,10 @@
-import { attentionMask as … } from "../ops/attentionMask.js";
-class j {
+import { attentionMask as x } from "../ops/attentionMask.js";
+import j from "./BaseLayer.js";
+import { qkv as w } from "../ops/qkv.js";
+import { rope as y } from "../ops/rope.js";
+class N extends j {
   constructor(t, i, s, e) {
-    this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.…
-      units: 3 * s.nEmbed,
-      useBias: s.biasInLinear,
-      name: `block_${i}_attn_cAttn`,
-      kernelInitializer: this.tf.initializers.randomNormal({
-        mean: 0,
-        stddev: 0.02
-      }),
-      biasInitializer: "zeros"
-    }), this.cProj = this.tf.layers.dense({
+    super(), this.ropeCache = e, this.config = s, this.tf = t, this.index = i, this.units = s.nEmbed * 3, this.cProj = this.tf.layers.dense({
       units: s.nEmbed,
       useBias: s.biasInLinear,
       name: `block_${i}_attn_cProj`,
@@ -20,11 +14,11 @@ class j {
       }),
       biasInitializer: "zeros"
     }), this.attnDropout = this.tf.layers.dropout({ rate: s.dropout }), this.residDropout = this.tf.layers.dropout({ rate: s.dropout }), this.bias = this.tf.linalg.bandPart(this.tf.ones([s.blockSize, s.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.nEmbed / s.nHead);
-    const o = this.tf.zeros([s.blockSize, s.blockSize]), …
-    this.maskInf = this.tf.where(this.bias, o, …
+    const o = this.tf.zeros([s.blockSize, s.blockSize]), a = this.tf.fill([s.blockSize, s.blockSize], Number.NEGATIVE_INFINITY);
+    this.maskInf = this.tf.where(this.bias, o, a);
   }
   config;
-  cAttn;
+  cAttn = null;
   cProj;
   attnDropout;
   residDropout;
@@ -34,26 +28,35 @@
   divisor;
   index;
   _trainable = !0;
+  units;
+  build() {
+    this.cAttn === null && (this.cAttn = this.tf.variable(
+      this.tf.randomNormal([this.config.nEmbed, this.units], 0, 0.02),
+      !0
+      //`block_${this.index}_attn_cAttn_kernel`
+    ));
+  }
   get variables() {
-    …
-    …
-    …
-    ];
+    if (this.cAttn === null)
+      throw new Error("Layer not built yet");
+    return [this.cAttn, ...this.cProj.trainableWeights.map((t) => t.read())];
   }
   get trainable() {
     return this._trainable;
   }
   set trainable(t) {
-    this._trainable = t, this.cAttn.trainable = t, this.cProj.trainable = t;
+    this._trainable = t, this.cAttn && (this.cAttn.trainable = t), this.cProj.trainable = t;
   }
   saveWeights(t) {
-    t.set(`block_${this.index}_cAttn`, this.cAttn.…
+    t.set(`block_${this.index}_cAttn`, this.cAttn ? [this.cAttn.clone()] : []), t.set(`block_${this.index}_cProj`, this.cProj.getWeights());
   }
   loadWeights(t) {
-    …
+    const i = t.get(`block_${this.index}_cAttn`)?.[0];
+    if (!i) throw new Error(`Weights for block_${this.index}_cAttn not found`);
+    this.cAttn ? this.cAttn.assign(i) : this.cAttn = this.tf.variable(i, !0), this.cProj.setWeights(t.get(`block_${this.index}_cProj`) || []);
   }
   getAttentionScores(t, i, s) {
-    const e = …
+    const e = x(t, i, this.maskInf, this.divisor), o = this.tf.softmax(e, -1);
     return this.attnDropout.apply(o, { training: s });
   }
   // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
@@ -63,60 +66,49 @@
     if (o > 1 && e > 0)
       throw new Error("Cannot use past with T_cur > 1");
     if (o > 1) {
-      const …
-      r = r.add(…
+      const c = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+      r = r.add(c);
     }
     const h = this.tf.softmax(r, -1);
     return this.attnDropout.apply(h, { training: s });
   }
   getQKV(t) {
-    …
-    o.dispose();
-    const a = e / this.config.nHead, u = this.tf.reshape(c, [i, s, this.config.nHead, a]);
-    c.dispose();
-    const f = u.transpose([0, 2, 1, 3]);
-    u.dispose();
-    const d = this.tf.reshape(r, [i, s, this.config.nHead, a]);
-    r.dispose();
-    const n = d.transpose([0, 2, 1, 3]);
-    d.dispose();
-    const l = this.tf.reshape(h, [i, s, this.config.nHead, a]);
-    h.dispose();
-    const p = l.transpose([0, 2, 1, 3]);
-    return l.dispose(), [f, n, p];
+    return w(t, this.cAttn, this.config.nHead);
   }
   getOutputProjection(t, i) {
-    const s = t.shape[0], e = t.shape[2], o = this.config.nEmbed, …
+    const s = t.shape[0], e = t.shape[2], o = this.config.nEmbed, a = t.transpose([0, 2, 1, 3]), r = this.tf.reshape(a, [s, e, o]), h = this.cProj.apply(r);
     return this.residDropout.apply(h, { training: i });
   }
   // Added optional KV cache support (pastKV). Returns presentKV for chaining.
   call(t, i = !1, s = !1, e) {
     if (e && !this.config.useRope)
       throw new Error("Cannot use pastKV without RoPE enabled");
-    return this.tf.tidy(() => {
-      …
-      …
-      …
+    return this.build(), this.tf.tidy(() => {
+      this.startMemory();
+      const [o, a, r] = this.getQKV(t), h = o.shape[2], c = this.config.blockSize, d = e ? e.cumulativeLength : 0, f = this.ropeCache ? y(o, this.ropeCache, d) : o, m = this.ropeCache ? y(a, this.ropeCache, d) : a;
+      this.ropeCache && (o.dispose(), a.dispose());
+      let n = m, l = r, u = 0;
+      e && (u = e.length, n = this.tf.concat([e.k, m], 2), l = this.tf.concat([e.v, r], 2));
       const b = n.shape[2];
-      if (b > …
-        const k = b - …
-        n = n.slice([0, 0, k, 0], […
+      if (b > c) {
+        const k = b - c, A = n.shape[0], g = n.shape[1], _ = n.shape[3];
+        n = n.slice([0, 0, k, 0], [A, g, c, _]), l = l.slice([0, 0, k, 0], [A, g, c, _]), u = c - h;
       }
-      let …
-      …
-      const …
+      let p;
+      u > 0 ? p = this.getAttentionScoresWithPast(f, n, i, u) : p = this.getAttentionScores(f, n, i);
+      const P = this.tf.matMul(p, l), S = this.getOutputProjection(P, i), v = {
         k: this.tf.keep(n),
         v: this.tf.keep(l),
-        length: …
+        length: u + h,
         cumulativeLength: e ? e.cumulativeLength + h : h
-      };
-      return { output: …
+      }, I = s ? p.mean(1) : void 0;
+      return this.endMemory("CausalSelfAttention"), { output: S, attention: I, presentKV: v };
     });
   }
   dispose() {
-    this.cAttn…
+    this.cAttn?.dispose(), this.cProj.dispose(), this.attnDropout.dispose(), this.residDropout.dispose(), this.bias.dispose(), this.maskInf.dispose();
   }
 }
 export {
-  …
+  N as default
 };
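Note: the old `getQKV` split the fused attention projection into q/k/v with a chain of reshape/transpose/dispose calls; 0.2.11 replaces all of that with a single custom `qkv` op (new file `dist/ops/qkv.js`) driven by the raw `cAttn` kernel variable. A minimal sketch of the equivalent computation in plain tfjs, assuming the shapes implied by the diff (`x: [B, T, nEmbed]`, kernel `[nEmbed, 3 * nEmbed]`); the real op is a registered kernel with its own gradient, and the names here are illustrative:

```ts
import * as tf from "@tensorflow/tfjs";

// Sketch: fused QKV projection followed by a per-head split.
function qkvSketch(
  x: tf.Tensor3D,
  kernel: tf.Tensor2D,
  nHead: number
): [tf.Tensor4D, tf.Tensor4D, tf.Tensor4D] {
  return tf.tidy(() => {
    const [B, T, E] = x.shape;
    // One matmul produces q, k and v concatenated on the channel axis.
    const proj = tf.matMul(x.reshape([B * T, E]), kernel); // [B*T, 3E]
    const [q, k, v] = tf.split(proj.reshape([B, T, 3 * E]), 3, 2);
    // [B, T, E] -> [B, nHead, T, headSize]
    const toHeads = (t: tf.Tensor) =>
      t.reshape([B, T, nHead, E / nHead]).transpose([0, 2, 1, 3]) as tf.Tensor4D;
    return [toHeads(q), toHeads(k), toHeads(v)];
  });
}
```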
package/dist/layers/MLP.d.ts
CHANGED
@@ -1,6 +1,7 @@
 import { default as TF } from '@tensorflow/tfjs';
 import { GPTConfig } from '../config';
-export default class MLP {
+import { default as BaseLayer } from './BaseLayer';
+export default class MLP extends BaseLayer {
   private cFc;
   private cProj;
   private dropout;
package/dist/layers/MLP.js
CHANGED
@@ -1,31 +1,32 @@
-class l {
+import a from "./BaseLayer.js";
+class l extends a {
   cFc;
   cProj;
   dropout;
   tf;
   index;
   _trainable = !0;
-  constructor(t, …
-    this.tf = t, this.index = …
-      units: …
+  constructor(t, i, e) {
+    super(), this.tf = t, this.index = i, this.cFc = this.tf.layers.dense({
+      units: e.mlpFactor * e.nEmbed,
       activation: "gelu",
-      useBias: …
+      useBias: e.biasInLinear,
       kernelInitializer: this.tf.initializers.randomNormal({
        mean: 0,
        stddev: 0.02
      }),
      biasInitializer: "zeros",
-      name: `block_${…
+      name: `block_${i}_mlp_cFc`
    }), this.cProj = this.tf.layers.dense({
-      units: …
-      useBias: …
+      units: e.nEmbed,
+      useBias: e.biasInLinear,
      kernelInitializer: this.tf.initializers.randomNormal({
        mean: 0,
-        stddev: 0.02 / Math.sqrt(2 * …
+        stddev: 0.02 / Math.sqrt(2 * e.nLayer)
      }),
      biasInitializer: "zeros",
-      name: `block_${…
-    }), this.dropout = this.tf.layers.dropout({ rate: …
+      name: `block_${i}_mlp_cProj`
+    }), this.dropout = this.tf.layers.dropout({ rate: e.dropout });
   }
   get variables() {
     return [

@@ -45,10 +46,11 @@ class l {
   loadWeights(t) {
     this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
   }
-  call(t, …
+  call(t, i = !1) {
     return this.tf.tidy(() => {
-      …
-      …
+      this.startMemory();
+      const e = this.cFc.apply(t), s = this.cProj.apply(e), r = this.dropout.apply(s, { training: i });
+      return this.endMemory("MLP"), r;
     });
   }
   dispose() {
package/dist/layers/RMSNorm.d.ts
CHANGED
package/dist/layers/RMSNorm.js
CHANGED
@@ -1,26 +1,28 @@
-class m {
+import m from "./BaseLayer.js";
+class o extends m {
   gamma;
   epsilon;
   tf;
-  constructor(…
-    this.tf = …
+  constructor(t, s, a = 1e-8, e = "") {
+    super(), this.tf = t, this.epsilon = a, this.gamma = t.variable(t.ones(s), !0, `${e}_gamma`, "float32");
   }
   get trainableWeights() {
     return [this.gamma];
   }
-  set trainable(…
-    this.gamma.trainable = …
+  set trainable(t) {
+    this.gamma.trainable = t;
   }
   getWeights() {
     return [this.gamma];
   }
-  setWeights(…
-    this.gamma.assign(…
+  setWeights(t) {
+    this.gamma.assign(t[0]);
  }
-  apply(…
+  apply(t) {
    return this.tf.tidy(() => {
-      …
-      …
+      this.startMemory();
+      const a = t.square().mean(-1, !0).add(this.epsilon).rsqrt(), r = t.mul(a).mul(this.gamma);
+      return this.endMemory("RMSNorm"), r;
    });
  }
  dispose() {

@@ -28,5 +30,5 @@ class m {
   }
 }
 export {
-  …
+  o as default
 };
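The rewritten `apply` body is the standard RMSNorm: y = x · rsqrt(mean(x²) + ε) · γ, computed over the last axis. The same computation with non-minified names, mirroring the chain in the diff (`square().mean(-1, true).add(epsilon).rsqrt()` then scale by gamma):

```ts
import * as tf from "@tensorflow/tfjs";

// RMSNorm over the last axis, matching the minified body above.
function rmsNorm(x: tf.Tensor, gamma: tf.Tensor, epsilon = 1e-8): tf.Tensor {
  return tf.tidy(() => {
    const invRms = x.square().mean(-1, true).add(epsilon).rsqrt();
    return x.mul(invRms).mul(gamma);
  });
}
```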
package/dist/layers/RoPECache.d.ts
CHANGED

@@ -3,14 +3,16 @@ import { GPTConfig } from '../config';
 export default class RoPECache {
     private readonly tf;
     private readonly config;
-    …
+    readonly rotaryDim: number;
     private ropeBase;
     private ropeInvFreq;
     private ropeCos;
     private ropeSin;
     private ropeCacheLen;
     constructor(tf: typeof TF, config: GPTConfig);
-    …
+    ensureRopeCache(needed: number): void;
+    getCos(): TF.Tensor | null;
+    getSin(): TF.Tensor | null;
     applyRoPE(q: TF.Tensor, k: TF.Tensor, pastLen: number): [TF.Tensor, TF.Tensor];
     dispose(): void;
 }
package/dist/layers/RoPECache.js
CHANGED
@@ -24,16 +24,22 @@ class b {
     const o = this.tf.range(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
     this.ropeCos = this.tf.keep(this.tf.cos(o).expandDims(-1)), this.ropeSin = this.tf.keep(this.tf.sin(o).expandDims(-1)), this.ropeCacheLen = s;
   }
+  getCos() {
+    return this.ropeCos;
+  }
+  getSin() {
+    return this.ropeSin;
+  }
   applyRoPE(s, r, o) {
     const i = s.shape[3], t = this.rotaryDim;
     if (t > i) return [s, r];
-    const e = s.shape[2], …
-    this.ensureRopeCache(…
-    const n = t / 2, …
-      const m = u.slice([0, 0, 0, 0], [h, …
-      return C ? this.tf.concat([…
-    }, …
-    return f.dispose(), l.dispose(), […
+    const e = s.shape[2], R = o + e;
+    this.ensureRopeCache(R);
+    const n = t / 2, c = this.ropeCos.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), a = this.ropeSin.slice([o, 0, 0], [e, n, 1]).reshape([1, 1, e, n]), h = s.shape[0], p = s.shape[1], f = this.tf.range(0, t, 2, "int32"), l = this.tf.range(1, t, 2, "int32"), d = (u) => {
+      const m = u.slice([0, 0, 0, 0], [h, p, e, t]), C = t < i ? u.slice([0, 0, 0, t], [h, p, e, i - t]) : null, g = this.tf.gather(m, f, 3), D = this.tf.gather(m, l, 3), x = g.mul(c).sub(D.mul(a)), k = D.mul(c).add(g.mul(a)), S = this.tf.stack([x, k], -1).reshape([h, p, e, t]);
+      return C ? this.tf.concat([S, C], 3) : S;
+    }, v = d(s), y = d(r);
+    return f.dispose(), l.dispose(), [v, y];
   }
   dispose() {
     this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose(), this.ropeInvFreq.dispose();
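`applyRoPE` gathers the even/odd feature pairs of the first `rotaryDim` channels and rotates each pair by a position-dependent angle: (x_even, x_odd) → (x_even·cos − x_odd·sin, x_odd·cos + x_even·sin), with `pastLen` offsetting the position index so cached generation keeps absolute positions. A compact sketch of that rotation for one tensor, assuming `cos`/`sin` slices shaped `[1, 1, T, hs/2]` as in the code above:

```ts
import * as tf from "@tensorflow/tfjs";

// Rotate even/odd channel pairs of x: [B, nH, T, hs] by position-dependent angles.
function ropeSketch(x: tf.Tensor4D, cos: tf.Tensor, sin: tf.Tensor): tf.Tensor4D {
  return tf.tidy(() => {
    const [B, nH, T, hs] = x.shape;
    const even = tf.gather(x, tf.range(0, hs, 2, "int32"), 3);
    const odd = tf.gather(x, tf.range(1, hs, 2, "int32"), 3);
    const rotEven = even.mul(cos).sub(odd.mul(sin));
    const rotOdd = odd.mul(cos).add(even.mul(sin));
    // Re-interleave the rotated pairs back into the original channel layout.
    return tf.stack([rotEven, rotOdd], -1).reshape([B, nH, T, hs]) as tf.Tensor4D;
  });
}
```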
package/dist/layers/TiedEmbedding.js
CHANGED

@@ -1,7 +1,8 @@
-import { o as h, …
-import { …
-import { …
-import { …
+import { o as h, d as i, E as o, K as X, N as Y, O as Z, Q as J, T as ee, U as te, V as se, W as ne, X as re, Y as ue, l as L, I as ae, Z as A, a as ie, _ as oe, D as le, f as q, v as C, $ as P, H as U, a0 as H } from "../index-YPKosni4.js";
+import { r as f } from "../reshape-DmnmKT6r.js";
+import { s as ce } from "../sum-D7fu15XL.js";
+import { m } from "../mat_mul-Bu7bhLms.js";
+import { c as pe } from "../complex-CJ-qCcLB.js";
 /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.

@@ -20,7 +21,7 @@ import { c as pe } from "../complex-Cd8sqiBC.js";
  */
 function he(t) {
   const s = { x: i(t, "x", "sigmoid", "float32") };
-  return o.runKernel(…
+  return o.runKernel(X, s);
 }
 const fe = /* @__PURE__ */ h({ sigmoid_: he });
 /**

@@ -41,7 +42,7 @@ const fe = /* @__PURE__ */ h({ sigmoid_: he });
  */
 function de(t) {
   const s = { x: i(t, "x", "elu", "float32") };
-  return o.runKernel(…
+  return o.runKernel(Y, s);
 }
 const me = /* @__PURE__ */ h({ elu_: de });
 /**

@@ -62,7 +63,7 @@ const me = /* @__PURE__ */ h({ elu_: de });
  */
 function ge(t) {
   const s = { input: i(t, "input", "imag") };
-  return o.runKernel(…
+  return o.runKernel(Z, s);
 }
 const $e = /* @__PURE__ */ h({ imag_: ge });
 /**

@@ -83,7 +84,7 @@ const $e = /* @__PURE__ */ h({ imag_: ge });
  */
 function xe(t, e = 0.2) {
   const n = { x: i(t, "x", "leakyRelu") }, r = { alpha: e };
-  return o.runKernel(…
+  return o.runKernel(J, n, r);
 }
 const ke = /* @__PURE__ */ h({ leakyRelu_: xe });
 /**

@@ -321,8 +322,8 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
   const [g, $, k, z] = M, d = Ae(f(x, k.shape), k, c);
   let K, _;
   if (!s && !n ? (K = m(d, $, !1, !0), _ = m(g, d, !0, !1)) : !s && n ? (K = m(d, $, !1, !1), _ = m(d, g, !0, !1)) : s && !n ? (K = m($, d, !1, !0), _ = m(g, d, !1, !1)) : (K = m($, d, !0, !0), _ = m(d, g, !0, !0)), r != null) {
-    const …
-    return [K, _, …
+    const V = Le(z, d);
+    return [K, _, V];
   } else
     return [K, _];
 }, I = {

@@ -345,7 +346,7 @@ function Ne({ a: t, b: e, transposeA: s = !1, transposeB: n = !1, bias: r, activ
   return k([M, g, z, $]), { value: f(z, O), gradFunc: G };
 })(F, R, S);
 }
-const …
+const Q = /* @__PURE__ */ h({ fusedMatMul_: Ne });
 /**
  * @license
  * Copyright 2018 Google LLC

@@ -378,7 +379,7 @@ function ve(t, e, s, n) {
     throw new E(`If rank y >= 3, then the second last dim of y must equal the last dim of x but got x shape = ${t.shape} and y shape = ${e.shape}`);
 }
 if (t.rank === 2 && e.rank === 2)
-  return …
+  return Q({
     a: t,
     b: e,
     transposeA: !1,

@@ -392,7 +393,7 @@ function ve(t, e, s, n) {
 const l = e.shape.slice(), p = l.pop(), u = l.pop(), a = [...l, p], D = Array.from({ length: e.rank }, (T, y) => y === 0 ? e.rank - 2 : y <= e.rank - 2 ? y - 1 : y);
 e = f(Re(e, D), [u, -1]);
 const b = [...r, ...a];
-return f(…
+return f(Q({
   a: t,
   b: e,
   transposeA: !1,

@@ -402,7 +403,7 @@ function ve(t, e, s, n) {
 }), b);
 }
 }
-class …
+class Ue {
   vocabSize;
   embedDim;
   tf;

@@ -444,5 +445,5 @@ class Pe {
   }
 }
 export {
-  …
+  Ue as default
 };
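The class `Ue` here (with `vocabSize`/`embedDim` fields) is the bundled TiedEmbedding: one weight matrix shared between the token-embedding lookup and the output logits projection. The idea, sketched independently of the minified code and with illustrative names:

```ts
import * as tf from "@tensorflow/tfjs";

// Weight tying: a single [vocabSize, embedDim] matrix serves both directions.
class TiedEmbeddingSketch {
  readonly weight: tf.Variable;
  constructor(vocabSize: number, embedDim: number) {
    this.weight = tf.variable(tf.randomNormal([vocabSize, embedDim], 0, 0.02));
  }
  // Forward lookup: token ids -> embedding rows.
  embed(tokens: tf.Tensor): tf.Tensor {
    return tf.gather(this.weight, tokens.cast("int32"));
  }
  // Output head: hidden states times the transposed embedding matrix.
  logits(hidden: tf.Tensor2D): tf.Tensor {
    return tf.matMul(hidden, this.weight, false, true); // [N, vocabSize]
  }
}
```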
package/dist/layers/TransformerBlock.d.ts
CHANGED

@@ -2,7 +2,9 @@ import { default as TF } from '@tensorflow/tfjs';
 import { GPTConfig } from '../config';
 import { KVCache } from './CausalSelfAttention';
 import { default as RoPECache } from './RoPECache';
-export default class Block {
+import { default as MemoryProfiler } from '../utilities/profile';
+import { default as BaseLayer } from './BaseLayer';
+export default class Block extends BaseLayer {
     private ln1;
     private attn;
     private ln2;

@@ -12,6 +14,7 @@ export default class Block {
     private _trainable;
     skipped: boolean;
     constructor(tf: typeof TF, index: number, config: GPTConfig, ropeCache?: RoPECache);
+    setProfiler(value: MemoryProfiler | undefined): void;
     get variables(): TF.Variable[];
     get trainable(): boolean;
     set trainable(value: boolean);
package/dist/layers/TransformerBlock.js
CHANGED

@@ -1,7 +1,8 @@
-import … from "./CausalSelfAttention.js";
+import a from "./CausalSelfAttention.js";
 import o from "./MLP.js";
-import …
-class u {
+import r from "./RMSNorm.js";
+import p from "./BaseLayer.js";
+class f extends p {
   ln1;
   attn;
   ln2;

@@ -11,7 +12,10 @@ class u {
   _trainable = !0;
   skipped = !1;
   constructor(t, i, s, e) {
-    this.tf = t, this.index = i, this.ln1 = new …
+    super(), this.tf = t, this.index = i, this.ln1 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms1`), this.attn = new a(this.tf, this.index, s, e), this.ln2 = new r(t, [s.nEmbed], 1e-8, `block_${this.index}_rms2`), this.mlp = new o(this.tf, this.index, s);
+  }
+  setProfiler(t) {
+    this._profiler = t, this.attn.setProfiler(t), this.mlp.setProfiler(t), this.ln1.setProfiler(t), this.ln2.setProfiler(t);
   }
   get variables() {
     return [

@@ -54,5 +58,5 @@ class u {
   }
 }
 export {
-  …
+  f as default
 };
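The constructor wires the pieces in the usual pre-norm residual pattern (rms1/attn and rms2/mlp); the forward body itself falls outside the captured hunks, but under that assumption it computes x = x + attn(rms1(x)) followed by x = x + mlp(rms2(x)). As a sketch with the sub-layers passed in as plain functions:

```ts
import * as tf from "@tensorflow/tfjs";

// Pre-norm residual transformer block, as implied by the ln1/attn/ln2/mlp
// fields; attn and mlp stand in for the layers built in the constructor.
function blockForward(
  x: tf.Tensor,
  rms1: (t: tf.Tensor) => tf.Tensor,
  attn: (t: tf.Tensor) => tf.Tensor,
  rms2: (t: tf.Tensor) => tf.Tensor,
  mlp: (t: tf.Tensor) => tf.Tensor
): tf.Tensor {
  return tf.tidy(() => {
    const h = x.add(attn(rms1(x))); // attention sub-layer + residual
    return h.add(mlp(rms2(h)));     // MLP sub-layer + residual
  });
}
```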
package/dist/main.js
CHANGED
@@ -1,21 +1,23 @@
-import { default as …
-import { default as …
-import { default as …
-import { default as …
-import { default as …
-import { estimateMemoryUsage as …
+import { default as s } from "./NanoGPTModel.js";
+import { default as p } from "./TeachableLLM.js";
+import { default as d } from "./tokeniser/CharTokeniser.js";
+import { default as x } from "./utilities/waitForModel.js";
+import { default as T } from "./data/textLoader.js";
+import { estimateMemoryUsage as M, estimateParameterCount as C, estimateResources as c, estimateTrainingMemoryUsage as h, validateConfig as y } from "./utilities/parameters.js";
 import "./ops/scatterSub.js";
 import "./ops/gatherSub.js";
 import "./ops/attentionMask.js";
+import "./ops/qkv.js";
+import "./ops/rope.js";
 export {
-  …
-  …
-  …
-  …
-  …
-  …
-  …
-  …
-  …
-  …
+  d as CharTokeniser,
+  s as NanoGPT,
+  p as TeachableLLM,
+  M as estimateMemoryUsage,
+  C as estimateParameterCount,
+  c as estimateResources,
+  h as estimateTrainingMemoryUsage,
+  T as loadTextData,
+  y as validateConfig,
+  x as waitForModel
 };
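The entry point now also imports the new `qkv` and `rope` ops for side-effect kernel registration, while the public surface keeps the same named exports. Consuming the package therefore looks like the following; the import names are verbatim from `dist/main.js`, but anything beyond the import itself (constructor arguments, methods) is not visible in this diff:

```ts
// Named exports as listed in dist/main.js.
import {
  TeachableLLM,
  NanoGPT,
  CharTokeniser,
  estimateParameterCount,
  validateConfig,
  waitForModel,
  loadTextData,
} from "@genai-fi/nanogpt";
```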
package/dist/{mat_mul-BAYDrXvE.js → mat_mul-Bu7bhLms.js}
CHANGED

@@ -1,4 +1,4 @@
-import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
+import { o as m, d as s, f as c, E as M, B as f } from "./index-YPKosni4.js";
 /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.

@@ -15,13 +15,13 @@ import { o as c, c as s, d as m, E as M, B as p } from "./index-Dsg28SG6.js";
 * limitations under the License.
 * =============================================================================
 */
-function …
+function p(e, o, n = !1, l = !1) {
   let a = s(e, "a", "matMul"), t = s(o, "b", "matMul");
-  [a, t] = …
+  [a, t] = c(a, t);
   const r = { a, b: t }, u = { transposeA: n, transposeB: l };
-  return M.runKernel(…
+  return M.runKernel(f, r, u);
 }
-const i = /* @__PURE__ */ …
+const i = /* @__PURE__ */ m({ matMul_: p });
 export {
   i as m
 };
package/dist/ops/attentionMask.js
CHANGED

@@ -1,14 +1,14 @@
-import { engine as …
-import { r as …
-import { m as …
-class …
+import { engine as k } from "@tensorflow/tfjs";
+import { r as m, c as d, s as p } from "../index-YPKosni4.js";
+import { m as f } from "../mat_mul-Bu7bhLms.js";
+class h {
   variableNames = ["q", "k", "mask"];
   outputShape;
   userCode;
   // enableShapeUniforms = true;
   customUniforms = [{ name: "divisor", type: "float" }];
-  constructor(…
-    this.outputShape = […
+  constructor(e, n, s, a) {
+    this.outputShape = [e, n, s, s], this.userCode = `
     void main() {
       ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
       int b = coords.x;

@@ -34,49 +34,55 @@ class f {
   `;
   }
 }
-function …
-  const { q: …
-  return o.runWebGLProgram(…
+function v(t) {
+  const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = t.backend, r = e.shape[0], i = e.shape[2], c = e.shape[1], u = new h(r, c, i, e.shape[3]);
+  return o.runWebGLProgram(u, [e, n, s], "float32", [[a]]);
 }
-const …
+const b = {
   kernelName: "AttentionMask",
   backendName: "webgl",
-  kernelFunc: …
+  kernelFunc: v
 };
-…
-function …
-  const { q: …
+m(b);
+function l(t) {
+  const { q: e, k: n, mask: s } = t.inputs, { divisor: a } = t.attrs, o = e.shape[2], i = f(e, n, !1, !0).mul(p(a)), c = s.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
   return i.add(c);
 }
 const M = {
   kernelName: "AttentionMask",
   backendName: "cpu",
-  kernelFunc: …
+  kernelFunc: l
 };
-…
-function w(t, s, n, e) {
-  return l().runKernel("AttentionMask", { q: t, k: s, mask: n }, { divisor: e });
-}
+m(M);
 const g = {
+  kernelName: "AttentionMask",
+  backendName: "tensorflow",
+  kernelFunc: l
+};
+m(g);
+function N(t, e, n, s) {
+  return k().runKernel("AttentionMask", { q: t, k: e, mask: n }, { divisor: s });
+}
+const A = {
   kernelName: "AttentionMask",
   inputsToSave: ["q", "k"],
   outputsToSave: [],
-  gradFunc: (t, …
+  gradFunc: (t, e, n) => {
     if (Array.isArray(t))
       throw new Error("Expected dy to be a single Tensor");
-    const […
+    const [s, a] = e, { divisor: o } = n;
     return {
       q: () => t.matMul(a).mul(o),
-      k: () => …
+      k: () => s.transpose([0, 1, 3, 2]).matMul(t).mul(o).transpose([0, 1, 3, 2]),
       mask: () => t,
       divisor: () => {
-        const r = …
+        const r = s.matMul(a, !1, !0);
         return t.mul(r).sum();
       }
     };
   }
 };
-…
+d(A);
 export {
-  …
+  N as attentionMask
 };