@genai-fi/nanogpt 0.2.12 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.js +30 -25
- package/dist/NanoGPTModel.d.ts +13 -14
- package/dist/NanoGPTModel.js +142 -70
- package/dist/TeachableLLM.d.ts +16 -7
- package/dist/TeachableLLM.js +81 -44
- package/dist/Trainer.js +8 -8
- package/dist/concat-BIZS_td9.js +33 -0
- package/dist/data/parquet.js +1 -1
- package/dist/exports_layers-tbTBcwMM.js +25 -0
- package/dist/{sum-D7fu15XL.js → gather-BPGW8RsB.js} +6 -8
- package/dist/index-C4L8Cm77.js +349 -0
- package/dist/{index-YPKosni4.js → index-pWA4_lUh.js} +1020 -782
- package/dist/layers/CausalSelfAttention.d.ts +11 -11
- package/dist/layers/CausalSelfAttention.js +71 -63
- package/dist/layers/MLP.d.ts +6 -7
- package/dist/layers/MLP.js +18 -16
- package/dist/layers/RMSNorm.d.ts +6 -7
- package/dist/layers/RMSNorm.js +15 -13
- package/dist/layers/RoPECache.d.ts +4 -5
- package/dist/layers/RoPECache.js +36 -12
- package/dist/layers/TiedEmbedding.d.ts +7 -8
- package/dist/layers/TiedEmbedding.js +16 -418
- package/dist/layers/TransformerBlock.d.ts +8 -9
- package/dist/layers/TransformerBlock.js +12 -12
- package/dist/main.d.ts +2 -0
- package/dist/main.js +35 -21
- package/dist/{mat_mul-Bu7bhLms.js → mat_mul-D7_a4KJn.js} +5 -5
- package/dist/moments-DfcpfwKi.js +132 -0
- package/dist/ones-Cog-G2ag.js +29 -0
- package/dist/ops/appendCache.d.ts +2 -0
- package/dist/ops/appendCache.js +9 -0
- package/dist/ops/attentionMask.d.ts +1 -1
- package/dist/ops/attentionMask.js +7 -85
- package/dist/ops/cpu/appendCache.d.ts +2 -0
- package/dist/ops/cpu/appendCache.js +28 -0
- package/dist/ops/cpu/attentionMask.js +18 -0
- package/dist/ops/cpu/gatherSub.d.ts +1 -0
- package/dist/ops/cpu/gatherSub.js +34 -0
- package/dist/ops/cpu/qkv.d.ts +5 -0
- package/dist/ops/cpu/qkv.js +38 -0
- package/dist/ops/cpu/rope.d.ts +6 -0
- package/dist/ops/cpu/rope.js +38 -0
- package/dist/ops/cpu/scatterSub.d.ts +1 -0
- package/dist/ops/cpu/scatterSub.js +70 -0
- package/dist/ops/gatherSub.d.ts +1 -1
- package/dist/ops/gatherSub.js +6 -63
- package/dist/ops/grads/attentionMask.d.ts +1 -0
- package/dist/ops/grads/attentionMask.js +21 -0
- package/dist/ops/grads/qkv.d.ts +1 -0
- package/dist/ops/grads/qkv.js +20 -0
- package/dist/ops/grads/rope.d.ts +1 -0
- package/dist/ops/grads/rope.js +14 -0
- package/dist/ops/node/sparseCrossEntropy.js +1 -1
- package/dist/ops/qkv.d.ts +1 -6
- package/dist/ops/qkv.js +7 -124
- package/dist/ops/rope.d.ts +0 -5
- package/dist/ops/rope.js +7 -151
- package/dist/ops/scatterSub.d.ts +1 -1
- package/dist/ops/scatterSub.js +6 -147
- package/dist/ops/webgl/appendCache.d.ts +1 -0
- package/dist/ops/webgl/appendCache.js +43 -0
- package/dist/ops/webgl/attentionMask.d.ts +1 -0
- package/dist/ops/webgl/attentionMask.js +43 -0
- package/dist/ops/webgl/gatherSub.d.ts +1 -0
- package/dist/ops/webgl/gatherSub.js +27 -0
- package/dist/ops/webgl/qkv.d.ts +1 -0
- package/dist/ops/webgl/qkv.js +46 -0
- package/dist/ops/webgl/rope.d.ts +1 -0
- package/dist/ops/webgl/rope.js +56 -0
- package/dist/ops/webgl/scatterSub.d.ts +1 -0
- package/dist/ops/webgl/scatterSub.js +27 -0
- package/dist/{parquet-BRl5lE_I.js → parquet-C0Tlmv9c.js} +3045 -3048
- package/dist/random_width-oeUIlUZj.js +15487 -0
- package/dist/range-CcDl05lo.js +26 -0
- package/dist/{reshape-DmnmKT6r.js → reshape-C8CR_Bad.js} +3 -3
- package/dist/sin-BJIrfnj7.js +47 -0
- package/dist/softmax-Be_lsqUc.js +105 -0
- package/dist/{complex-CJ-qCcLB.js → split-DZbvruEP.js} +6 -8
- package/dist/stack-BMm-efee.js +27 -0
- package/dist/sum-C7Mgy9Bw.js +104 -0
- package/dist/tensor-DJVbYhh1.js +24 -0
- package/dist/tensor2d-ZuQSh2D-.js +30 -0
- package/dist/tokeniser/bpe.d.ts +17 -6
- package/dist/tokeniser/bpe.js +89 -61
- package/dist/training/AdamExt.js +1 -1
- package/dist/training/DatasetBuilder.d.ts +6 -6
- package/dist/training/DatasetBuilder.js +1262 -17
- package/dist/training/Evaluator.d.ts +3 -2
- package/dist/training/FullTrainer.d.ts +9 -8
- package/dist/training/FullTrainer.js +26 -25
- package/dist/training/LayerTrainer.d.ts +9 -8
- package/dist/training/LayerTrainer.js +34 -33
- package/dist/training/Trainer.d.ts +22 -21
- package/dist/training/Trainer.js +21 -18
- package/dist/training/sparseCrossEntropy.js +22 -166
- package/dist/utilities/dummy.js +10 -8
- package/dist/utilities/generate.js +14 -11
- package/dist/utilities/load.d.ts +1 -2
- package/dist/utilities/load.js +37 -35
- package/dist/utilities/profile.js +1 -1
- package/dist/utilities/save.js +14 -9
- package/dist/utilities/tokenParse.d.ts +1 -1
- package/dist/utilities/tokenParse.js +7 -61
- package/dist/utilities/weights.d.ts +3 -3
- package/dist/utilities/weights.js +21 -19
- package/dist/variable-Dl_ub3pk.js +23 -0
- package/dist/{stack-BtKpB0Ry.js → zeros-CCy9C3uU.js} +18 -16
- package/package.json +2 -1
- package/dist/assets/worker-BYeSPNkq.js +0 -1
- package/dist/tokeniser/NodeTokeniser.d.ts +0 -20
- package/dist/tokeniser/NodeTokeniser.js +0 -46
- package/dist/tokeniser/WebTokeniser.d.ts +0 -18
- package/dist/tokeniser/WebTokeniser.js +0 -96
- package/dist/tokeniser/worker.js +0 -53
- /package/dist/{tokeniser/worker.d.ts → ops/cpu/attentionMask.d.ts} +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import { default as TF } from '@tensorflow/tfjs';
|
|
2
1
|
import { GPTConfig } from '../config';
|
|
3
2
|
import { default as RoPECache } from './RoPECache';
|
|
4
3
|
import { default as BaseLayer } from './BaseLayer';
|
|
4
|
+
import { Tensor, Variable } from '@tensorflow/tfjs-core';
|
|
5
5
|
export type KVCache = {
|
|
6
|
-
k:
|
|
7
|
-
v:
|
|
6
|
+
k: Tensor;
|
|
7
|
+
v: Tensor;
|
|
8
8
|
length: number;
|
|
9
9
|
cumulativeLength: number;
|
|
10
10
|
};
|
|
@@ -17,25 +17,25 @@ export default class CausalSelfAttention extends BaseLayer {
|
|
|
17
17
|
private residDropout;
|
|
18
18
|
private bias;
|
|
19
19
|
private maskInf;
|
|
20
|
-
private tf;
|
|
21
20
|
private divisor;
|
|
22
21
|
private index;
|
|
23
22
|
private _trainable;
|
|
24
23
|
private units;
|
|
25
|
-
constructor(
|
|
24
|
+
constructor(index: number, config: GPTConfig, ropeCache?: RoPECache | undefined);
|
|
26
25
|
private build;
|
|
27
|
-
get variables():
|
|
26
|
+
get variables(): Variable[];
|
|
28
27
|
get trainable(): boolean;
|
|
29
28
|
set trainable(value: boolean);
|
|
30
|
-
saveWeights(map: Map<string,
|
|
31
|
-
loadWeights(weights: Map<string,
|
|
29
|
+
saveWeights(map: Map<string, Tensor[]>): void;
|
|
30
|
+
loadWeights(weights: Map<string, Tensor[]>): void;
|
|
32
31
|
private getAttentionScores;
|
|
33
32
|
private getAttentionScoresWithPast;
|
|
34
33
|
private getQKV;
|
|
35
34
|
private getOutputProjection;
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
35
|
+
private updateCache;
|
|
36
|
+
call(x: Tensor, training?: boolean, includeAttention?: boolean, pastKV?: KVCache): {
|
|
37
|
+
output: Tensor;
|
|
38
|
+
attention?: Tensor;
|
|
39
39
|
presentKV?: KVCache;
|
|
40
40
|
};
|
|
41
41
|
dispose(): void;
|
|
@@ -1,21 +1,31 @@
|
|
|
1
|
-
import { attentionMask as
|
|
2
|
-
import
|
|
3
|
-
import { qkv as
|
|
4
|
-
import { rope as
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
1
|
+
import { attentionMask as C } from "../ops/attentionMask.js";
|
|
2
|
+
import x from "./BaseLayer.js";
|
|
3
|
+
import { qkv as y } from "../ops/qkv.js";
|
|
4
|
+
import { rope as m } from "../ops/rope.js";
|
|
5
|
+
import { appendCache as b } from "../ops/appendCache.js";
|
|
6
|
+
import { w as j, x as f, t as z } from "../index-pWA4_lUh.js";
|
|
7
|
+
import { r as w, l as E, w as D, b as T } from "../random_width-oeUIlUZj.js";
|
|
8
|
+
import { d as L, a as k } from "../exports_layers-tbTBcwMM.js";
|
|
9
|
+
import { o as W } from "../ones-Cog-G2ag.js";
|
|
10
|
+
import { z as M } from "../zeros-CCy9C3uU.js";
|
|
11
|
+
import { v as A } from "../variable-Dl_ub3pk.js";
|
|
12
|
+
import { s as g } from "../softmax-Be_lsqUc.js";
|
|
13
|
+
import { m as _ } from "../mat_mul-D7_a4KJn.js";
|
|
14
|
+
import { r as $ } from "../reshape-C8CR_Bad.js";
|
|
15
|
+
class K extends x {
|
|
16
|
+
constructor(s, t, i) {
|
|
17
|
+
super(), this.ropeCache = i, this.config = t, this.index = s, this.units = t.nEmbed * 3, this.cProj = L({
|
|
18
|
+
units: t.nEmbed,
|
|
19
|
+
useBias: t.biasInLinear,
|
|
20
|
+
name: `block_${s}_attn_cProj`,
|
|
21
|
+
kernelInitializer: w({
|
|
12
22
|
mean: 0,
|
|
13
|
-
stddev: 0.02 / Math.sqrt(2 *
|
|
23
|
+
stddev: 0.02 / Math.sqrt(2 * t.nLayer)
|
|
14
24
|
}),
|
|
15
25
|
biasInitializer: "zeros"
|
|
16
|
-
}), this.attnDropout =
|
|
17
|
-
const
|
|
18
|
-
this.maskInf =
|
|
26
|
+
}), this.attnDropout = k({ rate: t.dropout }), this.residDropout = k({ rate: t.dropout }), this.bias = E.bandPart(W([t.blockSize, t.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(t.nEmbed / t.nHead);
|
|
27
|
+
const e = M([t.blockSize, t.blockSize]), o = j([t.blockSize, t.blockSize], Number.NEGATIVE_INFINITY);
|
|
28
|
+
this.maskInf = D(this.bias, e, o);
|
|
19
29
|
}
|
|
20
30
|
config;
|
|
21
31
|
cAttn = null;
|
|
@@ -24,14 +34,13 @@ class N extends j {
|
|
|
24
34
|
residDropout;
|
|
25
35
|
bias;
|
|
26
36
|
maskInf;
|
|
27
|
-
tf;
|
|
28
37
|
divisor;
|
|
29
38
|
index;
|
|
30
39
|
_trainable = !0;
|
|
31
40
|
units;
|
|
32
41
|
build() {
|
|
33
|
-
this.cAttn === null && (this.cAttn =
|
|
34
|
-
|
|
42
|
+
this.cAttn === null && (this.cAttn = A(
|
|
43
|
+
T([this.config.nEmbed, this.units], 0, 0.02),
|
|
35
44
|
!0
|
|
36
45
|
//`block_${this.index}_attn_cAttn_kernel`
|
|
37
46
|
));
|
|
@@ -39,70 +48,69 @@ class N extends j {
|
|
|
39
48
|
get variables() {
|
|
40
49
|
if (this.cAttn === null)
|
|
41
50
|
throw new Error("Layer not built yet");
|
|
42
|
-
return [this.cAttn, ...this.cProj.trainableWeights.map((
|
|
51
|
+
return [this.cAttn, ...this.cProj.trainableWeights.map((s) => s.read())];
|
|
43
52
|
}
|
|
44
53
|
get trainable() {
|
|
45
54
|
return this._trainable;
|
|
46
55
|
}
|
|
47
|
-
set trainable(
|
|
48
|
-
this._trainable =
|
|
56
|
+
set trainable(s) {
|
|
57
|
+
this._trainable = s, this.cAttn && (this.cAttn.trainable = s), this.cProj.trainable = s;
|
|
49
58
|
}
|
|
50
|
-
saveWeights(
|
|
51
|
-
|
|
59
|
+
saveWeights(s) {
|
|
60
|
+
s.set(`block_${this.index}_cAttn`, this.cAttn ? [this.cAttn.clone()] : []), s.set(`block_${this.index}_cProj`, this.cProj.getWeights());
|
|
52
61
|
}
|
|
53
|
-
loadWeights(
|
|
54
|
-
const
|
|
55
|
-
if (!
|
|
56
|
-
this.cAttn ? this.cAttn.assign(
|
|
62
|
+
loadWeights(s) {
|
|
63
|
+
const t = s.get(`block_${this.index}_cAttn`)?.[0];
|
|
64
|
+
if (!t) throw new Error(`Weights for block_${this.index}_cAttn not found`);
|
|
65
|
+
this.cAttn ? this.cAttn.assign(t) : this.cAttn = A(t, !0), this.cProj.setWeights(s.get(`block_${this.index}_cProj`) || []);
|
|
57
66
|
}
|
|
58
|
-
getAttentionScores(t, i
|
|
59
|
-
const e =
|
|
60
|
-
return this.attnDropout.apply(o, { training:
|
|
67
|
+
getAttentionScores(s, t, i) {
|
|
68
|
+
const e = C(s, t, this.maskInf, this.divisor), o = g(e, -1);
|
|
69
|
+
return this.attnDropout.apply(o, { training: i });
|
|
61
70
|
}
|
|
62
71
|
// Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
|
|
63
|
-
getAttentionScoresWithPast(t, i,
|
|
64
|
-
const o =
|
|
65
|
-
let
|
|
72
|
+
getAttentionScoresWithPast(s, t, i, e) {
|
|
73
|
+
const o = s.shape[2];
|
|
74
|
+
let n = _(s, t, !1, !0).mul(this.divisor);
|
|
66
75
|
if (o > 1 && e > 0)
|
|
67
76
|
throw new Error("Cannot use past with T_cur > 1");
|
|
68
77
|
if (o > 1) {
|
|
69
|
-
const
|
|
70
|
-
|
|
78
|
+
const h = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
|
|
79
|
+
n = n.add(h);
|
|
71
80
|
}
|
|
72
|
-
const
|
|
73
|
-
return this.attnDropout.apply(
|
|
81
|
+
const a = g(n, -1);
|
|
82
|
+
return this.attnDropout.apply(a, { training: i });
|
|
74
83
|
}
|
|
75
|
-
getQKV(
|
|
76
|
-
return
|
|
84
|
+
getQKV(s) {
|
|
85
|
+
return y(s, this.cAttn, this.config.nHead);
|
|
77
86
|
}
|
|
78
|
-
getOutputProjection(
|
|
79
|
-
const
|
|
80
|
-
return this.residDropout.apply(
|
|
87
|
+
getOutputProjection(s, t) {
|
|
88
|
+
const i = s.shape[0], e = s.shape[2], o = this.config.nEmbed, r = s.transpose([0, 2, 1, 3]), n = $(r, [i, e, o]), a = this.cProj.apply(n);
|
|
89
|
+
return this.residDropout.apply(a, { training: t });
|
|
90
|
+
}
|
|
91
|
+
updateCache(s, t, i) {
|
|
92
|
+
const e = this.config.blockSize, o = s.shape[2], r = Math.min(i?.length || 0, e - o), n = i ? b(i.k, s, e) : s, a = i ? b(i.v, t, e) : t;
|
|
93
|
+
return {
|
|
94
|
+
k: f(n),
|
|
95
|
+
v: f(a),
|
|
96
|
+
length: r + o,
|
|
97
|
+
cumulativeLength: i ? i.cumulativeLength + o : o
|
|
98
|
+
};
|
|
81
99
|
}
|
|
82
100
|
// Added optional KV cache support (pastKV). Returns presentKV for chaining.
|
|
83
|
-
call(
|
|
101
|
+
call(s, t = !1, i = !1, e) {
|
|
84
102
|
if (e && !this.config.useRope)
|
|
85
103
|
throw new Error("Cannot use pastKV without RoPE enabled");
|
|
86
|
-
return this.build(),
|
|
104
|
+
return this.build(), z(() => {
|
|
87
105
|
this.startMemory();
|
|
88
|
-
const [o,
|
|
89
|
-
this.ropeCache && (o.dispose(),
|
|
90
|
-
|
|
91
|
-
e && (
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
}
|
|
97
|
-
let p;
|
|
98
|
-
u > 0 ? p = this.getAttentionScoresWithPast(f, n, i, u) : p = this.getAttentionScores(f, n, i);
|
|
99
|
-
const P = this.tf.matMul(p, l), S = this.getOutputProjection(P, i), v = {
|
|
100
|
-
k: this.tf.keep(n),
|
|
101
|
-
v: this.tf.keep(l),
|
|
102
|
-
length: u + h,
|
|
103
|
-
cumulativeLength: e ? e.cumulativeLength + h : h
|
|
104
|
-
}, I = s ? p.mean(1) : void 0;
|
|
105
|
-
return this.endMemory("CausalSelfAttention"), { output: S, attention: I, presentKV: v };
|
|
106
|
+
const [o, r, n] = this.getQKV(s), a = e ? e.cumulativeLength : 0, h = this.ropeCache ? m(o, this.ropeCache, a) : o, p = this.ropeCache ? m(r, this.ropeCache, a) : r;
|
|
107
|
+
this.ropeCache && (o.dispose(), r.dispose());
|
|
108
|
+
const u = e ? e.length : 0, l = this.updateCache(p, n, e), d = l.k, v = l.v;
|
|
109
|
+
e && (p.dispose(), n.dispose());
|
|
110
|
+
let c;
|
|
111
|
+
u > 0 ? c = this.getAttentionScoresWithPast(h, d, t, u) : c = this.getAttentionScores(h, d, t);
|
|
112
|
+
const P = _(c, v), I = this.getOutputProjection(P, t), S = i ? c.mean(1) : void 0;
|
|
113
|
+
return this.endMemory("CausalSelfAttention"), { output: I, attention: S, presentKV: l };
|
|
106
114
|
});
|
|
107
115
|
}
|
|
108
116
|
dispose() {
|
|
@@ -110,5 +118,5 @@ class N extends j {
|
|
|
110
118
|
}
|
|
111
119
|
}
|
|
112
120
|
export {
|
|
113
|
-
|
|
121
|
+
K as default
|
|
114
122
|
};
|
package/dist/layers/MLP.d.ts
CHANGED
|
@@ -1,19 +1,18 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Tensor, Variable } from '@tensorflow/tfjs-core';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
3
|
import { default as BaseLayer } from './BaseLayer';
|
|
4
4
|
export default class MLP extends BaseLayer {
|
|
5
5
|
private cFc;
|
|
6
6
|
private cProj;
|
|
7
7
|
private dropout;
|
|
8
|
-
private tf;
|
|
9
8
|
private index;
|
|
10
9
|
private _trainable;
|
|
11
|
-
constructor(
|
|
12
|
-
get variables():
|
|
10
|
+
constructor(index: number, config: GPTConfig);
|
|
11
|
+
get variables(): Variable[];
|
|
13
12
|
get trainable(): boolean;
|
|
14
13
|
set trainable(value: boolean);
|
|
15
|
-
saveWeights(map: Map<string,
|
|
16
|
-
loadWeights(weights: Map<string,
|
|
17
|
-
call(x:
|
|
14
|
+
saveWeights(map: Map<string, Tensor[]>): void;
|
|
15
|
+
loadWeights(weights: Map<string, Tensor[]>): void;
|
|
16
|
+
call(x: Tensor, training?: boolean): Tensor;
|
|
18
17
|
dispose(): void;
|
|
19
18
|
}
|
package/dist/layers/MLP.js
CHANGED
|
@@ -1,32 +1,34 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
1
|
+
import { t as n } from "../index-pWA4_lUh.js";
|
|
2
|
+
import l from "./BaseLayer.js";
|
|
3
|
+
import { r as s } from "../random_width-oeUIlUZj.js";
|
|
4
|
+
import { d as i, a as c } from "../exports_layers-tbTBcwMM.js";
|
|
5
|
+
class u extends l {
|
|
3
6
|
cFc;
|
|
4
7
|
cProj;
|
|
5
8
|
dropout;
|
|
6
|
-
tf;
|
|
7
9
|
index;
|
|
8
10
|
_trainable = !0;
|
|
9
|
-
constructor(t,
|
|
10
|
-
super(), this.
|
|
11
|
+
constructor(t, e) {
|
|
12
|
+
super(), this.index = t, this.cFc = i({
|
|
11
13
|
units: e.mlpFactor * e.nEmbed,
|
|
12
14
|
activation: "gelu",
|
|
13
15
|
useBias: e.biasInLinear,
|
|
14
|
-
kernelInitializer:
|
|
16
|
+
kernelInitializer: s({
|
|
15
17
|
mean: 0,
|
|
16
18
|
stddev: 0.02
|
|
17
19
|
}),
|
|
18
20
|
biasInitializer: "zeros",
|
|
19
|
-
name: `block_${
|
|
20
|
-
}), this.cProj =
|
|
21
|
+
name: `block_${t}_mlp_cFc`
|
|
22
|
+
}), this.cProj = i({
|
|
21
23
|
units: e.nEmbed,
|
|
22
24
|
useBias: e.biasInLinear,
|
|
23
|
-
kernelInitializer:
|
|
25
|
+
kernelInitializer: s({
|
|
24
26
|
mean: 0,
|
|
25
27
|
stddev: 0.02 / Math.sqrt(2 * e.nLayer)
|
|
26
28
|
}),
|
|
27
29
|
biasInitializer: "zeros",
|
|
28
|
-
name: `block_${
|
|
29
|
-
}), this.dropout =
|
|
30
|
+
name: `block_${t}_mlp_cProj`
|
|
31
|
+
}), this.dropout = c({ rate: e.dropout });
|
|
30
32
|
}
|
|
31
33
|
get variables() {
|
|
32
34
|
return [
|
|
@@ -46,11 +48,11 @@ class l extends a {
|
|
|
46
48
|
loadWeights(t) {
|
|
47
49
|
this.cFc.setWeights(t.get(`block_${this.index}_mlpHidden`) || []), this.cProj.setWeights(t.get(`block_${this.index}_mlpOut`) || []);
|
|
48
50
|
}
|
|
49
|
-
call(t,
|
|
50
|
-
return
|
|
51
|
+
call(t, e = !1) {
|
|
52
|
+
return n(() => {
|
|
51
53
|
this.startMemory();
|
|
52
|
-
const
|
|
53
|
-
return this.endMemory("MLP"),
|
|
54
|
+
const r = this.cFc.apply(t), a = this.cProj.apply(r), o = this.dropout.apply(a, { training: e });
|
|
55
|
+
return this.endMemory("MLP"), o;
|
|
54
56
|
});
|
|
55
57
|
}
|
|
56
58
|
dispose() {
|
|
@@ -58,5 +60,5 @@ class l extends a {
|
|
|
58
60
|
}
|
|
59
61
|
}
|
|
60
62
|
export {
|
|
61
|
-
|
|
63
|
+
u as default
|
|
62
64
|
};
|
package/dist/layers/RMSNorm.d.ts
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Tensor, Variable } from '@tensorflow/tfjs-core';
|
|
2
2
|
import { default as BaseLayer } from './BaseLayer';
|
|
3
3
|
export default class RMSNorm extends BaseLayer {
|
|
4
4
|
private gamma;
|
|
5
5
|
private epsilon;
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
get trainableWeights(): TF.Variable[];
|
|
6
|
+
constructor(shape: number[], epsilon?: number, name?: string);
|
|
7
|
+
get trainableWeights(): Variable[];
|
|
9
8
|
set trainable(value: boolean);
|
|
10
|
-
getWeights():
|
|
11
|
-
setWeights(weights:
|
|
12
|
-
apply(x:
|
|
9
|
+
getWeights(): Tensor[];
|
|
10
|
+
setWeights(weights: Tensor[]): void;
|
|
11
|
+
apply(x: Tensor): Tensor;
|
|
13
12
|
dispose(): void;
|
|
14
13
|
}
|
package/dist/layers/RMSNorm.js
CHANGED
|
@@ -1,28 +1,30 @@
|
|
|
1
|
+
import { t as r } from "../index-pWA4_lUh.js";
|
|
1
2
|
import m from "./BaseLayer.js";
|
|
2
|
-
|
|
3
|
+
import { v as i } from "../variable-Dl_ub3pk.js";
|
|
4
|
+
import { o } from "../ones-Cog-G2ag.js";
|
|
5
|
+
class d extends m {
|
|
3
6
|
gamma;
|
|
4
7
|
epsilon;
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
super(), this.tf = t, this.epsilon = a, this.gamma = t.variable(t.ones(s), !0, `${e}_gamma`, "float32");
|
|
8
|
+
constructor(a, s = 1e-8, t = "") {
|
|
9
|
+
super(), this.epsilon = s, this.gamma = i(o(a), !0, `${t}_gamma`, "float32");
|
|
8
10
|
}
|
|
9
11
|
get trainableWeights() {
|
|
10
12
|
return [this.gamma];
|
|
11
13
|
}
|
|
12
|
-
set trainable(
|
|
13
|
-
this.gamma.trainable =
|
|
14
|
+
set trainable(a) {
|
|
15
|
+
this.gamma.trainable = a;
|
|
14
16
|
}
|
|
15
17
|
getWeights() {
|
|
16
18
|
return [this.gamma];
|
|
17
19
|
}
|
|
18
|
-
setWeights(
|
|
19
|
-
this.gamma.assign(
|
|
20
|
+
setWeights(a) {
|
|
21
|
+
this.gamma.assign(a[0]);
|
|
20
22
|
}
|
|
21
|
-
apply(
|
|
22
|
-
return
|
|
23
|
+
apply(a) {
|
|
24
|
+
return r(() => {
|
|
23
25
|
this.startMemory();
|
|
24
|
-
const
|
|
25
|
-
return this.endMemory("RMSNorm"),
|
|
26
|
+
const t = a.square().mean(-1, !0).add(this.epsilon).rsqrt(), e = a.mul(t).mul(this.gamma);
|
|
27
|
+
return this.endMemory("RMSNorm"), e;
|
|
26
28
|
});
|
|
27
29
|
}
|
|
28
30
|
dispose() {
|
|
@@ -30,5 +32,5 @@ class o extends m {
|
|
|
30
32
|
}
|
|
31
33
|
}
|
|
32
34
|
export {
|
|
33
|
-
|
|
35
|
+
d as default
|
|
34
36
|
};
|
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Tensor } from '@tensorflow/tfjs-core';
|
|
2
2
|
import { GPTConfig } from '../config';
|
|
3
3
|
export default class RoPECache {
|
|
4
|
-
private readonly tf;
|
|
5
4
|
private readonly config;
|
|
6
5
|
readonly rotaryDim: number;
|
|
7
6
|
private ropeBase;
|
|
@@ -9,9 +8,9 @@ export default class RoPECache {
|
|
|
9
8
|
private ropeCos;
|
|
10
9
|
private ropeSin;
|
|
11
10
|
private ropeCacheLen;
|
|
12
|
-
constructor(
|
|
11
|
+
constructor(config: GPTConfig);
|
|
13
12
|
ensureRopeCache(needed: number): void;
|
|
14
|
-
getCos():
|
|
15
|
-
getSin():
|
|
13
|
+
getCos(): Tensor | null;
|
|
14
|
+
getSin(): Tensor | null;
|
|
16
15
|
dispose(): void;
|
|
17
16
|
}
|
package/dist/layers/RoPECache.js
CHANGED
|
@@ -1,12 +1,36 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
1
|
+
import { o as h, h as c, E as f, I as l, f as n, J as m, t as u, x as p } from "../index-pWA4_lUh.js";
|
|
2
|
+
import { c as d, s as C } from "../sin-BJIrfnj7.js";
|
|
3
|
+
import { r as a } from "../range-CcDl05lo.js";
|
|
4
|
+
/**
|
|
5
|
+
* @license
|
|
6
|
+
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
7
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
8
|
+
* you may not use this file except in compliance with the License.
|
|
9
|
+
* You may obtain a copy of the License at
|
|
10
|
+
*
|
|
11
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
*
|
|
13
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
14
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
15
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
16
|
+
* See the License for the specific language governing permissions and
|
|
17
|
+
* limitations under the License.
|
|
18
|
+
* =============================================================================
|
|
19
|
+
*/
|
|
20
|
+
function x(r) {
|
|
21
|
+
const s = { x: c(r, "x", "reciprocal") };
|
|
22
|
+
return f.runKernel(l, s);
|
|
23
|
+
}
|
|
24
|
+
const S = /* @__PURE__ */ h({ reciprocal_: x });
|
|
25
|
+
class y {
|
|
26
|
+
constructor(o) {
|
|
27
|
+
this.config = o;
|
|
28
|
+
const s = this.config.nEmbed / this.config.nHead;
|
|
29
|
+
if (this.rotaryDim = s, this.rotaryDim % 2 !== 0)
|
|
6
30
|
throw new Error("rotaryDim must be even");
|
|
7
31
|
this.ropeBase = 1e4;
|
|
8
|
-
const
|
|
9
|
-
this.ropeInvFreq =
|
|
32
|
+
const i = a(0, this.rotaryDim, 2, "float32"), e = i.div(n(this.rotaryDim, "float32")), t = m(n(this.ropeBase, "float32"), e);
|
|
33
|
+
this.ropeInvFreq = S(t), e.dispose(), t.dispose(), i.dispose(), this.config.useRope === !1 ? (this.ropeCos = null, this.ropeSin = null, this.ropeCacheLen = 0) : u(() => {
|
|
10
34
|
this.ensureRopeCache(this.config.blockSize * 4);
|
|
11
35
|
});
|
|
12
36
|
}
|
|
@@ -18,11 +42,11 @@ class n {
|
|
|
18
42
|
ropeSin = null;
|
|
19
43
|
// [cacheLen, rotaryDim/2]
|
|
20
44
|
ropeCacheLen = 0;
|
|
21
|
-
ensureRopeCache(
|
|
22
|
-
if (
|
|
45
|
+
ensureRopeCache(o) {
|
|
46
|
+
if (o <= this.ropeCacheLen) return;
|
|
23
47
|
this.ropeCos && this.ropeCos.dispose(), this.ropeSin && this.ropeSin.dispose();
|
|
24
|
-
const
|
|
25
|
-
this.ropeCos =
|
|
48
|
+
const s = Math.max(o, this.ropeCacheLen + this.config.blockSize * 4), e = a(0, s, 1, "float32").expandDims(1).mul(this.ropeInvFreq.expandDims(0));
|
|
49
|
+
this.ropeCos = p(d(e).expandDims(-1)), this.ropeSin = p(C(e).expandDims(-1)), this.ropeCacheLen = s;
|
|
26
50
|
}
|
|
27
51
|
getCos() {
|
|
28
52
|
return this.ropeCos;
|
|
@@ -35,5 +59,5 @@ class n {
|
|
|
35
59
|
}
|
|
36
60
|
}
|
|
37
61
|
export {
|
|
38
|
-
|
|
62
|
+
y as default
|
|
39
63
|
};
|
|
@@ -1,20 +1,19 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { Tensor, Variable } from '@tensorflow/tfjs-core';
|
|
2
2
|
export default class TiedEmbeddingOutputLayer {
|
|
3
3
|
private vocabSize;
|
|
4
4
|
private embedDim;
|
|
5
|
-
private tf;
|
|
6
5
|
private tiedWeights;
|
|
7
6
|
private initializer;
|
|
8
|
-
constructor(
|
|
7
|
+
constructor(config: {
|
|
9
8
|
vocabSize: number;
|
|
10
9
|
embedDim: number;
|
|
11
10
|
name?: string;
|
|
12
11
|
}, name?: string);
|
|
13
|
-
get variables():
|
|
14
|
-
embed(inputs:
|
|
15
|
-
project(inputs:
|
|
16
|
-
getWeights():
|
|
17
|
-
setWeights(weights:
|
|
12
|
+
get variables(): Variable[];
|
|
13
|
+
embed(inputs: Tensor): Tensor;
|
|
14
|
+
project(inputs: Tensor): Tensor;
|
|
15
|
+
getWeights(): Tensor[];
|
|
16
|
+
setWeights(weights: Tensor[]): void;
|
|
18
17
|
getConfig(): {
|
|
19
18
|
vocabSize: number;
|
|
20
19
|
embedDim: number;
|