@genai-fi/nanogpt 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,8 +23,8 @@ class nt extends y {
   projUnits;
   constructor(t, s) {
     super(s), this.index = t, this.units = s.gpt.nEmbed * 3, this.projUnits = s.gpt.nEmbed, this.bias = M.bandPart(q([s.gpt.blockSize, s.gpt.blockSize]), -1, 0).cast("bool"), this.divisor = 1 / Math.sqrt(s.gpt.nEmbed / s.gpt.nHead);
-    const e = B([s.gpt.blockSize, s.gpt.blockSize]),
-    this.maskInf = O(this.bias, e,
+    const e = B([s.gpt.blockSize, s.gpt.blockSize]), o = $([s.gpt.blockSize, s.gpt.blockSize], Number.NEGATIVE_INFINITY);
+    this.maskInf = O(this.bias, e, o);
   }
   build() {
     this.cAttn === null && (this.cAttn = g(
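
Note on the hunk above: in 0.4.1 the constructor precomputes an additive attention mask from the lower-triangular bias. Reading the minified helpers as their likely @tensorflow/tfjs counterparts (q as tf.ones, B as tf.zeros, $ as tf.fill, M.bandPart as tf.linalg.bandPart, O as tf.where; these mappings are assumptions, not something the diff confirms), a minimal sketch of the equivalent logic:

    // Sketch only: the helper names above are assumptions inferred from the call shapes.
    import * as tf from '@tensorflow/tfjs';

    function buildCausalMask(blockSize) {
      // Lower-triangular boolean matrix: token t may attend to positions <= t.
      const bias = tf.linalg.bandPart(tf.ones([blockSize, blockSize]), -1, 0).cast('bool');
      // Additive mask: 0 where attention is allowed, -Infinity where it is blocked.
      const maskInf = tf.where(
        bias,
        tf.zeros([blockSize, blockSize]),
        tf.fill([blockSize, blockSize], Number.NEGATIVE_INFINITY)
      );
      return { bias, maskInf };
    }
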
@@ -57,87 +57,87 @@ class nt extends y {
     if (!e) throw new Error(`Weights for block_${this.index}_cProj not found`);
     this.cAttn ? this.cAttn.assign(s) : this.cAttn = g(s, !0), this.cProj ? this.cProj.assign(e) : this.cProj = g(e, !0);
   }
-  getAttentionScores(t, s, e,
-    const
-    return S(
+  getAttentionScores(t, s, e, o) {
+    const n = I(t, s, this.maskInf, this.divisor);
+    return S(n, e ? this.config.gpt.dropout : 0, o);
   }
   // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
-  getAttentionScoresWithPast(t, s, e
-    const
-    let
-    if (
+  getAttentionScoresWithPast(t, s, e) {
+    const o = t.shape[2];
+    let i = C(t, s, !1, !0).mul(this.divisor);
+    if (o > 1 && e > 0)
       throw new Error("Cannot use past with T_cur > 1");
-    if (
-      const
-
+    if (o > 1) {
+      const r = this.maskInf.slice([0, 0], [o, o]).expandDims(0).expandDims(0);
+      i = i.add(r);
     }
-    return S(
+    return S(i, 0, 0);
   }
   getQKV(t) {
     return z(t, this.cAttn, this.config.gpt.nHead);
   }
   getOutputProjection(t) {
-    const s = t.shape[0], e = t.shape[2],
+    const s = t.shape[0], e = t.shape[2], o = this.config.gpt.nEmbed, n = t.transpose([0, 2, 1, 3]), i = D(n, [s, e, o]);
     return N(i, this.cProj);
   }
   updateCache(t, s, e) {
-    const
+    const o = this.config.gpt.blockSize, n = t.shape[2], i = Math.min(e?.length || 0, o - n), r = e ? E(e.k, t, o) : t, a = e ? E(e.v, s, o) : s;
     return {
-      k: _(
-      v: _(
-      length: i +
-      cumulativeLength: e ? e.cumulativeLength +
+      k: _(r),
+      v: _(a),
+      length: i + n,
+      cumulativeLength: e ? e.cumulativeLength + n : n
     };
   }
-  forward(t, s = !1, e,
+  forward(t, s = !1, e, o = !1, n) {
     return x(() => {
       this.startMemory();
-      const [i,
-
-      const f =
-
-      let
-      f > 0 ?
-      const k = C(
+      const [i, r, a] = this.getQKV(t), u = n ? n.cumulativeLength : 0, c = this.config.layerConfig.ropeCache, d = c ? P(i, c, u) : i, h = c ? P(r, c, u) : r;
+      c && (i.dispose(), r.dispose());
+      const f = n ? n.length : 0, l = this.updateCache(h, a, n), m = l.k, b = l.v;
+      n && (h.dispose(), a.dispose());
+      let p;
+      f > 0 ? p = this.getAttentionScoresWithPast(d, m, f) : p = this.getAttentionScores(d, m, s, e);
+      const k = C(p, b), A = this.getOutputProjection(k), w = o ? p.mean(1) : void 0;
       return this.endMemory("CausalSelfAttention"), { output: A, attention: w, presentKV: l };
     });
   }
-  call(t, s = !1, e = !1,
-    if (
+  call(t, s = !1, e = !1, o) {
+    if (o && !this.config.gpt.useRope)
       throw new Error("Cannot use pastKV without RoPE enabled");
-    if (s &&
+    if (s && o)
       throw new Error("Cannot use pastKV during training");
     if (t.shape.length !== 3)
       throw new Error(`Input tensor must be rank 3 [B, T, C], got shape ${t.shape}`);
     if (t.shape[2] !== this.config.gpt.nEmbed)
      throw new Error(`Input tensor last dimension must be ${this.config.gpt.nEmbed}, got ${t.shape[2]}`);
     this.build();
-    const
+    const n = Math.random() * 1e9;
     if (s && this.config.layerConfig.checkpointAttention) {
-      const
+      const r = L(
         // @ts-expect-error Invalid params
-        (
-          const
-
+        (a, u, c, d) => {
+          const h = this.forward(a, !0, n);
+          h.presentKV?.k.dispose(), h.presentKV?.v.dispose(), d([a]);
           const f = (l, m) => {
-            const [b] = m,
+            const [b] = m, p = v().state.activeTape;
             v().state.activeTape = [];
             const k = W((A, w, H) => {
-              const j = this.forward(A, !0,
+              const j = this.forward(A, !0, n);
               return j.presentKV?.k.dispose(), j.presentKV?.v.dispose(), j.output;
-            })([b,
-            return v().state.activeTape =
+            })([b, u, c], l);
+            return v().state.activeTape = p, k;
           };
-          return { value:
+          return { value: h.output, gradFunc: f };
         }
       )(t, this.cAttn, this.cProj);
       if (this.config.gpt.dropout > 0) {
-        const
-        return
+        const a = U(r, this.config.gpt.dropout);
+        return r.dispose(), { output: a };
       } else
-        return { output:
+        return { output: r };
     } else
-      return this.forward(t, s,
+      return this.forward(t, s, n, e, o);
   }
   dispose() {
     this.cAttn?.dispose(), this.cProj?.dispose(), this.bias.dispose(), this.maskInf.dispose();
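
Note on the hunk above: 0.4.1 threads an optional KV cache through forward() and call(); RoPE offsets come from cumulativeLength, and pastKV is rejected during training or when RoPE is disabled. The commented rule (with a cached past only a single-token query is allowed, so no causal mask is needed) is what getAttentionScoresWithPast implements. A minimal sketch, assuming C maps to tf.matMul and S to a softmax helper; the real minified helpers may differ:

    // Sketch only: mirrors getAttentionScoresWithPast from the hunk above, with assumed tfjs equivalents.
    import * as tf from '@tensorflow/tfjs';

    function attentionScoresWithPast(q, k, maskInf, divisor, pastLen) {
      const tCur = q.shape[2];                                 // q: [B, nH, T_cur, headDim]
      let scores = tf.matMul(q, k, false, true).mul(divisor);  // [B, nH, T_cur, T_total]
      if (tCur > 1 && pastLen > 0)
        throw new Error('Cannot use past with T_cur > 1');
      if (tCur > 1) {
        // Prompt processing without a past: apply the precomputed additive causal mask.
        const mask = maskInf.slice([0, 0], [tCur, tCur]).expandDims(0).expandDims(0);
        scores = scores.add(mask);
      }
      // Single-token decode with a past needs no mask: every cached position is in the past.
      return tf.softmax(scores);
    }
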
@@ -3858,8 +3858,8 @@ class Io {
   variableNames = ["logits", "maxLogits"];
   outputShape;
   userCode;
-  constructor(e
-    this.outputShape =
+  constructor(e) {
+    this.outputShape = e, this.userCode = `
   void main() {
     ivec4 coords = getOutputCoords(); // [batch, nh, t1, t2]
     int b = coords.x;
@@ -3881,8 +3881,8 @@ class xo {
     { name: "dropoutRate", type: "float" },
     { name: "seed", type: "float" }
   ];
-  constructor(e
-    this.outputShape =
+  constructor(e) {
+    this.outputShape = e, this.userCode = `
   float random(ivec4 coords) {
     float x = float(coords.x * 4096 + coords.y * 256 + coords.z * 16 + coords.w);
     return fract(sin(seed + x) * 43758.5453123);
@@ -3908,16 +3908,16 @@ function So(t) {
     inputs: { x: o },
     backend: a,
     attrs: { reductionIndices: u, keepDims: !1 }
-  }), h = vt(c.shape, u), f =
+  }), h = vt(c.shape, u), f = new Io(o.shape), w = a.runWebGLProgram(f, [o, c], "float32"), p = co({ inputs: { x: w }, backend: a, attrs: { axis: u, keepDims: !1 } }), m = st({ inputs: { x: p }, backend: a, attrs: { shape: h } });
   if (r !== void 0 && r > 0) {
-    const
+    const d = new xo(o.shape), g = a.runWebGLProgram(d, [w, m], "float32", [
       [r],
       [i ?? Math.random() * 1e4]
     ]);
-    return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(
+    return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(w), a.disposeIntermediateTensorInfo(p), a.disposeIntermediateTensorInfo(m), g;
   }
-  const
-  return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(
+  const b = wo({ inputs: { a: w, b: m }, backend: a });
+  return a.disposeIntermediateTensorInfo(c), a.disposeIntermediateTensorInfo(w), a.disposeIntermediateTensorInfo(p), a.disposeIntermediateTensorInfo(m), b;
 }
 const bo = {
   kernelName: "FusedSoftmax",
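
Note on the last three hunks: class Io is the WebGL program that computes exp(logits - maxLogits), class xo fuses the normalizing division with a seed-keyed pseudo-random drop, and So() now disposes every intermediate tensor it creates. A reference sketch of what the fused path computes, written with stock tfjs ops rather than the actual shader code:

    // Sketch only: reference semantics for the FusedSoftmax kernel path, using plain tfjs ops.
    import * as tf from '@tensorflow/tfjs';

    function fusedSoftmaxReference(logits, dropoutRate = 0, seed) {
      const max = tf.max(logits, -1, true);        // maxLogits, for numerical stability
      const exps = tf.exp(tf.sub(logits, max));    // what program Io produces
      const sum = tf.sum(exps, -1, true);
      const soft = tf.div(exps, sum);              // the plain division on the no-dropout path
      // Program xo fuses the division with a seed-keyed drop; tf.dropout stands in for it here.
      return dropoutRate > 0 ? tf.dropout(soft, dropoutRate, undefined, seed) : soft;
    }
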