@genai-fi/nanogpt 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/Generator.js
CHANGED
|
@@ -14,17 +14,17 @@ class w extends u {
|
|
|
14
14
|
async generateNoCache(i, t) {
|
|
15
15
|
let s = await this.tokenisePrompt(i), o = i || "";
|
|
16
16
|
const n = t?.maxLength ?? 1e3;
|
|
17
|
-
for (let
|
|
17
|
+
for (let r = 0; r < n && this.active; r++) {
|
|
18
18
|
const {
|
|
19
19
|
output: e,
|
|
20
|
-
attention:
|
|
21
|
-
probabilities:
|
|
20
|
+
attention: a,
|
|
21
|
+
probabilities: c
|
|
22
22
|
} = this.model.generate(s, void 0, t), h = s;
|
|
23
23
|
s = p([s, e], 1), h.dispose();
|
|
24
|
-
const
|
|
25
|
-
if (e.dispose(),
|
|
24
|
+
const l = await this.processResponse(e, a, c);
|
|
25
|
+
if (e.dispose(), l === null)
|
|
26
26
|
break;
|
|
27
|
-
o +=
|
|
27
|
+
o += l;
|
|
28
28
|
}
|
|
29
29
|
return s.dispose(), o;
|
|
30
30
|
}
|
|
@@ -33,31 +33,31 @@ class w extends u {
|
|
|
33
33
|
if (o === this.tokeniser.eosToken)
|
|
34
34
|
return null;
|
|
35
35
|
const n = await this.tokeniser.decode([o]);
|
|
36
|
-
let
|
|
37
|
-
t && (
|
|
36
|
+
let r;
|
|
37
|
+
t && (r = await Promise.all(t.map((a) => a.array().then((c) => c))), t.forEach((a) => a.dispose()));
|
|
38
38
|
let e;
|
|
39
|
-
return s && (e = await s.array(), s.dispose()), this.emit("tokens", [o], n,
|
|
39
|
+
return s && (e = await s.array(), s.dispose()), this.emit("tokens", [o], n, r, e), n;
|
|
40
40
|
}
|
|
41
41
|
async generateCache(i, t) {
|
|
42
42
|
let s = await this.tokenisePrompt(i), o = i || "";
|
|
43
43
|
const n = new Array(this.model.config.gpt.nLayer);
|
|
44
44
|
for (let e = 0; e < this.model.config.gpt.nLayer; e++)
|
|
45
45
|
n[e] = { k: void 0, v: void 0, length: 0, cumulativeLength: 0 };
|
|
46
|
-
const
|
|
47
|
-
for (let e = 0; e <
|
|
46
|
+
const r = t?.maxLength ?? 1e3;
|
|
47
|
+
for (let e = 0; e < r && this.active; e++) {
|
|
48
48
|
const {
|
|
49
|
-
output:
|
|
50
|
-
|
|
51
|
-
|
|
49
|
+
output: a,
|
|
50
|
+
probabilities: c,
|
|
51
|
+
attention: h
|
|
52
52
|
} = this.model.generate(s, n, {
|
|
53
53
|
...t,
|
|
54
54
|
usePadding: !1
|
|
55
55
|
});
|
|
56
|
-
s.dispose(), s =
|
|
57
|
-
const
|
|
58
|
-
if (
|
|
56
|
+
s.dispose(), s = a;
|
|
57
|
+
const l = await this.processResponse(a, h, c);
|
|
58
|
+
if (l === null)
|
|
59
59
|
break;
|
|
60
|
-
o +=
|
|
60
|
+
o += l;
|
|
61
61
|
}
|
|
62
62
|
return n.forEach((e) => {
|
|
63
63
|
e && (e.k && e.k.dispose(), e.v && e.v.dispose());
|
package/dist/NanoGPTModel.d.ts
CHANGED
|
@@ -14,7 +14,7 @@ export interface GenerateOptions {
|
|
|
14
14
|
temperature?: number;
|
|
15
15
|
topK?: number;
|
|
16
16
|
usePadding?: boolean;
|
|
17
|
-
attentionScores?:
|
|
17
|
+
attentionScores?: boolean;
|
|
18
18
|
includeProbabilities?: boolean;
|
|
19
19
|
}
|
|
20
20
|
export interface ModelForwardAttributes extends ForwardAttributes {
|
|
@@ -41,8 +41,8 @@ export default class NanoGPT extends BaseLayer<ModelForwardAttributes> {
|
|
|
41
41
|
forward(attrs: ModelForwardAttributes, idx: Tensor, targets?: Tensor): Tensor[];
|
|
42
42
|
generate(idx: Tensor, cache?: KVCache[], options?: GenerateOptions): {
|
|
43
43
|
output: Tensor;
|
|
44
|
-
attention?: Tensor;
|
|
45
44
|
probabilities?: Tensor;
|
|
45
|
+
attention?: Tensor[];
|
|
46
46
|
};
|
|
47
47
|
getNumParams(): number;
|
|
48
48
|
dispose(): void;
|
package/dist/NanoGPTModel.js
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
import { defaultConfig as L } from "./config.js";
|
|
2
|
-
import
|
|
3
|
-
import { E as
|
|
2
|
+
import v from "./layers/TransformerBlock.js";
|
|
3
|
+
import { E as T, D as q, T as K, r as P, p as _ } from "./TiedEmbedding-DsDRvLB0.js";
|
|
4
4
|
import F from "./layers/RoPECache.js";
|
|
5
5
|
import D from "./layers/RMSNorm.js";
|
|
6
|
-
import { estimateParameterCount as
|
|
7
|
-
import { createSoftmaxCrossEntropyWithGrad as
|
|
8
|
-
import { B } from "./BaseLayer-BhrMN8JO.js";
|
|
9
|
-
import { o as
|
|
10
|
-
import { r as
|
|
11
|
-
import { r as
|
|
12
|
-
import { g as
|
|
13
|
-
import { s as
|
|
6
|
+
import { estimateParameterCount as O } from "./utilities/parameters.js";
|
|
7
|
+
import { createSoftmaxCrossEntropyWithGrad as N } from "./training/sparseCrossEntropy.js";
|
|
8
|
+
import { B as R } from "./BaseLayer-BhrMN8JO.js";
|
|
9
|
+
import { o as E, i as d, q as B, E as y, aa as G, ab as V, ac as j, t as w, a9 as A, f as z, F as W } from "./index-iNhkcAEQ.js";
|
|
10
|
+
import { r as C } from "./reshape-DxTPgnwL.js";
|
|
11
|
+
import { r as H } from "./range-BsFU-SNG.js";
|
|
12
|
+
import { g as J } from "./gather-Bxe1Qip8.js";
|
|
13
|
+
import { s as Q } from "./softmax-BjsptB07.js";
|
|
14
14
|
/**
|
|
15
15
|
* @license
|
|
16
16
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -27,13 +27,13 @@ import { s as U } from "./softmax-BjsptB07.js";
|
|
|
27
27
|
* limitations under the License.
|
|
28
28
|
* =============================================================================
|
|
29
29
|
*/
|
|
30
|
-
function
|
|
31
|
-
let e =
|
|
32
|
-
[e, o] =
|
|
30
|
+
function U(h, t) {
|
|
31
|
+
let e = d(h, "a", "mod"), o = d(t, "b", "mod");
|
|
32
|
+
[e, o] = B(e, o);
|
|
33
33
|
const n = { a: e, b: o };
|
|
34
|
-
return
|
|
34
|
+
return y.runKernel(G, n);
|
|
35
35
|
}
|
|
36
|
-
const
|
|
36
|
+
const X = /* @__PURE__ */ E({ mod_: U });
|
|
37
37
|
/**
|
|
38
38
|
* @license
|
|
39
39
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -50,17 +50,17 @@ const Y = /* @__PURE__ */ k({ mod_: X });
|
|
|
50
50
|
* limitations under the License.
|
|
51
51
|
* =============================================================================
|
|
52
52
|
*/
|
|
53
|
-
function
|
|
54
|
-
const n =
|
|
53
|
+
function Y(h, t, e, o = !1) {
|
|
54
|
+
const n = d(h, "logits", "multinomial"), s = n.size, i = n.rank;
|
|
55
55
|
if (s < 2)
|
|
56
56
|
throw new Error(`Error in multinomial: you need at least 2 outcomes, but got ${s}.`);
|
|
57
57
|
if (i > 2)
|
|
58
58
|
throw new Error(`Rank of probabilities must be 1 or 2, but is ${i}`);
|
|
59
59
|
e = e || Math.random();
|
|
60
|
-
const
|
|
61
|
-
return i === 1 ?
|
|
60
|
+
const c = { logits: i === 1 ? C(n, [1, -1]) : n }, l = { numSamples: t, seed: e, normalized: o }, a = y.runKernel(V, c, l);
|
|
61
|
+
return i === 1 ? C(a, [a.size]) : a;
|
|
62
62
|
}
|
|
63
|
-
const
|
|
63
|
+
const I = /* @__PURE__ */ E({ multinomial_: Y });
|
|
64
64
|
/**
|
|
65
65
|
* @license
|
|
66
66
|
* Copyright 2018 Google LLC. All Rights Reserved.
|
|
@@ -77,8 +77,8 @@ const z = /* @__PURE__ */ k({ multinomial_: Z });
|
|
|
77
77
|
* limitations under the License.
|
|
78
78
|
* =============================================================================
|
|
79
79
|
*/
|
|
80
|
-
function
|
|
81
|
-
const o =
|
|
80
|
+
function Z(h, t = 1, e = !0) {
|
|
81
|
+
const o = d(h, "x", "topk");
|
|
82
82
|
if (o.rank === 0)
|
|
83
83
|
throw new Error("topk() expects the input to be of rank 1 or higher");
|
|
84
84
|
const n = o.shape[o.shape.length - 1];
|
|
@@ -86,10 +86,10 @@ function tt(h, t = 1, e = !0) {
|
|
|
86
86
|
throw new Error(`'k' passed to topk() must be >= 0 but got ${t}`);
|
|
87
87
|
if (t > n)
|
|
88
88
|
throw new Error(`'k' passed to topk() must be <= the last dimension (${n}) but got ${t}`);
|
|
89
|
-
const s = { x: o }, i = { k: t, sorted: e }, [r,
|
|
90
|
-
return { values: r, indices:
|
|
89
|
+
const s = { x: o }, i = { k: t, sorted: e }, [r, c] = y.runKernel(j, s, i);
|
|
90
|
+
return { values: r, indices: c };
|
|
91
91
|
}
|
|
92
|
-
const
|
|
92
|
+
const tt = /* @__PURE__ */ E({ topk_: Z });
|
|
93
93
|
/**
|
|
94
94
|
* @license
|
|
95
95
|
* Copyright 2018 Google LLC
|
|
@@ -99,13 +99,13 @@ const et = /* @__PURE__ */ k({ topk_: tt });
|
|
|
99
99
|
* https://opensource.org/licenses/MIT.
|
|
100
100
|
* =============================================================================
|
|
101
101
|
*/
|
|
102
|
+
function et(h) {
|
|
103
|
+
return new q(h);
|
|
104
|
+
}
|
|
102
105
|
function ot(h) {
|
|
103
106
|
return new T(h);
|
|
104
107
|
}
|
|
105
|
-
|
|
106
|
-
return new O(h);
|
|
107
|
-
}
|
|
108
|
-
class bt extends B {
|
|
108
|
+
class dt extends R {
|
|
109
109
|
wte;
|
|
110
110
|
// Token embeddings
|
|
111
111
|
wpe;
|
|
@@ -119,14 +119,14 @@ class bt extends B {
|
|
|
119
119
|
log = [];
|
|
120
120
|
// Training log
|
|
121
121
|
constructor(t = {}) {
|
|
122
|
-
super({ gpt: { ...L, ...t }, layerConfig: {} }), this.wte = new K(this.config, "token_embedding", this), this.config.gpt.useRope === !1 ? this.wpe =
|
|
122
|
+
super({ gpt: { ...L, ...t }, layerConfig: {} }), this.wte = new K(this.config, "token_embedding", this), this.config.gpt.useRope === !1 ? this.wpe = ot({
|
|
123
123
|
inputDim: this.config.gpt.blockSize,
|
|
124
124
|
outputDim: this.config.gpt.nEmbed,
|
|
125
125
|
name: "positional_embedding",
|
|
126
126
|
embeddingsInitializer: P({ mean: 0, stddev: 0.02 })
|
|
127
|
-
}) : (this.ropeCache = new F(this.config.gpt), this.config.layerConfig.ropeCache = this.ropeCache), this.drop =
|
|
127
|
+
}) : (this.ropeCache = new F(this.config.gpt), this.config.layerConfig.ropeCache = this.ropeCache), this.drop = et({ rate: this.config.gpt.dropout }), this.blocks = [];
|
|
128
128
|
for (let e = 0; e < this.config.gpt.nLayer; e++)
|
|
129
|
-
this.blocks.push(new
|
|
129
|
+
this.blocks.push(new v(e, this.config, this));
|
|
130
130
|
this.lnF = new D(this.config, "final_rms_norm", this);
|
|
131
131
|
}
|
|
132
132
|
get checkpointing() {
|
|
@@ -136,11 +136,11 @@ class bt extends B {
|
|
|
136
136
|
this.config.layerConfig.checkpointing = t;
|
|
137
137
|
}
|
|
138
138
|
inputPhase(t, e, o = !1) {
|
|
139
|
-
return
|
|
139
|
+
return w(() => {
|
|
140
140
|
const n = this.wte.embed(t);
|
|
141
141
|
if (this.config.gpt.useRope === !1) {
|
|
142
|
-
const [, s] = t.shape, i = this.config.gpt.blockSize, r =
|
|
143
|
-
return this.drop.apply(
|
|
142
|
+
const [, s] = t.shape, i = this.config.gpt.blockSize, r = H(0, s, 1, "int32"), c = X(A(r, z(e, "int32")), z(i, "int32")), l = this.wpe.apply(c), a = n.add(l);
|
|
143
|
+
return this.drop.apply(a, { training: o });
|
|
144
144
|
} else
|
|
145
145
|
return this.drop.apply(n, { training: o });
|
|
146
146
|
});
|
|
@@ -167,7 +167,7 @@ class bt extends B {
|
|
|
167
167
|
}
|
|
168
168
|
calculateLoss(t, e) {
|
|
169
169
|
try {
|
|
170
|
-
return
|
|
170
|
+
return N()(t, e).mean();
|
|
171
171
|
} catch (o) {
|
|
172
172
|
throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
|
|
173
173
|
}
|
|
@@ -205,7 +205,7 @@ class bt extends B {
|
|
|
205
205
|
});
|
|
206
206
|
}*/
|
|
207
207
|
forward(t, e, o) {
|
|
208
|
-
return this.validateInput(e),
|
|
208
|
+
return this.validateInput(e), w(() => {
|
|
209
209
|
this.startMemory();
|
|
210
210
|
const n = t.cache?.[0]?.length ?? 0;
|
|
211
211
|
let s = this.inputPhase(e, n, t.training);
|
|
@@ -213,59 +213,61 @@ class bt extends B {
|
|
|
213
213
|
throw console.error("Cache", t.cache), new Error(
|
|
214
214
|
`Cache length ${t.cache.length} does not match number of blocks ${this.blocks.length}`
|
|
215
215
|
);
|
|
216
|
-
let i;
|
|
217
216
|
for (let c = 0; c < this.blocks.length; c++) {
|
|
218
|
-
const l = this.blocks[c],
|
|
217
|
+
const l = this.blocks[c], a = Math.random() * 1e9, u = {
|
|
219
218
|
training: t.training,
|
|
220
|
-
seed:
|
|
219
|
+
seed: a,
|
|
221
220
|
attentionScores: t.attentionScores,
|
|
222
221
|
pastKV: t.cache ? t.cache[c] : void 0
|
|
223
|
-
},
|
|
224
|
-
s.dispose(), s =
|
|
222
|
+
}, p = this.config.layerConfig.checkpointing && t.training ? l.callCheckpoint(u, s) : l.call(u, s);
|
|
223
|
+
s.dispose(), s = p;
|
|
225
224
|
}
|
|
226
225
|
s = this.lnF.call(t, s);
|
|
227
|
-
const
|
|
226
|
+
const i = this.wte.project(s);
|
|
228
227
|
s.dispose();
|
|
229
|
-
let
|
|
230
|
-
return o && (
|
|
228
|
+
let r;
|
|
229
|
+
return o && (r = this.calculateLoss(i, o)), this.endMemory("Forward"), r ? [i, r] : [i];
|
|
231
230
|
});
|
|
232
231
|
}
|
|
233
232
|
generate(t, e, o) {
|
|
234
233
|
const n = o?.temperature ?? 1, s = o?.topK, i = o?.usePadding ?? !1;
|
|
235
|
-
return
|
|
236
|
-
const r = t,
|
|
237
|
-
[0,
|
|
234
|
+
return w(() => {
|
|
235
|
+
const r = t, c = r.shape[1], l = c <= this.config.gpt.blockSize ? r : r.slice(
|
|
236
|
+
[0, c - this.config.gpt.blockSize],
|
|
238
237
|
[r.shape[0], this.config.gpt.blockSize]
|
|
239
|
-
),
|
|
238
|
+
), a = i ? this.config.gpt.blockSize - l.shape[1] : 0, u = a > 0 ? _(l, [
|
|
240
239
|
[0, 0],
|
|
241
|
-
[0,
|
|
242
|
-
]) :
|
|
240
|
+
[0, a]
|
|
241
|
+
]) : l, p = {
|
|
243
242
|
training: !1,
|
|
244
|
-
attentionScores: o?.attentionScores
|
|
243
|
+
attentionScores: o?.attentionScores ? {
|
|
244
|
+
attentionOut: []
|
|
245
|
+
} : void 0,
|
|
245
246
|
cache: e
|
|
246
|
-
}, [
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
247
|
+
}, [f] = this.forward(p, u), S = f.shape[1] - 1 - a, M = f.slice([0, S, 0], [f.shape[0], 1, f.shape[2]]);
|
|
248
|
+
p.attentionScores?.attentionOut && p.attentionScores.attentionOut.forEach((g, k) => {
|
|
249
|
+
g.shape[1] !== 1 && (p.attentionScores.attentionOut[k] = W(
|
|
250
|
+
g.slice([0, S, 0], [g.shape[0], 1, g.shape[2]])
|
|
251
|
+
), g.dispose());
|
|
252
|
+
}), f.dispose();
|
|
253
|
+
const b = M.div(n);
|
|
254
|
+
let m;
|
|
253
255
|
if (s) {
|
|
254
|
-
const { values:
|
|
255
|
-
|
|
256
|
+
const { values: g, indices: k } = tt(b, s), x = I(g.squeeze([1]), 1);
|
|
257
|
+
m = J(k.squeeze([1]), x, 1);
|
|
256
258
|
} else
|
|
257
|
-
|
|
258
|
-
let
|
|
259
|
-
return o?.includeProbabilities && (
|
|
259
|
+
m = I(b.squeeze([1]), 1);
|
|
260
|
+
let $;
|
|
261
|
+
return o?.includeProbabilities && ($ = Q(b.squeeze([1]))), m = m.reshape([1, 1]), { output: m, probabilities: $, attention: p.attentionScores?.attentionOut };
|
|
260
262
|
});
|
|
261
263
|
}
|
|
262
264
|
getNumParams() {
|
|
263
|
-
return
|
|
265
|
+
return O(this.config.gpt);
|
|
264
266
|
}
|
|
265
267
|
dispose() {
|
|
266
268
|
this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
|
|
267
269
|
}
|
|
268
270
|
}
|
|
269
271
|
export {
|
|
270
|
-
|
|
272
|
+
dt as default
|
|
271
273
|
};
|
|
@@ -7,9 +7,8 @@ export type KVCache = {
|
|
|
7
7
|
cumulativeLength: number;
|
|
8
8
|
};
|
|
9
9
|
export interface AttentionScores {
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
attentionOut?: Tensor;
|
|
10
|
+
meanOfHeads?: boolean;
|
|
11
|
+
attentionOut?: Tensor[];
|
|
13
12
|
}
|
|
14
13
|
interface AttentionForwardAttributes extends ForwardAttributes {
|
|
15
14
|
attentionScores?: AttentionScores;
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import { attentionMask as
|
|
2
|
-
import { B as O, v
|
|
1
|
+
import { attentionMask as g } from "../ops/attentionMask.js";
|
|
2
|
+
import { B as O, v } from "../BaseLayer-BhrMN8JO.js";
|
|
3
3
|
import { qkv as P } from "../ops/qkv.js";
|
|
4
|
-
import { rope as
|
|
5
|
-
import { appendCache as
|
|
4
|
+
import { rope as V } from "../ops/rope.js";
|
|
5
|
+
import { appendCache as T } from "../ops/appendCache.js";
|
|
6
6
|
import { F as c, t as C } from "../index-iNhkcAEQ.js";
|
|
7
|
-
import { fusedSoftmax as
|
|
7
|
+
import { fusedSoftmax as b } from "../ops/fusedSoftmax.js";
|
|
8
8
|
import { d as y } from "../tfjs_backend-NucKez4s.js";
|
|
9
9
|
import { r as k, d as L } from "../dropout-kbDY39Ci.js";
|
|
10
10
|
import { r as N } from "../reshape-DxTPgnwL.js";
|
|
@@ -22,14 +22,14 @@ class W extends O {
|
|
|
22
22
|
build() {
|
|
23
23
|
this.hasVariable(this.ATTN) === !1 && this.setVariable(
|
|
24
24
|
this.ATTN,
|
|
25
|
-
|
|
25
|
+
v(
|
|
26
26
|
k([this.config.gpt.nEmbed, this.units], 0, 0.02),
|
|
27
27
|
!0
|
|
28
28
|
//`block_${this.index}_attn_cAttn_kernel`
|
|
29
29
|
)
|
|
30
30
|
), this.hasVariable(this.PROJ) === !1 && this.setVariable(
|
|
31
31
|
this.PROJ,
|
|
32
|
-
|
|
32
|
+
v(
|
|
33
33
|
k([this.projUnits, this.config.gpt.nEmbed], 0, 0.02),
|
|
34
34
|
!0
|
|
35
35
|
//`block_${this.index}_attn_cProj_kernel`
|
|
@@ -37,12 +37,12 @@ class W extends O {
|
|
|
37
37
|
);
|
|
38
38
|
}
|
|
39
39
|
getAttentionScores(t, i, s, o) {
|
|
40
|
-
const e =
|
|
40
|
+
const e = g(t, i, this.divisor), n = b(e, s ? this.config.gpt.dropout : 0, o);
|
|
41
41
|
return e.dispose(), n;
|
|
42
42
|
}
|
|
43
43
|
// Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
|
|
44
44
|
getAttentionScoresWithPast(t, i, s) {
|
|
45
|
-
const o =
|
|
45
|
+
const o = g(t, i, this.divisor, s), e = b(o, 0, 0);
|
|
46
46
|
return o.dispose(), e;
|
|
47
47
|
}
|
|
48
48
|
getQKV(t) {
|
|
@@ -53,9 +53,9 @@ class W extends O {
|
|
|
53
53
|
return n.dispose(), e.dispose(), p;
|
|
54
54
|
}
|
|
55
55
|
updateCache(t, i, s) {
|
|
56
|
-
const o = this.config.gpt.blockSize, e = t.shape[2], n = s.length || 0, p =
|
|
56
|
+
const o = this.config.gpt.blockSize, e = t.shape[2], n = s.length || 0, p = T(t, o, n, s.k);
|
|
57
57
|
t.dispose(), s.k && s.k.dispose();
|
|
58
|
-
const r =
|
|
58
|
+
const r = T(i, o, n, s.v);
|
|
59
59
|
i.dispose(), s.v && s.v.dispose();
|
|
60
60
|
const d = Math.min(n + e, o), h = s.cumulativeLength + e;
|
|
61
61
|
s.length = d, s.cumulativeLength = h, s.k = c(p), s.v = c(r);
|
|
@@ -63,23 +63,23 @@ class W extends O {
|
|
|
63
63
|
forward(t, i) {
|
|
64
64
|
return C(() => {
|
|
65
65
|
this.startMemory();
|
|
66
|
-
const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, p = this.config.layerConfig.ropeCache, r = p ?
|
|
66
|
+
const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, p = this.config.layerConfig.ropeCache, r = p ? V(s, p, n) : s, d = p ? V(o, p, n) : o;
|
|
67
67
|
p && (s.dispose(), o.dispose());
|
|
68
68
|
const h = t.pastKV ? t.pastKV.length : 0;
|
|
69
69
|
t.pastKV && !t.training && this.updateCache(d, e, t.pastKV);
|
|
70
70
|
const u = t.pastKV?.k ? t.pastKV.k : d, l = t.pastKV?.v ? t.pastKV.v : e;
|
|
71
71
|
let a;
|
|
72
72
|
h > 0 ? a = this.getAttentionScoresWithPast(r, u, h) : a = this.getAttentionScores(r, u, t.training, t.seed || 0), r.dispose(), t.pastKV || u.dispose();
|
|
73
|
-
const m = R(a, l),
|
|
74
|
-
|
|
75
|
-
const
|
|
76
|
-
if (m.dispose(),
|
|
77
|
-
const
|
|
78
|
-
t.attentionScores.attentionOut
|
|
79
|
-
a.slice([0,
|
|
73
|
+
const m = R(a, l), f = t.attentionScores !== void 0 && t.attentionScores.attentionOut !== void 0;
|
|
74
|
+
f || a.dispose(), t.pastKV || l.dispose();
|
|
75
|
+
const A = this.getOutputProjection(m);
|
|
76
|
+
if (m.dispose(), f && t.attentionScores && t.attentionScores.attentionOut !== void 0) {
|
|
77
|
+
const K = a.shape[1], S = a.shape[2];
|
|
78
|
+
t.attentionScores.attentionOut?.push(
|
|
79
|
+
c(a.slice([0, 0, 0, 0], [1, -1, -1, -1]).reshape([K, S, -1]))
|
|
80
80
|
);
|
|
81
81
|
}
|
|
82
|
-
return this.endMemory("CausalSelfAttention"),
|
|
82
|
+
return this.endMemory("CausalSelfAttention"), A;
|
|
83
83
|
});
|
|
84
84
|
}
|
|
85
85
|
dropout(t) {
|
package/dist/utilities/save.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { j as g } from "../jszip.min-CjP2V1VV.js";
|
|
2
2
|
import { exportWeights as l } from "./weights.js";
|
|
3
|
-
import
|
|
4
|
-
const
|
|
3
|
+
import p from "../tokeniser/CharTokeniser.js";
|
|
4
|
+
const b = "1.0.0";
|
|
5
5
|
async function h(t, a, i) {
|
|
6
6
|
const c = i?.includeLog ?? !0, f = /* @__PURE__ */ new Map();
|
|
7
7
|
t.saveWeights(f);
|
|
@@ -14,8 +14,8 @@ async function h(t, a, i) {
|
|
|
14
14
|
"manifest.json",
|
|
15
15
|
JSON.stringify({
|
|
16
16
|
weightSpec: r,
|
|
17
|
-
config: t.config,
|
|
18
|
-
version:
|
|
17
|
+
config: t.config.gpt,
|
|
18
|
+
version: b,
|
|
19
19
|
application: "@genai-fi/nanogpt",
|
|
20
20
|
meta: i?.metadata,
|
|
21
21
|
name: i?.name
|
|
@@ -26,7 +26,7 @@ async function h(t, a, i) {
|
|
|
26
26
|
), e.file(
|
|
27
27
|
"tokeniser.json",
|
|
28
28
|
JSON.stringify({
|
|
29
|
-
type: a instanceof
|
|
29
|
+
type: a instanceof p ? "char" : "bpe",
|
|
30
30
|
vocab: a.getVocab(),
|
|
31
31
|
merges: await a.getMerges()
|
|
32
32
|
}),
|