@genai-fi/nanogpt 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/Generator.js CHANGED
@@ -14,17 +14,17 @@ class w extends u {
  async generateNoCache(i, t) {
  let s = await this.tokenisePrompt(i), o = i || "";
  const n = t?.maxLength ?? 1e3;
- for (let a = 0; a < n && this.active; a++) {
+ for (let r = 0; r < n && this.active; r++) {
  const {
  output: e,
- attention: c,
- probabilities: l
+ attention: a,
+ probabilities: c
  } = this.model.generate(s, void 0, t), h = s;
  s = p([s, e], 1), h.dispose();
- const r = await this.processResponse(e, c, l);
- if (e.dispose(), r === null)
+ const l = await this.processResponse(e, a, c);
+ if (e.dispose(), l === null)
  break;
- o += r;
+ o += l;
  }
  return s.dispose(), o;
  }
@@ -33,31 +33,31 @@ class w extends u {
  if (o === this.tokeniser.eosToken)
  return null;
  const n = await this.tokeniser.decode([o]);
- let a;
- t && (a = await t.array(), t.dispose());
+ let r;
+ t && (r = await Promise.all(t.map((a) => a.array().then((c) => c))), t.forEach((a) => a.dispose()));
  let e;
- return s && (e = await s.array(), s.dispose()), this.emit("tokens", [o], n, a, e), n;
+ return s && (e = await s.array(), s.dispose()), this.emit("tokens", [o], n, r, e), n;
  }
  async generateCache(i, t) {
  let s = await this.tokenisePrompt(i), o = i || "";
  const n = new Array(this.model.config.gpt.nLayer);
  for (let e = 0; e < this.model.config.gpt.nLayer; e++)
  n[e] = { k: void 0, v: void 0, length: 0, cumulativeLength: 0 };
- const a = t?.maxLength ?? 1e3;
- for (let e = 0; e < a && this.active; e++) {
+ const r = t?.maxLength ?? 1e3;
+ for (let e = 0; e < r && this.active; e++) {
  const {
- output: c,
- attention: l,
- probabilities: h
+ output: a,
+ probabilities: c,
+ attention: h
  } = this.model.generate(s, n, {
  ...t,
  usePadding: !1
  });
- s.dispose(), s = c;
- const r = await this.processResponse(c, l, h);
- if (r === null)
+ s.dispose(), s = a;
+ const l = await this.processResponse(a, h, c);
+ if (l === null)
  break;
- o += r;
+ o += l;
  }
  return n.forEach((e) => {
  e && (e.k && e.k.dispose(), e.v && e.v.dispose());
@@ -14,7 +14,7 @@ export interface GenerateOptions {
  temperature?: number;
  topK?: number;
  usePadding?: boolean;
- attentionScores?: AttentionScores;
+ attentionScores?: boolean;
  includeProbabilities?: boolean;
  }
  export interface ModelForwardAttributes extends ForwardAttributes {
@@ -41,8 +41,8 @@ export default class NanoGPT extends BaseLayer<ModelForwardAttributes> {
  forward(attrs: ModelForwardAttributes, idx: Tensor, targets?: Tensor): Tensor[];
  generate(idx: Tensor, cache?: KVCache[], options?: GenerateOptions): {
  output: Tensor;
- attention?: Tensor;
  probabilities?: Tensor;
+ attention?: Tensor[];
  };
  getNumParams(): number;
  dispose(): void;
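
Note: taken together, the two declaration changes above alter the public generate() contract: attentionScores becomes a simple boolean opt-in, and attention comes back as an array of per-block tensors rather than a single tensor. A minimal usage sketch under that reading (the variable names model and idx are illustrative, not from the package docs):

// Sketch only: assumes a constructed NanoGPT instance `model` and an input Tensor `idx`.
const { output, probabilities, attention } = model.generate(idx, undefined, {
  temperature: 0.8,
  topK: 40,
  attentionScores: true,    // 0.5.0 took an AttentionScores object; 0.5.1 takes a boolean
  includeProbabilities: true,
});
// attention is now Tensor[] (one entry per transformer block); the caller disposes them,
// mirroring the t.forEach((a) => a.dispose()) call in dist/Generator.js above.
attention?.forEach((t) => t.dispose());
output.dispose();
probabilities?.dispose();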
@@ -1,16 +1,16 @@
  import { defaultConfig as L } from "./config.js";
- import q from "./layers/TransformerBlock.js";
- import { E as O, D as T, T as K, r as P, p as _ } from "./TiedEmbedding-DsDRvLB0.js";
+ import v from "./layers/TransformerBlock.js";
+ import { E as T, D as q, T as K, r as P, p as _ } from "./TiedEmbedding-DsDRvLB0.js";
  import F from "./layers/RoPECache.js";
  import D from "./layers/RMSNorm.js";
- import { estimateParameterCount as N } from "./utilities/parameters.js";
- import { createSoftmaxCrossEntropyWithGrad as R } from "./training/sparseCrossEntropy.js";
- import { B } from "./BaseLayer-BhrMN8JO.js";
- import { o as k, i as m, q as G, E as w, aa as A, ab as V, ac as j, t as b, a9 as W, f as y, F as H } from "./index-iNhkcAEQ.js";
- import { r as $ } from "./reshape-DxTPgnwL.js";
- import { r as J } from "./range-BsFU-SNG.js";
- import { g as Q } from "./gather-Bxe1Qip8.js";
- import { s as U } from "./softmax-BjsptB07.js";
+ import { estimateParameterCount as O } from "./utilities/parameters.js";
+ import { createSoftmaxCrossEntropyWithGrad as N } from "./training/sparseCrossEntropy.js";
+ import { B as R } from "./BaseLayer-BhrMN8JO.js";
+ import { o as E, i as d, q as B, E as y, aa as G, ab as V, ac as j, t as w, a9 as A, f as z, F as W } from "./index-iNhkcAEQ.js";
+ import { r as C } from "./reshape-DxTPgnwL.js";
+ import { r as H } from "./range-BsFU-SNG.js";
+ import { g as J } from "./gather-Bxe1Qip8.js";
+ import { s as Q } from "./softmax-BjsptB07.js";
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -27,13 +27,13 @@ import { s as U } from "./softmax-BjsptB07.js";
  * limitations under the License.
  * =============================================================================
  */
- function X(h, t) {
- let e = m(h, "a", "mod"), o = m(t, "b", "mod");
- [e, o] = G(e, o);
+ function U(h, t) {
+ let e = d(h, "a", "mod"), o = d(t, "b", "mod");
+ [e, o] = B(e, o);
  const n = { a: e, b: o };
- return w.runKernel(A, n);
+ return y.runKernel(G, n);
  }
- const Y = /* @__PURE__ */ k({ mod_: X });
+ const X = /* @__PURE__ */ E({ mod_: U });
  /**
  * @license
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -50,17 +50,17 @@ const Y = /* @__PURE__ */ k({ mod_: X });
  * limitations under the License.
  * =============================================================================
  */
- function Z(h, t, e, o = !1) {
- const n = m(h, "logits", "multinomial"), s = n.size, i = n.rank;
+ function Y(h, t, e, o = !1) {
+ const n = d(h, "logits", "multinomial"), s = n.size, i = n.rank;
  if (s < 2)
  throw new Error(`Error in multinomial: you need at least 2 outcomes, but got ${s}.`);
  if (i > 2)
  throw new Error(`Rank of probabilities must be 1 or 2, but is ${i}`);
  e = e || Math.random();
- const a = { logits: i === 1 ? $(n, [1, -1]) : n }, c = { numSamples: t, seed: e, normalized: o }, l = w.runKernel(V, a, c);
- return i === 1 ? $(l, [l.size]) : l;
+ const c = { logits: i === 1 ? C(n, [1, -1]) : n }, l = { numSamples: t, seed: e, normalized: o }, a = y.runKernel(V, c, l);
+ return i === 1 ? C(a, [a.size]) : a;
  }
- const z = /* @__PURE__ */ k({ multinomial_: Z });
+ const I = /* @__PURE__ */ E({ multinomial_: Y });
  /**
  * @license
  * Copyright 2018 Google LLC. All Rights Reserved.
@@ -77,8 +77,8 @@ const z = /* @__PURE__ */ k({ multinomial_: Z });
  * limitations under the License.
  * =============================================================================
  */
- function tt(h, t = 1, e = !0) {
- const o = m(h, "x", "topk");
+ function Z(h, t = 1, e = !0) {
+ const o = d(h, "x", "topk");
  if (o.rank === 0)
  throw new Error("topk() expects the input to be of rank 1 or higher");
  const n = o.shape[o.shape.length - 1];
@@ -86,10 +86,10 @@ function tt(h, t = 1, e = !0) {
  throw new Error(`'k' passed to topk() must be >= 0 but got ${t}`);
  if (t > n)
  throw new Error(`'k' passed to topk() must be <= the last dimension (${n}) but got ${t}`);
- const s = { x: o }, i = { k: t, sorted: e }, [r, a] = w.runKernel(j, s, i);
- return { values: r, indices: a };
+ const s = { x: o }, i = { k: t, sorted: e }, [r, c] = y.runKernel(j, s, i);
+ return { values: r, indices: c };
  }
- const et = /* @__PURE__ */ k({ topk_: tt });
+ const tt = /* @__PURE__ */ E({ topk_: Z });
  /**
  * @license
  * Copyright 2018 Google LLC
@@ -99,13 +99,13 @@ const et = /* @__PURE__ */ k({ topk_: tt });
  * https://opensource.org/licenses/MIT.
  * =============================================================================
  */
+ function et(h) {
+ return new q(h);
+ }
  function ot(h) {
  return new T(h);
  }
- function st(h) {
- return new O(h);
- }
- class bt extends B {
+ class dt extends R {
  wte;
  // Token embeddings
  wpe;
@@ -119,14 +119,14 @@ class bt extends B {
  log = [];
  // Training log
  constructor(t = {}) {
- super({ gpt: { ...L, ...t }, layerConfig: {} }), this.wte = new K(this.config, "token_embedding", this), this.config.gpt.useRope === !1 ? this.wpe = st({
+ super({ gpt: { ...L, ...t }, layerConfig: {} }), this.wte = new K(this.config, "token_embedding", this), this.config.gpt.useRope === !1 ? this.wpe = ot({
  inputDim: this.config.gpt.blockSize,
  outputDim: this.config.gpt.nEmbed,
  name: "positional_embedding",
  embeddingsInitializer: P({ mean: 0, stddev: 0.02 })
- }) : (this.ropeCache = new F(this.config.gpt), this.config.layerConfig.ropeCache = this.ropeCache), this.drop = ot({ rate: this.config.gpt.dropout }), this.blocks = [];
+ }) : (this.ropeCache = new F(this.config.gpt), this.config.layerConfig.ropeCache = this.ropeCache), this.drop = et({ rate: this.config.gpt.dropout }), this.blocks = [];
  for (let e = 0; e < this.config.gpt.nLayer; e++)
- this.blocks.push(new q(e, this.config, this));
+ this.blocks.push(new v(e, this.config, this));
  this.lnF = new D(this.config, "final_rms_norm", this);
  }
  get checkpointing() {
@@ -136,11 +136,11 @@ class bt extends B {
  this.config.layerConfig.checkpointing = t;
  }
  inputPhase(t, e, o = !1) {
- return b(() => {
+ return w(() => {
  const n = this.wte.embed(t);
  if (this.config.gpt.useRope === !1) {
- const [, s] = t.shape, i = this.config.gpt.blockSize, r = J(0, s, 1, "int32"), a = Y(W(r, y(e, "int32")), y(i, "int32")), c = this.wpe.apply(a), l = n.add(c);
- return this.drop.apply(l, { training: o });
+ const [, s] = t.shape, i = this.config.gpt.blockSize, r = H(0, s, 1, "int32"), c = X(A(r, z(e, "int32")), z(i, "int32")), l = this.wpe.apply(c), a = n.add(l);
+ return this.drop.apply(a, { training: o });
  } else
  return this.drop.apply(n, { training: o });
  });
@@ -167,7 +167,7 @@ class bt extends B {
  }
  calculateLoss(t, e) {
  try {
- return R()(t, e).mean();
+ return N()(t, e).mean();
  } catch (o) {
  throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
  }
@@ -205,7 +205,7 @@ class bt extends B {
  });
  }*/
  forward(t, e, o) {
- return this.validateInput(e), b(() => {
+ return this.validateInput(e), w(() => {
  this.startMemory();
  const n = t.cache?.[0]?.length ?? 0;
  let s = this.inputPhase(e, n, t.training);
@@ -213,59 +213,61 @@ class bt extends B {
  throw console.error("Cache", t.cache), new Error(
  `Cache length ${t.cache.length} does not match number of blocks ${this.blocks.length}`
  );
- let i;
  for (let c = 0; c < this.blocks.length; c++) {
- const l = this.blocks[c], f = Math.random() * 1e9, p = {
+ const l = this.blocks[c], a = Math.random() * 1e9, u = {
  training: t.training,
- seed: f,
+ seed: a,
  attentionScores: t.attentionScores,
  pastKV: t.cache ? t.cache[c] : void 0
- }, u = this.config.layerConfig.checkpointing && t.training ? l.callCheckpoint(p, s) : l.call(p, s);
- s.dispose(), s = u, p.attentionScores?.attentionOut && (i = p.attentionScores.attentionOut);
+ }, p = this.config.layerConfig.checkpointing && t.training ? l.callCheckpoint(u, s) : l.call(u, s);
+ s.dispose(), s = p;
  }
  s = this.lnF.call(t, s);
- const r = this.wte.project(s);
+ const i = this.wte.project(s);
  s.dispose();
- let a;
- return o && (a = this.calculateLoss(r, o)), this.endMemory("Forward"), t.attentionScores && (t.attentionScores.attentionOut = i ? H(i) : void 0), a ? [r, a] : [r];
+ let r;
+ return o && (r = this.calculateLoss(i, o)), this.endMemory("Forward"), r ? [i, r] : [i];
  });
  }
  generate(t, e, o) {
  const n = o?.temperature ?? 1, s = o?.topK, i = o?.usePadding ?? !1;
- return b(() => {
- const r = t, a = r.shape[1], c = a <= this.config.gpt.blockSize ? r : r.slice(
- [0, a - this.config.gpt.blockSize],
+ return w(() => {
+ const r = t, c = r.shape[1], l = c <= this.config.gpt.blockSize ? r : r.slice(
+ [0, c - this.config.gpt.blockSize],
  [r.shape[0], this.config.gpt.blockSize]
- ), l = i ? this.config.gpt.blockSize - c.shape[1] : 0, f = l > 0 ? _(c, [
+ ), a = i ? this.config.gpt.blockSize - l.shape[1] : 0, u = a > 0 ? _(l, [
  [0, 0],
- [0, l]
- ]) : c, p = {
+ [0, a]
+ ]) : l, p = {
  training: !1,
- attentionScores: o?.attentionScores,
+ attentionScores: o?.attentionScores ? {
+ attentionOut: []
+ } : void 0,
  cache: e
- }, [u] = this.forward(p, f), E = u.shape[1] - 1 - l, C = u.slice([0, E, 0], [u.shape[0], 1, u.shape[2]]), I = p.attentionScores?.attentionOut ? p.attentionScores.attentionOut.slice(
- [0, E, 0],
- [p.attentionScores.attentionOut.shape[0], 1, p.attentionScores.attentionOut.shape[2]]
- ) : void 0;
- u.dispose();
- const d = C.div(n);
- let g;
+ }, [f] = this.forward(p, u), S = f.shape[1] - 1 - a, M = f.slice([0, S, 0], [f.shape[0], 1, f.shape[2]]);
+ p.attentionScores?.attentionOut && p.attentionScores.attentionOut.forEach((g, k) => {
+ g.shape[1] !== 1 && (p.attentionScores.attentionOut[k] = W(
+ g.slice([0, S, 0], [g.shape[0], 1, g.shape[2]])
+ ), g.dispose());
+ }), f.dispose();
+ const b = M.div(n);
+ let m;
  if (s) {
- const { values: v, indices: M } = et(d, s), x = z(v.squeeze([1]), 1);
- g = Q(M.squeeze([1]), x, 1);
+ const { values: g, indices: k } = tt(b, s), x = I(g.squeeze([1]), 1);
+ m = J(k.squeeze([1]), x, 1);
  } else
- g = z(d.squeeze([1]), 1);
- let S;
- return o?.includeProbabilities && (S = U(d.squeeze([1]))), g = g.reshape([1, 1]), { output: g, attention: I?.squeeze([1]), probabilities: S };
+ m = I(b.squeeze([1]), 1);
+ let $;
+ return o?.includeProbabilities && ($ = Q(b.squeeze([1]))), m = m.reshape([1, 1]), { output: m, probabilities: $, attention: p.attentionScores?.attentionOut };
  });
  }
  getNumParams() {
- return N(this.config.gpt);
+ return O(this.config.gpt);
  }
  dispose() {
  this.wte.dispose(), this.wpe && this.wpe.dispose(), this.drop.dispose(), this.blocks.forEach((t) => t.dispose()), this.lnF.dispose();
  }
  }
  export {
- bt as default
+ dt as default
  };
@@ -7,9 +7,8 @@ export type KVCache = {
  cumulativeLength: number;
  };
  export interface AttentionScores {
- head: number;
- block: number;
- attentionOut?: Tensor;
+ meanOfHeads?: boolean;
+ attentionOut?: Tensor[];
  }
  interface AttentionForwardAttributes extends ForwardAttributes {
  attentionScores?: AttentionScores;
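
The internal AttentionScores contract changes shape here: 0.5.0 asked for a single head of a single block up front (head/block indices), while 0.5.1 seeds an empty attentionOut array and lets every attention layer append its own scores during the forward pass. A condensed sketch of that flow, with the minified identifiers from the sources above renamed for readability (the readable names are assumptions):

// In NanoGPT.generate(): request scores from all blocks by seeding an empty array.
const attrs = {
  training: false,
  attentionScores: options?.attentionScores ? { attentionOut: [] } : undefined,
  cache,
};
// In each CausalSelfAttention.forward(): push this block's scores when requested.
if (attrs.attentionScores?.attentionOut !== undefined) {
  attrs.attentionScores.attentionOut.push(blockAttentionTensor); // one Tensor per block
}
// Back in generate(): each collected tensor is then sliced down to the last-token row
// (the forEach over attentionOut in the model's generate() above) before being returned.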
@@ -1,10 +1,10 @@
- import { attentionMask as f } from "../ops/attentionMask.js";
- import { B as O, v as V } from "../BaseLayer-BhrMN8JO.js";
+ import { attentionMask as g } from "../ops/attentionMask.js";
+ import { B as O, v } from "../BaseLayer-BhrMN8JO.js";
  import { qkv as P } from "../ops/qkv.js";
- import { rope as b } from "../ops/rope.js";
- import { appendCache as v } from "../ops/appendCache.js";
+ import { rope as V } from "../ops/rope.js";
+ import { appendCache as T } from "../ops/appendCache.js";
  import { F as c, t as C } from "../index-iNhkcAEQ.js";
- import { fusedSoftmax as T } from "../ops/fusedSoftmax.js";
+ import { fusedSoftmax as b } from "../ops/fusedSoftmax.js";
  import { d as y } from "../tfjs_backend-NucKez4s.js";
  import { r as k, d as L } from "../dropout-kbDY39Ci.js";
  import { r as N } from "../reshape-DxTPgnwL.js";
@@ -22,14 +22,14 @@ class W extends O {
  build() {
  this.hasVariable(this.ATTN) === !1 && this.setVariable(
  this.ATTN,
- V(
+ v(
  k([this.config.gpt.nEmbed, this.units], 0, 0.02),
  !0
  //`block_${this.index}_attn_cAttn_kernel`
  )
  ), this.hasVariable(this.PROJ) === !1 && this.setVariable(
  this.PROJ,
- V(
+ v(
  k([this.projUnits, this.config.gpt.nEmbed], 0, 0.02),
  !0
  //`block_${this.index}_attn_cProj_kernel`
@@ -37,12 +37,12 @@
  );
  }
  getAttentionScores(t, i, s, o) {
- const e = f(t, i, this.divisor), n = T(e, s ? this.config.gpt.dropout : 0, o);
+ const e = g(t, i, this.divisor), n = b(e, s ? this.config.gpt.dropout : 0, o);
  return e.dispose(), n;
  }
  // Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
  getAttentionScoresWithPast(t, i, s) {
- const o = f(t, i, this.divisor, s), e = T(o, 0, 0);
+ const o = g(t, i, this.divisor, s), e = b(o, 0, 0);
  return o.dispose(), e;
  }
  getQKV(t) {
@@ -53,9 +53,9 @@
  return n.dispose(), e.dispose(), p;
  }
  updateCache(t, i, s) {
- const o = this.config.gpt.blockSize, e = t.shape[2], n = s.length || 0, p = v(t, o, n, s.k);
+ const o = this.config.gpt.blockSize, e = t.shape[2], n = s.length || 0, p = T(t, o, n, s.k);
  t.dispose(), s.k && s.k.dispose();
- const r = v(i, o, n, s.v);
+ const r = T(i, o, n, s.v);
  i.dispose(), s.v && s.v.dispose();
  const d = Math.min(n + e, o), h = s.cumulativeLength + e;
  s.length = d, s.cumulativeLength = h, s.k = c(p), s.v = c(r);
@@ -63,23 +63,23 @@
  forward(t, i) {
  return C(() => {
  this.startMemory();
- const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, p = this.config.layerConfig.ropeCache, r = p ? b(s, p, n) : s, d = p ? b(o, p, n) : o;
+ const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, p = this.config.layerConfig.ropeCache, r = p ? V(s, p, n) : s, d = p ? V(o, p, n) : o;
  p && (s.dispose(), o.dispose());
  const h = t.pastKV ? t.pastKV.length : 0;
  t.pastKV && !t.training && this.updateCache(d, e, t.pastKV);
  const u = t.pastKV?.k ? t.pastKV.k : d, l = t.pastKV?.v ? t.pastKV.v : e;
  let a;
  h > 0 ? a = this.getAttentionScoresWithPast(r, u, h) : a = this.getAttentionScores(r, u, t.training, t.seed || 0), r.dispose(), t.pastKV || u.dispose();
- const m = R(a, l), g = t.attentionScores !== void 0 && t.attentionScores.block === this.index;
- g || a.dispose(), t.pastKV || l.dispose();
- const S = this.getOutputProjection(m);
- if (m.dispose(), g && t.attentionScores && t.attentionScores.head >= 0 && t.attentionScores.head < this.config.gpt.nHead) {
- const A = a.shape[0], K = a.shape[2];
- t.attentionScores.attentionOut = c(
- a.slice([0, t.attentionScores.head, 0, 0], [-1, 1, -1, -1]).reshape([A, K, -1])
+ const m = R(a, l), f = t.attentionScores !== void 0 && t.attentionScores.attentionOut !== void 0;
+ f || a.dispose(), t.pastKV || l.dispose();
+ const A = this.getOutputProjection(m);
+ if (m.dispose(), f && t.attentionScores && t.attentionScores.attentionOut !== void 0) {
+ const K = a.shape[1], S = a.shape[2];
+ t.attentionScores.attentionOut?.push(
+ c(a.slice([0, 0, 0, 0], [1, -1, -1, -1]).reshape([K, S, -1]))
  );
  }
- return this.endMemory("CausalSelfAttention"), S;
+ return this.endMemory("CausalSelfAttention"), A;
  });
  }
  dropout(t) {
@@ -1,7 +1,7 @@
  import { j as g } from "../jszip.min-CjP2V1VV.js";
  import { exportWeights as l } from "./weights.js";
- import b from "../tokeniser/CharTokeniser.js";
- const p = "1.0.0";
+ import p from "../tokeniser/CharTokeniser.js";
+ const b = "1.0.0";
  async function h(t, a, i) {
  const c = i?.includeLog ?? !0, f = /* @__PURE__ */ new Map();
  t.saveWeights(f);
@@ -14,8 +14,8 @@ async function h(t, a, i) {
  "manifest.json",
  JSON.stringify({
  weightSpec: r,
- config: t.config,
- version: p,
+ config: t.config.gpt,
+ version: b,
  application: "@genai-fi/nanogpt",
  meta: i?.metadata,
  name: i?.name
@@ -26,7 +26,7 @@ async function h(t, a, i) {
  ), e.file(
  "tokeniser.json",
  JSON.stringify({
- type: a instanceof b ? "char" : "bpe",
+ type: a instanceof p ? "char" : "bpe",
  vocab: a.getVocab(),
  merges: await a.getMerges()
  }),
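
A behavioral change hides in this serialization hunk: manifest.json now stores only the gpt section of the model config rather than the whole config object (which also carries layerConfig and its runtime ropeCache reference, as seen in the NanoGPT constructor above). A sketch of the resulting manifest payload, assuming only the fields visible in this diff (weightSpec/metadata values come from the caller):

// Sketch of the 0.5.1 manifest.json contents.
JSON.stringify({
  weightSpec,                       // produced by exportWeights
  config: model.config.gpt,         // 0.5.0 wrote model.config; 0.5.1 writes only its gpt section
  version: "1.0.0",                 // manifest format constant, unchanged by this release
  application: "@genai-fi/nanogpt",
  meta: options?.metadata,
  name: options?.name,
});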
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@genai-fi/nanogpt",
- "version": "0.5.0",
+ "version": "0.5.1",
  "type": "module",
  "main": "dist/main.js",
  "types": "dist/main.d.ts",