@genai-fi/nanogpt 0.17.3 → 0.17.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/Generator.js CHANGED
@@ -11841,7 +11841,7 @@ class AS extends Ui {
11841
11841
  this.outputConversation[this.outputConversation.length - 1]._completed = !0;
11842
11842
  break;
11843
11843
  }
11844
- o === s - 1 && (this.outputConversation[this.outputConversation.length - 1]._completed = !0), this.outputConversation[this.outputConversation.length - 1].content += p;
11844
+ o === s - 1 && s > 1 && (this.outputConversation[this.outputConversation.length - 1]._completed = !0), this.outputConversation[this.outputConversation.length - 1].content += p;
11845
11845
  }
11846
11846
  return n.dispose(), this.outputConversation;
11847
11847
  }
@@ -1,6 +1,7 @@
1
- import k, { SPECIALS as d } from "./BaseTokeniser.js";
2
- const u = ["<eos>", "<unk>"];
3
- class T extends k {
1
+ import { yieldIfNeeded as d } from "../utilities/yielder.js";
2
+ import f, { SPECIALS as u } from "./BaseTokeniser.js";
3
+ const b = ["<eos>", "<unk>"];
4
+ class p extends f {
4
5
  vocabSize = 0;
5
6
  eosToken = 0;
6
7
  bosToken = 0;
@@ -8,30 +9,30 @@ class T extends k {
8
9
  vocab = [];
9
10
  cache = /* @__PURE__ */ new Map();
10
11
  _trained = !1;
11
- constructor(i) {
12
- if (super(), Array.isArray(i)) {
13
- if (this.vocab = i, this.vocab.length > 0)
14
- this.vocabSize = this.vocab.length, d.forEach((t) => {
15
- const e = this.vocab.indexOf(t);
16
- e !== -1 && this.addSpecialToken(t, e);
17
- }), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex("") ?? -1, this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<unk>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<pad>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("_")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf(" ")), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab = this.vocab.map((t) => t === "<pad>" ? "" : t), this.vocab.forEach((t, e) => {
18
- this.cache.set(t, e);
12
+ constructor(t) {
13
+ if (super(), Array.isArray(t)) {
14
+ if (this.vocab = t, this.vocab.length > 0)
15
+ this.vocabSize = this.vocab.length, u.forEach((i) => {
16
+ const e = this.vocab.indexOf(i);
17
+ e !== -1 && this.addSpecialToken(i, e);
18
+ }), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex("") ?? -1, this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<unk>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<pad>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("_")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf(" ")), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab = this.vocab.map((i) => i === "<pad>" ? "" : i), this.vocab.forEach((i, e) => {
19
+ this.cache.set(i, e);
19
20
  });
20
21
  else
21
22
  throw new Error("Vocab cannot be empty");
22
23
  this._trained = !0;
23
24
  } else
24
- this.vocabSize = i, this.vocab = new Array(this.vocabSize).fill(""), this.addSpecialTokens(), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex(""), this.vocab.forEach((t, e) => {
25
- this.cache.set(t, e);
25
+ this.vocabSize = t, this.vocab = new Array(this.vocabSize).fill(""), this.addSpecialTokens(), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex(""), this.vocab.forEach((i, e) => {
26
+ this.cache.set(i, e);
26
27
  }), this.cache.set("", this.unkToken);
27
28
  }
28
- addToken(i, t) {
29
- if (this.cache.has(i))
30
- return this.cache.get(i);
29
+ addToken(t, i) {
30
+ if (this.cache.has(t))
31
+ return this.cache.get(t);
31
32
  let e;
32
- if (t !== void 0 ? e = t : (e = this.vocab.indexOf("", this.unkToken + 1), e === -1 && (e = this.vocabSize)), e >= this.vocabSize)
33
+ if (i !== void 0 ? e = i : (e = this.vocab.indexOf("", this.unkToken + 1), e === -1 && (e = this.vocabSize)), e >= this.vocabSize)
33
34
  throw new Error("Vocab size exceeded");
34
- return this.vocab[e] = i, this.cache.set(i, e), e;
35
+ return this.vocab[e] = t, this.cache.set(t, e), e;
35
36
  }
36
37
  get trained() {
37
38
  return this.vocab.length === this.vocabSize && this._trained;
@@ -39,43 +40,53 @@ class T extends k {
39
40
  destroy() {
40
41
  this.cache.clear(), this.vocab = [];
41
42
  }
42
- async train(i) {
43
- const t = i.map((o) => o.map((n) => n.content.split(""))).flat(2), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - u.length;
43
+ async train(t) {
44
+ const i = /* @__PURE__ */ new Set();
45
+ let e = performance.now();
46
+ for (const n of t)
47
+ n.forEach((o) => {
48
+ for (const r of o.content)
49
+ i.add(r);
50
+ }), e = await d(e);
51
+ const s = Array.from(i), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - b.length;
44
52
  if (h === -1)
45
53
  return this.vocabSize;
46
54
  if (this._trained = !0, s.length > a) {
47
- const o = /* @__PURE__ */ new Map();
48
- t.forEach((n) => {
49
- o.set(n, (o.get(n) || 0) + 1);
50
- }), s.sort((n, r) => (o.get(n) || 0) - (o.get(r) || 0)), s.splice(0, s.length - a);
55
+ const n = /* @__PURE__ */ new Map();
56
+ t.forEach((o) => {
57
+ o.forEach((r) => {
58
+ for (const k of r.content)
59
+ n.set(k, (n.get(k) || 0) + 1);
60
+ });
61
+ }), s.sort((o, r) => (n.get(o) || 0) - (n.get(r) || 0)), s.splice(0, s.length - a);
51
62
  }
52
63
  let c = h;
53
64
  if (c !== -1) {
54
- const o = new Set(this.vocab);
55
- for (const n of s)
56
- if (!o.has(n) && (this.vocab[c] = n, o.add(n), c = this.vocab.indexOf("", c + 1), c === -1))
65
+ const n = new Set(this.vocab);
66
+ for (const o of s)
67
+ if (!n.has(o) && (this.vocab[c] = o, n.add(o), c = this.vocab.indexOf("", c + 1), c === -1))
57
68
  break;
58
69
  }
59
- return this.cache.clear(), this.vocab.forEach((o, n) => {
60
- this.cache.set(o, n);
70
+ return this.cache.clear(), this.vocab.forEach((n, o) => {
71
+ this.cache.set(n, o);
61
72
  }), this.emit("trainStatus", "trained"), this.vocabSize;
62
73
  }
63
- tokenise(i, t) {
74
+ tokenise(t, i) {
64
75
  if (!this.trained)
65
76
  throw new Error("Tokeniser not trained");
66
- return i.map((s) => t ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
77
+ return t.map((s) => i ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
67
78
  const a = this.cache.get(h);
68
79
  return a !== void 0 ? this.vocab[a] : "";
69
80
  }));
70
81
  }
71
- detokenise(i) {
72
- return i.map((e) => Array.from(e).map((s) => this.vocab[s] || "").join(""));
82
+ detokenise(t) {
83
+ return t.map((e) => Array.from(e).map((s) => this.vocab[s] || "").join(""));
73
84
  }
74
- encode(i) {
75
- return this.tokenise([i], !0)[0];
85
+ encode(t) {
86
+ return this.tokenise([t], !0)[0];
76
87
  }
77
- decode(i) {
78
- return this.detokenise([i])[0];
88
+ decode(t) {
89
+ return this.detokenise([t])[0];
79
90
  }
80
91
  getVocab() {
81
92
  return this.vocab;
@@ -83,13 +94,13 @@ class T extends k {
83
94
  getMerges() {
84
95
  return [];
85
96
  }
86
- async createTrainingData(i, t = 5) {
87
- const e = await this.tokenise(i, !0), s = [], h = [];
88
- for (let a = 0; a < e.length - t; a++)
89
- s.push(...e[a].slice(0, t)), h.push(e[a + 1][0]);
97
+ async createTrainingData(t, i = 5) {
98
+ const e = await this.tokenise(t, !0), s = [], h = [];
99
+ for (let a = 0; a < e.length - i; a++)
100
+ s.push(...e[a].slice(0, i)), h.push(e[a + 1][0]);
90
101
  return [s, h];
91
102
  }
92
103
  }
93
104
  export {
94
- T as default
105
+ p as default
95
106
  };
@@ -1,15 +1,15 @@
1
- import { yieldIfNeeded as f } from "../utilities/yielder.js";
1
+ import { yieldIfNeeded as p } from "../utilities/yielder.js";
2
2
  import m from "../utilities/tokenParse.js";
3
- import z, { SPECIALS as k } from "./BaseTokeniser.js";
4
- function p(o, e) {
3
+ import w, { SPECIALS as S } from "./BaseTokeniser.js";
4
+ function g(o, e) {
5
5
  return `${o}-::-${e}`;
6
6
  }
7
- function w(o) {
7
+ function T(o) {
8
8
  const e = /* @__PURE__ */ new Map();
9
9
  for (let s = 0; s < o.length; s++) {
10
10
  const t = o[s];
11
11
  for (let n = 0; n < t.length - 1; n++) {
12
- const r = p(t[n], t[n + 1]), a = e.get(r) || {
12
+ const r = g(t[n], t[n + 1]), a = e.get(r) || {
13
13
  a: t[n],
14
14
  b: t[n + 1],
15
15
  count: 0,
@@ -20,21 +20,21 @@ function w(o) {
20
20
  }
21
21
  return { pairs: e, tokens: o };
22
22
  }
23
- function d(o, e, s, t, n) {
24
- const r = p(e, s);
23
+ function f(o, e, s, t, n) {
24
+ const r = g(e, s);
25
25
  if (o.pairs.has(r)) {
26
26
  const a = o.pairs.get(r);
27
27
  a.count += n, n > 0 ? a.instances.add(t) : a.count <= 0 ? o.pairs.delete(r) : a.instances.delete(t);
28
28
  } else
29
29
  o.pairs.set(r, { a: e, b: s, count: n, instances: /* @__PURE__ */ new Set([t]) });
30
30
  }
31
- function T(o) {
31
+ function y(o) {
32
32
  let e = null, s = 0;
33
33
  for (const t of o.pairs.values())
34
34
  t.count > s && (s = t.count, e = t);
35
35
  return e;
36
36
  }
37
- function y(o, e) {
37
+ function I(o, e) {
38
38
  return o.map((s) => {
39
39
  const t = [];
40
40
  for (let n = 0; n < s.length; n++)
@@ -42,19 +42,19 @@ function y(o, e) {
42
42
  return t;
43
43
  });
44
44
  }
45
- function I(o, e) {
45
+ function x(o, e) {
46
46
  e.instances.forEach((s) => {
47
47
  const t = o.tokens[s], n = [];
48
48
  for (let r = 0; r < t.length; r++)
49
49
  if (r < t.length - 1 && t[r] === e.a && t[r + 1] === e.b) {
50
50
  const a = e.a + e.b;
51
- n.push(a), r > 0 && (d(o, t[r - 1], e.a, s, -1), d(o, t[r - 1], a, s, 1)), r++, r < t.length - 1 && (d(o, e.b, t[r + 1], s, -1), d(o, a, t[r + 1], s, 1));
51
+ n.push(a), r > 0 && (f(o, t[r - 1], e.a, s, -1), f(o, t[r - 1], a, s, 1)), r++, r < t.length - 1 && (f(o, e.b, t[r + 1], s, -1), f(o, a, t[r + 1], s, 1));
52
52
  } else
53
53
  n.push(t[r]);
54
54
  o.tokens[s] = n;
55
- }), o.pairs.delete(p(e.a, e.b));
55
+ }), o.pairs.delete(g(e.a, e.b));
56
56
  }
57
- class E extends z {
57
+ class C extends w {
58
58
  targetSize;
59
59
  vocab = /* @__PURE__ */ new Set();
60
60
  vocabIndex = /* @__PURE__ */ new Map();
@@ -63,7 +63,7 @@ class E extends z {
63
63
  constructor(e, s) {
64
64
  super(), Array.isArray(e) ? (e.forEach((t, n) => {
65
65
  this.vocab.add(t), this.vocabIndex.set(t, n);
66
- }), s && (this.merges = s), this.targetSize = e.length, k.forEach((t) => {
66
+ }), s && (this.merges = s), this.targetSize = e.length, S.forEach((t) => {
67
67
  const n = e.indexOf(t);
68
68
  n !== -1 && this.addSpecialToken(t, n);
69
69
  })) : (this.addSpecialTokens(), this.targetSize = e);
@@ -81,7 +81,7 @@ class E extends z {
81
81
  this.vocab.clear(), this.vocabIndex.clear(), this.merges = [], this.pretokenMap.clear();
82
82
  }
83
83
  get trained() {
84
- return this.vocab.size > k.length && this.vocab.size <= this.targetSize;
84
+ return this.vocab.size > S.length && this.vocab.size <= this.targetSize;
85
85
  }
86
86
  get vocabSize() {
87
87
  return this.vocab.size;
@@ -97,39 +97,44 @@ class E extends z {
97
97
  }
98
98
  async train(e = [], s) {
99
99
  let t = performance.now();
100
- const n = e.map((i) => i.map((h) => m(h.content))).flat(2);
101
- t = await f(t, s, this.vocab.size);
102
- const r = new Set(n);
100
+ const n = new Array(e.length);
101
+ for (let i = 0; i < e.length; i++) {
102
+ const h = e[i], l = new Array(h.length);
103
+ for (let d = 0; d < h.length; d++)
104
+ l[d] = m(h[d].content);
105
+ t = await p(t, s, this.vocab.size), n[i] = l;
106
+ }
107
+ const r = n.flat(2), a = new Set(r);
103
108
  this.vocab = /* @__PURE__ */ new Set(), this.pretokenMap.clear(), this.merges = [], this.addSpecialTokens();
104
- const a = Array.from(r), b = a.map((i) => Array.from(i).map((l) => (this.vocab.add(l), l))), g = w(b);
105
- if (t = await f(t, s, this.vocab.size), this.vocab.size >= this.targetSize) {
109
+ const b = Array.from(a), v = b.map((i) => Array.from(i).map((l) => (this.vocab.add(l), l))), k = T(v);
110
+ if (t = await p(t, s, this.vocab.size), this.vocab.size >= this.targetSize) {
106
111
  console.warn("Initial vocab size is greater than or equal to target size. No merges will be performed.");
107
112
  const i = /* @__PURE__ */ new Map();
108
- n.forEach((c) => {
113
+ r.forEach((c) => {
109
114
  Array.from(c).forEach((u) => {
110
115
  i.set(u, (i.get(u) || 0) + 1);
111
116
  });
112
117
  });
113
118
  const h = Array.from(i.entries()).sort((c, u) => u[1] - c[1]);
114
119
  this.vocab = /* @__PURE__ */ new Set(), this.addSpecialTokens(), h.slice(0, this.targetSize - this.vocab.size).map(([c]) => c).forEach((c) => this.vocab.add(c)), this.vocabIndex.clear();
115
- let S = 0;
120
+ let d = 0;
116
121
  for (const c of this.vocab.keys())
117
- this.vocabIndex.set(c, S++);
122
+ this.vocabIndex.set(c, d++);
118
123
  return this.emit("trainStatus", "trained"), this.vocab.size;
119
124
  }
120
125
  for (; this.vocab.size < this.targetSize && this.merges.length < this.targetSize; ) {
121
- const i = T(g);
126
+ const i = y(k);
122
127
  if (!i)
123
128
  break;
124
- this.merges.push([i.a, i.b]), this.vocab.add(i.a + i.b), I(g, i), t = await f(t, s, this.vocab.size);
129
+ this.merges.push([i.a, i.b]), this.vocab.add(i.a + i.b), x(k, i), t = await p(t, s, this.vocab.size);
125
130
  }
126
- a.forEach((i, h) => {
127
- const l = b[h];
131
+ b.forEach((i, h) => {
132
+ const l = v[h];
128
133
  this.pretokenMap.set(i, l);
129
134
  }), this.vocabIndex.clear();
130
- let v = 0;
135
+ let z = 0;
131
136
  for (const i of this.vocab.keys())
132
- this.vocabIndex.set(i, v++);
137
+ this.vocabIndex.set(i, z++);
133
138
  return this.emit("trainStatus", "trained"), this.vocab.size;
134
139
  }
135
140
  getVocab() {
@@ -141,7 +146,7 @@ class E extends z {
141
146
  tokeniseWord(e) {
142
147
  let s = Array.from(e);
143
148
  return this.merges.forEach((t) => {
144
- s = y([s], t)[0];
149
+ s = I([s], t)[0];
145
150
  }), this.pretokenMap.set(e, s), s;
146
151
  }
147
152
  tokeniseStrings(e) {
@@ -163,5 +168,5 @@ class E extends z {
163
168
  }
164
169
  }
165
170
  export {
166
- E as default
171
+ C as default
167
172
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@genai-fi/nanogpt",
3
- "version": "0.17.3",
3
+ "version": "0.17.5",
4
4
  "type": "module",
5
5
  "main": "dist/main.js",
6
6
  "types": "dist/main.d.ts",