@genai-fi/nanogpt 0.17.4 → 0.17.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/tokeniser/CharTokeniser.js +53 -42
- package/dist/tokeniser/bpe.js +36 -31
- package/package.json +1 -1
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
import { yieldIfNeeded as d } from "../utilities/yielder.js";
|
|
2
|
+
import f, { SPECIALS as u } from "./BaseTokeniser.js";
|
|
3
|
+
const b = ["<eos>", "<unk>"];
|
|
4
|
+
class p extends f {
|
|
4
5
|
vocabSize = 0;
|
|
5
6
|
eosToken = 0;
|
|
6
7
|
bosToken = 0;
|
|
@@ -8,30 +9,30 @@ class T extends k {
|
|
|
8
9
|
vocab = [];
|
|
9
10
|
cache = /* @__PURE__ */ new Map();
|
|
10
11
|
_trained = !1;
|
|
11
|
-
constructor(
|
|
12
|
-
if (super(), Array.isArray(
|
|
13
|
-
if (this.vocab =
|
|
14
|
-
this.vocabSize = this.vocab.length,
|
|
15
|
-
const e = this.vocab.indexOf(
|
|
16
|
-
e !== -1 && this.addSpecialToken(
|
|
17
|
-
}), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex("") ?? -1, this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<unk>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<pad>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("_")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf(" ")), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab = this.vocab.map((
|
|
18
|
-
this.cache.set(
|
|
12
|
+
constructor(t) {
|
|
13
|
+
if (super(), Array.isArray(t)) {
|
|
14
|
+
if (this.vocab = t, this.vocab.length > 0)
|
|
15
|
+
this.vocabSize = this.vocab.length, u.forEach((i) => {
|
|
16
|
+
const e = this.vocab.indexOf(i);
|
|
17
|
+
e !== -1 && this.addSpecialToken(i, e);
|
|
18
|
+
}), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex("") ?? -1, this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<unk>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<pad>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("_")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf(" ")), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab = this.vocab.map((i) => i === "<pad>" ? "" : i), this.vocab.forEach((i, e) => {
|
|
19
|
+
this.cache.set(i, e);
|
|
19
20
|
});
|
|
20
21
|
else
|
|
21
22
|
throw new Error("Vocab cannot be empty");
|
|
22
23
|
this._trained = !0;
|
|
23
24
|
} else
|
|
24
|
-
this.vocabSize =
|
|
25
|
-
this.cache.set(
|
|
25
|
+
this.vocabSize = t, this.vocab = new Array(this.vocabSize).fill(""), this.addSpecialTokens(), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex(""), this.vocab.forEach((i, e) => {
|
|
26
|
+
this.cache.set(i, e);
|
|
26
27
|
}), this.cache.set("", this.unkToken);
|
|
27
28
|
}
|
|
28
|
-
addToken(
|
|
29
|
-
if (this.cache.has(
|
|
30
|
-
return this.cache.get(
|
|
29
|
+
addToken(t, i) {
|
|
30
|
+
if (this.cache.has(t))
|
|
31
|
+
return this.cache.get(t);
|
|
31
32
|
let e;
|
|
32
|
-
if (
|
|
33
|
+
if (i !== void 0 ? e = i : (e = this.vocab.indexOf("", this.unkToken + 1), e === -1 && (e = this.vocabSize)), e >= this.vocabSize)
|
|
33
34
|
throw new Error("Vocab size exceeded");
|
|
34
|
-
return this.vocab[e] =
|
|
35
|
+
return this.vocab[e] = t, this.cache.set(t, e), e;
|
|
35
36
|
}
|
|
36
37
|
get trained() {
|
|
37
38
|
return this.vocab.length === this.vocabSize && this._trained;
|
|
@@ -39,43 +40,53 @@ class T extends k {
|
|
|
39
40
|
destroy() {
|
|
40
41
|
this.cache.clear(), this.vocab = [];
|
|
41
42
|
}
|
|
42
|
-
async train(
|
|
43
|
-
const
|
|
43
|
+
async train(t) {
|
|
44
|
+
const i = /* @__PURE__ */ new Set();
|
|
45
|
+
let e = performance.now();
|
|
46
|
+
for (const n of t)
|
|
47
|
+
n.forEach((o) => {
|
|
48
|
+
for (const r of o.content)
|
|
49
|
+
i.add(r);
|
|
50
|
+
}), e = await d(e);
|
|
51
|
+
const s = Array.from(i), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - b.length;
|
|
44
52
|
if (h === -1)
|
|
45
53
|
return this.vocabSize;
|
|
46
54
|
if (this._trained = !0, s.length > a) {
|
|
47
|
-
const
|
|
48
|
-
t.forEach((
|
|
49
|
-
o.
|
|
50
|
-
|
|
55
|
+
const n = /* @__PURE__ */ new Map();
|
|
56
|
+
t.forEach((o) => {
|
|
57
|
+
o.forEach((r) => {
|
|
58
|
+
for (const k of r.content)
|
|
59
|
+
n.set(k, (n.get(k) || 0) + 1);
|
|
60
|
+
});
|
|
61
|
+
}), s.sort((o, r) => (n.get(o) || 0) - (n.get(r) || 0)), s.splice(0, s.length - a);
|
|
51
62
|
}
|
|
52
63
|
let c = h;
|
|
53
64
|
if (c !== -1) {
|
|
54
|
-
const
|
|
55
|
-
for (const
|
|
56
|
-
if (!
|
|
65
|
+
const n = new Set(this.vocab);
|
|
66
|
+
for (const o of s)
|
|
67
|
+
if (!n.has(o) && (this.vocab[c] = o, n.add(o), c = this.vocab.indexOf("", c + 1), c === -1))
|
|
57
68
|
break;
|
|
58
69
|
}
|
|
59
|
-
return this.cache.clear(), this.vocab.forEach((
|
|
60
|
-
this.cache.set(
|
|
70
|
+
return this.cache.clear(), this.vocab.forEach((n, o) => {
|
|
71
|
+
this.cache.set(n, o);
|
|
61
72
|
}), this.emit("trainStatus", "trained"), this.vocabSize;
|
|
62
73
|
}
|
|
63
|
-
tokenise(
|
|
74
|
+
tokenise(t, i) {
|
|
64
75
|
if (!this.trained)
|
|
65
76
|
throw new Error("Tokeniser not trained");
|
|
66
|
-
return
|
|
77
|
+
return t.map((s) => i ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
|
|
67
78
|
const a = this.cache.get(h);
|
|
68
79
|
return a !== void 0 ? this.vocab[a] : "";
|
|
69
80
|
}));
|
|
70
81
|
}
|
|
71
|
-
detokenise(
|
|
72
|
-
return
|
|
82
|
+
detokenise(t) {
|
|
83
|
+
return t.map((e) => Array.from(e).map((s) => this.vocab[s] || "").join(""));
|
|
73
84
|
}
|
|
74
|
-
encode(
|
|
75
|
-
return this.tokenise([
|
|
85
|
+
encode(t) {
|
|
86
|
+
return this.tokenise([t], !0)[0];
|
|
76
87
|
}
|
|
77
|
-
decode(
|
|
78
|
-
return this.detokenise([
|
|
88
|
+
decode(t) {
|
|
89
|
+
return this.detokenise([t])[0];
|
|
79
90
|
}
|
|
80
91
|
getVocab() {
|
|
81
92
|
return this.vocab;
|
|
@@ -83,13 +94,13 @@ class T extends k {
|
|
|
83
94
|
getMerges() {
|
|
84
95
|
return [];
|
|
85
96
|
}
|
|
86
|
-
async createTrainingData(
|
|
87
|
-
const e = await this.tokenise(
|
|
88
|
-
for (let a = 0; a < e.length -
|
|
89
|
-
s.push(...e[a].slice(0,
|
|
97
|
+
async createTrainingData(t, i = 5) {
|
|
98
|
+
const e = await this.tokenise(t, !0), s = [], h = [];
|
|
99
|
+
for (let a = 0; a < e.length - i; a++)
|
|
100
|
+
s.push(...e[a].slice(0, i)), h.push(e[a + 1][0]);
|
|
90
101
|
return [s, h];
|
|
91
102
|
}
|
|
92
103
|
}
|
|
93
104
|
export {
|
|
94
|
-
|
|
105
|
+
p as default
|
|
95
106
|
};
|
package/dist/tokeniser/bpe.js
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
import { yieldIfNeeded as
|
|
1
|
+
import { yieldIfNeeded as p } from "../utilities/yielder.js";
|
|
2
2
|
import m from "../utilities/tokenParse.js";
|
|
3
|
-
import
|
|
4
|
-
function
|
|
3
|
+
import w, { SPECIALS as S } from "./BaseTokeniser.js";
|
|
4
|
+
function g(o, e) {
|
|
5
5
|
return `${o}-::-${e}`;
|
|
6
6
|
}
|
|
7
|
-
function
|
|
7
|
+
function T(o) {
|
|
8
8
|
const e = /* @__PURE__ */ new Map();
|
|
9
9
|
for (let s = 0; s < o.length; s++) {
|
|
10
10
|
const t = o[s];
|
|
11
11
|
for (let n = 0; n < t.length - 1; n++) {
|
|
12
|
-
const r =
|
|
12
|
+
const r = g(t[n], t[n + 1]), a = e.get(r) || {
|
|
13
13
|
a: t[n],
|
|
14
14
|
b: t[n + 1],
|
|
15
15
|
count: 0,
|
|
@@ -20,21 +20,21 @@ function w(o) {
|
|
|
20
20
|
}
|
|
21
21
|
return { pairs: e, tokens: o };
|
|
22
22
|
}
|
|
23
|
-
function
|
|
24
|
-
const r =
|
|
23
|
+
function f(o, e, s, t, n) {
|
|
24
|
+
const r = g(e, s);
|
|
25
25
|
if (o.pairs.has(r)) {
|
|
26
26
|
const a = o.pairs.get(r);
|
|
27
27
|
a.count += n, n > 0 ? a.instances.add(t) : a.count <= 0 ? o.pairs.delete(r) : a.instances.delete(t);
|
|
28
28
|
} else
|
|
29
29
|
o.pairs.set(r, { a: e, b: s, count: n, instances: /* @__PURE__ */ new Set([t]) });
|
|
30
30
|
}
|
|
31
|
-
function
|
|
31
|
+
function y(o) {
|
|
32
32
|
let e = null, s = 0;
|
|
33
33
|
for (const t of o.pairs.values())
|
|
34
34
|
t.count > s && (s = t.count, e = t);
|
|
35
35
|
return e;
|
|
36
36
|
}
|
|
37
|
-
function
|
|
37
|
+
function I(o, e) {
|
|
38
38
|
return o.map((s) => {
|
|
39
39
|
const t = [];
|
|
40
40
|
for (let n = 0; n < s.length; n++)
|
|
@@ -42,19 +42,19 @@ function y(o, e) {
|
|
|
42
42
|
return t;
|
|
43
43
|
});
|
|
44
44
|
}
|
|
45
|
-
function
|
|
45
|
+
function x(o, e) {
|
|
46
46
|
e.instances.forEach((s) => {
|
|
47
47
|
const t = o.tokens[s], n = [];
|
|
48
48
|
for (let r = 0; r < t.length; r++)
|
|
49
49
|
if (r < t.length - 1 && t[r] === e.a && t[r + 1] === e.b) {
|
|
50
50
|
const a = e.a + e.b;
|
|
51
|
-
n.push(a), r > 0 && (
|
|
51
|
+
n.push(a), r > 0 && (f(o, t[r - 1], e.a, s, -1), f(o, t[r - 1], a, s, 1)), r++, r < t.length - 1 && (f(o, e.b, t[r + 1], s, -1), f(o, a, t[r + 1], s, 1));
|
|
52
52
|
} else
|
|
53
53
|
n.push(t[r]);
|
|
54
54
|
o.tokens[s] = n;
|
|
55
|
-
}), o.pairs.delete(
|
|
55
|
+
}), o.pairs.delete(g(e.a, e.b));
|
|
56
56
|
}
|
|
57
|
-
class
|
|
57
|
+
class C extends w {
|
|
58
58
|
targetSize;
|
|
59
59
|
vocab = /* @__PURE__ */ new Set();
|
|
60
60
|
vocabIndex = /* @__PURE__ */ new Map();
|
|
@@ -63,7 +63,7 @@ class E extends z {
|
|
|
63
63
|
constructor(e, s) {
|
|
64
64
|
super(), Array.isArray(e) ? (e.forEach((t, n) => {
|
|
65
65
|
this.vocab.add(t), this.vocabIndex.set(t, n);
|
|
66
|
-
}), s && (this.merges = s), this.targetSize = e.length,
|
|
66
|
+
}), s && (this.merges = s), this.targetSize = e.length, S.forEach((t) => {
|
|
67
67
|
const n = e.indexOf(t);
|
|
68
68
|
n !== -1 && this.addSpecialToken(t, n);
|
|
69
69
|
})) : (this.addSpecialTokens(), this.targetSize = e);
|
|
@@ -81,7 +81,7 @@ class E extends z {
|
|
|
81
81
|
this.vocab.clear(), this.vocabIndex.clear(), this.merges = [], this.pretokenMap.clear();
|
|
82
82
|
}
|
|
83
83
|
get trained() {
|
|
84
|
-
return this.vocab.size >
|
|
84
|
+
return this.vocab.size > S.length && this.vocab.size <= this.targetSize;
|
|
85
85
|
}
|
|
86
86
|
get vocabSize() {
|
|
87
87
|
return this.vocab.size;
|
|
@@ -97,39 +97,44 @@ class E extends z {
|
|
|
97
97
|
}
|
|
98
98
|
async train(e = [], s) {
|
|
99
99
|
let t = performance.now();
|
|
100
|
-
const n =
|
|
101
|
-
|
|
102
|
-
|
|
100
|
+
const n = new Array(e.length);
|
|
101
|
+
for (let i = 0; i < e.length; i++) {
|
|
102
|
+
const h = e[i], l = new Array(h.length);
|
|
103
|
+
for (let d = 0; d < h.length; d++)
|
|
104
|
+
l[d] = m(h[d].content);
|
|
105
|
+
t = await p(t, s, this.vocab.size), n[i] = l;
|
|
106
|
+
}
|
|
107
|
+
const r = n.flat(2), a = new Set(r);
|
|
103
108
|
this.vocab = /* @__PURE__ */ new Set(), this.pretokenMap.clear(), this.merges = [], this.addSpecialTokens();
|
|
104
|
-
const
|
|
105
|
-
if (t = await
|
|
109
|
+
const b = Array.from(a), v = b.map((i) => Array.from(i).map((l) => (this.vocab.add(l), l))), k = T(v);
|
|
110
|
+
if (t = await p(t, s, this.vocab.size), this.vocab.size >= this.targetSize) {
|
|
106
111
|
console.warn("Initial vocab size is greater than or equal to target size. No merges will be performed.");
|
|
107
112
|
const i = /* @__PURE__ */ new Map();
|
|
108
|
-
|
|
113
|
+
r.forEach((c) => {
|
|
109
114
|
Array.from(c).forEach((u) => {
|
|
110
115
|
i.set(u, (i.get(u) || 0) + 1);
|
|
111
116
|
});
|
|
112
117
|
});
|
|
113
118
|
const h = Array.from(i.entries()).sort((c, u) => u[1] - c[1]);
|
|
114
119
|
this.vocab = /* @__PURE__ */ new Set(), this.addSpecialTokens(), h.slice(0, this.targetSize - this.vocab.size).map(([c]) => c).forEach((c) => this.vocab.add(c)), this.vocabIndex.clear();
|
|
115
|
-
let
|
|
120
|
+
let d = 0;
|
|
116
121
|
for (const c of this.vocab.keys())
|
|
117
|
-
this.vocabIndex.set(c,
|
|
122
|
+
this.vocabIndex.set(c, d++);
|
|
118
123
|
return this.emit("trainStatus", "trained"), this.vocab.size;
|
|
119
124
|
}
|
|
120
125
|
for (; this.vocab.size < this.targetSize && this.merges.length < this.targetSize; ) {
|
|
121
|
-
const i =
|
|
126
|
+
const i = y(k);
|
|
122
127
|
if (!i)
|
|
123
128
|
break;
|
|
124
|
-
this.merges.push([i.a, i.b]), this.vocab.add(i.a + i.b),
|
|
129
|
+
this.merges.push([i.a, i.b]), this.vocab.add(i.a + i.b), x(k, i), t = await p(t, s, this.vocab.size);
|
|
125
130
|
}
|
|
126
|
-
|
|
127
|
-
const l =
|
|
131
|
+
b.forEach((i, h) => {
|
|
132
|
+
const l = v[h];
|
|
128
133
|
this.pretokenMap.set(i, l);
|
|
129
134
|
}), this.vocabIndex.clear();
|
|
130
|
-
let
|
|
135
|
+
let z = 0;
|
|
131
136
|
for (const i of this.vocab.keys())
|
|
132
|
-
this.vocabIndex.set(i,
|
|
137
|
+
this.vocabIndex.set(i, z++);
|
|
133
138
|
return this.emit("trainStatus", "trained"), this.vocab.size;
|
|
134
139
|
}
|
|
135
140
|
getVocab() {
|
|
@@ -141,7 +146,7 @@ class E extends z {
|
|
|
141
146
|
tokeniseWord(e) {
|
|
142
147
|
let s = Array.from(e);
|
|
143
148
|
return this.merges.forEach((t) => {
|
|
144
|
-
s =
|
|
149
|
+
s = I([s], t)[0];
|
|
145
150
|
}), this.pretokenMap.set(e, s), s;
|
|
146
151
|
}
|
|
147
152
|
tokeniseStrings(e) {
|
|
@@ -163,5 +168,5 @@ class E extends z {
|
|
|
163
168
|
}
|
|
164
169
|
}
|
|
165
170
|
export {
|
|
166
|
-
|
|
171
|
+
C as default
|
|
167
172
|
};
|