@genai-fi/nanogpt 0.15.14 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/TeachableLLM.d.ts +1 -1
- package/dist/data/docx.d.ts +2 -1
- package/dist/data/docx.js +8 -8
- package/dist/data/parquet.d.ts +2 -1
- package/dist/data/parquet.js +5 -5
- package/dist/data/pdf.d.ts +2 -1
- package/dist/data/pdf.js +1 -1
- package/dist/data/textLoader.d.ts +2 -1
- package/dist/data/textLoader.js +38 -37
- package/dist/tokeniser/BaseTokeniser.d.ts +2 -2
- package/dist/tokeniser/BaseTokeniser.js +35 -35
- package/dist/tokeniser/CharTokeniser.d.ts +2 -1
- package/dist/tokeniser/CharTokeniser.js +15 -15
- package/dist/tokeniser/bpe.d.ts +2 -1
- package/dist/tokeniser/bpe.js +40 -40
- package/dist/tokeniser/type.d.ts +2 -2
- package/dist/training/SFTDatasetBuilder.js +10 -8
- package/package.json +1 -1
package/dist/TeachableLLM.d.ts
CHANGED
|
@@ -49,7 +49,7 @@ export default class TeachableLLM {
|
|
|
49
49
|
getNumParams(): number;
|
|
50
50
|
trainer(trainingType?: TrainingType, options?: TrainingOptions): Trainer;
|
|
51
51
|
train(text: Task[], options?: TrainingOptions, trainingType?: TrainingType): Promise<void>;
|
|
52
|
-
trainTokeniser(text:
|
|
52
|
+
trainTokeniser(text: Conversation[][]): Promise<number>;
|
|
53
53
|
generator(): IGenerator;
|
|
54
54
|
generateText(prompt: Conversation[], options?: IGenerateOptions): Promise<Conversation[]>;
|
|
55
55
|
generateText(options?: IGenerateOptions): Promise<Conversation[]>;
|
package/dist/data/docx.d.ts
CHANGED
|
@@ -1 +1,2 @@
|
|
|
1
|
-
|
|
1
|
+
import { Conversation } from '../tokeniser/type';
|
|
2
|
+
export declare function loadDOCX(file: Blob | Uint8Array): Promise<Conversation[][]>;
|
package/dist/data/docx.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import { z as a } from "../jszip.min-BZhlzntC.js";
|
|
2
|
-
async function c(
|
|
3
|
-
const
|
|
4
|
-
if (!
|
|
5
|
-
return i(
|
|
6
|
-
`).filter((
|
|
2
|
+
async function c(e) {
|
|
3
|
+
const n = await (await a.loadAsync(e)).file("word/document.xml")?.async("string");
|
|
4
|
+
if (!n) throw new Error("Failed to load document.xml");
|
|
5
|
+
return i(n).split(`
|
|
6
|
+
`).filter((t) => t.trim().length > 10).map((t) => [{ role: "text", content: t }]);
|
|
7
7
|
}
|
|
8
|
-
function i(
|
|
9
|
-
const
|
|
10
|
-
return Array.from(
|
|
8
|
+
function i(e) {
|
|
9
|
+
const n = new DOMParser().parseFromString(e, "application/xml");
|
|
10
|
+
return Array.from(n.getElementsByTagName("w:t")).map((t) => t.textContent).join(`
|
|
11
11
|
`);
|
|
12
12
|
}
|
|
13
13
|
export {
|
package/dist/data/parquet.d.ts
CHANGED
|
@@ -1 +1,2 @@
|
|
|
1
|
-
|
|
1
|
+
import { Conversation } from '../tokeniser/type';
|
|
2
|
+
export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<Conversation[][]>;
|
package/dist/data/parquet.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
import { B as
|
|
1
|
+
import { B as f } from "../index-Cp39cXWe.js";
|
|
2
2
|
const p = 100 * 1024 * 1024;
|
|
3
|
-
async function d(i,
|
|
4
|
-
const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(
|
|
3
|
+
async function d(i, n = p, e = "text") {
|
|
4
|
+
const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(f.from(await i.arrayBuffer())), a = [], s = r.getCursor([[e]]);
|
|
5
5
|
let o = 0;
|
|
6
6
|
for (; ; ) {
|
|
7
|
-
const t = await
|
|
7
|
+
const t = await s.next();
|
|
8
8
|
if (!t || t[e] === void 0 || typeof t[e] != "string")
|
|
9
9
|
break;
|
|
10
|
-
if (t[e].length !== 0 && (a.push(t[e]), o += t[e].length, o >
|
|
10
|
+
if (t[e].length !== 0 && (a.push([{ role: "text", content: t[e] }]), o += t[e].length, o > n))
|
|
11
11
|
break;
|
|
12
12
|
}
|
|
13
13
|
return r.close(), a;
|
package/dist/data/pdf.d.ts
CHANGED
|
@@ -1 +1,2 @@
|
|
|
1
|
-
|
|
1
|
+
import { Conversation } from '../tokeniser/type';
|
|
2
|
+
export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<Conversation[][]>;
|
package/dist/data/pdf.js
CHANGED
|
@@ -5,7 +5,7 @@ async function h(l, X = 104857600) {
|
|
|
5
5
|
let m = 0;
|
|
6
6
|
for (let b = 1; b <= N; b++) {
|
|
7
7
|
const G = (await (await d.getPage(b)).getTextContent()).items.filter((c) => c.str.trim().length > 10).map((c) => c.str).join(" ");
|
|
8
|
-
if (W.push(G), m += G.length, m > X) break;
|
|
8
|
+
if (W.push([{ role: "text", content: G }]), m += G.length, m > X) break;
|
|
9
9
|
}
|
|
10
10
|
return W;
|
|
11
11
|
}
|
package/dist/data/textLoader.d.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import { Conversation } from '../tokeniser/type';
|
|
1
2
|
export interface DataOptions {
|
|
2
3
|
maxSize?: number;
|
|
3
4
|
column?: string;
|
|
4
5
|
hasHeader?: boolean;
|
|
5
6
|
}
|
|
6
|
-
export default function loadTextData(file: File, options?: DataOptions): Promise<
|
|
7
|
+
export default function loadTextData(file: File, options?: DataOptions): Promise<Conversation[][]>;
|
package/dist/data/textLoader.js
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import { p as u } from "../papaparse.min-C0cScC2i.js";
|
|
2
2
|
import { loadParquet as f } from "./parquet.js";
|
|
3
3
|
import { loadPDF as d } from "./pdf.js";
|
|
4
|
-
import { loadDOCX as
|
|
5
|
-
import { z as
|
|
4
|
+
import { loadDOCX as x } from "./docx.js";
|
|
5
|
+
import { z as m } from "../jszip.min-BZhlzntC.js";
|
|
6
6
|
function y(t, r) {
|
|
7
|
-
const
|
|
8
|
-
return
|
|
7
|
+
const n = t.findIndex((i) => i.toLowerCase() === r.toLowerCase());
|
|
8
|
+
return n === -1 ? 0 : n;
|
|
9
9
|
}
|
|
10
10
|
function w(t) {
|
|
11
11
|
return t.every((r) => r.length < 64);
|
|
@@ -35,73 +35,74 @@ function g(t) {
|
|
|
35
35
|
return "unknown";
|
|
36
36
|
}
|
|
37
37
|
}
|
|
38
|
-
function
|
|
38
|
+
function z(t) {
|
|
39
39
|
if (!Array.isArray(t)) return !1;
|
|
40
40
|
const r = t[0];
|
|
41
41
|
return typeof r == "object" && r !== null && "role" in r && "content" in r && typeof r.role == "string" && typeof r.content == "string";
|
|
42
42
|
}
|
|
43
|
-
async function
|
|
44
|
-
const
|
|
45
|
-
if (
|
|
43
|
+
async function j(t, r) {
|
|
44
|
+
const n = t.type !== "" ? t.type : g(t.name);
|
|
45
|
+
if (n === "application/parquet")
|
|
46
46
|
return f(t, r?.maxSize, r?.column);
|
|
47
|
-
if (
|
|
47
|
+
if (n === "application/pdf")
|
|
48
48
|
return d(t, r?.maxSize);
|
|
49
|
-
if (
|
|
50
|
-
return
|
|
51
|
-
if (
|
|
49
|
+
if (n === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
|
50
|
+
return x(t);
|
|
51
|
+
if (n === "application/json") {
|
|
52
52
|
const i = await t.text(), o = JSON.parse(i);
|
|
53
53
|
if (Array.isArray(o))
|
|
54
|
-
return o.map(
|
|
55
|
-
|
|
56
|
-
);
|
|
54
|
+
return o.map((e) => [
|
|
55
|
+
typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
|
|
56
|
+
]);
|
|
57
57
|
throw new Error("Expected JSON array");
|
|
58
58
|
}
|
|
59
|
-
if (
|
|
59
|
+
if (n === "application/jsonl")
|
|
60
60
|
return (await t.text()).split(`
|
|
61
61
|
`).filter((o) => o.trim() !== "").map((o) => {
|
|
62
62
|
try {
|
|
63
63
|
const e = JSON.parse(o);
|
|
64
|
-
return
|
|
65
|
-
|
|
64
|
+
return z(e) ? e : [
|
|
65
|
+
typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
|
|
66
|
+
];
|
|
66
67
|
} catch {
|
|
67
|
-
return o;
|
|
68
|
+
return [{ role: "text", content: o }];
|
|
68
69
|
}
|
|
69
70
|
});
|
|
70
|
-
if (
|
|
71
|
-
const i = await
|
|
71
|
+
if (n === "application/zip") {
|
|
72
|
+
const i = await m.loadAsync(t), o = [];
|
|
72
73
|
for (const e of Object.keys(i.files)) {
|
|
73
|
-
const
|
|
74
|
-
if (
|
|
75
|
-
const
|
|
76
|
-
o.push(...
|
|
74
|
+
const a = i.file(e);
|
|
75
|
+
if (a) {
|
|
76
|
+
const c = await a.async("blob"), s = await j(new File([c], e), r);
|
|
77
|
+
o.push(...s);
|
|
77
78
|
}
|
|
78
79
|
}
|
|
79
80
|
return o;
|
|
80
81
|
}
|
|
81
|
-
if (
|
|
82
|
+
if (n === "text/csv") {
|
|
82
83
|
const i = await t.text();
|
|
83
84
|
return new Promise((o, e) => {
|
|
84
85
|
u.parse(i, {
|
|
85
86
|
header: !1,
|
|
86
87
|
skipEmptyLines: !0,
|
|
87
88
|
delimiter: ",",
|
|
88
|
-
complete: (
|
|
89
|
-
if (
|
|
90
|
-
console.error(
|
|
89
|
+
complete: (a) => {
|
|
90
|
+
if (a.errors.length > 0)
|
|
91
|
+
console.error(a.errors), e(new Error("Error parsing file"));
|
|
91
92
|
else {
|
|
92
|
-
const
|
|
93
|
-
o(p.map((l) => l[
|
|
93
|
+
const c = y(a.data[0], r?.column || "text"), p = r?.hasHeader ?? w(a.data[0]) ? a.data.slice(1) : a.data;
|
|
94
|
+
o(p.map((l) => [{ role: "text", content: l[c] }]));
|
|
94
95
|
}
|
|
95
96
|
},
|
|
96
|
-
error: (
|
|
97
|
-
e(
|
|
97
|
+
error: (a) => {
|
|
98
|
+
e(a);
|
|
98
99
|
}
|
|
99
100
|
});
|
|
100
101
|
});
|
|
101
|
-
} else if (
|
|
102
|
-
return [await t.text()];
|
|
103
|
-
throw new Error(`Unsupported file type: ${
|
|
102
|
+
} else if (n === "text/plain")
|
|
103
|
+
return [[{ role: "text", content: await t.text() }]];
|
|
104
|
+
throw new Error(`Unsupported file type: ${n}`);
|
|
104
105
|
}
|
|
105
106
|
export {
|
|
106
|
-
|
|
107
|
+
j as default
|
|
107
108
|
};
|
package/dist/tokeniser/BaseTokeniser.d.ts
CHANGED
|
@@ -12,7 +12,7 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
|
|
|
12
12
|
isSpecialToken(index: number): boolean;
|
|
13
13
|
protected addSpecialTokens(): void;
|
|
14
14
|
protected addSpecialToken(token: string, index: number): void;
|
|
15
|
-
abstract train(text:
|
|
15
|
+
abstract train(text: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
|
|
16
16
|
abstract getVocab(): string[];
|
|
17
17
|
abstract getMerges(): [string, string][];
|
|
18
18
|
abstract destroy(): void;
|
|
@@ -21,6 +21,6 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
|
|
|
21
21
|
encodeAsSequence(conversation: Conversation[], completion?: boolean): number[];
|
|
22
22
|
encodeConversation(conversation: Conversation[], completion?: boolean): number[];
|
|
23
23
|
abstract decode(tokens: number[]): string;
|
|
24
|
-
decodeConversation(tokens: number[]): Conversation[];
|
|
24
|
+
decodeConversation(tokens: number[] | Uint16Array): Conversation[];
|
|
25
25
|
getSpecialTokenIndex(token: string): number | undefined;
|
|
26
26
|
}
|
package/dist/tokeniser/BaseTokeniser.js
CHANGED
|
@@ -11,30 +11,30 @@ const h = [
|
|
|
11
11
|
"<|system_start|>",
|
|
12
12
|
"<|system_end|>"
|
|
13
13
|
];
|
|
14
|
-
class
|
|
14
|
+
class k extends r {
|
|
15
15
|
specialTokens = /* @__PURE__ */ new Map();
|
|
16
16
|
specialTokenSet = /* @__PURE__ */ new Set();
|
|
17
|
-
isSpecialToken(
|
|
18
|
-
return this.specialTokenSet.has(
|
|
17
|
+
isSpecialToken(s) {
|
|
18
|
+
return this.specialTokenSet.has(s);
|
|
19
19
|
}
|
|
20
20
|
addSpecialTokens() {
|
|
21
|
-
h.forEach((
|
|
22
|
-
this.addToken(
|
|
21
|
+
h.forEach((s, t) => {
|
|
22
|
+
this.addToken(s, t), this.specialTokens.set(s, t), this.specialTokenSet.add(t);
|
|
23
23
|
});
|
|
24
24
|
}
|
|
25
|
-
addSpecialToken(
|
|
26
|
-
this.specialTokens.set(
|
|
25
|
+
addSpecialToken(s, t) {
|
|
26
|
+
this.specialTokens.set(s, t), this.specialTokenSet.add(t);
|
|
27
27
|
}
|
|
28
|
-
encodeSequence(
|
|
29
|
-
const t = this.encode(
|
|
28
|
+
encodeSequence(s) {
|
|
29
|
+
const t = this.encode(s);
|
|
30
30
|
return [this.bosToken, ...t, this.eosToken];
|
|
31
31
|
}
|
|
32
|
-
encodeAsSequence(
|
|
33
|
-
const
|
|
34
|
-
return t ? [this.bosToken, ...
|
|
32
|
+
encodeAsSequence(s, t) {
|
|
33
|
+
const e = s.flatMap((o) => this.encode(o.content));
|
|
34
|
+
return t ? [this.bosToken, ...e, this.eosToken, this.bosToken] : [this.bosToken, ...e, this.eosToken];
|
|
35
35
|
}
|
|
36
|
-
encodeConversation(
|
|
37
|
-
const
|
|
36
|
+
encodeConversation(s, t) {
|
|
37
|
+
const e = [[this.bosToken]], o = [
|
|
38
38
|
this.getSpecialTokenIndex("<|user_start|>"),
|
|
39
39
|
this.getSpecialTokenIndex("<|assistant_start|>"),
|
|
40
40
|
this.getSpecialTokenIndex("<|system_start|>")
|
|
@@ -43,57 +43,57 @@ class l extends r {
|
|
|
43
43
|
this.getSpecialTokenIndex("<|assistant_end|>"),
|
|
44
44
|
this.getSpecialTokenIndex("<|system_end|>")
|
|
45
45
|
];
|
|
46
|
-
for (const i of
|
|
46
|
+
for (const i of s) {
|
|
47
47
|
const c = this.encode(i.content);
|
|
48
48
|
switch (i.role) {
|
|
49
49
|
case "user":
|
|
50
|
-
|
|
50
|
+
e.push([o[0]]);
|
|
51
51
|
break;
|
|
52
52
|
case "assistant":
|
|
53
|
-
|
|
53
|
+
e.push([o[1]]);
|
|
54
54
|
break;
|
|
55
55
|
case "system":
|
|
56
|
-
|
|
56
|
+
e.push([o[2]]);
|
|
57
57
|
break;
|
|
58
58
|
}
|
|
59
|
-
switch (
|
|
59
|
+
switch (e.push(c), i.role) {
|
|
60
60
|
case "user":
|
|
61
|
-
|
|
61
|
+
e.push([n[0]]);
|
|
62
62
|
break;
|
|
63
63
|
case "assistant":
|
|
64
|
-
|
|
64
|
+
e.push([n[1]]);
|
|
65
65
|
break;
|
|
66
66
|
case "system":
|
|
67
|
-
|
|
67
|
+
e.push([n[2]]);
|
|
68
68
|
break;
|
|
69
69
|
}
|
|
70
70
|
}
|
|
71
|
-
const a =
|
|
71
|
+
const a = e.flat();
|
|
72
72
|
return t ? a.push(o[1]) : a.push(this.eosToken), a;
|
|
73
73
|
}
|
|
74
|
-
decodeConversation(
|
|
74
|
+
decodeConversation(s) {
|
|
75
75
|
const t = [];
|
|
76
|
-
let
|
|
77
|
-
for (;
|
|
78
|
-
const o = e
|
|
76
|
+
let e = 0;
|
|
77
|
+
for (; e < s.length; ) {
|
|
78
|
+
const o = s[e];
|
|
79
79
|
let n = null;
|
|
80
|
-
if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>")
|
|
81
|
-
|
|
80
|
+
if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") ? n = "system" : o === this.bosToken || (o === this.eosToken ? n = null : (n = "text", e--)), n) {
|
|
81
|
+
e++;
|
|
82
82
|
const a = [];
|
|
83
|
-
for (;
|
|
84
|
-
a.push(e
|
|
83
|
+
for (; e < s.length && s[e] !== this.getSpecialTokenIndex(`<|${n}_end|>`) && s[e] !== this.eosToken; )
|
|
84
|
+
a.push(s[e]), e++;
|
|
85
85
|
const i = this.decode(a);
|
|
86
86
|
t.push({ role: n, content: i });
|
|
87
87
|
}
|
|
88
|
-
|
|
88
|
+
e++;
|
|
89
89
|
}
|
|
90
90
|
return t;
|
|
91
91
|
}
|
|
92
|
-
getSpecialTokenIndex(
|
|
93
|
-
return this.specialTokens.get(
|
|
92
|
+
getSpecialTokenIndex(s) {
|
|
93
|
+
return this.specialTokens.get(s);
|
|
94
94
|
}
|
|
95
95
|
}
|
|
96
96
|
export {
|
|
97
97
|
h as SPECIALS,
|
|
98
|
-
|
|
98
|
+
k as default
|
|
99
99
|
};
|
package/dist/tokeniser/CharTokeniser.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { default as BaseTokeniser } from './BaseTokeniser';
|
|
2
|
+
import { Conversation } from './type';
|
|
2
3
|
export default class CharTokeniser extends BaseTokeniser {
|
|
3
4
|
vocabSize: number;
|
|
4
5
|
eosToken: number;
|
|
@@ -11,7 +12,7 @@ export default class CharTokeniser extends BaseTokeniser {
|
|
|
11
12
|
addToken(token: string, index?: number): number;
|
|
12
13
|
get trained(): boolean;
|
|
13
14
|
destroy(): void;
|
|
14
|
-
train(text:
|
|
15
|
+
train(text: Conversation[][]): Promise<number>;
|
|
15
16
|
tokenise(text: string[], numeric: true): number[][];
|
|
16
17
|
tokenise(text: string[]): string[][];
|
|
17
18
|
detokenise(tokens: (number[] | Uint16Array)[]): string[];
|
package/dist/tokeniser/CharTokeniser.js
CHANGED
|
@@ -40,32 +40,32 @@ class T extends k {
|
|
|
40
40
|
this.cache.clear(), this.vocab = [];
|
|
41
41
|
}
|
|
42
42
|
async train(i) {
|
|
43
|
-
const t = i.map((n) => n.split("")).flat(), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1),
|
|
43
|
+
const t = i.map((o) => o.map((n) => n.content.split(""))).flat(2), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - u.length;
|
|
44
44
|
if (h === -1)
|
|
45
45
|
return this.vocabSize;
|
|
46
|
-
if (this._trained = !0, s.length >
|
|
47
|
-
const
|
|
48
|
-
t.forEach((
|
|
49
|
-
|
|
50
|
-
}), s.sort((
|
|
46
|
+
if (this._trained = !0, s.length > a) {
|
|
47
|
+
const o = /* @__PURE__ */ new Map();
|
|
48
|
+
t.forEach((n) => {
|
|
49
|
+
o.set(n, (o.get(n) || 0) + 1);
|
|
50
|
+
}), s.sort((n, r) => (o.get(n) || 0) - (o.get(r) || 0)), s.splice(0, s.length - a);
|
|
51
51
|
}
|
|
52
52
|
let c = h;
|
|
53
53
|
if (c !== -1) {
|
|
54
|
-
const
|
|
55
|
-
for (const
|
|
56
|
-
if (!
|
|
54
|
+
const o = new Set(this.vocab);
|
|
55
|
+
for (const n of s)
|
|
56
|
+
if (!o.has(n) && (this.vocab[c] = n, o.add(n), c = this.vocab.indexOf("", c + 1), c === -1))
|
|
57
57
|
break;
|
|
58
58
|
}
|
|
59
|
-
return this.cache.clear(), this.vocab.forEach((
|
|
60
|
-
this.cache.set(
|
|
59
|
+
return this.cache.clear(), this.vocab.forEach((o, n) => {
|
|
60
|
+
this.cache.set(o, n);
|
|
61
61
|
}), this.emit("trainStatus", "trained"), this.vocabSize;
|
|
62
62
|
}
|
|
63
63
|
tokenise(i, t) {
|
|
64
64
|
if (!this.trained)
|
|
65
65
|
throw new Error("Tokeniser not trained");
|
|
66
66
|
return i.map((s) => t ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
|
|
67
|
-
const
|
|
68
|
-
return
|
|
67
|
+
const a = this.cache.get(h);
|
|
68
|
+
return a !== void 0 ? this.vocab[a] : "";
|
|
69
69
|
}));
|
|
70
70
|
}
|
|
71
71
|
detokenise(i) {
|
|
@@ -85,8 +85,8 @@ class T extends k {
|
|
|
85
85
|
}
|
|
86
86
|
async createTrainingData(i, t = 5) {
|
|
87
87
|
const e = await this.tokenise(i, !0), s = [], h = [];
|
|
88
|
-
for (let
|
|
89
|
-
s.push(...e[
|
|
88
|
+
for (let a = 0; a < e.length - t; a++)
|
|
89
|
+
s.push(...e[a].slice(0, t)), h.push(e[a + 1][0]);
|
|
90
90
|
return [s, h];
|
|
91
91
|
}
|
|
92
92
|
}
|
package/dist/tokeniser/bpe.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { default as BaseTokeniser } from './BaseTokeniser';
|
|
2
|
+
import { Conversation } from './type';
|
|
2
3
|
export default class BPETokeniser extends BaseTokeniser {
|
|
3
4
|
private targetSize;
|
|
4
5
|
private vocab;
|
|
@@ -14,7 +15,7 @@ export default class BPETokeniser extends BaseTokeniser {
|
|
|
14
15
|
get eosToken(): number;
|
|
15
16
|
get bosToken(): number;
|
|
16
17
|
get unkToken(): number;
|
|
17
|
-
train(text
|
|
18
|
+
train(text?: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
|
|
18
19
|
getVocab(): string[];
|
|
19
20
|
getMerges(): [string, string][];
|
|
20
21
|
private tokeniseWord;
|
package/dist/tokeniser/bpe.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { yieldIfNeeded as f } from "../utilities/yielder.js";
|
|
2
|
-
import
|
|
3
|
-
import z, { SPECIALS as
|
|
2
|
+
import m from "../utilities/tokenParse.js";
|
|
3
|
+
import z, { SPECIALS as k } from "./BaseTokeniser.js";
|
|
4
4
|
function p(o, e) {
|
|
5
5
|
return `${o}-::-${e}`;
|
|
6
6
|
}
|
|
@@ -8,25 +8,25 @@ function w(o) {
|
|
|
8
8
|
const e = /* @__PURE__ */ new Map();
|
|
9
9
|
for (let s = 0; s < o.length; s++) {
|
|
10
10
|
const t = o[s];
|
|
11
|
-
for (let
|
|
12
|
-
const
|
|
13
|
-
a: t[
|
|
14
|
-
b: t[
|
|
11
|
+
for (let n = 0; n < t.length - 1; n++) {
|
|
12
|
+
const r = p(t[n], t[n + 1]), a = e.get(r) || {
|
|
13
|
+
a: t[n],
|
|
14
|
+
b: t[n + 1],
|
|
15
15
|
count: 0,
|
|
16
16
|
instances: /* @__PURE__ */ new Set()
|
|
17
17
|
};
|
|
18
|
-
a.count += 1, a.instances.add(s), e.set(
|
|
18
|
+
a.count += 1, a.instances.add(s), e.set(r, a);
|
|
19
19
|
}
|
|
20
20
|
}
|
|
21
21
|
return { pairs: e, tokens: o };
|
|
22
22
|
}
|
|
23
|
-
function d(o, e, s, t,
|
|
24
|
-
const
|
|
25
|
-
if (o.pairs.has(
|
|
26
|
-
const a = o.pairs.get(
|
|
27
|
-
a.count +=
|
|
23
|
+
function d(o, e, s, t, n) {
|
|
24
|
+
const r = p(e, s);
|
|
25
|
+
if (o.pairs.has(r)) {
|
|
26
|
+
const a = o.pairs.get(r);
|
|
27
|
+
a.count += n, n > 0 ? a.instances.add(t) : a.count <= 0 ? o.pairs.delete(r) : a.instances.delete(t);
|
|
28
28
|
} else
|
|
29
|
-
o.pairs.set(
|
|
29
|
+
o.pairs.set(r, { a: e, b: s, count: n, instances: /* @__PURE__ */ new Set([t]) });
|
|
30
30
|
}
|
|
31
31
|
function T(o) {
|
|
32
32
|
let e = null, s = 0;
|
|
@@ -37,21 +37,21 @@ function T(o) {
|
|
|
37
37
|
function y(o, e) {
|
|
38
38
|
return o.map((s) => {
|
|
39
39
|
const t = [];
|
|
40
|
-
for (let
|
|
41
|
-
|
|
40
|
+
for (let n = 0; n < s.length; n++)
|
|
41
|
+
n < s.length - 1 && s[n] === e[0] && s[n + 1] === e[1] ? (t.push(e[0] + e[1]), n++) : t.push(s[n]);
|
|
42
42
|
return t;
|
|
43
43
|
});
|
|
44
44
|
}
|
|
45
45
|
function I(o, e) {
|
|
46
46
|
e.instances.forEach((s) => {
|
|
47
|
-
const t = o.tokens[s],
|
|
48
|
-
for (let
|
|
49
|
-
if (
|
|
47
|
+
const t = o.tokens[s], n = [];
|
|
48
|
+
for (let r = 0; r < t.length; r++)
|
|
49
|
+
if (r < t.length - 1 && t[r] === e.a && t[r + 1] === e.b) {
|
|
50
50
|
const a = e.a + e.b;
|
|
51
|
-
|
|
51
|
+
n.push(a), r > 0 && (d(o, t[r - 1], e.a, s, -1), d(o, t[r - 1], a, s, 1)), r++, r < t.length - 1 && (d(o, e.b, t[r + 1], s, -1), d(o, a, t[r + 1], s, 1));
|
|
52
52
|
} else
|
|
53
|
-
|
|
54
|
-
o.tokens[s] =
|
|
53
|
+
n.push(t[r]);
|
|
54
|
+
o.tokens[s] = n;
|
|
55
55
|
}), o.pairs.delete(p(e.a, e.b));
|
|
56
56
|
}
|
|
57
57
|
class E extends z {
|
|
@@ -61,11 +61,11 @@ class E extends z {
|
|
|
61
61
|
merges = [];
|
|
62
62
|
pretokenMap = /* @__PURE__ */ new Map();
|
|
63
63
|
constructor(e, s) {
|
|
64
|
-
super(), Array.isArray(e) ? (e.forEach((t,
|
|
65
|
-
this.vocab.add(t), this.vocabIndex.set(t,
|
|
66
|
-
}), s && (this.merges = s), this.targetSize = e.length,
|
|
67
|
-
const
|
|
68
|
-
|
|
64
|
+
super(), Array.isArray(e) ? (e.forEach((t, n) => {
|
|
65
|
+
this.vocab.add(t), this.vocabIndex.set(t, n);
|
|
66
|
+
}), s && (this.merges = s), this.targetSize = e.length, k.forEach((t) => {
|
|
67
|
+
const n = e.indexOf(t);
|
|
68
|
+
n !== -1 && this.addSpecialToken(t, n);
|
|
69
69
|
})) : (this.addSpecialTokens(), this.targetSize = e);
|
|
70
70
|
}
|
|
71
71
|
addToken(e, s) {
|
|
@@ -81,7 +81,7 @@ class E extends z {
|
|
|
81
81
|
this.vocab.clear(), this.vocabIndex.clear(), this.merges = [], this.pretokenMap.clear();
|
|
82
82
|
}
|
|
83
83
|
get trained() {
|
|
84
|
-
return this.vocab.size >
|
|
84
|
+
return this.vocab.size > k.length && this.vocab.size <= this.targetSize;
|
|
85
85
|
}
|
|
86
86
|
get vocabSize() {
|
|
87
87
|
return this.vocab.size;
|
|
@@ -95,23 +95,23 @@ class E extends z {
|
|
|
95
95
|
get unkToken() {
|
|
96
96
|
return this.vocabIndex.get("") ?? 1;
|
|
97
97
|
}
|
|
98
|
-
async train(e, s) {
|
|
98
|
+
async train(e = [], s) {
|
|
99
99
|
let t = performance.now();
|
|
100
|
-
const
|
|
100
|
+
const n = e.map((i) => i.map((h) => m(h.content))).flat(2);
|
|
101
101
|
t = await f(t, s, this.vocab.size);
|
|
102
|
-
const
|
|
102
|
+
const r = new Set(n);
|
|
103
103
|
this.vocab = /* @__PURE__ */ new Set(), this.pretokenMap.clear(), this.merges = [], this.addSpecialTokens();
|
|
104
|
-
const a = Array.from(
|
|
104
|
+
const a = Array.from(r), b = a.map((i) => Array.from(i).map((l) => (this.vocab.add(l), l))), g = w(b);
|
|
105
105
|
if (t = await f(t, s, this.vocab.size), this.vocab.size >= this.targetSize) {
|
|
106
106
|
console.warn("Initial vocab size is greater than or equal to target size. No merges will be performed.");
|
|
107
107
|
const i = /* @__PURE__ */ new Map();
|
|
108
|
-
|
|
108
|
+
n.forEach((c) => {
|
|
109
109
|
Array.from(c).forEach((u) => {
|
|
110
110
|
i.set(u, (i.get(u) || 0) + 1);
|
|
111
111
|
});
|
|
112
112
|
});
|
|
113
|
-
const
|
|
114
|
-
this.vocab = /* @__PURE__ */ new Set(), this.addSpecialTokens(),
|
|
113
|
+
const h = Array.from(i.entries()).sort((c, u) => u[1] - c[1]);
|
|
114
|
+
this.vocab = /* @__PURE__ */ new Set(), this.addSpecialTokens(), h.slice(0, this.targetSize - this.vocab.size).map(([c]) => c).forEach((c) => this.vocab.add(c)), this.vocabIndex.clear();
|
|
115
115
|
let S = 0;
|
|
116
116
|
for (const c of this.vocab.keys())
|
|
117
117
|
this.vocabIndex.set(c, S++);
|
|
@@ -123,9 +123,9 @@ class E extends z {
|
|
|
123
123
|
break;
|
|
124
124
|
this.merges.push([i.a, i.b]), this.vocab.add(i.a + i.b), I(g, i), t = await f(t, s, this.vocab.size);
|
|
125
125
|
}
|
|
126
|
-
a.forEach((i,
|
|
127
|
-
const
|
|
128
|
-
this.pretokenMap.set(i,
|
|
126
|
+
a.forEach((i, h) => {
|
|
127
|
+
const l = b[h];
|
|
128
|
+
this.pretokenMap.set(i, l);
|
|
129
129
|
}), this.vocabIndex.clear();
|
|
130
130
|
let v = 0;
|
|
131
131
|
for (const i of this.vocab.keys())
|
|
@@ -145,15 +145,15 @@ class E extends z {
|
|
|
145
145
|
}), this.pretokenMap.set(e, s), s;
|
|
146
146
|
}
|
|
147
147
|
tokeniseStrings(e) {
|
|
148
|
-
return e.map((s) =>
|
|
148
|
+
return e.map((s) => m(s).map((r) => this.pretokenMap.has(r) ? this.pretokenMap.get(r) : this.tokeniseWord(r)).flat(1));
|
|
149
149
|
}
|
|
150
150
|
tokenise(e, s) {
|
|
151
151
|
const t = this.tokeniseStrings(e);
|
|
152
|
-
return s ? t.map((
|
|
152
|
+
return s ? t.map((n) => n.map((r) => this.vocabIndex.get(r) ?? this.unkToken)) : t.map((n) => n.map((r) => this.vocab.has(r) ? r : ""));
|
|
153
153
|
}
|
|
154
154
|
detokenise(e) {
|
|
155
155
|
const s = this.getVocab();
|
|
156
|
-
return e.map((
|
|
156
|
+
return e.map((n) => n.map((r) => s[r]).join(""));
|
|
157
157
|
}
|
|
158
158
|
encode(e) {
|
|
159
159
|
return this.tokenise([e], !0)[0];
|
package/dist/tokeniser/type.d.ts
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import { default as EE } from 'eventemitter3';
|
|
2
|
-
export type Roles = 'user' | 'assistant' | 'system';
|
|
2
|
+
export type Roles = 'user' | 'assistant' | 'system' | 'text';
|
|
3
3
|
export interface Conversation {
|
|
4
4
|
role: Roles;
|
|
5
5
|
content: string;
|
|
6
6
|
}
|
|
7
7
|
export interface ITokeniser extends EE<'trainStatus'> {
|
|
8
|
-
train(text:
|
|
8
|
+
train(text: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
|
|
9
9
|
getVocab(): string[];
|
|
10
10
|
getMerges(): [string, string][];
|
|
11
11
|
destroy(): void;
|
package/dist/training/SFTDatasetBuilder.js
CHANGED
|
@@ -6,22 +6,24 @@ function w(p, o, t, l) {
|
|
|
6
6
|
const s = [t.bosToken], a = [!1], u = {
|
|
7
7
|
user: t.getSpecialTokenIndex("<|user_start|>"),
|
|
8
8
|
assistant: t.getSpecialTokenIndex("<|assistant_start|>"),
|
|
9
|
-
system: t.getSpecialTokenIndex("<|system_start|>")
|
|
9
|
+
system: t.getSpecialTokenIndex("<|system_start|>"),
|
|
10
|
+
text: void 0
|
|
10
11
|
}, c = {
|
|
11
12
|
user: t.getSpecialTokenIndex("<|user_end|>"),
|
|
12
13
|
assistant: t.getSpecialTokenIndex("<|assistant_end|>"),
|
|
13
|
-
system: t.getSpecialTokenIndex("<|system_end|>")
|
|
14
|
+
system: t.getSpecialTokenIndex("<|system_end|>"),
|
|
15
|
+
text: void 0
|
|
14
16
|
};
|
|
15
17
|
for (const e of p) {
|
|
16
18
|
const r = u[e.role], h = c[e.role];
|
|
17
19
|
if (!r || !h)
|
|
18
20
|
throw new Error(`Missing special tokens for role: ${e.role}`);
|
|
19
21
|
s.push(r), a.push(!1);
|
|
20
|
-
const m = e.role === "assistant",
|
|
21
|
-
for (const T of
|
|
22
|
+
const m = e.role === "assistant", x = t.encode(e.content);
|
|
23
|
+
for (const T of x) {
|
|
22
24
|
s.push(T);
|
|
23
|
-
const
|
|
24
|
-
a.push(m && !
|
|
25
|
+
const S = t.isSpecialToken(T);
|
|
26
|
+
a.push(m && !S);
|
|
25
27
|
}
|
|
26
28
|
s.push(h), a.push(m);
|
|
27
29
|
}
|
|
@@ -40,7 +42,7 @@ function w(p, o, t, l) {
|
|
|
40
42
|
}
|
|
41
43
|
return g ? { xs: f, ys: d } : null;
|
|
42
44
|
}
|
|
43
|
-
class
|
|
45
|
+
class A {
|
|
44
46
|
tokenizer;
|
|
45
47
|
blockSize;
|
|
46
48
|
constructor(o, t = 128) {
|
|
@@ -78,6 +80,6 @@ class D {
|
|
|
78
80
|
}
|
|
79
81
|
}
|
|
80
82
|
export {
|
|
81
|
-
|
|
83
|
+
A as SFTDatasetBuilder,
|
|
82
84
|
w as buildSFTExample
|
|
83
85
|
};
|