@genai-fi/nanogpt 0.15.13 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/TeachableLLM.d.ts +1 -1
- package/dist/Trainer.js +10 -5
- package/dist/data/docx.d.ts +2 -1
- package/dist/data/docx.js +8 -8
- package/dist/data/parquet.d.ts +2 -1
- package/dist/data/parquet.js +5 -5
- package/dist/data/pdf.d.ts +2 -1
- package/dist/data/pdf.js +1 -1
- package/dist/data/textLoader.d.ts +2 -1
- package/dist/data/textLoader.js +55 -48
- package/dist/tokeniser/BaseTokeniser.d.ts +2 -2
- package/dist/tokeniser/BaseTokeniser.js +35 -35
- package/dist/tokeniser/CharTokeniser.d.ts +2 -1
- package/dist/tokeniser/CharTokeniser.js +15 -15
- package/dist/tokeniser/bpe.d.ts +2 -1
- package/dist/tokeniser/bpe.js +40 -40
- package/dist/tokeniser/type.d.ts +2 -2
- package/dist/training/BasicTrainer.js +62 -62
- package/dist/training/Evaluator.d.ts +2 -1
- package/dist/training/Evaluator.js +19 -18
- package/dist/training/SFTDatasetBuilder.js +47 -38
- package/dist/training/tasks/ConversationTask.d.ts +2 -2
- package/dist/training/tasks/ConversationTask.js +13 -11
- package/dist/training/tasks/PretrainingTask.d.ts +1 -2
- package/dist/training/tasks/PretrainingTask.js +4 -14
- package/dist/training/tasks/StartSentenceTask.d.ts +1 -2
- package/dist/training/tasks/StartSentenceTask.js +2 -7
- package/dist/training/tasks/Task.d.ts +1 -2
- package/dist/training/tasks/splitter.d.ts +5 -0
- package/dist/training/tasks/splitter.js +21 -0
- package/dist/training/validation.js +1 -1
- package/package.json +1 -1
package/dist/TeachableLLM.d.ts
CHANGED
|
@@ -49,7 +49,7 @@ export default class TeachableLLM {
|
|
|
49
49
|
getNumParams(): number;
|
|
50
50
|
trainer(trainingType?: TrainingType, options?: TrainingOptions): Trainer;
|
|
51
51
|
train(text: Task[], options?: TrainingOptions, trainingType?: TrainingType): Promise<void>;
|
|
52
|
-
trainTokeniser(text:
|
|
52
|
+
trainTokeniser(text: Conversation[][]): Promise<number>;
|
|
53
53
|
generator(): IGenerator;
|
|
54
54
|
generateText(prompt: Conversation[], options?: IGenerateOptions): Promise<Conversation[]>;
|
|
55
55
|
generateText(options?: IGenerateOptions): Promise<Conversation[]>;
|
package/dist/Trainer.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { E as g } from "./index-DvYrXKkX.js";
|
|
2
2
|
import o from "./training/PreTrainer.js";
|
|
3
|
-
import { createTrainValidationSplit as
|
|
3
|
+
import { createTrainValidationSplit as d } from "./training/validation.js";
|
|
4
4
|
import h from "./training/SFTTrainer.js";
|
|
5
|
+
import p from "./training/tasks/splitter.js";
|
|
5
6
|
class l extends g {
|
|
6
7
|
trainer;
|
|
7
8
|
trainingType = "pretraining";
|
|
@@ -81,7 +82,7 @@ class l extends g {
|
|
|
81
82
|
async prepare(t = []) {
|
|
82
83
|
const i = this.options;
|
|
83
84
|
if (this.trainingType === "pretraining" && this.trainer instanceof o) {
|
|
84
|
-
const { trainDataset: e, validationDataset: a, size: r, trainState: n } = await
|
|
85
|
+
const { trainDataset: e, validationDataset: a, size: r, trainState: n } = await d(
|
|
85
86
|
t,
|
|
86
87
|
this.trainer.tokenizer,
|
|
87
88
|
this.trainer.datasetBuilder,
|
|
@@ -92,12 +93,16 @@ class l extends g {
|
|
|
92
93
|
} else if (this.trainingType === "sft" && this.trainer instanceof h) {
|
|
93
94
|
if (t instanceof Uint16Array)
|
|
94
95
|
throw new Error("SFT training requires Task[] input");
|
|
95
|
-
const e = await this.trainer.datasetBuilder.createSFTDataset(
|
|
96
|
-
|
|
96
|
+
const e = p(t, i?.validationSplit || 0.1), a = await this.trainer.datasetBuilder.createSFTDataset(
|
|
97
|
+
[e.training],
|
|
98
|
+
i?.batchSize || 32,
|
|
99
|
+
-100
|
|
100
|
+
), r = await this.trainer.datasetBuilder.createSFTDataset(
|
|
101
|
+
[e.validation],
|
|
97
102
|
i?.batchSize || 32,
|
|
98
103
|
-100
|
|
99
104
|
);
|
|
100
|
-
this.trainDataset =
|
|
105
|
+
this.validationDataset = r, this.trainDataset = a, this.totalSamples = t.reduce((n, s) => n + s.length, 0), this.options.epochSteps = Math.ceil(this.totalSamples / (i?.batchSize || 32)), this.trainer.updateOptimizer(this.options);
|
|
101
106
|
}
|
|
102
107
|
}
|
|
103
108
|
configureModel(t) {
|
package/dist/data/docx.d.ts
CHANGED
|
@@ -1 +1,2 @@
|
|
|
1
|
-
|
|
1
|
+
import { Conversation } from '../tokeniser/type';
|
|
2
|
+
export declare function loadDOCX(file: Blob | Uint8Array): Promise<Conversation[][]>;
|
package/dist/data/docx.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import { z as a } from "../jszip.min-BZhlzntC.js";
|
|
2
|
-
async function c(
|
|
3
|
-
const
|
|
4
|
-
if (!
|
|
5
|
-
return i(
|
|
6
|
-
`).filter((
|
|
2
|
+
async function c(e) {
|
|
3
|
+
const n = await (await a.loadAsync(e)).file("word/document.xml")?.async("string");
|
|
4
|
+
if (!n) throw new Error("Failed to load document.xml");
|
|
5
|
+
return i(n).split(`
|
|
6
|
+
`).filter((t) => t.trim().length > 10).map((t) => [{ role: "text", content: t }]);
|
|
7
7
|
}
|
|
8
|
-
function i(
|
|
9
|
-
const
|
|
10
|
-
return Array.from(
|
|
8
|
+
function i(e) {
|
|
9
|
+
const n = new DOMParser().parseFromString(e, "application/xml");
|
|
10
|
+
return Array.from(n.getElementsByTagName("w:t")).map((t) => t.textContent).join(`
|
|
11
11
|
`);
|
|
12
12
|
}
|
|
13
13
|
export {
|
package/dist/data/parquet.d.ts
CHANGED
|
@@ -1 +1,2 @@
|
|
|
1
|
-
|
|
1
|
+
import { Conversation } from '../tokeniser/type';
|
|
2
|
+
export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<Conversation[][]>;
|
package/dist/data/parquet.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
import { B as
|
|
1
|
+
import { B as f } from "../index-Cp39cXWe.js";
|
|
2
2
|
const p = 100 * 1024 * 1024;
|
|
3
|
-
async function d(i,
|
|
4
|
-
const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(
|
|
3
|
+
async function d(i, n = p, e = "text") {
|
|
4
|
+
const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(f.from(await i.arrayBuffer())), a = [], s = r.getCursor([[e]]);
|
|
5
5
|
let o = 0;
|
|
6
6
|
for (; ; ) {
|
|
7
|
-
const t = await
|
|
7
|
+
const t = await s.next();
|
|
8
8
|
if (!t || t[e] === void 0 || typeof t[e] != "string")
|
|
9
9
|
break;
|
|
10
|
-
if (t[e].length !== 0 && (a.push(t[e]), o += t[e].length, o >
|
|
10
|
+
if (t[e].length !== 0 && (a.push([{ role: "text", content: t[e] }]), o += t[e].length, o > n))
|
|
11
11
|
break;
|
|
12
12
|
}
|
|
13
13
|
return r.close(), a;
|
package/dist/data/pdf.d.ts
CHANGED
|
@@ -1 +1,2 @@
|
|
|
1
|
-
|
|
1
|
+
import { Conversation } from '../tokeniser/type';
|
|
2
|
+
export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<Conversation[][]>;
|
package/dist/data/pdf.js
CHANGED
|
@@ -5,7 +5,7 @@ async function h(l, X = 104857600) {
|
|
|
5
5
|
let m = 0;
|
|
6
6
|
for (let b = 1; b <= N; b++) {
|
|
7
7
|
const G = (await (await d.getPage(b)).getTextContent()).items.filter((c) => c.str.trim().length > 10).map((c) => c.str).join(" ");
|
|
8
|
-
if (W.push(G), m += G.length, m > X) break;
|
|
8
|
+
if (W.push([{ role: "text", content: G }]), m += G.length, m > X) break;
|
|
9
9
|
}
|
|
10
10
|
return W;
|
|
11
11
|
}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
import { Conversation } from '../tokeniser/type';
|
|
1
2
|
export interface DataOptions {
|
|
2
3
|
maxSize?: number;
|
|
3
4
|
column?: string;
|
|
4
5
|
hasHeader?: boolean;
|
|
5
6
|
}
|
|
6
|
-
export default function loadTextData(file: File, options?: DataOptions): Promise<
|
|
7
|
+
export default function loadTextData(file: File, options?: DataOptions): Promise<Conversation[][]>;
|
package/dist/data/textLoader.js
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
import { p as u } from "../papaparse.min-C0cScC2i.js";
|
|
2
|
-
import { loadParquet as
|
|
3
|
-
import { loadPDF as
|
|
4
|
-
import { loadDOCX as
|
|
5
|
-
import { z as
|
|
6
|
-
function
|
|
7
|
-
const
|
|
8
|
-
return
|
|
2
|
+
import { loadParquet as f } from "./parquet.js";
|
|
3
|
+
import { loadPDF as d } from "./pdf.js";
|
|
4
|
+
import { loadDOCX as x } from "./docx.js";
|
|
5
|
+
import { z as m } from "../jszip.min-BZhlzntC.js";
|
|
6
|
+
function y(t, r) {
|
|
7
|
+
const n = t.findIndex((i) => i.toLowerCase() === r.toLowerCase());
|
|
8
|
+
return n === -1 ? 0 : n;
|
|
9
9
|
}
|
|
10
|
-
function
|
|
11
|
-
return t.every((
|
|
10
|
+
function w(t) {
|
|
11
|
+
return t.every((r) => r.length < 64);
|
|
12
12
|
}
|
|
13
13
|
function h(t) {
|
|
14
14
|
return t.split(".").pop() || "";
|
|
@@ -35,67 +35,74 @@ function g(t) {
|
|
|
35
35
|
return "unknown";
|
|
36
36
|
}
|
|
37
37
|
}
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
38
|
+
function z(t) {
|
|
39
|
+
if (!Array.isArray(t)) return !1;
|
|
40
|
+
const r = t[0];
|
|
41
|
+
return typeof r == "object" && r !== null && "role" in r && "content" in r && typeof r.role == "string" && typeof r.content == "string";
|
|
42
|
+
}
|
|
43
|
+
async function j(t, r) {
|
|
44
|
+
const n = t.type !== "" ? t.type : g(t.name);
|
|
45
|
+
if (n === "application/parquet")
|
|
46
|
+
return f(t, r?.maxSize, r?.column);
|
|
47
|
+
if (n === "application/pdf")
|
|
48
|
+
return d(t, r?.maxSize);
|
|
49
|
+
if (n === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
|
50
|
+
return x(t);
|
|
51
|
+
if (n === "application/json") {
|
|
52
|
+
const i = await t.text(), o = JSON.parse(i);
|
|
53
|
+
if (Array.isArray(o))
|
|
54
|
+
return o.map((e) => [
|
|
55
|
+
typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
|
|
56
|
+
]);
|
|
52
57
|
throw new Error("Expected JSON array");
|
|
53
58
|
}
|
|
54
|
-
if (
|
|
59
|
+
if (n === "application/jsonl")
|
|
55
60
|
return (await t.text()).split(`
|
|
56
|
-
`).filter((
|
|
61
|
+
`).filter((o) => o.trim() !== "").map((o) => {
|
|
57
62
|
try {
|
|
58
|
-
const e = JSON.parse(
|
|
59
|
-
return
|
|
63
|
+
const e = JSON.parse(o);
|
|
64
|
+
return z(e) ? e : [
|
|
65
|
+
typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
|
|
66
|
+
];
|
|
60
67
|
} catch {
|
|
61
|
-
return
|
|
68
|
+
return [{ role: "text", content: o }];
|
|
62
69
|
}
|
|
63
70
|
});
|
|
64
|
-
if (
|
|
65
|
-
const i = await
|
|
71
|
+
if (n === "application/zip") {
|
|
72
|
+
const i = await m.loadAsync(t), o = [];
|
|
66
73
|
for (const e of Object.keys(i.files)) {
|
|
67
|
-
const
|
|
68
|
-
if (
|
|
69
|
-
const c = await
|
|
70
|
-
|
|
74
|
+
const a = i.file(e);
|
|
75
|
+
if (a) {
|
|
76
|
+
const c = await a.async("blob"), s = await j(new File([c], e), r);
|
|
77
|
+
o.push(...s);
|
|
71
78
|
}
|
|
72
79
|
}
|
|
73
|
-
return
|
|
80
|
+
return o;
|
|
74
81
|
}
|
|
75
|
-
if (
|
|
82
|
+
if (n === "text/csv") {
|
|
76
83
|
const i = await t.text();
|
|
77
|
-
return new Promise((
|
|
84
|
+
return new Promise((o, e) => {
|
|
78
85
|
u.parse(i, {
|
|
79
86
|
header: !1,
|
|
80
87
|
skipEmptyLines: !0,
|
|
81
88
|
delimiter: ",",
|
|
82
|
-
complete: (
|
|
83
|
-
if (
|
|
84
|
-
console.error(
|
|
89
|
+
complete: (a) => {
|
|
90
|
+
if (a.errors.length > 0)
|
|
91
|
+
console.error(a.errors), e(new Error("Error parsing file"));
|
|
85
92
|
else {
|
|
86
|
-
const c =
|
|
87
|
-
|
|
93
|
+
const c = y(a.data[0], r?.column || "text"), p = r?.hasHeader ?? w(a.data[0]) ? a.data.slice(1) : a.data;
|
|
94
|
+
o(p.map((l) => [{ role: "text", content: l[c] }]));
|
|
88
95
|
}
|
|
89
96
|
},
|
|
90
|
-
error: (
|
|
91
|
-
e(
|
|
97
|
+
error: (a) => {
|
|
98
|
+
e(a);
|
|
92
99
|
}
|
|
93
100
|
});
|
|
94
101
|
});
|
|
95
|
-
} else if (
|
|
96
|
-
return [await t.text()];
|
|
97
|
-
throw new Error(`Unsupported file type: ${
|
|
102
|
+
} else if (n === "text/plain")
|
|
103
|
+
return [[{ role: "text", content: await t.text() }]];
|
|
104
|
+
throw new Error(`Unsupported file type: ${n}`);
|
|
98
105
|
}
|
|
99
106
|
export {
|
|
100
|
-
|
|
107
|
+
j as default
|
|
101
108
|
};
|
|
@@ -12,7 +12,7 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
|
|
|
12
12
|
isSpecialToken(index: number): boolean;
|
|
13
13
|
protected addSpecialTokens(): void;
|
|
14
14
|
protected addSpecialToken(token: string, index: number): void;
|
|
15
|
-
abstract train(text:
|
|
15
|
+
abstract train(text: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
|
|
16
16
|
abstract getVocab(): string[];
|
|
17
17
|
abstract getMerges(): [string, string][];
|
|
18
18
|
abstract destroy(): void;
|
|
@@ -21,6 +21,6 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
|
|
|
21
21
|
encodeAsSequence(conversation: Conversation[], completion?: boolean): number[];
|
|
22
22
|
encodeConversation(conversation: Conversation[], completion?: boolean): number[];
|
|
23
23
|
abstract decode(tokens: number[]): string;
|
|
24
|
-
decodeConversation(tokens: number[]): Conversation[];
|
|
24
|
+
decodeConversation(tokens: number[] | Uint16Array): Conversation[];
|
|
25
25
|
getSpecialTokenIndex(token: string): number | undefined;
|
|
26
26
|
}
|
|
@@ -11,30 +11,30 @@ const h = [
|
|
|
11
11
|
"<|system_start|>",
|
|
12
12
|
"<|system_end|>"
|
|
13
13
|
];
|
|
14
|
-
class
|
|
14
|
+
class k extends r {
|
|
15
15
|
specialTokens = /* @__PURE__ */ new Map();
|
|
16
16
|
specialTokenSet = /* @__PURE__ */ new Set();
|
|
17
|
-
isSpecialToken(
|
|
18
|
-
return this.specialTokenSet.has(
|
|
17
|
+
isSpecialToken(s) {
|
|
18
|
+
return this.specialTokenSet.has(s);
|
|
19
19
|
}
|
|
20
20
|
addSpecialTokens() {
|
|
21
|
-
h.forEach((
|
|
22
|
-
this.addToken(
|
|
21
|
+
h.forEach((s, t) => {
|
|
22
|
+
this.addToken(s, t), this.specialTokens.set(s, t), this.specialTokenSet.add(t);
|
|
23
23
|
});
|
|
24
24
|
}
|
|
25
|
-
addSpecialToken(
|
|
26
|
-
this.specialTokens.set(
|
|
25
|
+
addSpecialToken(s, t) {
|
|
26
|
+
this.specialTokens.set(s, t), this.specialTokenSet.add(t);
|
|
27
27
|
}
|
|
28
|
-
encodeSequence(
|
|
29
|
-
const t = this.encode(
|
|
28
|
+
encodeSequence(s) {
|
|
29
|
+
const t = this.encode(s);
|
|
30
30
|
return [this.bosToken, ...t, this.eosToken];
|
|
31
31
|
}
|
|
32
|
-
encodeAsSequence(
|
|
33
|
-
const
|
|
34
|
-
return t ? [this.bosToken, ...
|
|
32
|
+
encodeAsSequence(s, t) {
|
|
33
|
+
const e = s.flatMap((o) => this.encode(o.content));
|
|
34
|
+
return t ? [this.bosToken, ...e, this.eosToken, this.bosToken] : [this.bosToken, ...e, this.eosToken];
|
|
35
35
|
}
|
|
36
|
-
encodeConversation(
|
|
37
|
-
const
|
|
36
|
+
encodeConversation(s, t) {
|
|
37
|
+
const e = [[this.bosToken]], o = [
|
|
38
38
|
this.getSpecialTokenIndex("<|user_start|>"),
|
|
39
39
|
this.getSpecialTokenIndex("<|assistant_start|>"),
|
|
40
40
|
this.getSpecialTokenIndex("<|system_start|>")
|
|
@@ -43,57 +43,57 @@ class l extends r {
|
|
|
43
43
|
this.getSpecialTokenIndex("<|assistant_end|>"),
|
|
44
44
|
this.getSpecialTokenIndex("<|system_end|>")
|
|
45
45
|
];
|
|
46
|
-
for (const i of
|
|
46
|
+
for (const i of s) {
|
|
47
47
|
const c = this.encode(i.content);
|
|
48
48
|
switch (i.role) {
|
|
49
49
|
case "user":
|
|
50
|
-
|
|
50
|
+
e.push([o[0]]);
|
|
51
51
|
break;
|
|
52
52
|
case "assistant":
|
|
53
|
-
|
|
53
|
+
e.push([o[1]]);
|
|
54
54
|
break;
|
|
55
55
|
case "system":
|
|
56
|
-
|
|
56
|
+
e.push([o[2]]);
|
|
57
57
|
break;
|
|
58
58
|
}
|
|
59
|
-
switch (
|
|
59
|
+
switch (e.push(c), i.role) {
|
|
60
60
|
case "user":
|
|
61
|
-
|
|
61
|
+
e.push([n[0]]);
|
|
62
62
|
break;
|
|
63
63
|
case "assistant":
|
|
64
|
-
|
|
64
|
+
e.push([n[1]]);
|
|
65
65
|
break;
|
|
66
66
|
case "system":
|
|
67
|
-
|
|
67
|
+
e.push([n[2]]);
|
|
68
68
|
break;
|
|
69
69
|
}
|
|
70
70
|
}
|
|
71
|
-
const a =
|
|
71
|
+
const a = e.flat();
|
|
72
72
|
return t ? a.push(o[1]) : a.push(this.eosToken), a;
|
|
73
73
|
}
|
|
74
|
-
decodeConversation(
|
|
74
|
+
decodeConversation(s) {
|
|
75
75
|
const t = [];
|
|
76
|
-
let
|
|
77
|
-
for (;
|
|
78
|
-
const o = e
|
|
76
|
+
let e = 0;
|
|
77
|
+
for (; e < s.length; ) {
|
|
78
|
+
const o = s[e];
|
|
79
79
|
let n = null;
|
|
80
|
-
if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>")
|
|
81
|
-
|
|
80
|
+
if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") ? n = "system" : o === this.bosToken || (o === this.eosToken ? n = null : (n = "text", e--)), n) {
|
|
81
|
+
e++;
|
|
82
82
|
const a = [];
|
|
83
|
-
for (;
|
|
84
|
-
a.push(e
|
|
83
|
+
for (; e < s.length && s[e] !== this.getSpecialTokenIndex(`<|${n}_end|>`) && s[e] !== this.eosToken; )
|
|
84
|
+
a.push(s[e]), e++;
|
|
85
85
|
const i = this.decode(a);
|
|
86
86
|
t.push({ role: n, content: i });
|
|
87
87
|
}
|
|
88
|
-
|
|
88
|
+
e++;
|
|
89
89
|
}
|
|
90
90
|
return t;
|
|
91
91
|
}
|
|
92
|
-
getSpecialTokenIndex(
|
|
93
|
-
return this.specialTokens.get(
|
|
92
|
+
getSpecialTokenIndex(s) {
|
|
93
|
+
return this.specialTokens.get(s);
|
|
94
94
|
}
|
|
95
95
|
}
|
|
96
96
|
export {
|
|
97
97
|
h as SPECIALS,
|
|
98
|
-
|
|
98
|
+
k as default
|
|
99
99
|
};
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { default as BaseTokeniser } from './BaseTokeniser';
|
|
2
|
+
import { Conversation } from './type';
|
|
2
3
|
export default class CharTokeniser extends BaseTokeniser {
|
|
3
4
|
vocabSize: number;
|
|
4
5
|
eosToken: number;
|
|
@@ -11,7 +12,7 @@ export default class CharTokeniser extends BaseTokeniser {
|
|
|
11
12
|
addToken(token: string, index?: number): number;
|
|
12
13
|
get trained(): boolean;
|
|
13
14
|
destroy(): void;
|
|
14
|
-
train(text:
|
|
15
|
+
train(text: Conversation[][]): Promise<number>;
|
|
15
16
|
tokenise(text: string[], numeric: true): number[][];
|
|
16
17
|
tokenise(text: string[]): string[][];
|
|
17
18
|
detokenise(tokens: (number[] | Uint16Array)[]): string[];
|
|
@@ -40,32 +40,32 @@ class T extends k {
|
|
|
40
40
|
this.cache.clear(), this.vocab = [];
|
|
41
41
|
}
|
|
42
42
|
async train(i) {
|
|
43
|
-
const t = i.map((n) => n.split("")).flat(), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1),
|
|
43
|
+
const t = i.map((o) => o.map((n) => n.content.split(""))).flat(2), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - u.length;
|
|
44
44
|
if (h === -1)
|
|
45
45
|
return this.vocabSize;
|
|
46
|
-
if (this._trained = !0, s.length >
|
|
47
|
-
const
|
|
48
|
-
t.forEach((
|
|
49
|
-
|
|
50
|
-
}), s.sort((
|
|
46
|
+
if (this._trained = !0, s.length > a) {
|
|
47
|
+
const o = /* @__PURE__ */ new Map();
|
|
48
|
+
t.forEach((n) => {
|
|
49
|
+
o.set(n, (o.get(n) || 0) + 1);
|
|
50
|
+
}), s.sort((n, r) => (o.get(n) || 0) - (o.get(r) || 0)), s.splice(0, s.length - a);
|
|
51
51
|
}
|
|
52
52
|
let c = h;
|
|
53
53
|
if (c !== -1) {
|
|
54
|
-
const
|
|
55
|
-
for (const
|
|
56
|
-
if (!
|
|
54
|
+
const o = new Set(this.vocab);
|
|
55
|
+
for (const n of s)
|
|
56
|
+
if (!o.has(n) && (this.vocab[c] = n, o.add(n), c = this.vocab.indexOf("", c + 1), c === -1))
|
|
57
57
|
break;
|
|
58
58
|
}
|
|
59
|
-
return this.cache.clear(), this.vocab.forEach((
|
|
60
|
-
this.cache.set(
|
|
59
|
+
return this.cache.clear(), this.vocab.forEach((o, n) => {
|
|
60
|
+
this.cache.set(o, n);
|
|
61
61
|
}), this.emit("trainStatus", "trained"), this.vocabSize;
|
|
62
62
|
}
|
|
63
63
|
tokenise(i, t) {
|
|
64
64
|
if (!this.trained)
|
|
65
65
|
throw new Error("Tokeniser not trained");
|
|
66
66
|
return i.map((s) => t ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
|
|
67
|
-
const
|
|
68
|
-
return
|
|
67
|
+
const a = this.cache.get(h);
|
|
68
|
+
return a !== void 0 ? this.vocab[a] : "";
|
|
69
69
|
}));
|
|
70
70
|
}
|
|
71
71
|
detokenise(i) {
|
|
@@ -85,8 +85,8 @@ class T extends k {
|
|
|
85
85
|
}
|
|
86
86
|
async createTrainingData(i, t = 5) {
|
|
87
87
|
const e = await this.tokenise(i, !0), s = [], h = [];
|
|
88
|
-
for (let
|
|
89
|
-
s.push(...e[
|
|
88
|
+
for (let a = 0; a < e.length - t; a++)
|
|
89
|
+
s.push(...e[a].slice(0, t)), h.push(e[a + 1][0]);
|
|
90
90
|
return [s, h];
|
|
91
91
|
}
|
|
92
92
|
}
|
package/dist/tokeniser/bpe.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { default as BaseTokeniser } from './BaseTokeniser';
|
|
2
|
+
import { Conversation } from './type';
|
|
2
3
|
export default class BPETokeniser extends BaseTokeniser {
|
|
3
4
|
private targetSize;
|
|
4
5
|
private vocab;
|
|
@@ -14,7 +15,7 @@ export default class BPETokeniser extends BaseTokeniser {
|
|
|
14
15
|
get eosToken(): number;
|
|
15
16
|
get bosToken(): number;
|
|
16
17
|
get unkToken(): number;
|
|
17
|
-
train(text
|
|
18
|
+
train(text?: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
|
|
18
19
|
getVocab(): string[];
|
|
19
20
|
getMerges(): [string, string][];
|
|
20
21
|
private tokeniseWord;
|