@genai-fi/nanogpt 0.15.13 → 0.16.1

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -49,7 +49,7 @@ export default class TeachableLLM {
49
49
  getNumParams(): number;
50
50
  trainer(trainingType?: TrainingType, options?: TrainingOptions): Trainer;
51
51
  train(text: Task[], options?: TrainingOptions, trainingType?: TrainingType): Promise<void>;
52
- trainTokeniser(text: string[]): Promise<number>;
52
+ trainTokeniser(text: Conversation[][]): Promise<number>;
53
53
  generator(): IGenerator;
54
54
  generateText(prompt: Conversation[], options?: IGenerateOptions): Promise<Conversation[]>;
55
55
  generateText(options?: IGenerateOptions): Promise<Conversation[]>;
package/dist/Trainer.js CHANGED
@@ -1,7 +1,8 @@
1
1
  import { E as g } from "./index-DvYrXKkX.js";
2
2
  import o from "./training/PreTrainer.js";
3
- import { createTrainValidationSplit as p } from "./training/validation.js";
3
+ import { createTrainValidationSplit as d } from "./training/validation.js";
4
4
  import h from "./training/SFTTrainer.js";
5
+ import p from "./training/tasks/splitter.js";
5
6
  class l extends g {
6
7
  trainer;
7
8
  trainingType = "pretraining";
@@ -81,7 +82,7 @@ class l extends g {
81
82
  async prepare(t = []) {
82
83
  const i = this.options;
83
84
  if (this.trainingType === "pretraining" && this.trainer instanceof o) {
84
- const { trainDataset: e, validationDataset: a, size: r, trainState: n } = await p(
85
+ const { trainDataset: e, validationDataset: a, size: r, trainState: n } = await d(
85
86
  t,
86
87
  this.trainer.tokenizer,
87
88
  this.trainer.datasetBuilder,
@@ -92,12 +93,16 @@ class l extends g {
92
93
  } else if (this.trainingType === "sft" && this.trainer instanceof h) {
93
94
  if (t instanceof Uint16Array)
94
95
  throw new Error("SFT training requires Task[] input");
95
- const e = await this.trainer.datasetBuilder.createSFTDataset(
96
- t,
96
+ const e = p(t, i?.validationSplit || 0.1), a = await this.trainer.datasetBuilder.createSFTDataset(
97
+ [e.training],
98
+ i?.batchSize || 32,
99
+ -100
100
+ ), r = await this.trainer.datasetBuilder.createSFTDataset(
101
+ [e.validation],
97
102
  i?.batchSize || 32,
98
103
  -100
99
104
  );
100
- this.trainDataset = e, this.totalSamples = t.reduce((a, r) => a + r.length, 0), this.options.epochSteps = Math.ceil(this.totalSamples / (i?.batchSize || 32)), this.trainer.updateOptimizer(this.options);
105
+ this.validationDataset = r, this.trainDataset = a, this.totalSamples = t.reduce((n, s) => n + s.length, 0), this.options.epochSteps = Math.ceil(this.totalSamples / (i?.batchSize || 32)), this.trainer.updateOptimizer(this.options);
101
106
  }
102
107
  }
103
108
  configureModel(t) {
@@ -1 +1,2 @@
1
- export declare function loadDOCX(file: Blob | Uint8Array): Promise<string[]>;
1
+ import { Conversation } from '../tokeniser/type';
2
+ export declare function loadDOCX(file: Blob | Uint8Array): Promise<Conversation[][]>;
package/dist/data/docx.js CHANGED
@@ -1,13 +1,13 @@
1
1
  import { z as a } from "../jszip.min-BZhlzntC.js";
2
- async function c(n) {
3
- const t = await (await a.loadAsync(n)).file("word/document.xml")?.async("string");
4
- if (!t) throw new Error("Failed to load document.xml");
5
- return i(t).split(`
6
- `).filter((r) => r.trim().length > 10);
2
+ async function c(e) {
3
+ const n = await (await a.loadAsync(e)).file("word/document.xml")?.async("string");
4
+ if (!n) throw new Error("Failed to load document.xml");
5
+ return i(n).split(`
6
+ `).filter((t) => t.trim().length > 10).map((t) => [{ role: "text", content: t }]);
7
7
  }
8
- function i(n) {
9
- const t = new DOMParser().parseFromString(n, "application/xml");
10
- return Array.from(t.getElementsByTagName("w:t")).map((r) => r.textContent).join(`
8
+ function i(e) {
9
+ const n = new DOMParser().parseFromString(e, "application/xml");
10
+ return Array.from(n.getElementsByTagName("w:t")).map((t) => t.textContent).join(`
11
11
  `);
12
12
  }
13
13
  export {
@@ -1 +1,2 @@
1
- export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<string[]>;
1
+ import { Conversation } from '../tokeniser/type';
2
+ export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<Conversation[][]>;
@@ -1,13 +1,13 @@
1
- import { B as n } from "../index-Cp39cXWe.js";
1
+ import { B as f } from "../index-Cp39cXWe.js";
2
2
  const p = 100 * 1024 * 1024;
3
- async function d(i, s = p, e = "text") {
4
- const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(n.from(await i.arrayBuffer())), a = [], f = r.getCursor([[e]]);
3
+ async function d(i, n = p, e = "text") {
4
+ const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(f.from(await i.arrayBuffer())), a = [], s = r.getCursor([[e]]);
5
5
  let o = 0;
6
6
  for (; ; ) {
7
- const t = await f.next();
7
+ const t = await s.next();
8
8
  if (!t || t[e] === void 0 || typeof t[e] != "string")
9
9
  break;
10
- if (t[e].length !== 0 && (a.push(t[e]), o += t[e].length, o > s))
10
+ if (t[e].length !== 0 && (a.push([{ role: "text", content: t[e] }]), o += t[e].length, o > n))
11
11
  break;
12
12
  }
13
13
  return r.close(), a;
@@ -1 +1,2 @@
1
- export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<string[]>;
1
+ import { Conversation } from '../tokeniser/type';
2
+ export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<Conversation[][]>;
package/dist/data/pdf.js CHANGED
@@ -5,7 +5,7 @@ async function h(l, X = 104857600) {
5
5
  let m = 0;
6
6
  for (let b = 1; b <= N; b++) {
7
7
  const G = (await (await d.getPage(b)).getTextContent()).items.filter((c) => c.str.trim().length > 10).map((c) => c.str).join(" ");
8
- if (W.push(G), m += G.length, m > X) break;
8
+ if (W.push([{ role: "text", content: G }]), m += G.length, m > X) break;
9
9
  }
10
10
  return W;
11
11
  }
@@ -1,6 +1,7 @@
1
+ import { Conversation } from '../tokeniser/type';
1
2
  export interface DataOptions {
2
3
  maxSize?: number;
3
4
  column?: string;
4
5
  hasHeader?: boolean;
5
6
  }
6
- export default function loadTextData(file: File, options?: DataOptions): Promise<string[]>;
7
+ export default function loadTextData(file: File, options?: DataOptions): Promise<Conversation[][]>;
@@ -1,14 +1,14 @@
1
1
  import { p as u } from "../papaparse.min-C0cScC2i.js";
2
- import { loadParquet as d } from "./parquet.js";
3
- import { loadPDF as f } from "./pdf.js";
4
- import { loadDOCX as m } from "./docx.js";
5
- import { z as x } from "../jszip.min-BZhlzntC.js";
6
- function w(t, n) {
7
- const r = t.findIndex((i) => i.toLowerCase() === n.toLowerCase());
8
- return r === -1 ? 0 : r;
2
+ import { loadParquet as f } from "./parquet.js";
3
+ import { loadPDF as d } from "./pdf.js";
4
+ import { loadDOCX as x } from "./docx.js";
5
+ import { z as m } from "../jszip.min-BZhlzntC.js";
6
+ function y(t, r) {
7
+ const n = t.findIndex((i) => i.toLowerCase() === r.toLowerCase());
8
+ return n === -1 ? 0 : n;
9
9
  }
10
- function y(t) {
11
- return t.every((n) => n.length < 64);
10
+ function w(t) {
11
+ return t.every((r) => r.length < 64);
12
12
  }
13
13
  function h(t) {
14
14
  return t.split(".").pop() || "";
@@ -35,67 +35,74 @@ function g(t) {
35
35
  return "unknown";
36
36
  }
37
37
  }
38
- async function z(t, n) {
39
- const r = t.type !== "" ? t.type : g(t.name);
40
- if (r === "application/parquet")
41
- return d(t, n?.maxSize, n?.column);
42
- if (r === "application/pdf")
43
- return f(t, n?.maxSize);
44
- if (r === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
45
- return m(t);
46
- if (r === "application/json") {
47
- const i = await t.text(), a = JSON.parse(i);
48
- if (Array.isArray(a))
49
- return a.map(
50
- (e) => typeof e == "string" ? e : "text" in e ? e.text : JSON.stringify(e)
51
- );
38
+ function z(t) {
39
+ if (!Array.isArray(t)) return !1;
40
+ const r = t[0];
41
+ return typeof r == "object" && r !== null && "role" in r && "content" in r && typeof r.role == "string" && typeof r.content == "string";
42
+ }
43
+ async function j(t, r) {
44
+ const n = t.type !== "" ? t.type : g(t.name);
45
+ if (n === "application/parquet")
46
+ return f(t, r?.maxSize, r?.column);
47
+ if (n === "application/pdf")
48
+ return d(t, r?.maxSize);
49
+ if (n === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
50
+ return x(t);
51
+ if (n === "application/json") {
52
+ const i = await t.text(), o = JSON.parse(i);
53
+ if (Array.isArray(o))
54
+ return o.map((e) => [
55
+ typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
56
+ ]);
52
57
  throw new Error("Expected JSON array");
53
58
  }
54
- if (r === "application/jsonl")
59
+ if (n === "application/jsonl")
55
60
  return (await t.text()).split(`
56
- `).filter((a) => a.trim() !== "").map((a) => {
61
+ `).filter((o) => o.trim() !== "").map((o) => {
57
62
  try {
58
- const e = JSON.parse(a);
59
- return typeof e == "string" ? e : "text" in e ? e.text : JSON.stringify(e);
63
+ const e = JSON.parse(o);
64
+ return z(e) ? e : [
65
+ typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
66
+ ];
60
67
  } catch {
61
- return a;
68
+ return [{ role: "text", content: o }];
62
69
  }
63
70
  });
64
- if (r === "application/zip") {
65
- const i = await x.loadAsync(t), a = [];
71
+ if (n === "application/zip") {
72
+ const i = await m.loadAsync(t), o = [];
66
73
  for (const e of Object.keys(i.files)) {
67
- const o = i.file(e);
68
- if (o) {
69
- const c = await o.async("blob"), p = await z(new File([c], e), n);
70
- a.push(...p);
74
+ const a = i.file(e);
75
+ if (a) {
76
+ const c = await a.async("blob"), s = await j(new File([c], e), r);
77
+ o.push(...s);
71
78
  }
72
79
  }
73
- return a;
80
+ return o;
74
81
  }
75
- if (r === "text/csv") {
82
+ if (n === "text/csv") {
76
83
  const i = await t.text();
77
- return new Promise((a, e) => {
84
+ return new Promise((o, e) => {
78
85
  u.parse(i, {
79
86
  header: !1,
80
87
  skipEmptyLines: !0,
81
88
  delimiter: ",",
82
- complete: (o) => {
83
- if (o.errors.length > 0)
84
- console.error(o.errors), e(new Error("Error parsing file"));
89
+ complete: (a) => {
90
+ if (a.errors.length > 0)
91
+ console.error(a.errors), e(new Error("Error parsing file"));
85
92
  else {
86
- const c = w(o.data[0], n?.column || "text"), s = n?.hasHeader ?? y(o.data[0]) ? o.data.slice(1) : o.data;
87
- a(s.map((l) => l[c]));
93
+ const c = y(a.data[0], r?.column || "text"), p = r?.hasHeader ?? w(a.data[0]) ? a.data.slice(1) : a.data;
94
+ o(p.map((l) => [{ role: "text", content: l[c] }]));
88
95
  }
89
96
  },
90
- error: (o) => {
91
- e(o);
97
+ error: (a) => {
98
+ e(a);
92
99
  }
93
100
  });
94
101
  });
95
- } else if (r === "text/plain")
96
- return [await t.text()];
97
- throw new Error(`Unsupported file type: ${r}`);
102
+ } else if (n === "text/plain")
103
+ return [[{ role: "text", content: await t.text() }]];
104
+ throw new Error(`Unsupported file type: ${n}`);
98
105
  }
99
106
  export {
100
- z as default
107
+ j as default
101
108
  };
@@ -12,7 +12,7 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
12
12
  isSpecialToken(index: number): boolean;
13
13
  protected addSpecialTokens(): void;
14
14
  protected addSpecialToken(token: string, index: number): void;
15
- abstract train(text: string[], cb?: (vocab: number) => void): Promise<number>;
15
+ abstract train(text: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
16
16
  abstract getVocab(): string[];
17
17
  abstract getMerges(): [string, string][];
18
18
  abstract destroy(): void;
@@ -21,6 +21,6 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
21
21
  encodeAsSequence(conversation: Conversation[], completion?: boolean): number[];
22
22
  encodeConversation(conversation: Conversation[], completion?: boolean): number[];
23
23
  abstract decode(tokens: number[]): string;
24
- decodeConversation(tokens: number[]): Conversation[];
24
+ decodeConversation(tokens: number[] | Uint16Array): Conversation[];
25
25
  getSpecialTokenIndex(token: string): number | undefined;
26
26
  }
@@ -11,30 +11,30 @@ const h = [
11
11
  "<|system_start|>",
12
12
  "<|system_end|>"
13
13
  ];
14
- class l extends r {
14
+ class k extends r {
15
15
  specialTokens = /* @__PURE__ */ new Map();
16
16
  specialTokenSet = /* @__PURE__ */ new Set();
17
- isSpecialToken(e) {
18
- return this.specialTokenSet.has(e);
17
+ isSpecialToken(s) {
18
+ return this.specialTokenSet.has(s);
19
19
  }
20
20
  addSpecialTokens() {
21
- h.forEach((e, t) => {
22
- this.addToken(e, t), this.specialTokens.set(e, t), this.specialTokenSet.add(t);
21
+ h.forEach((s, t) => {
22
+ this.addToken(s, t), this.specialTokens.set(s, t), this.specialTokenSet.add(t);
23
23
  });
24
24
  }
25
- addSpecialToken(e, t) {
26
- this.specialTokens.set(e, t), this.specialTokenSet.add(t);
25
+ addSpecialToken(s, t) {
26
+ this.specialTokens.set(s, t), this.specialTokenSet.add(t);
27
27
  }
28
- encodeSequence(e) {
29
- const t = this.encode(e);
28
+ encodeSequence(s) {
29
+ const t = this.encode(s);
30
30
  return [this.bosToken, ...t, this.eosToken];
31
31
  }
32
- encodeAsSequence(e, t) {
33
- const s = e.flatMap((o) => this.encode(o.content));
34
- return t ? [this.bosToken, ...s, this.eosToken, this.bosToken] : [this.bosToken, ...s, this.eosToken];
32
+ encodeAsSequence(s, t) {
33
+ const e = s.flatMap((o) => this.encode(o.content));
34
+ return t ? [this.bosToken, ...e, this.eosToken, this.bosToken] : [this.bosToken, ...e, this.eosToken];
35
35
  }
36
- encodeConversation(e, t) {
37
- const s = [[this.bosToken]], o = [
36
+ encodeConversation(s, t) {
37
+ const e = [[this.bosToken]], o = [
38
38
  this.getSpecialTokenIndex("<|user_start|>"),
39
39
  this.getSpecialTokenIndex("<|assistant_start|>"),
40
40
  this.getSpecialTokenIndex("<|system_start|>")
@@ -43,57 +43,57 @@ class l extends r {
43
43
  this.getSpecialTokenIndex("<|assistant_end|>"),
44
44
  this.getSpecialTokenIndex("<|system_end|>")
45
45
  ];
46
- for (const i of e) {
46
+ for (const i of s) {
47
47
  const c = this.encode(i.content);
48
48
  switch (i.role) {
49
49
  case "user":
50
- s.push([o[0]]);
50
+ e.push([o[0]]);
51
51
  break;
52
52
  case "assistant":
53
- s.push([o[1]]);
53
+ e.push([o[1]]);
54
54
  break;
55
55
  case "system":
56
- s.push([o[2]]);
56
+ e.push([o[2]]);
57
57
  break;
58
58
  }
59
- switch (s.push(c), i.role) {
59
+ switch (e.push(c), i.role) {
60
60
  case "user":
61
- s.push([n[0]]);
61
+ e.push([n[0]]);
62
62
  break;
63
63
  case "assistant":
64
- s.push([n[1]]);
64
+ e.push([n[1]]);
65
65
  break;
66
66
  case "system":
67
- s.push([n[2]]);
67
+ e.push([n[2]]);
68
68
  break;
69
69
  }
70
70
  }
71
- const a = s.flat();
71
+ const a = e.flat();
72
72
  return t ? a.push(o[1]) : a.push(this.eosToken), a;
73
73
  }
74
- decodeConversation(e) {
74
+ decodeConversation(s) {
75
75
  const t = [];
76
- let s = 0;
77
- for (; s < e.length; ) {
78
- const o = e[s];
76
+ let e = 0;
77
+ for (; e < s.length; ) {
78
+ const o = s[e];
79
79
  let n = null;
80
- if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") && (n = "system"), n) {
81
- s++;
80
+ if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") ? n = "system" : o === this.bosToken || (o === this.eosToken ? n = null : (n = "text", e--)), n) {
81
+ e++;
82
82
  const a = [];
83
- for (; s < e.length && e[s] !== this.getSpecialTokenIndex(`<|${n}_end|>`); )
84
- a.push(e[s]), s++;
83
+ for (; e < s.length && s[e] !== this.getSpecialTokenIndex(`<|${n}_end|>`) && s[e] !== this.eosToken; )
84
+ a.push(s[e]), e++;
85
85
  const i = this.decode(a);
86
86
  t.push({ role: n, content: i });
87
87
  }
88
- s++;
88
+ e++;
89
89
  }
90
90
  return t;
91
91
  }
92
- getSpecialTokenIndex(e) {
93
- return this.specialTokens.get(e);
92
+ getSpecialTokenIndex(s) {
93
+ return this.specialTokens.get(s);
94
94
  }
95
95
  }
96
96
  export {
97
97
  h as SPECIALS,
98
- l as default
98
+ k as default
99
99
  };
@@ -1,4 +1,5 @@
1
1
  import { default as BaseTokeniser } from './BaseTokeniser';
2
+ import { Conversation } from './type';
2
3
  export default class CharTokeniser extends BaseTokeniser {
3
4
  vocabSize: number;
4
5
  eosToken: number;
@@ -11,7 +12,7 @@ export default class CharTokeniser extends BaseTokeniser {
11
12
  addToken(token: string, index?: number): number;
12
13
  get trained(): boolean;
13
14
  destroy(): void;
14
- train(text: string[]): Promise<number>;
15
+ train(text: Conversation[][]): Promise<number>;
15
16
  tokenise(text: string[], numeric: true): number[][];
16
17
  tokenise(text: string[]): string[][];
17
18
  detokenise(tokens: (number[] | Uint16Array)[]): string[];
@@ -40,32 +40,32 @@ class T extends k {
40
40
  this.cache.clear(), this.vocab = [];
41
41
  }
42
42
  async train(i) {
43
- const t = i.map((n) => n.split("")).flat(), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), o = this.vocabSize - u.length;
43
+ const t = i.map((o) => o.map((n) => n.content.split(""))).flat(2), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - u.length;
44
44
  if (h === -1)
45
45
  return this.vocabSize;
46
- if (this._trained = !0, s.length > o) {
47
- const n = /* @__PURE__ */ new Map();
48
- t.forEach((a) => {
49
- n.set(a, (n.get(a) || 0) + 1);
50
- }), s.sort((a, r) => (n.get(a) || 0) - (n.get(r) || 0)), s.splice(0, s.length - o);
46
+ if (this._trained = !0, s.length > a) {
47
+ const o = /* @__PURE__ */ new Map();
48
+ t.forEach((n) => {
49
+ o.set(n, (o.get(n) || 0) + 1);
50
+ }), s.sort((n, r) => (o.get(n) || 0) - (o.get(r) || 0)), s.splice(0, s.length - a);
51
51
  }
52
52
  let c = h;
53
53
  if (c !== -1) {
54
- const n = new Set(this.vocab);
55
- for (const a of s)
56
- if (!n.has(a) && (this.vocab[c] = a, n.add(a), c = this.vocab.indexOf("", c + 1), c === -1))
54
+ const o = new Set(this.vocab);
55
+ for (const n of s)
56
+ if (!o.has(n) && (this.vocab[c] = n, o.add(n), c = this.vocab.indexOf("", c + 1), c === -1))
57
57
  break;
58
58
  }
59
- return this.cache.clear(), this.vocab.forEach((n, a) => {
60
- this.cache.set(n, a);
59
+ return this.cache.clear(), this.vocab.forEach((o, n) => {
60
+ this.cache.set(o, n);
61
61
  }), this.emit("trainStatus", "trained"), this.vocabSize;
62
62
  }
63
63
  tokenise(i, t) {
64
64
  if (!this.trained)
65
65
  throw new Error("Tokeniser not trained");
66
66
  return i.map((s) => t ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
67
- const o = this.cache.get(h);
68
- return o !== void 0 ? this.vocab[o] : "";
67
+ const a = this.cache.get(h);
68
+ return a !== void 0 ? this.vocab[a] : "";
69
69
  }));
70
70
  }
71
71
  detokenise(i) {
@@ -85,8 +85,8 @@ class T extends k {
85
85
  }
86
86
  async createTrainingData(i, t = 5) {
87
87
  const e = await this.tokenise(i, !0), s = [], h = [];
88
- for (let o = 0; o < e.length - t; o++)
89
- s.push(...e[o].slice(0, t)), h.push(e[o + 1][0]);
88
+ for (let a = 0; a < e.length - t; a++)
89
+ s.push(...e[a].slice(0, t)), h.push(e[a + 1][0]);
90
90
  return [s, h];
91
91
  }
92
92
  }
@@ -1,4 +1,5 @@
1
1
  import { default as BaseTokeniser } from './BaseTokeniser';
2
+ import { Conversation } from './type';
2
3
  export default class BPETokeniser extends BaseTokeniser {
3
4
  private targetSize;
4
5
  private vocab;
@@ -14,7 +15,7 @@ export default class BPETokeniser extends BaseTokeniser {
14
15
  get eosToken(): number;
15
16
  get bosToken(): number;
16
17
  get unkToken(): number;
17
- train(text: string[], cb?: (vocab: number) => void): Promise<number>;
18
+ train(text?: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
18
19
  getVocab(): string[];
19
20
  getMerges(): [string, string][];
20
21
  private tokeniseWord;