@genai-fi/nanogpt 0.15.14 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -49,7 +49,7 @@ export default class TeachableLLM {
49
49
  getNumParams(): number;
50
50
  trainer(trainingType?: TrainingType, options?: TrainingOptions): Trainer;
51
51
  train(text: Task[], options?: TrainingOptions, trainingType?: TrainingType): Promise<void>;
52
- trainTokeniser(text: string[]): Promise<number>;
52
+ trainTokeniser(text: Conversation[][]): Promise<number>;
53
53
  generator(): IGenerator;
54
54
  generateText(prompt: Conversation[], options?: IGenerateOptions): Promise<Conversation[]>;
55
55
  generateText(options?: IGenerateOptions): Promise<Conversation[]>;
@@ -1 +1,2 @@
1
- export declare function loadDOCX(file: Blob | Uint8Array): Promise<string[]>;
1
+ import { Conversation } from '../tokeniser/type';
2
+ export declare function loadDOCX(file: Blob | Uint8Array): Promise<Conversation[][]>;
package/dist/data/docx.js CHANGED
@@ -1,13 +1,13 @@
1
1
  import { z as a } from "../jszip.min-BZhlzntC.js";
2
- async function c(n) {
3
- const t = await (await a.loadAsync(n)).file("word/document.xml")?.async("string");
4
- if (!t) throw new Error("Failed to load document.xml");
5
- return i(t).split(`
6
- `).filter((r) => r.trim().length > 10);
2
+ async function c(e) {
3
+ const n = await (await a.loadAsync(e)).file("word/document.xml")?.async("string");
4
+ if (!n) throw new Error("Failed to load document.xml");
5
+ return i(n).split(`
6
+ `).filter((t) => t.trim().length > 10).map((t) => [{ role: "text", content: t }]);
7
7
  }
8
- function i(n) {
9
- const t = new DOMParser().parseFromString(n, "application/xml");
10
- return Array.from(t.getElementsByTagName("w:t")).map((r) => r.textContent).join(`
8
+ function i(e) {
9
+ const n = new DOMParser().parseFromString(e, "application/xml");
10
+ return Array.from(n.getElementsByTagName("w:t")).map((t) => t.textContent).join(`
11
11
  `);
12
12
  }
13
13
  export {
@@ -1 +1,2 @@
1
- export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<string[]>;
1
+ import { Conversation } from '../tokeniser/type';
2
+ export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<Conversation[][]>;
@@ -1,13 +1,13 @@
1
- import { B as n } from "../index-Cp39cXWe.js";
1
+ import { B as f } from "../index-Cp39cXWe.js";
2
2
  const p = 100 * 1024 * 1024;
3
- async function d(i, s = p, e = "text") {
4
- const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(n.from(await i.arrayBuffer())), a = [], f = r.getCursor([[e]]);
3
+ async function d(i, n = p, e = "text") {
4
+ const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(f.from(await i.arrayBuffer())), a = [], s = r.getCursor([[e]]);
5
5
  let o = 0;
6
6
  for (; ; ) {
7
- const t = await f.next();
7
+ const t = await s.next();
8
8
  if (!t || t[e] === void 0 || typeof t[e] != "string")
9
9
  break;
10
- if (t[e].length !== 0 && (a.push(t[e]), o += t[e].length, o > s))
10
+ if (t[e].length !== 0 && (a.push([{ role: "text", content: t[e] }]), o += t[e].length, o > n))
11
11
  break;
12
12
  }
13
13
  return r.close(), a;
@@ -1 +1,2 @@
1
- export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<string[]>;
1
+ import { Conversation } from '../tokeniser/type';
2
+ export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<Conversation[][]>;
package/dist/data/pdf.js CHANGED
@@ -5,7 +5,7 @@ async function h(l, X = 104857600) {
5
5
  let m = 0;
6
6
  for (let b = 1; b <= N; b++) {
7
7
  const G = (await (await d.getPage(b)).getTextContent()).items.filter((c) => c.str.trim().length > 10).map((c) => c.str).join(" ");
8
- if (W.push(G), m += G.length, m > X) break;
8
+ if (W.push([{ role: "text", content: G }]), m += G.length, m > X) break;
9
9
  }
10
10
  return W;
11
11
  }
@@ -1,6 +1,7 @@
1
+ import { Conversation } from '../tokeniser/type';
1
2
  export interface DataOptions {
2
3
  maxSize?: number;
3
4
  column?: string;
4
5
  hasHeader?: boolean;
5
6
  }
6
- export default function loadTextData(file: File, options?: DataOptions): Promise<string[]>;
7
+ export default function loadTextData(file: File, options?: DataOptions): Promise<Conversation[][]>;
@@ -1,11 +1,11 @@
1
1
  import { p as u } from "../papaparse.min-C0cScC2i.js";
2
2
  import { loadParquet as f } from "./parquet.js";
3
3
  import { loadPDF as d } from "./pdf.js";
4
- import { loadDOCX as m } from "./docx.js";
5
- import { z as x } from "../jszip.min-BZhlzntC.js";
4
+ import { loadDOCX as x } from "./docx.js";
5
+ import { z as m } from "../jszip.min-BZhlzntC.js";
6
6
  function y(t, r) {
7
- const a = t.findIndex((i) => i.toLowerCase() === r.toLowerCase());
8
- return a === -1 ? 0 : a;
7
+ const n = t.findIndex((i) => i.toLowerCase() === r.toLowerCase());
8
+ return n === -1 ? 0 : n;
9
9
  }
10
10
  function w(t) {
11
11
  return t.every((r) => r.length < 64);
@@ -35,73 +35,74 @@ function g(t) {
35
35
  return "unknown";
36
36
  }
37
37
  }
38
- function j(t) {
38
+ function z(t) {
39
39
  if (!Array.isArray(t)) return !1;
40
40
  const r = t[0];
41
41
  return typeof r == "object" && r !== null && "role" in r && "content" in r && typeof r.role == "string" && typeof r.content == "string";
42
42
  }
43
- async function z(t, r) {
44
- const a = t.type !== "" ? t.type : g(t.name);
45
- if (a === "application/parquet")
43
+ async function j(t, r) {
44
+ const n = t.type !== "" ? t.type : g(t.name);
45
+ if (n === "application/parquet")
46
46
  return f(t, r?.maxSize, r?.column);
47
- if (a === "application/pdf")
47
+ if (n === "application/pdf")
48
48
  return d(t, r?.maxSize);
49
- if (a === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
50
- return m(t);
51
- if (a === "application/json") {
49
+ if (n === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
50
+ return x(t);
51
+ if (n === "application/json") {
52
52
  const i = await t.text(), o = JSON.parse(i);
53
53
  if (Array.isArray(o))
54
- return o.map(
55
- (e) => typeof e == "string" ? e : "text" in e ? e.text : JSON.stringify(e)
56
- );
54
+ return o.map((e) => [
55
+ typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
56
+ ]);
57
57
  throw new Error("Expected JSON array");
58
58
  }
59
- if (a === "application/jsonl")
59
+ if (n === "application/jsonl")
60
60
  return (await t.text()).split(`
61
61
  `).filter((o) => o.trim() !== "").map((o) => {
62
62
  try {
63
63
  const e = JSON.parse(o);
64
- return j(e) ? e.map((n) => `${n.content}`).join(`
65
- `) : typeof e == "string" ? e : "text" in e ? e.text : JSON.stringify(e);
64
+ return z(e) ? e : [
65
+ typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
66
+ ];
66
67
  } catch {
67
- return o;
68
+ return [{ role: "text", content: o }];
68
69
  }
69
70
  });
70
- if (a === "application/zip") {
71
- const i = await x.loadAsync(t), o = [];
71
+ if (n === "application/zip") {
72
+ const i = await m.loadAsync(t), o = [];
72
73
  for (const e of Object.keys(i.files)) {
73
- const n = i.file(e);
74
- if (n) {
75
- const s = await n.async("blob"), c = await z(new File([s], e), r);
76
- o.push(...c);
74
+ const a = i.file(e);
75
+ if (a) {
76
+ const c = await a.async("blob"), s = await j(new File([c], e), r);
77
+ o.push(...s);
77
78
  }
78
79
  }
79
80
  return o;
80
81
  }
81
- if (a === "text/csv") {
82
+ if (n === "text/csv") {
82
83
  const i = await t.text();
83
84
  return new Promise((o, e) => {
84
85
  u.parse(i, {
85
86
  header: !1,
86
87
  skipEmptyLines: !0,
87
88
  delimiter: ",",
88
- complete: (n) => {
89
- if (n.errors.length > 0)
90
- console.error(n.errors), e(new Error("Error parsing file"));
89
+ complete: (a) => {
90
+ if (a.errors.length > 0)
91
+ console.error(a.errors), e(new Error("Error parsing file"));
91
92
  else {
92
- const s = y(n.data[0], r?.column || "text"), p = r?.hasHeader ?? w(n.data[0]) ? n.data.slice(1) : n.data;
93
- o(p.map((l) => l[s]));
93
+ const c = y(a.data[0], r?.column || "text"), p = r?.hasHeader ?? w(a.data[0]) ? a.data.slice(1) : a.data;
94
+ o(p.map((l) => [{ role: "text", content: l[c] }]));
94
95
  }
95
96
  },
96
- error: (n) => {
97
- e(n);
97
+ error: (a) => {
98
+ e(a);
98
99
  }
99
100
  });
100
101
  });
101
- } else if (a === "text/plain")
102
- return [await t.text()];
103
- throw new Error(`Unsupported file type: ${a}`);
102
+ } else if (n === "text/plain")
103
+ return [[{ role: "text", content: await t.text() }]];
104
+ throw new Error(`Unsupported file type: ${n}`);
104
105
  }
105
106
  export {
106
- z as default
107
+ j as default
107
108
  };
@@ -12,7 +12,7 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
12
12
  isSpecialToken(index: number): boolean;
13
13
  protected addSpecialTokens(): void;
14
14
  protected addSpecialToken(token: string, index: number): void;
15
- abstract train(text: string[], cb?: (vocab: number) => void): Promise<number>;
15
+ abstract train(text: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
16
16
  abstract getVocab(): string[];
17
17
  abstract getMerges(): [string, string][];
18
18
  abstract destroy(): void;
@@ -21,6 +21,6 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
21
21
  encodeAsSequence(conversation: Conversation[], completion?: boolean): number[];
22
22
  encodeConversation(conversation: Conversation[], completion?: boolean): number[];
23
23
  abstract decode(tokens: number[]): string;
24
- decodeConversation(tokens: number[]): Conversation[];
24
+ decodeConversation(tokens: number[] | Uint16Array): Conversation[];
25
25
  getSpecialTokenIndex(token: string): number | undefined;
26
26
  }
@@ -11,30 +11,30 @@ const h = [
11
11
  "<|system_start|>",
12
12
  "<|system_end|>"
13
13
  ];
14
- class l extends r {
14
+ class k extends r {
15
15
  specialTokens = /* @__PURE__ */ new Map();
16
16
  specialTokenSet = /* @__PURE__ */ new Set();
17
- isSpecialToken(e) {
18
- return this.specialTokenSet.has(e);
17
+ isSpecialToken(s) {
18
+ return this.specialTokenSet.has(s);
19
19
  }
20
20
  addSpecialTokens() {
21
- h.forEach((e, t) => {
22
- this.addToken(e, t), this.specialTokens.set(e, t), this.specialTokenSet.add(t);
21
+ h.forEach((s, t) => {
22
+ this.addToken(s, t), this.specialTokens.set(s, t), this.specialTokenSet.add(t);
23
23
  });
24
24
  }
25
- addSpecialToken(e, t) {
26
- this.specialTokens.set(e, t), this.specialTokenSet.add(t);
25
+ addSpecialToken(s, t) {
26
+ this.specialTokens.set(s, t), this.specialTokenSet.add(t);
27
27
  }
28
- encodeSequence(e) {
29
- const t = this.encode(e);
28
+ encodeSequence(s) {
29
+ const t = this.encode(s);
30
30
  return [this.bosToken, ...t, this.eosToken];
31
31
  }
32
- encodeAsSequence(e, t) {
33
- const s = e.flatMap((o) => this.encode(o.content));
34
- return t ? [this.bosToken, ...s, this.eosToken, this.bosToken] : [this.bosToken, ...s, this.eosToken];
32
+ encodeAsSequence(s, t) {
33
+ const e = s.flatMap((o) => this.encode(o.content));
34
+ return t ? [this.bosToken, ...e, this.eosToken, this.bosToken] : [this.bosToken, ...e, this.eosToken];
35
35
  }
36
- encodeConversation(e, t) {
37
- const s = [[this.bosToken]], o = [
36
+ encodeConversation(s, t) {
37
+ const e = [[this.bosToken]], o = [
38
38
  this.getSpecialTokenIndex("<|user_start|>"),
39
39
  this.getSpecialTokenIndex("<|assistant_start|>"),
40
40
  this.getSpecialTokenIndex("<|system_start|>")
@@ -43,57 +43,57 @@ class l extends r {
43
43
  this.getSpecialTokenIndex("<|assistant_end|>"),
44
44
  this.getSpecialTokenIndex("<|system_end|>")
45
45
  ];
46
- for (const i of e) {
46
+ for (const i of s) {
47
47
  const c = this.encode(i.content);
48
48
  switch (i.role) {
49
49
  case "user":
50
- s.push([o[0]]);
50
+ e.push([o[0]]);
51
51
  break;
52
52
  case "assistant":
53
- s.push([o[1]]);
53
+ e.push([o[1]]);
54
54
  break;
55
55
  case "system":
56
- s.push([o[2]]);
56
+ e.push([o[2]]);
57
57
  break;
58
58
  }
59
- switch (s.push(c), i.role) {
59
+ switch (e.push(c), i.role) {
60
60
  case "user":
61
- s.push([n[0]]);
61
+ e.push([n[0]]);
62
62
  break;
63
63
  case "assistant":
64
- s.push([n[1]]);
64
+ e.push([n[1]]);
65
65
  break;
66
66
  case "system":
67
- s.push([n[2]]);
67
+ e.push([n[2]]);
68
68
  break;
69
69
  }
70
70
  }
71
- const a = s.flat();
71
+ const a = e.flat();
72
72
  return t ? a.push(o[1]) : a.push(this.eosToken), a;
73
73
  }
74
- decodeConversation(e) {
74
+ decodeConversation(s) {
75
75
  const t = [];
76
- let s = 0;
77
- for (; s < e.length; ) {
78
- const o = e[s];
76
+ let e = 0;
77
+ for (; e < s.length; ) {
78
+ const o = s[e];
79
79
  let n = null;
80
- if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") && (n = "system"), n) {
81
- s++;
80
+ if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") ? n = "system" : o === this.bosToken || (o === this.eosToken ? n = null : (n = "text", e--)), n) {
81
+ e++;
82
82
  const a = [];
83
- for (; s < e.length && e[s] !== this.getSpecialTokenIndex(`<|${n}_end|>`); )
84
- a.push(e[s]), s++;
83
+ for (; e < s.length && s[e] !== this.getSpecialTokenIndex(`<|${n}_end|>`) && s[e] !== this.eosToken; )
84
+ a.push(s[e]), e++;
85
85
  const i = this.decode(a);
86
86
  t.push({ role: n, content: i });
87
87
  }
88
- s++;
88
+ e++;
89
89
  }
90
90
  return t;
91
91
  }
92
- getSpecialTokenIndex(e) {
93
- return this.specialTokens.get(e);
92
+ getSpecialTokenIndex(s) {
93
+ return this.specialTokens.get(s);
94
94
  }
95
95
  }
96
96
  export {
97
97
  h as SPECIALS,
98
- l as default
98
+ k as default
99
99
  };
@@ -1,4 +1,5 @@
1
1
  import { default as BaseTokeniser } from './BaseTokeniser';
2
+ import { Conversation } from './type';
2
3
  export default class CharTokeniser extends BaseTokeniser {
3
4
  vocabSize: number;
4
5
  eosToken: number;
@@ -11,7 +12,7 @@ export default class CharTokeniser extends BaseTokeniser {
11
12
  addToken(token: string, index?: number): number;
12
13
  get trained(): boolean;
13
14
  destroy(): void;
14
- train(text: string[]): Promise<number>;
15
+ train(text: Conversation[][]): Promise<number>;
15
16
  tokenise(text: string[], numeric: true): number[][];
16
17
  tokenise(text: string[]): string[][];
17
18
  detokenise(tokens: (number[] | Uint16Array)[]): string[];
@@ -40,32 +40,32 @@ class T extends k {
40
40
  this.cache.clear(), this.vocab = [];
41
41
  }
42
42
  async train(i) {
43
- const t = i.map((n) => n.split("")).flat(), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), o = this.vocabSize - u.length;
43
+ const t = i.map((o) => o.map((n) => n.content.split(""))).flat(2), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - u.length;
44
44
  if (h === -1)
45
45
  return this.vocabSize;
46
- if (this._trained = !0, s.length > o) {
47
- const n = /* @__PURE__ */ new Map();
48
- t.forEach((a) => {
49
- n.set(a, (n.get(a) || 0) + 1);
50
- }), s.sort((a, r) => (n.get(a) || 0) - (n.get(r) || 0)), s.splice(0, s.length - o);
46
+ if (this._trained = !0, s.length > a) {
47
+ const o = /* @__PURE__ */ new Map();
48
+ t.forEach((n) => {
49
+ o.set(n, (o.get(n) || 0) + 1);
50
+ }), s.sort((n, r) => (o.get(n) || 0) - (o.get(r) || 0)), s.splice(0, s.length - a);
51
51
  }
52
52
  let c = h;
53
53
  if (c !== -1) {
54
- const n = new Set(this.vocab);
55
- for (const a of s)
56
- if (!n.has(a) && (this.vocab[c] = a, n.add(a), c = this.vocab.indexOf("", c + 1), c === -1))
54
+ const o = new Set(this.vocab);
55
+ for (const n of s)
56
+ if (!o.has(n) && (this.vocab[c] = n, o.add(n), c = this.vocab.indexOf("", c + 1), c === -1))
57
57
  break;
58
58
  }
59
- return this.cache.clear(), this.vocab.forEach((n, a) => {
60
- this.cache.set(n, a);
59
+ return this.cache.clear(), this.vocab.forEach((o, n) => {
60
+ this.cache.set(o, n);
61
61
  }), this.emit("trainStatus", "trained"), this.vocabSize;
62
62
  }
63
63
  tokenise(i, t) {
64
64
  if (!this.trained)
65
65
  throw new Error("Tokeniser not trained");
66
66
  return i.map((s) => t ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
67
- const o = this.cache.get(h);
68
- return o !== void 0 ? this.vocab[o] : "";
67
+ const a = this.cache.get(h);
68
+ return a !== void 0 ? this.vocab[a] : "";
69
69
  }));
70
70
  }
71
71
  detokenise(i) {
@@ -85,8 +85,8 @@ class T extends k {
85
85
  }
86
86
  async createTrainingData(i, t = 5) {
87
87
  const e = await this.tokenise(i, !0), s = [], h = [];
88
- for (let o = 0; o < e.length - t; o++)
89
- s.push(...e[o].slice(0, t)), h.push(e[o + 1][0]);
88
+ for (let a = 0; a < e.length - t; a++)
89
+ s.push(...e[a].slice(0, t)), h.push(e[a + 1][0]);
90
90
  return [s, h];
91
91
  }
92
92
  }
@@ -1,4 +1,5 @@
1
1
  import { default as BaseTokeniser } from './BaseTokeniser';
2
+ import { Conversation } from './type';
2
3
  export default class BPETokeniser extends BaseTokeniser {
3
4
  private targetSize;
4
5
  private vocab;
@@ -14,7 +15,7 @@ export default class BPETokeniser extends BaseTokeniser {
14
15
  get eosToken(): number;
15
16
  get bosToken(): number;
16
17
  get unkToken(): number;
17
- train(text: string[], cb?: (vocab: number) => void): Promise<number>;
18
+ train(text?: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
18
19
  getVocab(): string[];
19
20
  getMerges(): [string, string][];
20
21
  private tokeniseWord;
@@ -1,6 +1,6 @@
1
1
  import { yieldIfNeeded as f } from "../utilities/yielder.js";
2
- import k from "../utilities/tokenParse.js";
3
- import z, { SPECIALS as m } from "./BaseTokeniser.js";
2
+ import m from "../utilities/tokenParse.js";
3
+ import z, { SPECIALS as k } from "./BaseTokeniser.js";
4
4
  function p(o, e) {
5
5
  return `${o}-::-${e}`;
6
6
  }
@@ -8,25 +8,25 @@ function w(o) {
8
8
  const e = /* @__PURE__ */ new Map();
9
9
  for (let s = 0; s < o.length; s++) {
10
10
  const t = o[s];
11
- for (let r = 0; r < t.length - 1; r++) {
12
- const n = p(t[r], t[r + 1]), a = e.get(n) || {
13
- a: t[r],
14
- b: t[r + 1],
11
+ for (let n = 0; n < t.length - 1; n++) {
12
+ const r = p(t[n], t[n + 1]), a = e.get(r) || {
13
+ a: t[n],
14
+ b: t[n + 1],
15
15
  count: 0,
16
16
  instances: /* @__PURE__ */ new Set()
17
17
  };
18
- a.count += 1, a.instances.add(s), e.set(n, a);
18
+ a.count += 1, a.instances.add(s), e.set(r, a);
19
19
  }
20
20
  }
21
21
  return { pairs: e, tokens: o };
22
22
  }
23
- function d(o, e, s, t, r) {
24
- const n = p(e, s);
25
- if (o.pairs.has(n)) {
26
- const a = o.pairs.get(n);
27
- a.count += r, r > 0 ? a.instances.add(t) : a.count <= 0 ? o.pairs.delete(n) : a.instances.delete(t);
23
+ function d(o, e, s, t, n) {
24
+ const r = p(e, s);
25
+ if (o.pairs.has(r)) {
26
+ const a = o.pairs.get(r);
27
+ a.count += n, n > 0 ? a.instances.add(t) : a.count <= 0 ? o.pairs.delete(r) : a.instances.delete(t);
28
28
  } else
29
- o.pairs.set(n, { a: e, b: s, count: r, instances: /* @__PURE__ */ new Set([t]) });
29
+ o.pairs.set(r, { a: e, b: s, count: n, instances: /* @__PURE__ */ new Set([t]) });
30
30
  }
31
31
  function T(o) {
32
32
  let e = null, s = 0;
@@ -37,21 +37,21 @@ function T(o) {
37
37
  function y(o, e) {
38
38
  return o.map((s) => {
39
39
  const t = [];
40
- for (let r = 0; r < s.length; r++)
41
- r < s.length - 1 && s[r] === e[0] && s[r + 1] === e[1] ? (t.push(e[0] + e[1]), r++) : t.push(s[r]);
40
+ for (let n = 0; n < s.length; n++)
41
+ n < s.length - 1 && s[n] === e[0] && s[n + 1] === e[1] ? (t.push(e[0] + e[1]), n++) : t.push(s[n]);
42
42
  return t;
43
43
  });
44
44
  }
45
45
  function I(o, e) {
46
46
  e.instances.forEach((s) => {
47
- const t = o.tokens[s], r = [];
48
- for (let n = 0; n < t.length; n++)
49
- if (n < t.length - 1 && t[n] === e.a && t[n + 1] === e.b) {
47
+ const t = o.tokens[s], n = [];
48
+ for (let r = 0; r < t.length; r++)
49
+ if (r < t.length - 1 && t[r] === e.a && t[r + 1] === e.b) {
50
50
  const a = e.a + e.b;
51
- r.push(a), n > 0 && (d(o, t[n - 1], e.a, s, -1), d(o, t[n - 1], a, s, 1)), n++, n < t.length - 1 && (d(o, e.b, t[n + 1], s, -1), d(o, a, t[n + 1], s, 1));
51
+ n.push(a), r > 0 && (d(o, t[r - 1], e.a, s, -1), d(o, t[r - 1], a, s, 1)), r++, r < t.length - 1 && (d(o, e.b, t[r + 1], s, -1), d(o, a, t[r + 1], s, 1));
52
52
  } else
53
- r.push(t[n]);
54
- o.tokens[s] = r;
53
+ n.push(t[r]);
54
+ o.tokens[s] = n;
55
55
  }), o.pairs.delete(p(e.a, e.b));
56
56
  }
57
57
  class E extends z {
@@ -61,11 +61,11 @@ class E extends z {
61
61
  merges = [];
62
62
  pretokenMap = /* @__PURE__ */ new Map();
63
63
  constructor(e, s) {
64
- super(), Array.isArray(e) ? (e.forEach((t, r) => {
65
- this.vocab.add(t), this.vocabIndex.set(t, r);
66
- }), s && (this.merges = s), this.targetSize = e.length, m.forEach((t) => {
67
- const r = e.indexOf(t);
68
- r !== -1 && this.addSpecialToken(t, r);
64
+ super(), Array.isArray(e) ? (e.forEach((t, n) => {
65
+ this.vocab.add(t), this.vocabIndex.set(t, n);
66
+ }), s && (this.merges = s), this.targetSize = e.length, k.forEach((t) => {
67
+ const n = e.indexOf(t);
68
+ n !== -1 && this.addSpecialToken(t, n);
69
69
  })) : (this.addSpecialTokens(), this.targetSize = e);
70
70
  }
71
71
  addToken(e, s) {
@@ -81,7 +81,7 @@ class E extends z {
81
81
  this.vocab.clear(), this.vocabIndex.clear(), this.merges = [], this.pretokenMap.clear();
82
82
  }
83
83
  get trained() {
84
- return this.vocab.size > m.length && this.vocab.size <= this.targetSize;
84
+ return this.vocab.size > k.length && this.vocab.size <= this.targetSize;
85
85
  }
86
86
  get vocabSize() {
87
87
  return this.vocab.size;
@@ -95,23 +95,23 @@ class E extends z {
95
95
  get unkToken() {
96
96
  return this.vocabIndex.get("") ?? 1;
97
97
  }
98
- async train(e, s) {
98
+ async train(e = [], s) {
99
99
  let t = performance.now();
100
- const r = e.map((i) => k(i)).flat(1);
100
+ const n = e.map((i) => i.map((h) => m(h.content))).flat(2);
101
101
  t = await f(t, s, this.vocab.size);
102
- const n = new Set(r);
102
+ const r = new Set(n);
103
103
  this.vocab = /* @__PURE__ */ new Set(), this.pretokenMap.clear(), this.merges = [], this.addSpecialTokens();
104
- const a = Array.from(n), b = a.map((i) => Array.from(i).map((h) => (this.vocab.add(h), h))), g = w(b);
104
+ const a = Array.from(r), b = a.map((i) => Array.from(i).map((l) => (this.vocab.add(l), l))), g = w(b);
105
105
  if (t = await f(t, s, this.vocab.size), this.vocab.size >= this.targetSize) {
106
106
  console.warn("Initial vocab size is greater than or equal to target size. No merges will be performed.");
107
107
  const i = /* @__PURE__ */ new Map();
108
- r.forEach((c) => {
108
+ n.forEach((c) => {
109
109
  Array.from(c).forEach((u) => {
110
110
  i.set(u, (i.get(u) || 0) + 1);
111
111
  });
112
112
  });
113
- const l = Array.from(i.entries()).sort((c, u) => u[1] - c[1]);
114
- this.vocab = /* @__PURE__ */ new Set(), this.addSpecialTokens(), l.slice(0, this.targetSize - this.vocab.size).map(([c]) => c).forEach((c) => this.vocab.add(c)), this.vocabIndex.clear();
113
+ const h = Array.from(i.entries()).sort((c, u) => u[1] - c[1]);
114
+ this.vocab = /* @__PURE__ */ new Set(), this.addSpecialTokens(), h.slice(0, this.targetSize - this.vocab.size).map(([c]) => c).forEach((c) => this.vocab.add(c)), this.vocabIndex.clear();
115
115
  let S = 0;
116
116
  for (const c of this.vocab.keys())
117
117
  this.vocabIndex.set(c, S++);
@@ -123,9 +123,9 @@ class E extends z {
123
123
  break;
124
124
  this.merges.push([i.a, i.b]), this.vocab.add(i.a + i.b), I(g, i), t = await f(t, s, this.vocab.size);
125
125
  }
126
- a.forEach((i, l) => {
127
- const h = b[l];
128
- this.pretokenMap.set(i, h);
126
+ a.forEach((i, h) => {
127
+ const l = b[h];
128
+ this.pretokenMap.set(i, l);
129
129
  }), this.vocabIndex.clear();
130
130
  let v = 0;
131
131
  for (const i of this.vocab.keys())
@@ -145,15 +145,15 @@ class E extends z {
145
145
  }), this.pretokenMap.set(e, s), s;
146
146
  }
147
147
  tokeniseStrings(e) {
148
- return e.map((s) => k(s).map((n) => this.pretokenMap.has(n) ? this.pretokenMap.get(n) : this.tokeniseWord(n)).flat(1));
148
+ return e.map((s) => m(s).map((r) => this.pretokenMap.has(r) ? this.pretokenMap.get(r) : this.tokeniseWord(r)).flat(1));
149
149
  }
150
150
  tokenise(e, s) {
151
151
  const t = this.tokeniseStrings(e);
152
- return s ? t.map((r) => r.map((n) => this.vocabIndex.get(n) ?? this.unkToken)) : t.map((r) => r.map((n) => this.vocab.has(n) ? n : ""));
152
+ return s ? t.map((n) => n.map((r) => this.vocabIndex.get(r) ?? this.unkToken)) : t.map((n) => n.map((r) => this.vocab.has(r) ? r : ""));
153
153
  }
154
154
  detokenise(e) {
155
155
  const s = this.getVocab();
156
- return e.map((r) => r.map((n) => s[n]).join(""));
156
+ return e.map((n) => n.map((r) => s[r]).join(""));
157
157
  }
158
158
  encode(e) {
159
159
  return this.tokenise([e], !0)[0];
@@ -1,11 +1,11 @@
1
1
  import { default as EE } from 'eventemitter3';
2
- export type Roles = 'user' | 'assistant' | 'system';
2
+ export type Roles = 'user' | 'assistant' | 'system' | 'text';
3
3
  export interface Conversation {
4
4
  role: Roles;
5
5
  content: string;
6
6
  }
7
7
  export interface ITokeniser extends EE<'trainStatus'> {
8
- train(text: string[], cb?: (vocab: number) => void): Promise<number>;
8
+ train(text: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
9
9
  getVocab(): string[];
10
10
  getMerges(): [string, string][];
11
11
  destroy(): void;
@@ -6,22 +6,24 @@ function w(p, o, t, l) {
6
6
  const s = [t.bosToken], a = [!1], u = {
7
7
  user: t.getSpecialTokenIndex("<|user_start|>"),
8
8
  assistant: t.getSpecialTokenIndex("<|assistant_start|>"),
9
- system: t.getSpecialTokenIndex("<|system_start|>")
9
+ system: t.getSpecialTokenIndex("<|system_start|>"),
10
+ text: void 0
10
11
  }, c = {
11
12
  user: t.getSpecialTokenIndex("<|user_end|>"),
12
13
  assistant: t.getSpecialTokenIndex("<|assistant_end|>"),
13
- system: t.getSpecialTokenIndex("<|system_end|>")
14
+ system: t.getSpecialTokenIndex("<|system_end|>"),
15
+ text: void 0
14
16
  };
15
17
  for (const e of p) {
16
18
  const r = u[e.role], h = c[e.role];
17
19
  if (!r || !h)
18
20
  throw new Error(`Missing special tokens for role: ${e.role}`);
19
21
  s.push(r), a.push(!1);
20
- const m = e.role === "assistant", S = t.encode(e.content);
21
- for (const T of S) {
22
+ const m = e.role === "assistant", x = t.encode(e.content);
23
+ for (const T of x) {
22
24
  s.push(T);
23
- const x = t.isSpecialToken(T);
24
- a.push(m && !x);
25
+ const S = t.isSpecialToken(T);
26
+ a.push(m && !S);
25
27
  }
26
28
  s.push(h), a.push(m);
27
29
  }
@@ -40,7 +42,7 @@ function w(p, o, t, l) {
40
42
  }
41
43
  return g ? { xs: f, ys: d } : null;
42
44
  }
43
- class D {
45
+ class A {
44
46
  tokenizer;
45
47
  blockSize;
46
48
  constructor(o, t = 128) {
@@ -78,6 +80,6 @@ class D {
78
80
  }
79
81
  }
80
82
  export {
81
- D as SFTDatasetBuilder,
83
+ A as SFTDatasetBuilder,
82
84
  w as buildSFTExample
83
85
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@genai-fi/nanogpt",
3
- "version": "0.15.14",
3
+ "version": "0.16.1",
4
4
  "type": "module",
5
5
  "main": "dist/main.js",
6
6
  "types": "dist/main.d.ts",