npm - @genai-fi/nanogpt - Versions diffs - 0.15.14 → 0.16.1 - Mend

@genai-fi/nanogpt 0.15.14 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/TeachableLLM.d.ts +1 -1
package/dist/data/docx.d.ts +2 -1
package/dist/data/docx.js +8 -8
package/dist/data/parquet.d.ts +2 -1
package/dist/data/parquet.js +5 -5
package/dist/data/pdf.d.ts +2 -1
package/dist/data/pdf.js +1 -1
package/dist/data/textLoader.d.ts +2 -1
package/dist/data/textLoader.js +38 -37
package/dist/tokeniser/BaseTokeniser.d.ts +2 -2
package/dist/tokeniser/BaseTokeniser.js +35 -35
package/dist/tokeniser/CharTokeniser.d.ts +2 -1
package/dist/tokeniser/CharTokeniser.js +15 -15
package/dist/tokeniser/bpe.d.ts +2 -1
package/dist/tokeniser/bpe.js +40 -40
package/dist/tokeniser/type.d.ts +2 -2
package/dist/training/SFTDatasetBuilder.js +10 -8
package/package.json +1 -1

package/dist/TeachableLLM.d.ts CHANGED Viewed

@@ -49,7 +49,7 @@ export default class TeachableLLM {
     getNumParams(): number;
     trainer(trainingType?: TrainingType, options?: TrainingOptions): Trainer;
     train(text: Task[], options?: TrainingOptions, trainingType?: TrainingType): Promise<void>;
-    trainTokeniser(text: string[]): Promise<number>;
+    trainTokeniser(text: Conversation[][]): Promise<number>;
     generator(): IGenerator;
     generateText(prompt: Conversation[], options?: IGenerateOptions): Promise<Conversation[]>;
     generateText(options?: IGenerateOptions): Promise<Conversation[]>;

package/dist/data/docx.d.ts CHANGED Viewed

@@ -1 +1,2 @@
-export declare function loadDOCX(file: Blob | Uint8Array): Promise<string[]>;
+import { Conversation } from '../tokeniser/type';
+export declare function loadDOCX(file: Blob | Uint8Array): Promise<Conversation[][]>;

package/dist/data/docx.js CHANGED Viewed

@@ -1,13 +1,13 @@
 import { z as a } from "../jszip.min-BZhlzntC.js";
-async function c(n) {
-  const t = await (await a.loadAsync(n)).file("word/document.xml")?.async("string");
-  if (!t) throw new Error("Failed to load document.xml");
-  return i(t).split(`
-`).filter((r) => r.trim().length > 10);
+async function c(e) {
+  const n = await (await a.loadAsync(e)).file("word/document.xml")?.async("string");
+  if (!n) throw new Error("Failed to load document.xml");
+  return i(n).split(`
+`).filter((t) => t.trim().length > 10).map((t) => [{ role: "text", content: t }]);
 }
-function i(n) {
-  const t = new DOMParser().parseFromString(n, "application/xml");
-  return Array.from(t.getElementsByTagName("w:t")).map((r) => r.textContent).join(`
+function i(e) {
+  const n = new DOMParser().parseFromString(e, "application/xml");
+  return Array.from(n.getElementsByTagName("w:t")).map((t) => t.textContent).join(`
 `);
 }
 export {

package/dist/data/parquet.d.ts CHANGED Viewed

@@ -1 +1,2 @@
-export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<string[]>;
+import { Conversation } from '../tokeniser/type';
+export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<Conversation[][]>;

package/dist/data/parquet.js CHANGED Viewed

@@ -1,13 +1,13 @@
-import { B as n } from "../index-Cp39cXWe.js";
+import { B as f } from "../index-Cp39cXWe.js";
 const p = 100 * 1024 * 1024;
-async function d(i, s = p, e = "text") {
-  const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(n.from(await i.arrayBuffer())), a = [], f = r.getCursor([[e]]);
+async function d(i, n = p, e = "text") {
+  const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(f.from(await i.arrayBuffer())), a = [], s = r.getCursor([[e]]);
   let o = 0;
   for (; ; ) {
-    const t = await f.next();
+    const t = await s.next();
     if (!t || t[e] === void 0 || typeof t[e] != "string")
       break;
-    if (t[e].length !== 0 && (a.push(t[e]), o += t[e].length, o > s))
+    if (t[e].length !== 0 && (a.push([{ role: "text", content: t[e] }]), o += t[e].length, o > n))
       break;
   }
   return r.close(), a;

package/dist/data/pdf.d.ts CHANGED Viewed

@@ -1 +1,2 @@
-export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<string[]>;
+import { Conversation } from '../tokeniser/type';
+export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<Conversation[][]>;

package/dist/data/pdf.js CHANGED Viewed

@@ -5,7 +5,7 @@ async function h(l, X = 104857600) {
   let m = 0;
   for (let b = 1; b <= N; b++) {
     const G = (await (await d.getPage(b)).getTextContent()).items.filter((c) => c.str.trim().length > 10).map((c) => c.str).join(" ");
-    if (W.push(G), m += G.length, m > X) break;
+    if (W.push([{ role: "text", content: G }]), m += G.length, m > X) break;
   }
   return W;
 }

package/dist/data/textLoader.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
+import { Conversation } from '../tokeniser/type';
 export interface DataOptions {
     maxSize?: number;
     column?: string;
     hasHeader?: boolean;
 }
-export default function loadTextData(file: File, options?: DataOptions): Promise<string[]>;
+export default function loadTextData(file: File, options?: DataOptions): Promise<Conversation[][]>;

package/dist/data/textLoader.js CHANGED Viewed

@@ -1,11 +1,11 @@
 import { p as u } from "../papaparse.min-C0cScC2i.js";
 import { loadParquet as f } from "./parquet.js";
 import { loadPDF as d } from "./pdf.js";
-import { loadDOCX as m } from "./docx.js";
-import { z as x } from "../jszip.min-BZhlzntC.js";
+import { loadDOCX as x } from "./docx.js";
+import { z as m } from "../jszip.min-BZhlzntC.js";
 function y(t, r) {
-  const a = t.findIndex((i) => i.toLowerCase() === r.toLowerCase());
-  return a === -1 ? 0 : a;
+  const n = t.findIndex((i) => i.toLowerCase() === r.toLowerCase());
+  return n === -1 ? 0 : n;
 }
 function w(t) {
   return t.every((r) => r.length < 64);
@@ -35,73 +35,74 @@ function g(t) {
       return "unknown";
   }
 }
-function j(t) {
+function z(t) {
   if (!Array.isArray(t)) return !1;
   const r = t[0];
   return typeof r == "object" && r !== null && "role" in r && "content" in r && typeof r.role == "string" && typeof r.content == "string";
 }
-async function z(t, r) {
-  const a = t.type !== "" ? t.type : g(t.name);
-  if (a === "application/parquet")
+async function j(t, r) {
+  const n = t.type !== "" ? t.type : g(t.name);
+  if (n === "application/parquet")
     return f(t, r?.maxSize, r?.column);
-  if (a === "application/pdf")
+  if (n === "application/pdf")
     return d(t, r?.maxSize);
-  if (a === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
-    return m(t);
-  if (a === "application/json") {
+  if (n === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
+    return x(t);
+  if (n === "application/json") {
     const i = await t.text(), o = JSON.parse(i);
     if (Array.isArray(o))
-      return o.map(
-        (e) => typeof e == "string" ? e : "text" in e ? e.text : JSON.stringify(e)
-      );
+      return o.map((e) => [
+        typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
+      ]);
     throw new Error("Expected JSON array");
   }
-  if (a === "application/jsonl")
+  if (n === "application/jsonl")
     return (await t.text()).split(`
 `).filter((o) => o.trim() !== "").map((o) => {
       try {
         const e = JSON.parse(o);
-        return j(e) ? e.map((n) => `${n.content}`).join(`
-`) : typeof e == "string" ? e : "text" in e ? e.text : JSON.stringify(e);
+        return z(e) ? e : [
+          typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
+        ];
       } catch {
-        return o;
+        return [{ role: "text", content: o }];
       }
     });
-  if (a === "application/zip") {
-    const i = await x.loadAsync(t), o = [];
+  if (n === "application/zip") {
+    const i = await m.loadAsync(t), o = [];
     for (const e of Object.keys(i.files)) {
-      const n = i.file(e);
-      if (n) {
-        const s = await n.async("blob"), c = await z(new File([s], e), r);
-        o.push(...c);
+      const a = i.file(e);
+      if (a) {
+        const c = await a.async("blob"), s = await j(new File([c], e), r);
+        o.push(...s);
       }
     }
     return o;
   }
-  if (a === "text/csv") {
+  if (n === "text/csv") {
     const i = await t.text();
     return new Promise((o, e) => {
       u.parse(i, {
         header: !1,
         skipEmptyLines: !0,
         delimiter: ",",
-        complete: (n) => {
-          if (n.errors.length > 0)
-            console.error(n.errors), e(new Error("Error parsing file"));
+        complete: (a) => {
+          if (a.errors.length > 0)
+            console.error(a.errors), e(new Error("Error parsing file"));
           else {
-            const s = y(n.data[0], r?.column || "text"), p = r?.hasHeader ?? w(n.data[0]) ? n.data.slice(1) : n.data;
-            o(p.map((l) => l[s]));
+            const c = y(a.data[0], r?.column || "text"), p = r?.hasHeader ?? w(a.data[0]) ? a.data.slice(1) : a.data;
+            o(p.map((l) => [{ role: "text", content: l[c] }]));
           }
         },
-        error: (n) => {
-          e(n);
+        error: (a) => {
+          e(a);
         }
       });
     });
-  } else if (a === "text/plain")
-    return [await t.text()];
-  throw new Error(`Unsupported file type: ${a}`);
+  } else if (n === "text/plain")
+    return [[{ role: "text", content: await t.text() }]];
+  throw new Error(`Unsupported file type: ${n}`);
 }
 export {
-  z as default
+  j as default
 };

package/dist/tokeniser/BaseTokeniser.d.ts CHANGED Viewed

@@ -12,7 +12,7 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
     isSpecialToken(index: number): boolean;
     protected addSpecialTokens(): void;
     protected addSpecialToken(token: string, index: number): void;
-    abstract train(text: string[], cb?: (vocab: number) => void): Promise<number>;
+    abstract train(text: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
     abstract getVocab(): string[];
     abstract getMerges(): [string, string][];
     abstract destroy(): void;
@@ -21,6 +21,6 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
     encodeAsSequence(conversation: Conversation[], completion?: boolean): number[];
     encodeConversation(conversation: Conversation[], completion?: boolean): number[];
     abstract decode(tokens: number[]): string;
-    decodeConversation(tokens: number[]): Conversation[];
+    decodeConversation(tokens: number[] | Uint16Array): Conversation[];
     getSpecialTokenIndex(token: string): number | undefined;
 }

package/dist/tokeniser/BaseTokeniser.js CHANGED Viewed

@@ -11,30 +11,30 @@ const h = [
   "<|system_start|>",
   "<|system_end|>"
 ];
-class l extends r {
+class k extends r {
   specialTokens = /* @__PURE__ */ new Map();
   specialTokenSet = /* @__PURE__ */ new Set();
-  isSpecialToken(e) {
-    return this.specialTokenSet.has(e);
+  isSpecialToken(s) {
+    return this.specialTokenSet.has(s);
   }
   addSpecialTokens() {
-    h.forEach((e, t) => {
-      this.addToken(e, t), this.specialTokens.set(e, t), this.specialTokenSet.add(t);
+    h.forEach((s, t) => {
+      this.addToken(s, t), this.specialTokens.set(s, t), this.specialTokenSet.add(t);
     });
   }
-  addSpecialToken(e, t) {
-    this.specialTokens.set(e, t), this.specialTokenSet.add(t);
+  addSpecialToken(s, t) {
+    this.specialTokens.set(s, t), this.specialTokenSet.add(t);
   }
-  encodeSequence(e) {
-    const t = this.encode(e);
+  encodeSequence(s) {
+    const t = this.encode(s);
     return [this.bosToken, ...t, this.eosToken];
   }
-  encodeAsSequence(e, t) {
-    const s = e.flatMap((o) => this.encode(o.content));
-    return t ? [this.bosToken, ...s, this.eosToken, this.bosToken] : [this.bosToken, ...s, this.eosToken];
+  encodeAsSequence(s, t) {
+    const e = s.flatMap((o) => this.encode(o.content));
+    return t ? [this.bosToken, ...e, this.eosToken, this.bosToken] : [this.bosToken, ...e, this.eosToken];
   }
-  encodeConversation(e, t) {
-    const s = [[this.bosToken]], o = [
+  encodeConversation(s, t) {
+    const e = [[this.bosToken]], o = [
       this.getSpecialTokenIndex("<|user_start|>"),
       this.getSpecialTokenIndex("<|assistant_start|>"),
       this.getSpecialTokenIndex("<|system_start|>")
@@ -43,57 +43,57 @@ class l extends r {
       this.getSpecialTokenIndex("<|assistant_end|>"),
       this.getSpecialTokenIndex("<|system_end|>")
     ];
-    for (const i of e) {
+    for (const i of s) {
       const c = this.encode(i.content);
       switch (i.role) {
         case "user":
-          s.push([o[0]]);
+          e.push([o[0]]);
           break;
         case "assistant":
-          s.push([o[1]]);
+          e.push([o[1]]);
           break;
         case "system":
-          s.push([o[2]]);
+          e.push([o[2]]);
           break;
       }
-      switch (s.push(c), i.role) {
+      switch (e.push(c), i.role) {
         case "user":
-          s.push([n[0]]);
+          e.push([n[0]]);
           break;
         case "assistant":
-          s.push([n[1]]);
+          e.push([n[1]]);
           break;
         case "system":
-          s.push([n[2]]);
+          e.push([n[2]]);
           break;
       }
     }
-    const a = s.flat();
+    const a = e.flat();
     return t ? a.push(o[1]) : a.push(this.eosToken), a;
   }
-  decodeConversation(e) {
+  decodeConversation(s) {
     const t = [];
-    let s = 0;
-    for (; s < e.length; ) {
-      const o = e[s];
+    let e = 0;
+    for (; e < s.length; ) {
+      const o = s[e];
       let n = null;
-      if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") && (n = "system"), n) {
-        s++;
+      if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") ? n = "system" : o === this.bosToken || (o === this.eosToken ? n = null : (n = "text", e--)), n) {
+        e++;
         const a = [];
-        for (; s < e.length && e[s] !== this.getSpecialTokenIndex(`<|${n}_end|>`); )
-          a.push(e[s]), s++;
+        for (; e < s.length && s[e] !== this.getSpecialTokenIndex(`<|${n}_end|>`) && s[e] !== this.eosToken; )
+          a.push(s[e]), e++;
         const i = this.decode(a);
         t.push({ role: n, content: i });
       }
-      s++;
+      e++;
     }
     return t;
   }
-  getSpecialTokenIndex(e) {
-    return this.specialTokens.get(e);
+  getSpecialTokenIndex(s) {
+    return this.specialTokens.get(s);
   }
 }
 export {
   h as SPECIALS,
-  l as default
+  k as default
 };

package/dist/tokeniser/CharTokeniser.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import { default as BaseTokeniser } from './BaseTokeniser';
+import { Conversation } from './type';
 export default class CharTokeniser extends BaseTokeniser {
     vocabSize: number;
     eosToken: number;
@@ -11,7 +12,7 @@ export default class CharTokeniser extends BaseTokeniser {
     addToken(token: string, index?: number): number;
     get trained(): boolean;
     destroy(): void;
-    train(text: string[]): Promise<number>;
+    train(text: Conversation[][]): Promise<number>;
     tokenise(text: string[], numeric: true): number[][];
     tokenise(text: string[]): string[][];
     detokenise(tokens: (number[] | Uint16Array)[]): string[];

package/dist/tokeniser/CharTokeniser.js CHANGED Viewed

@@ -40,32 +40,32 @@ class T extends k {
     this.cache.clear(), this.vocab = [];
   }
   async train(i) {
-    const t = i.map((n) => n.split("")).flat(), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), o = this.vocabSize - u.length;
+    const t = i.map((o) => o.map((n) => n.content.split(""))).flat(2), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - u.length;
     if (h === -1)
       return this.vocabSize;
-    if (this._trained = !0, s.length > o) {
-      const n = /* @__PURE__ */ new Map();
-      t.forEach((a) => {
-        n.set(a, (n.get(a) || 0) + 1);
-      }), s.sort((a, r) => (n.get(a) || 0) - (n.get(r) || 0)), s.splice(0, s.length - o);
+    if (this._trained = !0, s.length > a) {
+      const o = /* @__PURE__ */ new Map();
+      t.forEach((n) => {
+        o.set(n, (o.get(n) || 0) + 1);
+      }), s.sort((n, r) => (o.get(n) || 0) - (o.get(r) || 0)), s.splice(0, s.length - a);
     }
     let c = h;
     if (c !== -1) {
-      const n = new Set(this.vocab);
-      for (const a of s)
-        if (!n.has(a) && (this.vocab[c] = a, n.add(a), c = this.vocab.indexOf("", c + 1), c === -1))
+      const o = new Set(this.vocab);
+      for (const n of s)
+        if (!o.has(n) && (this.vocab[c] = n, o.add(n), c = this.vocab.indexOf("", c + 1), c === -1))
           break;
     }
-    return this.cache.clear(), this.vocab.forEach((n, a) => {
-      this.cache.set(n, a);
+    return this.cache.clear(), this.vocab.forEach((o, n) => {
+      this.cache.set(o, n);
     }), this.emit("trainStatus", "trained"), this.vocabSize;
   }
   tokenise(i, t) {
     if (!this.trained)
       throw new Error("Tokeniser not trained");
     return i.map((s) => t ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
-      const o = this.cache.get(h);
-      return o !== void 0 ? this.vocab[o] : "";
+      const a = this.cache.get(h);
+      return a !== void 0 ? this.vocab[a] : "";
     }));
   }
   detokenise(i) {
@@ -85,8 +85,8 @@ class T extends k {
   }
   async createTrainingData(i, t = 5) {
     const e = await this.tokenise(i, !0), s = [], h = [];
-    for (let o = 0; o < e.length - t; o++)
-      s.push(...e[o].slice(0, t)), h.push(e[o + 1][0]);
+    for (let a = 0; a < e.length - t; a++)
+      s.push(...e[a].slice(0, t)), h.push(e[a + 1][0]);
     return [s, h];
   }
 }

package/dist/tokeniser/bpe.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import { default as BaseTokeniser } from './BaseTokeniser';
+import { Conversation } from './type';
 export default class BPETokeniser extends BaseTokeniser {
     private targetSize;
     private vocab;
@@ -14,7 +15,7 @@ export default class BPETokeniser extends BaseTokeniser {
     get eosToken(): number;
     get bosToken(): number;
     get unkToken(): number;
-    train(text: string[], cb?: (vocab: number) => void): Promise<number>;
+    train(text?: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
     getVocab(): string[];
     getMerges(): [string, string][];
     private tokeniseWord;

package/dist/tokeniser/bpe.js CHANGED Viewed

@@ -1,6 +1,6 @@
 import { yieldIfNeeded as f } from "../utilities/yielder.js";
-import k from "../utilities/tokenParse.js";
-import z, { SPECIALS as m } from "./BaseTokeniser.js";
+import m from "../utilities/tokenParse.js";
+import z, { SPECIALS as k } from "./BaseTokeniser.js";
 function p(o, e) {
   return `${o}-::-${e}`;
 }
@@ -8,25 +8,25 @@ function w(o) {
   const e = /* @__PURE__ */ new Map();
   for (let s = 0; s < o.length; s++) {
     const t = o[s];
-    for (let r = 0; r < t.length - 1; r++) {
-      const n = p(t[r], t[r + 1]), a = e.get(n) || {
-        a: t[r],
-        b: t[r + 1],
+    for (let n = 0; n < t.length - 1; n++) {
+      const r = p(t[n], t[n + 1]), a = e.get(r) || {
+        a: t[n],
+        b: t[n + 1],
         count: 0,
         instances: /* @__PURE__ */ new Set()
       };
-      a.count += 1, a.instances.add(s), e.set(n, a);
+      a.count += 1, a.instances.add(s), e.set(r, a);
     }
   }
   return { pairs: e, tokens: o };
 }
-function d(o, e, s, t, r) {
-  const n = p(e, s);
-  if (o.pairs.has(n)) {
-    const a = o.pairs.get(n);
-    a.count += r, r > 0 ? a.instances.add(t) : a.count <= 0 ? o.pairs.delete(n) : a.instances.delete(t);
+function d(o, e, s, t, n) {
+  const r = p(e, s);
+  if (o.pairs.has(r)) {
+    const a = o.pairs.get(r);
+    a.count += n, n > 0 ? a.instances.add(t) : a.count <= 0 ? o.pairs.delete(r) : a.instances.delete(t);
   } else
-    o.pairs.set(n, { a: e, b: s, count: r, instances: /* @__PURE__ */ new Set([t]) });
+    o.pairs.set(r, { a: e, b: s, count: n, instances: /* @__PURE__ */ new Set([t]) });
 }
 function T(o) {
   let e = null, s = 0;
@@ -37,21 +37,21 @@ function T(o) {
 function y(o, e) {
   return o.map((s) => {
     const t = [];
-    for (let r = 0; r < s.length; r++)
-      r < s.length - 1 && s[r] === e[0] && s[r + 1] === e[1] ? (t.push(e[0] + e[1]), r++) : t.push(s[r]);
+    for (let n = 0; n < s.length; n++)
+      n < s.length - 1 && s[n] === e[0] && s[n + 1] === e[1] ? (t.push(e[0] + e[1]), n++) : t.push(s[n]);
     return t;
   });
 }
 function I(o, e) {
   e.instances.forEach((s) => {
-    const t = o.tokens[s], r = [];
-    for (let n = 0; n < t.length; n++)
-      if (n < t.length - 1 && t[n] === e.a && t[n + 1] === e.b) {
+    const t = o.tokens[s], n = [];
+    for (let r = 0; r < t.length; r++)
+      if (r < t.length - 1 && t[r] === e.a && t[r + 1] === e.b) {
         const a = e.a + e.b;
-        r.push(a), n > 0 && (d(o, t[n - 1], e.a, s, -1), d(o, t[n - 1], a, s, 1)), n++, n < t.length - 1 && (d(o, e.b, t[n + 1], s, -1), d(o, a, t[n + 1], s, 1));
+        n.push(a), r > 0 && (d(o, t[r - 1], e.a, s, -1), d(o, t[r - 1], a, s, 1)), r++, r < t.length - 1 && (d(o, e.b, t[r + 1], s, -1), d(o, a, t[r + 1], s, 1));
       } else
-        r.push(t[n]);
-    o.tokens[s] = r;
+        n.push(t[r]);
+    o.tokens[s] = n;
   }), o.pairs.delete(p(e.a, e.b));
 }
 class E extends z {
@@ -61,11 +61,11 @@ class E extends z {
   merges = [];
   pretokenMap = /* @__PURE__ */ new Map();
   constructor(e, s) {
-    super(), Array.isArray(e) ? (e.forEach((t, r) => {
-      this.vocab.add(t), this.vocabIndex.set(t, r);
-    }), s && (this.merges = s), this.targetSize = e.length, m.forEach((t) => {
-      const r = e.indexOf(t);
-      r !== -1 && this.addSpecialToken(t, r);
+    super(), Array.isArray(e) ? (e.forEach((t, n) => {
+      this.vocab.add(t), this.vocabIndex.set(t, n);
+    }), s && (this.merges = s), this.targetSize = e.length, k.forEach((t) => {
+      const n = e.indexOf(t);
+      n !== -1 && this.addSpecialToken(t, n);
     })) : (this.addSpecialTokens(), this.targetSize = e);
   }
   addToken(e, s) {
@@ -81,7 +81,7 @@ class E extends z {
     this.vocab.clear(), this.vocabIndex.clear(), this.merges = [], this.pretokenMap.clear();
   }
   get trained() {
-    return this.vocab.size > m.length && this.vocab.size <= this.targetSize;
+    return this.vocab.size > k.length && this.vocab.size <= this.targetSize;
   }
   get vocabSize() {
     return this.vocab.size;
@@ -95,23 +95,23 @@ class E extends z {
   get unkToken() {
     return this.vocabIndex.get("") ?? 1;
   }
-  async train(e, s) {
+  async train(e = [], s) {
     let t = performance.now();
-    const r = e.map((i) => k(i)).flat(1);
+    const n = e.map((i) => i.map((h) => m(h.content))).flat(2);
     t = await f(t, s, this.vocab.size);
-    const n = new Set(r);
+    const r = new Set(n);
     this.vocab = /* @__PURE__ */ new Set(), this.pretokenMap.clear(), this.merges = [], this.addSpecialTokens();
-    const a = Array.from(n), b = a.map((i) => Array.from(i).map((h) => (this.vocab.add(h), h))), g = w(b);
+    const a = Array.from(r), b = a.map((i) => Array.from(i).map((l) => (this.vocab.add(l), l))), g = w(b);
     if (t = await f(t, s, this.vocab.size), this.vocab.size >= this.targetSize) {
       console.warn("Initial vocab size is greater than or equal to target size. No merges will be performed.");
       const i = /* @__PURE__ */ new Map();
-      r.forEach((c) => {
+      n.forEach((c) => {
         Array.from(c).forEach((u) => {
           i.set(u, (i.get(u) || 0) + 1);
         });
       });
-      const l = Array.from(i.entries()).sort((c, u) => u[1] - c[1]);
-      this.vocab = /* @__PURE__ */ new Set(), this.addSpecialTokens(), l.slice(0, this.targetSize - this.vocab.size).map(([c]) => c).forEach((c) => this.vocab.add(c)), this.vocabIndex.clear();
+      const h = Array.from(i.entries()).sort((c, u) => u[1] - c[1]);
+      this.vocab = /* @__PURE__ */ new Set(), this.addSpecialTokens(), h.slice(0, this.targetSize - this.vocab.size).map(([c]) => c).forEach((c) => this.vocab.add(c)), this.vocabIndex.clear();
       let S = 0;
       for (const c of this.vocab.keys())
         this.vocabIndex.set(c, S++);
@@ -123,9 +123,9 @@ class E extends z {
         break;
       this.merges.push([i.a, i.b]), this.vocab.add(i.a + i.b), I(g, i), t = await f(t, s, this.vocab.size);
     }
-    a.forEach((i, l) => {
-      const h = b[l];
-      this.pretokenMap.set(i, h);
+    a.forEach((i, h) => {
+      const l = b[h];
+      this.pretokenMap.set(i, l);
     }), this.vocabIndex.clear();
     let v = 0;
     for (const i of this.vocab.keys())
@@ -145,15 +145,15 @@ class E extends z {
     }), this.pretokenMap.set(e, s), s;
   }
   tokeniseStrings(e) {
-    return e.map((s) => k(s).map((n) => this.pretokenMap.has(n) ? this.pretokenMap.get(n) : this.tokeniseWord(n)).flat(1));
+    return e.map((s) => m(s).map((r) => this.pretokenMap.has(r) ? this.pretokenMap.get(r) : this.tokeniseWord(r)).flat(1));
   }
   tokenise(e, s) {
     const t = this.tokeniseStrings(e);
-    return s ? t.map((r) => r.map((n) => this.vocabIndex.get(n) ?? this.unkToken)) : t.map((r) => r.map((n) => this.vocab.has(n) ? n : ""));
+    return s ? t.map((n) => n.map((r) => this.vocabIndex.get(r) ?? this.unkToken)) : t.map((n) => n.map((r) => this.vocab.has(r) ? r : ""));
   }
   detokenise(e) {
     const s = this.getVocab();
-    return e.map((r) => r.map((n) => s[n]).join(""));
+    return e.map((n) => n.map((r) => s[r]).join(""));
   }
   encode(e) {
     return this.tokenise([e], !0)[0];

package/dist/tokeniser/type.d.ts CHANGED Viewed

@@ -1,11 +1,11 @@
 import { default as EE } from 'eventemitter3';
-export type Roles = 'user' | 'assistant' | 'system';
+export type Roles = 'user' | 'assistant' | 'system' | 'text';
 export interface Conversation {
     role: Roles;
     content: string;
 }
 export interface ITokeniser extends EE<'trainStatus'> {
-    train(text: string[], cb?: (vocab: number) => void): Promise<number>;
+    train(text: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
     getVocab(): string[];
     getMerges(): [string, string][];
     destroy(): void;

package/dist/training/SFTDatasetBuilder.js CHANGED Viewed

@@ -6,22 +6,24 @@ function w(p, o, t, l) {
   const s = [t.bosToken], a = [!1], u = {
     user: t.getSpecialTokenIndex("<|user_start|>"),
     assistant: t.getSpecialTokenIndex("<|assistant_start|>"),
-    system: t.getSpecialTokenIndex("<|system_start|>")
+    system: t.getSpecialTokenIndex("<|system_start|>"),
+    text: void 0
   }, c = {
     user: t.getSpecialTokenIndex("<|user_end|>"),
     assistant: t.getSpecialTokenIndex("<|assistant_end|>"),
-    system: t.getSpecialTokenIndex("<|system_end|>")
+    system: t.getSpecialTokenIndex("<|system_end|>"),
+    text: void 0
   };
   for (const e of p) {
     const r = u[e.role], h = c[e.role];
     if (!r || !h)
       throw new Error(`Missing special tokens for role: ${e.role}`);
     s.push(r), a.push(!1);
-    const m = e.role === "assistant", S = t.encode(e.content);
-    for (const T of S) {
+    const m = e.role === "assistant", x = t.encode(e.content);
+    for (const T of x) {
       s.push(T);
-      const x = t.isSpecialToken(T);
-      a.push(m && !x);
+      const S = t.isSpecialToken(T);
+      a.push(m && !S);
     }
     s.push(h), a.push(m);
   }
@@ -40,7 +42,7 @@ function w(p, o, t, l) {
   }
   return g ? { xs: f, ys: d } : null;
 }
-class D {
+class A {
   tokenizer;
   blockSize;
   constructor(o, t = 128) {
@@ -78,6 +80,6 @@ class D {
   }
 }
 export {
-  D as SFTDatasetBuilder,
+  A as SFTDatasetBuilder,
   w as buildSFTExample
 };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@genai-fi/nanogpt",
-    "version": "0.15.14",
+    "version": "0.16.1",
     "type": "module",
     "main": "dist/main.js",
     "types": "dist/main.d.ts",