npm - @genai-fi/nanogpt - Versions diffs - 0.15.13 → 0.16.1 - Mend

@genai-fi/nanogpt 0.15.13 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/dist/TeachableLLM.d.ts +1 -1
package/dist/Trainer.js +10 -5
package/dist/data/docx.d.ts +2 -1
package/dist/data/docx.js +8 -8
package/dist/data/parquet.d.ts +2 -1
package/dist/data/parquet.js +5 -5
package/dist/data/pdf.d.ts +2 -1
package/dist/data/pdf.js +1 -1
package/dist/data/textLoader.d.ts +2 -1
package/dist/data/textLoader.js +55 -48
package/dist/tokeniser/BaseTokeniser.d.ts +2 -2
package/dist/tokeniser/BaseTokeniser.js +35 -35
package/dist/tokeniser/CharTokeniser.d.ts +2 -1
package/dist/tokeniser/CharTokeniser.js +15 -15
package/dist/tokeniser/bpe.d.ts +2 -1
package/dist/tokeniser/bpe.js +40 -40
package/dist/tokeniser/type.d.ts +2 -2
package/dist/training/BasicTrainer.js +62 -62
package/dist/training/Evaluator.d.ts +2 -1
package/dist/training/Evaluator.js +19 -18
package/dist/training/SFTDatasetBuilder.js +47 -38
package/dist/training/tasks/ConversationTask.d.ts +2 -2
package/dist/training/tasks/ConversationTask.js +13 -11
package/dist/training/tasks/PretrainingTask.d.ts +1 -2
package/dist/training/tasks/PretrainingTask.js +4 -14
package/dist/training/tasks/StartSentenceTask.d.ts +1 -2
package/dist/training/tasks/StartSentenceTask.js +2 -7
package/dist/training/tasks/Task.d.ts +1 -2
package/dist/training/tasks/splitter.d.ts +5 -0
package/dist/training/tasks/splitter.js +21 -0
package/dist/training/validation.js +1 -1
package/package.json +1 -1

package/dist/TeachableLLM.d.ts CHANGED Viewed

@@ -49,7 +49,7 @@ export default class TeachableLLM {
     getNumParams(): number;
     trainer(trainingType?: TrainingType, options?: TrainingOptions): Trainer;
     train(text: Task[], options?: TrainingOptions, trainingType?: TrainingType): Promise<void>;
-    trainTokeniser(text: string[]): Promise<number>;
+    trainTokeniser(text: Conversation[][]): Promise<number>;
     generator(): IGenerator;
     generateText(prompt: Conversation[], options?: IGenerateOptions): Promise<Conversation[]>;
     generateText(options?: IGenerateOptions): Promise<Conversation[]>;

package/dist/Trainer.js CHANGED Viewed

@@ -1,7 +1,8 @@
 import { E as g } from "./index-DvYrXKkX.js";
 import o from "./training/PreTrainer.js";
-import { createTrainValidationSplit as p } from "./training/validation.js";
+import { createTrainValidationSplit as d } from "./training/validation.js";
 import h from "./training/SFTTrainer.js";
+import p from "./training/tasks/splitter.js";
 class l extends g {
   trainer;
   trainingType = "pretraining";
@@ -81,7 +82,7 @@ class l extends g {
   async prepare(t = []) {
     const i = this.options;
     if (this.trainingType === "pretraining" && this.trainer instanceof o) {
-      const { trainDataset: e, validationDataset: a, size: r, trainState: n } = await p(
+      const { trainDataset: e, validationDataset: a, size: r, trainState: n } = await d(
         t,
         this.trainer.tokenizer,
         this.trainer.datasetBuilder,
@@ -92,12 +93,16 @@ class l extends g {
     } else if (this.trainingType === "sft" && this.trainer instanceof h) {
       if (t instanceof Uint16Array)
         throw new Error("SFT training requires Task[] input");
-      const e = await this.trainer.datasetBuilder.createSFTDataset(
-        t,
+      const e = p(t, i?.validationSplit || 0.1), a = await this.trainer.datasetBuilder.createSFTDataset(
+        [e.training],
+        i?.batchSize || 32,
+        -100
+      ), r = await this.trainer.datasetBuilder.createSFTDataset(
+        [e.validation],
         i?.batchSize || 32,
         -100
       );
-      this.trainDataset = e, this.totalSamples = t.reduce((a, r) => a + r.length, 0), this.options.epochSteps = Math.ceil(this.totalSamples / (i?.batchSize || 32)), this.trainer.updateOptimizer(this.options);
+      this.validationDataset = r, this.trainDataset = a, this.totalSamples = t.reduce((n, s) => n + s.length, 0), this.options.epochSteps = Math.ceil(this.totalSamples / (i?.batchSize || 32)), this.trainer.updateOptimizer(this.options);
     }
   }
   configureModel(t) {

package/dist/data/docx.d.ts CHANGED Viewed

@@ -1 +1,2 @@
-export declare function loadDOCX(file: Blob | Uint8Array): Promise<string[]>;
+import { Conversation } from '../tokeniser/type';
+export declare function loadDOCX(file: Blob | Uint8Array): Promise<Conversation[][]>;

package/dist/data/docx.js CHANGED Viewed

@@ -1,13 +1,13 @@
 import { z as a } from "../jszip.min-BZhlzntC.js";
-async function c(n) {
-  const t = await (await a.loadAsync(n)).file("word/document.xml")?.async("string");
-  if (!t) throw new Error("Failed to load document.xml");
-  return i(t).split(`
-`).filter((r) => r.trim().length > 10);
+async function c(e) {
+  const n = await (await a.loadAsync(e)).file("word/document.xml")?.async("string");
+  if (!n) throw new Error("Failed to load document.xml");
+  return i(n).split(`
+`).filter((t) => t.trim().length > 10).map((t) => [{ role: "text", content: t }]);
 }
-function i(n) {
-  const t = new DOMParser().parseFromString(n, "application/xml");
-  return Array.from(t.getElementsByTagName("w:t")).map((r) => r.textContent).join(`
+function i(e) {
+  const n = new DOMParser().parseFromString(e, "application/xml");
+  return Array.from(n.getElementsByTagName("w:t")).map((t) => t.textContent).join(`
 `);
 }
 export {

package/dist/data/parquet.d.ts CHANGED Viewed

@@ -1 +1,2 @@
-export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<string[]>;
+import { Conversation } from '../tokeniser/type';
+export declare function loadParquet(file: File, maxSize?: number, column?: string): Promise<Conversation[][]>;

package/dist/data/parquet.js CHANGED Viewed

@@ -1,13 +1,13 @@
-import { B as n } from "../index-Cp39cXWe.js";
+import { B as f } from "../index-Cp39cXWe.js";
 const p = 100 * 1024 * 1024;
-async function d(i, s = p, e = "text") {
-  const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(n.from(await i.arrayBuffer())), a = [], f = r.getCursor([[e]]);
+async function d(i, n = p, e = "text") {
+  const r = await (await import("../parquet-Bqjmp2vo.js").then((t) => t.p)).ParquetReader.openBuffer(f.from(await i.arrayBuffer())), a = [], s = r.getCursor([[e]]);
   let o = 0;
   for (; ; ) {
-    const t = await f.next();
+    const t = await s.next();
     if (!t || t[e] === void 0 || typeof t[e] != "string")
       break;
-    if (t[e].length !== 0 && (a.push(t[e]), o += t[e].length, o > s))
+    if (t[e].length !== 0 && (a.push([{ role: "text", content: t[e] }]), o += t[e].length, o > n))
       break;
   }
   return r.close(), a;

package/dist/data/pdf.d.ts CHANGED Viewed

@@ -1 +1,2 @@
-export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<string[]>;
+import { Conversation } from '../tokeniser/type';
+export declare function loadPDF(file: Blob | Uint8Array, maxSize?: number): Promise<Conversation[][]>;

package/dist/data/pdf.js CHANGED Viewed

@@ -5,7 +5,7 @@ async function h(l, X = 104857600) {
   let m = 0;
   for (let b = 1; b <= N; b++) {
     const G = (await (await d.getPage(b)).getTextContent()).items.filter((c) => c.str.trim().length > 10).map((c) => c.str).join(" ");
-    if (W.push(G), m += G.length, m > X) break;
+    if (W.push([{ role: "text", content: G }]), m += G.length, m > X) break;
   }
   return W;
 }

package/dist/data/textLoader.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
+import { Conversation } from '../tokeniser/type';
 export interface DataOptions {
     maxSize?: number;
     column?: string;
     hasHeader?: boolean;
 }
-export default function loadTextData(file: File, options?: DataOptions): Promise<string[]>;
+export default function loadTextData(file: File, options?: DataOptions): Promise<Conversation[][]>;

package/dist/data/textLoader.js CHANGED Viewed

@@ -1,14 +1,14 @@
 import { p as u } from "../papaparse.min-C0cScC2i.js";
-import { loadParquet as d } from "./parquet.js";
-import { loadPDF as f } from "./pdf.js";
-import { loadDOCX as m } from "./docx.js";
-import { z as x } from "../jszip.min-BZhlzntC.js";
-function w(t, n) {
-  const r = t.findIndex((i) => i.toLowerCase() === n.toLowerCase());
-  return r === -1 ? 0 : r;
+import { loadParquet as f } from "./parquet.js";
+import { loadPDF as d } from "./pdf.js";
+import { loadDOCX as x } from "./docx.js";
+import { z as m } from "../jszip.min-BZhlzntC.js";
+function y(t, r) {
+  const n = t.findIndex((i) => i.toLowerCase() === r.toLowerCase());
+  return n === -1 ? 0 : n;
 }
-function y(t) {
-  return t.every((n) => n.length < 64);
+function w(t) {
+  return t.every((r) => r.length < 64);
 }
 function h(t) {
   return t.split(".").pop() || "";
@@ -35,67 +35,74 @@ function g(t) {
       return "unknown";
   }
 }
-async function z(t, n) {
-  const r = t.type !== "" ? t.type : g(t.name);
-  if (r === "application/parquet")
-    return d(t, n?.maxSize, n?.column);
-  if (r === "application/pdf")
-    return f(t, n?.maxSize);
-  if (r === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
-    return m(t);
-  if (r === "application/json") {
-    const i = await t.text(), a = JSON.parse(i);
-    if (Array.isArray(a))
-      return a.map(
-        (e) => typeof e == "string" ? e : "text" in e ? e.text : JSON.stringify(e)
-      );
+function z(t) {
+  if (!Array.isArray(t)) return !1;
+  const r = t[0];
+  return typeof r == "object" && r !== null && "role" in r && "content" in r && typeof r.role == "string" && typeof r.content == "string";
+}
+async function j(t, r) {
+  const n = t.type !== "" ? t.type : g(t.name);
+  if (n === "application/parquet")
+    return f(t, r?.maxSize, r?.column);
+  if (n === "application/pdf")
+    return d(t, r?.maxSize);
+  if (n === "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
+    return x(t);
+  if (n === "application/json") {
+    const i = await t.text(), o = JSON.parse(i);
+    if (Array.isArray(o))
+      return o.map((e) => [
+        typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
+      ]);
     throw new Error("Expected JSON array");
   }
-  if (r === "application/jsonl")
+  if (n === "application/jsonl")
     return (await t.text()).split(`
-`).filter((a) => a.trim() !== "").map((a) => {
+`).filter((o) => o.trim() !== "").map((o) => {
       try {
-        const e = JSON.parse(a);
-        return typeof e == "string" ? e : "text" in e ? e.text : JSON.stringify(e);
+        const e = JSON.parse(o);
+        return z(e) ? e : [
+          typeof e == "string" ? { role: "text", content: e } : "text" in e ? { role: "text", content: e.text } : { role: "text", content: JSON.stringify(e) }
+        ];
       } catch {
-        return a;
+        return [{ role: "text", content: o }];
       }
     });
-  if (r === "application/zip") {
-    const i = await x.loadAsync(t), a = [];
+  if (n === "application/zip") {
+    const i = await m.loadAsync(t), o = [];
     for (const e of Object.keys(i.files)) {
-      const o = i.file(e);
-      if (o) {
-        const c = await o.async("blob"), p = await z(new File([c], e), n);
-        a.push(...p);
+      const a = i.file(e);
+      if (a) {
+        const c = await a.async("blob"), s = await j(new File([c], e), r);
+        o.push(...s);
       }
     }
-    return a;
+    return o;
   }
-  if (r === "text/csv") {
+  if (n === "text/csv") {
     const i = await t.text();
-    return new Promise((a, e) => {
+    return new Promise((o, e) => {
       u.parse(i, {
         header: !1,
         skipEmptyLines: !0,
         delimiter: ",",
-        complete: (o) => {
-          if (o.errors.length > 0)
-            console.error(o.errors), e(new Error("Error parsing file"));
+        complete: (a) => {
+          if (a.errors.length > 0)
+            console.error(a.errors), e(new Error("Error parsing file"));
           else {
-            const c = w(o.data[0], n?.column || "text"), s = n?.hasHeader ?? y(o.data[0]) ? o.data.slice(1) : o.data;
-            a(s.map((l) => l[c]));
+            const c = y(a.data[0], r?.column || "text"), p = r?.hasHeader ?? w(a.data[0]) ? a.data.slice(1) : a.data;
+            o(p.map((l) => [{ role: "text", content: l[c] }]));
           }
         },
-        error: (o) => {
-          e(o);
+        error: (a) => {
+          e(a);
         }
       });
     });
-  } else if (r === "text/plain")
-    return [await t.text()];
-  throw new Error(`Unsupported file type: ${r}`);
+  } else if (n === "text/plain")
+    return [[{ role: "text", content: await t.text() }]];
+  throw new Error(`Unsupported file type: ${n}`);
 }
 export {
-  z as default
+  j as default
 };

package/dist/tokeniser/BaseTokeniser.d.ts CHANGED Viewed

@@ -12,7 +12,7 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
     isSpecialToken(index: number): boolean;
     protected addSpecialTokens(): void;
     protected addSpecialToken(token: string, index: number): void;
-    abstract train(text: string[], cb?: (vocab: number) => void): Promise<number>;
+    abstract train(text: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
     abstract getVocab(): string[];
     abstract getMerges(): [string, string][];
     abstract destroy(): void;
@@ -21,6 +21,6 @@ export default abstract class BaseTokeniser extends EE<'trainStatus'> implements
     encodeAsSequence(conversation: Conversation[], completion?: boolean): number[];
     encodeConversation(conversation: Conversation[], completion?: boolean): number[];
     abstract decode(tokens: number[]): string;
-    decodeConversation(tokens: number[]): Conversation[];
+    decodeConversation(tokens: number[] | Uint16Array): Conversation[];
     getSpecialTokenIndex(token: string): number | undefined;
 }

package/dist/tokeniser/BaseTokeniser.js CHANGED Viewed

@@ -11,30 +11,30 @@ const h = [
   "<|system_start|>",
   "<|system_end|>"
 ];
-class l extends r {
+class k extends r {
   specialTokens = /* @__PURE__ */ new Map();
   specialTokenSet = /* @__PURE__ */ new Set();
-  isSpecialToken(e) {
-    return this.specialTokenSet.has(e);
+  isSpecialToken(s) {
+    return this.specialTokenSet.has(s);
   }
   addSpecialTokens() {
-    h.forEach((e, t) => {
-      this.addToken(e, t), this.specialTokens.set(e, t), this.specialTokenSet.add(t);
+    h.forEach((s, t) => {
+      this.addToken(s, t), this.specialTokens.set(s, t), this.specialTokenSet.add(t);
     });
   }
-  addSpecialToken(e, t) {
-    this.specialTokens.set(e, t), this.specialTokenSet.add(t);
+  addSpecialToken(s, t) {
+    this.specialTokens.set(s, t), this.specialTokenSet.add(t);
   }
-  encodeSequence(e) {
-    const t = this.encode(e);
+  encodeSequence(s) {
+    const t = this.encode(s);
     return [this.bosToken, ...t, this.eosToken];
   }
-  encodeAsSequence(e, t) {
-    const s = e.flatMap((o) => this.encode(o.content));
-    return t ? [this.bosToken, ...s, this.eosToken, this.bosToken] : [this.bosToken, ...s, this.eosToken];
+  encodeAsSequence(s, t) {
+    const e = s.flatMap((o) => this.encode(o.content));
+    return t ? [this.bosToken, ...e, this.eosToken, this.bosToken] : [this.bosToken, ...e, this.eosToken];
   }
-  encodeConversation(e, t) {
-    const s = [[this.bosToken]], o = [
+  encodeConversation(s, t) {
+    const e = [[this.bosToken]], o = [
       this.getSpecialTokenIndex("<|user_start|>"),
       this.getSpecialTokenIndex("<|assistant_start|>"),
       this.getSpecialTokenIndex("<|system_start|>")
@@ -43,57 +43,57 @@ class l extends r {
       this.getSpecialTokenIndex("<|assistant_end|>"),
       this.getSpecialTokenIndex("<|system_end|>")
     ];
-    for (const i of e) {
+    for (const i of s) {
       const c = this.encode(i.content);
       switch (i.role) {
         case "user":
-          s.push([o[0]]);
+          e.push([o[0]]);
           break;
         case "assistant":
-          s.push([o[1]]);
+          e.push([o[1]]);
           break;
         case "system":
-          s.push([o[2]]);
+          e.push([o[2]]);
           break;
       }
-      switch (s.push(c), i.role) {
+      switch (e.push(c), i.role) {
         case "user":
-          s.push([n[0]]);
+          e.push([n[0]]);
           break;
         case "assistant":
-          s.push([n[1]]);
+          e.push([n[1]]);
           break;
         case "system":
-          s.push([n[2]]);
+          e.push([n[2]]);
           break;
       }
     }
-    const a = s.flat();
+    const a = e.flat();
     return t ? a.push(o[1]) : a.push(this.eosToken), a;
   }
-  decodeConversation(e) {
+  decodeConversation(s) {
     const t = [];
-    let s = 0;
-    for (; s < e.length; ) {
-      const o = e[s];
+    let e = 0;
+    for (; e < s.length; ) {
+      const o = s[e];
       let n = null;
-      if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") && (n = "system"), n) {
-        s++;
+      if (o === this.getSpecialTokenIndex("<|user_start|>") ? n = "user" : o === this.getSpecialTokenIndex("<|assistant_start|>") ? n = "assistant" : o === this.getSpecialTokenIndex("<|system_start|>") ? n = "system" : o === this.bosToken || (o === this.eosToken ? n = null : (n = "text", e--)), n) {
+        e++;
         const a = [];
-        for (; s < e.length && e[s] !== this.getSpecialTokenIndex(`<|${n}_end|>`); )
-          a.push(e[s]), s++;
+        for (; e < s.length && s[e] !== this.getSpecialTokenIndex(`<|${n}_end|>`) && s[e] !== this.eosToken; )
+          a.push(s[e]), e++;
         const i = this.decode(a);
         t.push({ role: n, content: i });
       }
-      s++;
+      e++;
     }
     return t;
   }
-  getSpecialTokenIndex(e) {
-    return this.specialTokens.get(e);
+  getSpecialTokenIndex(s) {
+    return this.specialTokens.get(s);
   }
 }
 export {
   h as SPECIALS,
-  l as default
+  k as default
 };

package/dist/tokeniser/CharTokeniser.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import { default as BaseTokeniser } from './BaseTokeniser';
+import { Conversation } from './type';
 export default class CharTokeniser extends BaseTokeniser {
     vocabSize: number;
     eosToken: number;
@@ -11,7 +12,7 @@ export default class CharTokeniser extends BaseTokeniser {
     addToken(token: string, index?: number): number;
     get trained(): boolean;
     destroy(): void;
-    train(text: string[]): Promise<number>;
+    train(text: Conversation[][]): Promise<number>;
     tokenise(text: string[], numeric: true): number[][];
     tokenise(text: string[]): string[][];
     detokenise(tokens: (number[] | Uint16Array)[]): string[];

package/dist/tokeniser/CharTokeniser.js CHANGED Viewed

@@ -40,32 +40,32 @@ class T extends k {
     this.cache.clear(), this.vocab = [];
   }
   async train(i) {
-    const t = i.map((n) => n.split("")).flat(), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), o = this.vocabSize - u.length;
+    const t = i.map((o) => o.map((n) => n.content.split(""))).flat(2), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - u.length;
     if (h === -1)
       return this.vocabSize;
-    if (this._trained = !0, s.length > o) {
-      const n = /* @__PURE__ */ new Map();
-      t.forEach((a) => {
-        n.set(a, (n.get(a) || 0) + 1);
-      }), s.sort((a, r) => (n.get(a) || 0) - (n.get(r) || 0)), s.splice(0, s.length - o);
+    if (this._trained = !0, s.length > a) {
+      const o = /* @__PURE__ */ new Map();
+      t.forEach((n) => {
+        o.set(n, (o.get(n) || 0) + 1);
+      }), s.sort((n, r) => (o.get(n) || 0) - (o.get(r) || 0)), s.splice(0, s.length - a);
     }
     let c = h;
     if (c !== -1) {
-      const n = new Set(this.vocab);
-      for (const a of s)
-        if (!n.has(a) && (this.vocab[c] = a, n.add(a), c = this.vocab.indexOf("", c + 1), c === -1))
+      const o = new Set(this.vocab);
+      for (const n of s)
+        if (!o.has(n) && (this.vocab[c] = n, o.add(n), c = this.vocab.indexOf("", c + 1), c === -1))
           break;
     }
-    return this.cache.clear(), this.vocab.forEach((n, a) => {
-      this.cache.set(n, a);
+    return this.cache.clear(), this.vocab.forEach((o, n) => {
+      this.cache.set(o, n);
     }), this.emit("trainStatus", "trained"), this.vocabSize;
   }
   tokenise(i, t) {
     if (!this.trained)
       throw new Error("Tokeniser not trained");
     return i.map((s) => t ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
-      const o = this.cache.get(h);
-      return o !== void 0 ? this.vocab[o] : "";
+      const a = this.cache.get(h);
+      return a !== void 0 ? this.vocab[a] : "";
     }));
   }
   detokenise(i) {
@@ -85,8 +85,8 @@ class T extends k {
   }
   async createTrainingData(i, t = 5) {
     const e = await this.tokenise(i, !0), s = [], h = [];
-    for (let o = 0; o < e.length - t; o++)
-      s.push(...e[o].slice(0, t)), h.push(e[o + 1][0]);
+    for (let a = 0; a < e.length - t; a++)
+      s.push(...e[a].slice(0, t)), h.push(e[a + 1][0]);
     return [s, h];
   }
 }

package/dist/tokeniser/bpe.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import { default as BaseTokeniser } from './BaseTokeniser';
+import { Conversation } from './type';
 export default class BPETokeniser extends BaseTokeniser {
     private targetSize;
     private vocab;
@@ -14,7 +15,7 @@ export default class BPETokeniser extends BaseTokeniser {
     get eosToken(): number;
     get bosToken(): number;
     get unkToken(): number;
-    train(text: string[], cb?: (vocab: number) => void): Promise<number>;
+    train(text?: Conversation[][], cb?: (vocab: number) => void): Promise<number>;
     getVocab(): string[];
     getMerges(): [string, string][];
     private tokeniseWord;