npm - @genai-fi/nanogpt - Versions diffs - 0.17.4 → 0.17.5 - Mend

@genai-fi/nanogpt 0.17.4 → 0.17.5

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/tokeniser/CharTokeniser.js +53 -42
package/dist/tokeniser/bpe.js +36 -31
package/package.json +1 -1

package/dist/tokeniser/CharTokeniser.js CHANGED Viewed

@@ -1,6 +1,7 @@
-import k, { SPECIALS as d } from "./BaseTokeniser.js";
-const u = ["<eos>", "<unk>"];
-class T extends k {
+import { yieldIfNeeded as d } from "../utilities/yielder.js";
+import f, { SPECIALS as u } from "./BaseTokeniser.js";
+const b = ["<eos>", "<unk>"];
+class p extends f {
   vocabSize = 0;
   eosToken = 0;
   bosToken = 0;
@@ -8,30 +9,30 @@ class T extends k {
   vocab = [];
   cache = /* @__PURE__ */ new Map();
   _trained = !1;
-  constructor(i) {
-    if (super(), Array.isArray(i)) {
-      if (this.vocab = i, this.vocab.length > 0)
-        this.vocabSize = this.vocab.length, d.forEach((t) => {
-          const e = this.vocab.indexOf(t);
-          e !== -1 && this.addSpecialToken(t, e);
-        }), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex("") ?? -1, this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<unk>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<pad>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("_")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf(" ")), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab = this.vocab.map((t) => t === "<pad>" ? "" : t), this.vocab.forEach((t, e) => {
-          this.cache.set(t, e);
+  constructor(t) {
+    if (super(), Array.isArray(t)) {
+      if (this.vocab = t, this.vocab.length > 0)
+        this.vocabSize = this.vocab.length, u.forEach((i) => {
+          const e = this.vocab.indexOf(i);
+          e !== -1 && this.addSpecialToken(i, e);
+        }), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex("") ?? -1, this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<unk>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("<pad>")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf("_")), this.unkToken === -1 && (this.unkToken = this.vocab.indexOf(" ")), this.unkToken === -1 && (this.unkToken = this.eosToken), this.vocab = this.vocab.map((i) => i === "<pad>" ? "" : i), this.vocab.forEach((i, e) => {
+          this.cache.set(i, e);
         });
       else
         throw new Error("Vocab cannot be empty");
       this._trained = !0;
     } else
-      this.vocabSize = i, this.vocab = new Array(this.vocabSize).fill(""), this.addSpecialTokens(), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex(""), this.vocab.forEach((t, e) => {
-        this.cache.set(t, e);
+      this.vocabSize = t, this.vocab = new Array(this.vocabSize).fill(""), this.addSpecialTokens(), this.eosToken = this.getSpecialTokenIndex("<eos>"), this.bosToken = this.getSpecialTokenIndex("<bos>") ?? this.eosToken, this.unkToken = this.getSpecialTokenIndex(""), this.vocab.forEach((i, e) => {
+        this.cache.set(i, e);
       }), this.cache.set("", this.unkToken);
   }
-  addToken(i, t) {
-    if (this.cache.has(i))
-      return this.cache.get(i);
+  addToken(t, i) {
+    if (this.cache.has(t))
+      return this.cache.get(t);
     let e;
-    if (t !== void 0 ? e = t : (e = this.vocab.indexOf("", this.unkToken + 1), e === -1 && (e = this.vocabSize)), e >= this.vocabSize)
+    if (i !== void 0 ? e = i : (e = this.vocab.indexOf("", this.unkToken + 1), e === -1 && (e = this.vocabSize)), e >= this.vocabSize)
       throw new Error("Vocab size exceeded");
-    return this.vocab[e] = i, this.cache.set(i, e), e;
+    return this.vocab[e] = t, this.cache.set(t, e), e;
   }
   get trained() {
     return this.vocab.length === this.vocabSize && this._trained;
@@ -39,43 +40,53 @@ class T extends k {
   destroy() {
     this.cache.clear(), this.vocab = [];
   }
-  async train(i) {
-    const t = i.map((o) => o.map((n) => n.content.split(""))).flat(2), e = new Set(t), s = Array.from(e), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - u.length;
+  async train(t) {
+    const i = /* @__PURE__ */ new Set();
+    let e = performance.now();
+    for (const n of t)
+      n.forEach((o) => {
+        for (const r of o.content)
+          i.add(r);
+      }), e = await d(e);
+    const s = Array.from(i), h = this.vocab.indexOf("", this.unkToken + 1), a = this.vocabSize - b.length;
     if (h === -1)
       return this.vocabSize;
     if (this._trained = !0, s.length > a) {
-      const o = /* @__PURE__ */ new Map();
-      t.forEach((n) => {
-        o.set(n, (o.get(n) || 0) + 1);
-      }), s.sort((n, r) => (o.get(n) || 0) - (o.get(r) || 0)), s.splice(0, s.length - a);
+      const n = /* @__PURE__ */ new Map();
+      t.forEach((o) => {
+        o.forEach((r) => {
+          for (const k of r.content)
+            n.set(k, (n.get(k) || 0) + 1);
+        });
+      }), s.sort((o, r) => (n.get(o) || 0) - (n.get(r) || 0)), s.splice(0, s.length - a);
     }
     let c = h;
     if (c !== -1) {
-      const o = new Set(this.vocab);
-      for (const n of s)
-        if (!o.has(n) && (this.vocab[c] = n, o.add(n), c = this.vocab.indexOf("", c + 1), c === -1))
+      const n = new Set(this.vocab);
+      for (const o of s)
+        if (!n.has(o) && (this.vocab[c] = o, n.add(o), c = this.vocab.indexOf("", c + 1), c === -1))
           break;
     }
-    return this.cache.clear(), this.vocab.forEach((o, n) => {
-      this.cache.set(o, n);
+    return this.cache.clear(), this.vocab.forEach((n, o) => {
+      this.cache.set(n, o);
     }), this.emit("trainStatus", "trained"), this.vocabSize;
   }
-  tokenise(i, t) {
+  tokenise(t, i) {
     if (!this.trained)
       throw new Error("Tokeniser not trained");
-    return i.map((s) => t ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
+    return t.map((s) => i ? s.split("").map((h) => this.cache.get(h) ?? this.unkToken) : s.split("").map((h) => {
       const a = this.cache.get(h);
       return a !== void 0 ? this.vocab[a] : "";
     }));
   }
-  detokenise(i) {
-    return i.map((e) => Array.from(e).map((s) => this.vocab[s] || "").join(""));
+  detokenise(t) {
+    return t.map((e) => Array.from(e).map((s) => this.vocab[s] || "").join(""));
   }
-  encode(i) {
-    return this.tokenise([i], !0)[0];
+  encode(t) {
+    return this.tokenise([t], !0)[0];
   }
-  decode(i) {
-    return this.detokenise([i])[0];
+  decode(t) {
+    return this.detokenise([t])[0];
   }
   getVocab() {
     return this.vocab;
@@ -83,13 +94,13 @@ class T extends k {
   getMerges() {
     return [];
   }
-  async createTrainingData(i, t = 5) {
-    const e = await this.tokenise(i, !0), s = [], h = [];
-    for (let a = 0; a < e.length - t; a++)
-      s.push(...e[a].slice(0, t)), h.push(e[a + 1][0]);
+  async createTrainingData(t, i = 5) {
+    const e = await this.tokenise(t, !0), s = [], h = [];
+    for (let a = 0; a < e.length - i; a++)
+      s.push(...e[a].slice(0, i)), h.push(e[a + 1][0]);
     return [s, h];
   }
 }
 export {
-  T as default
+  p as default
 };

package/dist/tokeniser/bpe.js CHANGED Viewed

@@ -1,15 +1,15 @@
-import { yieldIfNeeded as f } from "../utilities/yielder.js";
+import { yieldIfNeeded as p } from "../utilities/yielder.js";
 import m from "../utilities/tokenParse.js";
-import z, { SPECIALS as k } from "./BaseTokeniser.js";
-function p(o, e) {
+import w, { SPECIALS as S } from "./BaseTokeniser.js";
+function g(o, e) {
   return `${o}-::-${e}`;
 }
-function w(o) {
+function T(o) {
   const e = /* @__PURE__ */ new Map();
   for (let s = 0; s < o.length; s++) {
     const t = o[s];
     for (let n = 0; n < t.length - 1; n++) {
-      const r = p(t[n], t[n + 1]), a = e.get(r) || {
+      const r = g(t[n], t[n + 1]), a = e.get(r) || {
         a: t[n],
         b: t[n + 1],
         count: 0,
@@ -20,21 +20,21 @@ function w(o) {
   }
   return { pairs: e, tokens: o };
 }
-function d(o, e, s, t, n) {
-  const r = p(e, s);
+function f(o, e, s, t, n) {
+  const r = g(e, s);
   if (o.pairs.has(r)) {
     const a = o.pairs.get(r);
     a.count += n, n > 0 ? a.instances.add(t) : a.count <= 0 ? o.pairs.delete(r) : a.instances.delete(t);
   } else
     o.pairs.set(r, { a: e, b: s, count: n, instances: /* @__PURE__ */ new Set([t]) });
 }
-function T(o) {
+function y(o) {
   let e = null, s = 0;
   for (const t of o.pairs.values())
     t.count > s && (s = t.count, e = t);
   return e;
 }
-function y(o, e) {
+function I(o, e) {
   return o.map((s) => {
     const t = [];
     for (let n = 0; n < s.length; n++)
@@ -42,19 +42,19 @@ function y(o, e) {
     return t;
   });
 }
-function I(o, e) {
+function x(o, e) {
   e.instances.forEach((s) => {
     const t = o.tokens[s], n = [];
     for (let r = 0; r < t.length; r++)
       if (r < t.length - 1 && t[r] === e.a && t[r + 1] === e.b) {
         const a = e.a + e.b;
-        n.push(a), r > 0 && (d(o, t[r - 1], e.a, s, -1), d(o, t[r - 1], a, s, 1)), r++, r < t.length - 1 && (d(o, e.b, t[r + 1], s, -1), d(o, a, t[r + 1], s, 1));
+        n.push(a), r > 0 && (f(o, t[r - 1], e.a, s, -1), f(o, t[r - 1], a, s, 1)), r++, r < t.length - 1 && (f(o, e.b, t[r + 1], s, -1), f(o, a, t[r + 1], s, 1));
       } else
         n.push(t[r]);
     o.tokens[s] = n;
-  }), o.pairs.delete(p(e.a, e.b));
+  }), o.pairs.delete(g(e.a, e.b));
 }
-class E extends z {
+class C extends w {
   targetSize;
   vocab = /* @__PURE__ */ new Set();
   vocabIndex = /* @__PURE__ */ new Map();
@@ -63,7 +63,7 @@ class E extends z {
   constructor(e, s) {
     super(), Array.isArray(e) ? (e.forEach((t, n) => {
       this.vocab.add(t), this.vocabIndex.set(t, n);
-    }), s && (this.merges = s), this.targetSize = e.length, k.forEach((t) => {
+    }), s && (this.merges = s), this.targetSize = e.length, S.forEach((t) => {
       const n = e.indexOf(t);
       n !== -1 && this.addSpecialToken(t, n);
     })) : (this.addSpecialTokens(), this.targetSize = e);
@@ -81,7 +81,7 @@ class E extends z {
     this.vocab.clear(), this.vocabIndex.clear(), this.merges = [], this.pretokenMap.clear();
   }
   get trained() {
-    return this.vocab.size > k.length && this.vocab.size <= this.targetSize;
+    return this.vocab.size > S.length && this.vocab.size <= this.targetSize;
   }
   get vocabSize() {
     return this.vocab.size;
@@ -97,39 +97,44 @@ class E extends z {
   }
   async train(e = [], s) {
     let t = performance.now();
-    const n = e.map((i) => i.map((h) => m(h.content))).flat(2);
-    t = await f(t, s, this.vocab.size);
-    const r = new Set(n);
+    const n = new Array(e.length);
+    for (let i = 0; i < e.length; i++) {
+      const h = e[i], l = new Array(h.length);
+      for (let d = 0; d < h.length; d++)
+        l[d] = m(h[d].content);
+      t = await p(t, s, this.vocab.size), n[i] = l;
+    }
+    const r = n.flat(2), a = new Set(r);
     this.vocab = /* @__PURE__ */ new Set(), this.pretokenMap.clear(), this.merges = [], this.addSpecialTokens();
-    const a = Array.from(r), b = a.map((i) => Array.from(i).map((l) => (this.vocab.add(l), l))), g = w(b);
-    if (t = await f(t, s, this.vocab.size), this.vocab.size >= this.targetSize) {
+    const b = Array.from(a), v = b.map((i) => Array.from(i).map((l) => (this.vocab.add(l), l))), k = T(v);
+    if (t = await p(t, s, this.vocab.size), this.vocab.size >= this.targetSize) {
       console.warn("Initial vocab size is greater than or equal to target size. No merges will be performed.");
       const i = /* @__PURE__ */ new Map();
-      n.forEach((c) => {
+      r.forEach((c) => {
         Array.from(c).forEach((u) => {
           i.set(u, (i.get(u) || 0) + 1);
         });
       });
       const h = Array.from(i.entries()).sort((c, u) => u[1] - c[1]);
       this.vocab = /* @__PURE__ */ new Set(), this.addSpecialTokens(), h.slice(0, this.targetSize - this.vocab.size).map(([c]) => c).forEach((c) => this.vocab.add(c)), this.vocabIndex.clear();
-      let S = 0;
+      let d = 0;
       for (const c of this.vocab.keys())
-        this.vocabIndex.set(c, S++);
+        this.vocabIndex.set(c, d++);
       return this.emit("trainStatus", "trained"), this.vocab.size;
     }
     for (; this.vocab.size < this.targetSize && this.merges.length < this.targetSize; ) {
-      const i = T(g);
+      const i = y(k);
       if (!i)
         break;
-      this.merges.push([i.a, i.b]), this.vocab.add(i.a + i.b), I(g, i), t = await f(t, s, this.vocab.size);
+      this.merges.push([i.a, i.b]), this.vocab.add(i.a + i.b), x(k, i), t = await p(t, s, this.vocab.size);
     }
-    a.forEach((i, h) => {
-      const l = b[h];
+    b.forEach((i, h) => {
+      const l = v[h];
       this.pretokenMap.set(i, l);
     }), this.vocabIndex.clear();
-    let v = 0;
+    let z = 0;
     for (const i of this.vocab.keys())
-      this.vocabIndex.set(i, v++);
+      this.vocabIndex.set(i, z++);
     return this.emit("trainStatus", "trained"), this.vocab.size;
   }
   getVocab() {
@@ -141,7 +146,7 @@ class E extends z {
   tokeniseWord(e) {
     let s = Array.from(e);
     return this.merges.forEach((t) => {
-      s = y([s], t)[0];
+      s = I([s], t)[0];
     }), this.pretokenMap.set(e, s), s;
   }
   tokeniseStrings(e) {
@@ -163,5 +168,5 @@ class E extends z {
   }
 }
 export {
-  E as default
+  C as default
 };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@genai-fi/nanogpt",
-    "version": "0.17.4",
+    "version": "0.17.5",
     "type": "module",
     "main": "dist/main.js",
     "types": "dist/main.d.ts",