npm - bpe-lite - Versions diffs - 0.3.1 → 0.4.0 - Mend

bpe-lite 0.3.1 → 0.4.0

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (9) hide show

package/src/spm.js CHANGED Viewed

@@ -6,72 +6,194 @@
  * Merges are ordered list of "token_a token_b" strings — applied by rank (index = priority).
  */
+const { MinHeap } = require('./bpe');
 const SPACE_CHAR = '\u2581'; // ▁
+function buildPreparedSPM(vocabData) {
+  const { vocab, merges } = vocabData;
+  const mergeRank = new Map();
+  for (let i = 0; i < merges.length; i++) {
+    mergeRank.set(merges[i], i);
+  }
+  const idToStr = new Map();
+  for (const [str, id] of Object.entries(vocab)) {
+    idToStr.set(id, str);
+  }
+  return {
+    vocab,
+    merges,
+    mergeRank,
+    idToStr,
+    // opt A — segment-level cache: each ▁-prefixed word segment → ids[]
+    // Generalises across different inputs (same words reused across texts).
+    // Note: 1 of 514,906 Gemma merges crosses a ▁ boundary ("> ▁</"),
+    // making this negligibly imprecise for that HTML pattern.
+    cache: new Map(),
+    // opt B — per-instance grow-only scratch
+    scratch: { str: null, ids: null, prev: null, next: null, ver: null, alive: null, cap: 0, heap: new MinHeap() },
+  };
+}
+// opt B — grow SPM scratch arrays only when needed
+function ensureScratch(scratch, n) {
+  if (n <= scratch.cap) return;
+  const cap = n * 2;
+  scratch.str   = new Array(cap);
+  scratch.ids   = new Int32Array(cap);
+  scratch.prev  = new Int32Array(cap);
+  scratch.next  = new Int32Array(cap);
+  scratch.ver   = new Int32Array(cap);
+  scratch.alive = new Uint8Array(cap);
+  scratch.cap   = cap;
+}
 /**
  * Encode text using SentencePiece BPE.
  * @param {string} text
- * @param {Object} vocabData — { engine, vocab, merges }
+ * @param {Object} vocabData  — { engine, vocab, merges }
  * @returns {number[]}
  */
 function encodeSPM(text, vocabData) {
-  if (!text) return [];
-  const { vocab, merges } = vocabData;
+  return encodeSPMPrepared(text, buildPreparedSPM(vocabData));
+}
-  // Build merge rank map: "a b" → rank index (lower = higher priority)
-  const mergeRank = new Map();
-  for (let i = 0; i < merges.length; i++) {
-    mergeRank.set(merges[i], i);
+/**
+ * Hot path: scan normalized text purely from cache.
+ * Returns true if every segment was a cache hit; false on first miss.
+ * Isolated from encodeSegment so V8 can keep this function optimised
+ * even when encodeSPMPrepared is called with wildly different text lengths.
+ */
+function _scanFromCache(normalized, cache, result) {
+  let segStart = 0;
+  for (let i = 1; i <= normalized.length; i++) {
+    if (i === normalized.length || normalized[i] === SPACE_CHAR) {
+      const seg = normalized.slice(segStart, i);
+      const segIds = cache.get(seg);
+      if (segIds === undefined) return false; // cache miss → caller handles cold path
+      for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
+      segStart = i;
+    }
   }
+  return true;
+}
-  // Build reverse vocab: id → string (for decode)
-  // vocab: { tokenString: id }
+// Cold-path helper — kept separate so it is never inlined into the hot loop.
+function _encodeAndCache(seg, vocab, mergeRank, scratch, cache) {
+  const ids = encodeSegment(seg, vocab, mergeRank, scratch);
+  cache.set(seg, ids);
+  return ids;
+}
+function encodeSPMPrepared(text, prepared) {
+  if (!text) return [];
+  const { vocab, mergeRank, scratch, cache } = prepared;
   // Normalize: replace spaces with ▁, prepend ▁
   const normalized = SPACE_CHAR + text.replace(/ /g, SPACE_CHAR);
-  // Split into individual Unicode characters
-  const chars = [...normalized];
+  // Fast path: serve every segment from the segment cache.
+  // After the first call, this path handles all subsequent calls for common text.
+  const result = [];
+  if (_scanFromCache(normalized, cache, result)) return result;
+  // Cold path: at least one segment is missing — encode everything from scratch.
+  // (Simpler to re-scan than to continue from the miss point.)
+  result.length = 0;
+  let segStart = 0;
+  for (let i = 1; i <= normalized.length; i++) {
+    if (i === normalized.length || normalized[i] === SPACE_CHAR) {
+      const seg = normalized.slice(segStart, i);
+      const segIds = cache.get(seg) ?? _encodeAndCache(seg, vocab, mergeRank, scratch, cache);
+      for (let j = 0; j < segIds.length; j++) result.push(segIds[j]);
+      segStart = i;
+    }
+  }
+  return result;
+}
+// Encode a single ▁-prefixed segment using MinHeap BPE.
+function encodeSegment(seg, vocab, mergeRank, scratch) {
+  const chars = [...seg];
+  const n = chars.length;
+  ensureScratch(scratch, n);
+  const { str, ids, prev, next, ver, alive, heap } = scratch;
+  heap.reset();
+  for (let i = 0; i < n; i++) {
+    const c = chars[i];
+    prev[i]  = i - 1;
+    next[i]  = i + 1;
+    ver[i]   = 0;
+    alive[i] = 1;
-  // Map each character to a token (may be a multi-byte char)
-  let tokens = chars.map(c => {
     if (vocab[c] !== undefined) {
-      return { str: c, id: vocab[c] };
-    }
-    // Try byte fallback: <0xNN>
-    const codePoint = c.codePointAt(0);
-    const hex = codePoint.toString(16).toUpperCase().padStart(2, '0');
-    const byteKey = `<0x${hex}>`;
-    if (vocab[byteKey] !== undefined) {
-      return { str: byteKey, id: vocab[byteKey] };
-    }
-    // Unknown — use UNK (id 3 in Gemma) or skip
-    return { str: c, id: vocab['<unk>'] ?? 0 };
-  });
-  // Greedy BPE merges: find pair with lowest merge rank, merge, repeat
-  while (tokens.length >= 2) {
-    let bestRank = Infinity;
-    let bestIdx = -1;
-    for (let i = 0; i < tokens.length - 1; i++) {
-      const key = `${tokens[i].str} ${tokens[i + 1].str}`;
-      const rank = mergeRank.get(key);
-      if (rank !== undefined && rank < bestRank) {
-        bestRank = rank;
-        bestIdx = i;
+      str[i] = c;
+      ids[i] = vocab[c];
+    } else {
+      const codePoint = c.codePointAt(0);
+      const hex = codePoint.toString(16).toUpperCase().padStart(2, '0');
+      const byteKey = `<0x${hex}>`;
+      if (vocab[byteKey] !== undefined) {
+        str[i] = byteKey;
+        ids[i] = vocab[byteKey];
+      } else {
+        str[i] = c;
+        ids[i] = vocab['<unk>'] ?? 0;
       }
     }
+  }
+  next[n - 1] = -1;
+  for (let i = 0; i < n - 1; i++) {
+    const rank = mergeRank.get(`${str[i]} ${str[i + 1]}`);
+    if (rank !== undefined) heap.push(rank, i, i + 1, ver[i], ver[i + 1]);
+  }
+  while (heap.size > 0) {
+    const top = heap.pop();
+    if (!top) break;
+    const { left, right, verL, verR } = top;
-    if (bestIdx === -1) break;
+    if (!alive[left] || !alive[right]) continue;
+    if (next[left] !== right) continue;
+    if (ver[left] !== verL || ver[right] !== verR) continue;
-    const merged = tokens[bestIdx].str + tokens[bestIdx + 1].str;
-    const mergedId = vocab[merged] ?? vocab['<unk>'] ?? 0;
-    tokens.splice(bestIdx, 2, { str: merged, id: mergedId });
+    str[left] = str[left] + str[right];
+    ids[left] = vocab[str[left]] ?? vocab['<unk>'] ?? 0;
+    ver[left]++;
+    alive[right] = 0;
+    ver[right]++;
+    const nr = next[right];
+    next[left] = nr;
+    if (nr !== -1) prev[nr] = left;
+    const pl = prev[left];
+    if (pl !== -1 && alive[pl]) {
+      const r = mergeRank.get(`${str[pl]} ${str[left]}`);
+      if (r !== undefined) heap.push(r, pl, left, ver[pl], ver[left]);
+    }
+    const nl = next[left];
+    if (nl !== -1 && alive[nl]) {
+      const r = mergeRank.get(`${str[left]} ${str[nl]}`);
+      if (r !== undefined) heap.push(r, left, nl, ver[left], ver[nl]);
+    }
   }
-  return tokens.map(t => t.id);
+  const result = [];
+  let i = 0;
+  while (i !== -1) {
+    if (alive[i]) result.push(ids[i]);
+    i = next[i];
+  }
+  return result;
 }
 /**
@@ -81,20 +203,16 @@ function encodeSPM(text, vocabData) {
  * @returns {string}
  */
 function decodeSPM(ids, vocabData) {
-  if (!ids || ids.length === 0) return '';
-  const { vocab } = vocabData;
+  return decodeSPMPrepared(ids, buildPreparedSPM(vocabData));
+}
-  // Build id → string map
-  const idToStr = new Map();
-  for (const [str, id] of Object.entries(vocab)) {
-    idToStr.set(id, str);
-  }
+function decodeSPMPrepared(ids, prepared) {
+  if (!ids || ids.length === 0) return '';
   let result = '';
-  for (const id of ids) {
-    const str = idToStr.get(id) ?? '';
-    // Handle byte fallbacks like <0x41> → 'A'
+  for (let i = 0; i < ids.length; i++) {
+    const id = ids[i];
+    const str = prepared.idToStr.get(id) ?? '';
     const byteMatch = str.match(/^<0x([0-9A-Fa-f]{2})>$/);
     if (byteMatch) {
       result += String.fromCharCode(parseInt(byteMatch[1], 16));
@@ -107,4 +225,4 @@ function decodeSPM(ids, vocabData) {
   return result.replace(new RegExp(SPACE_CHAR, 'g'), ' ').replace(/^ /, '');
 }
-module.exports = { encodeSPM, decodeSPM };
+module.exports = { buildPreparedSPM, encodeSPM, decodeSPM, encodeSPMPrepared, decodeSPMPrepared };

package/src/tokenizer.js CHANGED Viewed

@@ -1,29 +1,44 @@
 'use strict';
-const { encodeTiktoken, decodeTiktoken, countTiktokenUpTo } = require('./bpe');
-const { encodeSPM, decodeSPM } = require('./spm');
+const {
+  buildPreparedTiktoken,
+  encodeTiktokenPrepared,
+  decodeTiktokenPrepared,
+  countTiktokenPrepared,
+  countTiktokenUpToPrepared,
+} = require('./bpe');
+const { buildPreparedSPM, encodeSPMPrepared, decodeSPMPrepared } = require('./spm');
 class Tokenizer {
   constructor(vocabData) {
     this._data = vocabData;
     this._engine = vocabData.engine;
+    this._preparedTiktoken = null;
+    this._preparedSPM = null;
     if (this._engine !== 'tiktoken' && this._engine !== 'spm') {
       throw new Error(`Unknown tokenizer engine: ${this._engine}`);
     }
+    if (this._engine === 'tiktoken') {
+      this._preparedTiktoken = buildPreparedTiktoken(vocabData);
+    } else {
+      this._preparedSPM = buildPreparedSPM(vocabData);
+    }
   }
   encode(text) {
-    if (this._engine === 'tiktoken') return encodeTiktoken(text, this._data);
-    return encodeSPM(text, this._data);
+    if (this._engine === 'tiktoken') return encodeTiktokenPrepared(text, this._preparedTiktoken);
+    return encodeSPMPrepared(text, this._preparedSPM);
   }
   decode(ids) {
-    if (this._engine === 'tiktoken') return decodeTiktoken(ids, this._data);
-    return decodeSPM(ids, this._data);
+    if (this._engine === 'tiktoken') return decodeTiktokenPrepared(ids, this._preparedTiktoken);
+    return decodeSPMPrepared(ids, this._preparedSPM);
   }
   count(text) {
+    if (this._engine === 'tiktoken') return countTiktokenPrepared(text, this._preparedTiktoken);
     return this.encode(text).length;
   }
@@ -35,7 +50,7 @@ class Tokenizer {
    * @returns {number}
    */
   countUpTo(text, limit) {
-    if (this._engine === 'tiktoken') return countTiktokenUpTo(text, this._data, limit);
+    if (this._engine === 'tiktoken') return countTiktokenUpToPrepared(text, this._preparedTiktoken, limit);
     // SPM encodes the whole text as one unit — no clean early exit, just encode and count
     return this.encode(text).length;
   }

package/vocabs/anthropic.json.gz ADDED Viewed

Binary file

package/vocabs/openai.json.gz ADDED Viewed

Binary file