npm - bpe-lite - Versions diffs - 0.3.1 → 0.4.0 - Mend

bpe-lite 0.3.1 → 0.4.0

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (9) hide show

package/src/bpe.js CHANGED Viewed

@@ -6,80 +6,264 @@
  * Lower rank = higher priority in merges.
  */
-// Byte → base64 token lookup (pre-built at module load)
-const BYTE_TOKENS = (() => {
+// Byte → single-byte "binary string" lookup (pre-built at module load)
+const BYTE_STRS = (() => {
   const out = new Array(256);
-  for (let i = 0; i < 256; i++) {
-    out[i] = Buffer.from([i]).toString('base64');
-  }
+  for (let i = 0; i < 256; i++) out[i] = String.fromCharCode(i);
   return out;
 })();
-/**
- * Encode text using tiktoken-style BPE.
- * @param {string} text
- * @param {Object} vocabData  — { engine, pattern, vocab, specialTokens }
- * @returns {number[]} token ids
- */
-function encodeTiktoken(text, vocabData) {
-  if (!text) return [];
+// opt D — shared UTF-8 encode buffer; one allocation for the process lifetime.
+// Node.js is single-threaded and all encode paths are synchronous, so this is safe.
+const _sb = { buf: Buffer.allocUnsafe(4096), cap: 4096 };
-  const { vocab, specialTokens = {}, pattern } = vocabData;
+function writeChunk(chunk) {
+  const maxNeeded = chunk.length * 4; // max 4 UTF-8 bytes per JS char
+  if (maxNeeded > _sb.cap) {
+    _sb.cap = maxNeeded * 2;
+    _sb.buf = Buffer.allocUnsafe(_sb.cap);
+  }
+  return _sb.buf.write(chunk, 0, 'utf8');
+}
-  // Build reverse map: rank → id (rank IS the id in tiktoken)
-  // vocab[base64token] = rank = token_id
+class MinHeap {
+  constructor() {
+    this.ranks = [];
+    this.left  = [];
+    this.right = [];
+    this.verL  = [];
+    this.verR  = [];
+  }
-  // Build special token map (string → id)
-  const specials = Object.entries(specialTokens);
+  // opt B — reset to empty without de-allocating internal arrays
+  reset() {
+    this.ranks.length = 0;
+    this.left.length  = 0;
+    this.right.length = 0;
+    this.verL.length  = 0;
+    this.verR.length  = 0;
+  }
-  const ids = [];
+  get size() { return this.ranks.length; }
-  // Split text around special tokens first
-  const pieces = splitOnSpecials(text, specials);
+  push(rank, left, right, verL, verR) {
+    const i = this.ranks.length;
+    this.ranks.push(rank);
+    this.left.push(left);
+    this.right.push(right);
+    this.verL.push(verL);
+    this.verR.push(verR);
+    this._siftUp(i);
+  }
-  for (const piece of pieces) {
-    if (piece.isSpecial) {
-      ids.push(piece.id);
-    } else {
-      // Apply regex pre-tokenization then BPE encode each chunk
-      const chunks = pretokenize(piece.text, pattern);
-      for (const chunk of chunks) {
-        const chunkIds = bpeEncode(chunk, vocab);
-        for (const id of chunkIds) ids.push(id);
-      }
+  pop() {
+    const n = this.ranks.length;
+    if (n === 0) return null;
+    const rank = this.ranks[0], left = this.left[0], right = this.right[0];
+    const verL = this.verL[0], verR = this.verR[0];
+    const last = n - 1;
+    if (last === 0) {
+      this.ranks.pop(); this.left.pop(); this.right.pop(); this.verL.pop(); this.verR.pop();
+      return { rank, left, right, verL, verR };
     }
+    this.ranks[0] = this.ranks[last]; this.left[0] = this.left[last];
+    this.right[0] = this.right[last]; this.verL[0] = this.verL[last]; this.verR[0] = this.verR[last];
+    this.ranks.pop(); this.left.pop(); this.right.pop(); this.verL.pop(); this.verR.pop();
+    this._siftDown(0);
+    return { rank, left, right, verL, verR };
   }
-  return ids;
+  _siftUp(i) {
+    while (i > 0) {
+      const p = (i - 1) >> 1;
+      if (this.ranks[p] <= this.ranks[i]) break;
+      this._swap(i, p);
+      i = p;
+    }
+  }
+  _siftDown(i) {
+    const n = this.ranks.length;
+    while (true) {
+      const l = i * 2 + 1;
+      if (l >= n) break;
+      const r = l + 1;
+      let m = l;
+      if (r < n && this.ranks[r] < this.ranks[l]) m = r;
+      if (this.ranks[i] <= this.ranks[m]) break;
+      this._swap(i, m);
+      i = m;
+    }
+  }
+  _swap(i, j) {
+    [this.ranks[i], this.ranks[j]] = [this.ranks[j], this.ranks[i]];
+    [this.left[i],  this.left[j]]  = [this.left[j],  this.left[i]];
+    [this.right[i], this.right[j]] = [this.right[j], this.right[i]];
+    [this.verL[i],  this.verL[j]]  = [this.verL[j],  this.verL[i]];
+    [this.verR[i],  this.verR[j]]  = [this.verR[j],  this.verR[i]];
+  }
 }
-/**
- * Decode token ids back to string.
- * @param {number[]} ids
- * @param {Object} vocabData
- * @returns {string}
- */
-function decodeTiktoken(ids, vocabData) {
-  if (!ids || ids.length === 0) return '';
+function compilePretokenizer(patternStr) {
+  if (!patternStr) return { type: 'none' };
-  const { vocab, specialTokens = {} } = vocabData;
+  const regexStr = patternStr
+    .replace(/\(\?i:/g, '(?:')
+    .replace(/\\p\{L\}/g, '\\p{L}')
+    .replace(/\\p\{N\}/g, '\\p{N}');
-  // Build id → bytes map
-  const idToBytes = new Map();
-  for (const [b64, rank] of Object.entries(vocab)) {
-    idToBytes.set(rank, Buffer.from(b64, 'base64'));
+  try {
+    return { type: 'regex', re: new RegExp(regexStr, 'guy') };
+  } catch {
+    try {
+      return { type: 'regex', re: new RegExp(regexStr, 'gi') };
+    } catch {
+      return { type: 'fallback' };
+    }
   }
-  for (const [str, id] of Object.entries(specialTokens)) {
-    idToBytes.set(id, Buffer.from(str, 'utf8'));
+}
+function pretokenize(text, compiled) {
+  if (!text) return [];
+  if (!compiled || compiled.type === 'none') return [text];
+  if (compiled.type === 'fallback') return text.match(/\S+|\s+/g) || [text];
+  return text.match(compiled.re) || [text];
+}
+function buildPreparedTiktoken(vocabData) {
+  const { vocab, specialTokens = {}, pattern } = vocabData;
+  const vocabBin = new Map();
+  let maxId = -1;
+  for (const [b64, id] of Object.entries(vocab)) {
+    const buf = Buffer.from(b64, 'base64');
+    vocabBin.set(buf.toString('latin1'), id);
+    if (id > maxId) maxId = id;
   }
-  const bufs = [];
-  for (const id of ids) {
-    const bytes = idToBytes.get(id);
-    if (bytes) bufs.push(bytes);
+  const specials = Object.entries(specialTokens);
+  for (const [, id] of specials) {
+    if (id > maxId) maxId = id;
   }
-  return Buffer.concat(bufs).toString('utf8');
+  const idToBytes = new Array(maxId + 1);
+  for (const [b64, id] of Object.entries(vocab)) {
+    idToBytes[id] = Buffer.from(b64, 'base64');
+  }
+  for (const [str, id] of specials) {
+    idToBytes[id] = Buffer.from(str, 'utf8');
+  }
+  return {
+    vocabBin,
+    idToBytes,
+    specials,
+    patternCompiled: compilePretokenizer(pattern),
+    // opt A — per-instance chunk cache: chunk string → ids[]
+    cache: new Map(),
+    // opt B — per-instance grow-only scratch (reused across chunks)
+    scratch: { str: null, prev: null, next: null, ver: null, alive: null, cap: 0, heap: new MinHeap() },
+  };
+}
+// opt B — grow scratch arrays only when needed
+function ensureScratch(scratch, n) {
+  if (n <= scratch.cap) return;
+  const cap = n * 2;
+  scratch.str   = new Array(cap);
+  scratch.prev  = new Int32Array(cap);
+  scratch.next  = new Int32Array(cap);
+  scratch.ver   = new Int32Array(cap);
+  scratch.alive = new Uint8Array(cap);
+  scratch.cap   = cap;
+}
+// opt E — unified encode for one pre-tokenized chunk (replaces bpeEncode + bpeCount)
+// Precondition: writeChunk(chunk) was just called and returned n.
+// Reads from _sb.buf[0..n-1]. Returns ids[].
+function bpeChunk(n, vocabBin, scratch) {
+  const buf = _sb.buf;
+  // opt C — fast path: single byte
+  if (n === 1) {
+    const id = vocabBin.get(BYTE_STRS[buf[0]]);
+    return id === undefined ? [] : [id];
+  }
+  // opt C — fast path: two bytes
+  if (n === 2) {
+    const s0 = BYTE_STRS[buf[0]], s1 = BYTE_STRS[buf[1]];
+    const merged = vocabBin.get(s0 + s1);
+    if (merged !== undefined) return [merged];
+    const r = [];
+    const i0 = vocabBin.get(s0); if (i0 !== undefined) r.push(i0);
+    const i1 = vocabBin.get(s1); if (i1 !== undefined) r.push(i1);
+    return r;
+  }
+  // General path — reuse scratch arrays (opt B), reuse heap (opt B)
+  ensureScratch(scratch, n);
+  const { str, prev, next, ver, alive, heap } = scratch;
+  heap.reset();
+  for (let i = 0; i < n; i++) {
+    str[i]   = BYTE_STRS[buf[i]];
+    prev[i]  = i - 1;
+    next[i]  = i + 1;
+    ver[i]   = 0;
+    alive[i] = 1;
+  }
+  next[n - 1] = -1;
+  for (let i = 0; i < n - 1; i++) {
+    const rank = vocabBin.get(str[i] + str[i + 1]);
+    if (rank !== undefined) heap.push(rank, i, i + 1, 0, 0);
+  }
+  while (heap.size > 0) {
+    const top = heap.pop();
+    if (!top) break;
+    const { left, right, verL, verR } = top;
+    if (!alive[left] || !alive[right]) continue;
+    if (next[left] !== right) continue;
+    if (ver[left] !== verL || ver[right] !== verR) continue;
+    str[left] = str[left] + str[right];
+    ver[left]++;
+    alive[right] = 0;
+    ver[right]++;
+    const nr = next[right];
+    next[left] = nr;
+    if (nr !== -1) prev[nr] = left;
+    const pl = prev[left];
+    if (pl !== -1 && alive[pl]) {
+      const rank = vocabBin.get(str[pl] + str[left]);
+      if (rank !== undefined) heap.push(rank, pl, left, ver[pl], ver[left]);
+    }
+    const nl = next[left];
+    if (nl !== -1 && alive[nl]) {
+      const rank = vocabBin.get(str[left] + str[nl]);
+      if (rank !== undefined) heap.push(rank, left, nl, ver[left], ver[nl]);
+    }
+  }
+  const ids = [];
+  let i = 0;
+  while (i !== -1) {
+    if (alive[i]) {
+      const id = vocabBin.get(str[i]);
+      if (id !== undefined) ids.push(id);
+    }
+    i = next[i];
+  }
+  return ids;
 }
 // ─── Internal helpers ─────────────────────────────────────────────────────────
@@ -87,142 +271,197 @@ function decodeTiktoken(ids, vocabData) {
 function splitOnSpecials(text, specials) {
   if (specials.length === 0) return [{ text, isSpecial: false }];
-  // Sort by length descending so longer tokens match first
-  const sorted = specials.slice().sort((a, b) => b[0].length - a[0].length);
   const result = [];
   let remaining = text;
   while (remaining.length > 0) {
-    let found = false;
-    for (const [str, id] of sorted) {
+    let bestIdx = -1, bestStr = null, bestId = null;
+    for (const [str, id] of specials) {
       const idx = remaining.indexOf(str);
-      if (idx !== -1) {
-        if (idx > 0) result.push({ text: remaining.slice(0, idx), isSpecial: false });
-        result.push({ isSpecial: true, id });
-        remaining = remaining.slice(idx + str.length);
-        found = true;
-        break;
+      if (idx === -1) continue;
+      if (
+        bestIdx === -1 ||
+        idx < bestIdx ||
+        (idx === bestIdx && bestStr && str.length > bestStr.length)
+      ) {
+        bestIdx = idx; bestStr = str; bestId = id;
       }
     }
-    if (!found) {
-      result.push({ text: remaining, isSpecial: false });
-      break;
-    }
+    if (bestIdx === -1) { result.push({ text: remaining, isSpecial: false }); break; }
+    if (bestIdx > 0) result.push({ text: remaining.slice(0, bestIdx), isSpecial: false });
+    result.push({ isSpecial: true, id: bestId });
+    remaining = remaining.slice(bestIdx + bestStr.length);
   }
   return result;
 }
-function pretokenize(text, patternStr) {
-  if (!patternStr || !text) return text ? [text] : [];
+// ─── Prepared-object API ──────────────────────────────────────────────────────
-  // Node 18+ supports Unicode property escapes in RegExp natively
-  // We need the 'v' or 'u' flag for \p{L}, \p{N}
-  // The pattern from cl100k_base uses (?i:...) syntax which is not standard JS
-  // We convert it to use the 'i' flag selectively via alternation
-  let regexStr = patternStr
-    .replace(/\(\?i:/g, '(?:')   // (?i:...) → (?:...) and we use 'i' flag on the whole regex
-    .replace(/\\p\{L\}/g, '\\p{L}')
-    .replace(/\\p\{N\}/g, '\\p{N}');
+function encodeTiktokenPrepared(text, prepared) {
+  if (!text) return [];
-  let re;
-  try {
-    re = new RegExp(regexStr, 'guy');  // g=global, u=unicode, y=... actually just 'gu'
-  } catch {
-    try {
-      re = new RegExp(regexStr, 'gi');
-    } catch {
-      // Fallback: split on whitespace
-      return text.match(/\S+|\s+/g) || [text];
+  const ids = [];
+  const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
+  const pieces = splitOnSpecials(text, specials);
+  for (const piece of pieces) {
+    if (piece.isSpecial) { ids.push(piece.id); continue; }
+    const t = piece.text;
+    if (patternCompiled.type === 'regex') {
+      // opt — exec loop avoids materialising the full matches array
+      const re = patternCompiled.re;
+      re.lastIndex = 0;
+      let m;
+      while ((m = re.exec(t)) !== null) {
+        const chunk = m[0];
+        let chunkIds = cache.get(chunk);
+        if (chunkIds === undefined) {
+          chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
+          cache.set(chunk, chunkIds);
+        }
+        for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
+      }
+    } else {
+      const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
+      for (let ci = 0; ci < chunks.length; ci++) {
+        const chunk = chunks[ci];
+        let chunkIds = cache.get(chunk);
+        if (chunkIds === undefined) {
+          chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
+          cache.set(chunk, chunkIds);
+        }
+        for (let i = 0; i < chunkIds.length; i++) ids.push(chunkIds[i]);
+      }
     }
   }
-  return text.match(re) || [text];
+  return ids;
 }
-/**
- * BPE encode a single pre-tokenized chunk.
- * @param {string} chunk — raw string chunk
- * @param {Object} vocab — { base64token: rank }
- * @returns {number[]}
- */
-function bpeEncode(chunk, vocab) {
-  // Convert chunk to UTF-8 bytes, get initial token ids (one per byte)
-  const bytes = Buffer.from(chunk, 'utf8');
-  if (bytes.length === 0) return [];
-  // Start with individual bytes as tokens
-  let tokens = [];
-  for (let i = 0; i < bytes.length; i++) {
-    const b64 = BYTE_TOKENS[bytes[i]];
-    const rank = vocab[b64];
-    if (rank === undefined) {
-      // Unknown byte — use a fallback (should not happen with byte-level vocab)
-      tokens.push({ b64, rank: Infinity });
-    } else {
-      tokens.push({ b64, rank });
-    }
+function decodeTiktokenPrepared(ids, prepared) {
+  if (!ids || ids.length === 0) return '';
+  const bufs = [];
+  for (let i = 0; i < ids.length; i++) {
+    const bytes = prepared.idToBytes[ids[i]];
+    if (bytes) bufs.push(bytes);
   }
+  return Buffer.concat(bufs).toString('utf8');
+}
-  // Greedy merge: repeatedly find the adjacent pair with the lowest max rank
-  while (tokens.length >= 2) {
-    let bestRank = Infinity;
-    let bestIdx = -1;
+function countTiktokenPrepared(text, prepared) {
+  if (!text) return 0;
-    for (let i = 0; i < tokens.length - 1; i++) {
-      const mergedB64 = mergeB64(tokens[i].b64, tokens[i + 1].b64);
-      const rank = vocab[mergedB64];
-      if (rank !== undefined && rank < bestRank) {
-        bestRank = rank;
-        bestIdx = i;
+  const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
+  const pieces = splitOnSpecials(text, specials);
+  let count = 0;
+  for (const piece of pieces) {
+    if (piece.isSpecial) { count++; continue; }
+    const t = piece.text;
+    if (patternCompiled.type === 'regex') {
+      const re = patternCompiled.re;
+      re.lastIndex = 0;
+      let m;
+      while ((m = re.exec(t)) !== null) {
+        const chunk = m[0];
+        let chunkIds = cache.get(chunk);
+        if (chunkIds === undefined) {
+          chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
+          cache.set(chunk, chunkIds);
+        }
+        count += chunkIds.length;
+      }
+    } else {
+      const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
+      for (let ci = 0; ci < chunks.length; ci++) {
+        const chunk = chunks[ci];
+        let chunkIds = cache.get(chunk);
+        if (chunkIds === undefined) {
+          chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
+          cache.set(chunk, chunkIds);
+        }
+        count += chunkIds.length;
       }
     }
-    if (bestIdx === -1) break;  // No more merges possible
-    const mergedB64 = mergeB64(tokens[bestIdx].b64, tokens[bestIdx + 1].b64);
-    tokens.splice(bestIdx, 2, { b64: mergedB64, rank: bestRank });
   }
-  return tokens.map(t => t.rank);
-}
-function mergeB64(a, b) {
-  // Decode both, concatenate bytes, re-encode
-  const buf = Buffer.concat([Buffer.from(a, 'base64'), Buffer.from(b, 'base64')]);
-  return buf.toString('base64');
+  return count;
 }
-/**
- * Count tokens up to a limit, short-circuiting once exceeded.
- * @param {string} text
- * @param {Object} vocabData
- * @param {number} limit
- * @returns {number} token count (may exceed limit by up to one chunk)
- */
-function countTiktokenUpTo(text, vocabData, limit) {
+function countTiktokenUpToPrepared(text, prepared, limit) {
   if (!text) return 0;
-  const { vocab, specialTokens = {}, pattern } = vocabData;
-  const specials = Object.entries(specialTokens);
+  const { vocabBin, scratch, cache, patternCompiled, specials } = prepared;
   const pieces = splitOnSpecials(text, specials);
   let count = 0;
-  for (const piece of pieces) {
+  outer: for (const piece of pieces) {
     if (piece.isSpecial) {
-      count++;
+      if (++count > limit) break;
+      continue;
+    }
+    const t = piece.text;
+    if (patternCompiled.type === 'regex') {
+      const re = patternCompiled.re;
+      re.lastIndex = 0;
+      let m;
+      while ((m = re.exec(t)) !== null) {
+        const chunk = m[0];
+        let chunkIds = cache.get(chunk);
+        if (chunkIds === undefined) {
+          chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
+          cache.set(chunk, chunkIds);
+        }
+        count += chunkIds.length;
+        if (count > limit) break outer;
+      }
     } else {
-      const chunks = pretokenize(piece.text, pattern);
-      for (const chunk of chunks) {
-        count += bpeEncode(chunk, vocab).length;
-        if (count > limit) return count;
+      const chunks = patternCompiled.type === 'none' ? [t] : (t.match(/\S+|\s+/g) || [t]);
+      for (let ci = 0; ci < chunks.length; ci++) {
+        const chunk = chunks[ci];
+        let chunkIds = cache.get(chunk);
+        if (chunkIds === undefined) {
+          chunkIds = bpeChunk(writeChunk(chunk), vocabBin, scratch);
+          cache.set(chunk, chunkIds);
+        }
+        count += chunkIds.length;
+        if (count > limit) break outer;
       }
     }
-    if (count > limit) return count;
   }
   return count;
 }
-module.exports = { encodeTiktoken, decodeTiktoken, countTiktokenUpTo };
+// ─── Standalone wrappers (build prepared fresh each call — used by tests / direct API) ──
+function encodeTiktoken(text, vocabData) {
+  return encodeTiktokenPrepared(text, buildPreparedTiktoken(vocabData));
+}
+function decodeTiktoken(ids, vocabData) {
+  return decodeTiktokenPrepared(ids, buildPreparedTiktoken(vocabData));
+}
+function countTiktokenUpTo(text, vocabData, limit) {
+  return countTiktokenUpToPrepared(text, buildPreparedTiktoken(vocabData), limit);
+}
+module.exports = {
+  MinHeap,
+  encodeTiktoken,
+  decodeTiktoken,
+  countTiktokenUpTo,
+  buildPreparedTiktoken,
+  encodeTiktokenPrepared,
+  decodeTiktokenPrepared,
+  countTiktokenPrepared,
+  countTiktokenUpToPrepared,
+};