npm - @halo-sdk/tokenizer - Versions diffs - 1.0.0 - Mend

@halo-sdk/tokenizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,36 @@
+# @halo-sdk/tokenizer
+Pluggable tokenizers for Halo AI SDK. Replaces the built-in `chars / 4` heuristic behind `estimateTokens` with accurate counts — needed for honest truncation budgets and cache/cost accounting.
+## Installation
+```bash
+npm install @halo-sdk/tokenizer
+```
+Requires `@halo-sdk/core` (>= 1.1.0) as a peer dependency. `gpt-tokenizer` (^2.0.0) is an optional peer dependency, only needed for `createGptTokenizer()`.
+## Usage
+```ts
+import { ApproxTokenizer, installTokenizer } from "@halo-sdk/tokenizer";
+// Install once at startup; `estimateTokens` everywhere now uses it.
+installTokenizer(new ApproxTokenizer());
+```
+`ApproxTokenizer` is dependency-free and deterministic — a BPE approximation that costs punctuation, digits, and CJK far more realistically than `chars / 4`.
+For exact GPT counts, install `gpt-tokenizer` and use the lazy factory:
+```ts
+import { createGptTokenizer, installTokenizer } from "@halo-sdk/tokenizer";
+installTokenizer(await createGptTokenizer());
+```
+## Custom tokenizers
+Implement `Tokenizer` (`count(text: string): number`) and pass it to `installTokenizer`, or wire a counter directly via `setTokenCounter` from `@halo-sdk/core`.
+Call `resetTokenizer()` to revert to the heuristic.

package/dist/index.cjs ADDED Viewed

@@ -0,0 +1,88 @@
+"use strict";
+var __create = Object.create; // interop prelude (presumably bundler-emitted): short local aliases for the Object built-ins the helpers below rely on
+var __defProp = Object.defineProperty;
+var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+var __getOwnPropNames = Object.getOwnPropertyNames;
+var __getProtoOf = Object.getPrototypeOf;
+var __hasOwnProp = Object.prototype.hasOwnProperty;
+var __export = (target, all) => { // defines every key of `all` on `target` as an enumerable accessor; the value `all[name]` itself is installed as the getter function
+  for (var name in all)
+    __defProp(target, name, { get: all[name], enumerable: true }); // no setter is supplied, so the resulting property is read-only
+};
+var __copyProps = (to, from, except, desc) => { // mirrors the own properties of `from` onto `to` as getters and returns `to`; `desc` is only used as scratch space
+  if (from && typeof from === "object" || typeof from === "function") { // only non-null objects and functions are copied from; anything else leaves `to` untouched
+    for (let key of __getOwnPropNames(from))
+      if (!__hasOwnProp.call(to, key) && key !== except) // never overwrite an own key already on `to`; skip the single excluded name
+        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); // getter re-reads `from[key]` on each access; enumerable follows the source descriptor (true when no descriptor is found)
+  }
+  return to;
+};
+var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // wraps `mod` in a new object (sharing its prototype when `mod` is non-nullish) and forwards its properties; NOTE(review): not referenced anywhere else in the lines shown here
+  // If the importer is in node compatibility mode or this is not an ESM
+  // file that has been converted to a CommonJS file using a Babel-
+  // compatible transform (i.e. "__esModule" has not been set), then set
+  // "default" to the CommonJS "module.exports" for node compatibility.
+  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
+  mod // source whose own properties are forwarded onto the target chosen above
+));
+var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // returns a fresh object marked `__esModule: true` whose properties forward to `mod`
+// src/index.ts
+var index_exports = {}; // export table for this module; populated with getters by the __export call that follows
+__export(index_exports, {
+  ApproxTokenizer: () => ApproxTokenizer, // each entry is a thunk, so the binding is looked up on access rather than at this point in the file
+  createGptTokenizer: () => createGptTokenizer,
+  installTokenizer: () => installTokenizer,
+  resetTokenizer: () => resetTokenizer
+});
+module.exports = __toCommonJS(index_exports); // publish an `__esModule`-flagged object that forwards to the getters defined above
+var import_core = require("@halo-sdk/core");
+var ApproxTokenizer = class { // heuristic token counter: splits text into runs and charges each run a fixed rate; makes no external calls
+  count(text) { // returns the estimated token count for `text` as a non-negative integer
+    if (!text) return 0; // any falsy input (including the empty string) counts as zero tokens
+    let total = 0;
+    const re = /[A-Za-z]+|\d+|\s+|[^\sA-Za-z\d]/gu; // one piece = ASCII-letter run | digit run | whitespace run | any single other code point (the `u` flag keeps surrogate pairs together)
+    for (const m of text.matchAll(re)) {
+      const piece = m[0];
+      const c = piece.charCodeAt(0); // first UTF-16 code unit of the piece; only consulted by the `< 128` check below
+      if (/\s/.test(piece)) continue; // whitespace runs add nothing to the total
+      if (c < 128 && /[A-Za-z]/.test(piece)) { // ASCII-letter run; NOTE(review): `c < 128` is already implied by the letter pattern
+        total += Math.max(1, Math.ceil(piece.length / 4)); // one token per started group of 4 letters (the max(1, …) floor never triggers for a non-empty piece)
+      } else if (/\d/.test(piece)) {
+        total += Math.max(1, Math.ceil(piece.length / 3)); // one token per started group of 3 digits
+      } else {
+        total += 1; // everything else (punctuation, symbols, and non-ASCII letters such as accented or CJK characters) costs one token per code point
+      }
+    }
+    return total;
+  }
+};
+async function createGptTokenizer() { // resolves to a `{ count }` object backed by the `encode` function found on the `gpt-tokenizer` module
+  let mod;
+  try {
+    mod = await import("gpt-tokenizer"); // dynamic import: the package is only loaded when this factory is called
+  } catch { // NOTE(review): the underlying error is discarded, so every import failure is reported as a missing dependency
+    throw new Error(
+      "createGptTokenizer() requires the optional peer dependency 'gpt-tokenizer'. Install it: npm i gpt-tokenizer"
+    );
+  }
+  const encode = mod.encode ?? mod.default?.encode; // accept `encode` either as a named export or as a property of the default export
+  if (typeof encode !== "function") {
+    throw new Error("gpt-tokenizer did not export an `encode` function.");
+  }
+  return { count: (text) => encode(text).length }; // assumes `encode` returns an array-like of tokens; confirm against the gpt-tokenizer API
+}
+function installTokenizer(tokenizer) { // registers `tokenizer` with @halo-sdk/core and passes through whatever setTokenCounter returns
+  return (0, import_core.setTokenCounter)({ countText: (text) => tokenizer.count(text) }); // adapter: core receives a `countText` method that forwards to `tokenizer.count`; the `(0, fn)` form calls it without `import_core` as `this`
+}
+function resetTokenizer() { // clears the registration by handing `null` to setTokenCounter; returns nothing
+  (0, import_core.setTokenCounter)(null);
+}
+// Annotate the CommonJS export names for ESM import in node:
+0 && (module.exports = { // `0 &&` short-circuits, so this assignment never executes; the object literal is present only as text listing the export names
+  ApproxTokenizer,
+  createGptTokenizer,
+  installTokenizer,
+  resetTokenizer
+});
+//# sourceMappingURL=index.cjs.map

package/dist/index.cjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { setTokenCounter, type TokenCounter } from \"@halo-sdk/core\";\n\n/** A token counter for a piece of text. */\nexport interface Tokenizer {\n /** Number of tokens the given text encodes to. */\n count(text: string): number;\n}\n\n/**\n * A dependency-free, deterministic approximation of BPE tokenization.\n *\n * Far more accurate than the `chars / 4` default for punctuation- and\n * code-heavy text: it segments the string into letter runs, digit runs,\n * punctuation/symbols (and CJK), then costs each segment the way byte-pair\n * encoders tend to. Use this when you want a better estimate without pulling in\n * a real tokenizer dependency.\n */\nexport class ApproxTokenizer implements Tokenizer {\n count(text: string): number {\n if (!text) return 0;\n let total = 0;\n // Letter runs, digit runs, whitespace runs, or a single other char (punct/CJK/emoji).\n const re = /[A-Za-z]+|\\d+|\\s+|[^\\sA-Za-z\\d]/gu;\n for (const m of text.matchAll(re)) {\n const piece = m[0]!;\n const c = piece.charCodeAt(0);\n if (/\\s/.test(piece)) continue; // whitespace merges into adjacent tokens\n if (c < 128 && /[A-Za-z]/.test(piece)) {\n total += Math.max(1, Math.ceil(piece.length / 4)); // ~4 chars/token\n } else if (/\\d/.test(piece)) {\n total += Math.max(1, Math.ceil(piece.length / 3)); // digits pack ~3/token\n } else {\n total += 1; // each punctuation / CJK / symbol char ≈ 1 token\n }\n }\n return total;\n }\n}\n\n/**\n * Build a tokenizer backed by `gpt-tokenizer` (a pure-JS BPE implementation,\n * declared as an optional peer dependency). 
Loaded lazily so the dependency is\n * only required when this factory is called.\n *\n * @throws if `gpt-tokenizer` is not installed.\n */\nexport async function createGptTokenizer(): Promise<Tokenizer> {\n let mod: { encode?: (t: string) => number[]; default?: { encode?: (t: string) => number[] } };\n try {\n mod = (await import(\"gpt-tokenizer\")) as typeof mod;\n } catch {\n throw new Error(\n \"createGptTokenizer() requires the optional peer dependency 'gpt-tokenizer'. Install it: npm i gpt-tokenizer\",\n );\n }\n const encode = mod.encode ?? mod.default?.encode;\n if (typeof encode !== \"function\") {\n throw new Error(\"gpt-tokenizer did not export an `encode` function.\");\n }\n return { count: (text: string) => encode(text).length };\n}\n\n/**\n * Install a {@link Tokenizer} as the process-wide counter used by\n * `estimateTokens` in `@halo-sdk/core`. Returns the previously-installed\n * counter (or `null`) so it can be restored later.\n *\n * ```ts\n * import { ApproxTokenizer, installTokenizer } from \"@halo-sdk/tokenizer\";\n * installTokenizer(new ApproxTokenizer());\n * ```\n */\nexport function installTokenizer(tokenizer: Tokenizer): TokenCounter | null {\n return setTokenCounter({ countText: (text) => tokenizer.count(text) });\n}\n\n/** Remove any installed tokenizer, reverting `estimateTokens` to its heuristic. 
*/\nexport function resetTokenizer(): void {\n setTokenCounter(null);\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,kBAAmD;AAiB5C,IAAM,kBAAN,MAA2C;AAAA,EAChD,MAAM,MAAsB;AAC1B,QAAI,CAAC,KAAM,QAAO;AAClB,QAAI,QAAQ;AAEZ,UAAM,KAAK;AACX,eAAW,KAAK,KAAK,SAAS,EAAE,GAAG;AACjC,YAAM,QAAQ,EAAE,CAAC;AACjB,YAAM,IAAI,MAAM,WAAW,CAAC;AAC5B,UAAI,KAAK,KAAK,KAAK,EAAG;AACtB,UAAI,IAAI,OAAO,WAAW,KAAK,KAAK,GAAG;AACrC,iBAAS,KAAK,IAAI,GAAG,KAAK,KAAK,MAAM,SAAS,CAAC,CAAC;AAAA,MAClD,WAAW,KAAK,KAAK,KAAK,GAAG;AAC3B,iBAAS,KAAK,IAAI,GAAG,KAAK,KAAK,MAAM,SAAS,CAAC,CAAC;AAAA,MAClD,OAAO;AACL,iBAAS;AAAA,MACX;AAAA,IACF;AACA,WAAO;AAAA,EACT;AACF;AASA,eAAsB,qBAAyC;AAC7D,MAAI;AACJ,MAAI;AACF,UAAO,MAAM,OAAO,eAAe;AAAA,EACrC,QAAQ;AACN,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,QAAM,SAAS,IAAI,UAAU,IAAI,SAAS;AAC1C,MAAI,OAAO,WAAW,YAAY;AAChC,UAAM,IAAI,MAAM,oDAAoD;AAAA,EACtE;AACA,SAAO,EAAE,OAAO,CAAC,SAAiB,OAAO,IAAI,EAAE,OAAO;AACxD;AAYO,SAAS,iBAAiB,WAA2C;AAC1E,aAAO,6BAAgB,EAAE,WAAW,CAAC,SAAS,UAAU,MAAM,IAAI,EAAE,CAAC;AACvE;AAGO,SAAS,iBAAuB;AACrC,mCAAgB,IAAI;AACtB;","names":[]}

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,40 @@
+import { type TokenCounter } from "@halo-sdk/core";
+/**
+ * A token counter for a piece of text.
+ *
+ * The interface has a single member, so any object exposing a compatible
+ * `count` method satisfies it.
+ */
+export interface Tokenizer {
+    /**
+     * Number of tokens the given text encodes to.
+     *
+     * @param text - The text to measure.
+     * @returns The token count for `text`.
+     */
+    count(text: string): number;
+}
+/**
+ * A dependency-free, deterministic approximation of BPE tokenization.
+ *
+ * Far more accurate than the `chars / 4` default for punctuation- and
+ * code-heavy text: it segments the string into letter runs, digit runs,
+ * punctuation/symbols (and CJK), then costs each segment the way byte-pair
+ * encoders tend to. Use this when you want a better estimate without pulling in
+ * a real tokenizer dependency.
+ *
+ * Costing rules in the shipped implementation: a run of ASCII letters costs one
+ * token per started group of 4 characters, a run of digits one token per
+ * started group of 3, whitespace is free, and every other code point
+ * (punctuation, symbols, non-ASCII letters) costs one token each.
+ */
+export declare class ApproxTokenizer implements Tokenizer {
+    count(text: string): number; // estimated token count; the implementation returns 0 for an empty string
+}
+/**
+ * Build a tokenizer backed by `gpt-tokenizer` (a pure-JS BPE implementation,
+ * declared as an optional peer dependency). Loaded lazily so the dependency is
+ * only required when this factory is called.
+ *
+ * @returns A promise for a {@link Tokenizer} whose `count` is the length of
+ * the result of `gpt-tokenizer`'s `encode` for the given text.
+ * @throws if `gpt-tokenizer` cannot be imported, or if the imported module
+ * exposes no `encode` function (neither as a named export nor on its default
+ * export).
+ */
+export declare function createGptTokenizer(): Promise<Tokenizer>;
+/**
+ * Install a {@link Tokenizer} as the process-wide counter used by
+ * `estimateTokens` in `@halo-sdk/core`. Returns the previously-installed
+ * counter (or `null`) so it can be restored later.
+ *
+ * ```ts
+ * import { ApproxTokenizer, installTokenizer } from "@halo-sdk/tokenizer";
+ * installTokenizer(new ApproxTokenizer());
+ * ```
+ *
+ * @param tokenizer - The tokenizer to install; its `count` method is exposed to
+ * core as `countText`.
+ * @returns The value returned by core's `setTokenCounter`.
+ */
+export declare function installTokenizer(tokenizer: Tokenizer): TokenCounter | null;
+/**
+ * Remove any installed tokenizer, reverting `estimateTokens` to its heuristic.
+ *
+ * Implemented by passing `null` to `setTokenCounter` from `@halo-sdk/core`.
+ */
+export declare function resetTokenizer(): void;
+//# sourceMappingURL=index.d.ts.map

package/dist/index.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAmB,KAAK,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAEpE,2CAA2C;AAC3C,MAAM,WAAW,SAAS;IACxB,kDAAkD;IAClD,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC;CAC7B;AAED;;;;;;;;GAQG;AACH,qBAAa,eAAgB,YAAW,SAAS;IAC/C,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;CAmB5B;AAED;;;;;;GAMG;AACH,wBAAsB,kBAAkB,IAAI,OAAO,CAAC,SAAS,CAAC,CAc7D;AAED;;;;;;;;;GASG;AACH,wBAAgB,gBAAgB,CAAC,SAAS,EAAE,SAAS,GAAG,YAAY,GAAG,IAAI,CAE1E;AAED,mFAAmF;AACnF,wBAAgB,cAAc,IAAI,IAAI,CAErC"}

package/dist/index.js ADDED Viewed

@@ -0,0 +1,50 @@
+// src/index.ts
+import { setTokenCounter } from "@halo-sdk/core";
+var ApproxTokenizer = class { // heuristic token counter: splits text into runs and charges each run a fixed rate; makes no external calls
+  count(text) { // returns the estimated token count for `text` as a non-negative integer
+    if (!text) return 0; // any falsy input (including the empty string) counts as zero tokens
+    let total = 0;
+    const re = /[A-Za-z]+|\d+|\s+|[^\sA-Za-z\d]/gu; // one piece = ASCII-letter run | digit run | whitespace run | any single other code point (the `u` flag keeps surrogate pairs together)
+    for (const m of text.matchAll(re)) {
+      const piece = m[0];
+      const c = piece.charCodeAt(0); // first UTF-16 code unit of the piece; only consulted by the `< 128` check below
+      if (/\s/.test(piece)) continue; // whitespace runs add nothing to the total
+      if (c < 128 && /[A-Za-z]/.test(piece)) { // ASCII-letter run; NOTE(review): `c < 128` is already implied by the letter pattern
+        total += Math.max(1, Math.ceil(piece.length / 4)); // one token per started group of 4 letters (the max(1, …) floor never triggers for a non-empty piece)
+      } else if (/\d/.test(piece)) {
+        total += Math.max(1, Math.ceil(piece.length / 3)); // one token per started group of 3 digits
+      } else {
+        total += 1; // everything else (punctuation, symbols, and non-ASCII letters such as accented or CJK characters) costs one token per code point
+      }
+    }
+    return total;
+  }
+};
+async function createGptTokenizer() { // resolves to a `{ count }` object backed by the `encode` function found on the `gpt-tokenizer` module
+  let mod;
+  try {
+    mod = await import("gpt-tokenizer"); // dynamic import: the package is only loaded when this factory is called
+  } catch { // NOTE(review): the underlying error is discarded, so every import failure is reported as a missing dependency
+    throw new Error(
+      "createGptTokenizer() requires the optional peer dependency 'gpt-tokenizer'. Install it: npm i gpt-tokenizer"
+    );
+  }
+  const encode = mod.encode ?? mod.default?.encode; // accept `encode` either as a named export or as a property of the default export
+  if (typeof encode !== "function") {
+    throw new Error("gpt-tokenizer did not export an `encode` function.");
+  }
+  return { count: (text) => encode(text).length }; // assumes `encode` returns an array-like of tokens; confirm against the gpt-tokenizer API
+}
+function installTokenizer(tokenizer) { // registers `tokenizer` with @halo-sdk/core and passes through whatever setTokenCounter returns
+  return setTokenCounter({ countText: (text) => tokenizer.count(text) }); // adapter: core receives a `countText` method that forwards to `tokenizer.count`
+}
+function resetTokenizer() { // clears the registration by handing `null` to setTokenCounter; returns nothing
+  setTokenCounter(null);
+}
+export {
+  ApproxTokenizer,
+  createGptTokenizer,
+  installTokenizer,
+  resetTokenizer
+};
+//# sourceMappingURL=index.js.map

package/dist/index.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { setTokenCounter, type TokenCounter } from \"@halo-sdk/core\";\n\n/** A token counter for a piece of text. */\nexport interface Tokenizer {\n /** Number of tokens the given text encodes to. */\n count(text: string): number;\n}\n\n/**\n * A dependency-free, deterministic approximation of BPE tokenization.\n *\n * Far more accurate than the `chars / 4` default for punctuation- and\n * code-heavy text: it segments the string into letter runs, digit runs,\n * punctuation/symbols (and CJK), then costs each segment the way byte-pair\n * encoders tend to. Use this when you want a better estimate without pulling in\n * a real tokenizer dependency.\n */\nexport class ApproxTokenizer implements Tokenizer {\n count(text: string): number {\n if (!text) return 0;\n let total = 0;\n // Letter runs, digit runs, whitespace runs, or a single other char (punct/CJK/emoji).\n const re = /[A-Za-z]+|\\d+|\\s+|[^\\sA-Za-z\\d]/gu;\n for (const m of text.matchAll(re)) {\n const piece = m[0]!;\n const c = piece.charCodeAt(0);\n if (/\\s/.test(piece)) continue; // whitespace merges into adjacent tokens\n if (c < 128 && /[A-Za-z]/.test(piece)) {\n total += Math.max(1, Math.ceil(piece.length / 4)); // ~4 chars/token\n } else if (/\\d/.test(piece)) {\n total += Math.max(1, Math.ceil(piece.length / 3)); // digits pack ~3/token\n } else {\n total += 1; // each punctuation / CJK / symbol char ≈ 1 token\n }\n }\n return total;\n }\n}\n\n/**\n * Build a tokenizer backed by `gpt-tokenizer` (a pure-JS BPE implementation,\n * declared as an optional peer dependency). 
Loaded lazily so the dependency is\n * only required when this factory is called.\n *\n * @throws if `gpt-tokenizer` is not installed.\n */\nexport async function createGptTokenizer(): Promise<Tokenizer> {\n let mod: { encode?: (t: string) => number[]; default?: { encode?: (t: string) => number[] } };\n try {\n mod = (await import(\"gpt-tokenizer\")) as typeof mod;\n } catch {\n throw new Error(\n \"createGptTokenizer() requires the optional peer dependency 'gpt-tokenizer'. Install it: npm i gpt-tokenizer\",\n );\n }\n const encode = mod.encode ?? mod.default?.encode;\n if (typeof encode !== \"function\") {\n throw new Error(\"gpt-tokenizer did not export an `encode` function.\");\n }\n return { count: (text: string) => encode(text).length };\n}\n\n/**\n * Install a {@link Tokenizer} as the process-wide counter used by\n * `estimateTokens` in `@halo-sdk/core`. Returns the previously-installed\n * counter (or `null`) so it can be restored later.\n *\n * ```ts\n * import { ApproxTokenizer, installTokenizer } from \"@halo-sdk/tokenizer\";\n * installTokenizer(new ApproxTokenizer());\n * ```\n */\nexport function installTokenizer(tokenizer: Tokenizer): TokenCounter | null {\n return setTokenCounter({ countText: (text) => tokenizer.count(text) });\n}\n\n/** Remove any installed tokenizer, reverting `estimateTokens` to its heuristic. 
*/\nexport function resetTokenizer(): void {\n setTokenCounter(null);\n}\n"],"mappings":";AAAA,SAAS,uBAA0C;AAiB5C,IAAM,kBAAN,MAA2C;AAAA,EAChD,MAAM,MAAsB;AAC1B,QAAI,CAAC,KAAM,QAAO;AAClB,QAAI,QAAQ;AAEZ,UAAM,KAAK;AACX,eAAW,KAAK,KAAK,SAAS,EAAE,GAAG;AACjC,YAAM,QAAQ,EAAE,CAAC;AACjB,YAAM,IAAI,MAAM,WAAW,CAAC;AAC5B,UAAI,KAAK,KAAK,KAAK,EAAG;AACtB,UAAI,IAAI,OAAO,WAAW,KAAK,KAAK,GAAG;AACrC,iBAAS,KAAK,IAAI,GAAG,KAAK,KAAK,MAAM,SAAS,CAAC,CAAC;AAAA,MAClD,WAAW,KAAK,KAAK,KAAK,GAAG;AAC3B,iBAAS,KAAK,IAAI,GAAG,KAAK,KAAK,MAAM,SAAS,CAAC,CAAC;AAAA,MAClD,OAAO;AACL,iBAAS;AAAA,MACX;AAAA,IACF;AACA,WAAO;AAAA,EACT;AACF;AASA,eAAsB,qBAAyC;AAC7D,MAAI;AACJ,MAAI;AACF,UAAO,MAAM,OAAO,eAAe;AAAA,EACrC,QAAQ;AACN,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,QAAM,SAAS,IAAI,UAAU,IAAI,SAAS;AAC1C,MAAI,OAAO,WAAW,YAAY;AAChC,UAAM,IAAI,MAAM,oDAAoD;AAAA,EACtE;AACA,SAAO,EAAE,OAAO,CAAC,SAAiB,OAAO,IAAI,EAAE,OAAO;AACxD;AAYO,SAAS,iBAAiB,WAA2C;AAC1E,SAAO,gBAAgB,EAAE,WAAW,CAAC,SAAS,UAAU,MAAM,IAAI,EAAE,CAAC;AACvE;AAGO,SAAS,iBAAuB;AACrC,kBAAgB,IAAI;AACtB;","names":[]}

package/package.json ADDED Viewed

@@ -0,0 +1,56 @@
+{
+  "name": "@halo-sdk/tokenizer",
+  "version": "1.0.0",
+  "description": "Pluggable tokenizers for Halo AI SDK — accurate token counts for truncation budgets and cache/cost accounting",
+  "keywords": [
+    "ai",
+    "llm",
+    "tiktoken",
+    "token-count",
+    "tokenizer"
+  ],
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/halo-sdk/halo-ai",
+    "directory": "packages/tokenizer"
+  },
+  "files": [
+    "dist"
+  ],
+  "type": "module",
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js",
+      "require": "./dist/index.cjs"
+    }
+  },
+  "publishConfig": {
+    "access": "public"
+  },
+  "devDependencies": {
+    "typescript": "^5.8.0",
+    "vitest": "^3.0.0",
+    "@halo-sdk/core": "1.1.0"
+  },
+  "peerDependencies": {
+    "@halo-sdk/core": ">=1.1.0",
+    "gpt-tokenizer": "^2.0.0"
+  },
+  "peerDependenciesMeta": {
+    "gpt-tokenizer": {
+      "optional": true
+    }
+  },
+  "scripts": {
+    "build": "tsc --build --emitDeclarationOnly && tsup",
+    "dev": "tsup --watch",
+    "clean": "del-cli dist *.tsbuildinfo",
+    "publint": "publint",
+    "test": "vitest run",
+    "test:watch": "vitest"
+  }
+}