@halo-sdk/tokenizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # @halo-sdk/tokenizer
2
+
3
+ Pluggable tokenizers for the Halo AI SDK. Replaces the built-in `chars / 4` heuristic behind `estimateTokens` with more accurate counts — needed for honest truncation budgets and cache/cost accounting.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install @halo-sdk/tokenizer
9
+ ```
10
+
11
+ Requires `@halo-sdk/core` (>= 1.1.0) as a peer dependency. `gpt-tokenizer` (^2.0.0) is an optional peer dependency, only needed for `createGptTokenizer()`.
12
+
13
+ ## Usage
14
+
15
+ ```ts
16
+ import { ApproxTokenizer, installTokenizer } from "@halo-sdk/tokenizer";
17
+
18
+ // Install once at startup; `estimateTokens` everywhere now uses it.
19
+ installTokenizer(new ApproxTokenizer());
20
+ ```
21
+
22
+ `ApproxTokenizer` is dependency-free and deterministic — a BPE approximation that costs punctuation, digits, and CJK far more realistically than `chars / 4`.
23
+
24
+ For exact GPT counts, install `gpt-tokenizer` and use the lazy factory:
25
+
26
+ ```ts
27
+ import { createGptTokenizer, installTokenizer } from "@halo-sdk/tokenizer";
28
+
29
+ installTokenizer(await createGptTokenizer());
30
+ ```
31
+
32
+ ## Custom tokenizers
33
+
34
+ Implement `Tokenizer` (`count(text): number`) and pass it to `installTokenizer`, or wire a counter directly via `setTokenCounter` from `@halo-sdk/core`.
35
+
36
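+ Call `resetTokenizer()` to revert to the built-in `chars / 4` heuristic. `installTokenizer` also returns the previously installed counter (or `null`) so it can be restored later.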
package/dist/index.cjs ADDED
@@ -0,0 +1,88 @@
1
+ "use strict";
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+
30
+ // src/index.ts
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ ApproxTokenizer: () => ApproxTokenizer,
34
+ createGptTokenizer: () => createGptTokenizer,
35
+ installTokenizer: () => installTokenizer,
36
+ resetTokenizer: () => resetTokenizer
37
+ });
38
+ module.exports = __toCommonJS(index_exports);
39
+ var import_core = require("@halo-sdk/core");
40
/**
 * Dependency-free, deterministic token estimator.
 *
 * The text is cut into ASCII letter runs, digit runs, whitespace runs and
 * single "other" characters (punctuation, symbols, non-ASCII code points),
 * and each segment is charged a fixed cost.
 */
var ApproxTokenizer = class {
  /**
   * Estimate how many tokens `text` encodes to.
   *
   * @param text String to measure; any falsy value counts as 0 tokens.
   * @returns The summed per-segment cost.
   */
  count(text) {
    if (!text) return 0;
    // The `u` flag makes the final alternative consume a whole code point,
    // so an astral character (e.g. an emoji) is one segment, not two.
    const segments = text.match(/[A-Za-z]+|\d+|\s+|[^\sA-Za-z\d]/gu) ?? [];
    return segments.reduce((sum, segment) => {
      // Whitespace runs are free.
      if (/\s/.test(segment)) return sum;
      // ASCII letter run: one token per started group of 4 characters.
      if (segment.charCodeAt(0) < 128 && /[A-Za-z]/.test(segment)) {
        return sum + Math.max(1, Math.ceil(segment.length / 4));
      }
      // Digit run: one token per started group of 3 digits.
      if (/\d/.test(segment)) {
        return sum + Math.max(1, Math.ceil(segment.length / 3));
      }
      // Any other single character costs exactly one token.
      return sum + 1;
    }, 0);
  }
};
60
+ async function createGptTokenizer() {
61
+ let mod;
62
+ try {
63
+ mod = await import("gpt-tokenizer");
64
+ } catch {
65
+ throw new Error(
66
+ "createGptTokenizer() requires the optional peer dependency 'gpt-tokenizer'. Install it: npm i gpt-tokenizer"
67
+ );
68
+ }
69
+ const encode = mod.encode ?? mod.default?.encode;
70
+ if (typeof encode !== "function") {
71
+ throw new Error("gpt-tokenizer did not export an `encode` function.");
72
+ }
73
+ return { count: (text) => encode(text).length };
74
+ }
75
/**
 * Register `tokenizer` through `setTokenCounter` from `@halo-sdk/core`.
 *
 * @param tokenizer Object exposing `count(text)`.
 * @returns Whatever `setTokenCounter` returns.
 */
function installTokenizer(tokenizer) {
  // Read the export first and call it unbound, matching the `(0, fn)(...)` form.
  const setCounter = import_core.setTokenCounter;
  // `tokenizer.count` is looked up on every call, not captured once.
  const countText = (text) => tokenizer.count(text);
  return setCounter({ countText });
}
78
/**
 * Clear the installed counter by passing `null` to `setTokenCounter`.
 */
function resetTokenizer() {
  // Called unbound, matching the `(0, fn)(...)` form.
  const setCounter = import_core.setTokenCounter;
  setCounter(null);
}
81
+ // Annotate the CommonJS export names for ESM import in node:
82
+ 0 && (module.exports = {
83
+ ApproxTokenizer,
84
+ createGptTokenizer,
85
+ installTokenizer,
86
+ resetTokenizer
87
+ });
88
+ //# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { setTokenCounter, type TokenCounter } from \"@halo-sdk/core\";\n\n/** A token counter for a piece of text. */\nexport interface Tokenizer {\n /** Number of tokens the given text encodes to. */\n count(text: string): number;\n}\n\n/**\n * A dependency-free, deterministic approximation of BPE tokenization.\n *\n * Far more accurate than the `chars / 4` default for punctuation- and\n * code-heavy text: it segments the string into letter runs, digit runs,\n * punctuation/symbols (and CJK), then costs each segment the way byte-pair\n * encoders tend to. Use this when you want a better estimate without pulling in\n * a real tokenizer dependency.\n */\nexport class ApproxTokenizer implements Tokenizer {\n count(text: string): number {\n if (!text) return 0;\n let total = 0;\n // Letter runs, digit runs, whitespace runs, or a single other char (punct/CJK/emoji).\n const re = /[A-Za-z]+|\\d+|\\s+|[^\\sA-Za-z\\d]/gu;\n for (const m of text.matchAll(re)) {\n const piece = m[0]!;\n const c = piece.charCodeAt(0);\n if (/\\s/.test(piece)) continue; // whitespace merges into adjacent tokens\n if (c < 128 && /[A-Za-z]/.test(piece)) {\n total += Math.max(1, Math.ceil(piece.length / 4)); // ~4 chars/token\n } else if (/\\d/.test(piece)) {\n total += Math.max(1, Math.ceil(piece.length / 3)); // digits pack ~3/token\n } else {\n total += 1; // each punctuation / CJK / symbol char ≈ 1 token\n }\n }\n return total;\n }\n}\n\n/**\n * Build a tokenizer backed by `gpt-tokenizer` (a pure-JS BPE implementation,\n * declared as an optional peer dependency). 
Loaded lazily so the dependency is\n * only required when this factory is called.\n *\n * @throws if `gpt-tokenizer` is not installed.\n */\nexport async function createGptTokenizer(): Promise<Tokenizer> {\n let mod: { encode?: (t: string) => number[]; default?: { encode?: (t: string) => number[] } };\n try {\n mod = (await import(\"gpt-tokenizer\")) as typeof mod;\n } catch {\n throw new Error(\n \"createGptTokenizer() requires the optional peer dependency 'gpt-tokenizer'. Install it: npm i gpt-tokenizer\",\n );\n }\n const encode = mod.encode ?? mod.default?.encode;\n if (typeof encode !== \"function\") {\n throw new Error(\"gpt-tokenizer did not export an `encode` function.\");\n }\n return { count: (text: string) => encode(text).length };\n}\n\n/**\n * Install a {@link Tokenizer} as the process-wide counter used by\n * `estimateTokens` in `@halo-sdk/core`. Returns the previously-installed\n * counter (or `null`) so it can be restored later.\n *\n * ```ts\n * import { ApproxTokenizer, installTokenizer } from \"@halo-sdk/tokenizer\";\n * installTokenizer(new ApproxTokenizer());\n * ```\n */\nexport function installTokenizer(tokenizer: Tokenizer): TokenCounter | null {\n return setTokenCounter({ countText: (text) => tokenizer.count(text) });\n}\n\n/** Remove any installed tokenizer, reverting `estimateTokens` to its heuristic. 
*/\nexport function resetTokenizer(): void {\n setTokenCounter(null);\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,kBAAmD;AAiB5C,IAAM,kBAAN,MAA2C;AAAA,EAChD,MAAM,MAAsB;AAC1B,QAAI,CAAC,KAAM,QAAO;AAClB,QAAI,QAAQ;AAEZ,UAAM,KAAK;AACX,eAAW,KAAK,KAAK,SAAS,EAAE,GAAG;AACjC,YAAM,QAAQ,EAAE,CAAC;AACjB,YAAM,IAAI,MAAM,WAAW,CAAC;AAC5B,UAAI,KAAK,KAAK,KAAK,EAAG;AACtB,UAAI,IAAI,OAAO,WAAW,KAAK,KAAK,GAAG;AACrC,iBAAS,KAAK,IAAI,GAAG,KAAK,KAAK,MAAM,SAAS,CAAC,CAAC;AAAA,MAClD,WAAW,KAAK,KAAK,KAAK,GAAG;AAC3B,iBAAS,KAAK,IAAI,GAAG,KAAK,KAAK,MAAM,SAAS,CAAC,CAAC;AAAA,MAClD,OAAO;AACL,iBAAS;AAAA,MACX;AAAA,IACF;AACA,WAAO;AAAA,EACT;AACF;AASA,eAAsB,qBAAyC;AAC7D,MAAI;AACJ,MAAI;AACF,UAAO,MAAM,OAAO,eAAe;AAAA,EACrC,QAAQ;AACN,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,QAAM,SAAS,IAAI,UAAU,IAAI,SAAS;AAC1C,MAAI,OAAO,WAAW,YAAY;AAChC,UAAM,IAAI,MAAM,oDAAoD;AAAA,EACtE;AACA,SAAO,EAAE,OAAO,CAAC,SAAiB,OAAO,IAAI,EAAE,OAAO;AACxD;AAYO,SAAS,iBAAiB,WAA2C;AAC1E,aAAO,6BAAgB,EAAE,WAAW,CAAC,SAAS,UAAU,MAAM,IAAI,EAAE,CAAC;AACvE;AAGO,SAAS,iBAAuB;AACrC,mCAAgB,IAAI;AACtB;","names":[]}
@@ -0,0 +1,40 @@
1
+ import { type TokenCounter } from "@halo-sdk/core";
2
+ /** A token counter for a piece of text. */
3
+ export interface Tokenizer {
4
+ /** Number of tokens the given text encodes to. */
5
+ count(text: string): number;
6
+ }
7
+ /**
8
+ * A dependency-free, deterministic approximation of BPE tokenization.
9
+ *
10
+ * Far more accurate than the `chars / 4` default for punctuation- and
11
+ * code-heavy text: it segments the string into letter runs, digit runs,
12
+ * punctuation/symbols (and CJK), then costs each segment the way byte-pair
13
+ * encoders tend to. Use this when you want a better estimate without pulling in
14
+ * a real tokenizer dependency.
15
+ */
16
+ export declare class ApproxTokenizer implements Tokenizer {
17
+ count(text: string): number;
18
+ }
19
+ /**
20
+ * Build a tokenizer backed by `gpt-tokenizer` (a pure-JS BPE implementation,
21
+ * declared as an optional peer dependency). Loaded lazily so the dependency is
22
+ * only required when this factory is called.
23
+ *
24
+ * @throws if `gpt-tokenizer` cannot be loaded or does not export an `encode` function.
25
+ */
26
+ export declare function createGptTokenizer(): Promise<Tokenizer>;
27
+ /**
28
+ * Install a {@link Tokenizer} as the process-wide counter used by
29
+ * `estimateTokens` in `@halo-sdk/core`. Returns the previously-installed
30
+ * counter (or `null`) so it can be restored later.
31
+ *
32
+ * ```ts
33
+ * import { ApproxTokenizer, installTokenizer } from "@halo-sdk/tokenizer";
34
+ * installTokenizer(new ApproxTokenizer());
35
+ * ```
36
+ */
37
+ export declare function installTokenizer(tokenizer: Tokenizer): TokenCounter | null;
38
+ /** Remove any installed tokenizer, reverting `estimateTokens` to its heuristic. */
39
+ export declare function resetTokenizer(): void;
40
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAmB,KAAK,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAEpE,2CAA2C;AAC3C,MAAM,WAAW,SAAS;IACxB,kDAAkD;IAClD,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC;CAC7B;AAED;;;;;;;;GAQG;AACH,qBAAa,eAAgB,YAAW,SAAS;IAC/C,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;CAmB5B;AAED;;;;;;GAMG;AACH,wBAAsB,kBAAkB,IAAI,OAAO,CAAC,SAAS,CAAC,CAc7D;AAED;;;;;;;;;GASG;AACH,wBAAgB,gBAAgB,CAAC,SAAS,EAAE,SAAS,GAAG,YAAY,GAAG,IAAI,CAE1E;AAED,mFAAmF;AACnF,wBAAgB,cAAc,IAAI,IAAI,CAErC"}
package/dist/index.js ADDED
@@ -0,0 +1,50 @@
1
+ // src/index.ts
2
+ import { setTokenCounter } from "@halo-sdk/core";
3
/**
 * Dependency-free, deterministic token estimator.
 *
 * The text is cut into ASCII letter runs, digit runs, whitespace runs and
 * single "other" characters (punctuation, symbols, non-ASCII code points),
 * and each segment is charged a fixed cost.
 */
var ApproxTokenizer = class {
  /**
   * Estimate how many tokens `text` encodes to.
   *
   * @param text String to measure; any falsy value counts as 0 tokens.
   * @returns The summed per-segment cost.
   */
  count(text) {
    if (!text) return 0;
    // The `u` flag makes the final alternative consume a whole code point,
    // so an astral character (e.g. an emoji) is one segment, not two.
    const segments = text.match(/[A-Za-z]+|\d+|\s+|[^\sA-Za-z\d]/gu) ?? [];
    return segments.reduce((sum, segment) => {
      // Whitespace runs are free.
      if (/\s/.test(segment)) return sum;
      // ASCII letter run: one token per started group of 4 characters.
      if (segment.charCodeAt(0) < 128 && /[A-Za-z]/.test(segment)) {
        return sum + Math.max(1, Math.ceil(segment.length / 4));
      }
      // Digit run: one token per started group of 3 digits.
      if (/\d/.test(segment)) {
        return sum + Math.max(1, Math.ceil(segment.length / 3));
      }
      // Any other single character costs exactly one token.
      return sum + 1;
    }, 0);
  }
};
23
+ async function createGptTokenizer() {
24
+ let mod;
25
+ try {
26
+ mod = await import("gpt-tokenizer");
27
+ } catch {
28
+ throw new Error(
29
+ "createGptTokenizer() requires the optional peer dependency 'gpt-tokenizer'. Install it: npm i gpt-tokenizer"
30
+ );
31
+ }
32
+ const encode = mod.encode ?? mod.default?.encode;
33
+ if (typeof encode !== "function") {
34
+ throw new Error("gpt-tokenizer did not export an `encode` function.");
35
+ }
36
+ return { count: (text) => encode(text).length };
37
+ }
38
/**
 * Register `tokenizer` through `setTokenCounter` from `@halo-sdk/core`.
 *
 * @param tokenizer Object exposing `count(text)`.
 * @returns Whatever `setTokenCounter` returns.
 */
function installTokenizer(tokenizer) {
  // `tokenizer.count` is looked up on every call, not captured once.
  const countText = (text) => tokenizer.count(text);
  return setTokenCounter({ countText });
}
41
/**
 * Clear the installed counter by passing `null` to `setTokenCounter`.
 */
function resetTokenizer() {
  const noCounter = null;
  setTokenCounter(noCounter);
}
44
+ export {
45
+ ApproxTokenizer,
46
+ createGptTokenizer,
47
+ installTokenizer,
48
+ resetTokenizer
49
+ };
50
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { setTokenCounter, type TokenCounter } from \"@halo-sdk/core\";\n\n/** A token counter for a piece of text. */\nexport interface Tokenizer {\n /** Number of tokens the given text encodes to. */\n count(text: string): number;\n}\n\n/**\n * A dependency-free, deterministic approximation of BPE tokenization.\n *\n * Far more accurate than the `chars / 4` default for punctuation- and\n * code-heavy text: it segments the string into letter runs, digit runs,\n * punctuation/symbols (and CJK), then costs each segment the way byte-pair\n * encoders tend to. Use this when you want a better estimate without pulling in\n * a real tokenizer dependency.\n */\nexport class ApproxTokenizer implements Tokenizer {\n count(text: string): number {\n if (!text) return 0;\n let total = 0;\n // Letter runs, digit runs, whitespace runs, or a single other char (punct/CJK/emoji).\n const re = /[A-Za-z]+|\\d+|\\s+|[^\\sA-Za-z\\d]/gu;\n for (const m of text.matchAll(re)) {\n const piece = m[0]!;\n const c = piece.charCodeAt(0);\n if (/\\s/.test(piece)) continue; // whitespace merges into adjacent tokens\n if (c < 128 && /[A-Za-z]/.test(piece)) {\n total += Math.max(1, Math.ceil(piece.length / 4)); // ~4 chars/token\n } else if (/\\d/.test(piece)) {\n total += Math.max(1, Math.ceil(piece.length / 3)); // digits pack ~3/token\n } else {\n total += 1; // each punctuation / CJK / symbol char ≈ 1 token\n }\n }\n return total;\n }\n}\n\n/**\n * Build a tokenizer backed by `gpt-tokenizer` (a pure-JS BPE implementation,\n * declared as an optional peer dependency). 
Loaded lazily so the dependency is\n * only required when this factory is called.\n *\n * @throws if `gpt-tokenizer` is not installed.\n */\nexport async function createGptTokenizer(): Promise<Tokenizer> {\n let mod: { encode?: (t: string) => number[]; default?: { encode?: (t: string) => number[] } };\n try {\n mod = (await import(\"gpt-tokenizer\")) as typeof mod;\n } catch {\n throw new Error(\n \"createGptTokenizer() requires the optional peer dependency 'gpt-tokenizer'. Install it: npm i gpt-tokenizer\",\n );\n }\n const encode = mod.encode ?? mod.default?.encode;\n if (typeof encode !== \"function\") {\n throw new Error(\"gpt-tokenizer did not export an `encode` function.\");\n }\n return { count: (text: string) => encode(text).length };\n}\n\n/**\n * Install a {@link Tokenizer} as the process-wide counter used by\n * `estimateTokens` in `@halo-sdk/core`. Returns the previously-installed\n * counter (or `null`) so it can be restored later.\n *\n * ```ts\n * import { ApproxTokenizer, installTokenizer } from \"@halo-sdk/tokenizer\";\n * installTokenizer(new ApproxTokenizer());\n * ```\n */\nexport function installTokenizer(tokenizer: Tokenizer): TokenCounter | null {\n return setTokenCounter({ countText: (text) => tokenizer.count(text) });\n}\n\n/** Remove any installed tokenizer, reverting `estimateTokens` to its heuristic. 
*/\nexport function resetTokenizer(): void {\n setTokenCounter(null);\n}\n"],"mappings":";AAAA,SAAS,uBAA0C;AAiB5C,IAAM,kBAAN,MAA2C;AAAA,EAChD,MAAM,MAAsB;AAC1B,QAAI,CAAC,KAAM,QAAO;AAClB,QAAI,QAAQ;AAEZ,UAAM,KAAK;AACX,eAAW,KAAK,KAAK,SAAS,EAAE,GAAG;AACjC,YAAM,QAAQ,EAAE,CAAC;AACjB,YAAM,IAAI,MAAM,WAAW,CAAC;AAC5B,UAAI,KAAK,KAAK,KAAK,EAAG;AACtB,UAAI,IAAI,OAAO,WAAW,KAAK,KAAK,GAAG;AACrC,iBAAS,KAAK,IAAI,GAAG,KAAK,KAAK,MAAM,SAAS,CAAC,CAAC;AAAA,MAClD,WAAW,KAAK,KAAK,KAAK,GAAG;AAC3B,iBAAS,KAAK,IAAI,GAAG,KAAK,KAAK,MAAM,SAAS,CAAC,CAAC;AAAA,MAClD,OAAO;AACL,iBAAS;AAAA,MACX;AAAA,IACF;AACA,WAAO;AAAA,EACT;AACF;AASA,eAAsB,qBAAyC;AAC7D,MAAI;AACJ,MAAI;AACF,UAAO,MAAM,OAAO,eAAe;AAAA,EACrC,QAAQ;AACN,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,QAAM,SAAS,IAAI,UAAU,IAAI,SAAS;AAC1C,MAAI,OAAO,WAAW,YAAY;AAChC,UAAM,IAAI,MAAM,oDAAoD;AAAA,EACtE;AACA,SAAO,EAAE,OAAO,CAAC,SAAiB,OAAO,IAAI,EAAE,OAAO;AACxD;AAYO,SAAS,iBAAiB,WAA2C;AAC1E,SAAO,gBAAgB,EAAE,WAAW,CAAC,SAAS,UAAU,MAAM,IAAI,EAAE,CAAC;AACvE;AAGO,SAAS,iBAAuB;AACrC,kBAAgB,IAAI;AACtB;","names":[]}
package/package.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "name": "@halo-sdk/tokenizer",
3
+ "version": "1.0.0",
4
+ "description": "Pluggable tokenizers for Halo AI SDK — accurate token counts for truncation budgets and cache/cost accounting",
5
+ "keywords": [
6
+ "ai",
7
+ "llm",
8
+ "tiktoken",
9
+ "token-count",
10
+ "tokenizer"
11
+ ],
12
+ "license": "MIT",
13
+ "repository": {
14
+ "type": "git",
15
+ "url": "https://github.com/halo-sdk/halo-ai",
16
+ "directory": "packages/tokenizer"
17
+ },
18
+ "files": [
19
+ "dist"
20
+ ],
21
+ "type": "module",
22
+ "main": "./dist/index.js",
23
+ "types": "./dist/index.d.ts",
24
+ "exports": {
25
+ ".": {
26
+ "types": "./dist/index.d.ts",
27
+ "import": "./dist/index.js",
28
+ "require": "./dist/index.cjs"
29
+ }
30
+ },
31
+ "publishConfig": {
32
+ "access": "public"
33
+ },
34
+ "devDependencies": {
35
+ "typescript": "^5.8.0",
36
+ "vitest": "^3.0.0",
37
+ "@halo-sdk/core": "1.1.0"
38
+ },
39
+ "peerDependencies": {
40
+ "@halo-sdk/core": ">=1.1.0",
41
+ "gpt-tokenizer": "^2.0.0"
42
+ },
43
+ "peerDependenciesMeta": {
44
+ "gpt-tokenizer": {
45
+ "optional": true
46
+ }
47
+ },
48
+ "scripts": {
49
+ "build": "tsc --build --emitDeclarationOnly && tsup",
50
+ "dev": "tsup --watch",
51
+ "clean": "del-cli dist *.tsbuildinfo",
52
+ "publint": "publint",
53
+ "test": "vitest run",
54
+ "test:watch": "vitest"
55
+ }
56
+ }