npm - @goliapkg/tiktoken-wasm - Versions diffs - 3.1.0 → 3.2.0 - Mend

@goliapkg/tiktoken-wasm 3.1.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md CHANGED Viewed

@@ -21,7 +21,7 @@ wasm-pack build --target web --release
 Output is in `pkg/` — a complete npm-ready package containing:
 - `tiktoken_wasm.js` — ES module with WASM loader
-- `tiktoken_wasm_bg.wasm` — compiled WASM binary (~3 MB)
+- `tiktoken_wasm_bg.wasm` — compiled WASM binary (~7 MB, ~3 MB gzipped)
 - `tiktoken_wasm.d.ts` — TypeScript type definitions
 ## Usage
@@ -32,37 +32,54 @@ Output is in `pkg/` — a complete npm-ready package containing:
 import init, {
   getEncoding,
   encodingForModel,
+  listEncodings,
+  modelToEncoding,
   estimateCost,
   getModelInfo,
+  allModels,
+  modelsByProvider,
   type Encoding,
+  type ModelInfo,
 } from '@goliapkg/tiktoken-wasm'
 // initialize WASM module (required once, before any other calls)
 await init()
+// discover available encodings
+const names: string[] = listEncodings()
+// ["cl100k_base", "o200k_base", ..., "mistral_v3"]
 // encode / decode
 const enc: Encoding = getEncoding('cl100k_base')
 const tokens: Uint32Array = enc.encode('hello world')
 const text: string = enc.decode(tokens)   // "hello world"
 const count: number = enc.count('hello world')  // 2
+// special token handling
+const countST: number = enc.countWithSpecialTokens('hi<|endoftext|>bye')
+// vocabulary info
+console.log(enc.vocabSize)         // 100256
+console.log(enc.numSpecialTokens)  // 5
 // by model name — supports OpenAI, Meta, DeepSeek, Qwen, Mistral
 const enc2 = encodingForModel('gpt-4o')
-const enc3 = encodingForModel('llama-4-scout')
-const enc4 = encodingForModel('deepseek-r1')
+const encName = modelToEncoding('llama-4-scout')  // "llama3"
 // cost estimation (USD)
 const cost: number = estimateCost('gpt-4o', 1000, 500)
-// model metadata
-const info = getModelInfo('claude-opus-4')
-// { id, provider, input_per_1m, output_per_1m, cached_input_per_1m, context_window, max_output }
+// model metadata (fully typed)
+const info: ModelInfo = getModelInfo('claude-opus-4')
+console.log(info.id, info.provider, info.inputPer1m, info.contextWindow)
+// browse all models or filter by provider
+const all: ModelInfo[] = allModels()
+const openai: ModelInfo[] = modelsByProvider('OpenAI')
 // free WASM memory when done
 enc.free()
 enc2.free()
-enc3.free()
-enc4.free()
 ```
 ### Bundler Configuration
@@ -101,6 +118,10 @@ module.exports = {
 ## API Reference
+### `listEncodings(): string[]`
+List all available encoding names (9 encodings).
 ### `getEncoding(name: string): Encoding`
 Get a tokenizer by encoding name. Supported:
@@ -118,24 +139,51 @@ Get a tokenizer by encoding name. Supported:
 Get a tokenizer by model name (e.g. `gpt-4o`, `llama-4-scout`, `deepseek-r1`, `qwen3-235b`).
+### `modelToEncoding(model: string): string | undefined`
+Map a model name to its encoding name without loading the encoding. Returns `undefined` for unknown models.
 ### `Encoding`
-| Method | Returns | Description |
-|--------|---------|-------------|
+| Method / Property | Type | Description |
+|-------------------|------|-------------|
 | `encode(text)` | `Uint32Array` | Encode text to token ids |
 | `encodeWithSpecialTokens(text)` | `Uint32Array` | Encode with special token recognition |
 | `decode(tokens)` | `string` | Decode token ids to text |
 | `count(text)` | `number` | Count tokens (faster than `encode().length`) |
+| `countWithSpecialTokens(text)` | `number` | Count tokens with special token recognition |
 | `name` | `string` | Encoding name (getter) |
+| `vocabSize` | `number` | Number of regular tokens in vocabulary |
+| `numSpecialTokens` | `number` | Number of special tokens |
 | `free()` | `void` | Release WASM memory |
 ### `estimateCost(modelId, inputTokens, outputTokens): number`
 Estimate API cost in USD. Supports 57 models across 7 providers.
-### `getModelInfo(modelId): object`
+### `getModelInfo(modelId): ModelInfo`
+Get model metadata with full TypeScript typing.
+### `allModels(): ModelInfo[]`
+List all 57 supported models with pricing info.
+### `modelsByProvider(provider): ModelInfo[]`
+Filter models by provider: `"OpenAI"`, `"Anthropic"`, `"Google"`, `"Meta"`, `"DeepSeek"`, `"Alibaba"`, `"Mistral"`.
+### `ModelInfo`
-Get model metadata: pricing, context window, max output tokens.
+| Property | Type | Description |
+|----------|------|-------------|
+| `id` | `string` | Model identifier |
+| `provider` | `string` | Provider name |
+| `inputPer1m` | `number` | Input cost per 1M tokens (USD) |
+| `outputPer1m` | `number` | Output cost per 1M tokens (USD) |
+| `cachedInputPer1m` | `number \| undefined` | Cached input cost per 1M tokens |
+| `contextWindow` | `number` | Max context window (tokens) |
+| `maxOutput` | `number` | Max output tokens |
 ## Supported Models (pricing)

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "@goliapkg/tiktoken-wasm",
   "type": "module",
   "description": "WASM bindings for the tiktoken BPE tokenizer",
-  "version": "3.1.0",
+  "version": "3.2.0",
   "license": "MIT",
   "repository": {
     "type": "git",

package/tiktoken_wasm.d.ts CHANGED Viewed

@@ -17,6 +17,13 @@ export class Encoding {
      * Faster than `encode(text).length` for cases where you only need the count.
      */
     count(text: string): number;
+    /**
+     * Count tokens, recognizing special tokens.
+     *
+     * Like `count()` but special tokens (e.g. `<|endoftext|>`) are counted
+     * as single tokens instead of being split into sub-word pieces.
+     */
+    countWithSpecialTokens(text: string): number;
     /**
      * Decode token ids back to a UTF-8 string.
      *
@@ -41,8 +48,39 @@ export class Encoding {
      * Get the encoding name (e.g. `"cl100k_base"`).
      */
     readonly name: string;
+    /**
+     * Get the number of special tokens in the vocabulary.
+     */
+    readonly numSpecialTokens: number;
+    /**
+     * Get the number of regular (non-special) tokens in the vocabulary.
+     */
+    readonly vocabSize: number;
 }
+/**
+ * Model pricing and metadata.
+ */
+export class ModelInfo {
+    private constructor();
+    free(): void;
+    [Symbol.dispose](): void;
+    readonly cachedInputPer1m: number | undefined;
+    readonly contextWindow: number;
+    readonly id: string;
+    readonly inputPer1m: number;
+    readonly maxOutput: number;
+    readonly outputPer1m: number;
+    readonly provider: string;
+}
+/**
+ * List all supported models with pricing info.
+ *
+ * Returns an array of `ModelInfo` objects.
+ */
+export function allModels(): ModelInfo[];
 /**
  * Get an encoding for a model name (e.g. `"gpt-4o"`, `"o3-mini"`, `"llama-4"`, `"deepseek-r1"`).
  *
@@ -79,34 +117,72 @@ export function estimateCost(model_id: string, input_tokens: number, output_toke
 export function getEncoding(name: string): Encoding;
 /**
- * Get model pricing and metadata as a JS object.
+ * Get model pricing and metadata.
  *
- * Returns an object with: `id`, `provider`, `input_per_1m`, `output_per_1m`,
- * `cached_input_per_1m`, `context_window`, `max_output`.
+ * Returns a typed object with: `id`, `provider`, `inputPer1m`, `outputPer1m`,
+ * `cachedInputPer1m`, `contextWindow`, `maxOutput`.
  *
  * Throws `Error` for unknown model ids.
  */
-export function getModelInfo(model_id: string): any;
+export function getModelInfo(model_id: string): ModelInfo;
+/**
+ * List all available encoding names.
+ *
+ * Returns an array of strings: `["cl100k_base", "o200k_base", ...]`
+ */
+export function listEncodings(): any[];
+/**
+ * Map a model name to its encoding name without loading the encoding.
+ *
+ * Returns the encoding name string (e.g. `"o200k_base"`) or `undefined` for unknown models.
+ */
+export function modelToEncoding(model: string): string | undefined;
+/**
+ * List models filtered by provider name.
+ *
+ * Provider names: `"OpenAI"`, `"Anthropic"`, `"Google"`, `"Meta"`, `"DeepSeek"`, `"Alibaba"`, `"Mistral"`.
+ * Returns an empty array for unknown providers.
+ */
+export function modelsByProvider(provider: string): ModelInfo[];
 export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
 export interface InitOutput {
     readonly memory: WebAssembly.Memory;
     readonly __wbg_encoding_free: (a: number, b: number) => void;
+    readonly __wbg_modelinfo_free: (a: number, b: number) => void;
+    readonly allModels: () => [number, number];
     readonly encodingForModel: (a: number, b: number) => [number, number, number];
     readonly encoding_count: (a: number, b: number, c: number) => number;
+    readonly encoding_countWithSpecialTokens: (a: number, b: number, c: number) => number;
     readonly encoding_decode: (a: number, b: number, c: number) => [number, number];
     readonly encoding_encode: (a: number, b: number, c: number) => [number, number];
     readonly encoding_encodeWithSpecialTokens: (a: number, b: number, c: number) => [number, number];
     readonly encoding_name: (a: number) => [number, number];
+    readonly encoding_numSpecialTokens: (a: number) => number;
+    readonly encoding_vocabSize: (a: number) => number;
     readonly estimateCost: (a: number, b: number, c: number, d: number) => [number, number, number];
     readonly getEncoding: (a: number, b: number) => [number, number, number];
     readonly getModelInfo: (a: number, b: number) => [number, number, number];
+    readonly listEncodings: () => [number, number];
+    readonly modelToEncoding: (a: number, b: number) => [number, number];
+    readonly modelinfo_cachedInputPer1m: (a: number) => [number, number];
+    readonly modelinfo_contextWindow: (a: number) => number;
+    readonly modelinfo_id: (a: number) => [number, number];
+    readonly modelinfo_inputPer1m: (a: number) => number;
+    readonly modelinfo_maxOutput: (a: number) => number;
+    readonly modelinfo_outputPer1m: (a: number) => number;
+    readonly modelinfo_provider: (a: number) => [number, number];
+    readonly modelsByProvider: (a: number, b: number) => [number, number];
+    readonly __wbindgen_externrefs: WebAssembly.Table;
+    readonly __externref_drop_slice: (a: number, b: number) => void;
+    readonly __wbindgen_free: (a: number, b: number, c: number) => void;
     readonly __wbindgen_malloc: (a: number, b: number) => number;
     readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
-    readonly __wbindgen_externrefs: WebAssembly.Table;
     readonly __externref_table_dealloc: (a: number) => void;
-    readonly __wbindgen_free: (a: number, b: number, c: number) => void;
     readonly __wbindgen_start: () => void;
 }

package/tiktoken_wasm.js CHANGED Viewed

@@ -37,6 +37,20 @@ export class Encoding {
         const ret = wasm.encoding_count(this.__wbg_ptr, ptr0, len0);
         return ret >>> 0;
     }
+    /**
+     * Count tokens, recognizing special tokens.
+     *
+     * Like `count()` but special tokens (e.g. `<|endoftext|>`) are counted
+     * as single tokens instead of being split into sub-word pieces.
+     * @param {string} text
+     * @returns {number}
+     */
+    countWithSpecialTokens(text) {
+        const ptr0 = passStringToWasm0(text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+        const len0 = WASM_VECTOR_LEN;
+        const ret = wasm.encoding_countWithSpecialTokens(this.__wbg_ptr, ptr0, len0);
+        return ret >>> 0;
+    }
     /**
      * Decode token ids back to a UTF-8 string.
      *
@@ -106,9 +120,127 @@ export class Encoding {
             wasm.__wbindgen_free(deferred1_0, deferred1_1, 1);
         }
     }
+    /**
+     * Get the number of special tokens in the vocabulary.
+     * @returns {number}
+     */
+    get numSpecialTokens() {
+        const ret = wasm.encoding_numSpecialTokens(this.__wbg_ptr);
+        return ret >>> 0;
+    }
+    /**
+     * Get the number of regular (non-special) tokens in the vocabulary.
+     * @returns {number}
+     */
+    get vocabSize() {
+        const ret = wasm.encoding_vocabSize(this.__wbg_ptr);
+        return ret >>> 0;
+    }
 }
 if (Symbol.dispose) Encoding.prototype[Symbol.dispose] = Encoding.prototype.free;
+/**
+ * Model pricing and metadata.
+ */
+export class ModelInfo {
+    static __wrap(ptr) {
+        ptr = ptr >>> 0;
+        const obj = Object.create(ModelInfo.prototype);
+        obj.__wbg_ptr = ptr;
+        ModelInfoFinalization.register(obj, obj.__wbg_ptr, obj);
+        return obj;
+    }
+    __destroy_into_raw() {
+        const ptr = this.__wbg_ptr;
+        this.__wbg_ptr = 0;
+        ModelInfoFinalization.unregister(this);
+        return ptr;
+    }
+    free() {
+        const ptr = this.__destroy_into_raw();
+        wasm.__wbg_modelinfo_free(ptr, 0);
+    }
+    /**
+     * @returns {number | undefined}
+     */
+    get cachedInputPer1m() {
+        const ret = wasm.modelinfo_cachedInputPer1m(this.__wbg_ptr);
+        return ret[0] === 0 ? undefined : ret[1];
+    }
+    /**
+     * @returns {number}
+     */
+    get contextWindow() {
+        const ret = wasm.modelinfo_contextWindow(this.__wbg_ptr);
+        return ret >>> 0;
+    }
+    /**
+     * @returns {string}
+     */
+    get id() {
+        let deferred1_0;
+        let deferred1_1;
+        try {
+            const ret = wasm.modelinfo_id(this.__wbg_ptr);
+            deferred1_0 = ret[0];
+            deferred1_1 = ret[1];
+            return getStringFromWasm0(ret[0], ret[1]);
+        } finally {
+            wasm.__wbindgen_free(deferred1_0, deferred1_1, 1);
+        }
+    }
+    /**
+     * @returns {number}
+     */
+    get inputPer1m() {
+        const ret = wasm.modelinfo_inputPer1m(this.__wbg_ptr);
+        return ret;
+    }
+    /**
+     * @returns {number}
+     */
+    get maxOutput() {
+        const ret = wasm.modelinfo_maxOutput(this.__wbg_ptr);
+        return ret >>> 0;
+    }
+    /**
+     * @returns {number}
+     */
+    get outputPer1m() {
+        const ret = wasm.modelinfo_outputPer1m(this.__wbg_ptr);
+        return ret;
+    }
+    /**
+     * @returns {string}
+     */
+    get provider() {
+        let deferred1_0;
+        let deferred1_1;
+        try {
+            const ret = wasm.modelinfo_provider(this.__wbg_ptr);
+            deferred1_0 = ret[0];
+            deferred1_1 = ret[1];
+            return getStringFromWasm0(ret[0], ret[1]);
+        } finally {
+            wasm.__wbindgen_free(deferred1_0, deferred1_1, 1);
+        }
+    }
+}
+if (Symbol.dispose) ModelInfo.prototype[Symbol.dispose] = ModelInfo.prototype.free;
+/**
+ * List all supported models with pricing info.
+ *
+ * Returns an array of `ModelInfo` objects.
+ * @returns {ModelInfo[]}
+ */
+export function allModels() {
+    const ret = wasm.allModels();
+    var v1 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v1;
+}
 /**
  * Get an encoding for a model name (e.g. `"gpt-4o"`, `"o3-mini"`, `"llama-4"`, `"deepseek-r1"`).
  *
@@ -177,14 +309,14 @@ export function getEncoding(name) {
 }
 /**
- * Get model pricing and metadata as a JS object.
+ * Get model pricing and metadata.
  *
- * Returns an object with: `id`, `provider`, `input_per_1m`, `output_per_1m`,
- * `cached_input_per_1m`, `context_window`, `max_output`.
+ * Returns a typed object with: `id`, `provider`, `inputPer1m`, `outputPer1m`,
+ * `cachedInputPer1m`, `contextWindow`, `maxOutput`.
  *
  * Throws `Error` for unknown model ids.
  * @param {string} model_id
- * @returns {any}
+ * @returns {ModelInfo}
  */
 export function getModelInfo(model_id) {
     const ptr0 = passStringToWasm0(model_id, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
@@ -193,7 +325,56 @@ export function getModelInfo(model_id) {
     if (ret[2]) {
         throw takeFromExternrefTable0(ret[1]);
     }
-    return takeFromExternrefTable0(ret[0]);
+    return ModelInfo.__wrap(ret[0]);
+}
+/**
+ * List all available encoding names.
+ *
+ * Returns an array of strings: `["cl100k_base", "o200k_base", ...]`
+ * @returns {any[]}
+ */
+export function listEncodings() {
+    const ret = wasm.listEncodings();
+    var v1 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v1;
+}
+/**
+ * Map a model name to its encoding name without loading the encoding.
+ *
+ * Returns the encoding name string (e.g. `"o200k_base"`) or `undefined` for unknown models.
+ * @param {string} model
+ * @returns {string | undefined}
+ */
+export function modelToEncoding(model) {
+    const ptr0 = passStringToWasm0(model, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ret = wasm.modelToEncoding(ptr0, len0);
+    let v2;
+    if (ret[0] !== 0) {
+        v2 = getStringFromWasm0(ret[0], ret[1]).slice();
+        wasm.__wbindgen_free(ret[0], ret[1] * 1, 1);
+    }
+    return v2;
+}
+/**
+ * List models filtered by provider name.
+ *
+ * Provider names: `"OpenAI"`, `"Anthropic"`, `"Google"`, `"Meta"`, `"DeepSeek"`, `"Alibaba"`, `"Mistral"`.
+ * Returns an empty array for unknown providers.
+ * @param {string} provider
+ * @returns {ModelInfo[]}
+ */
+export function modelsByProvider(provider) {
+    const ptr0 = passStringToWasm0(provider, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
+    const len0 = WASM_VECTOR_LEN;
+    const ret = wasm.modelsByProvider(ptr0, len0);
+    var v2 = getArrayJsValueFromWasm0(ret[0], ret[1]).slice();
+    wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
+    return v2;
 }
 function __wbg_get_imports() {
@@ -203,29 +384,14 @@ function __wbg_get_imports() {
             const ret = Error(getStringFromWasm0(arg0, arg1));
             return ret;
         },
-        __wbg_String_8564e559799eccda: function(arg0, arg1) {
-            const ret = String(arg1);
-            const ptr1 = passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
-            const len1 = WASM_VECTOR_LEN;
-            getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true);
-            getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true);
-        },
         __wbg___wbindgen_throw_6ddd609b62940d55: function(arg0, arg1) {
             throw new Error(getStringFromWasm0(arg0, arg1));
         },
-        __wbg_new_ab79df5bd7c26067: function() {
-            const ret = new Object();
-            return ret;
-        },
-        __wbg_set_6be42768c690e380: function(arg0, arg1, arg2) {
-            arg0[arg1] = arg2;
-        },
-        __wbindgen_cast_0000000000000001: function(arg0) {
-            // Cast intrinsic for `F64 -> Externref`.
-            const ret = arg0;
+        __wbg_modelinfo_new: function(arg0) {
+            const ret = ModelInfo.__wrap(arg0);
             return ret;
         },
-        __wbindgen_cast_0000000000000002: function(arg0, arg1) {
+        __wbindgen_cast_0000000000000001: function(arg0, arg1) {
             // Cast intrinsic for `Ref(String) -> Externref`.
             const ret = getStringFromWasm0(arg0, arg1);
             return ret;
@@ -249,6 +415,20 @@ function __wbg_get_imports() {
 const EncodingFinalization = (typeof FinalizationRegistry === 'undefined')
     ? { register: () => {}, unregister: () => {} }
     : new FinalizationRegistry(ptr => wasm.__wbg_encoding_free(ptr >>> 0, 1));
+const ModelInfoFinalization = (typeof FinalizationRegistry === 'undefined')
+    ? { register: () => {}, unregister: () => {} }
+    : new FinalizationRegistry(ptr => wasm.__wbg_modelinfo_free(ptr >>> 0, 1));
+function getArrayJsValueFromWasm0(ptr, len) {
+    ptr = ptr >>> 0;
+    const mem = getDataViewMemory0();
+    const result = [];
+    for (let i = ptr; i < ptr + 4 * len; i += 4) {
+        result.push(wasm.__wbindgen_externrefs.get(mem.getUint32(i, true)));
+    }
+    wasm.__externref_drop_slice(ptr, len);
+    return result;
+}
 function getArrayU32FromWasm0(ptr, len) {
     ptr = ptr >>> 0;

package/tiktoken_wasm_bg.wasm CHANGED Viewed

Binary file