npm - token-vocabs - Versions diffs - 0.2.2 → 0.4.0 - Mend

token-vocabs 0.2.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

package/README.md +80 -61
package/all.d.ts +2 -2
package/all.js +1 -1
package/chunks/decompress.js +1 -1
package/chunks/main.js +2 -2
package/deepseek.bin +0 -0
package/gemma.bin +0 -0
package/glm.bin +0 -0
package/gpt.bin +1 -0
package/hy.bin +0 -0
package/kimi.bin +0 -0
package/lib/api.d.ts +24 -14
package/lib/data.d.ts +4 -5
package/lib/modelAssets/base/ModelAssetBundleLoader.d.ts +6 -0
package/lib/models.d.ts +21 -1
package/lib/tiktoken.d.ts +4 -0
package/lib/tokenization.d.ts +38 -0
package/lib/tokenizers/ClipTokenizer.d.ts +2 -0
package/lib/tokenizers/HuggingFaceTokenizer.d.ts +10 -1
package/lib/tokenizers/TiktokenTokenizer.d.ts +8 -3
package/lib/tokenizers/base/BaseTokenizer.d.ts +7 -2
package/lib/tokenizers/index.d.ts +6 -2
package/main.d.ts +2 -9
package/main.js +1 -1
package/mimo.bin +0 -0
package/minimax.bin +0 -0
package/package.json +5 -5
package/qwen.bin +0 -0
package/sdxl.bin +0 -0
package/step.bin +0 -0
package/chunks/deepseek.js +0 -1
package/chunks/gemma.js +0 -1
package/chunks/glm.js +0 -1
package/chunks/gpt.js +0 -1
package/chunks/kimi.js +0 -1
package/chunks/mimo.js +0 -1
package/chunks/minimax.js +0 -1
package/chunks/qwen.js +0 -1
package/chunks/sdxl.js +0 -1
package/lib/base85Decode.d.ts +0 -2
package/lib/modelAssets.d.ts +0 -6
package/vocabulary/deepseek.js +0 -1
package/vocabulary/gemma.js +0 -1
package/vocabulary/glm.js +0 -1
package/vocabulary/gpt.js +0 -1
package/vocabulary/kimi.js +0 -1
package/vocabulary/mimo.js +0 -1
package/vocabulary/minimax.js +0 -1
package/vocabulary/qwen.js +0 -1
package/vocabulary/sdxl.js +0 -1
package/chunks/{rolldown-runtime.js → chunk.js} +0 -0

package/README.md CHANGED Viewed

@@ -13,94 +13,118 @@ Count tokens or inspect token IDs across several modern tokenizer families from
 - Stable Diffusion XL
 - GLM 5.1
 - MiniMax M2.7
+- Hy3 Preview
+- Step 3.7 Flash
 ## Highlights
 - offline at runtime once the vendored assets are present
 - browser-friendly once bundled
 - exact golden outputs for the core sample fixture
-- Brotli-compressed MessagePack tokenizer assets with Map-backed structured loading
+- one Brotli-compressed MessagePack asset bundle per model
 - browser Brotli decompression with a bundled JS fallback where native stream support is missing
-- Rolldown browser builds that can lazy-load one chunk per vocabulary, plus an eager `all.js` variant and the required WASM asset
-- sync API for convenience
-- one shared interface for count-oriented and token-ID-oriented usage
+- Rolldown browser builds that emit binary vocabulary bundles, shared chunks and the required WASM asset
+- async auto-loading API plus loaded-only sync helpers
+- one small single-model API for counts, token IDs and byte offsets
 - generated tokenizer assets via `bun run fetch`
-- publish-ready browser `dist/` builds that bundle tokenizer assets, emit the required WASM files and include package metadata plus declarations
+- publish-ready browser `dist/` builds that keep vocabularies outside the JavaScript entry, emit the required WASM files and include package metadata plus declarations
 ## Usage
 ```ts
-import countTokens from 'token-vocabs'
+import tokenize from 'token-vocabs'
-console.dir(countTokens('mind goblin'))
+console.dir(await tokenize('mind goblin', 'gpt'))
 ```
 ```ts
-import countTokens from 'token-vocabs'
+import {count} from 'token-vocabs'
-console.dir(countTokens('mind goblin', {model: ['gpt', 'deepseek']}))
+console.dir(await count(new TextEncoder().encode('mind goblin'), {model: 'gpt'}))
 ```
 ```ts
-import countTokens from 'token-vocabs'
+import {load, tokenizeLoaded} from 'token-vocabs'
-console.dir(countTokens('mind goblin', 'gpt'))
-```
-```ts
-import {tokenize} from 'token-vocabs'
-console.dir(tokenize('mind goblin'))
+await load(['gpt', 'deepseek'])
+console.dir(tokenizeLoaded('mind goblin', 'gpt'))
 ```
 ## Example output
 ```ts
-countTokens('mind goblin')
-// {
-//   gpt: 3,
-//   gemma: 2,
-//   qwen: 3,
-//   kimi: 4,
-//   deepseek: 4,
-//   mimo: 3,
-//   sdxl: 2,
-//   glm: 3,
-//   minimax: 3,
-// }
+await count('mind goblin', 'gpt')
+// 3
 ```
 ```ts
-tokenize('mind goblin')
+await tokenize('mind goblin', 'gpt')
 // {
-//   gpt: [77021, 18778, 4724],
-//   gemma: [24447, 218798],
-//   qwen: [36475, 338, 45491],
-//   kimi: [66468, 970, 3145, 259],
-//   deepseek: [60514, 807, 3778, 261],
-//   mimo: [37724, 342, 47061],
-//   sdxl: [2575, 26223],
-//   glm: [37528, 342, 46771],
-//   minimax: [68201, 113859, 259],
+//   offsets: [4, 8],
+//   tokens: [77021, 18778, 4724],
 // }
 ```
 ## API
-### `countTokens(text, options?)`
+### `async count(textOrBytes, optionsOrModel)`
+Returns the token count for exactly one model and loads the required vocabulary bundle on demand.
+`Uint8Array` input is decoded as UTF-8.
+```ts
+await count('mind goblin', 'sdxl')
+await count('mind goblin', {model: 'gpt'})
+await count(new TextEncoder().encode('mind goblin'), 'gpt')
+```
+### `countLoaded(textOrBytes, optionsOrModel)`
+Synchronous count helper that uses the existing in-memory tokenizer state and throws if the requested vocabulary is not loaded yet.
+This is useful after `await load()` or after a previous `await count()` / `await tokenize()` call has already loaded the model.
-Returns token counts.
+### `async tokenize(textOrBytes, optionsOrModel)`
+Returns a `RawTokenizeResult` for exactly one model and loads the required vocabulary bundle on demand.
+```ts
+await tokenize('mind goblin', 'gpt')
+await tokenize('mind goblin', {model: 'gpt'})
+```
+### `tokenizeLoaded(textOrBytes, optionsOrModel)`
+Synchronous tokenization helper that reuses already loaded vocabularies and throws if the requested model is not in memory yet.
+The result shape is:
 ```ts
-countTokens('mind goblin')
-countTokens('mind goblin', 'sdxl')
-countTokens('mind goblin', {model: 'gpt'})
-countTokens('mind goblin', {model: ['gpt', 'deepseek']})
+type RawTokenizeResult = {
+  offsets: number[]
+  tokens: number[]
+  processedInput?: string | Uint8Array
+}
 ```
-### `tokenize(text, options?)`
+`offsets` omits the first token’s implicit `0` byte start to save one array slot.
-Returns token ID arrays with the same selection rules as `countTokens()`.
+If a tokenizer normalizes or otherwise preprocesses the input, `processedInput` contains the effective tokenizer input. Its type matches the input kind – string in, string out; `Uint8Array` in, `Uint8Array` out.
+If you need results for several models, call `count()` or `tokenize()` once per model and combine the results yourself.
+### `async load(modelSelection?)`
+Preloads one or more model vocabularies into memory.
+- `await load('gpt')` → resolves to `'gpt'`
+- `await load(['gpt', 'deepseek'])` → resolves to `['gpt', 'deepseek']`
+- `await load()` → loads every supported model and resolves to `modelIds`
+### `free(modelId?)`
+Releases a loaded model from memory, or every loaded model if no argument is provided.
 ### `modelIds`
@@ -112,40 +136,35 @@ Exports model metadata, including the original upstream source URLs used by `bun
 ### `token-vocabs/browser`
-Lazy browser entry with the same `countTokens()` and `tokenize()` API, plus:
-- `loadModel(modelId)`
-- `loadModels(modelSelection?)`
-- `isModelLoaded(modelId)`
-- `getLoadedModelIds()`
+Browser entry with the same `count()`, `countLoaded()`, `tokenize()`, `tokenizeLoaded()`, `load()` and `free()` API as the desktop entry.
-Load the required vocabularies first, then call the sync tokenization API.
+It loads the `.bin` asset bundles via `fetch()`.
 ### `token-vocabs/browser/all`
-Eager browser entry that preloads every vocabulary and keeps the original “load once, tokenize immediately” behavior.
+Eager browser entry that runs `await load()` at module initialization time so `countLoaded()` and `tokenizeLoaded()` work immediately after import.
 ## Distribution layout
-The published browser package exposes `token-vocabs` and `token-vocabs/browser/all` as the eager entry backed by `all.js`, plus `token-vocabs/browser` as the lazy entry backed by `main.js`.
+The published browser package exposes `token-vocabs` and `token-vocabs/browser` as the lazy entry backed by `main.js`, plus `token-vocabs/browser/all` as the eager entry backed by `all.js`.
 It also contains:
-- emitted chunk files under `vocabulary/` and `chunks/`, plus the required WASM asset
+- one Brotli-compressed MessagePack asset bundle per model at the package root, shared chunks and the required WASM asset
 - `package.json`, `README.md`, `LICENSE` and declaration files so the folder can be published on its own
 Example lazy browser usage from the published package:
 ```ts
-import {countTokens, loadModels} from 'token-vocabs/browser'
+import {countLoaded, load} from 'token-vocabs/browser'
-await loadModels(['gpt', 'deepseek'])
-console.dir(countTokens('mind goblin', {model: ['gpt', 'deepseek']}))
+await load(['gpt', 'deepseek'])
+console.dir(countLoaded('mind goblin', 'deepseek'))
 ```
 ## Notes
 - `sdxl` intentionally implements the shared CLIP BPE core used by SDXL without auto-adding BOS/EOS tokens.
 - GPT uses `tiktoken`’s built-in `o200k_base` implementation, but the upstream encoder payload is still fetched and converted to MessagePack for completeness.
-- Structured tokenizer payloads are emitted into generated modules as ASCII85-encoded `.msgpack.br` blobs and decompressed before use.
+- Structured tokenizer payloads are stored inside per-model `.bin` bundles and decompressed after loading.
 - Tokenizer assets are large. That is inherent to exact offline tokenization.

package/all.d.ts CHANGED Viewed

@@ -1,3 +1,3 @@
-export {countTokens, getLoadedModelIds, isModelLoaded, loadModel, loadModels, modelIds, models, tokenize} from './main.js'
+export {count, countLoaded, free, load, modelIds, models, tokenize, tokenizeLoaded} from './main.js'
 export {default} from './main.js'
-export type {CountTokensOptions, CountTokensResult, ModelId, ModelSelection, TokenizeResult} from './main.js'
+export type {CountOptions, CountResult, CountTokensOptions, CountTokensResult, ModelId, ModelSelection, RawTokenizeResult, TokenizeInput, TokenizeOptions, TokenizeResult} from './main.js'

package/all.js CHANGED Viewed

	@@ -1 +1 @@
1	- import{a as e,c as t,i as n,l as r,n as i,o as a,r as o,s,t as c,u as l}from"./chunks/main.js";import{t as u}from"./vocabulary/deepseek.js";import{t as d}from"./vocabulary/gemma.js";import{t as f}from"./vocabulary/glm.js";import{t as p}from"./vocabulary/gpt.js";import{t as m}from"./vocabulary/kimi.js";import{t as h}from"./vocabulary/mimo.js";import{t as g}from"./vocabulary/minimax.js";import{t as _}from"./vocabulary/qwen.js";import{t as v}from"./vocabulary/sdxl.js";const[y,b,x,S,C,w,T,E,D]=await Promise.all([s(u),s(d),s(f),s(p),s(m),s(h),s(g),s(_),s(v)]);t({deepseek:y,gemma:b,glm:x,gpt:S,kimi:C,mimo:w,minimax:T,qwen:E,sdxl:D});export{e as countTokens,e as default,c as getLoadedModelIds,i as isModelLoaded,o as loadModel,n as loadModels,r as modelIds,l as models,a as tokenize};
1	+ import{a as e,c as t,i as n,n as r,o as i,r as a,s as o,t as s}from"./chunks/main.js";await n();export{s as count,r as countLoaded,e as default,e as tokenize,a as free,n as load,o as modelIds,t as models,i as tokenizeLoaded};