token-vocabs 0.2.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/README.md +80 -61
  2. package/all.d.ts +2 -2
  3. package/all.js +1 -1
  4. package/chunks/decompress.js +1 -1
  5. package/chunks/main.js +2 -2
  6. package/deepseek.bin +0 -0
  7. package/gemma.bin +0 -0
  8. package/glm.bin +0 -0
  9. package/gpt.bin +1 -0
  10. package/hy.bin +0 -0
  11. package/kimi.bin +0 -0
  12. package/lib/api.d.ts +24 -14
  13. package/lib/data.d.ts +4 -5
  14. package/lib/modelAssets/base/ModelAssetBundleLoader.d.ts +6 -0
  15. package/lib/models.d.ts +21 -1
  16. package/lib/tiktoken.d.ts +4 -0
  17. package/lib/tokenization.d.ts +38 -0
  18. package/lib/tokenizers/ClipTokenizer.d.ts +2 -0
  19. package/lib/tokenizers/HuggingFaceTokenizer.d.ts +10 -1
  20. package/lib/tokenizers/TiktokenTokenizer.d.ts +8 -3
  21. package/lib/tokenizers/base/BaseTokenizer.d.ts +7 -2
  22. package/lib/tokenizers/index.d.ts +6 -2
  23. package/main.d.ts +2 -9
  24. package/main.js +1 -1
  25. package/mimo.bin +0 -0
  26. package/minimax.bin +0 -0
  27. package/package.json +5 -5
  28. package/qwen.bin +0 -0
  29. package/sdxl.bin +0 -0
  30. package/step.bin +0 -0
  31. package/chunks/deepseek.js +0 -1
  32. package/chunks/gemma.js +0 -1
  33. package/chunks/glm.js +0 -1
  34. package/chunks/gpt.js +0 -1
  35. package/chunks/kimi.js +0 -1
  36. package/chunks/mimo.js +0 -1
  37. package/chunks/minimax.js +0 -1
  38. package/chunks/qwen.js +0 -1
  39. package/chunks/sdxl.js +0 -1
  40. package/lib/base85Decode.d.ts +0 -2
  41. package/lib/modelAssets.d.ts +0 -6
  42. package/vocabulary/deepseek.js +0 -1
  43. package/vocabulary/gemma.js +0 -1
  44. package/vocabulary/glm.js +0 -1
  45. package/vocabulary/gpt.js +0 -1
  46. package/vocabulary/kimi.js +0 -1
  47. package/vocabulary/mimo.js +0 -1
  48. package/vocabulary/minimax.js +0 -1
  49. package/vocabulary/qwen.js +0 -1
  50. package/vocabulary/sdxl.js +0 -1
  51. /package/chunks/{rolldown-runtime.js → chunk.js} +0 -0
package/README.md CHANGED
@@ -13,94 +13,118 @@ Count tokens or inspect token IDs across several modern tokenizer families from
13
13
  - Stable Diffusion XL
14
14
  - GLM 5.1
15
15
  - MiniMax M2.7
16
+ - Hy3 Preview
17
+ - Step 3.7 Flash
16
18
 
17
19
  ## Highlights
18
20
 
19
21
  - offline at runtime once the vendored assets are present
20
22
  - browser-friendly once bundled
21
23
  - exact golden outputs for the core sample fixture
22
- - Brotli-compressed MessagePack tokenizer assets with Map-backed structured loading
24
+ - one Brotli-compressed MessagePack asset bundle per model
23
25
  - browser Brotli decompression with a bundled JS fallback where native stream support is missing
24
- - Rolldown browser builds that can lazy-load one chunk per vocabulary, plus an eager `all.js` variant and the required WASM asset
25
- - sync API for convenience
26
- - one shared interface for count-oriented and token-ID-oriented usage
26
+ - Rolldown browser builds that emit binary vocabulary bundles, shared chunks and the required WASM asset
27
+ - async auto-loading API plus loaded-only sync helpers
28
+ - one small single-model API for counts, token IDs and byte offsets
27
29
  - generated tokenizer assets via `bun run fetch`
28
- - publish-ready browser `dist/` builds that bundle tokenizer assets, emit the required WASM files and include package metadata plus declarations
30
+ - publish-ready browser `dist/` builds that keep vocabularies outside the JavaScript entry, emit the required WASM files and include package metadata plus declarations
29
31
 
30
32
  ## Usage
31
33
 
32
34
  ```ts
33
- import countTokens from 'token-vocabs'
35
+ import tokenize from 'token-vocabs'
34
36
 
35
- console.dir(countTokens('mind goblin'))
37
+ console.dir(await tokenize('mind goblin', 'gpt'))
36
38
  ```
37
39
 
38
40
  ```ts
39
- import countTokens from 'token-vocabs'
41
+ import {count} from 'token-vocabs'
40
42
 
41
- console.dir(countTokens('mind goblin', {model: ['gpt', 'deepseek']}))
43
+ console.dir(await count(new TextEncoder().encode('mind goblin'), {model: 'gpt'}))
42
44
  ```
43
45
 
44
46
  ```ts
45
- import countTokens from 'token-vocabs'
47
+ import {load, tokenizeLoaded} from 'token-vocabs'
46
48
 
47
- console.dir(countTokens('mind goblin', 'gpt'))
48
- ```
49
-
50
- ```ts
51
- import {tokenize} from 'token-vocabs'
52
-
53
- console.dir(tokenize('mind goblin'))
49
+ await load(['gpt', 'deepseek'])
50
+ console.dir(tokenizeLoaded('mind goblin', 'gpt'))
54
51
  ```
55
52
 
56
53
  ## Example output
57
54
 
58
55
  ```ts
59
- countTokens('mind goblin')
60
- // {
61
- // gpt: 3,
62
- // gemma: 2,
63
- // qwen: 3,
64
- // kimi: 4,
65
- // deepseek: 4,
66
- // mimo: 3,
67
- // sdxl: 2,
68
- // glm: 3,
69
- // minimax: 3,
70
- // }
56
+ await count('mind goblin', 'gpt')
57
+ // 3
71
58
  ```
72
59
 
73
60
  ```ts
74
- tokenize('mind goblin')
61
+ await tokenize('mind goblin', 'gpt')
75
62
  // {
76
- // gpt: [77021, 18778, 4724],
77
- // gemma: [24447, 218798],
78
- // qwen: [36475, 338, 45491],
79
- // kimi: [66468, 970, 3145, 259],
80
- // deepseek: [60514, 807, 3778, 261],
81
- // mimo: [37724, 342, 47061],
82
- // sdxl: [2575, 26223],
83
- // glm: [37528, 342, 46771],
84
- // minimax: [68201, 113859, 259],
63
+ // offsets: [4, 8],
64
+ // tokens: [77021, 18778, 4724],
85
65
  // }
86
66
  ```
87
67
 
88
68
  ## API
89
69
 
90
- ### `countTokens(text, options?)`
70
+ ### `async count(textOrBytes, optionsOrModel)`
71
+
72
+ Returns the token count for exactly one model and loads the required vocabulary bundle on demand.
73
+
74
+ `Uint8Array` input is decoded as UTF-8.
75
+
76
+ ```ts
77
+ await count('mind goblin', 'sdxl')
78
+ await count('mind goblin', {model: 'gpt'})
79
+ await count(new TextEncoder().encode('mind goblin'), 'gpt')
80
+ ```
81
+
82
+ ### `countLoaded(textOrBytes, optionsOrModel)`
83
+
84
+ Synchronous count helper that uses the existing in-memory tokenizer state and throws if the requested vocabulary is not loaded yet.
85
+
86
+ This is useful after `await load()` or after a previous `await count()` / `await tokenize()` call has already loaded the model.
91
87
 
92
- Returns token counts.
88
+ ### `async tokenize(textOrBytes, optionsOrModel)`
89
+
90
+ Returns a `RawTokenizeResult` for exactly one model and loads the required vocabulary bundle on demand.
91
+
92
+ ```ts
93
+ await tokenize('mind goblin', 'gpt')
94
+ await tokenize('mind goblin', {model: 'gpt'})
95
+ ```
96
+
97
+ ### `tokenizeLoaded(textOrBytes, optionsOrModel)`
98
+
99
+ Synchronous tokenization helper that reuses already loaded vocabularies and throws if the requested model is not in memory yet.
100
+
101
+ The result shape is:
93
102
 
94
103
  ```ts
95
- countTokens('mind goblin')
96
- countTokens('mind goblin', 'sdxl')
97
- countTokens('mind goblin', {model: 'gpt'})
98
- countTokens('mind goblin', {model: ['gpt', 'deepseek']})
104
+ type RawTokenizeResult = {
105
+ offsets: number[]
106
+ tokens: number[]
107
+ processedInput?: string | Uint8Array
108
+ }
99
109
  ```
100
110
 
101
- ### `tokenize(text, options?)`
111
+ `offsets` omits the first token’s implicit `0` byte start to save one array slot.
102
112
 
103
- Returns token ID arrays with the same selection rules as `countTokens()`.
113
+ If a tokenizer normalizes or otherwise preprocesses the input, `processedInput` contains the effective tokenizer input. Its type matches the input kind – string in, string out; `Uint8Array` in, `Uint8Array` out.
114
+
115
+ If you need results for several models, call `count()` or `tokenize()` once per model and combine the results yourself.
116
+
117
+ ### `async load(modelSelection?)`
118
+
119
+ Preloads one or more model vocabularies into memory.
120
+
121
+ - `await load('gpt')` → resolves to `'gpt'`
122
+ - `await load(['gpt', 'deepseek'])` → resolves to `['gpt', 'deepseek']`
123
+ - `await load()` → loads every supported model and resolves to `modelIds`
124
+
125
+ ### `free(modelId?)`
126
+
127
+ Releases a loaded model from memory, or every loaded model if no argument is provided.
104
128
 
105
129
  ### `modelIds`
106
130
 
@@ -112,40 +136,35 @@ Exports model metadata, including the original upstream source URLs used by `bun
112
136
 
113
137
  ### `token-vocabs/browser`
114
138
 
115
- Lazy browser entry with the same `countTokens()` and `tokenize()` API, plus:
116
-
117
- - `loadModel(modelId)`
118
- - `loadModels(modelSelection?)`
119
- - `isModelLoaded(modelId)`
120
- - `getLoadedModelIds()`
139
+ Browser entry with the same `count()`, `countLoaded()`, `tokenize()`, `tokenizeLoaded()`, `load()` and `free()` API as the desktop entry.
121
140
 
122
- Load the required vocabularies first, then call the sync tokenization API.
141
+ It loads the `.bin` asset bundles via `fetch()`.
123
142
 
124
143
  ### `token-vocabs/browser/all`
125
144
 
126
- Eager browser entry that preloads every vocabulary and keeps the original “load once, tokenize immediately behavior.
145
+ Eager browser entry that runs `await load()` at module initialization time so `countLoaded()` and `tokenizeLoaded()` work immediately after import.
127
146
 
128
147
  ## Distribution layout
129
148
 
130
- The published browser package exposes `token-vocabs` and `token-vocabs/browser/all` as the eager entry backed by `all.js`, plus `token-vocabs/browser` as the lazy entry backed by `main.js`.
149
+ The published browser package exposes `token-vocabs` and `token-vocabs/browser` as the lazy entry backed by `main.js`, plus `token-vocabs/browser/all` as the eager entry backed by `all.js`.
131
150
 
132
151
  It also contains:
133
152
 
134
- - emitted chunk files under `vocabulary/` and `chunks/`, plus the required WASM asset
153
+ - one Brotli-compressed MessagePack asset bundle per model at the package root, shared chunks and the required WASM asset
135
154
  - `package.json`, `README.md`, `LICENSE` and declaration files so the folder can be published on its own
136
155
 
137
156
  Example lazy browser usage from the published package:
138
157
 
139
158
  ```ts
140
- import {countTokens, loadModels} from 'token-vocabs/browser'
159
+ import {countLoaded, load} from 'token-vocabs/browser'
141
160
 
142
- await loadModels(['gpt', 'deepseek'])
143
- console.dir(countTokens('mind goblin', {model: ['gpt', 'deepseek']}))
161
+ await load(['gpt', 'deepseek'])
162
+ console.dir(countLoaded('mind goblin', 'deepseek'))
144
163
  ```
145
164
 
146
165
  ## Notes
147
166
 
148
167
  - `sdxl` intentionally implements the shared CLIP BPE core used by SDXL without auto-adding BOS/EOS tokens.
149
168
  - GPT uses `tiktoken`’s built-in `o200k_base` implementation, but the upstream encoder payload is still fetched and converted to MessagePack for completeness.
150
- - Structured tokenizer payloads are emitted into generated modules as ASCII85-encoded `.msgpack.br` blobs and decompressed before use.
169
+ - Structured tokenizer payloads are stored inside per-model `.bin` bundles and decompressed after loading.
151
170
  - Tokenizer assets are large. That is inherent to exact offline tokenization.
package/all.d.ts CHANGED
@@ -1,3 +1,3 @@
1
- export {countTokens, getLoadedModelIds, isModelLoaded, loadModel, loadModels, modelIds, models, tokenize} from './main.js'
1
+ export {count, countLoaded, free, load, modelIds, models, tokenize, tokenizeLoaded} from './main.js'
2
2
  export {default} from './main.js'
3
- export type {CountTokensOptions, CountTokensResult, ModelId, ModelSelection, TokenizeResult} from './main.js'
3
+ export type {CountOptions, CountResult, CountTokensOptions, CountTokensResult, ModelId, ModelSelection, RawTokenizeResult, TokenizeInput, TokenizeOptions, TokenizeResult} from './main.js'
package/all.js CHANGED
@@ -1 +1 @@
1
- import{a as e,c as t,i as n,l as r,n as i,o as a,r as o,s,t as c,u as l}from"./chunks/main.js";import{t as u}from"./vocabulary/deepseek.js";import{t as d}from"./vocabulary/gemma.js";import{t as f}from"./vocabulary/glm.js";import{t as p}from"./vocabulary/gpt.js";import{t as m}from"./vocabulary/kimi.js";import{t as h}from"./vocabulary/mimo.js";import{t as g}from"./vocabulary/minimax.js";import{t as _}from"./vocabulary/qwen.js";import{t as v}from"./vocabulary/sdxl.js";const[y,b,x,S,C,w,T,E,D]=await Promise.all([s(u),s(d),s(f),s(p),s(m),s(h),s(g),s(_),s(v)]);t({deepseek:y,gemma:b,glm:x,gpt:S,kimi:C,mimo:w,minimax:T,qwen:E,sdxl:D});export{e as countTokens,e as default,c as getLoadedModelIds,i as isModelLoaded,o as loadModel,n as loadModels,r as modelIds,l as models,a as tokenize};
1
+ import{a as e,c as t,i as n,n as r,o as i,r as a,s as o,t as s}from"./chunks/main.js";await n();export{s as count,r as countLoaded,e as default,e as tokenize,a as free,n as load,o as modelIds,t as models,i as tokenizeLoaded};