token-vocabs 0.2.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -61
- package/all.d.ts +2 -2
- package/all.js +1 -1
- package/chunks/decompress.js +1 -1
- package/chunks/main.js +2 -2
- package/deepseek.bin +0 -0
- package/gemma.bin +0 -0
- package/glm.bin +0 -0
- package/gpt.bin +1 -0
- package/hy.bin +0 -0
- package/kimi.bin +0 -0
- package/lib/api.d.ts +24 -14
- package/lib/data.d.ts +4 -5
- package/lib/modelAssets/base/ModelAssetBundleLoader.d.ts +6 -0
- package/lib/models.d.ts +21 -1
- package/lib/tiktoken.d.ts +4 -0
- package/lib/tokenization.d.ts +38 -0
- package/lib/tokenizers/ClipTokenizer.d.ts +2 -0
- package/lib/tokenizers/HuggingFaceTokenizer.d.ts +10 -1
- package/lib/tokenizers/TiktokenTokenizer.d.ts +8 -3
- package/lib/tokenizers/base/BaseTokenizer.d.ts +7 -2
- package/lib/tokenizers/index.d.ts +6 -2
- package/main.d.ts +2 -9
- package/main.js +1 -1
- package/mimo.bin +0 -0
- package/minimax.bin +0 -0
- package/package.json +5 -5
- package/qwen.bin +0 -0
- package/sdxl.bin +0 -0
- package/step.bin +0 -0
- package/chunks/deepseek.js +0 -1
- package/chunks/gemma.js +0 -1
- package/chunks/glm.js +0 -1
- package/chunks/gpt.js +0 -1
- package/chunks/kimi.js +0 -1
- package/chunks/mimo.js +0 -1
- package/chunks/minimax.js +0 -1
- package/chunks/qwen.js +0 -1
- package/chunks/sdxl.js +0 -1
- package/lib/base85Decode.d.ts +0 -2
- package/lib/modelAssets.d.ts +0 -6
- package/vocabulary/deepseek.js +0 -1
- package/vocabulary/gemma.js +0 -1
- package/vocabulary/glm.js +0 -1
- package/vocabulary/gpt.js +0 -1
- package/vocabulary/kimi.js +0 -1
- package/vocabulary/mimo.js +0 -1
- package/vocabulary/minimax.js +0 -1
- package/vocabulary/qwen.js +0 -1
- package/vocabulary/sdxl.js +0 -1
- /package/chunks/{rolldown-runtime.js → chunk.js} +0 -0
package/README.md
CHANGED
|
@@ -13,94 +13,118 @@ Count tokens or inspect token IDs across several modern tokenizer families from
|
|
|
13
13
|
- Stable Diffusion XL
|
|
14
14
|
- GLM 5.1
|
|
15
15
|
- MiniMax M2.7
|
|
16
|
+
- Hy3 Preview
|
|
17
|
+
- Step 3.7 Flash
|
|
16
18
|
|
|
17
19
|
## Highlights
|
|
18
20
|
|
|
19
21
|
- offline at runtime once the vendored assets are present
|
|
20
22
|
- browser-friendly once bundled
|
|
21
23
|
- exact golden outputs for the core sample fixture
|
|
22
|
-
- Brotli-compressed MessagePack
|
|
24
|
+
- one Brotli-compressed MessagePack asset bundle per model
|
|
23
25
|
- browser Brotli decompression with a bundled JS fallback where native stream support is missing
|
|
24
|
-
- Rolldown browser builds that
|
|
25
|
-
-
|
|
26
|
-
- one
|
|
26
|
+
- Rolldown browser builds that emit binary vocabulary bundles, shared chunks and the required WASM asset
|
|
27
|
+
- async auto-loading API plus loaded-only sync helpers
|
|
28
|
+
- one small single-model API for counts, token IDs and byte offsets
|
|
27
29
|
- generated tokenizer assets via `bun run fetch`
|
|
28
|
-
- publish-ready browser `dist/` builds that
|
|
30
|
+
- publish-ready browser `dist/` builds that keep vocabularies outside the JavaScript entry, emit the required WASM files and include package metadata plus declarations
|
|
29
31
|
|
|
30
32
|
## Usage
|
|
31
33
|
|
|
32
34
|
```ts
|
|
33
|
-
import
|
|
35
|
+
import tokenize from 'token-vocabs'
|
|
34
36
|
|
|
35
|
-
console.dir(
|
|
37
|
+
console.dir(await tokenize('mind goblin', 'gpt'))
|
|
36
38
|
```
|
|
37
39
|
|
|
38
40
|
```ts
|
|
39
|
-
import
|
|
41
|
+
import {count} from 'token-vocabs'
|
|
40
42
|
|
|
41
|
-
console.dir(
|
|
43
|
+
console.dir(await count(new TextEncoder().encode('mind goblin'), {model: 'gpt'}))
|
|
42
44
|
```
|
|
43
45
|
|
|
44
46
|
```ts
|
|
45
|
-
import
|
|
47
|
+
import {load, tokenizeLoaded} from 'token-vocabs'
|
|
46
48
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
```ts
|
|
51
|
-
import {tokenize} from 'token-vocabs'
|
|
52
|
-
|
|
53
|
-
console.dir(tokenize('mind goblin'))
|
|
49
|
+
await load(['gpt', 'deepseek'])
|
|
50
|
+
console.dir(tokenizeLoaded('mind goblin', 'gpt'))
|
|
54
51
|
```
|
|
55
52
|
|
|
56
53
|
## Example output
|
|
57
54
|
|
|
58
55
|
```ts
|
|
59
|
-
|
|
60
|
-
//
|
|
61
|
-
// gpt: 3,
|
|
62
|
-
// gemma: 2,
|
|
63
|
-
// qwen: 3,
|
|
64
|
-
// kimi: 4,
|
|
65
|
-
// deepseek: 4,
|
|
66
|
-
// mimo: 3,
|
|
67
|
-
// sdxl: 2,
|
|
68
|
-
// glm: 3,
|
|
69
|
-
// minimax: 3,
|
|
70
|
-
// }
|
|
56
|
+
await count('mind goblin', 'gpt')
|
|
57
|
+
// 3
|
|
71
58
|
```
|
|
72
59
|
|
|
73
60
|
```ts
|
|
74
|
-
tokenize('mind goblin')
|
|
61
|
+
await tokenize('mind goblin', 'gpt')
|
|
75
62
|
// {
|
|
76
|
-
//
|
|
77
|
-
//
|
|
78
|
-
// qwen: [36475, 338, 45491],
|
|
79
|
-
// kimi: [66468, 970, 3145, 259],
|
|
80
|
-
// deepseek: [60514, 807, 3778, 261],
|
|
81
|
-
// mimo: [37724, 342, 47061],
|
|
82
|
-
// sdxl: [2575, 26223],
|
|
83
|
-
// glm: [37528, 342, 46771],
|
|
84
|
-
// minimax: [68201, 113859, 259],
|
|
63
|
+
// offsets: [4, 8],
|
|
64
|
+
// tokens: [77021, 18778, 4724],
|
|
85
65
|
// }
|
|
86
66
|
```
|
|
87
67
|
|
|
88
68
|
## API
|
|
89
69
|
|
|
90
|
-
### `
|
|
70
|
+
### `async count(textOrBytes, optionsOrModel)`
|
|
71
|
+
|
|
72
|
+
Returns the token count for exactly one model and loads the required vocabulary bundle on demand.
|
|
73
|
+
|
|
74
|
+
`Uint8Array` input is decoded as UTF-8.
|
|
75
|
+
|
|
76
|
+
```ts
|
|
77
|
+
await count('mind goblin', 'sdxl')
|
|
78
|
+
await count('mind goblin', {model: 'gpt'})
|
|
79
|
+
await count(new TextEncoder().encode('mind goblin'), 'gpt')
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### `countLoaded(textOrBytes, optionsOrModel)`
|
|
83
|
+
|
|
84
|
+
Synchronous count helper that uses the existing in-memory tokenizer state and throws if the requested vocabulary is not loaded yet.
|
|
85
|
+
|
|
86
|
+
This is useful after `await load()` or after a previous `await count()` / `await tokenize()` call has already loaded the model.
|
|
91
87
|
|
|
92
|
-
|
|
88
|
+
### `async tokenize(textOrBytes, optionsOrModel)`
|
|
89
|
+
|
|
90
|
+
Returns a `RawTokenizeResult` for exactly one model and loads the required vocabulary bundle on demand.
|
|
91
|
+
|
|
92
|
+
```ts
|
|
93
|
+
await tokenize('mind goblin', 'gpt')
|
|
94
|
+
await tokenize('mind goblin', {model: 'gpt'})
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### `tokenizeLoaded(textOrBytes, optionsOrModel)`
|
|
98
|
+
|
|
99
|
+
Synchronous tokenization helper that reuses already loaded vocabularies and throws if the requested model is not in memory yet.
|
|
100
|
+
|
|
101
|
+
The result shape is:
|
|
93
102
|
|
|
94
103
|
```ts
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
104
|
+
type RawTokenizeResult = {
|
|
105
|
+
offsets: number[]
|
|
106
|
+
tokens: number[]
|
|
107
|
+
processedInput?: string | Uint8Array
|
|
108
|
+
}
|
|
99
109
|
```
|
|
100
110
|
|
|
101
|
-
|
|
111
|
+
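`offsets` lists the byte start of every token except the first, whose start is always `0` and is therefore omitted to save one array slot. As a result, `offsets` has one entry fewer than `tokens`.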
|
|
102
112
|
|
|
103
|
-
|
|
113
|
+
If a tokenizer normalizes or otherwise preprocesses the input, `processedInput` contains the effective tokenizer input. Its type matches the input kind – string in, string out; `Uint8Array` in, `Uint8Array` out.
|
|
114
|
+
|
|
115
|
+
If you need results for several models, call `count()` or `tokenize()` once per model and combine the results yourself.
|
|
116
|
+
|
|
117
|
+
### `async load(modelSelection?)`
|
|
118
|
+
|
|
119
|
+
Preloads one or more model vocabularies into memory.
|
|
120
|
+
|
|
121
|
+
- `await load('gpt')` → resolves to `'gpt'`
|
|
122
|
+
- `await load(['gpt', 'deepseek'])` → resolves to `['gpt', 'deepseek']`
|
|
123
|
+
- `await load()` → loads every supported model and resolves to `modelIds`
|
|
124
|
+
|
|
125
|
+
### `free(modelId?)`
|
|
126
|
+
|
|
127
|
+
Releases a loaded model from memory, or every loaded model if no argument is provided.
|
|
104
128
|
|
|
105
129
|
### `modelIds`
|
|
106
130
|
|
|
@@ -112,40 +136,35 @@ Exports model metadata, including the original upstream source URLs used by `bun
|
|
|
112
136
|
|
|
113
137
|
### `token-vocabs/browser`
|
|
114
138
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
- `loadModel(modelId)`
|
|
118
|
-
- `loadModels(modelSelection?)`
|
|
119
|
-
- `isModelLoaded(modelId)`
|
|
120
|
-
- `getLoadedModelIds()`
|
|
139
|
+
Browser entry with the same `count()`, `countLoaded()`, `tokenize()`, `tokenizeLoaded()`, `load()` and `free()` API as the desktop entry.
|
|
121
140
|
|
|
122
|
-
|
|
141
|
+
It loads the `.bin` asset bundles via `fetch()`.
|
|
123
142
|
|
|
124
143
|
### `token-vocabs/browser/all`
|
|
125
144
|
|
|
126
|
-
Eager browser entry that
|
|
145
|
+
Eager browser entry that runs `await load()` at module initialization time so `countLoaded()` and `tokenizeLoaded()` work immediately after import.
|
|
127
146
|
|
|
128
147
|
## Distribution layout
|
|
129
148
|
|
|
130
|
-
The published browser package exposes `token-vocabs` and `token-vocabs/browser
|
|
149
|
+
The published browser package exposes both `token-vocabs` and `token-vocabs/browser` as the lazy entry backed by `main.js`, and `token-vocabs/browser/all` as the eager entry backed by `all.js`.
|
|
131
150
|
|
|
132
151
|
It also contains:
|
|
133
152
|
|
|
134
|
-
-
|
|
153
|
+
- one Brotli-compressed MessagePack asset bundle per model at the package root, shared chunks and the required WASM asset
|
|
135
154
|
- `package.json`, `README.md`, `LICENSE` and declaration files so the folder can be published on its own
|
|
136
155
|
|
|
137
156
|
Example lazy browser usage from the published package:
|
|
138
157
|
|
|
139
158
|
```ts
|
|
140
|
-
import {
|
|
159
|
+
import {countLoaded, load} from 'token-vocabs/browser'
|
|
141
160
|
|
|
142
|
-
await
|
|
143
|
-
console.dir(
|
|
161
|
+
await load(['gpt', 'deepseek'])
|
|
162
|
+
console.dir(countLoaded('mind goblin', 'deepseek'))
|
|
144
163
|
```
|
|
145
164
|
|
|
146
165
|
## Notes
|
|
147
166
|
|
|
148
167
|
- `sdxl` intentionally implements the shared CLIP BPE core used by SDXL without auto-adding BOS/EOS tokens.
|
|
149
168
|
- GPT uses `tiktoken`’s built-in `o200k_base` implementation, but the upstream encoder payload is still fetched and converted to MessagePack for completeness.
|
|
150
|
-
- Structured tokenizer payloads are
|
|
169
|
+
- Structured tokenizer payloads are stored inside per-model `.bin` bundles and decompressed after loading.
|
|
151
170
|
- Tokenizer assets are large. That is inherent to exact offline tokenization.
|
package/all.d.ts
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
export {
|
|
1
|
+
export {count, countLoaded, free, load, modelIds, models, tokenize, tokenizeLoaded} from './main.js'
|
|
2
2
|
export {default} from './main.js'
|
|
3
|
-
export type {CountTokensOptions, CountTokensResult, ModelId, ModelSelection, TokenizeResult} from './main.js'
|
|
3
|
+
export type {CountOptions, CountResult, CountTokensOptions, CountTokensResult, ModelId, ModelSelection, RawTokenizeResult, TokenizeInput, TokenizeOptions, TokenizeResult} from './main.js'
|
package/all.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
import{a as e,c as t,i as n,
|
|
1
|
+
import{a as e,c as t,i as n,n as r,o as i,r as a,s as o,t as s}from"./chunks/main.js";await n();export{s as count,r as countLoaded,e as default,e as tokenize,a as free,n as load,o as modelIds,t as models,i as tokenizeLoaded};
|