token-vocabs 0.0.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +151 -0
- package/all.d.ts +3 -0
- package/all.js +1 -0
- package/chunks/decompress.js +3 -0
- package/chunks/deepseek.js +1 -0
- package/chunks/gemma.js +1 -0
- package/chunks/glm.js +1 -0
- package/chunks/gpt.js +1 -0
- package/chunks/kimi.js +1 -0
- package/chunks/main.js +2 -0
- package/chunks/mimo.js +1 -0
- package/chunks/minimax.js +1 -0
- package/chunks/qwen.js +1 -0
- package/chunks/rolldown-runtime.js +1 -0
- package/chunks/sdxl.js +1 -0
- package/lib/api.d.ts +19 -0
- package/lib/base85Decode.d.ts +2 -0
- package/lib/data.d.ts +9 -0
- package/lib/decompressBrotli.d.ts +2 -0
- package/lib/modelAssets.d.ts +6 -0
- package/lib/modelSelection.d.ts +4 -0
- package/lib/models.d.ts +124 -0
- package/lib/structuredData.d.ts +2 -0
- package/lib/tokenizers/ClipTokenizer.d.ts +23 -0
- package/lib/tokenizers/HuggingFaceTokenizer.d.ts +20 -0
- package/lib/tokenizers/TiktokenTokenizer.d.ts +15 -0
- package/lib/tokenizers/base/BaseTokenizer.d.ts +8 -0
- package/lib/tokenizers/index.d.ts +7 -0
- package/main.d.ts +10 -0
- package/main.js +1 -0
- package/package.json +47 -1
- package/tiktoken_bg.wasm +0 -0
- package/vocabulary/deepseek.js +1 -0
- package/vocabulary/gemma.js +1 -0
- package/vocabulary/glm.js +1 -0
- package/vocabulary/gpt.js +1 -0
- package/vocabulary/kimi.js +1 -0
- package/vocabulary/mimo.js +1 -0
- package/vocabulary/minimax.js +1 -0
- package/vocabulary/qwen.js +1 -0
- package/vocabulary/sdxl.js +1 -0
- package/index.d.ts +0 -6
- package/index.js +0 -1
- package/readme.md +0 -1
- /package/{license.txt → LICENSE} +0 -0
|
package/lib/modelAssets.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export type EncodedModelAssetFiles = Record<string, string>;
|
|
2
|
+
export type ModelAssetFileContent = Uint8Array | string;
|
|
3
|
+
export type ModelAssetFiles = Record<string, ModelAssetFileContent>;
|
|
4
|
+
export declare const isCompressedMsgpackFile: (fileName: string) => boolean;
|
|
5
|
+
export declare const normalizeModelAssetFileName: (fileName: string) => string;
|
|
6
|
+
export declare const prepareEncodedModelAssets: (files: EncodedModelAssetFiles) => Promise<ModelAssetFiles>;
|
package/lib/models.d.ts
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
export type ModelId = 'deepseek' | 'gemma' | 'glm' | 'gpt' | 'kimi' | 'mimo' | 'minimax' | 'qwen' | 'sdxl';
|
|
2
|
+
export type ModelDefinition = BuiltinTiktokenModelDefinition | ClipBpeModelDefinition | CustomTiktokenModelDefinition | HuggingFaceModelDefinition;
|
|
3
|
+
type BaseModelDefinition = {
|
|
4
|
+
openrouter?: string;
|
|
5
|
+
title: string;
|
|
6
|
+
};
|
|
7
|
+
type BuiltinTiktokenModelDefinition = BaseModelDefinition & {
|
|
8
|
+
encoding: 'o200k_base';
|
|
9
|
+
kind: 'tiktoken-builtin';
|
|
10
|
+
source: {
|
|
11
|
+
encodingJsonUrl: string;
|
|
12
|
+
};
|
|
13
|
+
};
|
|
14
|
+
type CustomTiktokenModelDefinition = BaseModelDefinition & {
|
|
15
|
+
kind: 'tiktoken-custom';
|
|
16
|
+
source: {
|
|
17
|
+
modelUrl: string;
|
|
18
|
+
tokenizerConfigUrl: string;
|
|
19
|
+
tokenizerImplementationUrl: string;
|
|
20
|
+
};
|
|
21
|
+
};
|
|
22
|
+
type HuggingFaceModelDefinition = BaseModelDefinition & {
|
|
23
|
+
kind: 'huggingface';
|
|
24
|
+
source: {
|
|
25
|
+
tokenizerConfigUrl: string;
|
|
26
|
+
tokenizerJsonUrl: string;
|
|
27
|
+
};
|
|
28
|
+
};
|
|
29
|
+
type ClipBpeModelDefinition = BaseModelDefinition & {
|
|
30
|
+
kind: 'clip-bpe';
|
|
31
|
+
source: {
|
|
32
|
+
mergesUrl: string;
|
|
33
|
+
specialTokensMapUrl?: string;
|
|
34
|
+
tokenizerConfigUrl: string;
|
|
35
|
+
vocabUrl: string;
|
|
36
|
+
};
|
|
37
|
+
};
|
|
38
|
+
export declare const models: {
|
|
39
|
+
readonly gpt: {
|
|
40
|
+
readonly encoding: "o200k_base";
|
|
41
|
+
readonly kind: "tiktoken-builtin";
|
|
42
|
+
readonly openrouter: "openai/gpt-5.5";
|
|
43
|
+
readonly source: {
|
|
44
|
+
readonly encodingJsonUrl: "https://tiktoken.pages.dev/js/o200k_base.json";
|
|
45
|
+
};
|
|
46
|
+
readonly title: "GPT-5.5";
|
|
47
|
+
};
|
|
48
|
+
readonly gemma: {
|
|
49
|
+
readonly kind: "huggingface";
|
|
50
|
+
readonly openrouter: "google/gemma-4-31b-it";
|
|
51
|
+
readonly source: {
|
|
52
|
+
readonly tokenizerConfigUrl: "https://huggingface.co/google/gemma-4-31B-it/resolve/main/tokenizer_config.json";
|
|
53
|
+
readonly tokenizerJsonUrl: "https://huggingface.co/google/gemma-4-31B-it/resolve/main/tokenizer.json";
|
|
54
|
+
};
|
|
55
|
+
readonly title: "Gemma 4 31B it";
|
|
56
|
+
};
|
|
57
|
+
readonly qwen: {
|
|
58
|
+
readonly kind: "huggingface";
|
|
59
|
+
readonly openrouter: "qwen/qwen3.6-27b";
|
|
60
|
+
readonly source: {
|
|
61
|
+
readonly tokenizerConfigUrl: "https://huggingface.co/Qwen/Qwen3.6-27B/resolve/main/tokenizer_config.json";
|
|
62
|
+
readonly tokenizerJsonUrl: "https://huggingface.co/Qwen/Qwen3.6-27B/resolve/main/tokenizer.json";
|
|
63
|
+
};
|
|
64
|
+
readonly title: "Qwen 3.6 27B";
|
|
65
|
+
};
|
|
66
|
+
readonly kimi: {
|
|
67
|
+
readonly kind: "tiktoken-custom";
|
|
68
|
+
readonly openrouter: "moonshotai/kimi-k2.6";
|
|
69
|
+
readonly source: {
|
|
70
|
+
readonly modelUrl: "https://huggingface.co/moonshotai/Kimi-K2.6/resolve/main/tiktoken.model";
|
|
71
|
+
readonly tokenizerConfigUrl: "https://huggingface.co/moonshotai/Kimi-K2.6/resolve/main/tokenizer_config.json";
|
|
72
|
+
readonly tokenizerImplementationUrl: "https://huggingface.co/moonshotai/Kimi-K2.6/resolve/main/tokenization_kimi.py";
|
|
73
|
+
};
|
|
74
|
+
readonly title: "Kimi K2.6";
|
|
75
|
+
};
|
|
76
|
+
readonly deepseek: {
|
|
77
|
+
readonly kind: "huggingface";
|
|
78
|
+
readonly openrouter: "deepseek/deepseek-v4-pro";
|
|
79
|
+
readonly source: {
|
|
80
|
+
readonly tokenizerConfigUrl: "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/tokenizer_config.json";
|
|
81
|
+
readonly tokenizerJsonUrl: "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/tokenizer.json";
|
|
82
|
+
};
|
|
83
|
+
readonly title: "DeepSeek V4 Pro";
|
|
84
|
+
};
|
|
85
|
+
readonly mimo: {
|
|
86
|
+
readonly kind: "huggingface";
|
|
87
|
+
readonly openrouter: "xiaomi/mimo-v2.5-pro";
|
|
88
|
+
readonly source: {
|
|
89
|
+
readonly tokenizerConfigUrl: "https://huggingface.co/XiaomiMiMo/MiMo-V2.5-Pro/resolve/main/tokenizer_config.json";
|
|
90
|
+
readonly tokenizerJsonUrl: "https://huggingface.co/XiaomiMiMo/MiMo-V2.5-Pro/resolve/main/tokenizer.json";
|
|
91
|
+
};
|
|
92
|
+
readonly title: "MiMo V2.5 Pro";
|
|
93
|
+
};
|
|
94
|
+
readonly sdxl: {
|
|
95
|
+
readonly kind: "clip-bpe";
|
|
96
|
+
readonly source: {
|
|
97
|
+
readonly mergesUrl: "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/tokenizer_2/merges.txt";
|
|
98
|
+
readonly specialTokensMapUrl: "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/tokenizer_2/special_tokens_map.json";
|
|
99
|
+
readonly tokenizerConfigUrl: "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/tokenizer_2/tokenizer_config.json";
|
|
100
|
+
readonly vocabUrl: "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/tokenizer_2/vocab.json";
|
|
101
|
+
};
|
|
102
|
+
readonly title: "Stable Diffusion XL";
|
|
103
|
+
};
|
|
104
|
+
readonly glm: {
|
|
105
|
+
readonly kind: "huggingface";
|
|
106
|
+
readonly openrouter: "zai-org/glm-5.1";
|
|
107
|
+
readonly source: {
|
|
108
|
+
readonly tokenizerConfigUrl: "https://huggingface.co/zai-org/GLM-5.1/resolve/main/tokenizer_config.json";
|
|
109
|
+
readonly tokenizerJsonUrl: "https://huggingface.co/zai-org/GLM-5.1/resolve/main/tokenizer.json";
|
|
110
|
+
};
|
|
111
|
+
readonly title: "GLM 5.1";
|
|
112
|
+
};
|
|
113
|
+
readonly minimax: {
|
|
114
|
+
readonly kind: "huggingface";
|
|
115
|
+
readonly openrouter: "minimax/minimax-m2.7";
|
|
116
|
+
readonly source: {
|
|
117
|
+
readonly tokenizerConfigUrl: "https://huggingface.co/MiniMaxAI/MiniMax-M2.7/resolve/main/tokenizer_config.json";
|
|
118
|
+
readonly tokenizerJsonUrl: "https://huggingface.co/MiniMaxAI/MiniMax-M2.7/resolve/main/tokenizer.json";
|
|
119
|
+
};
|
|
120
|
+
readonly title: "MiniMax M2.7";
|
|
121
|
+
};
|
|
122
|
+
};
|
|
123
|
+
export declare const modelIds: Array<ModelId>;
|
|
124
|
+
export {};
|
package/lib/tokenizers/ClipTokenizer.d.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { ModelId } from '../models.js';
|
|
2
|
+
import { BaseTokenizer } from './base/BaseTokenizer.js';
|
|
3
|
+
type ClipTokenizerState = {
|
|
4
|
+
byteEncoder: Array<string>;
|
|
5
|
+
mergeRanks: Map<string, number>;
|
|
6
|
+
specialTokenIds: Map<string, number>;
|
|
7
|
+
unknownTokenId: number;
|
|
8
|
+
vocabulary: Map<string, number>;
|
|
9
|
+
};
|
|
10
|
+
export declare class ClipTokenizer extends BaseTokenizer<ClipTokenizerState> {
|
|
11
|
+
#private;
|
|
12
|
+
readonly modelId: ModelId;
|
|
13
|
+
constructor(modelId: ModelId);
|
|
14
|
+
protected createState(): {
|
|
15
|
+
byteEncoder: string[];
|
|
16
|
+
mergeRanks: Map<string, number>;
|
|
17
|
+
specialTokenIds: Map<string, number>;
|
|
18
|
+
unknownTokenId: number;
|
|
19
|
+
vocabulary: Map<string, number>;
|
|
20
|
+
};
|
|
21
|
+
protected encodeWithState(text: string, state: ClipTokenizerState): number[];
|
|
22
|
+
}
|
|
23
|
+
export {};
|
package/lib/tokenizers/HuggingFaceTokenizer.d.ts
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { ModelId } from '../models.js';
|
|
2
|
+
import { BaseTokenizer } from './base/BaseTokenizer.js';
|
|
3
|
+
type HfEncodeResult = {
|
|
4
|
+
ids: Array<number>;
|
|
5
|
+
};
|
|
6
|
+
type HfTokenizer = {
|
|
7
|
+
encode: (text: string) => HfEncodeResult;
|
|
8
|
+
};
|
|
9
|
+
type TokenizerState = {
|
|
10
|
+
tokenizer: HfTokenizer;
|
|
11
|
+
};
|
|
12
|
+
export declare class HuggingFaceTokenizer extends BaseTokenizer<TokenizerState> {
|
|
13
|
+
readonly modelId: ModelId;
|
|
14
|
+
constructor(modelId: ModelId);
|
|
15
|
+
protected createState(): {
|
|
16
|
+
tokenizer: HfTokenizer;
|
|
17
|
+
};
|
|
18
|
+
protected encodeWithState(text: string, state: TokenizerState): number[];
|
|
19
|
+
}
|
|
20
|
+
export {};
|
package/lib/tokenizers/TiktokenTokenizer.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { ModelId } from '../models.js';
|
|
2
|
+
import { Tiktoken } from 'tiktoken';
|
|
3
|
+
import { BaseTokenizer } from './base/BaseTokenizer.js';
|
|
4
|
+
export declare class BuiltinTiktokenTokenizer extends BaseTokenizer<Tiktoken> {
|
|
5
|
+
readonly modelId: ModelId;
|
|
6
|
+
constructor(modelId: ModelId);
|
|
7
|
+
protected createState(): Tiktoken;
|
|
8
|
+
protected encodeWithState(text: string, state: Tiktoken): number[];
|
|
9
|
+
}
|
|
10
|
+
export declare class CustomTiktokenTokenizer extends BaseTokenizer<Tiktoken> {
|
|
11
|
+
readonly modelId: ModelId;
|
|
12
|
+
constructor(modelId: ModelId);
|
|
13
|
+
protected createState(): Tiktoken;
|
|
14
|
+
protected encodeWithState(text: string, state: Tiktoken): number[];
|
|
15
|
+
}
|
package/lib/tokenizers/base/BaseTokenizer.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export declare abstract class BaseTokenizer<TState> {
|
|
2
|
+
#private;
|
|
3
|
+
protected abstract createState(): TState;
|
|
4
|
+
encode(text: string): number[];
|
|
5
|
+
protected abstract encodeWithState(text: string, state: TState): Array<number>;
|
|
6
|
+
protected getState(): TState;
|
|
7
|
+
getTokenCount(text: string): number;
|
|
8
|
+
}
|
package/main.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type {ModelId, ModelSelection} from './lib/api.js'
|
|
2
|
+
|
|
3
|
+
export {countTokens, modelIds, models, tokenize} from './lib/api.js'
|
|
4
|
+
export {default} from './lib/api.js'
|
|
5
|
+
export type {CountTokensOptions, CountTokensResult, ModelId, ModelSelection, TokenizeResult} from './lib/api.js'
|
|
6
|
+
|
|
7
|
+
export declare const isModelLoaded: (modelId: ModelId) => boolean
|
|
8
|
+
export declare const getLoadedModelIds: () => Array<ModelId>
|
|
9
|
+
export declare const loadModel: (modelId: ModelId) => Promise<ModelId>
|
|
10
|
+
export declare const loadModels: (model?: ModelSelection) => Promise<Array<ModelId>>
|
package/main.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
import{a as e,i as t,l as n,n as r,o as i,r as a,t as o,u as s}from"./chunks/main.js";export{e as countTokens,e as default,o as getLoadedModelIds,r as isModelLoaded,a as loadModel,t as loadModels,n as modelIds,s as models,i as tokenize};
|
package/package.json
CHANGED
|
@@ -1 +1,47 @@
|
|
|
1
|
-
{
|
|
1
|
+
{
|
|
2
|
+
"name": "token-vocabs",
|
|
3
|
+
"version": "0.2.2",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Count and inspect token IDs across several modern tokenizer families offline.",
|
|
6
|
+
"keywords": [
|
|
7
|
+
"bpe",
|
|
8
|
+
"count-tokens",
|
|
9
|
+
"deepseek",
|
|
10
|
+
"gemma",
|
|
11
|
+
"gpt",
|
|
12
|
+
"kimi",
|
|
13
|
+
"llm",
|
|
14
|
+
"minimax",
|
|
15
|
+
"qwen",
|
|
16
|
+
"sdxl",
|
|
17
|
+
"tokenizer"
|
|
18
|
+
],
|
|
19
|
+
"funding": "https://github.com/sponsors/Jaid",
|
|
20
|
+
"repository": {
|
|
21
|
+
"type": "git",
|
|
22
|
+
"url": "git+https://github.com/Jaid/token-vocabs.git"
|
|
23
|
+
},
|
|
24
|
+
"homepage": "https://github.com/Jaid/token-vocabs#readme",
|
|
25
|
+
"bugs": {
|
|
26
|
+
"url": "https://github.com/Jaid/token-vocabs/issues"
|
|
27
|
+
},
|
|
28
|
+
"license": "MIT",
|
|
29
|
+
"exports": {
|
|
30
|
+
".": {
|
|
31
|
+
"types": "./all.d.ts",
|
|
32
|
+
"import": "./all.js",
|
|
33
|
+
"default": "./all.js"
|
|
34
|
+
},
|
|
35
|
+
"./browser": {
|
|
36
|
+
"types": "./main.d.ts",
|
|
37
|
+
"import": "./main.js",
|
|
38
|
+
"default": "./main.js"
|
|
39
|
+
},
|
|
40
|
+
"./browser/all": {
|
|
41
|
+
"types": "./all.d.ts",
|
|
42
|
+
"import": "./all.js",
|
|
43
|
+
"default": "./all.js"
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
"types": "./all.d.ts"
|
|
47
|
+
}
|
package/tiktoken_bg.wasm
ADDED
|
Binary file
|