@nanocollective/nanocoder 1.15.1 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -1
- package/dist/ai-sdk-client.d.ts +5 -0
- package/dist/ai-sdk-client.d.ts.map +1 -1
- package/dist/ai-sdk-client.js +21 -3
- package/dist/ai-sdk-client.js.map +1 -1
- package/dist/app/utils/appUtils.d.ts.map +1 -1
- package/dist/app/utils/appUtils.js +5 -3
- package/dist/app/utils/appUtils.js.map +1 -1
- package/dist/commands/index.d.ts +1 -0
- package/dist/commands/index.d.ts.map +1 -1
- package/dist/commands/index.js +1 -0
- package/dist/commands/index.js.map +1 -1
- package/dist/commands/usage.d.ts +7 -0
- package/dist/commands/usage.d.ts.map +1 -0
- package/dist/commands/usage.js +69 -0
- package/dist/commands/usage.js.map +1 -0
- package/dist/commands.d.ts +3 -2
- package/dist/commands.d.ts.map +1 -1
- package/dist/commands.js.map +1 -1
- package/dist/components/usage/progress-bar.d.ts +14 -0
- package/dist/components/usage/progress-bar.d.ts.map +1 -0
- package/dist/components/usage/progress-bar.js +14 -0
- package/dist/components/usage/progress-bar.js.map +1 -0
- package/dist/components/usage/usage-display.d.ts +18 -0
- package/dist/components/usage/usage-display.d.ts.map +1 -0
- package/dist/components/usage/usage-display.js +50 -0
- package/dist/components/usage/usage-display.js.map +1 -0
- package/dist/config/paths.d.ts +0 -4
- package/dist/config/paths.d.ts.map +1 -1
- package/dist/config/paths.js +2 -18
- package/dist/config/paths.js.map +1 -1
- package/dist/hooks/useAppInitialization.d.ts.map +1 -1
- package/dist/hooks/useAppInitialization.js +2 -1
- package/dist/hooks/useAppInitialization.js.map +1 -1
- package/dist/hooks/useAppState.d.ts +2 -0
- package/dist/hooks/useAppState.d.ts.map +1 -1
- package/dist/hooks/useAppState.js +27 -5
- package/dist/hooks/useAppState.js.map +1 -1
- package/dist/hooks/useChatHandler.d.ts.map +1 -1
- package/dist/hooks/useChatHandler.js +45 -9
- package/dist/hooks/useChatHandler.js.map +1 -1
- package/dist/models/index.d.ts +2 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +2 -0
- package/dist/models/index.js.map +1 -0
- package/dist/models/models-cache.d.ts +8 -0
- package/dist/models/models-cache.d.ts.map +1 -0
- package/dist/models/models-cache.js +63 -0
- package/dist/models/models-cache.js.map +1 -0
- package/dist/models/models-dev-client.d.ts +10 -0
- package/dist/models/models-dev-client.d.ts.map +1 -0
- package/dist/models/models-dev-client.js +268 -0
- package/dist/models/models-dev-client.js.map +1 -0
- package/dist/models/models-types.d.ts +66 -0
- package/dist/models/models-types.d.ts.map +1 -0
- package/dist/models/models-types.js +5 -0
- package/dist/models/models-types.js.map +1 -0
- package/dist/recommendations/model-database.d.ts.map +1 -1
- package/dist/recommendations/model-database.js +45 -0
- package/dist/recommendations/model-database.js.map +1 -1
- package/dist/tokenization/index.d.ts +2 -0
- package/dist/tokenization/index.d.ts.map +1 -0
- package/dist/tokenization/index.js +2 -0
- package/dist/tokenization/index.js.map +1 -0
- package/dist/tokenization/tokenizer-factory.d.ts +14 -0
- package/dist/tokenization/tokenizer-factory.d.ts.map +1 -0
- package/dist/tokenization/tokenizer-factory.js +90 -0
- package/dist/tokenization/tokenizer-factory.js.map +1 -0
- package/dist/tokenization/tokenizer-factory.spec.d.ts +5 -0
- package/dist/tokenization/tokenizer-factory.spec.d.ts.map +1 -0
- package/dist/tokenization/tokenizer-factory.spec.js +137 -0
- package/dist/tokenization/tokenizer-factory.spec.js.map +1 -0
- package/dist/tokenization/tokenizers/anthropic-tokenizer.d.ts +17 -0
- package/dist/tokenization/tokenizers/anthropic-tokenizer.d.ts.map +1 -0
- package/dist/tokenization/tokenizers/anthropic-tokenizer.js +35 -0
- package/dist/tokenization/tokenizers/anthropic-tokenizer.js.map +1 -0
- package/dist/tokenization/tokenizers/anthropic-tokenizer.spec.d.ts +5 -0
- package/dist/tokenization/tokenizers/anthropic-tokenizer.spec.d.ts.map +1 -0
- package/dist/tokenization/tokenizers/anthropic-tokenizer.spec.js +152 -0
- package/dist/tokenization/tokenizers/anthropic-tokenizer.spec.js.map +1 -0
- package/dist/tokenization/tokenizers/fallback-tokenizer.d.ts +13 -0
- package/dist/tokenization/tokenizers/fallback-tokenizer.d.ts.map +1 -0
- package/dist/tokenization/tokenizers/fallback-tokenizer.js +20 -0
- package/dist/tokenization/tokenizers/fallback-tokenizer.js.map +1 -0
- package/dist/tokenization/tokenizers/fallback-tokenizer.spec.d.ts +5 -0
- package/dist/tokenization/tokenizers/fallback-tokenizer.spec.d.ts.map +1 -0
- package/dist/tokenization/tokenizers/fallback-tokenizer.spec.js +183 -0
- package/dist/tokenization/tokenizers/fallback-tokenizer.spec.js.map +1 -0
- package/dist/tokenization/tokenizers/llama-tokenizer.d.ts +14 -0
- package/dist/tokenization/tokenizers/llama-tokenizer.d.ts.map +1 -0
- package/dist/tokenization/tokenizers/llama-tokenizer.js +33 -0
- package/dist/tokenization/tokenizers/llama-tokenizer.js.map +1 -0
- package/dist/tokenization/tokenizers/llama-tokenizer.spec.d.ts +5 -0
- package/dist/tokenization/tokenizers/llama-tokenizer.spec.d.ts.map +1 -0
- package/dist/tokenization/tokenizers/llama-tokenizer.spec.js +170 -0
- package/dist/tokenization/tokenizers/llama-tokenizer.spec.js.map +1 -0
- package/dist/tokenization/tokenizers/openai-tokenizer.d.ts +22 -0
- package/dist/tokenization/tokenizers/openai-tokenizer.d.ts.map +1 -0
- package/dist/tokenization/tokenizers/openai-tokenizer.js +48 -0
- package/dist/tokenization/tokenizers/openai-tokenizer.js.map +1 -0
- package/dist/tokenization/tokenizers/openai-tokenizer.spec.d.ts +5 -0
- package/dist/tokenization/tokenizers/openai-tokenizer.spec.d.ts.map +1 -0
- package/dist/tokenization/tokenizers/openai-tokenizer.spec.js +140 -0
- package/dist/tokenization/tokenizers/openai-tokenizer.spec.js.map +1 -0
- package/dist/types/commands.d.ts +1 -0
- package/dist/types/commands.d.ts.map +1 -1
- package/dist/types/tokenization.d.ts +31 -0
- package/dist/types/tokenization.d.ts.map +1 -0
- package/dist/types/tokenization.js +5 -0
- package/dist/types/tokenization.js.map +1 -0
- package/dist/types/usage.d.ts +55 -0
- package/dist/types/usage.d.ts.map +1 -0
- package/dist/types/usage.js +2 -0
- package/dist/types/usage.js.map +1 -0
- package/dist/usage/calculator.d.ts +28 -0
- package/dist/usage/calculator.d.ts.map +1 -0
- package/dist/usage/calculator.js +81 -0
- package/dist/usage/calculator.js.map +1 -0
- package/dist/usage/calculator.spec.d.ts +2 -0
- package/dist/usage/calculator.spec.d.ts.map +1 -0
- package/dist/usage/calculator.spec.js +303 -0
- package/dist/usage/calculator.spec.js.map +1 -0
- package/dist/usage/storage.d.ts +19 -0
- package/dist/usage/storage.d.ts.map +1 -0
- package/dist/usage/storage.js +134 -0
- package/dist/usage/storage.js.map +1 -0
- package/dist/usage/storage.spec.d.ts +2 -0
- package/dist/usage/storage.spec.d.ts.map +1 -0
- package/dist/usage/storage.spec.js +417 -0
- package/dist/usage/storage.spec.js.map +1 -0
- package/dist/usage/tracker.d.ts +28 -0
- package/dist/usage/tracker.d.ts.map +1 -0
- package/dist/usage/tracker.js +75 -0
- package/dist/usage/tracker.js.map +1 -0
- package/dist/usage/tracker.spec.d.ts +2 -0
- package/dist/usage/tracker.spec.d.ts.map +1 -0
- package/dist/usage/tracker.spec.js +347 -0
- package/dist/usage/tracker.spec.js.map +1 -0
- package/dist/utils/paste-roundtrip.spec.d.ts.map +1 -0
- package/dist/utils/paste-roundtrip.spec.js.map +1 -0
- package/package.json +7 -2
- package/dist/integration/paste-roundtrip.spec.d.ts.map +0 -1
- package/dist/integration/paste-roundtrip.spec.js.map +0 -1
- /package/dist/{integration → utils}/paste-roundtrip.spec.d.ts +0 -0
- /package/dist/{integration → utils}/paste-roundtrip.spec.js +0 -0

package/dist/tokenization/tokenizer-factory.spec.js
@@ -0,0 +1,137 @@
+/**
+ * Tests for tokenizer-factory.ts
+ */
+import test from 'ava';
+import { createTokenizer, createTokenizerForProvider, } from './tokenizer-factory.js';
+import { OpenAITokenizer } from './tokenizers/openai-tokenizer.js';
+import { AnthropicTokenizer } from './tokenizers/anthropic-tokenizer.js';
+import { LlamaTokenizer } from './tokenizers/llama-tokenizer.js';
+import { FallbackTokenizer } from './tokenizers/fallback-tokenizer.js';
+console.log(`\ntokenizer-factory.spec.ts`);
+// Test createTokenizer with OpenAI provider detection
+test('createTokenizer detects OpenAI from provider name', t => {
+    const tokenizer = createTokenizer('openai', 'custom-model');
+    t.true(tokenizer instanceof OpenAITokenizer);
+    t.is(tokenizer.getName(), 'openai-custom-model');
+});
+test('createTokenizer detects OpenAI from GPT model name', t => {
+    const tokenizer = createTokenizer('custom', 'gpt-4-turbo');
+    t.true(tokenizer instanceof OpenAITokenizer);
+});
+test('createTokenizer detects OpenAI from model name with openai keyword', t => {
+    const tokenizer = createTokenizer('custom', 'openai-model');
+    t.true(tokenizer instanceof OpenAITokenizer);
+});
+// Test createTokenizer with Anthropic provider detection
+test('createTokenizer detects Anthropic from provider name', t => {
+    const tokenizer = createTokenizer('anthropic', 'custom-model');
+    t.true(tokenizer instanceof AnthropicTokenizer);
+});
+test('createTokenizer detects Anthropic from provider name with claude keyword', t => {
+    const tokenizer = createTokenizer('claude-provider', 'model');
+    t.true(tokenizer instanceof AnthropicTokenizer);
+});
+test('createTokenizer detects Anthropic from claude model name', t => {
+    const tokenizer = createTokenizer('custom', 'claude-3-opus');
+    t.true(tokenizer instanceof AnthropicTokenizer);
+});
+// Test createTokenizer with Llama provider detection
+test('createTokenizer detects Llama from llama model name', t => {
+    const tokenizer = createTokenizer('custom', 'llama-3-8b');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from mistral model name', t => {
+    const tokenizer = createTokenizer('custom', 'mistral-7b');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from qwen model name', t => {
+    const tokenizer = createTokenizer('custom', 'qwen-2.5');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from gemma model name', t => {
+    const tokenizer = createTokenizer('custom', 'gemma-2b');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from phi model name', t => {
+    const tokenizer = createTokenizer('custom', 'phi-3');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from codellama model name', t => {
+    const tokenizer = createTokenizer('custom', 'codellama-7b');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from deepseek model name', t => {
+    const tokenizer = createTokenizer('custom', 'deepseek-coder');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from mixtral model name', t => {
+    const tokenizer = createTokenizer('custom', 'mixtral-8x7b');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from ollama provider', t => {
+    const tokenizer = createTokenizer('ollama', 'custom-model');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from llama.cpp provider', t => {
+    const tokenizer = createTokenizer('llama.cpp', 'custom-model');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizer detects Llama from local provider', t => {
+    const tokenizer = createTokenizer('local', 'custom-model');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+// Test createTokenizer with fallback
+test('createTokenizer returns FallbackTokenizer for unknown provider', t => {
+    const tokenizer = createTokenizer('unknown', 'unknown-model');
+    t.true(tokenizer instanceof FallbackTokenizer);
+});
+// Test cloud suffix stripping
+test('createTokenizer strips :cloud suffix from model name', t => {
+    const tokenizer = createTokenizer('ollama', 'llama-3:cloud');
+    t.true(tokenizer instanceof LlamaTokenizer);
+    t.is(tokenizer.getName(), 'llama-llama-3');
+});
+test('createTokenizer strips -cloud suffix from model name', t => {
+    const tokenizer = createTokenizer('ollama', 'llama-3-cloud');
+    t.true(tokenizer instanceof LlamaTokenizer);
+    t.is(tokenizer.getName(), 'llama-llama-3');
+});
+// Test case insensitivity
+test('createTokenizer is case insensitive for provider detection', t => {
+    const tokenizer = createTokenizer('OPENAI', 'model');
+    t.true(tokenizer instanceof OpenAITokenizer);
+});
+test('createTokenizer is case insensitive for model detection', t => {
+    const tokenizer = createTokenizer('custom', 'GPT-4-TURBO');
+    t.true(tokenizer instanceof OpenAITokenizer);
+});
+// Test createTokenizerForProvider
+test('createTokenizerForProvider creates OpenAI tokenizer', t => {
+    const tokenizer = createTokenizerForProvider('openai', 'gpt-4');
+    t.true(tokenizer instanceof OpenAITokenizer);
+});
+test('createTokenizerForProvider creates Anthropic tokenizer', t => {
+    const tokenizer = createTokenizerForProvider('anthropic', 'claude-3');
+    t.true(tokenizer instanceof AnthropicTokenizer);
+});
+test('createTokenizerForProvider creates Llama tokenizer', t => {
+    const tokenizer = createTokenizerForProvider('llama', 'llama-3');
+    t.true(tokenizer instanceof LlamaTokenizer);
+});
+test('createTokenizerForProvider creates FallbackTokenizer', t => {
+    const tokenizer = createTokenizerForProvider('fallback');
+    t.true(tokenizer instanceof FallbackTokenizer);
+});
+test('createTokenizerForProvider handles auto mode with model ID', t => {
+    const tokenizer = createTokenizerForProvider('auto', 'gpt-4');
+    t.true(tokenizer instanceof OpenAITokenizer);
+});
+test('createTokenizerForProvider handles auto mode without model ID', t => {
+    const tokenizer = createTokenizerForProvider('auto');
+    t.true(tokenizer instanceof FallbackTokenizer);
+});
+test('createTokenizerForProvider works without model ID', t => {
+    const tokenizer = createTokenizerForProvider('openai');
+    t.true(tokenizer instanceof OpenAITokenizer);
+});
+//# sourceMappingURL=tokenizer-factory.spec.js.map
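
The factory itself (tokenizer-factory.js, +90 lines) is not expanded in this diff, but the assertions above pin down its detection order: provider name first, then model-name keywords, then the fallback. A minimal sketch of that cascade in TypeScript; detectFamily is a hypothetical name for illustration, not an export of the package:

    // Hypothetical sketch of the cascade the tests above imply; the real
    // createTokenizer returns tokenizer instances rather than a family tag.
    type Family = 'openai' | 'anthropic' | 'llama' | 'fallback';

    function detectFamily(provider: string, modelId: string): Family {
        const p = provider.toLowerCase();
        // The tests show ':cloud' and '-cloud' suffixes are stripped before matching.
        const m = modelId.toLowerCase().replace(/(:cloud|-cloud)$/, '');
        if (p.includes('openai') || m.includes('gpt') || m.includes('openai')) {
            return 'openai';
        }
        if (p.includes('anthropic') || p.includes('claude') || m.includes('claude')) {
            return 'anthropic';
        }
        if (
            ['ollama', 'llama.cpp', 'local'].includes(p) ||
            /llama|mistral|mixtral|qwen|gemma|phi|deepseek/.test(m)
        ) {
            return 'llama';
        }
        return 'fallback';
    }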

package/dist/tokenization/tokenizer-factory.spec.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"tokenizer-factory.spec.js","sourceRoot":"","sources":["../../source/tokenization/tokenizer-factory.spec.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,IAAI,MAAM,KAAK,CAAC;AACvB,OAAO,EACN,eAAe,EACf,0BAA0B,GAC1B,MAAM,wBAAwB,CAAC;AAChC,OAAO,EAAC,eAAe,EAAC,MAAM,kCAAkC,CAAC;AACjE,OAAO,EAAC,kBAAkB,EAAC,MAAM,qCAAqC,CAAC;AACvE,OAAO,EAAC,cAAc,EAAC,MAAM,iCAAiC,CAAC;AAC/D,OAAO,EAAC,iBAAiB,EAAC,MAAM,oCAAoC,CAAC;AAErE,OAAO,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC;AAE3C,sDAAsD;AACtD,IAAI,CAAC,mDAAmD,EAAE,CAAC,CAAC,EAAE;IAC7D,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC;IAC5D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,eAAe,CAAC,CAAC;IAC7C,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,qBAAqB,CAAC,CAAC;AAClD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,oDAAoD,EAAE,CAAC,CAAC,EAAE;IAC9D,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;IAC3D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,eAAe,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,oEAAoE,EAAE,CAAC,CAAC,EAAE;IAC9E,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC;IAC5D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,eAAe,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEH,yDAAyD;AACzD,IAAI,CAAC,sDAAsD,EAAE,CAAC,CAAC,EAAE;IAChE,MAAM,SAAS,GAAG,eAAe,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAC/D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,kBAAkB,CAAC,CAAC;AACjD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,0EAA0E,EAAE,CAAC,CAAC,EAAE;IACpF,MAAM,SAAS,GAAG,eAAe,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC;IAC9D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,kBAAkB,CAAC,CAAC;AACjD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,0DAA0D,EAAE,CAAC,CAAC,EAAE;IACpE,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,eAAe,CAAC,CAAC;IAC7D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,kBAAkB,CAAC,CAAC;AACjD,CAAC,CAAC,CAAC;AAEH,qDAAqD;AACrD,IAAI,CAAC,qDAAqD,EAAE,CAAC,CAAC,EAAE;IAC/D,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAC1D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,uDAAuD,EAAE,CAAC,CAAC,EAAE;IACjE,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAC1D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,oDAAoD,EAAE,CAAC,CAAC,EAAE;IAC9D,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IACxD,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,qDAAqD,EAAE,CAAC,CAAC,EAAE;IAC/D,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IACxD,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,mDAAmD,EAAE,CAAC,CAAC,EAAE;IAC7D,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACrD,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yDAAyD,EAAE,CAAC,CAAC,EAAE;IACnE,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC;IAC5D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,wDAAwD,EAAE,CAAC,CAAC,EAAE;IAClE,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;IAC9D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,uDAAuD,EAAE,CAAC,CAAC,EAAE;IACjE,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC;IAC5D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,oDAAoD,EAAE,CAAC,CAAC,EAAE;IAC9D,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC;IAC5D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,uDAAuD,EAAE,CAAC,CAAC,EAAE;IACjE,MAAM,SAAS,GAAG,eAAe,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAC/D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,mDAAmD,EAAE,CAAC,CAAC,EAAE;IAC7D,MAAM,SAAS,GAAG,eAAe,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;IAC3D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,qCAAqC;AACrC,IAAI,CAAC,gEAAgE,EAAE,CAAC,CAAC,EAAE;IAC1E,MAAM,SAAS,GAAG,eAAe,CAAC,SAAS,EAAE,eAAe,CAAC,CAAC;IAC9D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,iBAAiB,CAAC,CAAC;AAChD,CAAC,CAAC,CAAC;AAEH,8BAA8B;AAC9B,IAAI,CAAC,sDAAsD,EAAE,CAAC,CAAC,EAAE;IAChE,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,eAAe,CAAC,CAAC;IAC7D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;IAC5C,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,eAAe,CAAC,CAAC;AAC5C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,sDAAsD,EAAE,CAAC,CAAC,EAAE;IAChE,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,eAAe,CAAC,CAAC;IAC7D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;IAC5C,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,eAAe,CAAC,CAAC;AAC5C,CAAC,CAAC,CAAC;AAEH,0BAA0B;AAC1B,IAAI,CAAC,4DAA4D,EAAE,CAAC,CAAC,EAAE;IACtE,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACrD,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,eAAe,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yDAAyD,EAAE,CAAC,CAAC,EAAE;IACnE,MAAM,SAAS,GAAG,eAAe,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;IAC3D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,eAAe,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEH,kCAAkC;AAClC,IAAI,CAAC,qDAAqD,EAAE,CAAC,CAAC,EAAE;IAC/D,MAAM,SAAS,GAAG,0BAA0B,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAChE,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,eAAe,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,wDAAwD,EAAE,CAAC,CAAC,EAAE;IAClE,MAAM,SAAS,GAAG,0BAA0B,CAAC,WAAW,EAAE,UAAU,CAAC,CAAC;IACtE,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,kBAAkB,CAAC,CAAC;AACjD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,oDAAoD,EAAE,CAAC,CAAC,EAAE;IAC9D,MAAM,SAAS,GAAG,0BAA0B,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;IACjE,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,cAAc,CAAC,CAAC;AAC7C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,sDAAsD,EAAE,CAAC,CAAC,EAAE;IAChE,MAAM,SAAS,GAAG,0BAA0B,CAAC,UAAU,CAAC,CAAC;IACzD,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,iBAAiB,CAAC,CAAC;AAChD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,4DAA4D,EAAE,CAAC,CAAC,EAAE;IACtE,MAAM,SAAS,GAAG,0BAA0B,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC9D,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,eAAe,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,+DAA+D,EAAE,CAAC,CAAC,EAAE;IACzE,MAAM,SAAS,GAAG,0BAA0B,CAAC,MAAM,CAAC,CAAC;IACrD,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,iBAAiB,CAAC,CAAC;AAChD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,mDAAmD,EAAE,CAAC,CAAC,EAAE;IAC7D,MAAM,SAAS,GAAG,0BAA0B,CAAC,QAAQ,CAAC,CAAC;IACvD,CAAC,CAAC,IAAI,CAAC,SAAS,YAAY,eAAe,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC"}

package/dist/tokenization/tokenizers/anthropic-tokenizer.d.ts
@@ -0,0 +1,17 @@
+/**
+ * Anthropic tokenizer for Claude models
+ * Uses @anthropic-ai/tokenizer package
+ */
+import type { Tokenizer } from '../../types/tokenization.js';
+import type { Message } from '../../types/core.js';
+/**
+ * Anthropic tokenizer for Claude models
+ */
+export declare class AnthropicTokenizer implements Tokenizer {
+    private modelName;
+    constructor(modelId?: string);
+    encode(text: string): number;
+    countTokens(message: Message): number;
+    getName(): string;
+}
+//# sourceMappingURL=anthropic-tokenizer.d.ts.map

package/dist/tokenization/tokenizers/anthropic-tokenizer.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"anthropic-tokenizer.d.ts","sourceRoot":"","sources":["../../../source/tokenization/tokenizers/anthropic-tokenizer.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,KAAK,EAAC,SAAS,EAAC,MAAM,0BAA0B,CAAC;AACxD,OAAO,KAAK,EAAC,OAAO,EAAC,MAAM,cAAc,CAAC;AAE1C;;GAEG;AACH,qBAAa,kBAAmB,YAAW,SAAS;IACnD,OAAO,CAAC,SAAS,CAAS;gBAEd,OAAO,CAAC,EAAE,MAAM;IAI5B,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAS5B,WAAW,CAAC,OAAO,EAAE,OAAO,GAAG,MAAM;IAWrC,OAAO,IAAI,MAAM;CAGjB"}

package/dist/tokenization/tokenizers/anthropic-tokenizer.js
@@ -0,0 +1,35 @@
+/**
+ * Anthropic tokenizer for Claude models
+ * Uses @anthropic-ai/tokenizer package
+ */
+import { countTokens as anthropicCountTokens } from '@anthropic-ai/tokenizer';
+/**
+ * Anthropic tokenizer for Claude models
+ */
+export class AnthropicTokenizer {
+    modelName;
+    constructor(modelId) {
+        this.modelName = modelId || 'claude-3';
+    }
+    encode(text) {
+        try {
+            return anthropicCountTokens(text);
+        }
+        catch {
+            // Fallback to character-based estimation if tokenization fails
+            return Math.ceil(text.length / 4);
+        }
+    }
+    countTokens(message) {
+        const content = message.content || '';
+        const role = message.role || '';
+        // Anthropic format includes role in the message structure
+        // Approximate overhead for message formatting
+        const messageOverhead = 3;
+        return this.encode(content) + this.encode(role) + messageOverhead;
+    }
+    getName() {
+        return `anthropic-${this.modelName}`;
+    }
+}
+//# sourceMappingURL=anthropic-tokenizer.js.map
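
A short usage sketch for the class above (illustrative, not from the package; the import path assumes a caller inside dist/tokenization/):

    import { AnthropicTokenizer } from './tokenizers/anthropic-tokenizer.js';

    const tokenizer = new AnthropicTokenizer('claude-3-opus');

    // Raw text goes through @anthropic-ai/tokenizer; if that throws, the
    // class falls back to ceil(text.length / 4).
    const textTokens = tokenizer.encode('Hello, world!');

    // Messages add role tokens plus the flat 3-token formatting overhead.
    const messageTokens = tokenizer.countTokens({
        role: 'user',
        content: 'Hello, how are you?',
    });

    console.log(tokenizer.getName(), textTokens, messageTokens);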

package/dist/tokenization/tokenizers/anthropic-tokenizer.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"anthropic-tokenizer.js","sourceRoot":"","sources":["../../../source/tokenization/tokenizers/anthropic-tokenizer.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAC,WAAW,IAAI,oBAAoB,EAAC,MAAM,yBAAyB,CAAC;AAI5E;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACtB,SAAS,CAAS;IAE1B,YAAY,OAAgB;QAC3B,IAAI,CAAC,SAAS,GAAG,OAAO,IAAI,UAAU,CAAC;IACxC,CAAC;IAED,MAAM,CAAC,IAAY;QAClB,IAAI,CAAC;YACJ,OAAO,oBAAoB,CAAC,IAAI,CAAC,CAAC;QACnC,CAAC;QAAC,MAAM,CAAC;YACR,+DAA+D;YAC/D,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACnC,CAAC;IACF,CAAC;IAED,WAAW,CAAC,OAAgB;QAC3B,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,EAAE,CAAC;QAEhC,0DAA0D;QAC1D,8CAA8C;QAC9C,MAAM,eAAe,GAAG,CAAC,CAAC;QAE1B,OAAO,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC;IACnE,CAAC;IAED,OAAO;QACN,OAAO,aAAa,IAAI,CAAC,SAAS,EAAE,CAAC;IACtC,CAAC;CACD"}

package/dist/tokenization/tokenizers/anthropic-tokenizer.spec.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"anthropic-tokenizer.spec.d.ts","sourceRoot":"","sources":["../../../source/tokenization/tokenizers/anthropic-tokenizer.spec.ts"],"names":[],"mappings":"AAAA;;GAEG"}

package/dist/tokenization/tokenizers/anthropic-tokenizer.spec.js
@@ -0,0 +1,152 @@
+/**
+ * Tests for anthropic-tokenizer.ts
+ */
+import test from 'ava';
+import { AnthropicTokenizer } from './anthropic-tokenizer.js';
+console.log(`\nantrhopic-tokenizer.spec.ts`);
+test('AnthropicTokenizer encodes simple text', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const count = tokenizer.encode('Hello, world!');
+    // Should return a positive token count
+    t.true(count > 0);
+    t.true(count < 10);
+});
+test('AnthropicTokenizer encodes empty string', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const count = tokenizer.encode('');
+    t.is(count, 0);
+});
+test('AnthropicTokenizer encodes longer text', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const text = 'This is a longer piece of text that should have more tokens than a simple hello world.';
+    const count = tokenizer.encode(text);
+    // Should have significantly more tokens
+    t.true(count > 10);
+    t.true(count < 50);
+});
+test('AnthropicTokenizer defaults to claude-3 when no model specified', t => {
+    const tokenizer = new AnthropicTokenizer();
+    t.is(tokenizer.getName(), 'anthropic-claude-3');
+});
+test('AnthropicTokenizer getName returns correct format', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-sonnet');
+    t.is(tokenizer.getName(), 'anthropic-claude-3-sonnet');
+});
+test('AnthropicTokenizer countTokens for user message', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const message = {
+        role: 'user',
+        content: 'Hello, how are you?',
+    };
+    const count = tokenizer.countTokens(message);
+    // Should include content tokens + role tokens + overhead
+    t.true(count > 5);
+    t.true(count < 20);
+});
+test('AnthropicTokenizer countTokens for assistant message', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const message = {
+        role: 'assistant',
+        content: 'I am doing well, thank you!',
+    };
+    const count = tokenizer.countTokens(message);
+    t.true(count > 5);
+});
+test('AnthropicTokenizer countTokens for system message', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const message = {
+        role: 'system',
+        content: 'You are a helpful assistant.',
+    };
+    const count = tokenizer.countTokens(message);
+    t.true(count > 5);
+});
+test('AnthropicTokenizer countTokens handles empty content', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const message = {
+        role: 'user',
+        content: '',
+    };
+    const count = tokenizer.countTokens(message);
+    // Should still have overhead for role and message structure
+    t.true(count >= 3);
+});
+test('AnthropicTokenizer countTokens handles missing content', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const message = {
+        role: 'user',
+    };
+    const count = tokenizer.countTokens(message);
+    // Should handle gracefully
+    t.true(count >= 0);
+});
+test('AnthropicTokenizer countTokens includes message overhead', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const shortMessage = {
+        role: 'user',
+        content: 'Hi',
+    };
+    const count = tokenizer.countTokens(shortMessage);
+    const contentOnly = tokenizer.encode('Hi');
+    const roleOnly = tokenizer.encode('user');
+    // Total should be more than just content + role due to overhead
+    t.true(count > contentOnly + roleOnly);
+});
+test('AnthropicTokenizer handles special characters', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const text = '你好世界 🌍 Привет мир';
+    const count = tokenizer.encode(text);
+    t.true(count > 0);
+});
+test('AnthropicTokenizer handles code snippets', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const code = `
+function hello() {
+console.log("Hello, world!");
+}
+`;
+    const count = tokenizer.encode(code);
+    t.true(count > 10);
+});
+test('AnthropicTokenizer works with claude-3-sonnet model', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-sonnet');
+    const count = tokenizer.encode('Hello, world!');
+    t.true(count > 0);
+    t.is(tokenizer.getName(), 'anthropic-claude-3-sonnet');
+});
+test('AnthropicTokenizer works with claude-3-haiku model', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-haiku');
+    const count = tokenizer.encode('Hello, world!');
+    t.true(count > 0);
+    t.is(tokenizer.getName(), 'anthropic-claude-3-haiku');
+});
+test('AnthropicTokenizer handles long messages', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const longText = 'Hello '.repeat(1000);
+    const message = {
+        role: 'user',
+        content: longText,
+    };
+    const count = tokenizer.countTokens(message);
+    // Should handle long text without crashing
+    t.true(count > 1000);
+});
+test('AnthropicTokenizer uses fallback on encoding error', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    // The fallback should kick in for any edge cases
+    // Testing with normal text should still work
+    const count = tokenizer.encode('Normal text');
+    t.true(count > 0);
+});
+test('AnthropicTokenizer countTokens with tool message', t => {
+    const tokenizer = new AnthropicTokenizer('claude-3-opus');
+    const message = {
+        role: 'tool',
+        content: 'Tool result here',
+        tool_call_id: '123',
+    };
+    const count = tokenizer.countTokens(message);
+    // Should handle tool messages
+    t.true(count > 0);
+});
+//# sourceMappingURL=anthropic-tokenizer.spec.js.map

package/dist/tokenization/tokenizers/anthropic-tokenizer.spec.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"anthropic-tokenizer.spec.js","sourceRoot":"","sources":["../../../source/tokenization/tokenizers/anthropic-tokenizer.spec.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,IAAI,MAAM,KAAK,CAAC;AACvB,OAAO,EAAC,kBAAkB,EAAC,MAAM,0BAA0B,CAAC;AAG5D,OAAO,CAAC,GAAG,CAAC,+BAA+B,CAAC,CAAC;AAE7C,IAAI,CAAC,wCAAwC,EAAE,CAAC,CAAC,EAAE;IAClD,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC;IAEhD,uCAAuC;IACvC,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC;AACpB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yCAAyC,EAAE,CAAC,CAAC,EAAE;IACnD,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IAEnC,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,wCAAwC,EAAE,CAAC,CAAC,EAAE;IAClD,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,IAAI,GACT,wFAAwF,CAAC;IAC1F,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAErC,wCAAwC;IACxC,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC;IACnB,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC;AACpB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,iEAAiE,EAAE,CAAC,CAAC,EAAE;IAC3E,MAAM,SAAS,GAAG,IAAI,kBAAkB,EAAE,CAAC;IAE3C,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,oBAAoB,CAAC,CAAC;AACjD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,mDAAmD,EAAE,CAAC,CAAC,EAAE;IAC7D,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,iBAAiB,CAAC,CAAC;IAE5D,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,2BAA2B,CAAC,CAAC;AACxD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,iDAAiD,EAAE,CAAC,CAAC,EAAE;IAC3D,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,qBAAqB;KAC9B,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,yDAAyD;IACzD,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC;AACpB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,sDAAsD,EAAE,CAAC,CAAC,EAAE;IAChE,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,WAAW;QACjB,OAAO,EAAE,6BAA6B;KACtC,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;AACnB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,mDAAmD,EAAE,CAAC,CAAC,EAAE;IAC7D,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,QAAQ;QACd,OAAO,EAAE,8BAA8B;KACvC,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;AACnB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,sDAAsD,EAAE,CAAC,CAAC,EAAE;IAChE,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,EAAE;KACX,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,4DAA4D;IAC5D,CAAC,CAAC,IAAI,CAAC,KAAK,IAAI,CAAC,CAAC,CAAC;AACpB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,wDAAwD,EAAE,CAAC,CAAC,EAAE;IAClE,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;KACD,CAAC;IAEb,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,2BAA2B;IAC3B,CAAC,CAAC,IAAI,CAAC,KAAK,IAAI,CAAC,CAAC,CAAC;AACpB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,0DAA0D,EAAE,CAAC,CAAC,EAAE;IACpE,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,YAAY,GAAY;QAC7B,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,IAAI;KACb,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,YAAY,CAAC,CAAC;IAClD,MAAM,WAAW,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAC3C,MAAM,QAAQ,GAAG,SAAS,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAE1C,gEAAgE;IAChE,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,WAAW,GAAG,QAAQ,CAAC,CAAC;AACxC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,+CAA+C,EAAE,CAAC,CAAC,EAAE;IACzD,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,IAAI,GAAG,oBAAoB,CAAC;IAClC,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAErC,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;AACnB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,0CAA0C,EAAE,CAAC,CAAC,EAAE;IACpD,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,IAAI,GAAG;;;;EAIZ,CAAC;IACF,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAErC,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC;AACpB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,qDAAqD,EAAE,CAAC,CAAC,EAAE;IAC/D,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,iBAAiB,CAAC,CAAC;IAC5D,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC;IAEhD,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,2BAA2B,CAAC,CAAC;AACxD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,oDAAoD,EAAE,CAAC,CAAC,EAAE;IAC9D,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,gBAAgB,CAAC,CAAC;IAC3D,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC;IAEhD,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,0BAA0B,CAAC,CAAC;AACvD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,0CAA0C,EAAE,CAAC,CAAC,EAAE;IACpD,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACvC,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,QAAQ;KACjB,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,2CAA2C;IAC3C,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC;AACtB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,oDAAoD,EAAE,CAAC,CAAC,EAAE;IAC9D,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAE1D,iDAAiD;IACjD,6CAA6C;IAC7C,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC;IAE9C,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;AACnB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,kDAAkD,EAAE,CAAC,CAAC,EAAE;IAC5D,MAAM,SAAS,GAAG,IAAI,kBAAkB,CAAC,eAAe,CAAC,CAAC;IAC1D,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,kBAAkB;QAC3B,YAAY,EAAE,KAAK;KACnB,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,8BAA8B;IAC9B,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;AACnB,CAAC,CAAC,CAAC"}

package/dist/tokenization/tokenizers/fallback-tokenizer.d.ts
@@ -0,0 +1,13 @@
+import type { Tokenizer } from '../../types/tokenization.js';
+import type { Message } from '../../types/core.js';
+/**
+ * Fallback tokenizer for unsupported models
+ * Uses a simple character-based estimation (4 chars per token)
+ */
+export declare class FallbackTokenizer implements Tokenizer {
+    private readonly CHARS_PER_TOKEN;
+    encode(text: string): number;
+    countTokens(message: Message): number;
+    getName(): string;
+}
+//# sourceMappingURL=fallback-tokenizer.d.ts.map

package/dist/tokenization/tokenizers/fallback-tokenizer.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"fallback-tokenizer.d.ts","sourceRoot":"","sources":["../../../source/tokenization/tokenizers/fallback-tokenizer.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAC,SAAS,EAAC,MAAM,0BAA0B,CAAC;AACxD,OAAO,KAAK,EAAC,OAAO,EAAC,MAAM,cAAc,CAAC;AAE1C;;;GAGG;AACH,qBAAa,iBAAkB,YAAW,SAAS;IAClD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAK;IAErC,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAI5B,WAAW,CAAC,OAAO,EAAE,OAAO,GAAG,MAAM;IAQrC,OAAO,IAAI,MAAM;CAGjB"}

package/dist/tokenization/tokenizers/fallback-tokenizer.js
@@ -0,0 +1,20 @@
+/**
+ * Fallback tokenizer for unsupported models
+ * Uses a simple character-based estimation (4 chars per token)
+ */
+export class FallbackTokenizer {
+    CHARS_PER_TOKEN = 4;
+    encode(text) {
+        return Math.ceil(text.length / this.CHARS_PER_TOKEN);
+    }
+    countTokens(message) {
+        const content = message.content || '';
+        const role = message.role || '';
+        // Count tokens for content + a small overhead for role and formatting
+        return this.encode(content) + Math.ceil(role.length / this.CHARS_PER_TOKEN);
+    }
+    getName() {
+        return 'fallback';
+    }
+}
+//# sourceMappingURL=fallback-tokenizer.js.map
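
Because the estimate is pure arithmetic, expected counts can be computed by hand, which is exactly what the spec below does. A worked example (illustrative, using values from those tests):

    import { FallbackTokenizer } from './tokenizers/fallback-tokenizer.js';

    const tokenizer = new FallbackTokenizer();

    // 'Hello, how are you?' is 19 chars -> ceil(19 / 4) = 5 tokens;
    // the role 'user' is 4 chars -> ceil(4 / 4) = 1 token; total 6.
    const count = tokenizer.countTokens({
        role: 'user',
        content: 'Hello, how are you?',
    });
    console.log(count); // 6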

package/dist/tokenization/tokenizers/fallback-tokenizer.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"fallback-tokenizer.js","sourceRoot":"","sources":["../../../source/tokenization/tokenizers/fallback-tokenizer.ts"],"names":[],"mappings":"AAGA;;;GAGG;AACH,MAAM,OAAO,iBAAiB;IACZ,eAAe,GAAG,CAAC,CAAC;IAErC,MAAM,CAAC,IAAY;QAClB,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,eAAe,CAAC,CAAC;IACtD,CAAC;IAED,WAAW,CAAC,OAAgB;QAC3B,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,EAAE,CAAC;QAEhC,sEAAsE;QACtE,OAAO,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,eAAe,CAAC,CAAC;IAC7E,CAAC;IAED,OAAO;QACN,OAAO,UAAU,CAAC;IACnB,CAAC;CACD"}

package/dist/tokenization/tokenizers/fallback-tokenizer.spec.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"fallback-tokenizer.spec.d.ts","sourceRoot":"","sources":["../../../source/tokenization/tokenizers/fallback-tokenizer.spec.ts"],"names":[],"mappings":"AAAA;;GAEG"}

package/dist/tokenization/tokenizers/fallback-tokenizer.spec.js
@@ -0,0 +1,183 @@
+/**
+ * Tests for fallback-tokenizer.ts
+ */
+import test from 'ava';
+import { FallbackTokenizer } from './fallback-tokenizer.js';
+console.log(`\nfallback-tokenizer.spec.ts`);
+test('FallbackTokenizer encodes simple text', t => {
+    const tokenizer = new FallbackTokenizer();
+    const count = tokenizer.encode('Hello, world!');
+    // "Hello, world!" is 13 characters, so ~4 tokens (13/4 = 3.25, rounded up to 4)
+    t.is(count, 4);
+});
+test('FallbackTokenizer encodes empty string', t => {
+    const tokenizer = new FallbackTokenizer();
+    const count = tokenizer.encode('');
+    t.is(count, 0);
+});
+test('FallbackTokenizer encodes longer text', t => {
+    const tokenizer = new FallbackTokenizer();
+    const text = 'This is a longer piece of text that should have more tokens than a simple hello world.';
+    const count = tokenizer.encode(text);
+    // 88 characters / 4 = 22 tokens
+    t.is(count, 22);
+});
+test('FallbackTokenizer uses 4 chars per token ratio', t => {
+    const tokenizer = new FallbackTokenizer();
+    // Test exact multiples of 4
+    t.is(tokenizer.encode('1234'), 1);
+    t.is(tokenizer.encode('12345678'), 2);
+    t.is(tokenizer.encode('123456789012'), 3);
+});
+test('FallbackTokenizer rounds up partial tokens', t => {
+    const tokenizer = new FallbackTokenizer();
+    // 5 characters should round up to 2 tokens
+    t.is(tokenizer.encode('12345'), 2);
+    // 9 characters should round up to 3 tokens
+    t.is(tokenizer.encode('123456789'), 3);
+});
+test('FallbackTokenizer getName returns correct name', t => {
+    const tokenizer = new FallbackTokenizer();
+    t.is(tokenizer.getName(), 'fallback');
+});
+test('FallbackTokenizer countTokens for user message', t => {
+    const tokenizer = new FallbackTokenizer();
+    const message = {
+        role: 'user',
+        content: 'Hello, how are you?',
+    };
+    const count = tokenizer.countTokens(message);
+    // "Hello, how are you?" is 19 chars = 5 tokens
+    // "user" is 4 chars = 1 token
+    // Total = 6 tokens
+    t.is(count, 6);
+});
+test('FallbackTokenizer countTokens for assistant message', t => {
+    const tokenizer = new FallbackTokenizer();
+    const message = {
+        role: 'assistant',
+        content: 'I am doing well, thank you!',
+    };
+    const count = tokenizer.countTokens(message);
+    // "I am doing well, thank you!" is 27 chars = 7 tokens
+    // "assistant" is 9 chars = 3 tokens
+    // Total = 10 tokens
+    t.is(count, 10);
+});
+test('FallbackTokenizer countTokens for system message', t => {
+    const tokenizer = new FallbackTokenizer();
+    const message = {
+        role: 'system',
+        content: 'You are a helpful assistant.',
+    };
+    const count = tokenizer.countTokens(message);
+    // "You are a helpful assistant." is 28 chars = 7 tokens
+    // "system" is 6 chars = 2 tokens
+    // Total = 9 tokens
+    t.is(count, 9);
+});
+test('FallbackTokenizer countTokens handles empty content', t => {
+    const tokenizer = new FallbackTokenizer();
+    const message = {
+        role: 'user',
+        content: '',
+    };
+    const count = tokenizer.countTokens(message);
+    // Empty content = 0 tokens
+    // "user" is 4 chars = 1 token
+    // Total = 1 token
+    t.is(count, 1);
+});
+test('FallbackTokenizer countTokens handles missing content', t => {
+    const tokenizer = new FallbackTokenizer();
+    const message = {
+        role: 'user',
+    };
+    const count = tokenizer.countTokens(message);
+    // Missing content treated as empty string = 0 tokens
+    // "user" is 4 chars = 1 token
+    // Total = 1 token
+    t.is(count, 1);
+});
+test('FallbackTokenizer handles special characters', t => {
+    const tokenizer = new FallbackTokenizer();
+    const text = '你好世界 🌍 Привет мир';
+    const count = tokenizer.encode(text);
+    // Should handle all characters equally (character count / 4)
+    t.true(count > 0);
+    t.is(count, Math.ceil(text.length / 4));
+});
+test('FallbackTokenizer handles code snippets', t => {
+    const tokenizer = new FallbackTokenizer();
+    const code = `
+function hello() {
+console.log("Hello, world!");
+}
+`;
+    const count = tokenizer.encode(code);
+    t.true(count > 0);
+    t.is(count, Math.ceil(code.length / 4));
+});
+test('FallbackTokenizer handles long messages', t => {
+    const tokenizer = new FallbackTokenizer();
+    const longText = 'Hello '.repeat(1000);
+    const message = {
+        role: 'user',
+        content: longText,
+    };
+    const count = tokenizer.countTokens(message);
+    // Should handle long text without crashing
+    t.true(count > 1000);
+    const expectedContentTokens = Math.ceil(longText.length / 4);
+    const expectedRoleTokens = Math.ceil('user'.length / 4);
+    t.is(count, expectedContentTokens + expectedRoleTokens);
+});
+test('FallbackTokenizer countTokens with tool message', t => {
+    const tokenizer = new FallbackTokenizer();
+    const message = {
+        role: 'tool',
+        content: 'Tool result here',
+        tool_call_id: '123',
+    };
+    const count = tokenizer.countTokens(message);
+    // Should handle tool messages
+    // "Tool result here" is 16 chars = 4 tokens
+    // "tool" is 4 chars = 1 token
+    // Total = 5 tokens
+    t.is(count, 5);
+});
+test('FallbackTokenizer handles single character', t => {
+    const tokenizer = new FallbackTokenizer();
+    const count = tokenizer.encode('a');
+    // 1 character should round up to 1 token
+    t.is(count, 1);
+});
+test('FallbackTokenizer handles whitespace', t => {
+    const tokenizer = new FallbackTokenizer();
+    const count = tokenizer.encode('    ');
+    // 4 spaces = 1 token
+    t.is(count, 1);
+});
+test('FallbackTokenizer handles newlines', t => {
+    const tokenizer = new FallbackTokenizer();
+    const count = tokenizer.encode('\n\n\n\n');
+    // 4 newlines = 1 token
+    t.is(count, 1);
+});
+test('FallbackTokenizer is consistent', t => {
+    const tokenizer = new FallbackTokenizer();
+    const text = 'The quick brown fox jumps over the lazy dog';
+    const count1 = tokenizer.encode(text);
+    const count2 = tokenizer.encode(text);
+    t.is(count1, count2);
+});
+test('FallbackTokenizer handles messages with missing role', t => {
+    const tokenizer = new FallbackTokenizer();
+    const message = {
+        content: 'Hello world',
+    };
+    const count = tokenizer.countTokens(message);
+    // Should handle gracefully, role treated as empty string
+    t.true(count > 0);
+});
+//# sourceMappingURL=fallback-tokenizer.spec.js.map

package/dist/tokenization/tokenizers/fallback-tokenizer.spec.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"fallback-tokenizer.spec.js","sourceRoot":"","sources":["../../../source/tokenization/tokenizers/fallback-tokenizer.spec.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,IAAI,MAAM,KAAK,CAAC;AACvB,OAAO,EAAC,iBAAiB,EAAC,MAAM,yBAAyB,CAAC;AAG1D,OAAO,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC;AAE5C,IAAI,CAAC,uCAAuC,EAAE,CAAC,CAAC,EAAE;IACjD,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC;IAEhD,gFAAgF;IAChF,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,wCAAwC,EAAE,CAAC,CAAC,EAAE;IAClD,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;IAEnC,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,uCAAuC,EAAE,CAAC,CAAC,EAAE;IACjD,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,IAAI,GACT,wFAAwF,CAAC;IAC1F,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAErC,gCAAgC;IAChC,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;AACjB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,gDAAgD,EAAE,CAAC,CAAC,EAAE;IAC1D,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAE1C,4BAA4B;IAC5B,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAClC,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC;IACtC,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,MAAM,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC,CAAC;AAC3C,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,4CAA4C,EAAE,CAAC,CAAC,EAAE;IACtD,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAE1C,2CAA2C;IAC3C,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC;IAEnC,2CAA2C;IAC3C,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,CAAC;AACxC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,gDAAgD,EAAE,CAAC,CAAC,EAAE;IAC1D,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAE1C,CAAC,CAAC,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,UAAU,CAAC,CAAC;AACvC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,gDAAgD,EAAE,CAAC,CAAC,EAAE;IAC1D,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,qBAAqB;KAC9B,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,+CAA+C;IAC/C,8BAA8B;IAC9B,mBAAmB;IACnB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,qDAAqD,EAAE,CAAC,CAAC,EAAE;IAC/D,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,WAAW;QACjB,OAAO,EAAE,6BAA6B;KACtC,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,uDAAuD;IACvD,oCAAoC;IACpC,oBAAoB;IACpB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;AACjB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,kDAAkD,EAAE,CAAC,CAAC,EAAE;IAC5D,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,QAAQ;QACd,OAAO,EAAE,8BAA8B;KACvC,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,wDAAwD;IACxD,iCAAiC;IACjC,mBAAmB;IACnB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,qDAAqD,EAAE,CAAC,CAAC,EAAE;IAC/D,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,EAAE;KACX,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,2BAA2B;IAC3B,8BAA8B;IAC9B,kBAAkB;IAClB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,uDAAuD,EAAE,CAAC,CAAC,EAAE;IACjE,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;KACD,CAAC;IAEb,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,qDAAqD;IACrD,8BAA8B;IAC9B,kBAAkB;IAClB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,8CAA8C,EAAE,CAAC,CAAC,EAAE;IACxD,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,IAAI,GAAG,oBAAoB,CAAC;IAClC,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAErC,6DAA6D;IAC7D,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;AACzC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yCAAyC,EAAE,CAAC,CAAC,EAAE;IACnD,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,IAAI,GAAG;;;;EAIZ,CAAC;IACF,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAErC,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;IAClB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;AACzC,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,yCAAyC,EAAE,CAAC,CAAC,EAAE;IACnD,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACvC,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,QAAQ;KACjB,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,2CAA2C;IAC3C,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,CAAC;IACrB,MAAM,qBAAqB,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC7D,MAAM,kBAAkB,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACxD,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,qBAAqB,GAAG,kBAAkB,CAAC,CAAC;AACzD,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,iDAAiD,EAAE,CAAC,CAAC,EAAE;IAC3D,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,OAAO,GAAY;QACxB,IAAI,EAAE,MAAM;QACZ,OAAO,EAAE,kBAAkB;QAC3B,YAAY,EAAE,KAAK;KACnB,CAAC;IAEF,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,8BAA8B;IAC9B,4CAA4C;IAC5C,8BAA8B;IAC9B,mBAAmB;IACnB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,4CAA4C,EAAE,CAAC,CAAC,EAAE;IACtD,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;IAEpC,yCAAyC;IACzC,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,sCAAsC,EAAE,CAAC,CAAC,EAAE;IAChD,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAEvC,qBAAqB;IACrB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,oCAAoC,EAAE,CAAC,CAAC,EAAE;IAC9C,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;IAE3C,uBAAuB;IACvB,CAAC,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;AAChB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,iCAAiC,EAAE,CAAC,CAAC,EAAE;IAC3C,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,IAAI,GAAG,6CAA6C,CAAC;IAE3D,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACtC,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAEtC,CAAC,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;AACtB,CAAC,CAAC,CAAC;AAEH,IAAI,CAAC,sDAAsD,EAAE,CAAC,CAAC,EAAE;IAChE,MAAM,SAAS,GAAG,IAAI,iBAAiB,EAAE,CAAC;IAC1C,MAAM,OAAO,GAAY;QACxB,OAAO,EAAE,aAAa;KACX,CAAC;IAEb,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAE7C,yDAAyD;IACzD,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;AACnB,CAAC,CAAC,CAAC"}

package/dist/tokenization/tokenizers/llama-tokenizer.d.ts
@@ -0,0 +1,14 @@
+/**
+ * Llama tokenizer for local models
+ * Uses llama-tokenizer-js package
+ */
+import type { Tokenizer } from '../../types/tokenization.js';
+import type { Message } from '../../types/core.js';
+export declare class LlamaTokenizer implements Tokenizer {
+    private modelName;
+    constructor(modelId?: string);
+    encode(text: string): number;
+    countTokens(message: Message): number;
+    getName(): string;
+}
+//# sourceMappingURL=llama-tokenizer.d.ts.map
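
All four tokenizer classes in this release implement the Tokenizer contract from package/dist/types/tokenization.d.ts (+31 lines, not expanded in this diff). Judging from the declaration files above, the contract is at least the following (a reconstruction; the actual file may declare more):

    // Reconstructed from the declarations above; the real Message type
    // lives in types/core.js and is reduced here to the fields the tokenizers read.
    type Message = {role?: string; content?: string};

    interface Tokenizer {
        // Token count for raw text.
        encode(text: string): number;
        // Token count for a chat message: content plus role, plus any per-message overhead.
        countTokens(message: Message): number;
        // Stable identifier such as 'anthropic-claude-3' or 'fallback'.
        getName(): string;
    }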