headroom_ai-0.2.13-py3-none-any.whl
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/tokenizer.py
ADDED
@@ -0,0 +1,80 @@
"""Token counting wrapper for Headroom SDK.

This module provides a unified interface for token counting that
delegates to provider-specific implementations.
"""

from __future__ import annotations

from typing import Any

from .providers.base import TokenCounter


class Tokenizer:
    """
    Token counting wrapper with model awareness.

    This class wraps a provider-specific TokenCounter to provide
    a consistent interface throughout the Headroom SDK.
    """

    def __init__(self, token_counter: TokenCounter, model: str = ""):
        """
        Initialize tokenizer with a provider's token counter.

        Args:
            token_counter: Provider-specific token counter.
            model: Model name (for reference only).
        """
        self._counter = token_counter
        self.model = model

    def count_text(self, text: str) -> int:
        """Count tokens in text."""
        return self._counter.count_text(text)

    def count_message(self, message: dict[str, Any]) -> int:
        """Count tokens in a message."""
        return self._counter.count_message(message)

    def count_messages(self, messages: list[dict[str, Any]]) -> int:
        """Count tokens in a list of messages."""
        return self._counter.count_messages(messages)

    @property
    def available(self) -> bool:
        """Whether token counting is available."""
        return self._counter is not None


# Convenience functions that require a token counter
def count_tokens_text(text: str, token_counter: TokenCounter) -> int:
    """
    Count tokens in a text string.

    Args:
        text: The text to count tokens for.
        token_counter: Provider-specific token counter.

    Returns:
        Token count.
    """
    return token_counter.count_text(text)


def count_tokens_messages(
    messages: list[dict[str, Any]],
    token_counter: TokenCounter,
) -> int:
    """
    Count total tokens for a list of messages.

    Args:
        messages: List of message dicts.
        token_counter: Provider-specific token counter.

    Returns:
        Total token count.
    """
    return token_counter.count_messages(messages)
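A minimal usage sketch (not part of the diff itself): wiring an EstimatingTokenCounter from headroom.tokenizers, shown further below, into Tokenizer. It assumes the wheel is installed as `headroom` and that EstimatingTokenCounter structurally satisfies the TokenCounter protocol via duck typing.

# Usage sketch (assumptions: package installed; EstimatingTokenCounter
# duck-types as a TokenCounter, since it provides count_text/count_messages).
from headroom.tokenizer import Tokenizer
from headroom.tokenizers import EstimatingTokenCounter

tok = Tokenizer(EstimatingTokenCounter(), model="gpt-4o")
print(tok.count_text("Hello, world!"))   # token count for raw text
print(tok.count_messages([               # whole-conversation count
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize this document."},
]))
print(tok.available)                     # True: a counter is attached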
headroom/tokenizers/__init__.py
ADDED
@@ -0,0 +1,75 @@
"""Pluggable tokenizer system for universal LLM support.

This module provides a registry-based tokenizer system that supports
multiple backends:

1. tiktoken - OpenAI models (GPT-3.5, GPT-4, GPT-4o)
2. HuggingFace - Open models (Llama, Mistral, Falcon, etc.)
3. Anthropic - Claude models (via SDK or estimation)
4. Estimation - Fallback for unknown models

Usage:
    from headroom.tokenizers import TokenizerRegistry, get_tokenizer

    # Auto-detect tokenizer from model name
    tokenizer = get_tokenizer("gpt-4o")
    tokens = tokenizer.count_text("Hello, world!")

    # Get tokenizer for specific backend
    tokenizer = get_tokenizer("llama-3-8b", backend="huggingface")

    # Register custom tokenizer
    TokenizerRegistry.register("my-model", my_tokenizer)
"""

from .base import BaseTokenizer, TokenCounter
from .estimator import CharacterCounter, EstimatingTokenCounter
from .registry import (
    TokenizerRegistry,
    get_tokenizer,
    list_supported_models,
    register_tokenizer,
)
from .tiktoken_counter import TiktokenCounter


# Lazy imports for optional dependencies
def get_huggingface_tokenizer():
    """Get HuggingFaceTokenizer class (requires transformers)."""
    from .huggingface import HuggingFaceTokenizer

    return HuggingFaceTokenizer


def get_mistral_tokenizer():
    """Get MistralTokenizer class (requires mistral-common)."""
    from .mistral import MistralTokenizer

    return MistralTokenizer


def is_mistral_tokenizer_available() -> bool:
    """Check if Mistral tokenizer is available."""
    from .mistral import is_mistral_available

    return is_mistral_available()


__all__ = [
    # Registry
    "TokenizerRegistry",
    "get_tokenizer",
    "register_tokenizer",
    "list_supported_models",
    # Base classes
    "TokenCounter",
    "BaseTokenizer",
    # Implementations
    "TiktokenCounter",
    "EstimatingTokenCounter",
    "CharacterCounter",
    # Lazy loaders
    "get_huggingface_tokenizer",
    "get_mistral_tokenizer",
    "is_mistral_tokenizer_available",
]
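The lazy loaders above keep the optional backends (transformers, mistral-common) out of package import time. A hedged sketch of how that guard might be used:

# Sketch: optional backends are imported only on demand, so the package
# imports cleanly even when transformers / mistral-common are absent.
from headroom.tokenizers import (
    get_mistral_tokenizer,
    get_tokenizer,
    is_mistral_tokenizer_available,
)

if is_mistral_tokenizer_available():
    MistralTokenizer = get_mistral_tokenizer()  # import happens here
else:
    MistralTokenizer = None

# Auto-detection is the simplest path; per the docstring, unknown models
# fall back to estimation.
tokenizer = get_tokenizer("gpt-4o")
print(tokenizer.count_text("Hello, world!"))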
headroom/tokenizers/base.py
ADDED
@@ -0,0 +1,210 @@
"""Base classes for tokenizer implementations.

Defines the TokenCounter protocol and BaseTokenizer class that all
tokenizer backends must implement.
"""

from __future__ import annotations

import json
from abc import ABC, abstractmethod
from typing import Any, Protocol, runtime_checkable


@runtime_checkable
class TokenCounter(Protocol):
    """Protocol for token counting implementations.

    Any class implementing this protocol can be used with Headroom
    for token counting. This allows integration with various
    tokenizer backends (tiktoken, HuggingFace, custom, etc.).
    """

    def count_text(self, text: str) -> int:
        """Count tokens in a text string.

        Args:
            text: The text to count tokens for.

        Returns:
            Number of tokens in the text.
        """
        ...

    def count_messages(self, messages: list[dict[str, Any]]) -> int:
        """Count tokens in a list of chat messages.

        Args:
            messages: List of message dicts with 'role' and 'content'.

        Returns:
            Total token count including message overhead.
        """
        ...


class BaseTokenizer(ABC):
    """Abstract base class for tokenizer implementations.

    Provides common functionality for counting messages while
    requiring subclasses to implement text tokenization.
    """

    # Token overhead per message (role, formatting, etc.)
    # Override in subclasses for model-specific overhead
    MESSAGE_OVERHEAD = 4
    REPLY_OVERHEAD = 3  # Assistant reply start tokens

    @abstractmethod
    def count_text(self, text: str) -> int:
        """Count tokens in a text string. Must be implemented by subclasses."""
        pass

    def count_message(self, message: dict[str, Any]) -> int:
        """Count tokens in a single message.

        Args:
            message: A message dict with 'role' and 'content'.

        Returns:
            Token count for this message.
        """
        return self.count_messages([message]) - self.REPLY_OVERHEAD

    def count_messages(self, messages: list[dict[str, Any]]) -> int:
        """Count tokens in a list of chat messages.

        Uses OpenAI-style message counting as the baseline, which
        works well for most models.

        Args:
            messages: List of message dicts.

        Returns:
            Total token count.
        """
        total = 0

        for message in messages:
            # Base message overhead
            total += self.MESSAGE_OVERHEAD

            # Count role
            role = message.get("role", "")
            total += self.count_text(role)

            # Count content
            content = message.get("content")
            if content is not None:
                if isinstance(content, str):
                    total += self.count_text(content)
                elif isinstance(content, list):
                    # Multi-part content (images, tool results, etc.)
                    total += self._count_content_parts(content)

            # Count tool calls
            tool_calls = message.get("tool_calls")
            if tool_calls:
                total += self._count_tool_calls(tool_calls)

            # Count function call (legacy)
            function_call = message.get("function_call")
            if function_call:
                total += self._count_function_call(function_call)

            # Count name field
            name = message.get("name")
            if name:
                total += self.count_text(name)
                total += 1  # Name field overhead

        # Reply start overhead
        total += self.REPLY_OVERHEAD

        return total

    def _count_content_parts(self, parts: list[Any]) -> int:
        """Count tokens in multi-part content."""
        total = 0
        for part in parts:
            if isinstance(part, dict):
                part_type = part.get("type", "")

                if part_type == "text":
                    total += self.count_text(part.get("text", ""))
                elif part_type == "image_url":
                    # Images have fixed token cost (varies by model)
                    total += 85  # Base image token count
                elif part_type == "tool_result":
                    content = part.get("content", "")
                    if isinstance(content, str):
                        total += self.count_text(content)
                    else:
                        total += self.count_text(json.dumps(content))
                elif part_type == "tool_use":
                    total += self.count_text(part.get("name", ""))
                    total += self.count_text(json.dumps(part.get("input", {})))
                else:
                    # Unknown type - estimate from JSON
                    total += self.count_text(json.dumps(part))
            elif isinstance(part, str):
                total += self.count_text(part)

        return total

    def _count_tool_calls(self, tool_calls: list[dict[str, Any]]) -> int:
        """Count tokens in tool calls."""
        total = 0
        for call in tool_calls:
            total += 4  # Tool call overhead

            if "function" in call:
                func = call["function"]
                total += self.count_text(func.get("name", ""))
                total += self.count_text(func.get("arguments", ""))

            if "id" in call:
                total += self.count_text(call["id"])

        return total

    def _count_function_call(self, function_call: dict[str, Any]) -> int:
        """Count tokens in legacy function call."""
        total = 4  # Function call overhead
        total += self.count_text(function_call.get("name", ""))
        total += self.count_text(function_call.get("arguments", ""))
        return total

    def encode(self, text: str) -> list[int]:
        """Encode text to token IDs.

        Optional method - not all backends support encoding.
        Default implementation raises NotImplementedError.

        Args:
            text: Text to encode.

        Returns:
            List of token IDs.

        Raises:
            NotImplementedError: If encoding is not supported.
        """
        raise NotImplementedError(f"{self.__class__.__name__} does not support encoding")

    def decode(self, tokens: list[int]) -> str:
        """Decode token IDs to text.

        Optional method - not all backends support decoding.
        Default implementation raises NotImplementedError.

        Args:
            tokens: List of token IDs.

        Returns:
            Decoded text.

        Raises:
            NotImplementedError: If decoding is not supported.
        """
        raise NotImplementedError(f"{self.__class__.__name__} does not support decoding")
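Since BaseTokenizer only requires count_text, a custom backend inherits all of the message arithmetic. A sketch with a hypothetical whitespace counter (WhitespaceTokenizer is an illustration, not a class in this package) makes the overhead constants concrete:

# Sketch: a custom backend implements count_text only; the message
# arithmetic (MESSAGE_OVERHEAD, REPLY_OVERHEAD, tool calls) is inherited.
from headroom.tokenizers.base import BaseTokenizer

class WhitespaceTokenizer(BaseTokenizer):
    """Illustrative counter: one token per whitespace-separated word."""

    def count_text(self, text: str) -> int:
        return len(text.split())

tok = WhitespaceTokenizer()
msg = {"role": "user", "content": "three word message"}
# count_messages([msg]) = MESSAGE_OVERHEAD (4) + tokens("user") (1)
#                       + tokens(content) (3) + REPLY_OVERHEAD (3) = 11
# count_message subtracts REPLY_OVERHEAD, giving 8.
assert tok.count_messages([msg]) == 11
assert tok.count_message(msg) == 8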
headroom/tokenizers/estimator.py
ADDED
@@ -0,0 +1,198 @@
"""Estimation-based token counter for fallback scenarios.

When no exact tokenizer is available (e.g., unknown models, missing
dependencies), this provides a reasonable approximation based on
character/word heuristics calibrated against real tokenizers.
"""

from __future__ import annotations

import json
import re
from typing import Any

from .base import BaseTokenizer


class EstimatingTokenCounter(BaseTokenizer):
    """Token counter using estimation heuristics.

    This is the fallback tokenizer used when:
    - Model is unknown/unsupported
    - Required tokenizer library not installed
    - Speed is prioritized over accuracy

    The estimation is calibrated against tiktoken cl100k_base and
    provides ~90% accuracy for typical text. It tends to slightly
    overestimate, which is safer for context window management.

    Estimation Strategy:
    - Base: ~4 characters per token (calibrated against GPT-4)
    - Adjustments for code, URLs, numbers, whitespace
    - Special handling for JSON structure

    Example:
        counter = EstimatingTokenCounter()
        tokens = counter.count_text("Hello, world!")
        print(f"Estimated tokens: {tokens}")
    """

    # Calibration constants (derived from tiktoken analysis)
    CHARS_PER_TOKEN = 4.0  # Average for English text
    CHARS_PER_TOKEN_CODE = 3.5  # Code is denser
    CHARS_PER_TOKEN_JSON = 3.2  # JSON has more structure

    # Patterns for content type detection
    CODE_PATTERN = re.compile(
        r"(?:def |class |function |const |let |var |import |from |"
        r"if \(|for \(|while \(|switch \(|try \{|catch \(|"
        r"=>|->|\{\{|\}\}|;$)",
        re.MULTILINE,
    )
    JSON_PATTERN = re.compile(r"^\s*[\[\{]")
    URL_PATTERN = re.compile(r"https?://\S+")
    UUID_PATTERN = re.compile(
        r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.IGNORECASE
    )

    def __init__(self, chars_per_token: float | None = None):
        """Initialize estimating counter.

        Args:
            chars_per_token: Override default chars per token ratio.
                If None, auto-detects based on content type.
        """
        self._fixed_ratio = chars_per_token

    def count_text(self, text: str) -> int:
        """Estimate token count for text.

        Args:
            text: Text to count tokens for.

        Returns:
            Estimated number of tokens.
        """
        if not text:
            return 0

        # Use fixed ratio if provided
        if self._fixed_ratio is not None:
            return max(1, int(len(text) / self._fixed_ratio + 0.5))

        # Auto-detect content type and adjust ratio
        ratio = self._detect_ratio(text)

        # Apply ratio with minimum of 1 token
        base_count = int(len(text) / ratio + 0.5)

        # Add overhead for special patterns
        overhead = self._count_special_overhead(text)

        return max(1, base_count + overhead)

    def _detect_ratio(self, text: str) -> float:
        """Detect optimal chars-per-token ratio based on content.

        Args:
            text: Text to analyze.

        Returns:
            Chars per token ratio.
        """
        # Check for JSON
        if self.JSON_PATTERN.match(text):
            try:
                json.loads(text)
                return self.CHARS_PER_TOKEN_JSON
            except (json.JSONDecodeError, ValueError):
                pass

        # Check for code
        code_matches = len(self.CODE_PATTERN.findall(text))
        if code_matches > len(text) / 500:  # ~2 matches per KB
            return self.CHARS_PER_TOKEN_CODE

        return self.CHARS_PER_TOKEN

    def _count_special_overhead(self, text: str) -> int:
        """Count additional tokens for special patterns.

        URLs and UUIDs often tokenize into more tokens than
        character count would suggest.

        Args:
            text: Text to analyze.

        Returns:
            Additional token overhead.
        """
        overhead = 0

        # URLs typically tokenize to more tokens
        urls = self.URL_PATTERN.findall(text)
        for url in urls:
            # Each URL component adds overhead
            overhead += url.count("/") + url.count("?") + url.count("&")

        # UUIDs are typically 8-10 tokens despite being 36 chars
        uuids = self.UUID_PATTERN.findall(text)
        overhead += len(uuids) * 2  # Each UUID adds ~2 extra tokens

        return overhead

    def count_messages(self, messages: list[dict[str, Any]]) -> int:
        """Estimate tokens in chat messages.

        Uses the base class implementation with estimation-based
        text counting.

        Args:
            messages: List of chat messages.

        Returns:
            Estimated total token count.
        """
        # Use base class implementation
        return super().count_messages(messages)

    def __repr__(self) -> str:
        if self._fixed_ratio:
            return f"EstimatingTokenCounter(chars_per_token={self._fixed_ratio})"
        return "EstimatingTokenCounter(auto)"


class CharacterCounter(BaseTokenizer):
    """Simple character-based counter.

    Uses a fixed character-to-token ratio. Useful for:
    - Quick approximations
    - Testing
    - Models with unknown tokenization

    This is less accurate than EstimatingTokenCounter but faster.
    """

    def __init__(self, chars_per_token: float = 4.0):
        """Initialize character counter.

        Args:
            chars_per_token: Characters per token ratio.
        """
        self.chars_per_token = chars_per_token

    def count_text(self, text: str) -> int:
        """Count tokens based on character count.

        Args:
            text: Text to count.

        Returns:
            Estimated token count.
        """
        if not text:
            return 0
        return max(1, int(len(text) / self.chars_per_token + 0.5))

    def __repr__(self) -> str:
        return f"CharacterCounter(chars_per_token={self.chars_per_token})"
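A worked sketch of the estimator arithmetic; the values follow directly from the constants and formulas above (rounded division plus per-pattern overhead):

# Sketch: how the estimator's arithmetic plays out on plain text vs. a URL.
from headroom.tokenizers import CharacterCounter, EstimatingTokenCounter

est = EstimatingTokenCounter()

text = "The quick brown fox jumps over the lazy dog."  # 44 chars
# Plain English: ratio 4.0, no special overhead -> int(44/4.0 + 0.5) = 11
assert est.count_text(text) == 11

url_text = "See https://example.com/a/b?x=1&y=2 for details."  # 48 chars
# Base estimate int(48/4.0 + 0.5) = 12, plus URL overhead: 4 slashes
# + 1 '?' + 1 '&' = 6 extra tokens -> 18
assert est.count_text(url_text) == 18

# CharacterCounter skips detection entirely: a fixed ratio only.
fixed = CharacterCounter(chars_per_token=4.0)
assert fixed.count_text(text) == 11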