headroom_ai-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/tokenizer.py ADDED
@@ -0,0 +1,80 @@
+ """Token counting wrapper for Headroom SDK.
+ 
+ This module provides a unified interface for token counting that
+ delegates to provider-specific implementations.
+ """
+ 
+ from __future__ import annotations
+ 
+ from typing import Any
+ 
+ from .providers.base import TokenCounter
+ 
+ 
+ class Tokenizer:
+     """
+     Token counting wrapper with model awareness.
+ 
+     This class wraps a provider-specific TokenCounter to provide
+     a consistent interface throughout the Headroom SDK.
+     """
+ 
+     def __init__(self, token_counter: TokenCounter, model: str = ""):
+         """
+         Initialize tokenizer with a provider's token counter.
+ 
+         Args:
+             token_counter: Provider-specific token counter.
+             model: Model name (for reference only).
+         """
+         self._counter = token_counter
+         self.model = model
+ 
+     def count_text(self, text: str) -> int:
+         """Count tokens in text."""
+         return self._counter.count_text(text)
+ 
+     def count_message(self, message: dict[str, Any]) -> int:
+         """Count tokens in a message."""
+         return self._counter.count_message(message)
+ 
+     def count_messages(self, messages: list[dict[str, Any]]) -> int:
+         """Count tokens in a list of messages."""
+         return self._counter.count_messages(messages)
+ 
+     @property
+     def available(self) -> bool:
+         """Whether token counting is available."""
+         return self._counter is not None
+ 
+ 
+ # Convenience functions that require a token counter
+ def count_tokens_text(text: str, token_counter: TokenCounter) -> int:
+     """
+     Count tokens in a text string.
+ 
+     Args:
+         text: The text to count tokens for.
+         token_counter: Provider-specific token counter.
+ 
+     Returns:
+         Token count.
+     """
+     return token_counter.count_text(text)
+ 
+ 
+ def count_tokens_messages(
+     messages: list[dict[str, Any]],
+     token_counter: TokenCounter,
+ ) -> int:
+     """
+     Count total tokens for a list of messages.
+ 
+     Args:
+         messages: List of message dicts.
+         token_counter: Provider-specific token counter.
+ 
+     Returns:
+         Total token count.
+     """
+     return token_counter.count_messages(messages)
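A minimal usage sketch of the wrapper above (not from the package itself; it assumes the import paths shown in the file listing and uses CharacterCounter from headroom.tokenizers as a stand-in, since the wrapper only needs count_text/count_message/count_messages):

    from headroom.tokenizer import Tokenizer, count_tokens_text
    from headroom.tokenizers import CharacterCounter

    # Stand-in for a provider-specific counter; any object with the
    # count_text/count_message/count_messages methods should work here.
    counter = CharacterCounter(chars_per_token=4.0)
    tokenizer = Tokenizer(counter, model="gpt-4o")

    print(tokenizer.count_text("Hello, world!"))        # 13 chars / 4.0 -> 3
    print(tokenizer.available)                          # True
    print(count_tokens_text("Hello, world!", counter))  # 3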
headroom/tokenizers/__init__.py ADDED
@@ -0,0 +1,75 @@
+ """Pluggable tokenizer system for universal LLM support.
+ 
+ This module provides a registry-based tokenizer system that supports
+ multiple backends:
+ 
+ 1. tiktoken - OpenAI models (GPT-3.5, GPT-4, GPT-4o)
+ 2. HuggingFace - Open models (Llama, Mistral, Falcon, etc.)
+ 3. Anthropic - Claude models (via SDK or estimation)
+ 4. Estimation - Fallback for unknown models
+ 
+ Usage:
+     from headroom.tokenizers import TokenizerRegistry, get_tokenizer
+ 
+     # Auto-detect tokenizer from model name
+     tokenizer = get_tokenizer("gpt-4o")
+     tokens = tokenizer.count_text("Hello, world!")
+ 
+     # Get tokenizer for specific backend
+     tokenizer = get_tokenizer("llama-3-8b", backend="huggingface")
+ 
+     # Register custom tokenizer
+     TokenizerRegistry.register("my-model", my_tokenizer)
+ """
+ 
+ from .base import BaseTokenizer, TokenCounter
+ from .estimator import CharacterCounter, EstimatingTokenCounter
+ from .registry import (
+     TokenizerRegistry,
+     get_tokenizer,
+     list_supported_models,
+     register_tokenizer,
+ )
+ from .tiktoken_counter import TiktokenCounter
+ 
+ 
+ # Lazy imports for optional dependencies
+ def get_huggingface_tokenizer():
+     """Get HuggingFaceTokenizer class (requires transformers)."""
+     from .huggingface import HuggingFaceTokenizer
+ 
+     return HuggingFaceTokenizer
+ 
+ 
+ def get_mistral_tokenizer():
+     """Get MistralTokenizer class (requires mistral-common)."""
+     from .mistral import MistralTokenizer
+ 
+     return MistralTokenizer
+ 
+ 
+ def is_mistral_tokenizer_available() -> bool:
+     """Check if Mistral tokenizer is available."""
+     from .mistral import is_mistral_available
+ 
+     return is_mistral_available()
+ 
+ 
+ __all__ = [
+     # Registry
+     "TokenizerRegistry",
+     "get_tokenizer",
+     "register_tokenizer",
+     "list_supported_models",
+     # Base classes
+     "TokenCounter",
+     "BaseTokenizer",
+     # Implementations
+     "TiktokenCounter",
+     "EstimatingTokenCounter",
+     "CharacterCounter",
+     # Lazy loaders
+     "get_huggingface_tokenizer",
+     "get_mistral_tokenizer",
+     "is_mistral_tokenizer_available",
+ ]
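The lazy loaders above exist so that importing headroom.tokenizers never hard-requires transformers or mistral-common. A short sketch of the intended pattern, using only the calls documented in this module:

    from headroom.tokenizers import get_tokenizer, is_mistral_tokenizer_available

    # Auto-detection path from the module docstring
    tokenizer = get_tokenizer("gpt-4o")
    print(tokenizer.count_text("Hello, world!"))

    # Optional backends are imported only when actually requested
    if is_mistral_tokenizer_available():
        from headroom.tokenizers import get_mistral_tokenizer
        MistralTokenizer = get_mistral_tokenizer()  # imports mistral-common lazily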
headroom/tokenizers/base.py ADDED
@@ -0,0 +1,210 @@
+ """Base classes for tokenizer implementations.
+ 
+ Defines the TokenCounter protocol and BaseTokenizer class that all
+ tokenizer backends must implement.
+ """
+ 
+ from __future__ import annotations
+ 
+ import json
+ from abc import ABC, abstractmethod
+ from typing import Any, Protocol, runtime_checkable
+ 
+ 
+ @runtime_checkable
+ class TokenCounter(Protocol):
+     """Protocol for token counting implementations.
+ 
+     Any class implementing this protocol can be used with Headroom
+     for token counting. This allows integration with various
+     tokenizer backends (tiktoken, HuggingFace, custom, etc.).
+     """
+ 
+     def count_text(self, text: str) -> int:
+         """Count tokens in a text string.
+ 
+         Args:
+             text: The text to count tokens for.
+ 
+         Returns:
+             Number of tokens in the text.
+         """
+         ...
+ 
+     def count_messages(self, messages: list[dict[str, Any]]) -> int:
+         """Count tokens in a list of chat messages.
+ 
+         Args:
+             messages: List of message dicts with 'role' and 'content'.
+ 
+         Returns:
+             Total token count including message overhead.
+         """
+         ...
+ 
+ 
+ class BaseTokenizer(ABC):
+     """Abstract base class for tokenizer implementations.
+ 
+     Provides common functionality for counting messages while
+     requiring subclasses to implement text tokenization.
+     """
+ 
+     # Token overhead per message (role, formatting, etc.)
+     # Override in subclasses for model-specific overhead
+     MESSAGE_OVERHEAD = 4
+     REPLY_OVERHEAD = 3  # Assistant reply start tokens
+ 
+     @abstractmethod
+     def count_text(self, text: str) -> int:
+         """Count tokens in a text string. Must be implemented by subclasses."""
+         pass
+ 
+     def count_message(self, message: dict[str, Any]) -> int:
+         """Count tokens in a single message.
+ 
+         Args:
+             message: A message dict with 'role' and 'content'.
+ 
+         Returns:
+             Token count for this message.
+         """
+         return self.count_messages([message]) - self.REPLY_OVERHEAD
+ 
+     def count_messages(self, messages: list[dict[str, Any]]) -> int:
+         """Count tokens in a list of chat messages.
+ 
+         Uses OpenAI-style message counting as the baseline, which
+         works well for most models.
+ 
+         Args:
+             messages: List of message dicts.
+ 
+         Returns:
+             Total token count.
+         """
+         total = 0
+ 
+         for message in messages:
+             # Base message overhead
+             total += self.MESSAGE_OVERHEAD
+ 
+             # Count role
+             role = message.get("role", "")
+             total += self.count_text(role)
+ 
+             # Count content
+             content = message.get("content")
+             if content is not None:
+                 if isinstance(content, str):
+                     total += self.count_text(content)
+                 elif isinstance(content, list):
+                     # Multi-part content (images, tool results, etc.)
+                     total += self._count_content_parts(content)
+ 
+             # Count tool calls
+             tool_calls = message.get("tool_calls")
+             if tool_calls:
+                 total += self._count_tool_calls(tool_calls)
+ 
+             # Count function call (legacy)
+             function_call = message.get("function_call")
+             if function_call:
+                 total += self._count_function_call(function_call)
+ 
+             # Count name field
+             name = message.get("name")
+             if name:
+                 total += self.count_text(name)
+                 total += 1  # Name field overhead
+ 
+         # Reply start overhead
+         total += self.REPLY_OVERHEAD
+ 
+         return total
+ 
+     def _count_content_parts(self, parts: list[Any]) -> int:
+         """Count tokens in multi-part content."""
+         total = 0
+         for part in parts:
+             if isinstance(part, dict):
+                 part_type = part.get("type", "")
+ 
+                 if part_type == "text":
+                     total += self.count_text(part.get("text", ""))
+                 elif part_type == "image_url":
+                     # Images have fixed token cost (varies by model)
+                     total += 85  # Base image token count
+                 elif part_type == "tool_result":
+                     content = part.get("content", "")
+                     if isinstance(content, str):
+                         total += self.count_text(content)
+                     else:
+                         total += self.count_text(json.dumps(content))
+                 elif part_type == "tool_use":
+                     total += self.count_text(part.get("name", ""))
+                     total += self.count_text(json.dumps(part.get("input", {})))
+                 else:
+                     # Unknown type - estimate from JSON
+                     total += self.count_text(json.dumps(part))
+             elif isinstance(part, str):
+                 total += self.count_text(part)
+ 
+         return total
+ 
+     def _count_tool_calls(self, tool_calls: list[dict[str, Any]]) -> int:
+         """Count tokens in tool calls."""
+         total = 0
+         for call in tool_calls:
+             total += 4  # Tool call overhead
+ 
+             if "function" in call:
+                 func = call["function"]
+                 total += self.count_text(func.get("name", ""))
+                 total += self.count_text(func.get("arguments", ""))
+ 
+             if "id" in call:
+                 total += self.count_text(call["id"])
+ 
+         return total
+ 
+     def _count_function_call(self, function_call: dict[str, Any]) -> int:
+         """Count tokens in legacy function call."""
+         total = 4  # Function call overhead
+         total += self.count_text(function_call.get("name", ""))
+         total += self.count_text(function_call.get("arguments", ""))
+         return total
+ 
+     def encode(self, text: str) -> list[int]:
+         """Encode text to token IDs.
+ 
+         Optional method - not all backends support encoding.
+         Default implementation raises NotImplementedError.
+ 
+         Args:
+             text: Text to encode.
+ 
+         Returns:
+             List of token IDs.
+ 
+         Raises:
+             NotImplementedError: If encoding is not supported.
+         """
+         raise NotImplementedError(f"{self.__class__.__name__} does not support encoding")
+ 
+     def decode(self, tokens: list[int]) -> str:
+         """Decode token IDs to text.
+ 
+         Optional method - not all backends support decoding.
+         Default implementation raises NotImplementedError.
+ 
+         Args:
+             tokens: List of token IDs.
+ 
+         Returns:
+             Decoded text.
+ 
+         Raises:
+             NotImplementedError: If decoding is not supported.
+         """
+         raise NotImplementedError(f"{self.__class__.__name__} does not support decoding")
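To make the contract concrete, here is a toy subclass (hypothetical, not part of the package): a backend implements only count_text, while count_message, count_messages, and the multi-part/tool-call accounting are inherited from BaseTokenizer.

    import re

    from headroom.tokenizers.base import BaseTokenizer

    class WordCounter(BaseTokenizer):
        """Toy backend: one token per whitespace-delimited word."""

        def count_text(self, text: str) -> int:
            return len(re.findall(r"\S+", text))

    wc = WordCounter()
    # 4 (MESSAGE_OVERHEAD) + 1 ("user") + 2 ("two words") + 3 (REPLY_OVERHEAD) = 10
    print(wc.count_messages([{"role": "user", "content": "two words"}]))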
headroom/tokenizers/estimator.py ADDED
@@ -0,0 +1,198 @@
+ """Estimation-based token counter for fallback scenarios.
+ 
+ When no exact tokenizer is available (e.g., unknown models, missing
+ dependencies), this provides a reasonable approximation based on
+ character/word heuristics calibrated against real tokenizers.
+ """
+ 
+ from __future__ import annotations
+ 
+ import json
+ import re
+ from typing import Any
+ 
+ from .base import BaseTokenizer
+ 
+ 
+ class EstimatingTokenCounter(BaseTokenizer):
+     """Token counter using estimation heuristics.
+ 
+     This is the fallback tokenizer used when:
+     - Model is unknown/unsupported
+     - Required tokenizer library not installed
+     - Speed is prioritized over accuracy
+ 
+     The estimation is calibrated against tiktoken cl100k_base and
+     provides ~90% accuracy for typical text. It tends to slightly
+     overestimate, which is safer for context window management.
+ 
+     Estimation Strategy:
+     - Base: ~4 characters per token (calibrated against GPT-4)
+     - Adjustments for code, URLs, numbers, whitespace
+     - Special handling for JSON structure
+ 
+     Example:
+         counter = EstimatingTokenCounter()
+         tokens = counter.count_text("Hello, world!")
+         print(f"Estimated tokens: {tokens}")
+     """
+ 
+     # Calibration constants (derived from tiktoken analysis)
+     CHARS_PER_TOKEN = 4.0  # Average for English text
+     CHARS_PER_TOKEN_CODE = 3.5  # Code is denser
+     CHARS_PER_TOKEN_JSON = 3.2  # JSON has more structure
+ 
+     # Patterns for content type detection
+     CODE_PATTERN = re.compile(
+         r"(?:def |class |function |const |let |var |import |from |"
+         r"if \(|for \(|while \(|switch \(|try \{|catch \(|"
+         r"=>|->|\{\{|\}\}|;$)",
+         re.MULTILINE,
+     )
+     JSON_PATTERN = re.compile(r"^\s*[\[\{]")
+     URL_PATTERN = re.compile(r"https?://\S+")
+     UUID_PATTERN = re.compile(
+         r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.IGNORECASE
+     )
+ 
+     def __init__(self, chars_per_token: float | None = None):
+         """Initialize estimating counter.
+ 
+         Args:
+             chars_per_token: Override default chars per token ratio.
+                 If None, auto-detects based on content type.
+         """
+         self._fixed_ratio = chars_per_token
+ 
+     def count_text(self, text: str) -> int:
+         """Estimate token count for text.
+ 
+         Args:
+             text: Text to count tokens for.
+ 
+         Returns:
+             Estimated number of tokens.
+         """
+         if not text:
+             return 0
+ 
+         # Use fixed ratio if provided
+         if self._fixed_ratio is not None:
+             return max(1, int(len(text) / self._fixed_ratio + 0.5))
+ 
+         # Auto-detect content type and adjust ratio
+         ratio = self._detect_ratio(text)
+ 
+         # Apply ratio with minimum of 1 token
+         base_count = int(len(text) / ratio + 0.5)
+ 
+         # Add overhead for special patterns
+         overhead = self._count_special_overhead(text)
+ 
+         return max(1, base_count + overhead)
+ 
+     def _detect_ratio(self, text: str) -> float:
+         """Detect optimal chars-per-token ratio based on content.
+ 
+         Args:
+             text: Text to analyze.
+ 
+         Returns:
+             Chars per token ratio.
+         """
+         # Check for JSON
+         if self.JSON_PATTERN.match(text):
+             try:
+                 json.loads(text)
+                 return self.CHARS_PER_TOKEN_JSON
+             except (json.JSONDecodeError, ValueError):
+                 pass
+ 
+         # Check for code
+         code_matches = len(self.CODE_PATTERN.findall(text))
+         if code_matches > len(text) / 500:  # ~2 matches per KB
+             return self.CHARS_PER_TOKEN_CODE
+ 
+         return self.CHARS_PER_TOKEN
+ 
+     def _count_special_overhead(self, text: str) -> int:
+         """Count additional tokens for special patterns.
+ 
+         URLs and UUIDs often tokenize into more tokens than
+         character count would suggest.
+ 
+         Args:
+             text: Text to analyze.
+ 
+         Returns:
+             Additional token overhead.
+         """
+         overhead = 0
+ 
+         # URLs typically tokenize to more tokens
+         urls = self.URL_PATTERN.findall(text)
+         for url in urls:
+             # Each URL component adds overhead
+             overhead += url.count("/") + url.count("?") + url.count("&")
+ 
+         # UUIDs are typically 8-10 tokens despite being 36 chars
+         uuids = self.UUID_PATTERN.findall(text)
+         overhead += len(uuids) * 2  # Each UUID adds ~2 extra tokens
+ 
+         return overhead
+ 
+     def count_messages(self, messages: list[dict[str, Any]]) -> int:
+         """Estimate tokens in chat messages.
+ 
+         Uses the base class implementation with estimation-based
+         text counting.
+ 
+         Args:
+             messages: List of chat messages.
+ 
+         Returns:
+             Estimated total token count.
+         """
+         # Use base class implementation
+         return super().count_messages(messages)
+ 
+     def __repr__(self) -> str:
+         if self._fixed_ratio:
+             return f"EstimatingTokenCounter(chars_per_token={self._fixed_ratio})"
+         return "EstimatingTokenCounter(auto)"
+ 
+ 
+ class CharacterCounter(BaseTokenizer):
+     """Simple character-based counter.
+ 
+     Uses a fixed character-to-token ratio. Useful for:
+     - Quick approximations
+     - Testing
+     - Models with unknown tokenization
+ 
+     This is less accurate than EstimatingTokenCounter but faster.
+     """
+ 
+     def __init__(self, chars_per_token: float = 4.0):
+         """Initialize character counter.
+ 
+         Args:
+             chars_per_token: Characters per token ratio.
+         """
+         self.chars_per_token = chars_per_token
+ 
+     def count_text(self, text: str) -> int:
+         """Count tokens based on character count.
+ 
+         Args:
+             text: Text to count.
+ 
+         Returns:
+             Estimated token count.
+         """
+         if not text:
+             return 0
+         return max(1, int(len(text) / self.chars_per_token + 0.5))
+ 
+     def __repr__(self) -> str:
+         return f"CharacterCounter(chars_per_token={self.chars_per_token})"