kolega-code 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. kolega_code/__init__.py +151 -0
  2. kolega_code/agent/__init__.py +42 -0
  3. kolega_code/agent/baseagent.py +998 -0
  4. kolega_code/agent/browseragent.py +123 -0
  5. kolega_code/agent/coder.py +157 -0
  6. kolega_code/agent/common.py +41 -0
  7. kolega_code/agent/compression.py +81 -0
  8. kolega_code/agent/context.py +112 -0
  9. kolega_code/agent/conversation.py +408 -0
  10. kolega_code/agent/generalagent.py +146 -0
  11. kolega_code/agent/investigationagent.py +123 -0
  12. kolega_code/agent/planningagent.py +187 -0
  13. kolega_code/agent/prompt_provider.py +196 -0
  14. kolega_code/agent/prompt_templates/agents/browser.j2 +102 -0
  15. kolega_code/agent/prompt_templates/agents/coder_cli_mode.j2 +127 -0
  16. kolega_code/agent/prompt_templates/agents/general.j2 +68 -0
  17. kolega_code/agent/prompt_templates/agents/investigation.j2 +72 -0
  18. kolega_code/agent/prompt_templates/common/frontend_guidance.md +36 -0
  19. kolega_code/agent/prompt_templates/common/kolega_md_instructions.md +14 -0
  20. kolega_code/agent/prompt_templates/environment_variables/workspace_env_vars.md +11 -0
  21. kolega_code/agent/prompt_templates/template_guidance/expo-template.md +379 -0
  22. kolega_code/agent/prompt_templates/template_guidance/html-website-template.md +3 -0
  23. kolega_code/agent/prompt_templates/template_guidance/mern-stack-template.md +3 -0
  24. kolega_code/agent/prompt_templates/template_guidance/react-vite-shadcdn-template.md +182 -0
  25. kolega_code/agent/prompts.py +192 -0
  26. kolega_code/agent/tests/__init__.py +0 -0
  27. kolega_code/agent/tests/llm/__init__.py +0 -0
  28. kolega_code/agent/tests/llm/test_anthropic_token_counting.py +633 -0
  29. kolega_code/agent/tests/llm/test_billing_openai_cache.py +74 -0
  30. kolega_code/agent/tests/llm/test_client.py +773 -0
  31. kolega_code/agent/tests/llm/test_dashscope_mapping.py +32 -0
  32. kolega_code/agent/tests/llm/test_error_boundary.py +322 -0
  33. kolega_code/agent/tests/llm/test_exceptions.py +249 -0
  34. kolega_code/agent/tests/llm/test_instrumented_client.py +536 -0
  35. kolega_code/agent/tests/llm/test_instrumented_client_integration.py +547 -0
  36. kolega_code/agent/tests/llm/test_langfuse_normalization.py +39 -0
  37. kolega_code/agent/tests/llm/test_model_specs.py +17 -0
  38. kolega_code/agent/tests/llm/test_openai_cached_tokens.py +58 -0
  39. kolega_code/agent/tests/llm/test_openai_cached_tokens_stream.py +74 -0
  40. kolega_code/agent/tests/llm/test_openai_message_conversion.py +30 -0
  41. kolega_code/agent/tests/llm/test_openai_token_counting.py +687 -0
  42. kolega_code/agent/tests/llm/test_tool_execution_ids.py +193 -0
  43. kolega_code/agent/tests/services/__init__.py +1 -0
  44. kolega_code/agent/tests/services/test_browser.py +447 -0
  45. kolega_code/agent/tests/services/test_browser_parity.py +353 -0
  46. kolega_code/agent/tests/services/test_file_system.py +699 -0
  47. kolega_code/agent/tests/services/test_sandbox_terminal_input.py +98 -0
  48. kolega_code/agent/tests/services/test_terminal.py +154 -0
  49. kolega_code/agent/tests/services/test_terminal_command_tracking.py +385 -0
  50. kolega_code/agent/tests/services/test_terminal_state_serializer.py +262 -0
  51. kolega_code/agent/tests/test_agent_tools_inventory.py +267 -0
  52. kolega_code/agent/tests/test_base_agent.py +1942 -0
  53. kolega_code/agent/tests/test_coder_attachments.py +330 -0
  54. kolega_code/agent/tests/test_coder_prompt_extensions.py +61 -0
  55. kolega_code/agent/tests/test_commands.py +179 -0
  56. kolega_code/agent/tests/test_duplicate_tool_results.py +556 -0
  57. kolega_code/agent/tests/test_empty_message_handling.py +48 -0
  58. kolega_code/agent/tests/test_general_agent.py +242 -0
  59. kolega_code/agent/tests/test_html.py +320 -0
  60. kolega_code/agent/tests/test_parallel_tool_calls.py +291 -0
  61. kolega_code/agent/tests/test_planning_agent.py +227 -0
  62. kolega_code/agent/tests/test_prompt_provider.py +271 -0
  63. kolega_code/agent/tests/test_tool_registry.py +102 -0
  64. kolega_code/agent/tests/test_tools.py +549 -0
  65. kolega_code/agent/tests/tool_backend/__init__.py +0 -0
  66. kolega_code/agent/tests/tool_backend/test_agent_tool.py +356 -0
  67. kolega_code/agent/tests/tool_backend/test_base_tool.py +147 -0
  68. kolega_code/agent/tests/tool_backend/test_browser_tool.py +335 -0
  69. kolega_code/agent/tests/tool_backend/test_build_tool.py +93 -0
  70. kolega_code/agent/tests/tool_backend/test_create_file_tool.py +115 -0
  71. kolega_code/agent/tests/tool_backend/test_glob_tool.py +196 -0
  72. kolega_code/agent/tests/tool_backend/test_glob_tool_sandbox_parity.py +230 -0
  73. kolega_code/agent/tests/tool_backend/test_list_directory_tool.py +292 -0
  74. kolega_code/agent/tests/tool_backend/test_read_file_tool.py +173 -0
  75. kolega_code/agent/tests/tool_backend/test_replace_entire_file_tool.py +115 -0
  76. kolega_code/agent/tests/tool_backend/test_replace_lines_tool.py +141 -0
  77. kolega_code/agent/tests/tool_backend/test_search_and_replace_tool.py +174 -0
  78. kolega_code/agent/tests/tool_backend/test_search_codebase_tool.py +228 -0
  79. kolega_code/agent/tests/tool_backend/test_terminal_tool.py +482 -0
  80. kolega_code/agent/tests/tool_backend/test_think_hard_integration.py +189 -0
  81. kolega_code/agent/tests/tool_backend/test_think_hard_streaming.py +445 -0
  82. kolega_code/agent/tests/tool_backend/test_web_fetch_tool.py +194 -0
  83. kolega_code/agent/tool_backend/agent_tool.py +414 -0
  84. kolega_code/agent/tool_backend/apply_edit_tool.py +98 -0
  85. kolega_code/agent/tool_backend/apply_patch_tool.py +514 -0
  86. kolega_code/agent/tool_backend/base_tool.py +217 -0
  87. kolega_code/agent/tool_backend/browser_tool.py +271 -0
  88. kolega_code/agent/tool_backend/build_tool.py +93 -0
  89. kolega_code/agent/tool_backend/create_file_tool.py +52 -0
  90. kolega_code/agent/tool_backend/glob_tool.py +323 -0
  91. kolega_code/agent/tool_backend/list_directory_tool.py +300 -0
  92. kolega_code/agent/tool_backend/memory_tool.py +79 -0
  93. kolega_code/agent/tool_backend/read_file_tool.py +119 -0
  94. kolega_code/agent/tool_backend/replace_entire_file_tool.py +40 -0
  95. kolega_code/agent/tool_backend/replace_lines_tool.py +97 -0
  96. kolega_code/agent/tool_backend/search_and_replace_tool.py +146 -0
  97. kolega_code/agent/tool_backend/search_codebase_tool.py +377 -0
  98. kolega_code/agent/tool_backend/streaming_tool.py +47 -0
  99. kolega_code/agent/tool_backend/terminal_tool.py +643 -0
  100. kolega_code/agent/tool_backend/think_hard_tool.py +211 -0
  101. kolega_code/agent/tool_backend/web_fetch_tool.py +205 -0
  102. kolega_code/agent/tools.py +1704 -0
  103. kolega_code/agent/utils/commands.py +94 -0
  104. kolega_code/cli/__init__.py +1 -0
  105. kolega_code/cli/app.py +2756 -0
  106. kolega_code/cli/config.py +280 -0
  107. kolega_code/cli/connection.py +49 -0
  108. kolega_code/cli/file_index.py +147 -0
  109. kolega_code/cli/main.py +564 -0
  110. kolega_code/cli/mentions.py +155 -0
  111. kolega_code/cli/messages.py +89 -0
  112. kolega_code/cli/provider_registry.py +96 -0
  113. kolega_code/cli/session_store.py +207 -0
  114. kolega_code/cli/settings.py +87 -0
  115. kolega_code/cli/skills.py +409 -0
  116. kolega_code/cli/slash_commands.py +108 -0
  117. kolega_code/cli/tests/__init__.py +1 -0
  118. kolega_code/cli/tests/test_app.py +4251 -0
  119. kolega_code/cli/tests/test_cli_config.py +171 -0
  120. kolega_code/cli/tests/test_connection.py +26 -0
  121. kolega_code/cli/tests/test_file_index.py +103 -0
  122. kolega_code/cli/tests/test_main.py +455 -0
  123. kolega_code/cli/tests/test_mentions.py +108 -0
  124. kolega_code/cli/tests/test_session_store.py +67 -0
  125. kolega_code/cli/tests/test_settings.py +62 -0
  126. kolega_code/cli/tests/test_skills.py +157 -0
  127. kolega_code/cli/tests/test_slash_commands.py +88 -0
  128. kolega_code/cli/theme.py +180 -0
  129. kolega_code/config.py +154 -0
  130. kolega_code/events.py +202 -0
  131. kolega_code/llm/client.py +300 -0
  132. kolega_code/llm/exceptions.py +285 -0
  133. kolega_code/llm/instrumented_client.py +520 -0
  134. kolega_code/llm/models.py +1368 -0
  135. kolega_code/llm/providers/__init__.py +0 -0
  136. kolega_code/llm/providers/anthropic.py +387 -0
  137. kolega_code/llm/providers/base.py +71 -0
  138. kolega_code/llm/providers/google.py +157 -0
  139. kolega_code/llm/providers/models.py +37 -0
  140. kolega_code/llm/providers/openai.py +363 -0
  141. kolega_code/llm/ratelimit.py +40 -0
  142. kolega_code/llm/specs.py +67 -0
  143. kolega_code/llm/tool_execution_ids.py +18 -0
  144. kolega_code/models/__init__.py +9 -0
  145. kolega_code/models/sandbox_terminal_state.py +47 -0
  146. kolega_code/runtime.py +50 -0
  147. kolega_code/sandbox/README.md +200 -0
  148. kolega_code/sandbox/__init__.py +21 -0
  149. kolega_code/sandbox/async_filesystem.py +475 -0
  150. kolega_code/sandbox/base.py +297 -0
  151. kolega_code/sandbox/browser.py +25 -0
  152. kolega_code/sandbox/event_loop.py +43 -0
  153. kolega_code/sandbox/filesystem.py +341 -0
  154. kolega_code/sandbox/local.py +118 -0
  155. kolega_code/sandbox/serializer.py +175 -0
  156. kolega_code/sandbox/terminal.py +868 -0
  157. kolega_code/sandbox/utils.py +216 -0
  158. kolega_code/services/base.py +255 -0
  159. kolega_code/services/browser.py +444 -0
  160. kolega_code/services/file_system.py +749 -0
  161. kolega_code/services/html.py +221 -0
  162. kolega_code/services/terminal.py +903 -0
  163. kolega_code/tools/__init__.py +22 -0
  164. kolega_code/tools/core.py +33 -0
  165. kolega_code/tools/definitions.py +81 -0
  166. kolega_code/tools/registry.py +73 -0
  167. kolega_code-0.1.0.dist-info/METADATA +157 -0
  168. kolega_code-0.1.0.dist-info/RECORD +171 -0
  169. kolega_code-0.1.0.dist-info/WHEEL +4 -0
  170. kolega_code-0.1.0.dist-info/entry_points.txt +2 -0
  171. kolega_code-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,633 @@
1
+ """
2
+ Comprehensive tests comparing local vs API token counting for Anthropic provider.
3
+
4
+ These tests verify that local tiktoken-based token counting is within 5% accuracy
5
+ of Anthropic's official API token counting, using real system prompts and tool definitions.
6
+ """
7
+
8
+ import os
9
+ from pathlib import Path
10
+ from typing import List
11
+ from unittest.mock import Mock, patch
12
+
13
+ import pytest
14
+ from dotenv import load_dotenv
15
+
16
+ from kolega_code.config import AgentConfig, ModelConfig, ModelProvider, RateLimitConfig
17
+ from kolega_code.events import AgentConnectionManager
18
+ from kolega_code.llm.client import LLMClient
19
+ from kolega_code.llm.models import Message, MessageHistory, TextBlock, ImageBlock, ToolDefinition, ToolParameter
20
+ from kolega_code.llm.providers.anthropic import AnthropicProvider
21
+ from kolega_code.agent.prompt_provider import AgentMode, AgentType, PromptContext, PromptProvider
22
+ from kolega_code.agent.tools import ToolCollection, ToolCollectionConfig
23
+
24
+ # Load environment variables
25
+ # Navigate up to backend directory: llm -> tests -> agent -> kolega_code -> backend
26
# Climb five directory levels from this file (llm -> tests -> agent ->
# kolega_code -> backend) and look for a .env file there.
_backend_dir = __file__
for _ in range(5):
    _backend_dir = os.path.dirname(_backend_dir)

dotenv_path = os.path.join(_backend_dir, ".env")
if os.path.exists(dotenv_path):
    # Only load when the file is actually present.
    load_dotenv(dotenv_path)
31
+
32
+
33
@pytest.fixture
def api_key():
    """Return the Anthropic API key from the environment.

    Skips the requesting test when ``ANTHROPIC_API_KEY`` is unset or empty.
    """
    key = os.environ.get('ANTHROPIC_API_KEY')
    if key:
        return key
    # pytest.skip raises, so nothing after this line runs.
    pytest.skip('ANTHROPIC_API_KEY not set')
40
+
41
+
42
@pytest.fixture
def anthropic_provider_local(api_key):
    """Build an AnthropicProvider while the local-counting flag is 'true'.

    The environment override is active only for the duration of the
    constructor call and is restored afterwards.
    """
    env_override = {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': 'true'}
    with patch.dict(os.environ, env_override):
        return AnthropicProvider(api_key=api_key)
48
+
49
+
50
@pytest.fixture
def anthropic_provider_api(api_key):
    """Build an AnthropicProvider while the local-counting flag is 'false'.

    The environment override is active only for the duration of the
    constructor call and is restored afterwards.
    """
    env_override = {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': 'false'}
    with patch.dict(os.environ, env_override):
        return AnthropicProvider(api_key=api_key)
56
+
57
+
58
@pytest.fixture
def simple_messages():
    """Return a history holding one short user message."""
    greeting = Message("user", [TextBlock("Hello, how are you?")])
    return MessageHistory([greeting])
62
+
63
+
64
@pytest.fixture
def simple_system():
    """Return a one-sentence system message."""
    instruction = TextBlock("You are a helpful assistant.")
    return Message("system", [instruction])
68
+
69
+
70
@pytest.fixture
def real_system_prompt():
    """Render the coder-agent (CLI mode) system prompt as a system Message.

    The prompt is produced by PromptProvider from a fixed, synthetic
    PromptContext so the rendered text does not depend on the machine
    running the test.
    """
    # Fixed context values; nothing here is read from the real environment.
    context_fields = dict(
        system_name="Kolega Studio",
        project_path="/test/project",
        is_git_repo=True,
        platform="Linux",
        date_today="2025-01-01",
        model_name="claude-sonnet-4-5-20250929",
        available_ports=[3000, 8000],
        kolega_md="",
        workspace_id="test-workspace",
        workspace_environment_variables=None,
        memories=None,
    )

    rendered = PromptProvider().get_system_prompt(
        agent_type=AgentType.CODER,
        mode=AgentMode.CLI,
        template_slug=None,
        context=PromptContext(**context_fields),
    )
    return Message("system", [TextBlock(rendered)])
96
+
97
+
98
@pytest.fixture
def real_tools(tmp_path):
    """Return the tool list produced by a ToolCollection for the coder tool group.

    The collection is built against a temporary project directory, a mocked
    connection manager and placeholder API keys.
    """

    def _haiku_config():
        # A fresh ModelConfig (and RateLimitConfig) per slot, as in three
        # separately written literals.
        return ModelConfig(
            provider=ModelProvider.ANTHROPIC,
            model="claude-haiku-4-5-20251001",
            rate_limits=RateLimitConfig(),
        )

    agent_config = AgentConfig(
        anthropic_api_key="test",
        openai_api_key="test",
        long_context_config=_haiku_config(),
        fast_config=_haiku_config(),
        thinking_config=_haiku_config(),
    )

    # Tools left out of the collection for these tests.
    excluded_tools = [
        "read_memory",
        "write_memory",
        "execute_terminal_command",
        "replace_lines",
        "apply_patch",
        "edit_file",
        "get_tool_list",
        "log_error",
        "log_info",
        "run_command",
        "dispatch_coding_agent",
    ]
    collection_config = ToolCollectionConfig(
        custom_tool_groups=["coder_agent_tools"],
        tool_exclusions=excluded_tools,
    )

    collection = ToolCollection(
        project_path=tmp_path,
        workspace_id="test-workspace",
        thread_id="test-thread",
        connection_manager=Mock(spec=AgentConnectionManager),
        config=agent_config,
        caller=None,
        tool_config=collection_config,
    )
    return collection.get_tool_list()
150
+
151
+
152
@pytest.fixture
def complex_messages():
    """Return a four-message user/assistant exchange, including a fenced code snippet."""
    # (role, text) pairs in conversation order; each becomes a one-block Message.
    turns = [
        ("user", "Can you help me write a Python function?"),
        (
            "assistant",
            "Of course! I'd be happy to help you write a Python function. What would you like the function to do?",
        ),
        ("user", "I need a function that calculates the factorial of a number recursively."),
        (
            "assistant",
            "Here's a recursive factorial function:\n\n```python\ndef factorial(n):\n if n == 0 or n == 1:\n return 1\n return n * factorial(n - 1)\n```",
        ),
    ]
    return MessageHistory([Message(role, [TextBlock(text)]) for role, text in turns])
177
+
178
+
179
def calculate_percentage_difference(local_count: int, api_count: int) -> float:
    """Return the absolute difference between the counts as a percentage of ``api_count``.

    Args:
        local_count: Token count produced by the local estimator.
        api_count: Token count reported by the API; used as the reference value.

    Returns:
        ``abs(local_count - api_count) / api_count * 100``. When ``api_count``
        is 0 the ratio is undefined: the result is ``0.0`` if ``local_count``
        is also 0 (perfect agreement) and ``float("inf")`` otherwise, so a
        mismatch against a zero reference can never pass a threshold check.
    """
    if api_count == 0:
        # Previously any local count was reported as a 0% difference here,
        # which silently hid a complete mismatch.
        return 0.0 if local_count == 0 else float("inf")
    return abs(local_count - api_count) / api_count * 100
184
+
185
+
186
def get_accuracy_threshold(api_count: int) -> float:
    """Return the allowed percentage difference for a given API token count.

    Counts below 200 tokens get a lenient 15% allowance, since fixed overhead
    weighs more heavily on small samples. Counts of 200 or more must stay
    within the strict 5% allowance.
    """
    is_small_sample = api_count < 200
    return 15.0 if is_small_sample else 5.0
196
+
197
+
198
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_simple_message_comparison(
    anthropic_provider_local,
    anthropic_provider_api,
    simple_messages,
    simple_system,
):
    """Local and API token counts must agree for a minimal user/system exchange."""
    shared_args = dict(
        messages=simple_messages,
        system=simple_system,
        model="claude-sonnet-4-5-20250929",
    )

    # Same request through both counting paths; each call gets its own empty tool list.
    local = await anthropic_provider_local.count_tokens(tools=[], **shared_args)
    remote = await anthropic_provider_api.count_tokens(tools=[], **shared_args)
    local_tokens, api_tokens = local.input_tokens, remote.input_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens)

    report = [
        "\nSimple message comparison:",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}% (small sample)",
    ]
    print("\n".join(report))

    failure_note = (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
    assert diff_pct <= threshold, failure_note
239
+
240
+
241
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_with_real_system_prompt(
    anthropic_provider_local,
    anthropic_provider_api,
    simple_messages,
    real_system_prompt,
):
    """Local and API token counts must agree when the rendered coder system prompt is used."""
    shared_args = dict(
        messages=simple_messages,
        system=real_system_prompt,
        model="claude-sonnet-4-5-20250929",
    )

    # Same request through both counting paths; each call gets its own empty tool list.
    local = await anthropic_provider_local.count_tokens(tools=[], **shared_args)
    remote = await anthropic_provider_api.count_tokens(tools=[], **shared_args)
    local_tokens, api_tokens = local.input_tokens, remote.input_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens)

    report = [
        "\nReal system prompt comparison:",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}%",
    ]
    print("\n".join(report))

    # The allowance comes from get_accuracy_threshold and depends on the API count.
    failure_note = (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
    assert diff_pct <= threshold, failure_note
282
+
283
+
284
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_with_tools(
    anthropic_provider_local,
    anthropic_provider_api,
    simple_messages,
    simple_system,
    real_tools,
):
    """Local and API token counts must agree when ToolCollection tool definitions are attached."""
    shared_args = dict(
        messages=simple_messages,
        system=simple_system,
        model="claude-sonnet-4-5-20250929",
        tools=real_tools,
    )

    # Identical request through both counting paths.
    local = await anthropic_provider_local.count_tokens(**shared_args)
    remote = await anthropic_provider_api.count_tokens(**shared_args)
    local_tokens, api_tokens = local.input_tokens, remote.input_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens)

    report = [
        "\nWith tools comparison:",
        f" Tool count: {len(real_tools)}",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}%",
    ]
    print("\n".join(report))

    # The allowance comes from get_accuracy_threshold and depends on the API count.
    failure_note = (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
    assert diff_pct <= threshold, failure_note
327
+
328
+
329
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_with_complex_conversation(
    anthropic_provider_local,
    anthropic_provider_api,
    complex_messages,
    simple_system,
):
    """Local and API token counts must agree for a multi-turn conversation."""
    shared_args = dict(
        messages=complex_messages,
        system=simple_system,
        model="claude-sonnet-4-5-20250929",
    )

    # Same request through both counting paths; each call gets its own empty tool list.
    local = await anthropic_provider_local.count_tokens(tools=[], **shared_args)
    remote = await anthropic_provider_api.count_tokens(tools=[], **shared_args)
    local_tokens, api_tokens = local.input_tokens, remote.input_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens)

    report = [
        "\nComplex conversation comparison:",
        f" Message count: {len(complex_messages)}",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}% (small sample)",
    ]
    print("\n".join(report))

    failure_note = (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
    assert diff_pct <= threshold, failure_note
371
+
372
+
373
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_with_images(
    anthropic_provider_local,
    anthropic_provider_api,
    simple_system,
):
    """Both counting paths must account for an attached image, within a loose 200% bound."""
    # Base64 payload of a very small PNG used as the attachment.
    tiny_image_base64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
    )
    question = TextBlock("What do you see in this image?")
    picture = ImageBlock(image_type="base64", media_type="image/png", data=tiny_image_base64)
    messages_with_image = MessageHistory([Message("user", [question, picture])])

    shared_args = dict(
        messages=messages_with_image,
        system=simple_system,
        model="claude-sonnet-4-5-20250929",
    )

    # Same request through both counting paths; each call gets its own empty tool list.
    local = await anthropic_provider_local.count_tokens(tools=[], **shared_args)
    remote = await anthropic_provider_api.count_tokens(tools=[], **shared_args)
    local_tokens, api_tokens = local.input_tokens, remote.input_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    image_threshold = 200.0

    report = [
        "\nWith images comparison:",
        f" Image size: {len(tiny_image_base64)} chars (base64)",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {image_threshold:.1f}% (image estimate)",
    ]
    print("\n".join(report))

    # A floor of 20 tokens is intended to sit above the text-only count for
    # this request, so clearing it shows the image contributed to the total.
    assert local_tokens > 20, "Local counting should include image tokens"
    assert api_tokens > 20, "API counting should include image tokens"

    # Image cost is estimated locally without decoding the pixels, and this
    # tiny image is an edge case, so only a very loose agreement is required.
    failure_note = (
        f"Difference {diff_pct:.2f}% exceeds {image_threshold:.1f}% threshold for images "
        f"(local={local_tokens}, api={api_tokens})"
    )
    assert diff_pct <= image_threshold, failure_note
444
+
445
+
446
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_full_agent_context(
    anthropic_provider_local,
    anthropic_provider_api,
    complex_messages,
    real_system_prompt,
    real_tools,
):
    """Local and API counts must agree for the full context: real prompt, multi-turn history and tools."""
    shared_args = dict(
        messages=complex_messages,
        system=real_system_prompt,
        model="claude-sonnet-4-5-20250929",
        tools=real_tools,
    )

    # Identical request through both counting paths.
    local = await anthropic_provider_local.count_tokens(**shared_args)
    remote = await anthropic_provider_api.count_tokens(**shared_args)
    local_tokens, api_tokens = local.input_tokens, remote.input_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens)

    report = [
        "\nFull agent context comparison:",
        f" Message count: {len(complex_messages)}",
        f" Tool count: {len(real_tools)}",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}%",
    ]
    print("\n".join(report))

    # The allowance comes from get_accuracy_threshold and depends on the API count.
    failure_note = (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
    assert diff_pct <= threshold, failure_note
490
+
491
+
492
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_accuracy_threshold_summary(
    anthropic_provider_local,
    anthropic_provider_api,
    simple_messages,
    simple_system,
    complex_messages,
    real_system_prompt,
    real_tools,
):
    """Run every comparison scenario, print a summary table, and require all to pass.

    Each scenario is counted through both the local and the API provider.
    The image scenario is allowed a 200% difference; every other scenario
    uses the allowance returned by ``get_accuracy_threshold``.
    """
    model = "claude-sonnet-4-5-20250929"

    # Message carrying a very small base64 PNG for the image scenario.
    tiny_image_base64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
    )
    messages_with_image = MessageHistory(
        [
            Message(
                "user",
                [
                    TextBlock("What do you see?"),
                    ImageBlock(image_type="base64", media_type="image/png", data=tiny_image_base64),
                ],
            )
        ]
    )

    # (scenario name, messages, system message, tools)
    test_scenarios = [
        ("Simple", simple_messages, simple_system, []),
        ("Real System", simple_messages, real_system_prompt, []),
        ("With Tools", simple_messages, simple_system, real_tools),
        ("Complex Messages", complex_messages, simple_system, []),
        ("With Images", messages_with_image, simple_system, []),
        ("Full Context", complex_messages, real_system_prompt, real_tools),
    ]

    results = []
    for name, messages, system, tools in test_scenarios:
        local_result = await anthropic_provider_local.count_tokens(
            messages=messages,
            system=system,
            model=model,
            tools=tools,
        )
        api_result = await anthropic_provider_api.count_tokens(
            messages=messages,
            system=system,
            model=model,
            tools=tools,
        )
        diff_pct = calculate_percentage_difference(local_result.input_tokens, api_result.input_tokens)
        results.append((name, local_result.input_tokens, api_result.input_tokens, diff_pct))

    # Print summary
    print("\n" + "=" * 80)
    print("Token Counting Accuracy Summary")
    print("=" * 80)
    print(f'{"Scenario":<20} {"Local":<10} {"API":<10} {"Diff %":<10} {"Status":<10}')
    print("-" * 80)

    all_within_threshold = True
    for name, local_count, api_count, diff_pct in results:
        # Images get special handling - they're estimated without decoding
        threshold = 200.0 if "Images" in name else get_accuracy_threshold(api_count)
        # Evaluate the comparison once and reuse it for both the table row
        # and the overall verdict.
        passed = diff_pct <= threshold
        if not passed:
            all_within_threshold = False
        status = "✓ PASS" if passed else "✗ FAIL"
        print(f"{name:<20} {local_count:<10} {api_count:<10} {diff_pct:<10.2f} {status:<10}")

    print("=" * 80)
    print("Note: Realistic agent contexts (>200 tokens) must be within 5%.")
    print(" Small samples (<200 tokens) allowed up to 15% due to fixed overhead.")
    print(" Images allowed up to 200% variance (estimated without decoding).")
    print("=" * 80)

    # Assert all scenarios pass their respective thresholds
    assert all_within_threshold, 'One or more scenarios exceeded their accuracy threshold'
580
+
581
+
582
def test_environment_variable_default(api_key):
    """With the flag absent from the environment, local token counting is off."""
    flag = 'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING'
    # patch.dict restores os.environ on exit, so the removal is temporary.
    with patch.dict(os.environ, {}, clear=False):
        os.environ.pop(flag, None)
        provider = AnthropicProvider(api_key=api_key)

    assert provider.use_local_token_counting is False, 'Should default to False when env var not set'
591
+
592
+
593
def test_environment_variable_true(api_key):
    """Setting the flag to 'true' turns local token counting on."""
    env_override = {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': 'true'}
    with patch.dict(os.environ, env_override):
        provider = AnthropicProvider(api_key=api_key)

    enabled = provider.use_local_token_counting
    assert enabled is True, 'Should be True when env var is "true"'
599
+
600
+
601
def test_environment_variable_false(api_key):
    """Setting the flag to 'false' leaves local token counting off."""
    env_override = {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': 'false'}
    with patch.dict(os.environ, env_override):
        provider = AnthropicProvider(api_key=api_key)

    enabled = provider.use_local_token_counting
    assert enabled is False, 'Should be False when env var is "false"'
607
+
608
+
609
def test_environment_variable_case_insensitive(api_key):
    """Mixed-case spellings of true/false are interpreted regardless of case."""
    truthy_spellings = ('TRUE', 'True', 'TrUe')
    falsy_spellings = ('FALSE', 'False', 'FaLsE')
    # Same order as a flat list: all truthy spellings first, then the falsy ones.
    test_cases = [(text, True) for text in truthy_spellings]
    test_cases += [(text, False) for text in falsy_spellings]

    for env_value, expected_result in test_cases:
        env_override = {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': env_value}
        with patch.dict(os.environ, env_override):
            provider = AnthropicProvider(api_key=api_key)
        assert provider.use_local_token_counting is expected_result, f'Failed for env_value={env_value}'
624
+
625
+
626
def test_environment_variable_invalid_value(api_key):
    """Values other than a spelling of 'true' leave local token counting off."""
    flag = 'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING'

    for invalid_value in ('yes', 'no', '1', '0', 'enabled', 'disabled', 'garbage'):
        with patch.dict(os.environ, {flag: invalid_value}):
            provider = AnthropicProvider(api_key=api_key)
        enabled = provider.use_local_token_counting
        assert enabled is False, f'Should default to False for invalid value: {invalid_value}'
@@ -0,0 +1,74 @@
1
+ import pytest
2
+
3
+ from kolega_code.llm.instrumented_client import InstrumentedLLMClient
4
+
5
+
6
class _UsageRecorder:
    """Test double that remembers the most recent usage payload it was given."""

    def __init__(self):
        # Stays None until record_usage is called.
        self.payload = None

    def record_usage(self, usage_data):
        """Store ``usage_data``, replacing any previously stored payload."""
        self.payload = usage_data
12
+
13
+
14
@pytest.mark.asyncio
async def test_usage_recorder_maps_openai_cached_tokens():
    """OpenAI-style usage keys are recorded as input/output/cache-read token counts."""
    recorder = _UsageRecorder()
    client_kwargs = dict(
        provider='openai',
        api_key='sk',
        langfuse_client=None,
        user_id='u1',
        workspace_id='w1',
        thread_id='t1',
        usage_recorder=recorder,
    )
    client = InstrumentedLLMClient(**client_kwargs)

    # Usage reported with prompt/completion naming plus a cache-read count.
    usage = {
        'provider': 'openai',
        'prompt_tokens': 10,
        'completion_tokens': 2,
        'cache_read_input_tokens': 2048,
    }
    await client._record_usage(usage, model='m1', success=True)

    recorded = recorder.payload
    assert recorded['input_tokens'] == 10
    assert recorded['output_tokens'] == 2
    assert recorded['cache_read_input_tokens'] == 2048
38
+
39
+
40
@pytest.mark.asyncio
async def test_usage_recorder_maps_moonshot_response_usage():
    """Moonshot usage is recorded from the input/output fields, not the prompt/completion aliases."""
    recorder = _UsageRecorder()
    client_kwargs = dict(
        provider='moonshot',
        api_key='sk',
        langfuse_client=None,
        user_id='u1',
        workspace_id='w1',
        thread_id='t1',
        usage_recorder=recorder,
    )
    client = InstrumentedLLMClient(**client_kwargs)

    usage = {
        'provider': 'moonshot',
        'input_tokens': 123,
        'output_tokens': 45,
        'cache_read_input_tokens': 67,
        'cache_write_input_tokens': 89,
        # Alias fields with deliberately different values: the assertions
        # below require the input/output fields above to win.
        'prompt_tokens': 999,
        'completion_tokens': 888,
        'total_tokens': 1887,
    }
    await client._record_usage(usage, model='kimi-k2.6', success=True)

    recorded = recorder.payload
    assert recorded['provider'] == 'moonshot'
    assert recorded['model'] == 'kimi-k2.6'
    assert recorded['input_tokens'] == 123
    assert recorded['output_tokens'] == 45
    assert recorded['cache_read_input_tokens'] == 67
    assert recorded['cache_write_input_tokens'] == 89
    # The untouched provider payload is kept under metadata.
    assert recorded['metadata']['raw_usage'] == usage