kolega-code 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. kolega_code/__init__.py +151 -0
  2. kolega_code/agent/__init__.py +42 -0
  3. kolega_code/agent/baseagent.py +998 -0
  4. kolega_code/agent/browseragent.py +123 -0
  5. kolega_code/agent/coder.py +157 -0
  6. kolega_code/agent/common.py +41 -0
  7. kolega_code/agent/compression.py +81 -0
  8. kolega_code/agent/context.py +112 -0
  9. kolega_code/agent/conversation.py +408 -0
  10. kolega_code/agent/generalagent.py +146 -0
  11. kolega_code/agent/investigationagent.py +123 -0
  12. kolega_code/agent/planningagent.py +187 -0
  13. kolega_code/agent/prompt_provider.py +196 -0
  14. kolega_code/agent/prompt_templates/agents/browser.j2 +102 -0
  15. kolega_code/agent/prompt_templates/agents/coder_cli_mode.j2 +127 -0
  16. kolega_code/agent/prompt_templates/agents/general.j2 +68 -0
  17. kolega_code/agent/prompt_templates/agents/investigation.j2 +72 -0
  18. kolega_code/agent/prompt_templates/common/frontend_guidance.md +36 -0
  19. kolega_code/agent/prompt_templates/common/kolega_md_instructions.md +14 -0
  20. kolega_code/agent/prompt_templates/environment_variables/workspace_env_vars.md +11 -0
  21. kolega_code/agent/prompt_templates/template_guidance/expo-template.md +379 -0
  22. kolega_code/agent/prompt_templates/template_guidance/html-website-template.md +3 -0
  23. kolega_code/agent/prompt_templates/template_guidance/mern-stack-template.md +3 -0
  24. kolega_code/agent/prompt_templates/template_guidance/react-vite-shadcdn-template.md +182 -0
  25. kolega_code/agent/prompts.py +192 -0
  26. kolega_code/agent/tests/__init__.py +0 -0
  27. kolega_code/agent/tests/llm/__init__.py +0 -0
  28. kolega_code/agent/tests/llm/test_anthropic_token_counting.py +633 -0
  29. kolega_code/agent/tests/llm/test_billing_openai_cache.py +74 -0
  30. kolega_code/agent/tests/llm/test_client.py +773 -0
  31. kolega_code/agent/tests/llm/test_dashscope_mapping.py +32 -0
  32. kolega_code/agent/tests/llm/test_error_boundary.py +322 -0
  33. kolega_code/agent/tests/llm/test_exceptions.py +249 -0
  34. kolega_code/agent/tests/llm/test_instrumented_client.py +536 -0
  35. kolega_code/agent/tests/llm/test_instrumented_client_integration.py +547 -0
  36. kolega_code/agent/tests/llm/test_langfuse_normalization.py +39 -0
  37. kolega_code/agent/tests/llm/test_model_specs.py +17 -0
  38. kolega_code/agent/tests/llm/test_openai_cached_tokens.py +58 -0
  39. kolega_code/agent/tests/llm/test_openai_cached_tokens_stream.py +74 -0
  40. kolega_code/agent/tests/llm/test_openai_message_conversion.py +30 -0
  41. kolega_code/agent/tests/llm/test_openai_token_counting.py +687 -0
  42. kolega_code/agent/tests/llm/test_tool_execution_ids.py +193 -0
  43. kolega_code/agent/tests/services/__init__.py +1 -0
  44. kolega_code/agent/tests/services/test_browser.py +447 -0
  45. kolega_code/agent/tests/services/test_browser_parity.py +353 -0
  46. kolega_code/agent/tests/services/test_file_system.py +699 -0
  47. kolega_code/agent/tests/services/test_sandbox_terminal_input.py +98 -0
  48. kolega_code/agent/tests/services/test_terminal.py +154 -0
  49. kolega_code/agent/tests/services/test_terminal_command_tracking.py +385 -0
  50. kolega_code/agent/tests/services/test_terminal_state_serializer.py +262 -0
  51. kolega_code/agent/tests/test_agent_tools_inventory.py +267 -0
  52. kolega_code/agent/tests/test_base_agent.py +1942 -0
  53. kolega_code/agent/tests/test_coder_attachments.py +330 -0
  54. kolega_code/agent/tests/test_coder_prompt_extensions.py +61 -0
  55. kolega_code/agent/tests/test_commands.py +179 -0
  56. kolega_code/agent/tests/test_duplicate_tool_results.py +556 -0
  57. kolega_code/agent/tests/test_empty_message_handling.py +48 -0
  58. kolega_code/agent/tests/test_general_agent.py +242 -0
  59. kolega_code/agent/tests/test_html.py +320 -0
  60. kolega_code/agent/tests/test_parallel_tool_calls.py +291 -0
  61. kolega_code/agent/tests/test_planning_agent.py +227 -0
  62. kolega_code/agent/tests/test_prompt_provider.py +271 -0
  63. kolega_code/agent/tests/test_tool_registry.py +102 -0
  64. kolega_code/agent/tests/test_tools.py +549 -0
  65. kolega_code/agent/tests/tool_backend/__init__.py +0 -0
  66. kolega_code/agent/tests/tool_backend/test_agent_tool.py +356 -0
  67. kolega_code/agent/tests/tool_backend/test_base_tool.py +147 -0
  68. kolega_code/agent/tests/tool_backend/test_browser_tool.py +335 -0
  69. kolega_code/agent/tests/tool_backend/test_build_tool.py +93 -0
  70. kolega_code/agent/tests/tool_backend/test_create_file_tool.py +115 -0
  71. kolega_code/agent/tests/tool_backend/test_glob_tool.py +196 -0
  72. kolega_code/agent/tests/tool_backend/test_glob_tool_sandbox_parity.py +230 -0
  73. kolega_code/agent/tests/tool_backend/test_list_directory_tool.py +292 -0
  74. kolega_code/agent/tests/tool_backend/test_read_file_tool.py +173 -0
  75. kolega_code/agent/tests/tool_backend/test_replace_entire_file_tool.py +115 -0
  76. kolega_code/agent/tests/tool_backend/test_replace_lines_tool.py +141 -0
  77. kolega_code/agent/tests/tool_backend/test_search_and_replace_tool.py +174 -0
  78. kolega_code/agent/tests/tool_backend/test_search_codebase_tool.py +228 -0
  79. kolega_code/agent/tests/tool_backend/test_terminal_tool.py +482 -0
  80. kolega_code/agent/tests/tool_backend/test_think_hard_integration.py +189 -0
  81. kolega_code/agent/tests/tool_backend/test_think_hard_streaming.py +445 -0
  82. kolega_code/agent/tests/tool_backend/test_web_fetch_tool.py +194 -0
  83. kolega_code/agent/tool_backend/agent_tool.py +414 -0
  84. kolega_code/agent/tool_backend/apply_edit_tool.py +98 -0
  85. kolega_code/agent/tool_backend/apply_patch_tool.py +514 -0
  86. kolega_code/agent/tool_backend/base_tool.py +217 -0
  87. kolega_code/agent/tool_backend/browser_tool.py +271 -0
  88. kolega_code/agent/tool_backend/build_tool.py +93 -0
  89. kolega_code/agent/tool_backend/create_file_tool.py +52 -0
  90. kolega_code/agent/tool_backend/glob_tool.py +323 -0
  91. kolega_code/agent/tool_backend/list_directory_tool.py +300 -0
  92. kolega_code/agent/tool_backend/memory_tool.py +79 -0
  93. kolega_code/agent/tool_backend/read_file_tool.py +119 -0
  94. kolega_code/agent/tool_backend/replace_entire_file_tool.py +40 -0
  95. kolega_code/agent/tool_backend/replace_lines_tool.py +97 -0
  96. kolega_code/agent/tool_backend/search_and_replace_tool.py +146 -0
  97. kolega_code/agent/tool_backend/search_codebase_tool.py +377 -0
  98. kolega_code/agent/tool_backend/streaming_tool.py +47 -0
  99. kolega_code/agent/tool_backend/terminal_tool.py +643 -0
  100. kolega_code/agent/tool_backend/think_hard_tool.py +211 -0
  101. kolega_code/agent/tool_backend/web_fetch_tool.py +205 -0
  102. kolega_code/agent/tools.py +1704 -0
  103. kolega_code/agent/utils/commands.py +94 -0
  104. kolega_code/cli/__init__.py +1 -0
  105. kolega_code/cli/app.py +2756 -0
  106. kolega_code/cli/config.py +280 -0
  107. kolega_code/cli/connection.py +49 -0
  108. kolega_code/cli/file_index.py +147 -0
  109. kolega_code/cli/main.py +564 -0
  110. kolega_code/cli/mentions.py +155 -0
  111. kolega_code/cli/messages.py +89 -0
  112. kolega_code/cli/provider_registry.py +96 -0
  113. kolega_code/cli/session_store.py +207 -0
  114. kolega_code/cli/settings.py +87 -0
  115. kolega_code/cli/skills.py +409 -0
  116. kolega_code/cli/slash_commands.py +108 -0
  117. kolega_code/cli/tests/__init__.py +1 -0
  118. kolega_code/cli/tests/test_app.py +4251 -0
  119. kolega_code/cli/tests/test_cli_config.py +171 -0
  120. kolega_code/cli/tests/test_connection.py +26 -0
  121. kolega_code/cli/tests/test_file_index.py +103 -0
  122. kolega_code/cli/tests/test_main.py +455 -0
  123. kolega_code/cli/tests/test_mentions.py +108 -0
  124. kolega_code/cli/tests/test_session_store.py +67 -0
  125. kolega_code/cli/tests/test_settings.py +62 -0
  126. kolega_code/cli/tests/test_skills.py +157 -0
  127. kolega_code/cli/tests/test_slash_commands.py +88 -0
  128. kolega_code/cli/theme.py +180 -0
  129. kolega_code/config.py +154 -0
  130. kolega_code/events.py +202 -0
  131. kolega_code/llm/client.py +300 -0
  132. kolega_code/llm/exceptions.py +285 -0
  133. kolega_code/llm/instrumented_client.py +520 -0
  134. kolega_code/llm/models.py +1368 -0
  135. kolega_code/llm/providers/__init__.py +0 -0
  136. kolega_code/llm/providers/anthropic.py +387 -0
  137. kolega_code/llm/providers/base.py +71 -0
  138. kolega_code/llm/providers/google.py +157 -0
  139. kolega_code/llm/providers/models.py +37 -0
  140. kolega_code/llm/providers/openai.py +363 -0
  141. kolega_code/llm/ratelimit.py +40 -0
  142. kolega_code/llm/specs.py +67 -0
  143. kolega_code/llm/tool_execution_ids.py +18 -0
  144. kolega_code/models/__init__.py +9 -0
  145. kolega_code/models/sandbox_terminal_state.py +47 -0
  146. kolega_code/runtime.py +50 -0
  147. kolega_code/sandbox/README.md +200 -0
  148. kolega_code/sandbox/__init__.py +21 -0
  149. kolega_code/sandbox/async_filesystem.py +475 -0
  150. kolega_code/sandbox/base.py +297 -0
  151. kolega_code/sandbox/browser.py +25 -0
  152. kolega_code/sandbox/event_loop.py +43 -0
  153. kolega_code/sandbox/filesystem.py +341 -0
  154. kolega_code/sandbox/local.py +118 -0
  155. kolega_code/sandbox/serializer.py +175 -0
  156. kolega_code/sandbox/terminal.py +868 -0
  157. kolega_code/sandbox/utils.py +216 -0
  158. kolega_code/services/base.py +255 -0
  159. kolega_code/services/browser.py +444 -0
  160. kolega_code/services/file_system.py +749 -0
  161. kolega_code/services/html.py +221 -0
  162. kolega_code/services/terminal.py +903 -0
  163. kolega_code/tools/__init__.py +22 -0
  164. kolega_code/tools/core.py +33 -0
  165. kolega_code/tools/definitions.py +81 -0
  166. kolega_code/tools/registry.py +73 -0
  167. kolega_code-0.1.0.dist-info/METADATA +157 -0
  168. kolega_code-0.1.0.dist-info/RECORD +171 -0
  169. kolega_code-0.1.0.dist-info/WHEEL +4 -0
  170. kolega_code-0.1.0.dist-info/entry_points.txt +2 -0
  171. kolega_code-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,687 @@
1
+ """
2
+ Comprehensive tests comparing local vs API token counting for OpenAI provider.
3
+
4
+ These tests verify that local tiktoken-based token counting is within reasonable accuracy
5
+ of OpenAI's official API token counting, using real system prompts and tool definitions.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ from pathlib import Path
11
+ from typing import List
12
+ from unittest.mock import Mock
13
+
14
+ import pytest
15
+ from dotenv import load_dotenv
16
+
17
+ from kolega_code.config import AgentConfig, ModelConfig, ModelProvider, RateLimitConfig
18
+ from kolega_code.events import AgentConnectionManager
19
+ from kolega_code.llm.client import LLMClient
20
+ from kolega_code.llm.models import (
21
+ ImageBlock,
22
+ Message,
23
+ MessageHistory,
24
+ TextBlock,
25
+ ToolCall,
26
+ ToolDefinition,
27
+ ToolParameter,
28
+ ToolResult,
29
+ )
30
+ from kolega_code.llm.providers.openai import OpenAIProvider
31
+ from kolega_code.agent.prompt_provider import AgentMode, AgentType, PromptContext, PromptProvider
32
+ from kolega_code.agent.tools import ToolCollection, ToolCollectionConfig
33
+
34
# Load environment variables from the backend's .env file, when one exists.
# Climb five directory levels from this file: llm -> tests -> agent -> kolega_code -> backend
_env_root = __file__
for _level in range(5):
    _env_root = os.path.dirname(_env_root)
dotenv_path = os.path.join(_env_root, ".env")
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path)
41
+
42
+
43
@pytest.fixture
def api_key():
    """Provide OPENAI_API_KEY from the environment; skip the requesting test when it is unset or empty."""
    key = os.environ.get("OPENAI_API_KEY")
    if key:
        return key
    # pytest.skip raises, so nothing is returned on this path.
    pytest.skip("OPENAI_API_KEY not set")
50
+
51
+
52
@pytest.fixture
def openai_provider(api_key):
    """Build the OpenAIProvider under test, authenticated with the ``api_key`` fixture."""
    provider = OpenAIProvider(api_key=api_key)
    return provider
56
+
57
+
58
@pytest.fixture
def simple_messages():
    """Single-turn history holding one short user greeting."""
    greeting = Message("user", [TextBlock("Hello, how are you?")])
    return MessageHistory([greeting])
62
+
63
+
64
@pytest.fixture
def simple_system():
    """Minimal one-sentence system message."""
    instruction = TextBlock("You are a helpful assistant.")
    return Message("system", [instruction])
68
+
69
+
70
@pytest.fixture
def real_system_prompt():
    """Render the CODER agent's CLI-mode system prompt and wrap it in a system Message."""
    provider = PromptProvider()

    # Fixed placeholder values so the rendered prompt is the same on every run.
    context_fields = dict(
        system_name="Kolega Studio",
        project_path="/test/project",
        is_git_repo=True,
        platform="Linux",
        date_today="2025-01-01",
        model_name="gpt-4o",
        available_ports=[3000, 8000],
        kolega_md="",
        workspace_id="test-workspace",
        workspace_environment_variables=None,
        memories=None,
    )

    rendered = provider.get_system_prompt(
        agent_type=AgentType.CODER,
        mode=AgentMode.CLI,
        template_slug=None,
        context=PromptContext(**context_fields),
    )
    return Message("system", [TextBlock(rendered)])
96
+
97
+
98
@pytest.fixture
def real_tools(tmp_path):
    """Return the tool list of a ToolCollection built for the ``coder_agent_tools`` group.

    The collection is rooted at pytest's ``tmp_path``, uses a mocked
    AgentConnectionManager and dummy API keys, and excludes a fixed set of tools.
    """

    def _gpt4o_config():
        # Each model role gets its own ModelConfig instance with identical settings.
        return ModelConfig(
            provider=ModelProvider.OPENAI,
            model="gpt-4o",
            rate_limits=RateLimitConfig(),
        )

    connection_manager = Mock(spec=AgentConnectionManager)
    agent_config = AgentConfig(
        anthropic_api_key="test",
        openai_api_key="test",
        long_context_config=_gpt4o_config(),
        fast_config=_gpt4o_config(),
        thinking_config=_gpt4o_config(),
    )

    excluded_tools = [
        "read_memory",
        "write_memory",
        "execute_terminal_command",
        "replace_lines",
        "apply_patch",
        "edit_file",
        "get_tool_list",
        "log_error",
        "log_info",
        "run_command",
        "dispatch_coding_agent",
    ]
    collection_config = ToolCollectionConfig(
        custom_tool_groups=["coder_agent_tools"],
        tool_exclusions=excluded_tools,
    )

    collection = ToolCollection(
        project_path=tmp_path,
        workspace_id="test-workspace",
        thread_id="test-thread",
        connection_manager=connection_manager,
        config=agent_config,
        caller=None,
        tool_config=collection_config,
    )
    return collection.get_tool_list()
150
+
151
+
152
@pytest.fixture
def complex_messages():
    """Four-turn user/assistant text exchange about writing a recursive factorial function."""
    # (role, text) pairs in conversation order; each becomes a single-TextBlock Message.
    turns = [
        ("user", "Can you help me write a Python function?"),
        (
            "assistant",
            "Of course! I'd be happy to help you write a Python function. What would you like the function to do?",
        ),
        ("user", "I need a function that calculates the factorial of a number recursively."),
        (
            "assistant",
            "Here's a recursive factorial function:\n\n```python\ndef factorial(n):\n if n == 0 or n == 1:\n return 1\n return n * factorial(n - 1)\n```",
        ),
    ]
    return MessageHistory([Message(role, [TextBlock(text)]) for role, text in turns])
177
+
178
+
179
@pytest.fixture
def messages_with_tool_calls():
    """History with a ``read_file`` tool call, its matching tool result, and the assistant's follow-up."""
    user_request = Message("user", [TextBlock("Can you read the README.md file?")])

    # Assistant turn: a short text block followed by the tool invocation.
    read_call = ToolCall(
        id="call_123",
        name="read_file",
        input={"target_file": "README.md"},
    )
    assistant_call = Message("assistant", [TextBlock("I'll read that file for you."), read_call])

    # The tool output is carried in a user-role message and linked by tool_use_id.
    read_result = ToolResult(
        tool_use_id="call_123",
        name="read_file",
        content="# My Project\n\nThis is a sample README file.",
        is_error=False,
    )
    tool_reply = Message("user", [read_result])

    assistant_summary = Message(
        "assistant",
        [TextBlock('I\'ve read the README.md file. It contains information about "My Project".')],
    )

    return MessageHistory([user_request, assistant_call, tool_reply, assistant_summary])
213
+
214
+
215
def calculate_percentage_difference(local_count: int, api_count: int) -> float:
    """Return the absolute difference between the counts as a percentage of ``api_count``.

    Args:
        local_count: Token count produced locally.
        api_count: Token count reported by the API; used as the baseline.

    Returns:
        ``abs(local_count - api_count) / api_count * 100``. When ``api_count`` is 0
        there is no baseline to divide by: the result is 0.0 if ``local_count`` is
        also 0, otherwise ``inf`` so the mismatch fails any finite threshold
        instead of being reported as a perfect match.
    """
    if api_count == 0:
        return 0.0 if local_count == 0 else float("inf")
    return abs(local_count - api_count) / api_count * 100
220
+
221
+
222
def get_accuracy_threshold(api_count: int, has_tools: bool = False) -> float:
    """Pick the maximum allowed percentage difference for a comparison.

    Args:
        api_count: Prompt token count reported by the API.
        has_tools: Whether tool definitions were part of the request.

    Returns:
        15.0 when ``api_count`` is below 200 (small samples vary more because of
        fixed overhead); otherwise 20.0 when tools are present (tool definitions
        add variance) and 10.0 when they are not.
    """
    small_sample = api_count < 200
    if small_sample:
        return 15.0
    # At 200 tokens and above, only the presence of tool definitions loosens the limit.
    return 20.0 if has_tools else 10.0
234
+
235
+
236
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_simple_message_comparison(
    openai_provider,
    simple_messages,
    simple_system,
):
    """Local and API prompt-token counts should agree for a minimal system + user exchange."""
    model = "gpt-4o"

    # Local count.
    local_usage = await openai_provider.count_tokens(
        messages=simple_messages, system=simple_system, model=model, tools=[]
    )
    local_tokens = local_usage.input_tokens

    # API count from a real request; max_tokens=1 keeps the completion minimal to save costs.
    request_history = MessageHistory([simple_system, *simple_messages])
    completion = await openai_provider.async_client.chat.completions.create(
        model=model,
        messages=request_history.to_openai(),
        max_tokens=1,
    )
    api_tokens = completion.usage.prompt_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens)

    report = (
        "\nSimple message comparison:",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}%",
    )
    for line in report:
        print(line)

    assert diff_pct <= threshold, (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
278
+
279
+
280
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_with_real_system_prompt(
    openai_provider,
    simple_messages,
    real_system_prompt,
):
    """Local and API prompt-token counts should agree when the rendered coder system prompt is used."""
    model = "gpt-4o"

    # Local count.
    local_usage = await openai_provider.count_tokens(
        messages=simple_messages, system=real_system_prompt, model=model, tools=[]
    )
    local_tokens = local_usage.input_tokens

    # API count from a real request with a one-token completion.
    request_history = MessageHistory([real_system_prompt, *simple_messages])
    completion = await openai_provider.async_client.chat.completions.create(
        model=model,
        messages=request_history.to_openai(),
        max_tokens=1,
    )
    api_tokens = completion.usage.prompt_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens)

    report = (
        "\nReal system prompt comparison:",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}%",
    )
    for line in report:
        print(line)

    assert diff_pct <= threshold, (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
322
+
323
+
324
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_with_tools(
    openai_provider,
    simple_messages,
    simple_system,
    real_tools,
):
    """Local and API prompt-token counts should agree when ToolCollection tool definitions are sent."""
    model = "gpt-4o"

    # Local count, including the tool definitions.
    local_usage = await openai_provider.count_tokens(
        messages=simple_messages, system=simple_system, model=model, tools=real_tools
    )
    local_tokens = local_usage.input_tokens

    # API count from a real request carrying the same tools, with a one-token completion.
    request_history = MessageHistory([simple_system, *simple_messages])
    message_payload = request_history.to_openai()
    tool_payload = [tool.to_openai() for tool in real_tools]
    completion = await openai_provider.async_client.chat.completions.create(
        model=model,
        messages=message_payload,
        tools=tool_payload,
        max_tokens=1,
    )
    api_tokens = completion.usage.prompt_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens, has_tools=True)

    report = (
        "\nWith tools comparison:",
        f" Tool count: {len(real_tools)}",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}%",
    )
    for line in report:
        print(line)

    assert diff_pct <= threshold, (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
369
+
370
+
371
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_with_complex_conversation(
    openai_provider,
    complex_messages,
    simple_system,
):
    """Local and API prompt-token counts should agree for a multi-turn conversation."""
    model = "gpt-4o"

    # Local count.
    local_usage = await openai_provider.count_tokens(
        messages=complex_messages, system=simple_system, model=model, tools=[]
    )
    local_tokens = local_usage.input_tokens

    # API count from a real request with a one-token completion.
    request_history = MessageHistory([simple_system, *complex_messages])
    completion = await openai_provider.async_client.chat.completions.create(
        model=model,
        messages=request_history.to_openai(),
        max_tokens=1,
    )
    api_tokens = completion.usage.prompt_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens)

    report = (
        "\nComplex conversation comparison:",
        f" Message count: {len(complex_messages)}",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}%",
    )
    for line in report:
        print(line)

    assert diff_pct <= threshold, (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
414
+
415
+
416
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_with_images(
    openai_provider,
    simple_system,
):
    """Both local and API counts should include image tokens and stay within a loose 100% bound."""
    model = "gpt-4o"

    # Tiny 1x1 transparent PNG, base64-encoded.
    tiny_image_base64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
    )

    # One user message combining a text question with the image.
    question = TextBlock("What do you see in this image?")
    picture = ImageBlock(image_type="base64", media_type="image/png", data=tiny_image_base64)
    messages_with_image = MessageHistory([Message("user", [question, picture])])

    # Local count.
    local_usage = await openai_provider.count_tokens(
        messages=messages_with_image, system=simple_system, model=model, tools=[]
    )
    local_tokens = local_usage.input_tokens

    # API count from a real request with a one-token completion.
    request_history = MessageHistory([simple_system, *messages_with_image])
    completion = await openai_provider.async_client.chat.completions.create(
        model=model,
        messages=request_history.to_openai(),
        max_tokens=1,
    )
    api_tokens = completion.usage.prompt_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)

    report = (
        "\nWith images comparison:",
        f" Image size: {len(tiny_image_base64)} chars (base64)",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
    )
    for line in report:
        print(line)

    # Text-only would be ~14 tokens, so a count above 20 on each side shows the
    # image was counted rather than ignored.
    assert local_tokens > 20, "Local counting should include image tokens"
    assert api_tokens > 20, "API counting should include image tokens"

    # Images get a much wider tolerance:
    # - the local estimate is made without decoding (no actual pixel dimensions)
    # - OpenAI's image token calculation depends on detail level
    # - this tiny 1x1 test image is an edge case
    image_threshold = 100.0
    assert diff_pct <= image_threshold, (
        f"Difference {diff_pct:.2f}% exceeds {image_threshold:.1f}% threshold for images "
        f"(local={local_tokens}, api={api_tokens})"
    )
486
+
487
+
488
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_with_tool_calls(
    openai_provider,
    messages_with_tool_calls,
    simple_system,
):
    """Local and API prompt-token counts should agree for a history containing a tool call and result."""
    model = "gpt-4o"

    # Local count.
    local_usage = await openai_provider.count_tokens(
        messages=messages_with_tool_calls, system=simple_system, model=model, tools=[]
    )
    local_tokens = local_usage.input_tokens

    # API count from a real request with a one-token completion.
    request_history = MessageHistory([simple_system, *messages_with_tool_calls])
    completion = await openai_provider.async_client.chat.completions.create(
        model=model,
        messages=request_history.to_openai(),
        max_tokens=1,
    )
    api_tokens = completion.usage.prompt_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    # Tool calls/results have higher variance in token counting, so this test
    # uses its own, wider limits instead of get_accuracy_threshold.
    if api_tokens < 200:
        threshold = 25.0
    else:
        threshold = 15.0

    report = (
        "\nWith tool calls comparison:",
        f" Message count: {len(messages_with_tool_calls)}",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}%",
    )
    for line in report:
        print(line)

    assert diff_pct <= threshold, (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
532
+
533
+
534
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_full_agent_context(
    openai_provider,
    complex_messages,
    real_system_prompt,
    real_tools,
):
    """Local and API counts should agree with the rendered system prompt, a multi-turn history, and tools."""
    model = "gpt-4o"

    # Local count, including the tool definitions.
    local_usage = await openai_provider.count_tokens(
        messages=complex_messages, system=real_system_prompt, model=model, tools=real_tools
    )
    local_tokens = local_usage.input_tokens

    # API count from a real request carrying the same tools, with a one-token completion.
    request_history = MessageHistory([real_system_prompt, *complex_messages])
    message_payload = request_history.to_openai()
    tool_payload = [tool.to_openai() for tool in real_tools]
    completion = await openai_provider.async_client.chat.completions.create(
        model=model,
        messages=message_payload,
        tools=tool_payload,
        max_tokens=1,
    )
    api_tokens = completion.usage.prompt_tokens

    diff_pct = calculate_percentage_difference(local_tokens, api_tokens)
    threshold = get_accuracy_threshold(api_tokens, has_tools=True)

    report = (
        "\nFull agent context comparison:",
        f" Message count: {len(complex_messages)}",
        f" Tool count: {len(real_tools)}",
        f" Local count: {local_tokens}",
        f" API count: {api_tokens}",
        f" Difference: {diff_pct:.2f}%",
        f" Threshold: {threshold:.1f}%",
    )
    for line in report:
        print(line)

    assert diff_pct <= threshold, (
        f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold "
        f"(local={local_tokens}, api={api_tokens})"
    )
580
+
581
+
582
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.asyncio
async def test_accuracy_threshold_summary(
    openai_provider,
    simple_messages,
    simple_system,
    complex_messages,
    real_system_prompt,
    real_tools,
    messages_with_tool_calls,
):
    """Run every comparison scenario, print a summary table, and require each to meet its threshold."""
    model = "gpt-4o"

    def _threshold_for(scenario: str, api_tokens: int) -> float:
        # The threshold is chosen from the scenario label; "Tool Calls" is tested
        # before "Tools" so the tool-call scenario gets its own limits.
        if "Images" in scenario:
            return 100.0  # images are estimated without decoding
        if "Tool Calls" in scenario:
            return 25.0 if api_tokens < 200 else 15.0
        if "Tools" in scenario or "Full Context" in scenario:
            return get_accuracy_threshold(api_tokens, has_tools=True)
        return get_accuracy_threshold(api_tokens)

    # Message with a tiny base64 PNG for the image scenario.
    tiny_image_base64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
    )
    image_message = Message(
        "user",
        [
            TextBlock("What do you see?"),
            ImageBlock(image_type="base64", media_type="image/png", data=tiny_image_base64),
        ],
    )
    messages_with_image = MessageHistory([image_message])

    # (label, messages, system message, tool definitions)
    test_scenarios = [
        ("Simple", simple_messages, simple_system, []),
        ("Real System", simple_messages, real_system_prompt, []),
        ("With Tools", simple_messages, simple_system, real_tools),
        ("Complex Messages", complex_messages, simple_system, []),
        ("With Tool Calls", messages_with_tool_calls, simple_system, []),
        ("With Images", messages_with_image, simple_system, []),
        ("Full Context", complex_messages, real_system_prompt, real_tools),
    ]

    results = []
    for label, history, system_message, tool_defs in test_scenarios:
        # Local count.
        local_usage = await openai_provider.count_tokens(
            messages=history,
            system=system_message,
            model=model,
            tools=tool_defs,
        )

        # API count: the `tools` argument is only passed when the scenario has tools.
        request_history = MessageHistory([system_message, *history])
        request_kwargs = {"model": model, "messages": request_history.to_openai()}
        if tool_defs:
            request_kwargs["tools"] = [tool.to_openai() for tool in tool_defs]
        request_kwargs["max_tokens"] = 1
        completion = await openai_provider.async_client.chat.completions.create(**request_kwargs)
        api_tokens = completion.usage.prompt_tokens

        diff_pct = calculate_percentage_difference(local_usage.input_tokens, api_tokens)
        results.append((label, local_usage.input_tokens, api_tokens, diff_pct))

    # Print summary
    rule = "=" * 80
    print("\n" + rule)
    print("Token Counting Accuracy Summary (OpenAI)")
    print(rule)
    print(f'{"Scenario":<20} {"Local":<10} {"API":<10} {"Diff %":<10} {"Status":<10}')
    print("-" * 80)

    outcomes = []
    for label, local_tokens, api_tokens, diff_pct in results:
        limit = _threshold_for(label, api_tokens)
        passed = diff_pct <= limit
        outcomes.append(passed)
        status = "✓ PASS" if passed else "✗ FAIL"
        print(f"{label:<20} {local_tokens:<10} {api_tokens:<10} {diff_pct:<10.2f} {status:<10}")

    print(rule)
    print("Note: Realistic agent contexts (>200 tokens) must be within 10%.")
    print(" Contexts with tools allowed up to 20% due to OpenAI's compact format.")
    print(" Contexts with tool calls/results allowed up to 25% for small samples.")
    print(" Small samples (<200 tokens) allowed up to 15% due to fixed overhead.")
    print(" Images allowed up to 100% variance (estimated without decoding).")
    print(rule)

    # Every scenario must pass its own threshold.
    all_within_threshold = all(outcomes)
    assert all_within_threshold, "One or more scenarios exceeded their accuracy threshold"