kolega-code 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. kolega_code/__init__.py +151 -0
  2. kolega_code/agent/__init__.py +42 -0
  3. kolega_code/agent/baseagent.py +998 -0
  4. kolega_code/agent/browseragent.py +123 -0
  5. kolega_code/agent/coder.py +157 -0
  6. kolega_code/agent/common.py +41 -0
  7. kolega_code/agent/compression.py +81 -0
  8. kolega_code/agent/context.py +112 -0
  9. kolega_code/agent/conversation.py +408 -0
  10. kolega_code/agent/generalagent.py +146 -0
  11. kolega_code/agent/investigationagent.py +123 -0
  12. kolega_code/agent/planningagent.py +187 -0
  13. kolega_code/agent/prompt_provider.py +196 -0
  14. kolega_code/agent/prompt_templates/agents/browser.j2 +102 -0
  15. kolega_code/agent/prompt_templates/agents/coder_cli_mode.j2 +127 -0
  16. kolega_code/agent/prompt_templates/agents/general.j2 +68 -0
  17. kolega_code/agent/prompt_templates/agents/investigation.j2 +72 -0
  18. kolega_code/agent/prompt_templates/common/frontend_guidance.md +36 -0
  19. kolega_code/agent/prompt_templates/common/kolega_md_instructions.md +14 -0
  20. kolega_code/agent/prompt_templates/environment_variables/workspace_env_vars.md +11 -0
  21. kolega_code/agent/prompt_templates/template_guidance/expo-template.md +379 -0
  22. kolega_code/agent/prompt_templates/template_guidance/html-website-template.md +3 -0
  23. kolega_code/agent/prompt_templates/template_guidance/mern-stack-template.md +3 -0
  24. kolega_code/agent/prompt_templates/template_guidance/react-vite-shadcdn-template.md +182 -0
  25. kolega_code/agent/prompts.py +192 -0
  26. kolega_code/agent/tests/__init__.py +0 -0
  27. kolega_code/agent/tests/llm/__init__.py +0 -0
  28. kolega_code/agent/tests/llm/test_anthropic_token_counting.py +633 -0
  29. kolega_code/agent/tests/llm/test_billing_openai_cache.py +74 -0
  30. kolega_code/agent/tests/llm/test_client.py +773 -0
  31. kolega_code/agent/tests/llm/test_dashscope_mapping.py +32 -0
  32. kolega_code/agent/tests/llm/test_error_boundary.py +322 -0
  33. kolega_code/agent/tests/llm/test_exceptions.py +249 -0
  34. kolega_code/agent/tests/llm/test_instrumented_client.py +536 -0
  35. kolega_code/agent/tests/llm/test_instrumented_client_integration.py +547 -0
  36. kolega_code/agent/tests/llm/test_langfuse_normalization.py +39 -0
  37. kolega_code/agent/tests/llm/test_model_specs.py +17 -0
  38. kolega_code/agent/tests/llm/test_openai_cached_tokens.py +58 -0
  39. kolega_code/agent/tests/llm/test_openai_cached_tokens_stream.py +74 -0
  40. kolega_code/agent/tests/llm/test_openai_message_conversion.py +30 -0
  41. kolega_code/agent/tests/llm/test_openai_token_counting.py +687 -0
  42. kolega_code/agent/tests/llm/test_tool_execution_ids.py +193 -0
  43. kolega_code/agent/tests/services/__init__.py +1 -0
  44. kolega_code/agent/tests/services/test_browser.py +447 -0
  45. kolega_code/agent/tests/services/test_browser_parity.py +353 -0
  46. kolega_code/agent/tests/services/test_file_system.py +699 -0
  47. kolega_code/agent/tests/services/test_sandbox_terminal_input.py +98 -0
  48. kolega_code/agent/tests/services/test_terminal.py +154 -0
  49. kolega_code/agent/tests/services/test_terminal_command_tracking.py +385 -0
  50. kolega_code/agent/tests/services/test_terminal_state_serializer.py +262 -0
  51. kolega_code/agent/tests/test_agent_tools_inventory.py +267 -0
  52. kolega_code/agent/tests/test_base_agent.py +1942 -0
  53. kolega_code/agent/tests/test_coder_attachments.py +330 -0
  54. kolega_code/agent/tests/test_coder_prompt_extensions.py +61 -0
  55. kolega_code/agent/tests/test_commands.py +179 -0
  56. kolega_code/agent/tests/test_duplicate_tool_results.py +556 -0
  57. kolega_code/agent/tests/test_empty_message_handling.py +48 -0
  58. kolega_code/agent/tests/test_general_agent.py +242 -0
  59. kolega_code/agent/tests/test_html.py +320 -0
  60. kolega_code/agent/tests/test_parallel_tool_calls.py +291 -0
  61. kolega_code/agent/tests/test_planning_agent.py +227 -0
  62. kolega_code/agent/tests/test_prompt_provider.py +271 -0
  63. kolega_code/agent/tests/test_tool_registry.py +102 -0
  64. kolega_code/agent/tests/test_tools.py +549 -0
  65. kolega_code/agent/tests/tool_backend/__init__.py +0 -0
  66. kolega_code/agent/tests/tool_backend/test_agent_tool.py +356 -0
  67. kolega_code/agent/tests/tool_backend/test_base_tool.py +147 -0
  68. kolega_code/agent/tests/tool_backend/test_browser_tool.py +335 -0
  69. kolega_code/agent/tests/tool_backend/test_build_tool.py +93 -0
  70. kolega_code/agent/tests/tool_backend/test_create_file_tool.py +115 -0
  71. kolega_code/agent/tests/tool_backend/test_glob_tool.py +196 -0
  72. kolega_code/agent/tests/tool_backend/test_glob_tool_sandbox_parity.py +230 -0
  73. kolega_code/agent/tests/tool_backend/test_list_directory_tool.py +292 -0
  74. kolega_code/agent/tests/tool_backend/test_read_file_tool.py +173 -0
  75. kolega_code/agent/tests/tool_backend/test_replace_entire_file_tool.py +115 -0
  76. kolega_code/agent/tests/tool_backend/test_replace_lines_tool.py +141 -0
  77. kolega_code/agent/tests/tool_backend/test_search_and_replace_tool.py +174 -0
  78. kolega_code/agent/tests/tool_backend/test_search_codebase_tool.py +228 -0
  79. kolega_code/agent/tests/tool_backend/test_terminal_tool.py +482 -0
  80. kolega_code/agent/tests/tool_backend/test_think_hard_integration.py +189 -0
  81. kolega_code/agent/tests/tool_backend/test_think_hard_streaming.py +445 -0
  82. kolega_code/agent/tests/tool_backend/test_web_fetch_tool.py +194 -0
  83. kolega_code/agent/tool_backend/agent_tool.py +414 -0
  84. kolega_code/agent/tool_backend/apply_edit_tool.py +98 -0
  85. kolega_code/agent/tool_backend/apply_patch_tool.py +514 -0
  86. kolega_code/agent/tool_backend/base_tool.py +217 -0
  87. kolega_code/agent/tool_backend/browser_tool.py +271 -0
  88. kolega_code/agent/tool_backend/build_tool.py +93 -0
  89. kolega_code/agent/tool_backend/create_file_tool.py +52 -0
  90. kolega_code/agent/tool_backend/glob_tool.py +323 -0
  91. kolega_code/agent/tool_backend/list_directory_tool.py +300 -0
  92. kolega_code/agent/tool_backend/memory_tool.py +79 -0
  93. kolega_code/agent/tool_backend/read_file_tool.py +119 -0
  94. kolega_code/agent/tool_backend/replace_entire_file_tool.py +40 -0
  95. kolega_code/agent/tool_backend/replace_lines_tool.py +97 -0
  96. kolega_code/agent/tool_backend/search_and_replace_tool.py +146 -0
  97. kolega_code/agent/tool_backend/search_codebase_tool.py +377 -0
  98. kolega_code/agent/tool_backend/streaming_tool.py +47 -0
  99. kolega_code/agent/tool_backend/terminal_tool.py +643 -0
  100. kolega_code/agent/tool_backend/think_hard_tool.py +211 -0
  101. kolega_code/agent/tool_backend/web_fetch_tool.py +205 -0
  102. kolega_code/agent/tools.py +1704 -0
  103. kolega_code/agent/utils/commands.py +94 -0
  104. kolega_code/cli/__init__.py +1 -0
  105. kolega_code/cli/app.py +2756 -0
  106. kolega_code/cli/config.py +280 -0
  107. kolega_code/cli/connection.py +49 -0
  108. kolega_code/cli/file_index.py +147 -0
  109. kolega_code/cli/main.py +564 -0
  110. kolega_code/cli/mentions.py +155 -0
  111. kolega_code/cli/messages.py +89 -0
  112. kolega_code/cli/provider_registry.py +96 -0
  113. kolega_code/cli/session_store.py +207 -0
  114. kolega_code/cli/settings.py +87 -0
  115. kolega_code/cli/skills.py +409 -0
  116. kolega_code/cli/slash_commands.py +108 -0
  117. kolega_code/cli/tests/__init__.py +1 -0
  118. kolega_code/cli/tests/test_app.py +4251 -0
  119. kolega_code/cli/tests/test_cli_config.py +171 -0
  120. kolega_code/cli/tests/test_connection.py +26 -0
  121. kolega_code/cli/tests/test_file_index.py +103 -0
  122. kolega_code/cli/tests/test_main.py +455 -0
  123. kolega_code/cli/tests/test_mentions.py +108 -0
  124. kolega_code/cli/tests/test_session_store.py +67 -0
  125. kolega_code/cli/tests/test_settings.py +62 -0
  126. kolega_code/cli/tests/test_skills.py +157 -0
  127. kolega_code/cli/tests/test_slash_commands.py +88 -0
  128. kolega_code/cli/theme.py +180 -0
  129. kolega_code/config.py +154 -0
  130. kolega_code/events.py +202 -0
  131. kolega_code/llm/client.py +300 -0
  132. kolega_code/llm/exceptions.py +285 -0
  133. kolega_code/llm/instrumented_client.py +520 -0
  134. kolega_code/llm/models.py +1368 -0
  135. kolega_code/llm/providers/__init__.py +0 -0
  136. kolega_code/llm/providers/anthropic.py +387 -0
  137. kolega_code/llm/providers/base.py +71 -0
  138. kolega_code/llm/providers/google.py +157 -0
  139. kolega_code/llm/providers/models.py +37 -0
  140. kolega_code/llm/providers/openai.py +363 -0
  141. kolega_code/llm/ratelimit.py +40 -0
  142. kolega_code/llm/specs.py +67 -0
  143. kolega_code/llm/tool_execution_ids.py +18 -0
  144. kolega_code/models/__init__.py +9 -0
  145. kolega_code/models/sandbox_terminal_state.py +47 -0
  146. kolega_code/runtime.py +50 -0
  147. kolega_code/sandbox/README.md +200 -0
  148. kolega_code/sandbox/__init__.py +21 -0
  149. kolega_code/sandbox/async_filesystem.py +475 -0
  150. kolega_code/sandbox/base.py +297 -0
  151. kolega_code/sandbox/browser.py +25 -0
  152. kolega_code/sandbox/event_loop.py +43 -0
  153. kolega_code/sandbox/filesystem.py +341 -0
  154. kolega_code/sandbox/local.py +118 -0
  155. kolega_code/sandbox/serializer.py +175 -0
  156. kolega_code/sandbox/terminal.py +868 -0
  157. kolega_code/sandbox/utils.py +216 -0
  158. kolega_code/services/base.py +255 -0
  159. kolega_code/services/browser.py +444 -0
  160. kolega_code/services/file_system.py +749 -0
  161. kolega_code/services/html.py +221 -0
  162. kolega_code/services/terminal.py +903 -0
  163. kolega_code/tools/__init__.py +22 -0
  164. kolega_code/tools/core.py +33 -0
  165. kolega_code/tools/definitions.py +81 -0
  166. kolega_code/tools/registry.py +73 -0
  167. kolega_code-0.1.0.dist-info/METADATA +157 -0
  168. kolega_code-0.1.0.dist-info/RECORD +171 -0
  169. kolega_code-0.1.0.dist-info/WHEEL +4 -0
  170. kolega_code-0.1.0.dist-info/entry_points.txt +2 -0
  171. kolega_code-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,547 @@
1
+ """
2
+ Integration tests for the InstrumentedLLMClient class using real API keys.
3
+
4
+ These tests require valid API keys to be set in the environment and will be skipped
5
+ if the keys are not available. They test the actual integration with LLM providers
6
+ and Langfuse tracing.
7
+ """
8
+
9
+ import asyncio
10
+ import os
11
+ from unittest.mock import Mock, patch
12
+
13
+ import pytest
14
+ from dotenv import load_dotenv
15
+ from langfuse import Langfuse
16
+ from opentelemetry.sdk.trace import TracerProvider as _OtelTracerProvider
17
+
18
+ from kolega_code.llm.instrumented_client import InstrumentedLLMClient
19
+ from kolega_code.llm.models import Message, MessageHistory, TextBlock, ToolCall
20
+ from kolega_code.llm.providers.models import GenerationParams
21
+
22
# Load environment variables from the backend-level .env file, if there is one.
# Five dirname() hops from this file: tests/llm -> tests -> agent -> kolega_code -> backend
_backend_dir = __file__
for _hop in range(5):
    _backend_dir = os.path.dirname(_backend_dir)
dotenv_path = os.path.join(_backend_dir, ".env")

if not os.path.exists(dotenv_path):
    print(f"Warning: .env file not found at {dotenv_path}")
    print("Integration tests requiring API keys may be skipped.")
else:
    print(f"Loading environment variables from: {dotenv_path}")
    load_dotenv(dotenv_path)
    # Report only whether each key is present; the values themselves are never printed.
    for _env_name in (
        "ANTHROPIC_API_KEY",
        "OPENAI_API_KEY",
        "GOOGLE_API_KEY",
        "LANGFUSE_PUBLIC_KEY",
        "LANGFUSE_SECRET_KEY",
    ):
        print(f"{_env_name} present: {bool(os.getenv(_env_name))}")
38
+
39
# Test data shared by the tests in this module: a single user question and a
# short system prompt.
TEST_MESSAGES = MessageHistory(
    [
        Message(
            role="user",
            content=[TextBlock(text="What is 2+2? Reply with just the number.")],
        )
    ]
)
TEST_SYSTEM = Message(
    role="system",
    content=[TextBlock(text="You are a helpful math assistant. Be concise.")],
)

# True when either the CI or the GITLAB_CI environment variable is set and non-empty
SKIP_IN_CI = any(bool(os.getenv(name)) for name in ("CI", "GITLAB_CI"))
47
+
48
@pytest.fixture
def real_langfuse_client():
    """Create a real Langfuse client if credentials are available.

    Returns:
        A ``Langfuse`` instance built from ``LANGFUSE_PUBLIC_KEY`` and
        ``LANGFUSE_SECRET_KEY``, or ``None`` when either key is missing or the
        client cannot be constructed. Tests treat ``None`` as "skip".
    """
    public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
    secret_key = os.getenv("LANGFUSE_SECRET_KEY")
    if not (public_key and secret_key):
        return None

    # LANGFUSE_HOST is optional. Previously the fixture refused to run without
    # it, which made the cloud fallback below unreachable.
    host = os.getenv("LANGFUSE_HOST") or "https://cloud.langfuse.com"

    try:
        return Langfuse(
            public_key=public_key,
            secret_key=secret_key,
            host=host,
            tracer_provider=_OtelTracerProvider(),  # isolates Langfuse from Sentry's global OTEL provider
        )
    except Exception as e:
        # Deliberately best-effort: a broken Langfuse setup should skip the
        # test that needs it, not error the whole run.
        print(f"Failed to create Langfuse client: {e}")
        return None
64
+
65
+
66
@pytest.fixture
def mock_langfuse_client():
    """Create a mock Langfuse client for testing (v3 API).

    The returned mock is wired so that ``start_span()`` yields a trace mock and
    ``trace.start_generation()`` yields a generation mock, letting tests inspect
    the calls made on each level.
    """
    # Innermost object: the generation whose update()/end() calls tests inspect
    generation = Mock(update=Mock(), end=Mock())

    # The trace/span hands out that generation from start_generation()
    trace = Mock(
        update_trace=Mock(),
        update=Mock(),
        end=Mock(),
        start_generation=Mock(return_value=generation),
    )

    # Top-level client: start_span() returns the trace
    return Mock(start_span=Mock(return_value=trace))
87
+
88
+
89
@pytest.mark.slow
@pytest.mark.integration
@pytest.mark.skipif(SKIP_IN_CI, reason="Skipping in CI environment")
class TestInstrumentedClientWithRealAPIs:
    """Test InstrumentedLLMClient with real API calls.

    Each test calls a real provider (and is skipped when that provider's API
    key is not set) while Langfuse is replaced by the ``mock_langfuse_client``
    fixture so the instrumentation calls can be inspected.
    """

    @staticmethod
    def _require_api_key(env_var):
        """Return the value of ``env_var`` or skip the current test if it is unset/empty."""
        api_key = os.getenv(env_var)
        if not api_key:
            pytest.skip(f"{env_var} not set")
        return api_key

    @staticmethod
    def _make_client(provider, api_key, langfuse_client, environment="test"):
        """Build an instrumented client with the workspace/thread/agent identifiers shared by these tests."""
        return InstrumentedLLMClient(
            provider=provider,
            api_key=api_key,
            langfuse_client=langfuse_client,
            workspace_id="test-workspace",
            thread_id="test-thread",
            agent_type="test-agent",
            environment=environment,
        )

    @pytest.mark.asyncio
    async def test_anthropic_generation_with_instrumentation(self, mock_langfuse_client):
        """Test Anthropic generation with instrumentation using real API."""
        api_key = self._require_api_key("ANTHROPIC_API_KEY")
        client = self._make_client("anthropic", api_key, mock_langfuse_client)

        # Make real API call
        response = await client.generate(
            messages=TEST_MESSAGES,
            system=TEST_SYSTEM,
            model="claude-haiku-4-5-20251001",
            max_completion_tokens=10,
            temperature=0,
        )

        # Verify response
        assert response is not None
        assert response.role == "assistant"
        assert response.get_text_content()
        assert "4" in response.get_text_content()

        # Verify Langfuse was called (v3 API)
        mock_langfuse_client.start_span.assert_called_once()
        trace = mock_langfuse_client.start_span.return_value
        trace.start_generation.assert_called_once()

        # Verify usage data was extracted
        generation = trace.start_generation.return_value
        generation.update.assert_called_once()
        generation.end.assert_called_once()
        update_call = generation.update.call_args
        assert update_call.kwargs["usage_details"] is not None
        assert update_call.kwargs["usage_details"]["input"] > 0
        assert update_call.kwargs["usage_details"]["output"] > 0
        assert update_call.kwargs["level"] == "DEFAULT"

    @pytest.mark.asyncio
    async def test_openai_generation_with_instrumentation(self, mock_langfuse_client):
        """Test OpenAI generation with instrumentation using real API."""
        api_key = self._require_api_key("OPENAI_API_KEY")
        client = self._make_client("openai", api_key, mock_langfuse_client, environment="production")

        # Make real API call
        response = await client.generate(
            messages=TEST_MESSAGES,
            system=TEST_SYSTEM,
            model="gpt-4o-mini",
            max_completion_tokens=10,
            temperature=0,
        )

        # Verify response
        assert response is not None
        assert response.role == "assistant"
        assert response.get_text_content()
        assert "4" in response.get_text_content()

        # Verify Langfuse was called with correct tags
        trace = mock_langfuse_client.start_span.return_value
        trace_call = trace.update_trace.call_args
        assert trace_call.kwargs["tags"] == [
            "production",
            "workspace:test-workspace",
            "thread:test-thread",
            "agent:test-agent",
            "provider:openai",
        ]

        # Verify usage tracking
        generation = trace.start_generation.return_value
        usage = generation.update.call_args.kwargs["usage_details"]
        assert usage["input"] > 0
        assert usage["output"] > 0

    @pytest.mark.asyncio
    async def test_google_generation_with_instrumentation(self, mock_langfuse_client):
        """Test Google generation with instrumentation using real API."""
        api_key = self._require_api_key("GOOGLE_API_KEY")
        client = self._make_client("google", api_key, mock_langfuse_client, environment="development")

        # Make real API call
        response = await client.generate(
            messages=TEST_MESSAGES,
            system=TEST_SYSTEM,
            model="gemini-2.5-pro",
            max_completion_tokens=128,
            temperature=0,
        )

        # Verify response
        assert response is not None
        assert response.role == "assistant"  # Normalized from Google's "model" role
        assert response.get_text_content()

        # Verify Langfuse integration
        assert mock_langfuse_client.start_span.called

    @pytest.mark.asyncio
    async def test_streaming_with_instrumentation(self, mock_langfuse_client):
        """Test streaming with instrumentation using real Anthropic API."""
        api_key = self._require_api_key("ANTHROPIC_API_KEY")
        client = self._make_client("anthropic", api_key, mock_langfuse_client)

        # Stream response
        accumulated_text = ""
        stream = await client.stream(
            messages=TEST_MESSAGES,
            system=TEST_SYSTEM,
            model="claude-haiku-4-5-20251001",
            max_completion_tokens=50,
            temperature=0,
        )

        async with stream:
            async for chunk in stream:
                if chunk.type == "text":
                    accumulated_text += chunk.text

        # Verify response
        assert accumulated_text
        assert "4" in accumulated_text

        # Verify Langfuse was called
        mock_langfuse_client.start_span.assert_called_once()
        trace = mock_langfuse_client.start_span.return_value
        trace.start_generation.assert_called_once()

        # Verify streaming tag
        trace_update_call = trace.update_trace.call_args
        assert "streaming" in trace_update_call.kwargs["tags"]

        gen_call = trace.start_generation.call_args
        assert gen_call.kwargs["model_parameters"]["streaming"] is True

        # Verify generation.end was called
        generation = trace.start_generation.return_value
        generation.update.assert_called_once()
        generation.end.assert_called_once()

    @pytest.mark.asyncio
    async def test_error_handling_with_instrumentation(self, mock_langfuse_client):
        """Test error handling with instrumentation."""
        # An invalid key needs no environment variable, so this test never skips on a missing key
        client = self._make_client("anthropic", "invalid-key", mock_langfuse_client)

        # Attempt API call with invalid key
        with pytest.raises(Exception):
            await client.generate(
                messages=TEST_MESSAGES,
                system=TEST_SYSTEM,
                model="claude-haiku-4-5-20251001",
            )

        # Verify error was logged to Langfuse
        trace = mock_langfuse_client.start_span.return_value
        generation = trace.start_generation.return_value
        generation.update.assert_called_once()
        generation.end.assert_called_once()
        update_call = generation.update.call_args
        assert update_call.kwargs["level"] == "ERROR"

    @pytest.mark.asyncio
    async def test_cache_tokens_tracking(self, mock_langfuse_client):
        """Test that Anthropic cache tokens are properly tracked."""
        api_key = self._require_api_key("ANTHROPIC_API_KEY")
        client = self._make_client("anthropic", api_key, mock_langfuse_client)

        long_messages = MessageHistory(
            [Message(role="user", content=[TextBlock(text="Tell me about Python programming.")])]
        )

        # Both calls target the same model: the second call can only have a
        # chance of a cache read if it repeats the first request exactly.
        # NOTE(review): the original used two different Sonnet versions here,
        # presumably a missed rename - confirm the intended model.
        model = "claude-sonnet-4-5-20250929"
        for _ in range(2):
            await client.generate(
                messages=long_messages,
                system=TEST_SYSTEM,
                model=model,
                max_completion_tokens=100,
            )

        # Check if any call reported cache usage
        trace = mock_langfuse_client.start_span.return_value
        generation = trace.start_generation.return_value
        any_cache_usage = any(
            (call.kwargs.get("usage_details") or {}).get("cache_read_input_tokens", 0) > 0
            for call in generation.update.call_args_list
        )
        # Informational only: cache usage is not guaranteed, so it is reported, not asserted
        print(f"Cache read tokens reported: {any_cache_usage}")

        assert generation.update.call_count >= 2, "Should have made at least 2 calls"
359
+
360
+
361
@pytest.mark.slow
@pytest.mark.integration
class TestRealLangfuseIntegration:
    """Test with real Langfuse service if credentials are available."""

    @pytest.mark.asyncio
    async def test_real_langfuse_integration(self, real_langfuse_client):
        """Send one real generation through a real Langfuse client and flush it.

        Skips when the ``real_langfuse_client`` fixture yields nothing or when
        ``ANTHROPIC_API_KEY`` is not set.
        """
        if not real_langfuse_client:
            pytest.skip("Langfuse credentials not available")

        anthropic_key = os.getenv("ANTHROPIC_API_KEY")
        if not anthropic_key:
            pytest.skip("ANTHROPIC_API_KEY not set")

        client = InstrumentedLLMClient(
            provider="anthropic",
            api_key=anthropic_key,
            langfuse_client=real_langfuse_client,
            workspace_id="integration-test",
            thread_id="test-thread-123",
            agent_type="integration-test-agent",
            environment=os.getenv("ENVIRONMENT", "test"),
        )

        # Real provider call
        response = await client.generate(
            messages=TEST_MESSAGES,
            system=TEST_SYSTEM,
            model="claude-haiku-4-5-20251001",
            max_completion_tokens=10,
            temperature=0,
        )

        assert response is not None
        assert "4" in response.get_text_content()

        # Flush so the trace is sent, then pause briefly before reporting
        real_langfuse_client.flush()
        await asyncio.sleep(1)

        # Tell the person running the test what to look for in the dashboard
        for line in (
            "✅ Real Langfuse trace sent successfully",
            "Check Langfuse dashboard for trace with:",
            " - Workspace: integration-test",
            " - Thread: test-thread-123",
            " - Agent: integration-test-agent",
        ):
            print(line)
410
+
411
+
412
@pytest.mark.slow
@pytest.mark.integration
class TestInstrumentedClientCompatibility:
    """Test that instrumented client maintains compatibility with base client."""

    @pytest.mark.asyncio
    async def test_fallback_without_langfuse(self):
        """Test that client works normally without Langfuse."""
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            pytest.skip("ANTHROPIC_API_KEY not set")

        # Create client without Langfuse
        client = InstrumentedLLMClient(
            provider="anthropic",
            api_key=api_key,
            langfuse_client=None,  # No Langfuse
        )

        # Should work normally
        response = await client.generate(
            messages=TEST_MESSAGES,
            system=TEST_SYSTEM,
            model="claude-haiku-4-5-20251001",
            max_completion_tokens=10,
            temperature=0,
        )

        assert response is not None
        assert "4" in response.get_text_content()

    @pytest.mark.asyncio
    async def test_all_providers_with_instrumentation(self, mock_langfuse_client):
        """Test instrumentation works with all supported providers.

        Providers whose API key is not set are skipped individually. If no
        provider could be exercised at all, the whole test is reported as
        skipped rather than passing without having checked anything.
        """
        providers_to_test = [
            ("anthropic", "ANTHROPIC_API_KEY", "claude-haiku-4-5-20251001"),
            ("openai", "OPENAI_API_KEY", "gpt-4o-mini"),
            ("google", "GOOGLE_API_KEY", "gemini-2.5-pro"),
        ]

        exercised = []
        for provider, env_key, model in providers_to_test:
            api_key = os.getenv(env_key)
            if not api_key:
                print(f"Skipping {provider} - {env_key} not set")
                continue

            client = InstrumentedLLMClient(
                provider=provider,
                api_key=api_key,
                langfuse_client=mock_langfuse_client,
                workspace_id="test-workspace",
                thread_id="test-thread",
                agent_type=f"{provider}-test-agent",
                environment="test",
            )

            try:
                response = await client.generate(
                    messages=TEST_MESSAGES,
                    system=TEST_SYSTEM,
                    model=model,
                    max_completion_tokens=10,
                    temperature=0,
                )

                assert response is not None
                print(f"✅ {provider} instrumentation working")

                # Verify Langfuse was called
                assert mock_langfuse_client.start_span.called

            except Exception as e:
                # Name the failing provider in the failure message
                pytest.fail(f"Failed to test {provider}: {str(e)}")
            finally:
                # Reset mock for next provider
                mock_langfuse_client.reset_mock()

            exercised.append(provider)

        if not exercised:
            pytest.skip("No provider API keys set")

    @pytest.mark.asyncio
    async def test_tool_calling_with_instrumentation(self, mock_langfuse_client):
        """Test that tool calling works with instrumentation."""
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            pytest.skip("ANTHROPIC_API_KEY not set")

        client = InstrumentedLLMClient(
            provider="anthropic",
            api_key=api_key,
            langfuse_client=mock_langfuse_client,
            workspace_id="test-workspace",
            thread_id="test-thread",
            agent_type="test-agent",
            environment="test",
        )

        # Define a simple tool using proper ToolDefinition
        from kolega_code.llm.models import ToolDefinition, ToolParameter

        tools = [
            ToolDefinition(
                name="get_weather",
                description="Get the weather for a location",
                parameters=[
                    ToolParameter(
                        name="location", type="string", description="The location to get weather for", required=True
                    )
                ],
            )
        ]

        messages = MessageHistory(
            [Message(role="user", content=[TextBlock(text="What's the weather in San Francisco?")])]
        )

        response = await client.generate(
            messages=messages,
            system=TEST_SYSTEM,  # Add system message to avoid None error
            model="claude-haiku-4-5-20251001",
            tools=tools,
            max_completion_tokens=200,
        )

        # Should either answer directly or call the tool
        assert response is not None
        content = response.content

        # Tool-call details are only checked when the model actually made a tool call
        tool_calls = [c for c in content if isinstance(c, ToolCall)]
        if tool_calls:
            assert tool_calls[0].name == "get_weather"
            assert "location" in tool_calls[0].input

        # Verify Langfuse tracked it
        trace = mock_langfuse_client.start_span.return_value
        generation = trace.start_generation.return_value
        generation.update.assert_called_once()
        generation.end.assert_called_once()
@@ -0,0 +1,39 @@
1
+ from kolega_code.llm.instrumented_client import MinimalLangfuseStreamWrapper
2
+ from kolega_code.llm.models import Message
3
+
4
+
5
# TODO: Fix after qwen-3-coder-plus PR is merged - needs OpenAI cache token support in Langfuse
def test_langfuse_normalizes_openai_cache_tokens():
    """OpenAI-style usage metadata comes out under the input/output/total keys, cache reads preserved."""
    openai_usage = {
        'provider': 'openai',
        'prompt_tokens': 10,
        'completion_tokens': 2,
        'total_tokens': 12,
        'cache_read_input_tokens': 2048,
    }
    msg = Message(role='assistant', content='ok', usage_metadata=openai_usage)

    # Only _extract_langfuse_usage is exercised, so every collaborator is None
    wrapper = MinimalLangfuseStreamWrapper(stream=None, generation=None, trace=None, instrumented_client=None, model='x')
    usage = wrapper._extract_langfuse_usage(msg)

    expected = {
        'input': 10,
        'output': 2,
        'total': 12,
        'cache_read_input_tokens': 2048,
    }
    for key, value in expected.items():
        assert usage[key] == value
21
+
22
+
23
def test_langfuse_normalizes_deepseek_usage():
    """DeepSeek-style usage metadata is normalized, including a total of 12 that the input does not supply."""
    deepseek_usage = {
        'provider': 'deepseek',
        'input_tokens': 10,
        'output_tokens': 2,
        'cache_read_input_tokens': 3,
        'cache_write_input_tokens': 4,
    }
    msg = Message(role='assistant', content='ok', usage_metadata=deepseek_usage)

    # Only _extract_langfuse_usage is exercised, so every collaborator is None
    wrapper = MinimalLangfuseStreamWrapper(stream=None, generation=None, trace=None, instrumented_client=None, model='x')
    usage = wrapper._extract_langfuse_usage(msg)

    # Note the rename: cache_write_input_tokens is expected back as cache_creation_input_tokens
    expected = {
        'input': 10,
        'output': 2,
        'total': 12,
        'cache_read_input_tokens': 3,
        'cache_creation_input_tokens': 4,
    }
    for key, value in expected.items():
        assert usage[key] == value
39
+
@@ -0,0 +1,17 @@
1
+ from kolega_code.llm.specs import get_model_specs
2
+
3
+
4
def test_kimi_k26_model_specs():
    """Pin the spec values returned for the moonshot / kimi-k2.6 model."""
    specs = get_model_specs("moonshot", "kimi-k2.6")

    expected = {
        "context_length": 262144,
        "max_completion_tokens": 32768,
        "default_temperature": 1.0,
    }
    for field, value in expected.items():
        assert specs[field] == value
10
+
11
+
12
def test_deepseek_v4_pro_model_specs():
    """Pin the spec values returned for the deepseek / deepseek-v4-pro model."""
    specs = get_model_specs("deepseek", "deepseek-v4-pro")

    expected = {
        "context_length": 1000000,
        "max_completion_tokens": 384000,
        "default_temperature": 1.0,
    }
    for field, value in expected.items():
        assert specs[field] == value
@@ -0,0 +1,58 @@
1
+ import os
2
+ import types
3
+ import pytest
4
+
5
+ from kolega_code.llm.providers.openai import OpenAIProvider
6
+ from kolega_code.llm.models import Message, MessageHistory
7
+
8
# Check if running in CI environment: true when CI or GITLAB_CI is set and non-empty
SKIP_IN_CI = any(bool(os.getenv(name)) for name in ("CI", "GITLAB_CI"))
10
+
11
+
12
class _UsageDetails:
    """Fake object stored as ``prompt_tokens_details`` on the fake usage object."""

    def __init__(self) -> None:
        # Fixed value the test expects to see reported as cache_read_input_tokens
        cached = 2048
        self.cached_tokens = cached
15
+
16
+
17
class _Usage:
    """Fake usage object with fixed token counts and nested cached-token details."""

    def __init__(self) -> None:
        # Fixed counts that the test asserts on afterwards
        self.prompt_tokens, self.completion_tokens, self.total_tokens = 3019, 104, 3123
        self.prompt_tokens_details = _UsageDetails()
23
+
24
+
25
class _ChoiceMsg:
    """Fake message object: plain text content, no tool calls, finish reason 'stop'."""

    def __init__(self) -> None:
        self.content, self.tool_calls, self.finish_reason = 'ok', None, 'stop'
30
+
31
+
32
class _Response:
    """Fake response object: one usage object and a single choice wrapping a fake message."""

    def __init__(self) -> None:
        only_choice = types.SimpleNamespace(message=_ChoiceMsg())
        self.usage = _Usage()
        self.choices = [only_choice]
36
+
37
+
38
# TODO: Fix after qwen-3-coder-plus PR is merged - needs OpenAI cache token extraction from prompt_tokens_details
@pytest.mark.asyncio
@pytest.mark.skipif(SKIP_IN_CI, reason="Skipping slow test in CI environment")
async def test_openai_generate_includes_cached_tokens(monkeypatch):
    """The provider's usage metadata carries the token counts and cached-token figure of the fake response."""
    provider = OpenAIProvider(api_key='sk-test', base_url='https://api.openai.com/v1')

    async def canned_create(*_args, **_kwargs):
        # Replaces the async client's create call with a fixed fake response
        return _Response()

    monkeypatch.setattr(provider.async_client.chat.completions, 'create', canned_create)

    conversation = MessageHistory([Message(role='user', content='hi')])
    msg = await provider.generate(messages=conversation)

    expected = {
        'prompt_tokens': 3019,
        'completion_tokens': 104,
        'total_tokens': 3123,
        'cache_read_input_tokens': 2048,
    }
    for key, value in expected.items():
        assert msg.usage_metadata[key] == value
56
+
57
+
58
+