PyPI - kolega-code - Versions diffs - 0.1.0__py3-none-any.whl - Mend

kolega-code 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171) hide show

kolega_code/__init__.py +151 -0
kolega_code/agent/__init__.py +42 -0
kolega_code/agent/baseagent.py +998 -0
kolega_code/agent/browseragent.py +123 -0
kolega_code/agent/coder.py +157 -0
kolega_code/agent/common.py +41 -0
kolega_code/agent/compression.py +81 -0
kolega_code/agent/context.py +112 -0
kolega_code/agent/conversation.py +408 -0
kolega_code/agent/generalagent.py +146 -0
kolega_code/agent/investigationagent.py +123 -0
kolega_code/agent/planningagent.py +187 -0
kolega_code/agent/prompt_provider.py +196 -0
kolega_code/agent/prompt_templates/agents/browser.j2 +102 -0
kolega_code/agent/prompt_templates/agents/coder_cli_mode.j2 +127 -0
kolega_code/agent/prompt_templates/agents/general.j2 +68 -0
kolega_code/agent/prompt_templates/agents/investigation.j2 +72 -0
kolega_code/agent/prompt_templates/common/frontend_guidance.md +36 -0
kolega_code/agent/prompt_templates/common/kolega_md_instructions.md +14 -0
kolega_code/agent/prompt_templates/environment_variables/workspace_env_vars.md +11 -0
kolega_code/agent/prompt_templates/template_guidance/expo-template.md +379 -0
kolega_code/agent/prompt_templates/template_guidance/html-website-template.md +3 -0
kolega_code/agent/prompt_templates/template_guidance/mern-stack-template.md +3 -0
kolega_code/agent/prompt_templates/template_guidance/react-vite-shadcdn-template.md +182 -0
kolega_code/agent/prompts.py +192 -0
kolega_code/agent/tests/__init__.py +0 -0
kolega_code/agent/tests/llm/__init__.py +0 -0
kolega_code/agent/tests/llm/test_anthropic_token_counting.py +633 -0
kolega_code/agent/tests/llm/test_billing_openai_cache.py +74 -0
kolega_code/agent/tests/llm/test_client.py +773 -0
kolega_code/agent/tests/llm/test_dashscope_mapping.py +32 -0
kolega_code/agent/tests/llm/test_error_boundary.py +322 -0
kolega_code/agent/tests/llm/test_exceptions.py +249 -0
kolega_code/agent/tests/llm/test_instrumented_client.py +536 -0
kolega_code/agent/tests/llm/test_instrumented_client_integration.py +547 -0
kolega_code/agent/tests/llm/test_langfuse_normalization.py +39 -0
kolega_code/agent/tests/llm/test_model_specs.py +17 -0
kolega_code/agent/tests/llm/test_openai_cached_tokens.py +58 -0
kolega_code/agent/tests/llm/test_openai_cached_tokens_stream.py +74 -0
kolega_code/agent/tests/llm/test_openai_message_conversion.py +30 -0
kolega_code/agent/tests/llm/test_openai_token_counting.py +687 -0
kolega_code/agent/tests/llm/test_tool_execution_ids.py +193 -0
kolega_code/agent/tests/services/__init__.py +1 -0
kolega_code/agent/tests/services/test_browser.py +447 -0
kolega_code/agent/tests/services/test_browser_parity.py +353 -0
kolega_code/agent/tests/services/test_file_system.py +699 -0
kolega_code/agent/tests/services/test_sandbox_terminal_input.py +98 -0
kolega_code/agent/tests/services/test_terminal.py +154 -0
kolega_code/agent/tests/services/test_terminal_command_tracking.py +385 -0
kolega_code/agent/tests/services/test_terminal_state_serializer.py +262 -0
kolega_code/agent/tests/test_agent_tools_inventory.py +267 -0
kolega_code/agent/tests/test_base_agent.py +1942 -0
kolega_code/agent/tests/test_coder_attachments.py +330 -0
kolega_code/agent/tests/test_coder_prompt_extensions.py +61 -0
kolega_code/agent/tests/test_commands.py +179 -0
kolega_code/agent/tests/test_duplicate_tool_results.py +556 -0
kolega_code/agent/tests/test_empty_message_handling.py +48 -0
kolega_code/agent/tests/test_general_agent.py +242 -0
kolega_code/agent/tests/test_html.py +320 -0
kolega_code/agent/tests/test_parallel_tool_calls.py +291 -0
kolega_code/agent/tests/test_planning_agent.py +227 -0
kolega_code/agent/tests/test_prompt_provider.py +271 -0
kolega_code/agent/tests/test_tool_registry.py +102 -0
kolega_code/agent/tests/test_tools.py +549 -0
kolega_code/agent/tests/tool_backend/__init__.py +0 -0
kolega_code/agent/tests/tool_backend/test_agent_tool.py +356 -0
kolega_code/agent/tests/tool_backend/test_base_tool.py +147 -0
kolega_code/agent/tests/tool_backend/test_browser_tool.py +335 -0
kolega_code/agent/tests/tool_backend/test_build_tool.py +93 -0
kolega_code/agent/tests/tool_backend/test_create_file_tool.py +115 -0
kolega_code/agent/tests/tool_backend/test_glob_tool.py +196 -0
kolega_code/agent/tests/tool_backend/test_glob_tool_sandbox_parity.py +230 -0
kolega_code/agent/tests/tool_backend/test_list_directory_tool.py +292 -0
kolega_code/agent/tests/tool_backend/test_read_file_tool.py +173 -0
kolega_code/agent/tests/tool_backend/test_replace_entire_file_tool.py +115 -0
kolega_code/agent/tests/tool_backend/test_replace_lines_tool.py +141 -0
kolega_code/agent/tests/tool_backend/test_search_and_replace_tool.py +174 -0
kolega_code/agent/tests/tool_backend/test_search_codebase_tool.py +228 -0
kolega_code/agent/tests/tool_backend/test_terminal_tool.py +482 -0
kolega_code/agent/tests/tool_backend/test_think_hard_integration.py +189 -0
kolega_code/agent/tests/tool_backend/test_think_hard_streaming.py +445 -0
kolega_code/agent/tests/tool_backend/test_web_fetch_tool.py +194 -0
kolega_code/agent/tool_backend/agent_tool.py +414 -0
kolega_code/agent/tool_backend/apply_edit_tool.py +98 -0
kolega_code/agent/tool_backend/apply_patch_tool.py +514 -0
kolega_code/agent/tool_backend/base_tool.py +217 -0
kolega_code/agent/tool_backend/browser_tool.py +271 -0
kolega_code/agent/tool_backend/build_tool.py +93 -0
kolega_code/agent/tool_backend/create_file_tool.py +52 -0
kolega_code/agent/tool_backend/glob_tool.py +323 -0
kolega_code/agent/tool_backend/list_directory_tool.py +300 -0
kolega_code/agent/tool_backend/memory_tool.py +79 -0
kolega_code/agent/tool_backend/read_file_tool.py +119 -0
kolega_code/agent/tool_backend/replace_entire_file_tool.py +40 -0
kolega_code/agent/tool_backend/replace_lines_tool.py +97 -0
kolega_code/agent/tool_backend/search_and_replace_tool.py +146 -0
kolega_code/agent/tool_backend/search_codebase_tool.py +377 -0
kolega_code/agent/tool_backend/streaming_tool.py +47 -0
kolega_code/agent/tool_backend/terminal_tool.py +643 -0
kolega_code/agent/tool_backend/think_hard_tool.py +211 -0
kolega_code/agent/tool_backend/web_fetch_tool.py +205 -0
kolega_code/agent/tools.py +1704 -0
kolega_code/agent/utils/commands.py +94 -0
kolega_code/cli/__init__.py +1 -0
kolega_code/cli/app.py +2756 -0
kolega_code/cli/config.py +280 -0
kolega_code/cli/connection.py +49 -0
kolega_code/cli/file_index.py +147 -0
kolega_code/cli/main.py +564 -0
kolega_code/cli/mentions.py +155 -0
kolega_code/cli/messages.py +89 -0
kolega_code/cli/provider_registry.py +96 -0
kolega_code/cli/session_store.py +207 -0
kolega_code/cli/settings.py +87 -0
kolega_code/cli/skills.py +409 -0
kolega_code/cli/slash_commands.py +108 -0
kolega_code/cli/tests/__init__.py +1 -0
kolega_code/cli/tests/test_app.py +4251 -0
kolega_code/cli/tests/test_cli_config.py +171 -0
kolega_code/cli/tests/test_connection.py +26 -0
kolega_code/cli/tests/test_file_index.py +103 -0
kolega_code/cli/tests/test_main.py +455 -0
kolega_code/cli/tests/test_mentions.py +108 -0
kolega_code/cli/tests/test_session_store.py +67 -0
kolega_code/cli/tests/test_settings.py +62 -0
kolega_code/cli/tests/test_skills.py +157 -0
kolega_code/cli/tests/test_slash_commands.py +88 -0
kolega_code/cli/theme.py +180 -0
kolega_code/config.py +154 -0
kolega_code/events.py +202 -0
kolega_code/llm/client.py +300 -0
kolega_code/llm/exceptions.py +285 -0
kolega_code/llm/instrumented_client.py +520 -0
kolega_code/llm/models.py +1368 -0
kolega_code/llm/providers/__init__.py +0 -0
kolega_code/llm/providers/anthropic.py +387 -0
kolega_code/llm/providers/base.py +71 -0
kolega_code/llm/providers/google.py +157 -0
kolega_code/llm/providers/models.py +37 -0
kolega_code/llm/providers/openai.py +363 -0
kolega_code/llm/ratelimit.py +40 -0
kolega_code/llm/specs.py +67 -0
kolega_code/llm/tool_execution_ids.py +18 -0
kolega_code/models/__init__.py +9 -0
kolega_code/models/sandbox_terminal_state.py +47 -0
kolega_code/runtime.py +50 -0
kolega_code/sandbox/README.md +200 -0
kolega_code/sandbox/__init__.py +21 -0
kolega_code/sandbox/async_filesystem.py +475 -0
kolega_code/sandbox/base.py +297 -0
kolega_code/sandbox/browser.py +25 -0
kolega_code/sandbox/event_loop.py +43 -0
kolega_code/sandbox/filesystem.py +341 -0
kolega_code/sandbox/local.py +118 -0
kolega_code/sandbox/serializer.py +175 -0
kolega_code/sandbox/terminal.py +868 -0
kolega_code/sandbox/utils.py +216 -0
kolega_code/services/base.py +255 -0
kolega_code/services/browser.py +444 -0
kolega_code/services/file_system.py +749 -0
kolega_code/services/html.py +221 -0
kolega_code/services/terminal.py +903 -0
kolega_code/tools/__init__.py +22 -0
kolega_code/tools/core.py +33 -0
kolega_code/tools/definitions.py +81 -0
kolega_code/tools/registry.py +73 -0
kolega_code-0.1.0.dist-info/METADATA +157 -0
kolega_code-0.1.0.dist-info/RECORD +171 -0
kolega_code-0.1.0.dist-info/WHEEL +4 -0
kolega_code-0.1.0.dist-info/entry_points.txt +2 -0
kolega_code-0.1.0.dist-info/licenses/LICENSE +21 -0

kolega_code/agent/tests/llm/test_instrumented_client_integration.py ADDED Viewed

@@ -0,0 +1,547 @@
+"""
+Integration tests for the InstrumentedLLMClient class using real API keys.
+These tests require valid API keys to be set in the environment and will be skipped
+if the keys are not available. They test the actual integration with LLM providers
+and Langfuse tracing.
+"""
+import asyncio
+import os
+from unittest.mock import Mock, patch
+import pytest
+from dotenv import load_dotenv
+from langfuse import Langfuse
+from opentelemetry.sdk.trace import TracerProvider as _OtelTracerProvider
+from kolega_code.llm.instrumented_client import InstrumentedLLMClient
+from kolega_code.llm.models import Message, MessageHistory, TextBlock, ToolCall
+from kolega_code.llm.providers.models import GenerationParams
+# Load environment variables
+# Navigate up to backend directory: tests/llm -> tests -> agent -> kolega_code -> backend
+# (five dirname() hops starting from this file's own path)
+dotenv_path = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))), ".env"
+)
+# NOTE: this runs at module import time, so the prints below are emitted whenever the module is imported.
+if os.path.exists(dotenv_path):
+    print(f"Loading environment variables from: {dotenv_path}")
+    load_dotenv(dotenv_path)
+    # Only report whether each key is set (as a bool); the key values themselves are never printed.
+    print(f"ANTHROPIC_API_KEY present: {bool(os.getenv('ANTHROPIC_API_KEY'))}")
+    print(f"OPENAI_API_KEY present: {bool(os.getenv('OPENAI_API_KEY'))}")
+    print(f"GOOGLE_API_KEY present: {bool(os.getenv('GOOGLE_API_KEY'))}")
+    print(f"LANGFUSE_PUBLIC_KEY present: {bool(os.getenv('LANGFUSE_PUBLIC_KEY'))}")
+    print(f"LANGFUSE_SECRET_KEY present: {bool(os.getenv('LANGFUSE_SECRET_KEY'))}")
+else:
+    print(f"Warning: .env file not found at {dotenv_path}")
+    print("Integration tests requiring API keys may be skipped.")
+# Test data
+# Single-turn user prompt shared by the tests below; several of them assert that "4" appears in the reply.
+TEST_MESSAGES = MessageHistory(
+    [Message(role="user", content=[TextBlock(text="What is 2+2? Reply with just the number.")])]
+)
+# System message passed as ``system=`` alongside TEST_MESSAGES.
+TEST_SYSTEM = Message(role="system", content=[TextBlock(text="You are a helpful math assistant. Be concise.")])
+# Check if running in CI environment
+# True when either CI or GITLAB_CI is set to a non-empty value.
+SKIP_IN_CI = bool(os.getenv("CI")) or bool(os.getenv("GITLAB_CI"))
+@pytest.fixture
+def real_langfuse_client():
+    """Create a real Langfuse client if credentials are available."""
+    if not all([os.getenv("LANGFUSE_PUBLIC_KEY"), os.getenv("LANGFUSE_SECRET_KEY"), os.getenv("LANGFUSE_HOST")]):
+        return None
+    try:
+        return Langfuse(
+            public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
+            secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
+            host=os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com"),
+            tracer_provider=_OtelTracerProvider(),  # isolates Langfuse from Sentry's global OTEL provider
+        )
+    except Exception as e:
+        print(f"Failed to create Langfuse client: {e}")
+        return None
+@pytest.fixture
+def mock_langfuse_client():
+    """Create a mock Langfuse client for testing (v3 API)."""
+    langfuse = Mock()
+    # Create a mock generation that tracks calls
+    generation = Mock()
+    generation.update = Mock()
+    generation.end = Mock()
+    # Create a mock trace/span that returns the generation
+    trace = Mock()
+    trace.update_trace = Mock()
+    trace.update = Mock()
+    trace.end = Mock()
+    trace.start_generation = Mock(return_value=generation)
+    # Make langfuse.start_span() return the trace
+    langfuse.start_span = Mock(return_value=trace)
+    return langfuse
+@pytest.mark.slow
+@pytest.mark.integration
+@pytest.mark.skipif(SKIP_IN_CI, reason="Skipping in CI environment")
+class TestInstrumentedClientWithRealAPIs:
+    """Test InstrumentedLLMClient with real API calls."""
+    @pytest.mark.asyncio
+    async def test_anthropic_generation_with_instrumentation(self, mock_langfuse_client):
+        """Test Anthropic generation with instrumentation using real API."""
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            pytest.skip("ANTHROPIC_API_KEY not set")
+        client = InstrumentedLLMClient(
+            provider="anthropic",
+            api_key=api_key,
+            langfuse_client=mock_langfuse_client,
+            workspace_id="test-workspace",
+            thread_id="test-thread",
+            agent_type="test-agent",
+            environment="test",
+        )
+        # Make real API call
+        response = await client.generate(
+            messages=TEST_MESSAGES,
+            system=TEST_SYSTEM,
+            model="claude-haiku-4-5-20251001",
+            max_completion_tokens=10,
+            temperature=0,
+        )
+        # Verify response
+        assert response is not None
+        assert response.role == "assistant"
+        assert response.get_text_content()
+        assert "4" in response.get_text_content()
+        # Verify Langfuse was called (v3 API)
+        mock_langfuse_client.start_span.assert_called_once()
+        trace = mock_langfuse_client.start_span.return_value
+        trace.start_generation.assert_called_once()
+        # Verify usage data was extracted
+        generation = trace.start_generation.return_value
+        generation.update.assert_called_once()
+        generation.end.assert_called_once()
+        update_call = generation.update.call_args
+        assert update_call.kwargs["usage_details"] is not None
+        assert update_call.kwargs["usage_details"]["input"] > 0
+        assert update_call.kwargs["usage_details"]["output"] > 0
+        assert update_call.kwargs["level"] == "DEFAULT"
+    @pytest.mark.asyncio
+    async def test_openai_generation_with_instrumentation(self, mock_langfuse_client):
+        """Test OpenAI generation with instrumentation using real API."""
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            pytest.skip("OPENAI_API_KEY not set")
+        client = InstrumentedLLMClient(
+            provider="openai",
+            api_key=api_key,
+            langfuse_client=mock_langfuse_client,
+            workspace_id="test-workspace",
+            thread_id="test-thread",
+            agent_type="test-agent",
+            environment="production",
+        )
+        # Make real API call
+        response = await client.generate(
+            messages=TEST_MESSAGES,
+            system=TEST_SYSTEM,
+            model="gpt-4o-mini",
+            max_completion_tokens=10,
+            temperature=0,
+        )
+        # Verify response
+        assert response is not None
+        assert response.role == "assistant"
+        assert response.get_text_content()
+        assert "4" in response.get_text_content()
+        # Verify Langfuse was called with correct tags
+        trace_call = mock_langfuse_client.start_span.return_value.update_trace.call_args
+        assert trace_call.kwargs["tags"] == [
+            "production",
+            "workspace:test-workspace",
+            "thread:test-thread",
+            "agent:test-agent",
+            "provider:openai",
+        ]
+        # Verify usage tracking
+        generation = mock_langfuse_client.start_span.return_value.start_generation.return_value
+        update_call = generation.update.call_args
+        usage = update_call.kwargs["usage_details"]
+        assert usage["input"] > 0
+        assert usage["output"] > 0
+    @pytest.mark.asyncio
+    async def test_google_generation_with_instrumentation(self, mock_langfuse_client):
+        """Test Google generation with instrumentation using real API."""
+        api_key = os.getenv("GOOGLE_API_KEY")
+        if not api_key:
+            pytest.skip("GOOGLE_API_KEY not set")
+        client = InstrumentedLLMClient(
+            provider="google",
+            api_key=api_key,
+            langfuse_client=mock_langfuse_client,
+            workspace_id="test-workspace",
+            thread_id="test-thread",
+            agent_type="test-agent",
+            environment="development",
+        )
+        # Make real API call
+        response = await client.generate(
+            messages=TEST_MESSAGES,
+            system=TEST_SYSTEM,
+            model="gemini-2.5-pro",
+            max_completion_tokens=128,
+            temperature=0,
+        )
+        # Verify response
+        assert response is not None
+        assert response.role == "assistant"  # Normalized from Google's "model" role
+        assert response.get_text_content()
+        # Verify Langfuse integration
+        assert mock_langfuse_client.start_span.called
+    @pytest.mark.asyncio
+    async def test_streaming_with_instrumentation(self, mock_langfuse_client):
+        """Test streaming with instrumentation using real Anthropic API."""
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            pytest.skip("ANTHROPIC_API_KEY not set")
+        client = InstrumentedLLMClient(
+            provider="anthropic",
+            api_key=api_key,
+            langfuse_client=mock_langfuse_client,
+            workspace_id="test-workspace",
+            thread_id="test-thread",
+            agent_type="test-agent",
+            environment="test",
+        )
+        # Stream response
+        accumulated_text = ""
+        stream = await client.stream(
+            messages=TEST_MESSAGES,
+            system=TEST_SYSTEM,
+            model="claude-haiku-4-5-20251001",
+            max_completion_tokens=50,
+            temperature=0,
+        )
+        async with stream:
+            async for chunk in stream:
+                if chunk.type == "text":
+                    accumulated_text += chunk.text
+        # Verify response
+        assert accumulated_text
+        assert "4" in accumulated_text
+        # Verify Langfuse was called
+        mock_langfuse_client.start_span.assert_called_once()
+        trace = mock_langfuse_client.start_span.return_value
+        trace.start_generation.assert_called_once()
+        # Verify streaming tag
+        trace_update_call = trace.update_trace.call_args
+        assert "streaming" in trace_update_call.kwargs["tags"]
+        gen_call = trace.start_generation.call_args
+        assert gen_call.kwargs["model_parameters"]["streaming"] is True
+        # Verify generation.end was called
+        generation = trace.start_generation.return_value
+        generation.update.assert_called_once()
+        generation.end.assert_called_once()
+    @pytest.mark.asyncio
+    async def test_error_handling_with_instrumentation(self, mock_langfuse_client):
+        """Test error handling with instrumentation."""
+        # Use invalid API key
+        client = InstrumentedLLMClient(
+            provider="anthropic",
+            api_key="invalid-key",
+            langfuse_client=mock_langfuse_client,
+            workspace_id="test-workspace",
+            thread_id="test-thread",
+            agent_type="test-agent",
+            environment="test",
+        )
+        # Attempt API call with invalid key
+        with pytest.raises(Exception) as exc_info:
+            await client.generate(
+                messages=TEST_MESSAGES,
+                system=TEST_SYSTEM,
+                model="claude-haiku-4-5-20251001",
+            )
+        # Verify error was logged to Langfuse
+        trace = mock_langfuse_client.start_span.return_value
+        generation = trace.start_generation.return_value
+        generation.update.assert_called_once()
+        generation.end.assert_called_once()
+        update_call = generation.update.call_args
+        assert update_call.kwargs["level"] == "ERROR"
+    @pytest.mark.asyncio
+    async def test_cache_tokens_tracking(self, mock_langfuse_client):
+        """Test that Anthropic cache tokens are properly tracked."""
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            pytest.skip("ANTHROPIC_API_KEY not set")
+        client = InstrumentedLLMClient(
+            provider="anthropic",
+            api_key=api_key,
+            langfuse_client=mock_langfuse_client,
+            workspace_id="test-workspace",
+            thread_id="test-thread",
+            agent_type="test-agent",
+            environment="test",
+        )
+        # Create a longer conversation to potentially trigger cache
+        long_messages = MessageHistory(
+            [Message(role="user", content=[TextBlock(text="Tell me about Python programming.")])]
+        )
+        # First call
+        await client.generate(
+            messages=long_messages,
+            system=TEST_SYSTEM,
+            model="claude-sonnet-4-20250514",
+            max_completion_tokens=100,
+        )
+        # Second call with same system prompt might use cache
+        await client.generate(
+            messages=long_messages,
+            system=TEST_SYSTEM,
+            model="claude-sonnet-4-5-20250929",
+            max_completion_tokens=100,
+        )
+        # Check if any call reported cache usage
+        trace = mock_langfuse_client.start_span.return_value
+        generation = trace.start_generation.return_value
+        any_cache_usage = False
+        for call in generation.update.call_args_list:
+            if "usage_details" in call.kwargs and call.kwargs["usage_details"]:
+                usage = call.kwargs["usage_details"]
+                if usage.get("cache_read_input_tokens", 0) > 0:
+                    any_cache_usage = True
+                    break
+        # Note: Cache usage is not guaranteed, so we'll just verify the calls were made
+        assert generation.update.call_count >= 2, "Should have made at least 2 calls"
+@pytest.mark.slow
+@pytest.mark.integration
+class TestRealLangfuseIntegration:
+    """Test with real Langfuse service if credentials are available."""
+    @pytest.mark.asyncio
+    async def test_real_langfuse_integration(self, real_langfuse_client):
+        """Test actual Langfuse integration if credentials are available."""
+        if not real_langfuse_client:
+            pytest.skip("Langfuse credentials not available")
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            pytest.skip("ANTHROPIC_API_KEY not set")
+        client = InstrumentedLLMClient(
+            provider="anthropic",
+            api_key=api_key,
+            langfuse_client=real_langfuse_client,
+            workspace_id="integration-test",
+            thread_id="test-thread-123",
+            agent_type="integration-test-agent",
+            environment=os.getenv("ENVIRONMENT", "test"),
+        )
+        # Make real API call
+        response = await client.generate(
+            messages=TEST_MESSAGES,
+            system=TEST_SYSTEM,
+            model="claude-haiku-4-5-20251001",
+            max_completion_tokens=10,
+            temperature=0,
+        )
+        # Verify response
+        assert response is not None
+        assert "4" in response.get_text_content()
+        # Flush to ensure trace is sent
+        real_langfuse_client.flush()
+        # Give Langfuse a moment to process
+        await asyncio.sleep(1)
+        print("✅ Real Langfuse trace sent successfully")
+        print(f"Check Langfuse dashboard for trace with:")
+        print(f"  - Workspace: integration-test")
+        print(f"  - Thread: test-thread-123")
+        print(f"  - Agent: integration-test-agent")
+@pytest.mark.slow
+@pytest.mark.integration
+class TestInstrumentedClientCompatibility:
+    """Test that instrumented client maintains compatibility with base client."""
+    @pytest.mark.asyncio
+    async def test_fallback_without_langfuse(self):
+        """Test that client works normally without Langfuse."""
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            pytest.skip("ANTHROPIC_API_KEY not set")
+        # Create client without Langfuse
+        client = InstrumentedLLMClient(
+            provider="anthropic",
+            api_key=api_key,
+            langfuse_client=None,  # No Langfuse
+        )
+        # Should work normally
+        response = await client.generate(
+            messages=TEST_MESSAGES,
+            system=TEST_SYSTEM,
+            model="claude-haiku-4-5-20251001",
+            max_completion_tokens=10,
+            temperature=0,
+        )
+        assert response is not None
+        assert "4" in response.get_text_content()
+    @pytest.mark.asyncio
+    async def test_all_providers_with_instrumentation(self, mock_langfuse_client):
+        """Test instrumentation works with all supported providers."""
+        providers_to_test = [
+            ("anthropic", "ANTHROPIC_API_KEY", "claude-haiku-4-5-20251001"),
+            ("openai", "OPENAI_API_KEY", "gpt-4o-mini"),
+            ("google", "GOOGLE_API_KEY", "gemini-2.5-pro"),
+        ]
+        for provider, env_key, model in providers_to_test:
+            api_key = os.getenv(env_key)
+            if not api_key:
+                print(f"Skipping {provider} - {env_key} not set")
+                continue
+            client = InstrumentedLLMClient(
+                provider=provider,
+                api_key=api_key,
+                langfuse_client=mock_langfuse_client,
+                workspace_id="test-workspace",
+                thread_id="test-thread",
+                agent_type=f"{provider}-test-agent",
+                environment="test",
+            )
+            try:
+                response = await client.generate(
+                    messages=TEST_MESSAGES,
+                    system=TEST_SYSTEM,
+                    model=model,
+                    max_completion_tokens=10,
+                    temperature=0,
+                )
+                assert response is not None
+                print(f"✅ {provider} instrumentation working")
+                # Verify Langfuse was called
+                assert mock_langfuse_client.start_span.called
+            except Exception as e:
+                pytest.fail(f"Failed to test {provider}: {str(e)}")
+            finally:
+                # Reset mock for next provider
+                mock_langfuse_client.reset_mock()
+    @pytest.mark.asyncio
+    async def test_tool_calling_with_instrumentation(self, mock_langfuse_client):
+        """Test that tool calling works with instrumentation."""
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            pytest.skip("ANTHROPIC_API_KEY not set")
+        client = InstrumentedLLMClient(
+            provider="anthropic",
+            api_key=api_key,
+            langfuse_client=mock_langfuse_client,
+            workspace_id="test-workspace",
+            thread_id="test-thread",
+            agent_type="test-agent",
+            environment="test",
+        )
+        # Define a simple tool using proper ToolDefinition
+        from kolega_code.llm.models import ToolDefinition, ToolParameter
+        tools = [
+            ToolDefinition(
+                name="get_weather",
+                description="Get the weather for a location",
+                parameters=[
+                    ToolParameter(
+                        name="location", type="string", description="The location to get weather for", required=True
+                    )
+                ],
+            )
+        ]
+        messages = MessageHistory(
+            [Message(role="user", content=[TextBlock(text="What's the weather in San Francisco?")])]
+        )
+        response = await client.generate(
+            messages=messages,
+            system=TEST_SYSTEM,  # Add system message to avoid None error
+            model="claude-haiku-4-5-20251001",
+            tools=tools,
+            max_completion_tokens=200,
+        )
+        # Should either answer directly or call the tool
+        assert response is not None
+        content = response.content
+        # Check if it made a tool call
+        tool_calls = [c for c in content if isinstance(c, ToolCall)]
+        if tool_calls:
+            assert tool_calls[0].name == "get_weather"
+            assert "location" in tool_calls[0].input
+        # Verify Langfuse tracked it
+        trace = mock_langfuse_client.start_span.return_value
+        generation = trace.start_generation.return_value
+        generation.update.assert_called_once()
+        generation.end.assert_called_once()

kolega_code/agent/tests/llm/test_langfuse_normalization.py ADDED Viewed

@@ -0,0 +1,39 @@
+from kolega_code.llm.instrumented_client import MinimalLangfuseStreamWrapper
+from kolega_code.llm.models import Message
+# TODO: Fix after qwen-3-coder-plus PR is merged - needs OpenAI cache token support in Langfuse
+def test_langfuse_normalizes_openai_cache_tokens():
+    msg = Message(role='assistant', content='ok', usage_metadata={
+        'provider': 'openai',
+        'prompt_tokens': 10,
+        'completion_tokens': 2,
+        'total_tokens': 12,
+        'cache_read_input_tokens': 2048,
+    })
+    wrapper = MinimalLangfuseStreamWrapper(stream=None, generation=None, trace=None, instrumented_client=None, model='x')
+    usage = wrapper._extract_langfuse_usage(msg)
+    assert usage['input'] == 10
+    assert usage['output'] == 2
+    assert usage['total'] == 12
+    assert usage['cache_read_input_tokens'] == 2048
+def test_langfuse_normalizes_deepseek_usage():
+    msg = Message(role='assistant', content='ok', usage_metadata={
+        'provider': 'deepseek',
+        'input_tokens': 10,
+        'output_tokens': 2,
+        'cache_read_input_tokens': 3,
+        'cache_write_input_tokens': 4,
+    })
+    wrapper = MinimalLangfuseStreamWrapper(stream=None, generation=None, trace=None, instrumented_client=None, model='x')
+    usage = wrapper._extract_langfuse_usage(msg)
+    assert usage['input'] == 10
+    assert usage['output'] == 2
+    assert usage['total'] == 12
+    assert usage['cache_read_input_tokens'] == 3
+    assert usage['cache_creation_input_tokens'] == 4

kolega_code/agent/tests/llm/test_model_specs.py ADDED Viewed

@@ -0,0 +1,17 @@
+from kolega_code.llm.specs import get_model_specs
+def test_kimi_k26_model_specs():
+    specs = get_model_specs("moonshot", "kimi-k2.6")
+    assert specs["context_length"] == 262144
+    assert specs["max_completion_tokens"] == 32768
+    assert specs["default_temperature"] == 1.0
+def test_deepseek_v4_pro_model_specs():
+    specs = get_model_specs("deepseek", "deepseek-v4-pro")
+    assert specs["context_length"] == 1000000
+    assert specs["max_completion_tokens"] == 384000
+    assert specs["default_temperature"] == 1.0

kolega_code/agent/tests/llm/test_openai_cached_tokens.py ADDED Viewed

@@ -0,0 +1,58 @@
+import os
+import types
+import pytest
+from kolega_code.llm.providers.openai import OpenAIProvider
+from kolega_code.llm.models import Message, MessageHistory
+# Check if running in CI environment
+SKIP_IN_CI = bool(os.getenv("CI")) or bool(os.getenv("GITLAB_CI"))
+class _UsageDetails:
+    """Stub used as ``_Usage.prompt_tokens_details``; carries a fixed cached-token count."""
+    def __init__(self):
+        # Value the test below expects to surface as 'cache_read_input_tokens'.
+        self.cached_tokens = 2048
+class _Usage:
+    def __init__(self):
+        self.prompt_tokens = 3019
+        self.completion_tokens = 104
+        self.total_tokens = 3123
+        self.prompt_tokens_details = _UsageDetails()
+class _ChoiceMsg:
+    """Stub for the ``message`` attribute of a response choice: plain text, no tool calls."""
+    def __init__(self):
+        self.content = 'ok'
+        self.tool_calls = None
+        self.finish_reason = 'stop'
+class _Response:
+    def __init__(self):
+        self.usage = _Usage()
+        self.choices = [types.SimpleNamespace(message=_ChoiceMsg())]
+# TODO: Fix after qwen-3-coder-plus PR is merged - needs OpenAI cache token extraction from prompt_tokens_details
+@pytest.mark.asyncio
+@pytest.mark.skipif(SKIP_IN_CI, reason="Skipping slow test in CI environment")
+async def test_openai_generate_includes_cached_tokens(monkeypatch):
+    """Check that ``OpenAIProvider.generate`` copies token counts, including cached tokens, into ``usage_metadata``.
+    The chat-completions ``create`` call is replaced with a stub, so the placeholder API key is never sent anywhere.
+    """
+    provider = OpenAIProvider(api_key='sk-test', base_url='https://api.openai.com/v1')
+    # Stand-in for the SDK call: ignores its arguments and returns the canned response.
+    async def fake_create(*args, **kwargs):
+        return _Response()
+    monkeypatch.setattr(provider.async_client.chat.completions, 'create', fake_create)
+    messages = MessageHistory([Message(role='user', content='hi')])
+    msg = await provider.generate(messages=messages)
+    # Expected values mirror the constants set in _Usage and _UsageDetails.
+    assert msg.usage_metadata['prompt_tokens'] == 3019
+    assert msg.usage_metadata['completion_tokens'] == 104
+    assert msg.usage_metadata['total_tokens'] == 3123
+    assert msg.usage_metadata['cache_read_input_tokens'] == 2048