PyPI - kolega-code - Versions diffs - 0.1.0__py3-none-any.whl - Mend

kolega-code 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171) hide show

kolega_code/__init__.py +151 -0
kolega_code/agent/__init__.py +42 -0
kolega_code/agent/baseagent.py +998 -0
kolega_code/agent/browseragent.py +123 -0
kolega_code/agent/coder.py +157 -0
kolega_code/agent/common.py +41 -0
kolega_code/agent/compression.py +81 -0
kolega_code/agent/context.py +112 -0
kolega_code/agent/conversation.py +408 -0
kolega_code/agent/generalagent.py +146 -0
kolega_code/agent/investigationagent.py +123 -0
kolega_code/agent/planningagent.py +187 -0
kolega_code/agent/prompt_provider.py +196 -0
kolega_code/agent/prompt_templates/agents/browser.j2 +102 -0
kolega_code/agent/prompt_templates/agents/coder_cli_mode.j2 +127 -0
kolega_code/agent/prompt_templates/agents/general.j2 +68 -0
kolega_code/agent/prompt_templates/agents/investigation.j2 +72 -0
kolega_code/agent/prompt_templates/common/frontend_guidance.md +36 -0
kolega_code/agent/prompt_templates/common/kolega_md_instructions.md +14 -0
kolega_code/agent/prompt_templates/environment_variables/workspace_env_vars.md +11 -0
kolega_code/agent/prompt_templates/template_guidance/expo-template.md +379 -0
kolega_code/agent/prompt_templates/template_guidance/html-website-template.md +3 -0
kolega_code/agent/prompt_templates/template_guidance/mern-stack-template.md +3 -0
kolega_code/agent/prompt_templates/template_guidance/react-vite-shadcdn-template.md +182 -0
kolega_code/agent/prompts.py +192 -0
kolega_code/agent/tests/__init__.py +0 -0
kolega_code/agent/tests/llm/__init__.py +0 -0
kolega_code/agent/tests/llm/test_anthropic_token_counting.py +633 -0
kolega_code/agent/tests/llm/test_billing_openai_cache.py +74 -0
kolega_code/agent/tests/llm/test_client.py +773 -0
kolega_code/agent/tests/llm/test_dashscope_mapping.py +32 -0
kolega_code/agent/tests/llm/test_error_boundary.py +322 -0
kolega_code/agent/tests/llm/test_exceptions.py +249 -0
kolega_code/agent/tests/llm/test_instrumented_client.py +536 -0
kolega_code/agent/tests/llm/test_instrumented_client_integration.py +547 -0
kolega_code/agent/tests/llm/test_langfuse_normalization.py +39 -0
kolega_code/agent/tests/llm/test_model_specs.py +17 -0
kolega_code/agent/tests/llm/test_openai_cached_tokens.py +58 -0
kolega_code/agent/tests/llm/test_openai_cached_tokens_stream.py +74 -0
kolega_code/agent/tests/llm/test_openai_message_conversion.py +30 -0
kolega_code/agent/tests/llm/test_openai_token_counting.py +687 -0
kolega_code/agent/tests/llm/test_tool_execution_ids.py +193 -0
kolega_code/agent/tests/services/__init__.py +1 -0
kolega_code/agent/tests/services/test_browser.py +447 -0
kolega_code/agent/tests/services/test_browser_parity.py +353 -0
kolega_code/agent/tests/services/test_file_system.py +699 -0
kolega_code/agent/tests/services/test_sandbox_terminal_input.py +98 -0
kolega_code/agent/tests/services/test_terminal.py +154 -0
kolega_code/agent/tests/services/test_terminal_command_tracking.py +385 -0
kolega_code/agent/tests/services/test_terminal_state_serializer.py +262 -0
kolega_code/agent/tests/test_agent_tools_inventory.py +267 -0
kolega_code/agent/tests/test_base_agent.py +1942 -0
kolega_code/agent/tests/test_coder_attachments.py +330 -0
kolega_code/agent/tests/test_coder_prompt_extensions.py +61 -0
kolega_code/agent/tests/test_commands.py +179 -0
kolega_code/agent/tests/test_duplicate_tool_results.py +556 -0
kolega_code/agent/tests/test_empty_message_handling.py +48 -0
kolega_code/agent/tests/test_general_agent.py +242 -0
kolega_code/agent/tests/test_html.py +320 -0
kolega_code/agent/tests/test_parallel_tool_calls.py +291 -0
kolega_code/agent/tests/test_planning_agent.py +227 -0
kolega_code/agent/tests/test_prompt_provider.py +271 -0
kolega_code/agent/tests/test_tool_registry.py +102 -0
kolega_code/agent/tests/test_tools.py +549 -0
kolega_code/agent/tests/tool_backend/__init__.py +0 -0
kolega_code/agent/tests/tool_backend/test_agent_tool.py +356 -0
kolega_code/agent/tests/tool_backend/test_base_tool.py +147 -0
kolega_code/agent/tests/tool_backend/test_browser_tool.py +335 -0
kolega_code/agent/tests/tool_backend/test_build_tool.py +93 -0
kolega_code/agent/tests/tool_backend/test_create_file_tool.py +115 -0
kolega_code/agent/tests/tool_backend/test_glob_tool.py +196 -0
kolega_code/agent/tests/tool_backend/test_glob_tool_sandbox_parity.py +230 -0
kolega_code/agent/tests/tool_backend/test_list_directory_tool.py +292 -0
kolega_code/agent/tests/tool_backend/test_read_file_tool.py +173 -0
kolega_code/agent/tests/tool_backend/test_replace_entire_file_tool.py +115 -0
kolega_code/agent/tests/tool_backend/test_replace_lines_tool.py +141 -0
kolega_code/agent/tests/tool_backend/test_search_and_replace_tool.py +174 -0
kolega_code/agent/tests/tool_backend/test_search_codebase_tool.py +228 -0
kolega_code/agent/tests/tool_backend/test_terminal_tool.py +482 -0
kolega_code/agent/tests/tool_backend/test_think_hard_integration.py +189 -0
kolega_code/agent/tests/tool_backend/test_think_hard_streaming.py +445 -0
kolega_code/agent/tests/tool_backend/test_web_fetch_tool.py +194 -0
kolega_code/agent/tool_backend/agent_tool.py +414 -0
kolega_code/agent/tool_backend/apply_edit_tool.py +98 -0
kolega_code/agent/tool_backend/apply_patch_tool.py +514 -0
kolega_code/agent/tool_backend/base_tool.py +217 -0
kolega_code/agent/tool_backend/browser_tool.py +271 -0
kolega_code/agent/tool_backend/build_tool.py +93 -0
kolega_code/agent/tool_backend/create_file_tool.py +52 -0
kolega_code/agent/tool_backend/glob_tool.py +323 -0
kolega_code/agent/tool_backend/list_directory_tool.py +300 -0
kolega_code/agent/tool_backend/memory_tool.py +79 -0
kolega_code/agent/tool_backend/read_file_tool.py +119 -0
kolega_code/agent/tool_backend/replace_entire_file_tool.py +40 -0
kolega_code/agent/tool_backend/replace_lines_tool.py +97 -0
kolega_code/agent/tool_backend/search_and_replace_tool.py +146 -0
kolega_code/agent/tool_backend/search_codebase_tool.py +377 -0
kolega_code/agent/tool_backend/streaming_tool.py +47 -0
kolega_code/agent/tool_backend/terminal_tool.py +643 -0
kolega_code/agent/tool_backend/think_hard_tool.py +211 -0
kolega_code/agent/tool_backend/web_fetch_tool.py +205 -0
kolega_code/agent/tools.py +1704 -0
kolega_code/agent/utils/commands.py +94 -0
kolega_code/cli/__init__.py +1 -0
kolega_code/cli/app.py +2756 -0
kolega_code/cli/config.py +280 -0
kolega_code/cli/connection.py +49 -0
kolega_code/cli/file_index.py +147 -0
kolega_code/cli/main.py +564 -0
kolega_code/cli/mentions.py +155 -0
kolega_code/cli/messages.py +89 -0
kolega_code/cli/provider_registry.py +96 -0
kolega_code/cli/session_store.py +207 -0
kolega_code/cli/settings.py +87 -0
kolega_code/cli/skills.py +409 -0
kolega_code/cli/slash_commands.py +108 -0
kolega_code/cli/tests/__init__.py +1 -0
kolega_code/cli/tests/test_app.py +4251 -0
kolega_code/cli/tests/test_cli_config.py +171 -0
kolega_code/cli/tests/test_connection.py +26 -0
kolega_code/cli/tests/test_file_index.py +103 -0
kolega_code/cli/tests/test_main.py +455 -0
kolega_code/cli/tests/test_mentions.py +108 -0
kolega_code/cli/tests/test_session_store.py +67 -0
kolega_code/cli/tests/test_settings.py +62 -0
kolega_code/cli/tests/test_skills.py +157 -0
kolega_code/cli/tests/test_slash_commands.py +88 -0
kolega_code/cli/theme.py +180 -0
kolega_code/config.py +154 -0
kolega_code/events.py +202 -0
kolega_code/llm/client.py +300 -0
kolega_code/llm/exceptions.py +285 -0
kolega_code/llm/instrumented_client.py +520 -0
kolega_code/llm/models.py +1368 -0
kolega_code/llm/providers/__init__.py +0 -0
kolega_code/llm/providers/anthropic.py +387 -0
kolega_code/llm/providers/base.py +71 -0
kolega_code/llm/providers/google.py +157 -0
kolega_code/llm/providers/models.py +37 -0
kolega_code/llm/providers/openai.py +363 -0
kolega_code/llm/ratelimit.py +40 -0
kolega_code/llm/specs.py +67 -0
kolega_code/llm/tool_execution_ids.py +18 -0
kolega_code/models/__init__.py +9 -0
kolega_code/models/sandbox_terminal_state.py +47 -0
kolega_code/runtime.py +50 -0
kolega_code/sandbox/README.md +200 -0
kolega_code/sandbox/__init__.py +21 -0
kolega_code/sandbox/async_filesystem.py +475 -0
kolega_code/sandbox/base.py +297 -0
kolega_code/sandbox/browser.py +25 -0
kolega_code/sandbox/event_loop.py +43 -0
kolega_code/sandbox/filesystem.py +341 -0
kolega_code/sandbox/local.py +118 -0
kolega_code/sandbox/serializer.py +175 -0
kolega_code/sandbox/terminal.py +868 -0
kolega_code/sandbox/utils.py +216 -0
kolega_code/services/base.py +255 -0
kolega_code/services/browser.py +444 -0
kolega_code/services/file_system.py +749 -0
kolega_code/services/html.py +221 -0
kolega_code/services/terminal.py +903 -0
kolega_code/tools/__init__.py +22 -0
kolega_code/tools/core.py +33 -0
kolega_code/tools/definitions.py +81 -0
kolega_code/tools/registry.py +73 -0
kolega_code-0.1.0.dist-info/METADATA +157 -0
kolega_code-0.1.0.dist-info/RECORD +171 -0
kolega_code-0.1.0.dist-info/WHEEL +4 -0
kolega_code-0.1.0.dist-info/entry_points.txt +2 -0
kolega_code-0.1.0.dist-info/licenses/LICENSE +21 -0

kolega_code/agent/tests/llm/test_anthropic_token_counting.py ADDED Viewed

@@ -0,0 +1,633 @@
+"""
+Comprehensive tests comparing local vs API token counting for Anthropic provider.
+These tests verify that local tiktoken-based token counting is within 5% accuracy
+of Anthropic's official API token counting, using real system prompts and tool definitions.
+"""
+import os
+from pathlib import Path
+from typing import List
+from unittest.mock import Mock, patch
+import pytest
+from dotenv import load_dotenv
+from kolega_code.config import AgentConfig, ModelConfig, ModelProvider, RateLimitConfig
+from kolega_code.events import AgentConnectionManager
+from kolega_code.llm.client import LLMClient
+from kolega_code.llm.models import Message, MessageHistory, TextBlock, ImageBlock, ToolDefinition, ToolParameter
+from kolega_code.llm.providers.anthropic import AnthropicProvider
+from kolega_code.agent.prompt_provider import AgentMode, AgentType, PromptContext, PromptProvider
+from kolega_code.agent.tools import ToolCollection, ToolCollectionConfig
+# Load environment variables from an optional ".env" file.
+# Navigate up to backend directory: llm -> tests -> agent -> kolega_code -> backend
+# (one os.path.dirname() call per step, starting from this file's own directory).
+dotenv_path = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))), ".env"
+)
+# Only load when the file is present; a missing ".env" is tolerated silently.
+if os.path.exists(dotenv_path):
+    load_dotenv(dotenv_path)
+@pytest.fixture
+def api_key():
+    """Get Anthropic API key from environment."""
+    key = os.getenv('ANTHROPIC_API_KEY')
+    if not key:
+        pytest.skip('ANTHROPIC_API_KEY not set')
+    return key
+@pytest.fixture
+def anthropic_provider_local(api_key):
+    """Create Anthropic provider with local token counting enabled."""
+    with patch.dict(os.environ, {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': 'true'}):
+        provider = AnthropicProvider(api_key=api_key)
+    return provider
+@pytest.fixture
+def anthropic_provider_api(api_key):
+    """Create Anthropic provider with API token counting enabled."""
+    with patch.dict(os.environ, {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': 'false'}):
+        provider = AnthropicProvider(api_key=api_key)
+    return provider
+@pytest.fixture
+def simple_messages():
+    """Simple test messages."""
+    return MessageHistory([Message("user", [TextBlock("Hello, how are you?")])])
+@pytest.fixture
+def simple_system():
+    """Simple system message."""
+    return Message("system", [TextBlock("You are a helpful assistant.")])
+@pytest.fixture
+def real_system_prompt():
+    """Get real system prompt from CoderAgent."""
+    prompt_provider = PromptProvider()
+    context = PromptContext(
+        system_name="Kolega Studio",
+        project_path="/test/project",
+        is_git_repo=True,
+        platform="Linux",
+        date_today="2025-01-01",
+        model_name="claude-sonnet-4-5-20250929",
+        available_ports=[3000, 8000],
+        kolega_md="",
+        workspace_id="test-workspace",
+        workspace_environment_variables=None,
+        memories=None,
+    )
+    prompt_text = prompt_provider.get_system_prompt(
+        agent_type=AgentType.CODER,
+        mode=AgentMode.CLI,
+        template_slug=None,
+        context=context,
+    )
+    return Message("system", [TextBlock(prompt_text)])
+@pytest.fixture
+def real_tools(tmp_path):
+    """Get real tool definitions from ToolCollection."""
+    mock_connection_manager = Mock(spec=AgentConnectionManager)
+    mock_config = AgentConfig(
+        anthropic_api_key="test",
+        openai_api_key="test",
+        long_context_config=ModelConfig(
+            provider=ModelProvider.ANTHROPIC,
+            model="claude-haiku-4-5-20251001",
+            rate_limits=RateLimitConfig(),
+        ),
+        fast_config=ModelConfig(
+            provider=ModelProvider.ANTHROPIC,
+            model="claude-haiku-4-5-20251001",
+            rate_limits=RateLimitConfig(),
+        ),
+        thinking_config=ModelConfig(
+            provider=ModelProvider.ANTHROPIC,
+            model="claude-haiku-4-5-20251001",
+            rate_limits=RateLimitConfig(),
+        ),
+    )
+    tool_config = ToolCollectionConfig(
+        custom_tool_groups=["coder_agent_tools"],
+        tool_exclusions=[
+            "read_memory",
+            "write_memory",
+            "execute_terminal_command",
+            "replace_lines",
+            "apply_patch",
+            "edit_file",
+            "get_tool_list",
+            "log_error",
+            "log_info",
+            "run_command",
+            "dispatch_coding_agent",
+        ],
+    )
+    tool_collection = ToolCollection(
+        project_path=tmp_path,
+        workspace_id="test-workspace",
+        thread_id="test-thread",
+        connection_manager=mock_connection_manager,
+        config=mock_config,
+        caller=None,
+        tool_config=tool_config,
+    )
+    return tool_collection.get_tool_list()
+@pytest.fixture
+def complex_messages():
+    """Multi-turn conversation with various content types."""
+    return MessageHistory(
+        [
+            Message("user", [TextBlock("Can you help me write a Python function?")]),
+            Message(
+                "assistant",
+                [
+                    TextBlock(
+                        "Of course! I'd be happy to help you write a Python function. What would you like the function to do?"
+                    )
+                ],
+            ),
+            Message("user", [TextBlock("I need a function that calculates the factorial of a number recursively.")]),
+            Message(
+                "assistant",
+                [
+                    TextBlock(
+                        "Here's a recursive factorial function:\n\n```python\ndef factorial(n):\n    if n == 0 or n == 1:\n        return 1\n    return n * factorial(n - 1)\n```"
+                    )
+                ],
+            ),
+        ]
+    )
+def calculate_percentage_difference(local_count: int, api_count: int) -> float:
+    """Calculate percentage difference between local and API token counts."""
+    if api_count == 0:
+        return 0.0
+    return abs(local_count - api_count) / api_count * 100
+def get_accuracy_threshold(api_count: int) -> float:
+    """Get appropriate accuracy threshold based on token count.
+    Small token counts (<200) have higher variance due to fixed overhead,
+    so we use a more lenient threshold. For realistic agent contexts (>200 tokens),
+    we enforce the strict 5% threshold.
+    """
+    if api_count < 200:
+        return 15.0  # Lenient threshold for small samples
+    return 5.0  # Strict threshold for realistic contexts
+@pytest.mark.slow
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_simple_message_comparison(
+    anthropic_provider_local,
+    anthropic_provider_api,
+    simple_messages,
+    simple_system,
+):
+    """Compare token counts for basic user/system messages."""
+    model = "claude-sonnet-4-5-20250929"
+    # Get counts from both methods
+    local_result = await anthropic_provider_local.count_tokens(
+        messages=simple_messages,
+        system=simple_system,
+        model=model,
+        tools=[],
+    )
+    api_result = await anthropic_provider_api.count_tokens(
+        messages=simple_messages,
+        system=simple_system,
+        model=model,
+        tools=[],
+    )
+    # Calculate percentage difference
+    diff_pct = calculate_percentage_difference(local_result.input_tokens, api_result.input_tokens)
+    threshold = get_accuracy_threshold(api_result.input_tokens)
+    print(f"\nSimple message comparison:")
+    print(f"  Local count: {local_result.input_tokens}")
+    print(f"  API count: {api_result.input_tokens}")
+    print(f"  Difference: {diff_pct:.2f}%")
+    print(f"  Threshold: {threshold:.1f}% (small sample)")
+    # Assert within threshold
+    assert (
+        diff_pct <= threshold
+    ), f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold (local={local_result.input_tokens}, api={api_result.input_tokens})"
+@pytest.mark.slow
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_with_real_system_prompt(
+    anthropic_provider_local,
+    anthropic_provider_api,
+    simple_messages,
+    real_system_prompt,
+):
+    """Test with actual CoderAgent system prompt."""
+    model = "claude-sonnet-4-5-20250929"
+    # Get counts from both methods
+    local_result = await anthropic_provider_local.count_tokens(
+        messages=simple_messages,
+        system=real_system_prompt,
+        model=model,
+        tools=[],
+    )
+    api_result = await anthropic_provider_api.count_tokens(
+        messages=simple_messages,
+        system=real_system_prompt,
+        model=model,
+        tools=[],
+    )
+    # Calculate percentage difference
+    diff_pct = calculate_percentage_difference(local_result.input_tokens, api_result.input_tokens)
+    threshold = get_accuracy_threshold(api_result.input_tokens)
+    print(f"\nReal system prompt comparison:")
+    print(f"  Local count: {local_result.input_tokens}")
+    print(f"  API count: {api_result.input_tokens}")
+    print(f"  Difference: {diff_pct:.2f}%")
+    print(f"  Threshold: {threshold:.1f}%")
+    # Assert within 5% tolerance (realistic context size)
+    assert (
+        diff_pct <= threshold
+    ), f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold (local={local_result.input_tokens}, api={api_result.input_tokens})"
+@pytest.mark.slow
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_with_tools(
+    anthropic_provider_local,
+    anthropic_provider_api,
+    simple_messages,
+    simple_system,
+    real_tools,
+):
+    """Test with real tool definitions from ToolCollection."""
+    model = "claude-sonnet-4-5-20250929"
+    # Get counts from both methods
+    local_result = await anthropic_provider_local.count_tokens(
+        messages=simple_messages,
+        system=simple_system,
+        model=model,
+        tools=real_tools,
+    )
+    api_result = await anthropic_provider_api.count_tokens(
+        messages=simple_messages,
+        system=simple_system,
+        model=model,
+        tools=real_tools,
+    )
+    # Calculate percentage difference
+    diff_pct = calculate_percentage_difference(local_result.input_tokens, api_result.input_tokens)
+    threshold = get_accuracy_threshold(api_result.input_tokens)
+    print(f"\nWith tools comparison:")
+    print(f"  Tool count: {len(real_tools)}")
+    print(f"  Local count: {local_result.input_tokens}")
+    print(f"  API count: {api_result.input_tokens}")
+    print(f"  Difference: {diff_pct:.2f}%")
+    print(f"  Threshold: {threshold:.1f}%")
+    # Assert within 5% tolerance (realistic context with tools)
+    assert (
+        diff_pct <= threshold
+    ), f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold (local={local_result.input_tokens}, api={api_result.input_tokens})"
+@pytest.mark.slow
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_with_complex_conversation(
+    anthropic_provider_local,
+    anthropic_provider_api,
+    complex_messages,
+    simple_system,
+):
+    """Test with multi-turn conversation."""
+    model = "claude-sonnet-4-5-20250929"
+    # Get counts from both methods
+    local_result = await anthropic_provider_local.count_tokens(
+        messages=complex_messages,
+        system=simple_system,
+        model=model,
+        tools=[],
+    )
+    api_result = await anthropic_provider_api.count_tokens(
+        messages=complex_messages,
+        system=simple_system,
+        model=model,
+        tools=[],
+    )
+    # Calculate percentage difference
+    diff_pct = calculate_percentage_difference(local_result.input_tokens, api_result.input_tokens)
+    threshold = get_accuracy_threshold(api_result.input_tokens)
+    print(f"\nComplex conversation comparison:")
+    print(f"  Message count: {len(complex_messages)}")
+    print(f"  Local count: {local_result.input_tokens}")
+    print(f"  API count: {api_result.input_tokens}")
+    print(f"  Difference: {diff_pct:.2f}%")
+    print(f"  Threshold: {threshold:.1f}% (small sample)")
+    # Assert within threshold
+    assert (
+        diff_pct <= threshold
+    ), f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold (local={local_result.input_tokens}, api={api_result.input_tokens})"
+@pytest.mark.slow
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_with_images(
+    anthropic_provider_local,
+    anthropic_provider_api,
+    simple_system,
+):
+    """Test token counting with image attachments."""
+    model = "claude-sonnet-4-5-20250929"
+    # Create a small test image (1x1 pixel PNG as base64)
+    # This is a tiny 1x1 transparent PNG
+    tiny_image_base64 = (
+        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+    )
+    # Create a message with image
+    messages_with_image = MessageHistory(
+        [
+            Message(
+                "user",
+                [
+                    TextBlock("What do you see in this image?"),
+                    ImageBlock(image_type="base64", media_type="image/png", data=tiny_image_base64),
+                ],
+            )
+        ]
+    )
+    # Get counts from both methods
+    local_result = await anthropic_provider_local.count_tokens(
+        messages=messages_with_image,
+        system=simple_system,
+        model=model,
+        tools=[],
+    )
+    api_result = await anthropic_provider_api.count_tokens(
+        messages=messages_with_image,
+        system=simple_system,
+        model=model,
+        tools=[],
+    )
+    # Calculate percentage difference
+    diff_pct = calculate_percentage_difference(local_result.input_tokens, api_result.input_tokens)
+    image_threshold = 200.0
+    print(f"\nWith images comparison:")
+    print(f"  Image size: {len(tiny_image_base64)} chars (base64)")
+    print(f"  Local count: {local_result.input_tokens}")
+    print(f"  API count: {api_result.input_tokens}")
+    print(f"  Difference: {diff_pct:.2f}%")
+    print(f"  Threshold: {image_threshold:.1f}% (image estimate)")
+    # Images are harder to estimate precisely without decoding, but we verify:
+    # 1. Both methods counted more than text-only (proving images are counted)
+    # 2. Both counts are non-zero (images aren't ignored)
+    # Text-only would be ~14 tokens, so >20 proves image was counted
+    assert local_result.input_tokens > 20, "Local counting should include image tokens"
+    assert api_result.input_tokens > 20, "API counting should include image tokens"
+    # For images, allow very high variance since:
+    # - We estimate without decoding (no actual pixel dimensions)
+    # - This tiny 1x1 test image is an edge case (96 chars base64)
+    # - Normal conversation images (screenshots, etc.) will be much larger and more accurate
+    # - The key goal is images aren't ignored (count > 0)
+    assert (
+        diff_pct <= image_threshold
+    ), f"Difference {diff_pct:.2f}% exceeds {image_threshold:.1f}% threshold for images (local={local_result.input_tokens}, api={api_result.input_tokens})"
+@pytest.mark.slow
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_full_agent_context(
+    anthropic_provider_local,
+    anthropic_provider_api,
+    complex_messages,
+    real_system_prompt,
+    real_tools,
+):
+    """Test with full agent context: real system prompt, complex messages, and tools."""
+    model = "claude-sonnet-4-5-20250929"
+    # Get counts from both methods
+    local_result = await anthropic_provider_local.count_tokens(
+        messages=complex_messages,
+        system=real_system_prompt,
+        model=model,
+        tools=real_tools,
+    )
+    api_result = await anthropic_provider_api.count_tokens(
+        messages=complex_messages,
+        system=real_system_prompt,
+        model=model,
+        tools=real_tools,
+    )
+    # Calculate percentage difference
+    diff_pct = calculate_percentage_difference(local_result.input_tokens, api_result.input_tokens)
+    threshold = get_accuracy_threshold(api_result.input_tokens)
+    print(f"\nFull agent context comparison:")
+    print(f"  Message count: {len(complex_messages)}")
+    print(f"  Tool count: {len(real_tools)}")
+    print(f"  Local count: {local_result.input_tokens}")
+    print(f"  API count: {api_result.input_tokens}")
+    print(f"  Difference: {diff_pct:.2f}%")
+    print(f"  Threshold: {threshold:.1f}%")
+    # Assert within 5% tolerance (realistic full agent context)
+    assert (
+        diff_pct <= threshold
+    ), f"Difference {diff_pct:.2f}% exceeds {threshold:.1f}% threshold (local={local_result.input_tokens}, api={api_result.input_tokens})"
+@pytest.mark.slow
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_accuracy_threshold_summary(
+    anthropic_provider_local,
+    anthropic_provider_api,
+    simple_messages,
+    simple_system,
+    complex_messages,
+    real_system_prompt,
+    real_tools,
+):
+    """Run all comparison scenarios and verify all are within their thresholds."""
+    model = "claude-sonnet-4-5-20250929"
+    # Create message with image for testing
+    tiny_image_base64 = (
+        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+    )
+    messages_with_image = MessageHistory(
+        [
+            Message(
+                "user",
+                [
+                    TextBlock("What do you see?"),
+                    ImageBlock(image_type="base64", media_type="image/png", data=tiny_image_base64),
+                ],
+            )
+        ]
+    )
+    test_scenarios = [
+        ("Simple", simple_messages, simple_system, []),
+        ("Real System", simple_messages, real_system_prompt, []),
+        ("With Tools", simple_messages, simple_system, real_tools),
+        ("Complex Messages", complex_messages, simple_system, []),
+        ("With Images", messages_with_image, simple_system, []),
+        ("Full Context", complex_messages, real_system_prompt, real_tools),
+    ]
+    results = []
+    for name, messages, system, tools in test_scenarios:
+        local_result = await anthropic_provider_local.count_tokens(
+            messages=messages,
+            system=system,
+            model=model,
+            tools=tools,
+        )
+        api_result = await anthropic_provider_api.count_tokens(
+            messages=messages,
+            system=system,
+            model=model,
+            tools=tools,
+        )
+        diff_pct = calculate_percentage_difference(local_result.input_tokens, api_result.input_tokens)
+        results.append((name, local_result.input_tokens, api_result.input_tokens, diff_pct))
+    # Print summary
+    print("\n" + "=" * 80)
+    print("Token Counting Accuracy Summary")
+    print("=" * 80)
+    print(f'{"Scenario":<20} {"Local":<10} {"API":<10} {"Diff %":<10} {"Status":<10}')
+    print("-" * 80)
+    all_within_threshold = True
+    for name, local_count, api_count, diff_pct in results:
+        # Images get special handling - they're estimated without decoding
+        if "Images" in name:
+            threshold = 200.0
+        else:
+            threshold = get_accuracy_threshold(api_count)
+        status = "✓ PASS" if diff_pct <= threshold else "✗ FAIL"
+        if diff_pct > threshold:
+            all_within_threshold = False
+        threshold_str = f"{threshold:.0f}%"
+        print(f"{name:<20} {local_count:<10} {api_count:<10} {diff_pct:<10.2f} {status:<10}")
+    print("=" * 80)
+    print("Note: Realistic agent contexts (>200 tokens) must be within 5%.")
+    print("      Small samples (<200 tokens) allowed up to 15% due to fixed overhead.")
+    print("      Images allowed up to 200% variance (estimated without decoding).")
+    print("=" * 80)
+    # Assert all scenarios pass their respective thresholds
+    assert all_within_threshold, 'One or more scenarios exceeded their accuracy threshold'
+def test_environment_variable_default(api_key):
+    """Test that local token counting defaults to False when env var not set."""
+    # Clear the environment variable
+    with patch.dict(os.environ, {}, clear=False):
+        if 'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING' in os.environ:
+            del os.environ['ANTHROPIC_USE_LOCAL_TOKEN_COUNTING']
+        provider = AnthropicProvider(api_key=api_key)
+    assert provider.use_local_token_counting is False, 'Should default to False when env var not set'
+def test_environment_variable_true(api_key):
+    """Test that local token counting is enabled when env var is 'true'."""
+    with patch.dict(os.environ, {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': 'true'}):
+        provider = AnthropicProvider(api_key=api_key)
+    assert provider.use_local_token_counting is True, 'Should be True when env var is "true"'
+def test_environment_variable_false(api_key):
+    """Test that local token counting is disabled when env var is 'false'."""
+    with patch.dict(os.environ, {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': 'false'}):
+        provider = AnthropicProvider(api_key=api_key)
+    assert provider.use_local_token_counting is False, 'Should be False when env var is "false"'
+def test_environment_variable_case_insensitive(api_key):
+    """Test that env var is case insensitive."""
+    test_cases = [
+        ('TRUE', True),
+        ('True', True),
+        ('TrUe', True),
+        ('FALSE', False),
+        ('False', False),
+        ('FaLsE', False),
+    ]
+    for env_value, expected_result in test_cases:
+        with patch.dict(os.environ, {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': env_value}):
+            provider = AnthropicProvider(api_key=api_key)
+        assert provider.use_local_token_counting is expected_result, f'Failed for env_value={env_value}'
+def test_environment_variable_invalid_value(api_key):
+    """Test that invalid env var values default to False."""
+    invalid_values = ['yes', 'no', '1', '0', 'enabled', 'disabled', 'garbage']
+    for invalid_value in invalid_values:
+        with patch.dict(os.environ, {'ANTHROPIC_USE_LOCAL_TOKEN_COUNTING': invalid_value}):
+            provider = AnthropicProvider(api_key=api_key)
+        assert provider.use_local_token_counting is False, f'Should default to False for invalid value: {invalid_value}'

kolega_code/agent/tests/llm/test_billing_openai_cache.py ADDED Viewed

@@ -0,0 +1,74 @@
+import pytest
+from kolega_code.llm.instrumented_client import InstrumentedLLMClient
+class _UsageRecorder:
+    """Test double that remembers the most recent payload passed to ``record_usage``."""
+    def __init__(self):
+        # Stays None until record_usage is called.
+        self.payload = None
+    def record_usage(self, usage_data):
+        """Store ``usage_data`` so the test can inspect it; earlier payloads are overwritten."""
+        self.payload = usage_data
+@pytest.mark.asyncio
+async def test_usage_recorder_maps_openai_cached_tokens():
+    recorder = _UsageRecorder()
+    client = InstrumentedLLMClient(
+        provider='openai',
+        api_key='sk',
+        langfuse_client=None,
+        user_id='u1',
+        workspace_id='w1',
+        thread_id='t1',
+        usage_recorder=recorder,
+    )
+    usage = {
+        'provider': 'openai',
+        'prompt_tokens': 10,
+        'completion_tokens': 2,
+        'cache_read_input_tokens': 2048,
+    }
+    await client._record_usage(usage, model='m1', success=True)
+    assert recorder.payload['input_tokens'] == 10
+    assert recorder.payload['output_tokens'] == 2
+    assert recorder.payload['cache_read_input_tokens'] == 2048
+@pytest.mark.asyncio
+async def test_usage_recorder_maps_moonshot_response_usage():
+    recorder = _UsageRecorder()
+    client = InstrumentedLLMClient(
+        provider='moonshot',
+        api_key='sk',
+        langfuse_client=None,
+        user_id='u1',
+        workspace_id='w1',
+        thread_id='t1',
+        usage_recorder=recorder,
+    )
+    usage = {
+        'provider': 'moonshot',
+        'input_tokens': 123,
+        'output_tokens': 45,
+        'cache_read_input_tokens': 67,
+        'cache_write_input_tokens': 89,
+        # Moonshot may return these aliases too; billing should use the
+        # Anthropic-shaped fields above for Kimi accounting.
+        'prompt_tokens': 999,
+        'completion_tokens': 888,
+        'total_tokens': 1887,
+    }
+    await client._record_usage(usage, model='kimi-k2.6', success=True)
+    assert recorder.payload['provider'] == 'moonshot'
+    assert recorder.payload['model'] == 'kimi-k2.6'
+    assert recorder.payload['input_tokens'] == 123
+    assert recorder.payload['output_tokens'] == 45
+    assert recorder.payload['cache_read_input_tokens'] == 67
+    assert recorder.payload['cache_write_input_tokens'] == 89
+    assert recorder.payload['metadata']['raw_usage'] == usage