hindsight-api 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +1 -1
- hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +16 -2
- hindsight_api/api/http.py +39 -1
- hindsight_api/banner.py +3 -0
- hindsight_api/config.py +44 -6
- hindsight_api/daemon.py +17 -111
- hindsight_api/engine/llm_interface.py +146 -0
- hindsight_api/engine/llm_wrapper.py +304 -1327
- hindsight_api/engine/memory_engine.py +71 -37
- hindsight_api/engine/providers/__init__.py +14 -0
- hindsight_api/engine/providers/anthropic_llm.py +434 -0
- hindsight_api/engine/providers/claude_code_llm.py +352 -0
- hindsight_api/engine/providers/codex_llm.py +527 -0
- hindsight_api/engine/providers/gemini_llm.py +502 -0
- hindsight_api/engine/providers/mock_llm.py +234 -0
- hindsight_api/engine/providers/openai_compatible_llm.py +745 -0
- hindsight_api/extensions/__init__.py +2 -0
- hindsight_api/extensions/builtin/tenant.py +36 -0
- hindsight_api/extensions/operation_validator.py +26 -0
- hindsight_api/main.py +6 -21
- hindsight_api/migrations.py +75 -0
- hindsight_api/worker/main.py +35 -10
- hindsight_api/worker/poller.py +15 -11
- {hindsight_api-0.4.7.dist-info → hindsight_api-0.4.8.dist-info}/METADATA +2 -1
- {hindsight_api-0.4.7.dist-info → hindsight_api-0.4.8.dist-info}/RECORD +27 -19
- {hindsight_api-0.4.7.dist-info → hindsight_api-0.4.8.dist-info}/WHEEL +0 -0
- {hindsight_api-0.4.7.dist-info → hindsight_api-0.4.8.dist-info}/entry_points.txt +0 -0
hindsight_api/engine/memory_engine.py

```diff
@@ -303,8 +303,10 @@ class MemoryEngine(MemoryEngineInterface):
         db_url = db_url or config.database_url
         memory_llm_provider = memory_llm_provider or config.llm_provider
         memory_llm_api_key = memory_llm_api_key or config.llm_api_key
-        # Ollama and mock don't require an API key
-        if not memory_llm_api_key and memory_llm_provider not in ("ollama", "mock"):
+        # Ollama, openai-codex, claude-code, and mock don't require an API key
+        # openai-codex uses OAuth tokens from ~/.codex/auth.json
+        # claude-code uses OAuth tokens from macOS Keychain
+        if not memory_llm_api_key and memory_llm_provider not in ("ollama", "openai-codex", "claude-code", "mock"):
             raise ValueError("LLM API key is required. Set HINDSIGHT_API_LLM_API_KEY environment variable.")
         memory_llm_model = memory_llm_model or config.llm_model
         memory_llm_base_url = memory_llm_base_url or config.get_llm_base_url() or None
```
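The provider gate above is easy to read in isolation. A minimal standalone sketch of the same rule (the helper name is ours, not part of hindsight_api):

```python
# Sketch of the key-requirement rule this hunk introduces (illustrative only).
KEYLESS_PROVIDERS = {"ollama", "openai-codex", "claude-code", "mock"}

def check_llm_api_key(provider: str, api_key: str | None) -> None:
    """Raise if a provider that needs a key was configured without one."""
    if not api_key and provider not in KEYLESS_PROVIDERS:
        raise ValueError("LLM API key is required. Set HINDSIGHT_API_LLM_API_KEY environment variable.")

check_llm_api_key("openai-codex", None)   # ok: uses OAuth tokens from ~/.codex/auth.json
check_llm_api_key("anthropic", "sk-...")  # ok: key supplied
# check_llm_api_key("anthropic", None)    # would raise ValueError
```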
hindsight_api/engine/memory_engine.py

```diff
@@ -457,7 +459,11 @@ class MemoryEngine(MemoryEngineInterface):
         # Store operation validator extension (optional)
         self._operation_validator = operation_validator
 
-        # Store tenant extension (optional)
+        # Store tenant extension (always set, use default if none provided)
+        if tenant_extension is None:
+            from ..extensions.builtin.tenant import DefaultTenantExtension
+
+            tenant_extension = DefaultTenantExtension(config={})
         self._tenant_extension = tenant_extension
 
     async def _validate_operation(self, validation_coro) -> None:
```
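The diff only shows DefaultTenantExtension being imported and constructed; combined with the removed "public" fallback in the next hunk, a plausible minimal shape for it is the following hypothetical sketch (the real class lives in hindsight_api/extensions/builtin/tenant.py and may differ):

```python
# Hypothetical sketch of a no-auth default tenant extension. The schema_name
# field is inferred from _current_schema.set(tenant_context.schema_name) below.
from dataclasses import dataclass
from typing import Any

@dataclass
class TenantContext:
    schema_name: str

class DefaultTenantExtension:
    def __init__(self, config: dict[str, Any]):
        self.config = config

    async def authenticate(self, request_context: Any) -> TenantContext:
        # No-auth default: every request maps to the "public" schema,
        # reproducing the hard-coded fallback that the next hunk removes.
        return TenantContext(schema_name="public")
```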
hindsight_api/engine/memory_engine.py

```diff
@@ -495,22 +501,18 @@ class MemoryEngine(MemoryEngineInterface):
         Raises:
             AuthenticationError: If authentication fails or request_context is missing when required.
         """
-        if self._tenant_extension is None:
-            _current_schema.set("public")
-            return "public"
-
         from hindsight_api.extensions import AuthenticationError
 
         if request_context is None:
-            raise AuthenticationError("RequestContext is required
+            raise AuthenticationError("RequestContext is required")
 
         # For internal/background operations (e.g., worker tasks), skip extension authentication.
         # The task was already authenticated at submission time, and execute_task sets _current_schema
-        # from the task's _schema field.
+        # from the task's _schema field.
         if request_context.internal:
             return _current_schema.get()
 
-        #
+        # Authenticate through tenant extension (always set, may be default no-auth extension)
         tenant_context = await self._tenant_extension.authenticate(request_context)
 
         _current_schema.set(tenant_context.schema_name)
```
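The `_current_schema.set(...)`/`.get()` calls suggest a ContextVar that keeps the active schema isolated per asyncio task. A self-contained sketch of that mechanism, assuming `_current_schema` is an ordinary ContextVar:

```python
# Each asyncio task gets a copy of the context, so setting the schema in one
# request handler cannot leak into a concurrently running one.
import asyncio
from contextvars import ContextVar

_current_schema: ContextVar[str] = ContextVar("current_schema", default="public")

async def handle_request(schema: str) -> str:
    _current_schema.set(schema)
    await asyncio.sleep(0)        # other tasks may run here
    return _current_schema.get()  # still sees its own schema

async def main() -> None:
    results = await asyncio.gather(handle_request("tenant_a"), handle_request("tenant_b"))
    assert results == ["tenant_a", "tenant_b"]

asyncio.run(main())
```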
hindsight_api/engine/memory_engine.py

```diff
@@ -536,10 +538,15 @@ class MemoryEngine(MemoryEngineInterface):
             f"[BATCH_RETAIN_TASK] Starting background batch retain for bank_id={bank_id}, {len(contents)} items"
         )
 
-        #
+        # Restore tenant_id/api_key_id from task payload so downstream operations
+        # (e.g., consolidation and mental model refreshes) can attribute usage.
        from hindsight_api.models import RequestContext
 
-        internal_context = RequestContext(
+        internal_context = RequestContext(
+            internal=True,
+            tenant_id=task_dict.get("_tenant_id"),
+            api_key_id=task_dict.get("_api_key_id"),
+        )
         await self.retain_batch_async(bank_id=bank_id, contents=contents, request_context=internal_context)
 
         logger.info(f"[BATCH_RETAIN_TASK] Completed background batch retain for bank_id={bank_id}")
```
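This worker-side restore pairs with the submission-side hunks near lines 5458 and 5490 below, which stamp `_tenant_id`/`_api_key_id` into the task payload. A round-trip sketch of the pattern (the RequestContext fields follow the diff; the helper functions are illustrative, not package APIs):

```python
from dataclasses import dataclass

@dataclass
class RequestContext:
    internal: bool = False
    tenant_id: str | None = None
    api_key_id: str | None = None

def stamp_payload(payload: dict, ctx: RequestContext) -> dict:
    # Submission side: attach attribution fields only when present.
    if ctx.tenant_id:
        payload["_tenant_id"] = ctx.tenant_id
    if ctx.api_key_id:
        payload["_api_key_id"] = ctx.api_key_id
    return payload

def restore_context(task_dict: dict) -> RequestContext:
    # Worker side: rebuild an internal context from the stored payload.
    return RequestContext(
        internal=True,
        tenant_id=task_dict.get("_tenant_id"),
        api_key_id=task_dict.get("_api_key_id"),
    )

task = stamp_payload({"contents": ["..."]}, RequestContext(tenant_id="acme", api_key_id="key-1"))
ctx = restore_context(task)
assert ctx.tenant_id == "acme" and ctx.internal
```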
hindsight_api/engine/memory_engine.py

```diff
@@ -565,7 +572,13 @@ class MemoryEngine(MemoryEngineInterface):
 
         from .consolidation import run_consolidation_job
 
-        internal_context = RequestContext(internal=True)
+        # Restore tenant_id/api_key_id from task payload so downstream operations
+        # (e.g., mental model refreshes) can attribute usage to the correct org.
+        internal_context = RequestContext(
+            internal=True,
+            tenant_id=task_dict.get("_tenant_id"),
+            api_key_id=task_dict.get("_api_key_id"),
+        )
         result = await run_consolidation_job(
             memory_engine=self,
             bank_id=bank_id,
```
hindsight_api/engine/memory_engine.py

```diff
@@ -926,30 +939,34 @@ class MemoryEngine(MemoryEngineInterface):
 
         if not self.db_url:
             raise ValueError("Database URL is required for migrations")
-        logger.info("Running database migrations...")
-        # Use configured database schema for migrations (defaults to "public")
-        run_migrations(self.db_url, schema=get_config().database_schema)
 
-        # Migrate all
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Migrate all schemas from the tenant extension
+        # The tenant extension is the single source of truth for which schemas exist
+        logger.info("Running database migrations...")
+        try:
+            tenants = await self._tenant_extension.list_tenants()
+            if tenants:
+                logger.info(f"Running migrations on {len(tenants)} schema(s)...")
+                for tenant in tenants:
+                    schema = tenant.schema
+                    if schema:
+                        try:
+                            run_migrations(self.db_url, schema=schema)
+                        except Exception as e:
+                            logger.warning(f"Failed to migrate schema {schema}: {e}")
+                logger.info("Schema migrations completed")
+
+            # Ensure embedding column dimension matches the model's dimension
+            # This is done after migrations and after embeddings.initialize()
+            for tenant in tenants:
+                schema = tenant.schema
+                if schema:
+                    try:
+                        ensure_embedding_dimension(self.db_url, self.embeddings.dimension, schema=schema)
+                    except Exception as e:
+                        logger.warning(f"Failed to ensure embedding dimension for schema {schema}: {e}")
+        except Exception as e:
+            logger.warning(f"Failed to run schema migrations: {e}")
 
         logger.info(f"Connecting to PostgreSQL at {self.db_url}")
 
```
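The new loop assumes a small contract from the tenant extension: `list_tenants()` returns objects exposing a `schema` attribute, and a failure in one schema is logged rather than aborting the rest. A sketch of that contract (the `Tenant` dataclass and helper are assumptions, not the package's types):

```python
import asyncio
import logging
from dataclasses import dataclass
from typing import Callable

logger = logging.getLogger(__name__)

@dataclass
class Tenant:
    schema: str

async def migrate_all_schemas(tenants: list[Tenant], migrate: Callable[[str], None]) -> None:
    for tenant in tenants:
        if tenant.schema:
            try:
                migrate(tenant.schema)
            except Exception as e:
                # One bad schema should not abort migrations for the others.
                logger.warning(f"Failed to migrate schema {tenant.schema}: {e}")

asyncio.run(migrate_all_schemas([Tenant("public"), Tenant("tenant_acme")], lambda s: None))
```

The same best-effort design applies to the embedding-dimension pass: each schema is attempted independently, with warnings instead of hard failures.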
hindsight_api/engine/memory_engine.py

```diff
@@ -5458,6 +5475,13 @@ class MemoryEngine(MemoryEngineInterface):
         task_payload: dict[str, Any] = {"contents": contents}
         if document_tags:
             task_payload["document_tags"] = document_tags
+        # Pass tenant_id and api_key_id through task payload so the worker
+        # can propagate request context to downstream operations (e.g.,
+        # consolidation and mental model refreshes triggered after retain).
+        if request_context.tenant_id:
+            task_payload["_tenant_id"] = request_context.tenant_id
+        if request_context.api_key_id:
+            task_payload["_api_key_id"] = request_context.api_key_id
 
         result = await self._submit_async_operation(
             bank_id=bank_id,
```
hindsight_api/engine/memory_engine.py

```diff
@@ -5490,11 +5514,21 @@ class MemoryEngine(MemoryEngineInterface):
             Dict with operation_id
         """
         await self._authenticate_tenant(request_context)
+
+        # Pass tenant_id and api_key_id through task payload so the worker
+        # can provide request context to extension hooks (e.g., usage metering
+        # for mental model refreshes triggered by consolidation).
+        task_payload: dict[str, Any] = {}
+        if request_context.tenant_id:
+            task_payload["_tenant_id"] = request_context.tenant_id
+        if request_context.api_key_id:
+            task_payload["_api_key_id"] = request_context.api_key_id
+
         return await self._submit_async_operation(
             bank_id=bank_id,
             operation_type="consolidation",
             task_type="consolidation",
-            task_payload=
+            task_payload=task_payload,
             dedupe_by_bank=True,
         )
 
```
hindsight_api/engine/providers/__init__.py (new file, @@ -0,0 +1,14 @@)

```python
"""
LLM provider implementations.

This package contains concrete implementations of the LLMInterface for various providers.
"""

from .anthropic_llm import AnthropicLLM
from .claude_code_llm import ClaudeCodeLLM
from .codex_llm import CodexLLM
from .gemini_llm import GeminiLLM
from .mock_llm import MockLLM
from .openai_compatible_llm import OpenAICompatibleLLM

__all__ = ["AnthropicLLM", "ClaudeCodeLLM", "CodexLLM", "GeminiLLM", "MockLLM", "OpenAICompatibleLLM"]
```
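A hypothetical selection sketch built on the new package's exports, assuming the other providers share AnthropicLLM's constructor signature shown in the next file (the registry dict and factory are ours, not part of hindsight_api):

```python
from hindsight_api.engine.providers import (
    AnthropicLLM,
    GeminiLLM,
    MockLLM,
    OpenAICompatibleLLM,
)

# Map provider names to classes; hindsight_api's own wiring may differ.
PROVIDERS = {
    "anthropic": AnthropicLLM,
    "gemini": GeminiLLM,
    "mock": MockLLM,
    "openai": OpenAICompatibleLLM,
}

def make_llm(provider: str, api_key: str, base_url: str, model: str):
    try:
        cls = PROVIDERS[provider]
    except KeyError:
        raise ValueError(f"Unknown LLM provider: {provider}")
    return cls(provider=provider, api_key=api_key, base_url=base_url, model=model)
```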
hindsight_api/engine/providers/anthropic_llm.py (new file, @@ -0,0 +1,434 @@)

````python
"""
Anthropic LLM provider using the Anthropic Python SDK.

This provider enables using Claude models from Anthropic with support for:
- Structured JSON output
- Tool/function calling with proper format conversion
- Extended thinking mode
- Retry logic with exponential backoff
"""

import asyncio
import json
import logging
import time
from typing import Any

from hindsight_api.engine.llm_interface import LLMInterface, OutputTooLongError
from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage
from hindsight_api.metrics import get_metrics_collector

logger = logging.getLogger(__name__)


class AnthropicLLM(LLMInterface):
    """
    LLM provider using Anthropic's Claude models.

    Supports structured output, tool calling, and extended thinking mode.
    Handles format conversion between OpenAI-style messages and Anthropic's format.
    """

    def __init__(
        self,
        provider: str,
        api_key: str,
        base_url: str,
        model: str,
        reasoning_effort: str = "low",
        timeout: float = 300.0,
        **kwargs: Any,
    ):
        """
        Initialize Anthropic LLM provider.

        Args:
            provider: Provider name (should be "anthropic").
            api_key: Anthropic API key.
            base_url: Base URL for the API (optional, uses Anthropic default if empty).
            model: Model name (e.g., "claude-sonnet-4-20250514").
            reasoning_effort: Reasoning effort level (not used by Anthropic).
            timeout: Request timeout in seconds.
            **kwargs: Additional provider-specific parameters.
        """
        super().__init__(provider, api_key, base_url, model, reasoning_effort, **kwargs)

        if not self.api_key:
            raise ValueError("API key is required for Anthropic provider")

        # Import and initialize Anthropic client
        try:
            from anthropic import AsyncAnthropic

            client_kwargs: dict[str, Any] = {"api_key": self.api_key}
            if self.base_url:
                client_kwargs["base_url"] = self.base_url
            if timeout:
                client_kwargs["timeout"] = timeout

            self._client = AsyncAnthropic(**client_kwargs)
            logger.info(f"Anthropic client initialized for model: {self.model}")
        except ImportError as e:
            raise RuntimeError("Anthropic SDK not installed. Run: uv add anthropic or pip install anthropic") from e

    async def verify_connection(self) -> None:
        """
        Verify that the Anthropic provider is configured correctly by making a simple test call.

        Raises:
            RuntimeError: If the connection test fails.
        """
        try:
            test_messages = [{"role": "user", "content": "test"}]
            await self.call(
                messages=test_messages,
                max_completion_tokens=10,
                temperature=0.0,
                scope="test",
                max_retries=0,
            )
            logger.info("Anthropic connection verified successfully")
        except Exception as e:
            logger.error(f"Anthropic connection verification failed: {e}")
            raise RuntimeError(f"Failed to verify Anthropic connection: {e}") from e

    async def call(
        self,
        messages: list[dict[str, str]],
        response_format: Any | None = None,
        max_completion_tokens: int | None = None,
        temperature: float | None = None,
        scope: str = "memory",
        max_retries: int = 10,
        initial_backoff: float = 1.0,
        max_backoff: float = 60.0,
        skip_validation: bool = False,
        strict_schema: bool = False,
        return_usage: bool = False,
    ) -> Any:
        """
        Make an LLM API call with retry logic.

        Args:
            messages: List of message dicts with 'role' and 'content'.
            response_format: Optional Pydantic model for structured output.
            max_completion_tokens: Maximum tokens in response.
            temperature: Sampling temperature (0.0-2.0).
            scope: Scope identifier for tracking.
            max_retries: Maximum retry attempts.
            initial_backoff: Initial backoff time in seconds.
            max_backoff: Maximum backoff time in seconds.
            skip_validation: Return raw JSON without Pydantic validation.
            strict_schema: Use strict JSON schema enforcement (not supported by Anthropic).
            return_usage: If True, return tuple (result, TokenUsage) instead of just result.

        Returns:
            If return_usage=False: Parsed response if response_format is provided, otherwise text content.
            If return_usage=True: Tuple of (result, TokenUsage) with token counts.

        Raises:
            OutputTooLongError: If output exceeds token limits.
            Exception: Re-raises API errors after retries exhausted.
        """
        from anthropic import APIConnectionError, APIStatusError, RateLimitError

        start_time = time.time()

        # Convert OpenAI-style messages to Anthropic format
        system_prompt = None
        anthropic_messages = []

        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")

            if role == "system":
                if system_prompt:
                    system_prompt += "\n\n" + content
                else:
                    system_prompt = content
            else:
                anthropic_messages.append({"role": role, "content": content})

        # Add JSON schema instruction if response_format is provided
        if response_format is not None and hasattr(response_format, "model_json_schema"):
            schema = response_format.model_json_schema()
            schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
            if system_prompt:
                system_prompt += schema_msg
            else:
                system_prompt = schema_msg

        # Prepare parameters
        call_params: dict[str, Any] = {
            "model": self.model,
            "messages": anthropic_messages,
            "max_tokens": max_completion_tokens if max_completion_tokens is not None else 4096,
        }

        if system_prompt:
            call_params["system"] = system_prompt

        if temperature is not None:
            call_params["temperature"] = temperature

        last_exception = None

        for attempt in range(max_retries + 1):
            try:
                response = await self._client.messages.create(**call_params)

                # Anthropic response content is a list of blocks
                content = ""
                for block in response.content:
                    if block.type == "text":
                        content += block.text

                if response_format is not None:
                    # Models may wrap JSON in markdown code blocks
                    clean_content = content
                    if "```json" in content:
                        clean_content = content.split("```json")[1].split("```")[0].strip()
                    elif "```" in content:
                        clean_content = content.split("```")[1].split("```")[0].strip()

                    try:
                        json_data = json.loads(clean_content)
                    except json.JSONDecodeError:
                        # Fallback to parsing raw content if markdown stripping failed
                        json_data = json.loads(content)

                    if skip_validation:
                        result = json_data
                    else:
                        result = response_format.model_validate(json_data)
                else:
                    result = content

                # Record metrics and log slow calls
                duration = time.time() - start_time
                input_tokens = response.usage.input_tokens or 0 if response.usage else 0
                output_tokens = response.usage.output_tokens or 0 if response.usage else 0
                total_tokens = input_tokens + output_tokens

                # Record LLM metrics
                metrics = get_metrics_collector()
                metrics.record_llm_call(
                    provider=self.provider,
                    model=self.model,
                    scope=scope,
                    duration=duration,
                    input_tokens=input_tokens,
                    output_tokens=output_tokens,
                    success=True,
                )

                # Log slow calls
                if duration > 10.0:
                    logger.info(
                        f"slow llm call: scope={scope}, model={self.provider}/{self.model}, "
                        f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
                        f"time={duration:.3f}s"
                    )

                if return_usage:
                    token_usage = TokenUsage(
                        input_tokens=input_tokens,
                        output_tokens=output_tokens,
                        total_tokens=total_tokens,
                    )
                    return result, token_usage
                return result

            except json.JSONDecodeError as e:
                last_exception = e
                if attempt < max_retries:
                    logger.warning("Anthropic returned invalid JSON, retrying...")
                    backoff = min(initial_backoff * (2**attempt), max_backoff)
                    await asyncio.sleep(backoff)
                    continue
                else:
                    logger.error(f"Anthropic returned invalid JSON after {max_retries + 1} attempts")
                    raise

            except (APIConnectionError, RateLimitError, APIStatusError) as e:
                # Fast fail on 401/403
                if isinstance(e, APIStatusError) and e.status_code in (401, 403):
                    logger.error(f"Anthropic auth error (HTTP {e.status_code}), not retrying: {str(e)}")
                    raise

                last_exception = e
                if attempt < max_retries:
                    # Check if it's a rate limit or server error
                    should_retry = isinstance(e, (APIConnectionError, RateLimitError)) or (
                        isinstance(e, APIStatusError) and e.status_code >= 500
                    )

                    if should_retry:
                        backoff = min(initial_backoff * (2**attempt), max_backoff)
                        jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
                        await asyncio.sleep(backoff + jitter)
                        continue

                logger.error(f"Anthropic API error after {max_retries + 1} attempts: {str(e)}")
                raise

            except Exception as e:
                logger.error(f"Unexpected error during Anthropic call: {type(e).__name__}: {str(e)}")
                raise

        if last_exception:
            raise last_exception
        raise RuntimeError("Anthropic call failed after all retries")

    async def call_with_tools(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]],
        max_completion_tokens: int | None = None,
        temperature: float | None = None,
        scope: str = "tools",
        max_retries: int = 5,
        initial_backoff: float = 1.0,
        max_backoff: float = 30.0,
        tool_choice: str | dict[str, Any] = "auto",
    ) -> LLMToolCallResult:
        """
        Make an LLM API call with tool/function calling support.

        Args:
            messages: List of message dicts. Can include tool results with role='tool'.
            tools: List of tool definitions in OpenAI format.
            max_completion_tokens: Maximum tokens in response.
            temperature: Sampling temperature (0.0-2.0).
            scope: Scope identifier for tracking.
            max_retries: Maximum retry attempts.
            initial_backoff: Initial backoff time in seconds.
            max_backoff: Maximum backoff time in seconds.
            tool_choice: How to choose tools - "auto", "none", "required", or specific function.

        Returns:
            LLMToolCallResult with content and/or tool_calls.
        """
        from anthropic import APIConnectionError, APIStatusError

        start_time = time.time()

        # Convert OpenAI tool format to Anthropic format
        anthropic_tools = []
        for tool in tools:
            func = tool.get("function", {})
            anthropic_tools.append(
                {
                    "name": func.get("name", ""),
                    "description": func.get("description", ""),
                    "input_schema": func.get("parameters", {"type": "object", "properties": {}}),
                }
            )

        # Convert messages - handle tool results
        system_prompt = None
        anthropic_messages = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")

            if role == "system":
                system_prompt = (system_prompt + "\n\n" + content) if system_prompt else content
            elif role == "tool":
                # Anthropic uses tool_result blocks
                anthropic_messages.append(
                    {
                        "role": "user",
                        "content": [
                            {"type": "tool_result", "tool_use_id": msg.get("tool_call_id", ""), "content": content}
                        ],
                    }
                )
            elif role == "assistant" and msg.get("tool_calls"):
                # Convert assistant tool calls
                tool_use_blocks = []
                for tc in msg["tool_calls"]:
                    tool_use_blocks.append(
                        {
                            "type": "tool_use",
                            "id": tc.get("id", ""),
                            "name": tc.get("function", {}).get("name", ""),
                            "input": json.loads(tc.get("function", {}).get("arguments", "{}")),
                        }
                    )
                anthropic_messages.append({"role": "assistant", "content": tool_use_blocks})
            else:
                anthropic_messages.append({"role": role, "content": content})

        call_params: dict[str, Any] = {
            "model": self.model,
            "messages": anthropic_messages,
            "tools": anthropic_tools,
            "max_tokens": max_completion_tokens or 4096,
        }
        if system_prompt:
            call_params["system"] = system_prompt

        if temperature is not None:
            call_params["temperature"] = temperature

        last_exception = None
        for attempt in range(max_retries + 1):
            try:
                response = await self._client.messages.create(**call_params)

                # Extract content and tool calls
                content_parts = []
                tool_calls: list[LLMToolCall] = []

                for block in response.content:
                    if block.type == "text":
                        content_parts.append(block.text)
                    elif block.type == "tool_use":
                        tool_calls.append(LLMToolCall(id=block.id, name=block.name, arguments=block.input or {}))

                content = "".join(content_parts) if content_parts else None
                finish_reason = "tool_calls" if tool_calls else "stop"

                # Extract token usage
                input_tokens = response.usage.input_tokens or 0
                output_tokens = response.usage.output_tokens or 0

                # Record metrics
                metrics = get_metrics_collector()
                metrics.record_llm_call(
                    provider=self.provider,
                    model=self.model,
                    scope=scope,
                    duration=time.time() - start_time,
                    input_tokens=input_tokens,
                    output_tokens=output_tokens,
                    success=True,
                )

                return LLMToolCallResult(
                    content=content,
                    tool_calls=tool_calls,
                    finish_reason=finish_reason,
                    input_tokens=input_tokens,
                    output_tokens=output_tokens,
                )

            except (APIConnectionError, APIStatusError) as e:
                if isinstance(e, APIStatusError) and e.status_code in (401, 403):
                    raise
                last_exception = e
                if attempt < max_retries:
                    await asyncio.sleep(min(initial_backoff * (2**attempt), max_backoff))
                    continue
                raise

        if last_exception:
            raise last_exception
        raise RuntimeError("Anthropic tool call failed")

    async def cleanup(self) -> None:
        """Clean up resources (close Anthropic client connections)."""
        if hasattr(self, "_client") and self._client:
            await self._client.close()
````
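A usage sketch based on the signatures above; the Pydantic model, model name, and key are placeholders, not values from the diff:

```python
import asyncio
from pydantic import BaseModel

from hindsight_api.engine.providers import AnthropicLLM

class Fact(BaseModel):
    subject: str
    claim: str

async def main() -> None:
    llm = AnthropicLLM(
        provider="anthropic",
        api_key="sk-ant-...",          # placeholder; a real key is required
        base_url="",                   # empty -> Anthropic default endpoint
        model="claude-sonnet-4-20250514",
    )
    await llm.verify_connection()
    # With return_usage=True, call() returns a (result, TokenUsage) tuple;
    # response_format triggers the JSON-schema prompt and Pydantic validation.
    fact, usage = await llm.call(
        messages=[{"role": "user", "content": "State one fact about PostgreSQL."}],
        response_format=Fact,
        return_usage=True,
    )
    print(fact, usage)
    await llm.cleanup()

asyncio.run(main())
```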