hindsight-api 0.0.21__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +10 -2
- hindsight_api/alembic/README +1 -0
- hindsight_api/alembic/env.py +146 -0
- hindsight_api/alembic/script.py.mako +28 -0
- hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +274 -0
- hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +70 -0
- hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +39 -0
- hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +48 -0
- hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +62 -0
- hindsight_api/alembic/versions/rename_personality_to_disposition.py +65 -0
- hindsight_api/api/__init__.py +2 -4
- hindsight_api/api/http.py +112 -164
- hindsight_api/api/mcp.py +2 -1
- hindsight_api/config.py +154 -0
- hindsight_api/engine/__init__.py +7 -2
- hindsight_api/engine/cross_encoder.py +225 -16
- hindsight_api/engine/embeddings.py +198 -19
- hindsight_api/engine/entity_resolver.py +56 -29
- hindsight_api/engine/llm_wrapper.py +147 -106
- hindsight_api/engine/memory_engine.py +337 -192
- hindsight_api/engine/response_models.py +15 -17
- hindsight_api/engine/retain/bank_utils.py +25 -35
- hindsight_api/engine/retain/entity_processing.py +5 -5
- hindsight_api/engine/retain/fact_extraction.py +86 -24
- hindsight_api/engine/retain/fact_storage.py +1 -1
- hindsight_api/engine/retain/link_creation.py +12 -6
- hindsight_api/engine/retain/link_utils.py +50 -56
- hindsight_api/engine/retain/observation_regeneration.py +264 -0
- hindsight_api/engine/retain/orchestrator.py +31 -44
- hindsight_api/engine/retain/types.py +14 -0
- hindsight_api/engine/search/reranking.py +6 -10
- hindsight_api/engine/search/retrieval.py +2 -2
- hindsight_api/engine/search/think_utils.py +59 -30
- hindsight_api/engine/search/tracer.py +1 -1
- hindsight_api/main.py +201 -0
- hindsight_api/migrations.py +61 -39
- hindsight_api/models.py +1 -2
- hindsight_api/pg0.py +17 -36
- hindsight_api/server.py +43 -0
- {hindsight_api-0.0.21.dist-info → hindsight_api-0.1.1.dist-info}/METADATA +2 -3
- hindsight_api-0.1.1.dist-info/RECORD +60 -0
- hindsight_api-0.1.1.dist-info/entry_points.txt +2 -0
- hindsight_api/cli.py +0 -128
- hindsight_api/web/__init__.py +0 -12
- hindsight_api/web/server.py +0 -109
- hindsight_api-0.0.21.dist-info/RECORD +0 -50
- hindsight_api-0.0.21.dist-info/entry_points.txt +0 -2
- {hindsight_api-0.0.21.dist-info → hindsight_api-0.1.1.dist-info}/WHEEL +0 -0
@@ -5,12 +5,15 @@ import os
 import time
 import asyncio
 from typing import Optional, Any, Dict, List
-from openai import AsyncOpenAI, RateLimitError, APIError, APIStatusError, LengthFinishReasonError
+from openai import AsyncOpenAI, RateLimitError, APIError, APIStatusError, APIConnectionError, LengthFinishReasonError
 from google import genai
 from google.genai import types as genai_types
 from google.genai import errors as genai_errors
 import logging
 
+# Seed applied to every Groq request for deterministic behavior.
+DEFAULT_LLM_SEED = 4242
+
 logger = logging.getLogger(__name__)
 
 # Disable httpx logging
@@ -31,8 +34,12 @@ class OutputTooLongError(Exception):
     pass
 
 
-class LLMConfig:
-    """
+class LLMProvider:
+    """
+    Unified LLM provider.
+
+    Supports OpenAI, Groq, Ollama (OpenAI-compatible), and Gemini.
+    """
 
     def __init__(
         self,
@@ -40,25 +47,29 @@ class LLMConfig:
         api_key: str,
         base_url: str,
         model: str,
+        reasoning_effort: str = "low",
     ):
         """
-        Initialize LLM
+        Initialize LLM provider.
 
         Args:
-            provider: Provider name ("openai", "groq", "ollama").
-            api_key: API key.
-            base_url: Base URL
-            model: Model name.
+            provider: Provider name ("openai", "groq", "ollama", "gemini").
+            api_key: API key.
+            base_url: Base URL for the API.
+            model: Model name.
+            reasoning_effort: Reasoning effort level for supported providers.
         """
         self.provider = provider.lower()
         self.api_key = api_key
         self.base_url = base_url
         self.model = model
+        self.reasoning_effort = reasoning_effort
 
         # Validate provider
-
+        valid_providers = ["openai", "groq", "ollama", "gemini"]
+        if self.provider not in valid_providers:
             raise ValueError(
-                f"Invalid LLM provider: {self.provider}. Must be
+                f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}"
             )
 
         # Set default base URLs
@@ -69,24 +80,18 @@ class LLMConfig:
             self.base_url = "http://localhost:11434/v1"
 
         # Validate API key (not needed for ollama)
-        if self.provider
-            raise ValueError(
-                f"API key not found for {self.provider}"
-            )
+        if self.provider != "ollama" and not self.api_key:
+            raise ValueError(f"API key not found for {self.provider}")
 
-        # Create client
-        # Disable automatic retries - we handle retries in the call() method
+        # Create client based on provider
         if self.provider == "gemini":
             self._gemini_client = genai.Client(api_key=self.api_key)
-            self._client = None
+            self._client = None
         elif self.provider == "ollama":
             self._client = AsyncOpenAI(api_key="ollama", base_url=self.base_url, max_retries=0)
             self._gemini_client = None
-        elif self.base_url:
-            self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)
-            self._gemini_client = None
         else:
-            self._client = AsyncOpenAI(api_key=self.api_key, max_retries=0)
+            self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)
             self._gemini_client = None
 
         logger.info(
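The constructor above now funnels every OpenAI-compatible provider through a single `AsyncOpenAI(...)` call and keeps a separate native client only for Gemini. A minimal sketch of that selection logic, under the assumption that `base_url` has already been defaulted per provider (the helper name `build_clients` is illustrative, not part of the module):

```python
from openai import AsyncOpenAI
from google import genai


def build_clients(provider: str, api_key: str, base_url: str):
    """Mirror of the provider -> client mapping in __init__ above (sketch only)."""
    if provider == "gemini":
        # Native Gemini SDK client; no OpenAI-compatible client is created.
        return None, genai.Client(api_key=api_key)
    if provider == "ollama":
        # Ollama ignores the key, but the OpenAI client requires a non-empty one.
        return AsyncOpenAI(api_key="ollama", base_url=base_url, max_retries=0), None
    # openai / groq / any other OpenAI-compatible endpoint.
    # `or None` lets the SDK fall back to its default URL if base_url is empty.
    return AsyncOpenAI(api_key=api_key, base_url=base_url or None, max_retries=0), None
```

`max_retries=0` matches the diff: the SDK's built-in retries are disabled because `call()` implements its own retry loop.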
@@ -97,125 +102,141 @@ class LLMConfig:
         self,
         messages: List[Dict[str, str]],
         response_format: Optional[Any] = None,
+        max_completion_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
         scope: str = "memory",
         max_retries: int = 10,
         initial_backoff: float = 1.0,
         max_backoff: float = 60.0,
         skip_validation: bool = False,
-        **kwargs
     ) -> Any:
         """
-        Make an LLM API call with
+        Make an LLM API call with retry logic.
 
         Args:
-            messages: List of message dicts with 'role' and 'content'
-            response_format: Optional Pydantic model for structured output
-
-
-
-
-
+            messages: List of message dicts with 'role' and 'content'.
+            response_format: Optional Pydantic model for structured output.
+            max_completion_tokens: Maximum tokens in response.
+            temperature: Sampling temperature (0.0-2.0).
+            scope: Scope identifier for tracking.
+            max_retries: Maximum retry attempts.
+            initial_backoff: Initial backoff time in seconds.
+            max_backoff: Maximum backoff time in seconds.
+            skip_validation: Return raw JSON without Pydantic validation.
 
         Returns:
-            Parsed response if response_format is provided, otherwise
+            Parsed response if response_format is provided, otherwise text content.
 
         Raises:
-
+            OutputTooLongError: If output exceeds token limits.
+            Exception: Re-raises API errors after retries exhausted.
         """
-        # Use global semaphore to limit concurrent requests
         async with _global_llm_semaphore:
             start_time = time.time()
             import json
 
             # Handle Gemini provider separately
             if self.provider == "gemini":
-                return await self._call_gemini(
+                return await self._call_gemini(
+                    messages, response_format, max_retries, initial_backoff,
+                    max_backoff, skip_validation, start_time
+                )
 
             call_params = {
                 "model": self.model,
                 "messages": messages,
-                **kwargs
             }
+
+            if max_completion_tokens is not None:
+                call_params["max_completion_tokens"] = max_completion_tokens
+            if temperature is not None:
+                call_params["temperature"] = temperature
+
+            # Provider-specific parameters
             if self.provider == "groq":
+                call_params["seed"] = DEFAULT_LLM_SEED
                 call_params["extra_body"] = {
                     "service_tier": "auto",
-                    "reasoning_effort":
-                    "include_reasoning": False,
+                    "reasoning_effort": self.reasoning_effort,
+                    "include_reasoning": False,
                 }
 
             last_exception = None
 
             for attempt in range(max_retries + 1):
                 try:
-                    # Use the appropriate response format
                     if response_format is not None:
-                        #
-                        # This allows the LLM to omit optional fields without validation errors
-
-                        # Add schema to the system message
+                        # Add schema to system message for JSON mode
                         if hasattr(response_format, 'model_json_schema'):
                             schema = response_format.model_json_schema()
                             schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
 
-                            # Add schema to the system message if present, otherwise prepend as user message
                             if call_params['messages'] and call_params['messages'][0].get('role') == 'system':
                                 call_params['messages'][0]['content'] += schema_msg
-
-
-                            if call_params['messages']:
-                                call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']
+                            elif call_params['messages']:
+                                call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']
 
                         call_params['response_format'] = {"type": "json_object"}
                         response = await self._client.chat.completions.create(**call_params)
 
-                        # Parse the JSON response
                        content = response.choices[0].message.content
                        json_data = json.loads(content)
 
-                        # Return raw JSON if skip_validation is True, otherwise validate with Pydantic
                         if skip_validation:
                             result = json_data
                         else:
                             result = response_format.model_validate(json_data)
                     else:
-                        # Standard completion and return text content
                         response = await self._client.chat.completions.create(**call_params)
                         result = response.choices[0].message.content
 
-                    # Log
+                    # Log slow calls
                     duration = time.time() - start_time
                     usage = response.usage
                     if duration > 10.0:
                         ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
+                        cached_tokens = 0
+                        if hasattr(usage, 'prompt_tokens_details') and usage.prompt_tokens_details:
+                            cached_tokens = getattr(usage.prompt_tokens_details, 'cached_tokens', 0) or 0
+                        cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
                         logger.info(
                             f"slow llm call: model={self.provider}/{self.model}, "
                             f"input_tokens={usage.prompt_tokens}, output_tokens={usage.completion_tokens}, "
-                            f"total_tokens={usage.total_tokens}, time={duration:.3f}s, ratio out/in={ratio:.2f}"
+                            f"total_tokens={usage.total_tokens}{cache_info}, time={duration:.3f}s, ratio out/in={ratio:.2f}"
                         )
 
                     return result
 
                 except LengthFinishReasonError as e:
-                    # Output exceeded token limits - raise bridge exception for caller to handle
                     logger.warning(f"LLM output exceeded token limits: {str(e)}")
                     raise OutputTooLongError(
                         f"LLM output exceeded token limits. Input may need to be split into smaller chunks."
                     ) from e
 
+                except APIConnectionError as e:
+                    last_exception = e
+                    if attempt < max_retries:
+                        logger.warning(f"Connection error, retrying... (attempt {attempt + 1}/{max_retries + 1})")
+                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+                        await asyncio.sleep(backoff)
+                        continue
+                    else:
+                        logger.error(f"Connection error after {max_retries + 1} attempts: {str(e)}")
+                        raise
+
                 except APIStatusError as e:
+                    # Fast fail on 4xx client errors (except 429 rate limit and 498 which is treated as server error)
+                    if 400 <= e.status_code < 500 and e.status_code not in (429, 498):
+                        logger.error(f"Client error (HTTP {e.status_code}), not retrying: {str(e)}")
+                        raise
+
                     last_exception = e
                     if attempt < max_retries:
-                        # Calculate exponential backoff with jitter
                         backoff = min(initial_backoff * (2 ** attempt), max_backoff)
-                        # Add jitter (±20%)
                         jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
                         sleep_time = backoff + jitter
-
-                        # Only log if it's a non-retryable error or final attempt
-                        # Silent retry for common transient errors like capacity exceeded
                         await asyncio.sleep(sleep_time)
                     else:
-                        # Log only on final failed attempt
                         logger.error(f"API error after {max_retries + 1} attempts: {str(e)}")
                         raise
 
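The retry paths above all share the same capped exponential backoff, and the `APIStatusError` path additionally adds roughly ±20% jitter derived from the fractional part of `time.time()`. A self-contained sketch of that sleep-time calculation (the function name is mine, not from the module):

```python
import time


def retry_sleep_time(attempt: int, initial_backoff: float = 1.0, max_backoff: float = 60.0) -> float:
    """Capped exponential backoff with roughly +/-20% jitter, as used between retries above."""
    backoff = min(initial_backoff * (2 ** attempt), max_backoff)
    # time.time() % 1 lies in [0, 1), so the jitter term spans about [-0.2, +0.2] * backoff.
    jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
    return backoff + jitter


# With the defaults: attempt 0 -> ~1s, attempt 3 -> ~8s, attempt 6+ -> capped near 60s (each +/-20%).
```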
@@ -223,7 +244,6 @@ class LLMConfig:
                     logger.error(f"Unexpected error during LLM call: {type(e).__name__}: {str(e)}")
                     raise
 
-            # This should never be reached, but just in case
             if last_exception:
                 raise last_exception
             raise RuntimeError(f"LLM call failed after all retries with no exception captured")
@@ -237,13 +257,11 @@ class LLMConfig:
         max_backoff: float,
         skip_validation: bool,
         start_time: float,
-        **kwargs
     ) -> Any:
-        """Handle Gemini-specific API calls
+        """Handle Gemini-specific API calls."""
         import json
 
         # Convert OpenAI-style messages to Gemini format
-        # Gemini uses 'user' and 'model' roles, and system instructions are separate
         system_instruction = None
         gemini_contents = []
 
@@ -252,7 +270,6 @@ class LLMConfig:
             content = msg.get('content', '')
 
             if role == 'system':
-                # Accumulate system messages as system instruction
                 if system_instruction:
                     system_instruction += "\n\n" + content
                 else:
@@ -262,7 +279,7 @@ class LLMConfig:
                     role="model",
                     parts=[genai_types.Part(text=content)]
                 ))
-            else:
+            else:
                 gemini_contents.append(genai_types.Content(
                     role="user",
                     parts=[genai_types.Part(text=content)]
@@ -281,12 +298,9 @@ class LLMConfig:
         config_kwargs = {}
         if system_instruction:
             config_kwargs['system_instruction'] = system_instruction
-        if 'temperature' in kwargs:
-            config_kwargs['temperature'] = kwargs['temperature']
-        if 'max_tokens' in kwargs:
-            config_kwargs['max_output_tokens'] = kwargs['max_tokens']
         if response_format is not None:
             config_kwargs['response_mime_type'] = 'application/json'
+            config_kwargs['response_schema'] = response_format
 
         generation_config = genai_types.GenerateContentConfig(**config_kwargs) if config_kwargs else None
 
@@ -302,11 +316,24 @@ class LLMConfig:
 
                 content = response.text
 
+                # Handle empty response
+                if content is None:
+                    block_reason = None
+                    if hasattr(response, 'candidates') and response.candidates:
+                        candidate = response.candidates[0]
+                        if hasattr(candidate, 'finish_reason'):
+                            block_reason = candidate.finish_reason
+
+                    if attempt < max_retries:
+                        logger.warning(f"Gemini returned empty response (reason: {block_reason}), retrying...")
+                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+                        await asyncio.sleep(backoff)
+                        continue
+                    else:
+                        raise RuntimeError(f"Gemini returned empty response after {max_retries + 1} attempts")
+
                 if response_format is not None:
-                    # Parse the JSON response
                     json_data = json.loads(content)
-
-                    # Return raw JSON if skip_validation is True, otherwise validate with Pydantic
                     if skip_validation:
                         result = json_data
                     else:
@@ -314,7 +341,7 @@ class LLMConfig:
                 else:
                     result = content
 
-                # Log
+                # Log slow calls
                 duration = time.time() - start_time
                 if duration > 10.0 and hasattr(response, 'usage_metadata') and response.usage_metadata:
                     usage = response.usage_metadata
@@ -326,15 +353,30 @@ class LLMConfig:
 
                 return result
 
+            except json.JSONDecodeError as e:
+                last_exception = e
+                if attempt < max_retries:
+                    logger.warning(f"Gemini returned invalid JSON, retrying...")
+                    backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+                    await asyncio.sleep(backoff)
+                    continue
+                else:
+                    logger.error(f"Gemini returned invalid JSON after {max_retries + 1} attempts")
+                    raise
+
             except genai_errors.APIError as e:
-                #
-                if e.code
+                # Fast fail on 4xx client errors (except 429 rate limit)
+                if e.code and 400 <= e.code < 500 and e.code != 429:
+                    logger.error(f"Gemini client error (HTTP {e.code}), not retrying: {str(e)}")
+                    raise
+
+                # Retry on 429 and 5xx
+                if e.code in (429, 500, 502, 503, 504):
                     last_exception = e
                     if attempt < max_retries:
                         backoff = min(initial_backoff * (2 ** attempt), max_backoff)
                         jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
-
-                        await asyncio.sleep(sleep_time)
+                        await asyncio.sleep(backoff + jitter)
                     else:
                         logger.error(f"Gemini API error after {max_retries + 1} attempts: {str(e)}")
                         raise
@@ -348,57 +390,56 @@ class LLMConfig:
 
         if last_exception:
             raise last_exception
-        raise RuntimeError(f"Gemini call failed after all retries
+        raise RuntimeError(f"Gemini call failed after all retries")
 
     @classmethod
-    def for_memory(cls) -> "
-        """Create
+    def for_memory(cls) -> "LLMProvider":
+        """Create provider for memory operations from environment variables."""
         provider = os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq")
         api_key = os.getenv("HINDSIGHT_API_LLM_API_KEY")
-        base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL")
+        base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL", "")
         model = os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b")
 
-        # Set default base URL if not provided
-        if not base_url:
-            if provider == "groq":
-                base_url = "https://api.groq.com/openai/v1"
-            elif provider == "ollama":
-                base_url = "http://localhost:11434/v1"
-            else:
-                base_url = ""
-
         return cls(
             provider=provider,
             api_key=api_key,
             base_url=base_url,
             model=model,
+            reasoning_effort="low"
         )
 
     @classmethod
-    def
-        """
-
+    def for_answer_generation(cls) -> "LLMProvider":
+        """Create provider for answer generation. Falls back to memory config if not set."""
+        provider = os.getenv("HINDSIGHT_API_ANSWER_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
+        api_key = os.getenv("HINDSIGHT_API_ANSWER_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
+        base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
+        model = os.getenv("HINDSIGHT_API_ANSWER_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))
 
-
-
-
+        return cls(
+            provider=provider,
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+            reasoning_effort="high"
+        )
+
+    @classmethod
+    def for_judge(cls) -> "LLMProvider":
+        """Create provider for judge/evaluator operations. Falls back to memory config if not set."""
         provider = os.getenv("HINDSIGHT_API_JUDGE_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
         api_key = os.getenv("HINDSIGHT_API_JUDGE_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
-        base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL"))
+        base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
         model = os.getenv("HINDSIGHT_API_JUDGE_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))
 
-        # Set default base URL if not provided
-        if not base_url:
-            if provider == "groq":
-                base_url = "https://api.groq.com/openai/v1"
-            elif provider == "ollama":
-                base_url = "http://localhost:11434/v1"
-            else:
-                base_url = ""
-
         return cls(
             provider=provider,
             api_key=api_key,
             base_url=base_url,
             model=model,
+            reasoning_effort="high"
        )
+
+
+# Backwards compatibility alias
+LLMConfig = LLMProvider