hindsight-api 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +10 -2
- hindsight_api/alembic/README +1 -0
- hindsight_api/alembic/env.py +146 -0
- hindsight_api/alembic/script.py.mako +28 -0
- hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +274 -0
- hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +70 -0
- hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +39 -0
- hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +48 -0
- hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +62 -0
- hindsight_api/alembic/versions/rename_personality_to_disposition.py +65 -0
- hindsight_api/api/http.py +84 -86
- hindsight_api/config.py +154 -0
- hindsight_api/engine/__init__.py +7 -2
- hindsight_api/engine/cross_encoder.py +219 -15
- hindsight_api/engine/embeddings.py +192 -18
- hindsight_api/engine/llm_wrapper.py +88 -139
- hindsight_api/engine/memory_engine.py +71 -51
- hindsight_api/engine/retain/bank_utils.py +2 -2
- hindsight_api/engine/retain/fact_extraction.py +1 -1
- hindsight_api/engine/search/reranking.py +6 -10
- hindsight_api/engine/search/tracer.py +1 -1
- hindsight_api/main.py +201 -0
- hindsight_api/migrations.py +7 -7
- hindsight_api/server.py +43 -0
- {hindsight_api-0.1.0.dist-info → hindsight_api-0.1.1.dist-info}/METADATA +1 -1
- {hindsight_api-0.1.0.dist-info → hindsight_api-0.1.1.dist-info}/RECORD +28 -19
- hindsight_api-0.1.1.dist-info/entry_points.txt +2 -0
- hindsight_api/cli.py +0 -127
- hindsight_api/web/__init__.py +0 -12
- hindsight_api/web/server.py +0 -109
- hindsight_api-0.1.0.dist-info/entry_points.txt +0 -2
- {hindsight_api-0.1.0.dist-info → hindsight_api-0.1.1.dist-info}/WHEEL +0 -0
hindsight_api/engine/llm_wrapper.py

@@ -34,8 +34,12 @@ class OutputTooLongError(Exception):
     pass


-class LLMConfig:
-    """
+class LLMProvider:
+    """
+    Unified LLM provider.
+
+    Supports OpenAI, Groq, Ollama (OpenAI-compatible), and Gemini.
+    """

     def __init__(
         self,
@@ -43,16 +47,17 @@ class LLMConfig:
         api_key: str,
         base_url: str,
         model: str,
-
+        reasoning_effort: str = "low",
     ):
         """
-        Initialize LLM
+        Initialize LLM provider.

         Args:
-            provider: Provider name ("openai", "groq", "ollama").
-            api_key: API key.
-            base_url: Base URL
-            model: Model name.
+            provider: Provider name ("openai", "groq", "ollama", "gemini").
+            api_key: API key.
+            base_url: Base URL for the API.
+            model: Model name.
+            reasoning_effort: Reasoning effort level for supported providers.
         """
         self.provider = provider.lower()
         self.api_key = api_key
@@ -61,9 +66,10 @@ class LLMConfig:
         self.reasoning_effort = reasoning_effort

         # Validate provider
-
+        valid_providers = ["openai", "groq", "ollama", "gemini"]
+        if self.provider not in valid_providers:
             raise ValueError(
-                f"Invalid LLM provider: {self.provider}. Must be
+                f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}"
             )

         # Set default base URLs
@@ -74,24 +80,18 @@ class LLMConfig:
             self.base_url = "http://localhost:11434/v1"

         # Validate API key (not needed for ollama)
-        if self.provider
-            raise ValueError(
-                f"API key not found for {self.provider}"
-            )
+        if self.provider != "ollama" and not self.api_key:
+            raise ValueError(f"API key not found for {self.provider}")

-        # Create client
-        # Disable automatic retries - we handle retries in the call() method
+        # Create client based on provider
         if self.provider == "gemini":
             self._gemini_client = genai.Client(api_key=self.api_key)
-            self._client = None
+            self._client = None
         elif self.provider == "ollama":
             self._client = AsyncOpenAI(api_key="ollama", base_url=self.base_url, max_retries=0)
             self._gemini_client = None
-        elif self.base_url:
-            self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)
-            self._gemini_client = None
         else:
-            self._client = AsyncOpenAI(api_key=self.api_key, max_retries=0)
+            self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)
             self._gemini_client = None

         logger.info(
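
Taken together, the constructor changes mean the provider name is now validated against an explicit list and the OpenAI-compatible client is always built with an explicit base_url. A minimal usage sketch against the new `__init__` signature; the argument values below are placeholders, only the parameter names, defaults, and the groq base URL come from this diff:

```python
from hindsight_api.engine.llm_wrapper import LLMProvider

provider = LLMProvider(
    provider="groq",
    api_key="gsk_example",                      # required for every provider except "ollama"
    base_url="https://api.groq.com/openai/v1",  # per-provider default applied when empty
    model="openai/gpt-oss-120b",
    reasoning_effort="low",                     # new parameter in 0.1.1
)

# An unknown provider now fails fast:
# LLMProvider(provider="bedrock", ...) raises
# ValueError: Invalid LLM provider: bedrock. Must be one of: openai, groq, ollama, gemini
```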
@@ -102,101 +102,99 @@ class LLMConfig:
         self,
         messages: List[Dict[str, str]],
         response_format: Optional[Any] = None,
+        max_completion_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
         scope: str = "memory",
         max_retries: int = 10,
         initial_backoff: float = 1.0,
         max_backoff: float = 60.0,
         skip_validation: bool = False,
-        **kwargs
     ) -> Any:
         """
-        Make an LLM API call with
+        Make an LLM API call with retry logic.

         Args:
-            messages: List of message dicts with 'role' and 'content'
-            response_format: Optional Pydantic model for structured output
-
-
-
-
-
+            messages: List of message dicts with 'role' and 'content'.
+            response_format: Optional Pydantic model for structured output.
+            max_completion_tokens: Maximum tokens in response.
+            temperature: Sampling temperature (0.0-2.0).
+            scope: Scope identifier for tracking.
+            max_retries: Maximum retry attempts.
+            initial_backoff: Initial backoff time in seconds.
+            max_backoff: Maximum backoff time in seconds.
+            skip_validation: Return raw JSON without Pydantic validation.

         Returns:
-            Parsed response if response_format is provided, otherwise
+            Parsed response if response_format is provided, otherwise text content.

         Raises:
-
+            OutputTooLongError: If output exceeds token limits.
+            Exception: Re-raises API errors after retries exhausted.
         """
-        # Use global semaphore to limit concurrent requests
         async with _global_llm_semaphore:
             start_time = time.time()
             import json

             # Handle Gemini provider separately
             if self.provider == "gemini":
-                return await self._call_gemini(
+                return await self._call_gemini(
+                    messages, response_format, max_retries, initial_backoff,
+                    max_backoff, skip_validation, start_time
+                )

             call_params = {
                 "model": self.model,
                 "messages": messages,
-                **kwargs
             }

+            if max_completion_tokens is not None:
+                call_params["max_completion_tokens"] = max_completion_tokens
+            if temperature is not None:
+                call_params["temperature"] = temperature
+
+            # Provider-specific parameters
             if self.provider == "groq":
                 call_params["seed"] = DEFAULT_LLM_SEED
-
-            if self.provider == "groq":
                 call_params["extra_body"] = {
                     "service_tier": "auto",
                     "reasoning_effort": self.reasoning_effort,
-                    "include_reasoning": False,
+                    "include_reasoning": False,
                 }

             last_exception = None

             for attempt in range(max_retries + 1):
                 try:
-                    # Use the appropriate response format
                     if response_format is not None:
-                        #
-                        # This allows the LLM to omit optional fields without validation errors
-
-                        # Add schema to the system message
+                        # Add schema to system message for JSON mode
                         if hasattr(response_format, 'model_json_schema'):
                             schema = response_format.model_json_schema()
                             schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"

-                            # Add schema to the system message if present, otherwise prepend as user message
                             if call_params['messages'] and call_params['messages'][0].get('role') == 'system':
                                 call_params['messages'][0]['content'] += schema_msg
-
-
-                            if call_params['messages']:
-                                call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']
+                            elif call_params['messages']:
+                                call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']

                         call_params['response_format'] = {"type": "json_object"}
                         response = await self._client.chat.completions.create(**call_params)

-                        # Parse the JSON response
                         content = response.choices[0].message.content
                         json_data = json.loads(content)

-                        # Return raw JSON if skip_validation is True, otherwise validate with Pydantic
                         if skip_validation:
                             result = json_data
                         else:
                             result = response_format.model_validate(json_data)
                     else:
-                        # Standard completion and return text content
                         response = await self._client.chat.completions.create(**call_params)
                         result = response.choices[0].message.content

-                    # Log
+                    # Log slow calls
                     duration = time.time() - start_time
                     usage = response.usage
                     if duration > 10.0:
                         ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
-                        # Check for cached tokens (OpenAI/Groq may include this)
                         cached_tokens = 0
                         if hasattr(usage, 'prompt_tokens_details') and usage.prompt_tokens_details:
                             cached_tokens = getattr(usage.prompt_tokens_details, 'cached_tokens', 0) or 0
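
The `**kwargs` pass-through is gone: callers now pass `max_completion_tokens` and `temperature` explicitly, and structured output is requested with a Pydantic model whose schema is appended to the system message. A hedged sketch of a caller under the new signature; the `Facts` model, message text, and values are illustrative, only the parameter names come from the diff:

```python
from typing import List
from pydantic import BaseModel

class Facts(BaseModel):
    # Illustrative response model; not part of the package.
    facts: List[str]

async def extract_facts(provider) -> Facts:
    # response_format enables JSON mode: the schema is appended to the system
    # message and the reply is validated with Facts.model_validate().
    return await provider.call(
        messages=[
            {"role": "system", "content": "Extract facts from the user's message."},
            {"role": "user", "content": "Alice moved to Berlin in 2021."},
        ],
        response_format=Facts,
        max_completion_tokens=512,   # explicit in 0.1.1, previously passed via **kwargs
        temperature=0.0,
        skip_validation=False,       # True would return the raw JSON dict instead
    )
```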
@@ -210,14 +208,12 @@ class LLMConfig:
                    return result

                except LengthFinishReasonError as e:
-                    # Output exceeded token limits - raise bridge exception for caller to handle
                    logger.warning(f"LLM output exceeded token limits: {str(e)}")
                    raise OutputTooLongError(
                        f"LLM output exceeded token limits. Input may need to be split into smaller chunks."
                    ) from e

                except APIConnectionError as e:
-                    # Handle connection errors (server disconnected, network issues) with retry
                    last_exception = e
                    if attempt < max_retries:
                        logger.warning(f"Connection error, retrying... (attempt {attempt + 1}/{max_retries + 1})")
@@ -229,19 +225,18 @@ class LLMConfig:
                        raise

                except APIStatusError as e:
+                    # Fast fail on 4xx client errors (except 429 rate limit and 498 which is treated as server error)
+                    if 400 <= e.status_code < 500 and e.status_code not in (429, 498):
+                        logger.error(f"Client error (HTTP {e.status_code}), not retrying: {str(e)}")
+                        raise
+
                    last_exception = e
                    if attempt < max_retries:
-                        # Calculate exponential backoff with jitter
                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
-                        # Add jitter (±20%)
                        jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
                        sleep_time = backoff + jitter
-
-                        # Only log if it's a non-retryable error or final attempt
-                        # Silent retry for common transient errors like capacity exceeded
                        await asyncio.sleep(sleep_time)
                    else:
-                        # Log only on final failed attempt
                        logger.error(f"API error after {max_retries + 1} attempts: {str(e)}")
                        raise

@@ -249,7 +244,6 @@ class LLMConfig:
                    logger.error(f"Unexpected error during LLM call: {type(e).__name__}: {str(e)}")
                    raise

-            # This should never be reached, but just in case
            if last_exception:
                raise last_exception
            raise RuntimeError(f"LLM call failed after all retries with no exception captured")
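
The retry policy is now easier to read off the diff: 4xx client errors fail immediately (except 429, and 498 which the code treats as a server-side error), while everything else retries with exponential backoff and roughly ±20% jitter. A standalone sketch of that policy using only the formulas visible above; the function names are illustrative:

```python
import asyncio
import time

def should_retry(status_code: int) -> bool:
    # Mirrors the new APIStatusError handling: fast-fail on 4xx,
    # except 429 (rate limit) and 498 (treated as a server-side error).
    if 400 <= status_code < 500 and status_code not in (429, 498):
        return False
    return True

async def backoff_sleep(attempt: int, initial_backoff: float = 1.0, max_backoff: float = 60.0) -> None:
    # Exponential backoff capped at max_backoff, with +/-20% jitter,
    # matching backoff = min(initial_backoff * 2**attempt, max_backoff).
    backoff = min(initial_backoff * (2 ** attempt), max_backoff)
    jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)  # same pseudo-jitter as the diff
    await asyncio.sleep(backoff + jitter)
```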
@@ -263,13 +257,11 @@ class LLMConfig:
        max_backoff: float,
        skip_validation: bool,
        start_time: float,
-
-
-        """Handle Gemini-specific API calls using google-genai SDK."""
+    ) -> Any:
+        """Handle Gemini-specific API calls."""
        import json

        # Convert OpenAI-style messages to Gemini format
-        # Gemini uses 'user' and 'model' roles, and system instructions are separate
        system_instruction = None
        gemini_contents = []

@@ -278,7 +270,6 @@ class LLMConfig:
            content = msg.get('content', '')

            if role == 'system':
-                # Accumulate system messages as system instruction
                if system_instruction:
                    system_instruction += "\n\n" + content
                else:
@@ -288,7 +279,7 @@ class LLMConfig:
                    role="model",
                    parts=[genai_types.Part(text=content)]
                ))
-            else:
+            else:
                gemini_contents.append(genai_types.Content(
                    role="user",
                    parts=[genai_types.Part(text=content)]
@@ -307,13 +298,8 @@ class LLMConfig:
        config_kwargs = {}
        if system_instruction:
            config_kwargs['system_instruction'] = system_instruction
-        if 'temperature' in kwargs:
-            config_kwargs['temperature'] = kwargs['temperature']
-        if 'max_tokens' in kwargs:
-            config_kwargs['max_output_tokens'] = kwargs['max_tokens']
        if response_format is not None:
            config_kwargs['response_mime_type'] = 'application/json'
-            # Pass the Pydantic model directly as response_schema for structured output
            config_kwargs['response_schema'] = response_format

        generation_config = genai_types.GenerateContentConfig(**config_kwargs) if config_kwargs else None
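
For Gemini, OpenAI-style messages are converted before the call: system messages are folded into a single system instruction, assistant-style messages map to role "model", everything else to role "user", and structured output is requested via `response_mime_type` plus `response_schema`. A hedged sketch of that conversion; the helper name and the `role == "assistant"` condition are assumptions (the branch condition is not shown in the diff), while the genai_types calls and config keys come from it:

```python
from google.genai import types as genai_types

def to_gemini(messages, response_format=None):
    # Illustrative helper modeled on _call_gemini's conversion logic.
    system_instruction = None
    contents = []
    for msg in messages:
        role, content = msg.get("role"), msg.get("content", "")
        if role == "system":
            # System messages accumulate into one system instruction.
            system_instruction = content if system_instruction is None else system_instruction + "\n\n" + content
        elif role == "assistant":
            contents.append(genai_types.Content(role="model", parts=[genai_types.Part(text=content)]))
        else:
            contents.append(genai_types.Content(role="user", parts=[genai_types.Part(text=content)]))

    config_kwargs = {}
    if system_instruction:
        config_kwargs["system_instruction"] = system_instruction
    if response_format is not None:
        # Structured output: the Pydantic model is passed directly as response_schema.
        config_kwargs["response_mime_type"] = "application/json"
        config_kwargs["response_schema"] = response_format
    config = genai_types.GenerateContentConfig(**config_kwargs) if config_kwargs else None
    return contents, config
```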
@@ -330,9 +316,8 @@ class LLMConfig:

                content = response.text

-                # Handle empty
+                # Handle empty response
                if content is None:
-                    # Check if there's a block reason
                    block_reason = None
                    if hasattr(response, 'candidates') and response.candidates:
                        candidate = response.candidates[0]
@@ -340,18 +325,15 @@ class LLMConfig:
                            block_reason = candidate.finish_reason

                    if attempt < max_retries:
-                        logger.warning(f"Gemini returned empty response (reason: {block_reason}), retrying...
+                        logger.warning(f"Gemini returned empty response (reason: {block_reason}), retrying...")
                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
                        await asyncio.sleep(backoff)
                        continue
                    else:
-                        raise RuntimeError(f"Gemini returned empty response after {max_retries + 1} attempts
+                        raise RuntimeError(f"Gemini returned empty response after {max_retries + 1} attempts")

                if response_format is not None:
-                    # Parse the JSON response
                    json_data = json.loads(content)
-
-                    # Return raw JSON if skip_validation is True, otherwise validate with Pydantic
                    if skip_validation:
                        result = json_data
                    else:
@@ -359,42 +341,42 @@ class LLMConfig:
                else:
                    result = content

-                # Log
+                # Log slow calls
                duration = time.time() - start_time
                if duration > 10.0 and hasattr(response, 'usage_metadata') and response.usage_metadata:
                    usage = response.usage_metadata
-                    # Check for cached tokens (Gemini uses cached_content_token_count)
-                    cached_tokens = getattr(usage, 'cached_content_token_count', 0) or 0
-                    cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
                    logger.info(
                        f"slow llm call: model={self.provider}/{self.model}, "
-                        f"input_tokens={usage.prompt_token_count}, output_tokens={usage.candidates_token_count}
+                        f"input_tokens={usage.prompt_token_count}, output_tokens={usage.candidates_token_count}, "
                        f"time={duration:.3f}s"
                    )

                return result

            except json.JSONDecodeError as e:
-                # Handle truncated JSON responses (often from MAX_TOKENS) with retry
                last_exception = e
                if attempt < max_retries:
-                    logger.warning(f"Gemini returned invalid JSON
+                    logger.warning(f"Gemini returned invalid JSON, retrying...")
                    backoff = min(initial_backoff * (2 ** attempt), max_backoff)
                    await asyncio.sleep(backoff)
                    continue
                else:
-                    logger.error(f"Gemini returned invalid JSON after {max_retries + 1} attempts
+                    logger.error(f"Gemini returned invalid JSON after {max_retries + 1} attempts")
                    raise

            except genai_errors.APIError as e:
-                #
-                if e.code
+                # Fast fail on 4xx client errors (except 429 rate limit)
+                if e.code and 400 <= e.code < 500 and e.code != 429:
+                    logger.error(f"Gemini client error (HTTP {e.code}), not retrying: {str(e)}")
+                    raise
+
+                # Retry on 429 and 5xx
+                if e.code in (429, 500, 502, 503, 504):
                    last_exception = e
                    if attempt < max_retries:
                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
                        jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
-
-                        await asyncio.sleep(sleep_time)
+                        await asyncio.sleep(backoff + jitter)
                    else:
                        logger.error(f"Gemini API error after {max_retries + 1} attempts: {str(e)}")
                        raise
@@ -408,25 +390,16 @@ class LLMConfig:

        if last_exception:
            raise last_exception
-        raise RuntimeError(f"Gemini call failed after all retries
+        raise RuntimeError(f"Gemini call failed after all retries")

    @classmethod
-    def for_memory(cls) -> "LLMConfig":
-        """Create
+    def for_memory(cls) -> "LLMProvider":
+        """Create provider for memory operations from environment variables."""
        provider = os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq")
        api_key = os.getenv("HINDSIGHT_API_LLM_API_KEY")
-        base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL")
+        base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL", "")
        model = os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b")

-        # Set default base URL if not provided
-        if not base_url:
-            if provider == "groq":
-                base_url = "https://api.groq.com/openai/v1"
-            elif provider == "ollama":
-                base_url = "http://localhost:11434/v1"
-            else:
-                base_url = ""
-
        return cls(
            provider=provider,
            api_key=api_key,
@@ -436,27 +409,13 @@ class LLMConfig:
        )

    @classmethod
-    def for_answer_generation(cls) -> "LLMConfig":
-        """
-        Create configuration for answer generation operations from environment variables.
-
-        Falls back to memory LLM config if answer-specific config not set.
-        """
-        # Check if answer-specific config exists, otherwise fall back to memory config
+    def for_answer_generation(cls) -> "LLMProvider":
+        """Create provider for answer generation. Falls back to memory config if not set."""
        provider = os.getenv("HINDSIGHT_API_ANSWER_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
        api_key = os.getenv("HINDSIGHT_API_ANSWER_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
-        base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL"))
+        base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
        model = os.getenv("HINDSIGHT_API_ANSWER_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))

-        # Set default base URL if not provided
-        if not base_url:
-            if provider == "groq":
-                base_url = "https://api.groq.com/openai/v1"
-            elif provider == "ollama":
-                base_url = "http://localhost:11434/v1"
-            else:
-                base_url = ""
-
        return cls(
            provider=provider,
            api_key=api_key,
@@ -466,27 +425,13 @@ class LLMConfig:
        )

    @classmethod
-    def for_judge(cls) -> "LLMConfig":
-        """
-        Create configuration for judge/evaluator operations from environment variables.
-
-        Falls back to memory LLM config if judge-specific config not set.
-        """
-        # Check if judge-specific config exists, otherwise fall back to memory config
+    def for_judge(cls) -> "LLMProvider":
+        """Create provider for judge/evaluator operations. Falls back to memory config if not set."""
        provider = os.getenv("HINDSIGHT_API_JUDGE_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
        api_key = os.getenv("HINDSIGHT_API_JUDGE_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
-        base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL"))
+        base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
        model = os.getenv("HINDSIGHT_API_JUDGE_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))

-        # Set default base URL if not provided
-        if not base_url:
-            if provider == "groq":
-                base_url = "https://api.groq.com/openai/v1"
-            elif provider == "ollama":
-                base_url = "http://localhost:11434/v1"
-            else:
-                base_url = ""
-
        return cls(
            provider=provider,
            api_key=api_key,
@@ -494,3 +439,7 class LLMConfig:
            model=model,
            reasoning_effort="high"
        )
+
+
+# Backwards compatibility alias
+LLMConfig = LLMProvider
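
The three factory classmethods now read their settings purely from environment variables, with per-role `HINDSIGHT_API_ANSWER_*` / `HINDSIGHT_API_JUDGE_*` values falling back to the base `HINDSIGHT_API_LLM_*` settings, and the old class name keeps working through the `LLMConfig = LLMProvider` alias. A hedged wiring sketch; the environment values and the judge model override are placeholders, only the variable names, factory names, and alias come from this diff:

```python
import os

# Base configuration used by all roles unless overridden (placeholder values).
os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "groq"
os.environ["HINDSIGHT_API_LLM_API_KEY"] = "gsk_example"
os.environ["HINDSIGHT_API_LLM_MODEL"] = "openai/gpt-oss-120b"

# Role-specific override: only the judge uses a different (hypothetical) model.
os.environ["HINDSIGHT_API_JUDGE_LLM_MODEL"] = "openai/gpt-oss-20b"

from hindsight_api.engine.llm_wrapper import LLMProvider, LLMConfig

memory_llm = LLMProvider.for_memory()        # base provider and model
judge_llm = LLMProvider.for_judge()          # judge override, reasoning_effort="high"
answer_llm = LLMConfig.for_answer_generation()  # old name still resolves to the same class
assert LLMConfig is LLMProvider
```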