hindsight_api-0.0.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +38 -0
- hindsight_api/api/__init__.py +105 -0
- hindsight_api/api/http.py +1872 -0
- hindsight_api/api/mcp.py +157 -0
- hindsight_api/engine/__init__.py +47 -0
- hindsight_api/engine/cross_encoder.py +97 -0
- hindsight_api/engine/db_utils.py +93 -0
- hindsight_api/engine/embeddings.py +113 -0
- hindsight_api/engine/entity_resolver.py +575 -0
- hindsight_api/engine/llm_wrapper.py +269 -0
- hindsight_api/engine/memory_engine.py +3095 -0
- hindsight_api/engine/query_analyzer.py +519 -0
- hindsight_api/engine/response_models.py +222 -0
- hindsight_api/engine/retain/__init__.py +50 -0
- hindsight_api/engine/retain/bank_utils.py +423 -0
- hindsight_api/engine/retain/chunk_storage.py +82 -0
- hindsight_api/engine/retain/deduplication.py +104 -0
- hindsight_api/engine/retain/embedding_processing.py +62 -0
- hindsight_api/engine/retain/embedding_utils.py +54 -0
- hindsight_api/engine/retain/entity_processing.py +90 -0
- hindsight_api/engine/retain/fact_extraction.py +1027 -0
- hindsight_api/engine/retain/fact_storage.py +176 -0
- hindsight_api/engine/retain/link_creation.py +121 -0
- hindsight_api/engine/retain/link_utils.py +651 -0
- hindsight_api/engine/retain/orchestrator.py +405 -0
- hindsight_api/engine/retain/types.py +206 -0
- hindsight_api/engine/search/__init__.py +15 -0
- hindsight_api/engine/search/fusion.py +122 -0
- hindsight_api/engine/search/observation_utils.py +132 -0
- hindsight_api/engine/search/reranking.py +103 -0
- hindsight_api/engine/search/retrieval.py +503 -0
- hindsight_api/engine/search/scoring.py +161 -0
- hindsight_api/engine/search/temporal_extraction.py +64 -0
- hindsight_api/engine/search/think_utils.py +255 -0
- hindsight_api/engine/search/trace.py +215 -0
- hindsight_api/engine/search/tracer.py +447 -0
- hindsight_api/engine/search/types.py +160 -0
- hindsight_api/engine/task_backend.py +223 -0
- hindsight_api/engine/utils.py +203 -0
- hindsight_api/metrics.py +227 -0
- hindsight_api/migrations.py +163 -0
- hindsight_api/models.py +309 -0
- hindsight_api/pg0.py +425 -0
- hindsight_api/web/__init__.py +12 -0
- hindsight_api/web/server.py +143 -0
- hindsight_api-0.0.13.dist-info/METADATA +41 -0
- hindsight_api-0.0.13.dist-info/RECORD +48 -0
- hindsight_api-0.0.13.dist-info/WHEEL +4 -0
hindsight_api/engine/llm_wrapper.py
@@ -0,0 +1,269 @@
+"""
+LLM wrapper for unified configuration across providers.
+"""
+import os
+import time
+import asyncio
+from typing import Optional, Any, Dict, List
+from openai import AsyncOpenAI, RateLimitError, APIError, APIStatusError, LengthFinishReasonError
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Disable httpx logging
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
+# Global semaphore to limit concurrent LLM requests across all instances
+_global_llm_semaphore = asyncio.Semaphore(32)
+
+
+class OutputTooLongError(Exception):
+    """
+    Bridge exception raised when LLM output exceeds token limits.
+
+    This wraps provider-specific errors (e.g., OpenAI's LengthFinishReasonError)
+    to allow callers to handle output length issues without depending on
+    provider-specific implementations.
+    """
+    pass
+
+
+class LLMConfig:
+    """Configuration for an LLM provider."""
+
+    def __init__(
+        self,
+        provider: str,
+        api_key: str,
+        base_url: str,
+        model: str,
+    ):
+        """
+        Initialize LLM configuration.
+
+        Args:
+            provider: Provider name ("openai", "groq", "ollama"). Required.
+            api_key: API key. Required for "openai" and "groq"; ignored for "ollama".
+            base_url: Base URL. If empty, a provider-specific default is used.
+            model: Model name. Required.
+        """
+        self.provider = provider.lower()
+        self.api_key = api_key
+        self.base_url = base_url
+        self.model = model
+
+        # Validate provider
+        if self.provider not in ["openai", "groq", "ollama"]:
+            raise ValueError(
+                f"Invalid LLM provider: {self.provider}. Must be 'openai', 'groq', or 'ollama'."
+            )
+
+        # Set default base URLs
+        if not self.base_url:
+            if self.provider == "groq":
+                self.base_url = "https://api.groq.com/openai/v1"
+            elif self.provider == "ollama":
+                self.base_url = "http://localhost:11434/v1"
+
+        # Validate API key (not needed for ollama)
+        if self.provider != "ollama" and not self.api_key:
+            raise ValueError(
+                f"API key not found for {self.provider}"
+            )
+
+        # Create client (private - use .call() method instead)
+        # Disable automatic retries - we handle retries in the call() method
+        if self.provider == "ollama":
+            self._client = AsyncOpenAI(api_key="ollama", base_url=self.base_url, max_retries=0)
+        elif self.base_url:
+            self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)
+        else:
+            self._client = AsyncOpenAI(api_key=self.api_key, max_retries=0)
+
+        logger.info(
+            f"Initialized LLM: provider={self.provider}, model={self.model}, base_url={self.base_url}"
+        )
+
+    async def call(
+        self,
+        messages: List[Dict[str, str]],
+        response_format: Optional[Any] = None,
+        scope: str = "memory",
+        max_retries: int = 10,
+        initial_backoff: float = 1.0,
+        max_backoff: float = 60.0,
+        skip_validation: bool = False,
+        **kwargs
+    ) -> Any:
+        """
+        Make an LLM API call with consistent configuration and retry logic.
+
+        Args:
+            messages: List of message dicts with 'role' and 'content'
+            response_format: Optional Pydantic model for structured output
+            scope: Scope identifier (e.g., 'memory', 'judge') for future tracking
+            max_retries: Maximum number of retry attempts (default: 10)
+            initial_backoff: Initial backoff time in seconds (default: 1.0)
+            max_backoff: Maximum backoff time in seconds (default: 60.0)
+            skip_validation: If True, return the parsed JSON dict without Pydantic validation (default: False)
+            **kwargs: Additional parameters to pass to the API (temperature, max_tokens, etc.)
+
+        Returns:
+            Parsed response if response_format is provided, otherwise the text content
+
+        Raises:
+            Exception: Re-raises any API errors after all retries are exhausted
+        """
+        # Use global semaphore to limit concurrent requests
+        async with _global_llm_semaphore:
+            start_time = time.time()
+
+            call_params = {
+                "model": self.model,
+                "messages": messages,
+                **kwargs
+            }
+            if self.provider == "groq":
+                call_params["extra_body"] = {
+                    "service_tier": "auto",
+                    "reasoning_effort": "low",  # Reduce reasoning overhead
+                    "include_reasoning": False,  # Disable hidden reasoning tokens
+                }
+
+            last_exception = None
+
+            for attempt in range(max_retries + 1):
+                try:
+                    # Use the appropriate response format
+                    if response_format is not None:
+                        # Use JSON mode instead of strict parse for flexibility with optional fields
+                        # This allows the LLM to omit optional fields without validation errors
+                        import json
+
+                        # Add schema to the system message
+                        if hasattr(response_format, 'model_json_schema'):
+                            schema = response_format.model_json_schema()
+                            schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
+
+                            # Add schema to the system message if present, otherwise prepend as user message
+                            if call_params['messages'] and call_params['messages'][0].get('role') == 'system':
+                                call_params['messages'][0]['content'] += schema_msg
+                            else:
+                                # No system message, add schema instruction to first user message
+                                if call_params['messages']:
+                                    call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']
+
+                        call_params['response_format'] = {"type": "json_object"}
+                        response = await self._client.chat.completions.create(**call_params)
+
+                        # Parse the JSON response
+                        content = response.choices[0].message.content
+                        json_data = json.loads(content)
+
+                        # Return raw JSON if skip_validation is True, otherwise validate with Pydantic
+                        if skip_validation:
+                            result = json_data
+                        else:
+                            result = response_format.model_validate(json_data)
+                    else:
+                        # Standard completion and return text content
+                        response = await self._client.chat.completions.create(**call_params)
+                        result = response.choices[0].message.content
+
+                    # Log call details only if it takes more than 10 seconds
+                    duration = time.time() - start_time
+                    usage = response.usage
+                    if duration > 10.0:
+                        ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
+                        logger.info(
+                            f"slow llm call: model={self.provider}/{self.model}, "
+                            f"input_tokens={usage.prompt_tokens}, output_tokens={usage.completion_tokens}, "
+                            f"total_tokens={usage.total_tokens}, time={duration:.3f}s, ratio out/in={ratio:.2f}"
+                        )
+
+                    return result
+
+                except LengthFinishReasonError as e:
+                    # Output exceeded token limits - raise bridge exception for caller to handle
+                    logger.warning(f"LLM output exceeded token limits: {str(e)}")
+                    raise OutputTooLongError(
+                        f"LLM output exceeded token limits. Input may need to be split into smaller chunks."
+                    ) from e
+
+                except APIStatusError as e:
+                    last_exception = e
+                    if attempt < max_retries:
+                        # Calculate exponential backoff with jitter
+                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+                        # Add jitter (±20%)
+                        jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
+                        sleep_time = backoff + jitter
+
+                        # Only log if it's a non-retryable error or final attempt
+                        # Silent retry for common transient errors like capacity exceeded
+                        await asyncio.sleep(sleep_time)
+                    else:
+                        # Log only on final failed attempt
+                        logger.error(f"API error after {max_retries + 1} attempts: {str(e)}")
+                        raise
+
+                except Exception as e:
+                    logger.error(f"Unexpected error during LLM call: {type(e).__name__}: {str(e)}")
+                    raise
+
+            # This should never be reached, but just in case
+            if last_exception:
+                raise last_exception
+            raise RuntimeError(f"LLM call failed after all retries with no exception captured")
+
+    @classmethod
+    def for_memory(cls) -> "LLMConfig":
+        """Create configuration for memory operations from environment variables."""
+        provider = os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq")
+        api_key = os.getenv("HINDSIGHT_API_LLM_API_KEY")
+        base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL")
+        model = os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b")
+
+        # Set default base URL if not provided
+        if not base_url:
+            if provider == "groq":
+                base_url = "https://api.groq.com/openai/v1"
+            elif provider == "ollama":
+                base_url = "http://localhost:11434/v1"
+            else:
+                base_url = ""
+
+        return cls(
+            provider=provider,
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+        )
+
+    @classmethod
+    def for_judge(cls) -> "LLMConfig":
+        """
+        Create configuration for judge/evaluator operations from environment variables.
+
+        Falls back to memory LLM config if judge-specific config not set.
+        """
+        # Check if judge-specific config exists, otherwise fall back to memory config
+        provider = os.getenv("HINDSIGHT_API_JUDGE_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
+        api_key = os.getenv("HINDSIGHT_API_JUDGE_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
+        base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL"))
+        model = os.getenv("HINDSIGHT_API_JUDGE_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))
+
+        # Set default base URL if not provided
+        if not base_url:
+            if provider == "groq":
+                base_url = "https://api.groq.com/openai/v1"
+            elif provider == "ollama":
+                base_url = "http://localhost:11434/v1"
+            else:
+                base_url = ""
+
+        return cls(
+            provider=provider,
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+        )
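For orientation, below is a minimal usage sketch of the LLMConfig class added by this file. It is not part of the released package: the Answer model and the prompts are hypothetical, the import path assumes the module is used directly as hindsight_api.engine.llm_wrapper, and the HINDSIGHT_API_LLM_* environment variables read by for_memory() are assumed to be set.

# Illustrative sketch only (not shipped in the wheel). Assumes, e.g.:
#   HINDSIGHT_API_LLM_PROVIDER=groq
#   HINDSIGHT_API_LLM_API_KEY=...
#   HINDSIGHT_API_LLM_MODEL=openai/gpt-oss-120b
import asyncio
from pydantic import BaseModel

from hindsight_api.engine.llm_wrapper import LLMConfig, OutputTooLongError


class Answer(BaseModel):
    # Hypothetical response schema used only for this example.
    summary: str
    confidence: float


async def main() -> None:
    llm = LLMConfig.for_memory()  # reads HINDSIGHT_API_LLM_* environment variables
    try:
        # Structured call: the wrapper injects Answer's JSON schema into the prompt,
        # requests JSON mode, and validates the parsed result against Answer.
        answer = await llm.call(
            messages=[
                {"role": "system", "content": "You are a concise assistant."},
                {"role": "user", "content": "Summarize why retries need jitter."},
            ],
            response_format=Answer,
        )
        print(answer.summary, answer.confidence)

        # Plain-text call: no response_format, so the raw message content is returned.
        text = await llm.call(messages=[{"role": "user", "content": "Say hello."}])
        print(text)
    except OutputTooLongError:
        # Raised when the provider reports that the output hit its token limit.
        print("Output too long; split the input into smaller chunks.")


if __name__ == "__main__":
    asyncio.run(main())

for_judge() is used the same way; it prefers the HINDSIGHT_API_JUDGE_LLM_* variables and falls back to the memory settings when they are absent.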