hindsight-api 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. hindsight_api/__init__.py +38 -0
  2. hindsight_api/api/__init__.py +105 -0
  3. hindsight_api/api/http.py +1872 -0
  4. hindsight_api/api/mcp.py +157 -0
  5. hindsight_api/engine/__init__.py +47 -0
  6. hindsight_api/engine/cross_encoder.py +97 -0
  7. hindsight_api/engine/db_utils.py +93 -0
  8. hindsight_api/engine/embeddings.py +113 -0
  9. hindsight_api/engine/entity_resolver.py +575 -0
  10. hindsight_api/engine/llm_wrapper.py +269 -0
  11. hindsight_api/engine/memory_engine.py +3095 -0
  12. hindsight_api/engine/query_analyzer.py +519 -0
  13. hindsight_api/engine/response_models.py +222 -0
  14. hindsight_api/engine/retain/__init__.py +50 -0
  15. hindsight_api/engine/retain/bank_utils.py +423 -0
  16. hindsight_api/engine/retain/chunk_storage.py +82 -0
  17. hindsight_api/engine/retain/deduplication.py +104 -0
  18. hindsight_api/engine/retain/embedding_processing.py +62 -0
  19. hindsight_api/engine/retain/embedding_utils.py +54 -0
  20. hindsight_api/engine/retain/entity_processing.py +90 -0
  21. hindsight_api/engine/retain/fact_extraction.py +1027 -0
  22. hindsight_api/engine/retain/fact_storage.py +176 -0
  23. hindsight_api/engine/retain/link_creation.py +121 -0
  24. hindsight_api/engine/retain/link_utils.py +651 -0
  25. hindsight_api/engine/retain/orchestrator.py +405 -0
  26. hindsight_api/engine/retain/types.py +206 -0
  27. hindsight_api/engine/search/__init__.py +15 -0
  28. hindsight_api/engine/search/fusion.py +122 -0
  29. hindsight_api/engine/search/observation_utils.py +132 -0
  30. hindsight_api/engine/search/reranking.py +103 -0
  31. hindsight_api/engine/search/retrieval.py +503 -0
  32. hindsight_api/engine/search/scoring.py +161 -0
  33. hindsight_api/engine/search/temporal_extraction.py +64 -0
  34. hindsight_api/engine/search/think_utils.py +255 -0
  35. hindsight_api/engine/search/trace.py +215 -0
  36. hindsight_api/engine/search/tracer.py +447 -0
  37. hindsight_api/engine/search/types.py +160 -0
  38. hindsight_api/engine/task_backend.py +223 -0
  39. hindsight_api/engine/utils.py +203 -0
  40. hindsight_api/metrics.py +227 -0
  41. hindsight_api/migrations.py +163 -0
  42. hindsight_api/models.py +309 -0
  43. hindsight_api/pg0.py +425 -0
  44. hindsight_api/web/__init__.py +12 -0
  45. hindsight_api/web/server.py +143 -0
  46. hindsight_api-0.0.13.dist-info/METADATA +41 -0
  47. hindsight_api-0.0.13.dist-info/RECORD +48 -0
  48. hindsight_api-0.0.13.dist-info/WHEEL +4 -0
@@ -0,0 +1,269 @@
+"""
+LLM wrapper for unified configuration across providers.
+"""
+import os
+import time
+import asyncio
+from typing import Optional, Any, Dict, List
+from openai import AsyncOpenAI, RateLimitError, APIError, APIStatusError, LengthFinishReasonError
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Disable httpx logging
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
+# Global semaphore to limit concurrent LLM requests across all instances
+_global_llm_semaphore = asyncio.Semaphore(32)
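
Because the semaphore is module-level, the 32-request cap applies process-wide, across every LLMConfig instance. A minimal sketch of the pattern (the limit of 3 and the fake_call coroutine are illustrative, not part of this package):

import asyncio

_sem = asyncio.Semaphore(3)  # at most 3 coroutines inside the guarded block at once

async def fake_call(i: int) -> int:
    async with _sem:              # callers beyond the limit queue here
        await asyncio.sleep(0.1)  # stand-in for a network round trip
        return i

async def main() -> None:
    # 10 tasks are scheduled, but only 3 "requests" are ever in flight
    print(await asyncio.gather(*(fake_call(i) for i in range(10))))

asyncio.run(main())
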
+
+
+class OutputTooLongError(Exception):
+    """
+    Bridge exception raised when LLM output exceeds token limits.
+
+    This wraps provider-specific errors (e.g., OpenAI's LengthFinishReasonError)
+    to allow callers to handle output length issues without depending on
+    provider-specific implementations.
+    """
+    pass
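
Because length failures surface as this provider-neutral type, a caller can branch on OutputTooLongError without importing anything from the openai SDK. A hypothetical caller that follows the docstring's advice and splits its input (summarize is illustrative, not a name from this package):

async def summarize(llm: "LLMConfig", text: str) -> list[str]:
    try:
        return [await llm.call([{"role": "user", "content": f"Summarize:\n{text}"}])]
    except OutputTooLongError:
        # Output was truncated, so halve the input and summarize each part
        mid = len(text) // 2
        return await summarize(llm, text[:mid]) + await summarize(llm, text[mid:])
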
+
+
+class LLMConfig:
+    """Configuration for an LLM provider."""
+
+    def __init__(
+        self,
+        provider: str,
+        api_key: str,
+        base_url: str,
+        model: str,
+    ):
+        """
+        Initialize LLM configuration.
+
+        Args:
+            provider: Provider name ("openai", "groq", "ollama"). Required.
+            api_key: API key. Required for every provider except "ollama".
+            base_url: Base URL. If empty, a provider-specific default is used.
+            model: Model name. Required.
+        """
+        self.provider = provider.lower()
+        self.api_key = api_key
+        self.base_url = base_url
+        self.model = model
+
+        # Validate provider
+        if self.provider not in ["openai", "groq", "ollama"]:
+            raise ValueError(
+                f"Invalid LLM provider: {self.provider}. Must be 'openai', 'groq', or 'ollama'."
+            )
+
+        # Set default base URLs
+        if not self.base_url:
+            if self.provider == "groq":
+                self.base_url = "https://api.groq.com/openai/v1"
+            elif self.provider == "ollama":
+                self.base_url = "http://localhost:11434/v1"
+
+        # Validate API key (not needed for ollama)
+        if self.provider != "ollama" and not self.api_key:
+            raise ValueError(
+                f"API key not found for {self.provider}"
+            )
+
+        # Create client (private - use .call() method instead)
+        # Disable automatic retries - we handle retries in the call() method
+        if self.provider == "ollama":
+            self._client = AsyncOpenAI(api_key="ollama", base_url=self.base_url, max_retries=0)
+        elif self.base_url:
+            self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url, max_retries=0)
+        else:
+            self._client = AsyncOpenAI(api_key=self.api_key, max_retries=0)
+
+        logger.info(
+            f"Initialized LLM: provider={self.provider}, model={self.model}, base_url={self.base_url}"
+        )
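
Construction is the only provider-specific step; afterwards every provider is driven through the same OpenAI-compatible client. For example (keys and model names are placeholders):

# Local Ollama: no API key required; empty base_url falls back to http://localhost:11434/v1
local = LLMConfig(provider="ollama", api_key="", base_url="", model="llama3.1")

# Groq: empty base_url resolves to https://api.groq.com/openai/v1
groq = LLMConfig(provider="groq", api_key="gsk-...", base_url="", model="openai/gpt-oss-120b")

# Anything other than "openai", "groq", or "ollama" is rejected
LLMConfig(provider="anthropic", api_key="sk-...", base_url="", model="claude")  # raises ValueError
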
+
+    async def call(
+        self,
+        messages: List[Dict[str, str]],
+        response_format: Optional[Any] = None,
+        scope: str = "memory",
+        max_retries: int = 10,
+        initial_backoff: float = 1.0,
+        max_backoff: float = 60.0,
+        skip_validation: bool = False,
+        **kwargs
+    ) -> Any:
+        """
+        Make an LLM API call with consistent configuration and retry logic.
+
+        Args:
+            messages: List of message dicts with 'role' and 'content'
+            response_format: Optional Pydantic model for structured output
+            scope: Scope identifier (e.g., 'memory', 'judge') for future tracking
+            max_retries: Maximum number of retry attempts (default: 10)
+            initial_backoff: Initial backoff time in seconds (default: 1.0)
+            max_backoff: Maximum backoff time in seconds (default: 60.0)
+            skip_validation: If True, return the raw JSON dict instead of
+                validating it against response_format (default: False)
+            **kwargs: Additional parameters to pass to the API (temperature, max_tokens, etc.)
+
+        Returns:
+            Parsed response if response_format is provided, otherwise the text content
+
+        Raises:
+            OutputTooLongError: If the model's output exceeded its token limit
+            Exception: Re-raises any API errors after all retries are exhausted
+        """
+        # Use global semaphore to limit concurrent requests
+        async with _global_llm_semaphore:
+            start_time = time.time()
+
+            call_params = {
+                "model": self.model,
+                "messages": messages,
+                **kwargs
+            }
+            if self.provider == "groq":
+                call_params["extra_body"] = {
+                    "service_tier": "auto",
+                    "reasoning_effort": "low",   # Reduce reasoning overhead
+                    "include_reasoning": False,  # Disable hidden reasoning tokens
+                }
+
+            if response_format is not None:
+                # Use JSON mode instead of strict parse for flexibility with optional fields
+                # This allows the LLM to omit optional fields without validation errors
+                import json
+
+                if hasattr(response_format, 'model_json_schema'):
+                    schema = response_format.model_json_schema()
+                    schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
+
+                    # Copy the message dicts so the schema is attached exactly once per
+                    # call, not re-appended on every retry or written back into the
+                    # caller's list
+                    call_params['messages'] = [dict(m) for m in messages]
+
+                    # Add schema to the system message if present, otherwise prepend
+                    # it to the first user message
+                    if call_params['messages'] and call_params['messages'][0].get('role') == 'system':
+                        call_params['messages'][0]['content'] += schema_msg
+                    elif call_params['messages']:
+                        call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']
+
+                call_params['response_format'] = {"type": "json_object"}
+
+            last_exception = None
+
+            for attempt in range(max_retries + 1):
+                try:
+                    response = await self._client.chat.completions.create(**call_params)
+
+                    if response_format is not None:
+                        # Parse the JSON response
+                        content = response.choices[0].message.content
+                        json_data = json.loads(content)
+
+                        # Return raw JSON if skip_validation is True, otherwise validate with Pydantic
+                        if skip_validation:
+                            result = json_data
+                        else:
+                            result = response_format.model_validate(json_data)
+                    else:
+                        # Standard completion: return the text content
+                        result = response.choices[0].message.content
+
+                    # Log call details only if the call takes more than 10 seconds
+                    duration = time.time() - start_time
+                    usage = response.usage
+                    if duration > 10.0:
+                        # Guard both token counts against zero to avoid division errors
+                        ratio = max(1, usage.completion_tokens) / max(1, usage.prompt_tokens)
+                        logger.info(
+                            f"slow llm call: model={self.provider}/{self.model}, "
+                            f"input_tokens={usage.prompt_tokens}, output_tokens={usage.completion_tokens}, "
+                            f"total_tokens={usage.total_tokens}, time={duration:.3f}s, ratio out/in={ratio:.2f}"
+                        )
+
+                    return result
+
+                except LengthFinishReasonError as e:
+                    # Output exceeded token limits - raise bridge exception for caller to handle
+                    logger.warning(f"LLM output exceeded token limits: {str(e)}")
+                    raise OutputTooLongError(
+                        "LLM output exceeded token limits. Input may need to be split into smaller chunks."
+                    ) from e
+
+                except APIStatusError as e:
+                    last_exception = e
+                    if attempt < max_retries:
+                        # Exponential backoff with ±20% jitter; retry silently, since
+                        # transient errors like capacity exceeded are common
+                        backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+                        jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
+                        await asyncio.sleep(backoff + jitter)
+                    else:
+                        # Log only on the final failed attempt
+                        logger.error(f"API error after {max_retries + 1} attempts: {str(e)}")
+                        raise
+
+                except Exception as e:
+                    logger.error(f"Unexpected error during LLM call: {type(e).__name__}: {str(e)}")
+                    raise
+
+            # This should never be reached, but just in case
+            if last_exception:
+                raise last_exception
+            raise RuntimeError("LLM call failed after all retries with no exception captured")
+
218
+ @classmethod
219
+ def for_memory(cls) -> "LLMConfig":
220
+ """Create configuration for memory operations from environment variables."""
221
+ provider = os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq")
222
+ api_key = os.getenv("HINDSIGHT_API_LLM_API_KEY")
223
+ base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL")
224
+ model = os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b")
225
+
226
+ # Set default base URL if not provided
227
+ if not base_url:
228
+ if provider == "groq":
229
+ base_url = "https://api.groq.com/openai/v1"
230
+ elif provider == "ollama":
231
+ base_url = "http://localhost:11434/v1"
232
+ else:
233
+ base_url = ""
234
+
235
+ return cls(
236
+ provider=provider,
237
+ api_key=api_key,
238
+ base_url=base_url,
239
+ model=model,
240
+ )
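
Since for_memory() reads everything from the environment, switching providers is a deployment-time change rather than a code change. For instance (values illustrative):

import os

os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "ollama"
os.environ["HINDSIGHT_API_LLM_MODEL"] = "llama3.1"
# HINDSIGHT_API_LLM_BASE_URL unset: falls back to http://localhost:11434/v1
# HINDSIGHT_API_LLM_API_KEY unset: fine, the ollama path skips the key check

llm = LLMConfig.for_memory()
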
+
+    @classmethod
+    def for_judge(cls) -> "LLMConfig":
+        """
+        Create configuration for judge/evaluator operations from environment variables.
+
+        Falls back to memory LLM config if judge-specific config not set.
+        """
+        # Check if judge-specific config exists, otherwise fall back to memory config
+        provider = os.getenv("HINDSIGHT_API_JUDGE_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
+        api_key = os.getenv("HINDSIGHT_API_JUDGE_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
+        base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL"))
+        model = os.getenv("HINDSIGHT_API_JUDGE_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))
+
+        # Set default base URL if not provided
+        if not base_url:
+            if provider == "groq":
+                base_url = "https://api.groq.com/openai/v1"
+            elif provider == "ollama":
+                base_url = "http://localhost:11434/v1"
+            else:
+                base_url = ""
+
+        return cls(
+            provider=provider,
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+        )
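
The judge variables shadow the memory ones key by key, so a deployment can pin only the judge model and inherit the rest. For example (values illustrative):

import os

os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "groq"
os.environ["HINDSIGHT_API_LLM_API_KEY"] = "gsk-example"
os.environ["HINDSIGHT_API_JUDGE_LLM_MODEL"] = "llama-3.3-70b-versatile"

judge = LLMConfig.for_judge()
assert judge.provider == "groq"                  # inherited from the memory config
assert judge.model == "llama-3.3-70b-versatile"  # judge-specific override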