hindsight-api 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (63)
  1. hindsight_api/__init__.py +10 -9
  2. hindsight_api/alembic/env.py +5 -8
  3. hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +266 -180
  4. hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +32 -32
  5. hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +11 -11
  6. hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +7 -12
  7. hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +23 -15
  8. hindsight_api/alembic/versions/rename_personality_to_disposition.py +30 -21
  9. hindsight_api/api/__init__.py +10 -10
  10. hindsight_api/api/http.py +575 -593
  11. hindsight_api/api/mcp.py +31 -33
  12. hindsight_api/banner.py +13 -6
  13. hindsight_api/config.py +17 -12
  14. hindsight_api/engine/__init__.py +9 -9
  15. hindsight_api/engine/cross_encoder.py +23 -27
  16. hindsight_api/engine/db_utils.py +5 -4
  17. hindsight_api/engine/embeddings.py +22 -21
  18. hindsight_api/engine/entity_resolver.py +81 -75
  19. hindsight_api/engine/llm_wrapper.py +74 -88
  20. hindsight_api/engine/memory_engine.py +663 -673
  21. hindsight_api/engine/query_analyzer.py +100 -97
  22. hindsight_api/engine/response_models.py +105 -106
  23. hindsight_api/engine/retain/__init__.py +9 -16
  24. hindsight_api/engine/retain/bank_utils.py +34 -58
  25. hindsight_api/engine/retain/chunk_storage.py +4 -12
  26. hindsight_api/engine/retain/deduplication.py +9 -28
  27. hindsight_api/engine/retain/embedding_processing.py +4 -11
  28. hindsight_api/engine/retain/embedding_utils.py +3 -4
  29. hindsight_api/engine/retain/entity_processing.py +7 -17
  30. hindsight_api/engine/retain/fact_extraction.py +155 -165
  31. hindsight_api/engine/retain/fact_storage.py +11 -23
  32. hindsight_api/engine/retain/link_creation.py +11 -39
  33. hindsight_api/engine/retain/link_utils.py +166 -95
  34. hindsight_api/engine/retain/observation_regeneration.py +39 -52
  35. hindsight_api/engine/retain/orchestrator.py +72 -62
  36. hindsight_api/engine/retain/types.py +49 -43
  37. hindsight_api/engine/search/__init__.py +15 -1
  38. hindsight_api/engine/search/fusion.py +6 -15
  39. hindsight_api/engine/search/graph_retrieval.py +234 -0
  40. hindsight_api/engine/search/mpfp_retrieval.py +438 -0
  41. hindsight_api/engine/search/observation_utils.py +9 -16
  42. hindsight_api/engine/search/reranking.py +4 -7
  43. hindsight_api/engine/search/retrieval.py +388 -193
  44. hindsight_api/engine/search/scoring.py +5 -7
  45. hindsight_api/engine/search/temporal_extraction.py +8 -11
  46. hindsight_api/engine/search/think_utils.py +115 -39
  47. hindsight_api/engine/search/trace.py +68 -38
  48. hindsight_api/engine/search/tracer.py +49 -35
  49. hindsight_api/engine/search/types.py +22 -16
  50. hindsight_api/engine/task_backend.py +21 -26
  51. hindsight_api/engine/utils.py +25 -10
  52. hindsight_api/main.py +21 -40
  53. hindsight_api/mcp_local.py +190 -0
  54. hindsight_api/metrics.py +44 -30
  55. hindsight_api/migrations.py +10 -8
  56. hindsight_api/models.py +60 -72
  57. hindsight_api/pg0.py +64 -337
  58. hindsight_api/server.py +3 -6
  59. {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/METADATA +6 -5
  60. hindsight_api-0.1.6.dist-info/RECORD +64 -0
  61. {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/entry_points.txt +1 -0
  62. hindsight_api-0.1.4.dist-info/RECORD +0 -61
  63. {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/WHEEL +0 -0
hindsight_api/engine/llm_wrapper.py
@@ -1,15 +1,17 @@
  """
  LLM wrapper for unified configuration across providers.
  """
+
+ import asyncio
+ import logging
  import os
  import time
- import asyncio
- from typing import Optional, Any, Dict, List
- from openai import AsyncOpenAI, RateLimitError, APIError, APIStatusError, APIConnectionError, LengthFinishReasonError
+ from typing import Any
+
  from google import genai
- from google.genai import types as genai_types
  from google.genai import errors as genai_errors
- import logging
+ from google.genai import types as genai_types
+ from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinishReasonError

  # Seed applied to every Groq request for deterministic behavior.
  DEFAULT_LLM_SEED = 4242
@@ -31,6 +33,7 @@ class OutputTooLongError(Exception):
  to allow callers to handle output length issues without depending on
  provider-specific implementations.
  """
+
  pass


@@ -68,9 +71,7 @@ class LLMProvider:
  # Validate provider
  valid_providers = ["openai", "groq", "ollama", "gemini"]
  if self.provider not in valid_providers:
- raise ValueError(
- f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}"
- )
+ raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")

  # Set default base URLs
  if not self.base_url:
@@ -106,7 +107,9 @@ class LLMProvider:
  RuntimeError: If the connection test fails.
  """
  try:
- logger.info(f"Verifying LLM: provider={self.provider}, model={self.model}, base_url={self.base_url or 'default'}...")
+ logger.info(
+ f"Verifying LLM: provider={self.provider}, model={self.model}, base_url={self.base_url or 'default'}..."
+ )
  await self.call(
  messages=[{"role": "user", "content": "Say 'ok'"}],
  max_completion_tokens=10,
@@ -117,16 +120,14 @@ class LLMProvider:
  # If we get here without exception, the connection is working
  logger.info(f"LLM verified: {self.provider}/{self.model}")
  except Exception as e:
- raise RuntimeError(
- f"LLM connection verification failed for {self.provider}/{self.model}: {e}"
- ) from e
+ raise RuntimeError(f"LLM connection verification failed for {self.provider}/{self.model}: {e}") from e

  async def call(
  self,
- messages: List[Dict[str, str]],
- response_format: Optional[Any] = None,
- max_completion_tokens: Optional[int] = None,
- temperature: Optional[float] = None,
+ messages: list[dict[str, str]],
+ response_format: Any | None = None,
+ max_completion_tokens: int | None = None,
+ temperature: float | None = None,
  scope: str = "memory",
  max_retries: int = 10,
  initial_backoff: float = 1.0,
@@ -161,8 +162,7 @@ class LLMProvider:
  # Handle Gemini provider separately
  if self.provider == "gemini":
  return await self._call_gemini(
- messages, response_format, max_retries, initial_backoff,
- max_backoff, skip_validation, start_time
+ messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
  )

  call_params = {
@@ -175,9 +175,13 @@ class LLMProvider:
  is_reasoning_model = any(x in model_lower for x in ["gpt-5", "o1", "o3"])

  # For GPT-4 and GPT-4.1 models, cap max_completion_tokens to 32000
+ # For GPT-4o models, cap to 16384
  is_gpt4_model = any(x in model_lower for x in ["gpt-4.1", "gpt-4-"])
+ is_gpt4o_model = "gpt-4o" in model_lower
  if max_completion_tokens is not None:
- if is_gpt4_model and max_completion_tokens > 32000:
+ if is_gpt4o_model and max_completion_tokens > 16384:
+ max_completion_tokens = 16384
+ elif is_gpt4_model and max_completion_tokens > 32000:
  max_completion_tokens = 32000
  # For reasoning models, max_completion_tokens includes reasoning + output tokens
  # Enforce minimum of 16000 to ensure enough space for both
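A rough sketch, outside the diff, of what the new capping rules amount to; the standalone helper name is hypothetical:

# Sketch only: mirrors the capping rules added above (GPT-4o capped at 16384,
# GPT-4/4.1 at 32000); this helper does not exist in the package.
def cap_completion_tokens(model: str, requested: int | None) -> int | None:
    if requested is None:
        return None
    model_lower = model.lower()
    if "gpt-4o" in model_lower:
        return min(requested, 16384)
    if any(x in model_lower for x in ["gpt-4.1", "gpt-4-"]):
        return min(requested, 32000)
    return requested

# e.g. cap_completion_tokens("gpt-4o-mini", 32000) -> 16384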
@@ -209,16 +213,18 @@ class LLMProvider:
  try:
  if response_format is not None:
  # Add schema to system message for JSON mode
- if hasattr(response_format, 'model_json_schema'):
+ if hasattr(response_format, "model_json_schema"):
  schema = response_format.model_json_schema()
  schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"

- if call_params['messages'] and call_params['messages'][0].get('role') == 'system':
- call_params['messages'][0]['content'] += schema_msg
- elif call_params['messages']:
- call_params['messages'][0]['content'] = schema_msg + "\n\n" + call_params['messages'][0]['content']
+ if call_params["messages"] and call_params["messages"][0].get("role") == "system":
+ call_params["messages"][0]["content"] += schema_msg
+ elif call_params["messages"]:
+ call_params["messages"][0]["content"] = (
+ schema_msg + "\n\n" + call_params["messages"][0]["content"]
+ )

- call_params['response_format'] = {"type": "json_object"}
+ call_params["response_format"] = {"type": "json_object"}
  response = await self._client.chat.completions.create(**call_params)

  content = response.choices[0].message.content
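A minimal sketch of the JSON-mode schema injection shown above, assuming a hypothetical Pydantic model (ExtractedFacts); the package's actual response models live in hindsight_api/engine/response_models.py:

# Sketch only: the schema-injection pattern used above, with a made-up model.
import json
from pydantic import BaseModel

class ExtractedFacts(BaseModel):  # hypothetical example model
    facts: list[str]

schema = ExtractedFacts.model_json_schema()
schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"

messages = [{"role": "system", "content": "Extract facts."}, {"role": "user", "content": "..."}]
if messages and messages[0].get("role") == "system":
    messages[0]["content"] += schema_msg  # append schema to the existing system prompt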
@@ -238,8 +244,8 @@ class LLMProvider:
  if duration > 10.0:
  ratio = max(1, usage.completion_tokens) / usage.prompt_tokens
  cached_tokens = 0
- if hasattr(usage, 'prompt_tokens_details') and usage.prompt_tokens_details:
- cached_tokens = getattr(usage.prompt_tokens_details, 'cached_tokens', 0) or 0
+ if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
+ cached_tokens = getattr(usage.prompt_tokens_details, "cached_tokens", 0) or 0
  cache_info = f", cached_tokens={cached_tokens}" if cached_tokens > 0 else ""
  logger.info(
  f"slow llm call: model={self.provider}/{self.model}, "
@@ -252,15 +258,19 @@ class LLMProvider:
  except LengthFinishReasonError as e:
  logger.warning(f"LLM output exceeded token limits: {str(e)}")
  raise OutputTooLongError(
- f"LLM output exceeded token limits. Input may need to be split into smaller chunks."
+ "LLM output exceeded token limits. Input may need to be split into smaller chunks."
  ) from e

  except APIConnectionError as e:
  last_exception = e
  if attempt < max_retries:
- status_code = getattr(e, 'status_code', None) or getattr(getattr(e, 'response', None), 'status_code', None)
- logger.warning(f"Connection error, retrying... (attempt {attempt + 1}/{max_retries + 1}) - status_code={status_code}, message={e}")
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ status_code = getattr(e, "status_code", None) or getattr(
+ getattr(e, "response", None), "status_code", None
+ )
+ logger.warning(
+ f"Connection error, retrying... (attempt {attempt + 1}/{max_retries + 1}) - status_code={status_code}, message={e}"
+ )
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  await asyncio.sleep(backoff)
  continue
  else:
@@ -268,14 +278,14 @@ class LLMProvider:
  raise

  except APIStatusError as e:
- # Fast fail on 4xx client errors (except 429 rate limit and 498 which is treated as server error)
- if 400 <= e.status_code < 500 and e.status_code not in (429, 498):
- logger.error(f"Client error (HTTP {e.status_code}), not retrying: {str(e)}")
+ # Fast fail only on 401 (unauthorized) and 403 (forbidden) - these won't recover with retries
+ if e.status_code in (401, 403):
+ logger.error(f"Auth error (HTTP {e.status_code}), not retrying: {str(e)}")
  raise

  last_exception = e
  if attempt < max_retries:
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
  sleep_time = backoff + jitter
  await asyncio.sleep(sleep_time)
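A minimal sketch of the backoff-plus-jitter arithmetic used in these retry branches; the 60-second cap is assumed for illustration, since the actual max_backoff default is not shown in this diff:

import time

# Sketch only: exponential backoff with +/-20% jitter derived from the
# fractional part of time.time(), as in the retry branches above.
def next_sleep(attempt: int, initial_backoff: float = 1.0, max_backoff: float = 60.0) -> float:
    backoff = min(initial_backoff * (2**attempt), max_backoff)
    jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
    return backoff + jitter

# attempts 0, 1, 2, 3 -> roughly 1s, 2s, 4s, 8s (each +/-20%), capped at max_backoff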
@@ -289,12 +299,12 @@ class LLMProvider:

  if last_exception:
  raise last_exception
- raise RuntimeError(f"LLM call failed after all retries with no exception captured")
+ raise RuntimeError("LLM call failed after all retries with no exception captured")

  async def _call_gemini(
  self,
- messages: List[Dict[str, str]],
- response_format: Optional[Any],
+ messages: list[dict[str, str]],
+ response_format: Any | None,
  max_retries: int,
  initial_backoff: float,
  max_backoff: float,
@@ -309,27 +319,21 @@ class LLMProvider:
  gemini_contents = []

  for msg in messages:
- role = msg.get('role', 'user')
- content = msg.get('content', '')
+ role = msg.get("role", "user")
+ content = msg.get("content", "")

- if role == 'system':
+ if role == "system":
  if system_instruction:
  system_instruction += "\n\n" + content
  else:
  system_instruction = content
- elif role == 'assistant':
- gemini_contents.append(genai_types.Content(
- role="model",
- parts=[genai_types.Part(text=content)]
- ))
+ elif role == "assistant":
+ gemini_contents.append(genai_types.Content(role="model", parts=[genai_types.Part(text=content)]))
  else:
- gemini_contents.append(genai_types.Content(
- role="user",
- parts=[genai_types.Part(text=content)]
- ))
+ gemini_contents.append(genai_types.Content(role="user", parts=[genai_types.Part(text=content)]))

  # Add JSON schema instruction if response_format is provided
- if response_format is not None and hasattr(response_format, 'model_json_schema'):
+ if response_format is not None and hasattr(response_format, "model_json_schema"):
  schema = response_format.model_json_schema()
  schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
  if system_instruction:
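A minimal sketch, assuming the google-genai types used above, of how OpenAI-style chat messages are mapped for Gemini (system messages are folded into the system instruction, assistant becomes role "model", everything else role "user"):

# Sketch only: the role mapping performed in the loop above.
from google.genai import types as genai_types

messages = [
    {"role": "system", "content": "You are terse."},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi"},
]

system_instruction = ""
gemini_contents = []
for msg in messages:
    role, content = msg.get("role", "user"), msg.get("content", "")
    if role == "system":
        # concatenate multiple system messages into one instruction
        system_instruction = (system_instruction + "\n\n" + content) if system_instruction else content
    else:
        gemini_role = "model" if role == "assistant" else "user"
        gemini_contents.append(genai_types.Content(role=gemini_role, parts=[genai_types.Part(text=content)]))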
@@ -340,10 +344,10 @@ class LLMProvider:
  # Build generation config
  config_kwargs = {}
  if system_instruction:
- config_kwargs['system_instruction'] = system_instruction
+ config_kwargs["system_instruction"] = system_instruction
  if response_format is not None:
- config_kwargs['response_mime_type'] = 'application/json'
- config_kwargs['response_schema'] = response_format
+ config_kwargs["response_mime_type"] = "application/json"
+ config_kwargs["response_schema"] = response_format

  generation_config = genai_types.GenerateContentConfig(**config_kwargs) if config_kwargs else None

@@ -362,14 +366,14 @@ class LLMProvider:
  # Handle empty response
  if content is None:
  block_reason = None
- if hasattr(response, 'candidates') and response.candidates:
+ if hasattr(response, "candidates") and response.candidates:
  candidate = response.candidates[0]
- if hasattr(candidate, 'finish_reason'):
+ if hasattr(candidate, "finish_reason"):
  block_reason = candidate.finish_reason

  if attempt < max_retries:
  logger.warning(f"Gemini returned empty response (reason: {block_reason}), retrying...")
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  await asyncio.sleep(backoff)
  continue
  else:
@@ -386,7 +390,7 @@ class LLMProvider:

  # Log slow calls
  duration = time.time() - start_time
- if duration > 10.0 and hasattr(response, 'usage_metadata') and response.usage_metadata:
+ if duration > 10.0 and hasattr(response, "usage_metadata") and response.usage_metadata:
  usage = response.usage_metadata
  logger.info(
  f"slow llm call: model={self.provider}/{self.model}, "
@@ -399,8 +403,8 @@ class LLMProvider:
  except json.JSONDecodeError as e:
  last_exception = e
  if attempt < max_retries:
- logger.warning(f"Gemini returned invalid JSON, retrying...")
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ logger.warning("Gemini returned invalid JSON, retrying...")
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  await asyncio.sleep(backoff)
  continue
  else:
@@ -408,16 +412,16 @@ class LLMProvider:
  raise

  except genai_errors.APIError as e:
- # Fast fail on 4xx client errors (except 429 rate limit)
- if e.code and 400 <= e.code < 500 and e.code != 429:
- logger.error(f"Gemini client error (HTTP {e.code}), not retrying: {str(e)}")
+ # Fast fail only on 401 (unauthorized) and 403 (forbidden) - these won't recover with retries
+ if e.code in (401, 403):
+ logger.error(f"Gemini auth error (HTTP {e.code}), not retrying: {str(e)}")
  raise

- # Retry on 429 and 5xx
- if e.code in (429, 500, 502, 503, 504):
+ # Retry on retryable errors (rate limits, server errors, and other client errors like 400)
+ if e.code in (400, 429, 500, 502, 503, 504) or (e.code and e.code >= 500):
  last_exception = e
  if attempt < max_retries:
- backoff = min(initial_backoff * (2 ** attempt), max_backoff)
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
  jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
  await asyncio.sleep(backoff + jitter)
  else:
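A minimal sketch of the effective Gemini retry decision after this change, written as a hypothetical helper: auth errors fail fast, anything plausibly transient (including 400 and 429) is retried, and unknown codes are not.

# Sketch only: summarizes the branch logic above; this helper is not in the package.
def should_retry(status: int | None) -> bool:
    if status in (401, 403):
        return False  # auth errors won't recover with retries
    return status in (400, 429, 500, 502, 503, 504) or (status is not None and status >= 500)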
@@ -433,7 +437,7 @@ class LLMProvider:

  if last_exception:
  raise last_exception
- raise RuntimeError(f"Gemini call failed after all retries")
+ raise RuntimeError("Gemini call failed after all retries")

  @classmethod
  def for_memory(cls) -> "LLMProvider":
@@ -443,13 +447,7 @@ class LLMProvider:
  base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL", "")
  model = os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b")

- return cls(
- provider=provider,
- api_key=api_key,
- base_url=base_url,
- model=model,
- reasoning_effort="low"
- )
+ return cls(provider=provider, api_key=api_key, base_url=base_url, model=model, reasoning_effort="low")

  @classmethod
  def for_answer_generation(cls) -> "LLMProvider":
@@ -459,13 +457,7 @@ class LLMProvider:
  base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
  model = os.getenv("HINDSIGHT_API_ANSWER_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))

- return cls(
- provider=provider,
- api_key=api_key,
- base_url=base_url,
- model=model,
- reasoning_effort="high"
- )
+ return cls(provider=provider, api_key=api_key, base_url=base_url, model=model, reasoning_effort="high")

  @classmethod
  def for_judge(cls) -> "LLMProvider":
@@ -475,13 +467,7 @@ class LLMProvider:
  base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
  model = os.getenv("HINDSIGHT_API_JUDGE_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))

- return cls(
- provider=provider,
- api_key=api_key,
- base_url=base_url,
- model=model,
- reasoning_effort="high"
- )
+ return cls(provider=provider, api_key=api_key, base_url=base_url, model=model, reasoning_effort="high")


  # Backwards compatibility alias
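A minimal usage sketch for the factory classmethods above, using only the environment variables visible in this diff; the provider and API-key variable names are not shown here and are therefore omitted:

import os
from hindsight_api.engine.llm_wrapper import LLMProvider

# Sketch only: values are illustrative; defaults shown match the getenv fallbacks above.
os.environ["HINDSIGHT_API_LLM_MODEL"] = "openai/gpt-oss-120b"
os.environ["HINDSIGHT_API_JUDGE_LLM_MODEL"] = "openai/gpt-oss-120b"  # falls back to HINDSIGHT_API_LLM_MODEL if unset

memory_llm = LLMProvider.for_memory()  # reasoning_effort="low"
judge_llm = LLMProvider.for_judge()    # reasoning_effort="high"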