hindsight-api 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +2 -0
- hindsight_api/alembic/env.py +24 -1
- hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +14 -4
- hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +54 -13
- hindsight_api/alembic/versions/rename_personality_to_disposition.py +18 -7
- hindsight_api/api/http.py +253 -230
- hindsight_api/api/mcp.py +14 -3
- hindsight_api/config.py +11 -0
- hindsight_api/daemon.py +204 -0
- hindsight_api/engine/__init__.py +12 -1
- hindsight_api/engine/entity_resolver.py +38 -37
- hindsight_api/engine/interface.py +592 -0
- hindsight_api/engine/llm_wrapper.py +176 -6
- hindsight_api/engine/memory_engine.py +1092 -293
- hindsight_api/engine/retain/bank_utils.py +13 -12
- hindsight_api/engine/retain/chunk_storage.py +3 -2
- hindsight_api/engine/retain/fact_storage.py +10 -7
- hindsight_api/engine/retain/link_utils.py +17 -16
- hindsight_api/engine/retain/observation_regeneration.py +17 -16
- hindsight_api/engine/retain/orchestrator.py +2 -3
- hindsight_api/engine/retain/types.py +25 -8
- hindsight_api/engine/search/graph_retrieval.py +6 -5
- hindsight_api/engine/search/mpfp_retrieval.py +8 -7
- hindsight_api/engine/search/reranking.py +17 -0
- hindsight_api/engine/search/retrieval.py +12 -11
- hindsight_api/engine/search/think_utils.py +1 -1
- hindsight_api/engine/search/tracer.py +1 -1
- hindsight_api/engine/task_backend.py +32 -0
- hindsight_api/extensions/__init__.py +66 -0
- hindsight_api/extensions/base.py +81 -0
- hindsight_api/extensions/builtin/__init__.py +18 -0
- hindsight_api/extensions/builtin/tenant.py +33 -0
- hindsight_api/extensions/context.py +110 -0
- hindsight_api/extensions/http.py +89 -0
- hindsight_api/extensions/loader.py +125 -0
- hindsight_api/extensions/operation_validator.py +325 -0
- hindsight_api/extensions/tenant.py +63 -0
- hindsight_api/main.py +97 -17
- hindsight_api/mcp_local.py +7 -1
- hindsight_api/migrations.py +54 -10
- hindsight_api/models.py +15 -0
- hindsight_api/pg0.py +1 -1
- {hindsight_api-0.1.11.dist-info → hindsight_api-0.1.13.dist-info}/METADATA +1 -1
- hindsight_api-0.1.13.dist-info/RECORD +75 -0
- hindsight_api-0.1.11.dist-info/RECORD +0 -64
- {hindsight_api-0.1.11.dist-info → hindsight_api-0.1.13.dist-info}/WHEEL +0 -0
- {hindsight_api-0.1.11.dist-info → hindsight_api-0.1.13.dist-info}/entry_points.txt +0 -0
hindsight_api/engine/llm_wrapper.py

@@ -3,11 +3,13 @@ LLM wrapper for unified configuration across providers.
 """
 
 import asyncio
+import json
 import logging
 import os
 import time
 from typing import Any
 
+import httpx
 from google import genai
 from google.genai import errors as genai_errors
 from google.genai import types as genai_types
@@ -96,7 +98,7 @@ class LLMProvider:
         client_kwargs = {"api_key": self.api_key, "max_retries": 0}
         if self.base_url:
             client_kwargs["base_url"] = self.base_url
-        self._client = AsyncOpenAI(**client_kwargs)
+        self._client = AsyncOpenAI(**client_kwargs) # type: ignore[invalid-argument-type] - dict kwargs
         self._gemini_client = None
 
     async def verify_connection(self) -> None:
@@ -112,7 +114,7 @@ class LLMProvider:
         )
         await self.call(
             messages=[{"role": "user", "content": "Say 'ok'"}],
-            max_completion_tokens=
+            max_completion_tokens=100,
             max_retries=2,
             initial_backoff=0.5,
             max_backoff=2.0,
@@ -157,7 +159,6 @@ class LLMProvider:
         """
         async with _global_llm_semaphore:
             start_time = time.time()
-            import json
 
             # Handle Gemini provider separately
             if self.provider == "gemini":
@@ -165,6 +166,20 @@ class LLMProvider:
                     messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
                 )
 
+            # Handle Ollama with native API for structured output (better schema enforcement)
+            if self.provider == "ollama" and response_format is not None:
+                return await self._call_ollama_native(
+                    messages,
+                    response_format,
+                    max_completion_tokens,
+                    temperature,
+                    max_retries,
+                    initial_backoff,
+                    max_backoff,
+                    skip_validation,
+                    start_time,
+                )
+
             call_params = {
                 "model": self.model,
                 "messages": messages,
@@ -227,7 +242,31 @@ class LLMProvider:
                     response = await self._client.chat.completions.create(**call_params)
 
                     content = response.choices[0].message.content
-
+
+                    # Log raw LLM response for debugging JSON parse issues
+                    try:
+                        json_data = json.loads(content)
+                    except json.JSONDecodeError as json_err:
+                        # Truncate content for logging (first 500 and last 200 chars)
+                        content_preview = content[:500] if content else "<empty>"
+                        if content and len(content) > 700:
+                            content_preview = f"{content[:500]}...TRUNCATED...{content[-200:]}"
+                        logger.warning(
+                            f"JSON parse error from LLM response (attempt {attempt + 1}/{max_retries + 1}): {json_err}\n"
+                            f" Model: {self.provider}/{self.model}\n"
+                            f" Content length: {len(content) if content else 0} chars\n"
+                            f" Content preview: {content_preview!r}\n"
+                            f" Finish reason: {response.choices[0].finish_reason if response.choices else 'unknown'}"
+                        )
+                        # Retry on JSON parse errors - LLM may return valid JSON on next attempt
+                        if attempt < max_retries:
+                            backoff = min(initial_backoff * (2**attempt), max_backoff)
+                            await asyncio.sleep(backoff)
+                            last_exception = json_err
+                            continue
+                        else:
+                            logger.error(f"JSON parse error after {max_retries + 1} attempts, giving up")
+                            raise
 
                     if skip_validation:
                         result = json_data
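For context on the retry logic added above: the backoff doubles with each attempt and is capped at max_backoff, so with the defaults used by verify_connection (initial_backoff=0.5, max_backoff=2.0) the sleeps run roughly 0.5s, 1.0s, 2.0s, 2.0s. A minimal standalone sketch (the helper name is illustrative, not part of the package):

# Sketch of the exponential backoff used in the retry loop above.
def backoff_schedule(max_retries: int, initial_backoff: float = 0.5, max_backoff: float = 2.0) -> list[float]:
    # Same formula as the diff: min(initial_backoff * 2**attempt, max_backoff)
    return [min(initial_backoff * (2 ** attempt), max_backoff) for attempt in range(max_retries)]

print(backoff_schedule(4))  # [0.5, 1.0, 2.0, 2.0]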
@@ -300,6 +339,129 @@ class LLMProvider:
                 raise last_exception
             raise RuntimeError("LLM call failed after all retries with no exception captured")
 
+    async def _call_ollama_native(
+        self,
+        messages: list[dict[str, str]],
+        response_format: Any,
+        max_completion_tokens: int | None,
+        temperature: float | None,
+        max_retries: int,
+        initial_backoff: float,
+        max_backoff: float,
+        skip_validation: bool,
+        start_time: float,
+    ) -> Any:
+        """
+        Call Ollama using native API with JSON schema enforcement.
+
+        Ollama's native API supports passing a full JSON schema in the 'format' parameter,
+        which provides better structured output control than the OpenAI-compatible API.
+        """
+        # Get the JSON schema from the Pydantic model
+        schema = response_format.model_json_schema() if hasattr(response_format, "model_json_schema") else None
+
+        # Build the base URL for Ollama's native API
+        # Default OpenAI-compatible URL is http://localhost:11434/v1
+        # Native API is at http://localhost:11434/api/chat
+        base_url = self.base_url or "http://localhost:11434/v1"
+        if base_url.endswith("/v1"):
+            native_url = base_url[:-3] + "/api/chat"
+        else:
+            native_url = base_url.rstrip("/") + "/api/chat"
+
+        # Build request payload
+        payload = {
+            "model": self.model,
+            "messages": messages,
+            "stream": False,
+        }
+
+        # Add schema as format parameter for structured output
+        if schema:
+            payload["format"] = schema
+
+        # Add optional parameters with optimized defaults for Ollama
+        # Benchmarking shows num_ctx=16384 + num_batch=512 is optimal
+        options = {
+            "num_ctx": 16384, # 16k context window for larger prompts
+            "num_batch": 512, # Optimal batch size for prompt processing
+        }
+        if max_completion_tokens:
+            options["num_predict"] = max_completion_tokens
+        if temperature is not None:
+            options["temperature"] = temperature
+        payload["options"] = options
+
+        last_exception = None
+
+        async with httpx.AsyncClient(timeout=300.0) as client:
+            for attempt in range(max_retries + 1):
+                try:
+                    response = await client.post(native_url, json=payload)
+                    response.raise_for_status()
+
+                    result = response.json()
+                    content = result.get("message", {}).get("content", "")
+
+                    # Parse JSON response
+                    try:
+                        json_data = json.loads(content)
+                    except json.JSONDecodeError as json_err:
+                        content_preview = content[:500] if content else "<empty>"
+                        if content and len(content) > 700:
+                            content_preview = f"{content[:500]}...TRUNCATED...{content[-200:]}"
+                        logger.warning(
+                            f"Ollama JSON parse error (attempt {attempt + 1}/{max_retries + 1}): {json_err}\n"
+                            f" Model: ollama/{self.model}\n"
+                            f" Content length: {len(content) if content else 0} chars\n"
+                            f" Content preview: {content_preview!r}"
+                        )
+                        if attempt < max_retries:
+                            backoff = min(initial_backoff * (2**attempt), max_backoff)
+                            await asyncio.sleep(backoff)
+                            last_exception = json_err
+                            continue
+                        else:
+                            raise
+
+                    # Validate against Pydantic model or return raw JSON
+                    if skip_validation:
+                        return json_data
+                    else:
+                        return response_format.model_validate(json_data)
+
+                except httpx.HTTPStatusError as e:
+                    last_exception = e
+                    if attempt < max_retries:
+                        logger.warning(
+                            f"Ollama HTTP error (attempt {attempt + 1}/{max_retries + 1}): {e.response.status_code}"
+                        )
+                        backoff = min(initial_backoff * (2**attempt), max_backoff)
+                        await asyncio.sleep(backoff)
+                        continue
+                    else:
+                        logger.error(f"Ollama HTTP error after {max_retries + 1} attempts: {e}")
+                        raise
+
+                except httpx.RequestError as e:
+                    last_exception = e
+                    if attempt < max_retries:
+                        logger.warning(f"Ollama connection error (attempt {attempt + 1}/{max_retries + 1}): {e}")
+                        backoff = min(initial_backoff * (2**attempt), max_backoff)
+                        await asyncio.sleep(backoff)
+                        continue
+                    else:
+                        logger.error(f"Ollama connection error after {max_retries + 1} attempts: {e}")
+                        raise
+
+                except Exception as e:
+                    logger.error(f"Unexpected error during Ollama call: {type(e).__name__}: {e}")
+                    raise
+
+        if last_exception:
+            raise last_exception
+        raise RuntimeError("Ollama call failed after all retries")
+
     async def _call_gemini(
         self,
         messages: list[dict[str, str]],
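For reference, a self-contained sketch of the native Ollama call that _call_ollama_native wraps. The payload shape ('format' carrying a full JSON schema, the 'options' dict, and the message/content response fields) mirrors the code above; the model name, prompt, and CityFact schema are illustrative assumptions, and a local Ollama server is assumed to be running.

# Standalone sketch of Ollama's native /api/chat structured-output call.
import asyncio
import json

import httpx
from pydantic import BaseModel


class CityFact(BaseModel):
    city: str
    fact: str


async def main() -> None:
    payload = {
        "model": "llama3.1",  # illustrative model name
        "messages": [{"role": "user", "content": "Give one fact about Paris as JSON."}],
        "stream": False,
        "format": CityFact.model_json_schema(),  # full JSON schema, not just "json"
        "options": {"num_ctx": 16384, "num_batch": 512},
    }
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post("http://localhost:11434/api/chat", json=payload)
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
        print(CityFact.model_validate(json.loads(content)))


asyncio.run(main())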
@@ -311,8 +473,6 @@ class LLMProvider:
         start_time: float,
     ) -> Any:
         """Handle Gemini-specific API calls."""
-        import json
-
         # Convert OpenAI-style messages to Gemini format
         system_instruction = None
         gemini_contents = []
@@ -443,6 +603,8 @@ class LLMProvider:
         """Create provider for memory operations from environment variables."""
         provider = os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq")
         api_key = os.getenv("HINDSIGHT_API_LLM_API_KEY")
+        if not api_key:
+            raise ValueError("HINDSIGHT_API_LLM_API_KEY environment variable is required")
         base_url = os.getenv("HINDSIGHT_API_LLM_BASE_URL", "")
         model = os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b")
 
@@ -453,6 +615,10 @@ class LLMProvider:
         """Create provider for answer generation. Falls back to memory config if not set."""
         provider = os.getenv("HINDSIGHT_API_ANSWER_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
         api_key = os.getenv("HINDSIGHT_API_ANSWER_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
+        if not api_key:
+            raise ValueError(
+                "HINDSIGHT_API_LLM_API_KEY or HINDSIGHT_API_ANSWER_LLM_API_KEY environment variable is required"
+            )
         base_url = os.getenv("HINDSIGHT_API_ANSWER_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
         model = os.getenv("HINDSIGHT_API_ANSWER_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))
 
@@ -463,6 +629,10 @@ class LLMProvider:
         """Create provider for judge/evaluator operations. Falls back to memory config if not set."""
         provider = os.getenv("HINDSIGHT_API_JUDGE_LLM_PROVIDER", os.getenv("HINDSIGHT_API_LLM_PROVIDER", "groq"))
         api_key = os.getenv("HINDSIGHT_API_JUDGE_LLM_API_KEY", os.getenv("HINDSIGHT_API_LLM_API_KEY"))
+        if not api_key:
+            raise ValueError(
+                "HINDSIGHT_API_LLM_API_KEY or HINDSIGHT_API_JUDGE_LLM_API_KEY environment variable is required"
+            )
         base_url = os.getenv("HINDSIGHT_API_JUDGE_LLM_BASE_URL", os.getenv("HINDSIGHT_API_LLM_BASE_URL", ""))
         model = os.getenv("HINDSIGHT_API_JUDGE_LLM_MODEL", os.getenv("HINDSIGHT_API_LLM_MODEL", "openai/gpt-oss-120b"))
 
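Taken together, the three factory hunks above mean a missing API key now fails fast at provider construction instead of surfacing later as a request error. A minimal sketch of the environment these factories read; the variable names, defaults, and fallback chain come from the diff, while the values are placeholders:

# Illustrative configuration for the env-var-driven factories above.
import os

os.environ["HINDSIGHT_API_LLM_PROVIDER"] = "ollama"                     # default is "groq"
os.environ["HINDSIGHT_API_LLM_API_KEY"] = "placeholder-key"             # now required, else ValueError
os.environ["HINDSIGHT_API_LLM_BASE_URL"] = "http://localhost:11434/v1"  # optional
os.environ["HINDSIGHT_API_LLM_MODEL"] = "llama3.1"                      # default is "openai/gpt-oss-120b"

# The answer and judge roles fall back to the base HINDSIGHT_API_LLM_* values
# unless HINDSIGHT_API_ANSWER_LLM_* / HINDSIGHT_API_JUDGE_LLM_* overrides are set.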