hindsight-api 0.1.16__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/api/__init__.py +38 -14
- hindsight_api/api/http.py +100 -9
- hindsight_api/api/mcp.py +203 -52
- hindsight_api/config.py +27 -0
- hindsight_api/engine/interface.py +4 -0
- hindsight_api/engine/llm_wrapper.py +275 -45
- hindsight_api/engine/memory_engine.py +69 -16
- hindsight_api/engine/response_models.py +7 -1
- hindsight_api/engine/retain/entity_processing.py +37 -8
- hindsight_api/engine/retain/fact_extraction.py +49 -6
- hindsight_api/engine/retain/observation_regeneration.py +4 -2
- hindsight_api/engine/retain/orchestrator.py +12 -1
- hindsight_api/engine/retain/types.py +7 -0
- hindsight_api/extensions/context.py +8 -1
- hindsight_api/extensions/operation_validator.py +6 -4
- hindsight_api/main.py +29 -1
- hindsight_api/models.py +3 -0
- {hindsight_api-0.1.16.dist-info → hindsight_api-0.2.0.dist-info}/METADATA +3 -2
- {hindsight_api-0.1.16.dist-info → hindsight_api-0.2.0.dist-info}/RECORD +21 -21
- {hindsight_api-0.1.16.dist-info → hindsight_api-0.2.0.dist-info}/WHEEL +0 -0
- {hindsight_api-0.1.16.dist-info → hindsight_api-0.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -6,6 +6,7 @@ import asyncio
|
|
|
6
6
|
import json
|
|
7
7
|
import logging
|
|
8
8
|
import os
|
|
9
|
+
import re
|
|
9
10
|
import time
|
|
10
11
|
from typing import Any
|
|
11
12
|
|
|
@@ -15,6 +16,13 @@ from google.genai import errors as genai_errors
|
|
|
15
16
|
from google.genai import types as genai_types
|
|
16
17
|
from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinishReasonError
|
|
17
18
|
|
|
19
|
+
from ..config import (
|
|
20
|
+
DEFAULT_LLM_MAX_CONCURRENT,
|
|
21
|
+
DEFAULT_LLM_TIMEOUT,
|
|
22
|
+
ENV_LLM_MAX_CONCURRENT,
|
|
23
|
+
ENV_LLM_TIMEOUT,
|
|
24
|
+
)
|
|
25
|
+
|
|
18
26
|
# Seed applied to every Groq request for deterministic behavior.
|
|
19
27
|
DEFAULT_LLM_SEED = 4242
|
|
20
28
|
|
|
@@ -24,7 +32,9 @@ logger = logging.getLogger(__name__)
|
|
|
24
32
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
25
33
|
|
|
26
34
|
# Global semaphore to limit concurrent LLM requests across all instances
|
|
27
|
-
|
|
35
|
+
# Set HINDSIGHT_API_LLM_MAX_CONCURRENT=1 for local LLMs (LM Studio, Ollama)
|
|
36
|
+
_llm_max_concurrent = int(os.getenv(ENV_LLM_MAX_CONCURRENT, str(DEFAULT_LLM_MAX_CONCURRENT)))
|
|
37
|
+
_global_llm_semaphore = asyncio.Semaphore(_llm_max_concurrent)
|
|
28
38
|
|
|
29
39
|
|
|
30
40
|
class OutputTooLongError(Exception):
|
|
@@ -58,7 +68,7 @@ class LLMProvider:
|
|
|
58
68
|
Initialize LLM provider.
|
|
59
69
|
|
|
60
70
|
Args:
|
|
61
|
-
provider: Provider name ("openai", "groq", "ollama", "gemini").
|
|
71
|
+
provider: Provider name ("openai", "groq", "ollama", "gemini", "anthropic", "lmstudio").
|
|
62
72
|
api_key: API key.
|
|
63
73
|
base_url: Base URL for the API.
|
|
64
74
|
model: Model name.
|
|
@@ -71,7 +81,7 @@ class LLMProvider:
|
|
|
71
81
|
self.reasoning_effort = reasoning_effort
|
|
72
82
|
|
|
73
83
|
# Validate provider
|
|
74
|
-
valid_providers = ["openai", "groq", "ollama", "gemini"]
|
|
84
|
+
valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio"]
|
|
75
85
|
if self.provider not in valid_providers:
|
|
76
86
|
raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")
|
|
77
87
|
|
|
@@ -81,25 +91,48 @@ class LLMProvider:
|
|
|
81
91
|
self.base_url = "https://api.groq.com/openai/v1"
|
|
82
92
|
elif self.provider == "ollama":
|
|
83
93
|
self.base_url = "http://localhost:11434/v1"
|
|
94
|
+
elif self.provider == "lmstudio":
|
|
95
|
+
self.base_url = "http://localhost:1234/v1"
|
|
84
96
|
|
|
85
|
-
# Validate API key (not needed for ollama)
|
|
86
|
-
if self.provider
|
|
97
|
+
# Validate API key (not needed for ollama or lmstudio)
|
|
98
|
+
if self.provider not in ("ollama", "lmstudio") and not self.api_key:
|
|
87
99
|
raise ValueError(f"API key not found for {self.provider}")
|
|
88
100
|
|
|
101
|
+
# Get timeout config (set HINDSIGHT_API_LLM_TIMEOUT for local LLMs that need longer timeouts)
|
|
102
|
+
self.timeout = float(os.getenv(ENV_LLM_TIMEOUT, str(DEFAULT_LLM_TIMEOUT)))
|
|
103
|
+
|
|
89
104
|
# Create client based on provider
|
|
105
|
+
self._client = None
|
|
106
|
+
self._gemini_client = None
|
|
107
|
+
self._anthropic_client = None
|
|
108
|
+
|
|
90
109
|
if self.provider == "gemini":
|
|
91
110
|
self._gemini_client = genai.Client(api_key=self.api_key)
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
111
|
+
elif self.provider == "anthropic":
|
|
112
|
+
from anthropic import AsyncAnthropic
|
|
113
|
+
|
|
114
|
+
# Only pass base_url if it's set (Anthropic uses default URL otherwise)
|
|
115
|
+
anthropic_kwargs = {"api_key": self.api_key}
|
|
116
|
+
if self.base_url:
|
|
117
|
+
anthropic_kwargs["base_url"] = self.base_url
|
|
118
|
+
if self.timeout:
|
|
119
|
+
anthropic_kwargs["timeout"] = self.timeout
|
|
120
|
+
self._anthropic_client = AsyncAnthropic(**anthropic_kwargs)
|
|
121
|
+
elif self.provider in ("ollama", "lmstudio"):
|
|
122
|
+
# Use dummy key if not provided for local
|
|
123
|
+
api_key = self.api_key or "local"
|
|
124
|
+
client_kwargs = {"api_key": api_key, "base_url": self.base_url, "max_retries": 0}
|
|
125
|
+
if self.timeout:
|
|
126
|
+
client_kwargs["timeout"] = self.timeout
|
|
127
|
+
self._client = AsyncOpenAI(**client_kwargs)
|
|
96
128
|
else:
|
|
97
129
|
# Only pass base_url if it's set (OpenAI uses default URL otherwise)
|
|
98
130
|
client_kwargs = {"api_key": self.api_key, "max_retries": 0}
|
|
99
131
|
if self.base_url:
|
|
100
132
|
client_kwargs["base_url"] = self.base_url
|
|
101
|
-
self.
|
|
102
|
-
|
|
133
|
+
if self.timeout:
|
|
134
|
+
client_kwargs["timeout"] = self.timeout
|
|
135
|
+
self._client = AsyncOpenAI(**client_kwargs)
|
|
103
136
|
|
|
104
137
|
async def verify_connection(self) -> None:
|
|
105
138
|
"""
|
|
@@ -135,6 +168,7 @@ class LLMProvider:
|
|
|
135
168
|
initial_backoff: float = 1.0,
|
|
136
169
|
max_backoff: float = 60.0,
|
|
137
170
|
skip_validation: bool = False,
|
|
171
|
+
strict_schema: bool = False,
|
|
138
172
|
) -> Any:
|
|
139
173
|
"""
|
|
140
174
|
Make an LLM API call with retry logic.
|
|
@@ -149,6 +183,7 @@ class LLMProvider:
|
|
|
149
183
|
initial_backoff: Initial backoff time in seconds.
|
|
150
184
|
max_backoff: Maximum backoff time in seconds.
|
|
151
185
|
skip_validation: Return raw JSON without Pydantic validation.
|
|
186
|
+
strict_schema: Use strict JSON schema enforcement (OpenAI only). Guarantees all required fields are returned.
|
|
152
187
|
|
|
153
188
|
Returns:
|
|
154
189
|
Parsed response if response_format is provided, otherwise text content.
|
|
@@ -166,6 +201,19 @@ class LLMProvider:
|
|
|
166
201
|
messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
|
|
167
202
|
)
|
|
168
203
|
|
|
204
|
+
# Handle Anthropic provider separately
|
|
205
|
+
if self.provider == "anthropic":
|
|
206
|
+
return await self._call_anthropic(
|
|
207
|
+
messages,
|
|
208
|
+
response_format,
|
|
209
|
+
max_completion_tokens,
|
|
210
|
+
max_retries,
|
|
211
|
+
initial_backoff,
|
|
212
|
+
max_backoff,
|
|
213
|
+
skip_validation,
|
|
214
|
+
start_time,
|
|
215
|
+
)
|
|
216
|
+
|
|
169
217
|
# Handle Ollama with native API for structured output (better schema enforcement)
|
|
170
218
|
if self.provider == "ollama" and response_format is not None:
|
|
171
219
|
return await self._call_ollama_native(
|
|
@@ -226,47 +274,93 @@ class LLMProvider:
|
|
|
226
274
|
for attempt in range(max_retries + 1):
|
|
227
275
|
try:
|
|
228
276
|
if response_format is not None:
|
|
229
|
-
|
|
277
|
+
schema = None
|
|
230
278
|
if hasattr(response_format, "model_json_schema"):
|
|
231
279
|
schema = response_format.model_json_schema()
|
|
232
|
-
schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
|
|
233
|
-
|
|
234
|
-
if call_params["messages"] and call_params["messages"][0].get("role") == "system":
|
|
235
|
-
call_params["messages"][0]["content"] += schema_msg
|
|
236
|
-
elif call_params["messages"]:
|
|
237
|
-
call_params["messages"][0]["content"] = (
|
|
238
|
-
schema_msg + "\n\n" + call_params["messages"][0]["content"]
|
|
239
|
-
)
|
|
240
280
|
|
|
241
|
-
|
|
281
|
+
if strict_schema and schema is not None:
|
|
282
|
+
# Use OpenAI's strict JSON schema enforcement
|
|
283
|
+
# This guarantees all required fields are returned
|
|
284
|
+
call_params["response_format"] = {
|
|
285
|
+
"type": "json_schema",
|
|
286
|
+
"json_schema": {
|
|
287
|
+
"name": "response",
|
|
288
|
+
"strict": True,
|
|
289
|
+
"schema": schema,
|
|
290
|
+
},
|
|
291
|
+
}
|
|
292
|
+
else:
|
|
293
|
+
# Soft enforcement: add schema to prompt and use json_object mode
|
|
294
|
+
if schema is not None:
|
|
295
|
+
schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
|
|
296
|
+
|
|
297
|
+
if call_params["messages"] and call_params["messages"][0].get("role") == "system":
|
|
298
|
+
call_params["messages"][0]["content"] += schema_msg
|
|
299
|
+
elif call_params["messages"]:
|
|
300
|
+
call_params["messages"][0]["content"] = (
|
|
301
|
+
schema_msg + "\n\n" + call_params["messages"][0]["content"]
|
|
302
|
+
)
|
|
303
|
+
if self.provider not in ("lmstudio", "ollama"):
|
|
304
|
+
# LM Studio and Ollama don't support json_object response format reliably
|
|
305
|
+
# We rely on the schema in the system message instead
|
|
306
|
+
call_params["response_format"] = {"type": "json_object"}
|
|
307
|
+
|
|
308
|
+
logger.debug(f"Sending request to {self.provider}/{self.model} (timeout={self.timeout})")
|
|
242
309
|
response = await self._client.chat.completions.create(**call_params)
|
|
310
|
+
logger.debug(f"Received response from {self.provider}/{self.model}")
|
|
243
311
|
|
|
244
312
|
content = response.choices[0].message.content
|
|
245
313
|
|
|
246
|
-
#
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
f"
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
if
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
314
|
+
# Strip reasoning model thinking tags
|
|
315
|
+
# Supports: <think>, <thinking>, <reasoning>, |startthink|/|endthink|
|
|
316
|
+
# for reasoning models that embed thinking in their output (e.g., Qwen3, DeepSeek)
|
|
317
|
+
if content:
|
|
318
|
+
original_len = len(content)
|
|
319
|
+
content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL)
|
|
320
|
+
content = re.sub(r"<thinking>.*?</thinking>", "", content, flags=re.DOTALL)
|
|
321
|
+
content = re.sub(r"<reasoning>.*?</reasoning>", "", content, flags=re.DOTALL)
|
|
322
|
+
content = re.sub(r"\|startthink\|.*?\|endthink\|", "", content, flags=re.DOTALL)
|
|
323
|
+
content = content.strip()
|
|
324
|
+
if len(content) < original_len:
|
|
325
|
+
logger.debug(f"Stripped {original_len - len(content)} chars of reasoning tokens")
|
|
326
|
+
|
|
327
|
+
# For local models, they may wrap JSON in markdown code blocks
|
|
328
|
+
if self.provider in ("lmstudio", "ollama"):
|
|
329
|
+
clean_content = content
|
|
330
|
+
if "```json" in content:
|
|
331
|
+
clean_content = content.split("```json")[1].split("```")[0].strip()
|
|
332
|
+
elif "```" in content:
|
|
333
|
+
clean_content = content.split("```")[1].split("```")[0].strip()
|
|
334
|
+
try:
|
|
335
|
+
json_data = json.loads(clean_content)
|
|
336
|
+
except json.JSONDecodeError:
|
|
337
|
+
# Fallback to parsing raw content
|
|
338
|
+
json_data = json.loads(content)
|
|
339
|
+
else:
|
|
340
|
+
# Log raw LLM response for debugging JSON parse issues
|
|
341
|
+
try:
|
|
342
|
+
json_data = json.loads(content)
|
|
343
|
+
except json.JSONDecodeError as json_err:
|
|
344
|
+
# Truncate content for logging (first 500 and last 200 chars)
|
|
345
|
+
content_preview = content[:500] if content else "<empty>"
|
|
346
|
+
if content and len(content) > 700:
|
|
347
|
+
content_preview = f"{content[:500]}...TRUNCATED...{content[-200:]}"
|
|
348
|
+
logger.warning(
|
|
349
|
+
f"JSON parse error from LLM response (attempt {attempt + 1}/{max_retries + 1}): {json_err}\n"
|
|
350
|
+
f" Model: {self.provider}/{self.model}\n"
|
|
351
|
+
f" Content length: {len(content) if content else 0} chars\n"
|
|
352
|
+
f" Content preview: {content_preview!r}\n"
|
|
353
|
+
f" Finish reason: {response.choices[0].finish_reason if response.choices else 'unknown'}"
|
|
354
|
+
)
|
|
355
|
+
# Retry on JSON parse errors - LLM may return valid JSON on next attempt
|
|
356
|
+
if attempt < max_retries:
|
|
357
|
+
backoff = min(initial_backoff * (2**attempt), max_backoff)
|
|
358
|
+
await asyncio.sleep(backoff)
|
|
359
|
+
last_exception = json_err
|
|
360
|
+
continue
|
|
361
|
+
else:
|
|
362
|
+
logger.error(f"JSON parse error after {max_retries + 1} attempts, giving up")
|
|
363
|
+
raise
|
|
270
364
|
|
|
271
365
|
if skip_validation:
|
|
272
366
|
result = json_data
|
|
@@ -339,6 +433,142 @@ class LLMProvider:
|
|
|
339
433
|
raise last_exception
|
|
340
434
|
raise RuntimeError("LLM call failed after all retries with no exception captured")
|
|
341
435
|
|
|
436
|
+
async def _call_anthropic(
|
|
437
|
+
self,
|
|
438
|
+
messages: list[dict[str, str]],
|
|
439
|
+
response_format: Any | None,
|
|
440
|
+
max_completion_tokens: int | None,
|
|
441
|
+
max_retries: int,
|
|
442
|
+
initial_backoff: float,
|
|
443
|
+
max_backoff: float,
|
|
444
|
+
skip_validation: bool,
|
|
445
|
+
start_time: float,
|
|
446
|
+
) -> Any:
|
|
447
|
+
"""Handle Anthropic-specific API calls."""
|
|
448
|
+
from anthropic import APIConnectionError, APIStatusError, RateLimitError
|
|
449
|
+
|
|
450
|
+
# Convert OpenAI-style messages to Anthropic format
|
|
451
|
+
system_prompt = None
|
|
452
|
+
anthropic_messages = []
|
|
453
|
+
|
|
454
|
+
for msg in messages:
|
|
455
|
+
role = msg.get("role", "user")
|
|
456
|
+
content = msg.get("content", "")
|
|
457
|
+
|
|
458
|
+
if role == "system":
|
|
459
|
+
if system_prompt:
|
|
460
|
+
system_prompt += "\n\n" + content
|
|
461
|
+
else:
|
|
462
|
+
system_prompt = content
|
|
463
|
+
else:
|
|
464
|
+
anthropic_messages.append({"role": role, "content": content})
|
|
465
|
+
|
|
466
|
+
# Add JSON schema instruction if response_format is provided
|
|
467
|
+
if response_format is not None and hasattr(response_format, "model_json_schema"):
|
|
468
|
+
schema = response_format.model_json_schema()
|
|
469
|
+
schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
|
|
470
|
+
if system_prompt:
|
|
471
|
+
system_prompt += schema_msg
|
|
472
|
+
else:
|
|
473
|
+
system_prompt = schema_msg
|
|
474
|
+
|
|
475
|
+
# Prepare parameters
|
|
476
|
+
call_params = {
|
|
477
|
+
"model": self.model,
|
|
478
|
+
"messages": anthropic_messages,
|
|
479
|
+
"max_tokens": max_completion_tokens if max_completion_tokens is not None else 4096,
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
if system_prompt:
|
|
483
|
+
call_params["system"] = system_prompt
|
|
484
|
+
|
|
485
|
+
last_exception = None
|
|
486
|
+
|
|
487
|
+
for attempt in range(max_retries + 1):
|
|
488
|
+
try:
|
|
489
|
+
response = await self._anthropic_client.messages.create(**call_params)
|
|
490
|
+
|
|
491
|
+
# Anthropic response content is a list of blocks
|
|
492
|
+
content = ""
|
|
493
|
+
for block in response.content:
|
|
494
|
+
if block.type == "text":
|
|
495
|
+
content += block.text
|
|
496
|
+
|
|
497
|
+
if response_format is not None:
|
|
498
|
+
# Models may wrap JSON in markdown code blocks
|
|
499
|
+
clean_content = content
|
|
500
|
+
if "```json" in content:
|
|
501
|
+
clean_content = content.split("```json")[1].split("```")[0].strip()
|
|
502
|
+
elif "```" in content:
|
|
503
|
+
clean_content = content.split("```")[1].split("```")[0].strip()
|
|
504
|
+
|
|
505
|
+
try:
|
|
506
|
+
json_data = json.loads(clean_content)
|
|
507
|
+
except json.JSONDecodeError:
|
|
508
|
+
# Fallback to parsing raw content if markdown stripping failed
|
|
509
|
+
json_data = json.loads(content)
|
|
510
|
+
|
|
511
|
+
if skip_validation:
|
|
512
|
+
result = json_data
|
|
513
|
+
else:
|
|
514
|
+
result = response_format.model_validate(json_data)
|
|
515
|
+
else:
|
|
516
|
+
result = content
|
|
517
|
+
|
|
518
|
+
# Log slow calls
|
|
519
|
+
duration = time.time() - start_time
|
|
520
|
+
if duration > 10.0:
|
|
521
|
+
input_tokens = response.usage.input_tokens
|
|
522
|
+
output_tokens = response.usage.output_tokens
|
|
523
|
+
logger.info(
|
|
524
|
+
f"slow llm call: model={self.provider}/{self.model}, "
|
|
525
|
+
f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
|
|
526
|
+
f"time={duration:.3f}s"
|
|
527
|
+
)
|
|
528
|
+
|
|
529
|
+
return result
|
|
530
|
+
|
|
531
|
+
except json.JSONDecodeError as e:
|
|
532
|
+
last_exception = e
|
|
533
|
+
if attempt < max_retries:
|
|
534
|
+
logger.warning("Anthropic returned invalid JSON, retrying...")
|
|
535
|
+
backoff = min(initial_backoff * (2**attempt), max_backoff)
|
|
536
|
+
await asyncio.sleep(backoff)
|
|
537
|
+
continue
|
|
538
|
+
else:
|
|
539
|
+
logger.error(f"Anthropic returned invalid JSON after {max_retries + 1} attempts")
|
|
540
|
+
raise
|
|
541
|
+
|
|
542
|
+
except (APIConnectionError, RateLimitError, APIStatusError) as e:
|
|
543
|
+
# Fast fail on 401/403
|
|
544
|
+
if isinstance(e, APIStatusError) and e.status_code in (401, 403):
|
|
545
|
+
logger.error(f"Anthropic auth error (HTTP {e.status_code}), not retrying: {str(e)}")
|
|
546
|
+
raise
|
|
547
|
+
|
|
548
|
+
last_exception = e
|
|
549
|
+
if attempt < max_retries:
|
|
550
|
+
# Check if it's a rate limit or server error
|
|
551
|
+
should_retry = isinstance(e, (APIConnectionError, RateLimitError)) or (
|
|
552
|
+
isinstance(e, APIStatusError) and e.status_code >= 500
|
|
553
|
+
)
|
|
554
|
+
|
|
555
|
+
if should_retry:
|
|
556
|
+
backoff = min(initial_backoff * (2**attempt), max_backoff)
|
|
557
|
+
jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
|
|
558
|
+
await asyncio.sleep(backoff + jitter)
|
|
559
|
+
continue
|
|
560
|
+
|
|
561
|
+
logger.error(f"Anthropic API error after {max_retries + 1} attempts: {str(e)}")
|
|
562
|
+
raise
|
|
563
|
+
|
|
564
|
+
except Exception as e:
|
|
565
|
+
logger.error(f"Unexpected error during Anthropic call: {type(e).__name__}: {str(e)}")
|
|
566
|
+
raise
|
|
567
|
+
|
|
568
|
+
if last_exception:
|
|
569
|
+
raise last_exception
|
|
570
|
+
raise RuntimeError("Anthropic call failed after all retries")
|
|
571
|
+
|
|
342
572
|
async def _call_ollama_native(
|
|
343
573
|
self,
|
|
344
574
|
messages: list[dict[str, str]],
|
|
@@ -17,6 +17,8 @@ import uuid
|
|
|
17
17
|
from datetime import UTC, datetime, timedelta
|
|
18
18
|
from typing import TYPE_CHECKING, Any
|
|
19
19
|
|
|
20
|
+
from ..config import get_config
|
|
21
|
+
|
|
20
22
|
# Context variable for current schema (async-safe, per-task isolation)
|
|
21
23
|
_current_schema: contextvars.ContextVar[str] = contextvars.ContextVar("current_schema", default="public")
|
|
22
24
|
|
|
@@ -372,7 +374,7 @@ class MemoryEngine(MemoryEngineInterface):
|
|
|
372
374
|
|
|
373
375
|
result = await validation_coro
|
|
374
376
|
if not result.allowed:
|
|
375
|
-
raise OperationValidationError(result.reason or "Operation not allowed")
|
|
377
|
+
raise OperationValidationError(result.reason or "Operation not allowed", result.status_code)
|
|
376
378
|
|
|
377
379
|
async def _authenticate_tenant(self, request_context: "RequestContext | None") -> str:
|
|
378
380
|
"""
|
|
@@ -399,7 +401,9 @@ class MemoryEngine(MemoryEngineInterface):
|
|
|
399
401
|
if request_context is None:
|
|
400
402
|
raise AuthenticationError("RequestContext is required when tenant extension is configured")
|
|
401
403
|
|
|
404
|
+
# Let AuthenticationError propagate - HTTP layer will convert to 401
|
|
402
405
|
tenant_context = await self._tenant_extension.authenticate(request_context)
|
|
406
|
+
|
|
403
407
|
_current_schema.set(tenant_context.schema_name)
|
|
404
408
|
return tenant_context.schema_name
|
|
405
409
|
|
|
@@ -2825,13 +2829,16 @@ Guidelines:
|
|
|
2825
2829
|
Handler for form opinion tasks.
|
|
2826
2830
|
|
|
2827
2831
|
Args:
|
|
2828
|
-
task_dict: Dict with keys: 'bank_id', 'answer_text', 'query'
|
|
2832
|
+
task_dict: Dict with keys: 'bank_id', 'answer_text', 'query', and optionally 'tenant_id'
|
|
2829
2833
|
"""
|
|
2830
2834
|
bank_id = task_dict["bank_id"]
|
|
2831
2835
|
answer_text = task_dict["answer_text"]
|
|
2832
2836
|
query = task_dict["query"]
|
|
2837
|
+
tenant_id = task_dict.get("tenant_id")
|
|
2833
2838
|
|
|
2834
|
-
await self._extract_and_store_opinions_async(
|
|
2839
|
+
await self._extract_and_store_opinions_async(
|
|
2840
|
+
bank_id=bank_id, answer_text=answer_text, query=query, tenant_id=tenant_id
|
|
2841
|
+
)
|
|
2835
2842
|
|
|
2836
2843
|
async def _handle_reinforce_opinion(self, task_dict: dict[str, Any]):
|
|
2837
2844
|
"""
|
|
@@ -3076,6 +3083,8 @@ Guidelines:
|
|
|
3076
3083
|
*,
|
|
3077
3084
|
budget: Budget | None = None,
|
|
3078
3085
|
context: str | None = None,
|
|
3086
|
+
max_tokens: int = 4096,
|
|
3087
|
+
response_schema: dict | None = None,
|
|
3079
3088
|
request_context: "RequestContext",
|
|
3080
3089
|
) -> ReflectResult:
|
|
3081
3090
|
"""
|
|
@@ -3087,19 +3096,22 @@ Guidelines:
|
|
|
3087
3096
|
3. Retrieves existing opinions (bank's formed perspectives)
|
|
3088
3097
|
4. Uses LLM to formulate an answer
|
|
3089
3098
|
5. Extracts and stores any new opinions formed during reflection
|
|
3090
|
-
6.
|
|
3099
|
+
6. Optionally generates structured output based on response_schema
|
|
3100
|
+
7. Returns plain text answer and the facts used
|
|
3091
3101
|
|
|
3092
3102
|
Args:
|
|
3093
3103
|
bank_id: bank identifier
|
|
3094
3104
|
query: Question to answer
|
|
3095
3105
|
budget: Budget level for memory exploration (low=100, mid=300, high=600 units)
|
|
3096
3106
|
context: Additional context string to include in LLM prompt (not used in recall)
|
|
3107
|
+
response_schema: Optional JSON Schema for structured output
|
|
3097
3108
|
|
|
3098
3109
|
Returns:
|
|
3099
3110
|
ReflectResult containing:
|
|
3100
3111
|
- text: Plain text answer (no markdown)
|
|
3101
3112
|
- based_on: Dict with 'world', 'experience', and 'opinion' fact lists (MemoryFact objects)
|
|
3102
3113
|
- new_opinions: List of newly formed opinions
|
|
3114
|
+
- structured_output: Optional dict if response_schema was provided
|
|
3103
3115
|
"""
|
|
3104
3116
|
# Use cached LLM config
|
|
3105
3117
|
if self._llm_config is None:
|
|
@@ -3177,21 +3189,53 @@ Guidelines:
|
|
|
3177
3189
|
log_buffer.append(f"[REFLECT {reflect_id}] Prompt: {len(prompt)} chars")
|
|
3178
3190
|
|
|
3179
3191
|
system_message = think_utils.get_system_message(disposition)
|
|
3192
|
+
messages = [{"role": "system", "content": system_message}, {"role": "user", "content": prompt}]
|
|
3193
|
+
|
|
3194
|
+
# Prepare response_format if schema provided
|
|
3195
|
+
response_format = None
|
|
3196
|
+
if response_schema is not None:
|
|
3197
|
+
# Wrapper class to provide Pydantic-like interface for raw JSON schemas
|
|
3198
|
+
class JsonSchemaWrapper:
|
|
3199
|
+
def __init__(self, schema: dict):
|
|
3200
|
+
self._schema = schema
|
|
3201
|
+
|
|
3202
|
+
def model_json_schema(self):
|
|
3203
|
+
return self._schema
|
|
3204
|
+
|
|
3205
|
+
response_format = JsonSchemaWrapper(response_schema)
|
|
3180
3206
|
|
|
3181
3207
|
llm_start = time.time()
|
|
3182
|
-
|
|
3183
|
-
messages=
|
|
3184
|
-
scope="
|
|
3185
|
-
|
|
3186
|
-
|
|
3208
|
+
result = await self._llm_config.call(
|
|
3209
|
+
messages=messages,
|
|
3210
|
+
scope="memory_reflect",
|
|
3211
|
+
max_completion_tokens=max_tokens,
|
|
3212
|
+
response_format=response_format,
|
|
3213
|
+
skip_validation=True if response_format else False,
|
|
3214
|
+
# Don't enforce strict_schema - not all providers support it, and those that don't may retry forever
|
|
3215
|
+
# Soft enforcement (schema in prompt + json_object mode) is sufficient
|
|
3216
|
+
strict_schema=False,
|
|
3187
3217
|
)
|
|
3188
3218
|
llm_time = time.time() - llm_start
|
|
3189
3219
|
|
|
3190
|
-
|
|
3220
|
+
# Handle response based on whether structured output was requested
|
|
3221
|
+
if response_schema is not None:
|
|
3222
|
+
structured_output = result
|
|
3223
|
+
answer_text = "" # Empty for backward compatibility
|
|
3224
|
+
log_buffer.append(f"[REFLECT {reflect_id}] Structured output generated")
|
|
3225
|
+
else:
|
|
3226
|
+
structured_output = None
|
|
3227
|
+
answer_text = result.strip()
|
|
3191
3228
|
|
|
3192
3229
|
# Submit form_opinion task for background processing
|
|
3230
|
+
# Pass tenant_id from request context for internal authentication in background task
|
|
3193
3231
|
await self._task_backend.submit_task(
|
|
3194
|
-
{
|
|
3232
|
+
{
|
|
3233
|
+
"type": "form_opinion",
|
|
3234
|
+
"bank_id": bank_id,
|
|
3235
|
+
"answer_text": answer_text,
|
|
3236
|
+
"query": query,
|
|
3237
|
+
"tenant_id": getattr(request_context, "tenant_id", None) if request_context else None,
|
|
3238
|
+
}
|
|
3195
3239
|
)
|
|
3196
3240
|
|
|
3197
3241
|
total_time = time.time() - reflect_start
|
|
@@ -3205,6 +3249,7 @@ Guidelines:
|
|
|
3205
3249
|
text=answer_text,
|
|
3206
3250
|
based_on={"world": world_results, "experience": agent_results, "opinion": opinion_results},
|
|
3207
3251
|
new_opinions=[], # Opinions are being extracted asynchronously
|
|
3252
|
+
structured_output=structured_output,
|
|
3208
3253
|
)
|
|
3209
3254
|
|
|
3210
3255
|
# Call post-operation hook if validator is configured
|
|
@@ -3228,7 +3273,9 @@ Guidelines:
|
|
|
3228
3273
|
|
|
3229
3274
|
return result
|
|
3230
3275
|
|
|
3231
|
-
async def _extract_and_store_opinions_async(
|
|
3276
|
+
async def _extract_and_store_opinions_async(
|
|
3277
|
+
self, bank_id: str, answer_text: str, query: str, tenant_id: str | None = None
|
|
3278
|
+
):
|
|
3232
3279
|
"""
|
|
3233
3280
|
Background task to extract and store opinions from think response.
|
|
3234
3281
|
|
|
@@ -3238,6 +3285,7 @@ Guidelines:
|
|
|
3238
3285
|
bank_id: Bank identifier
|
|
3239
3286
|
answer_text: The generated answer text
|
|
3240
3287
|
query: The original query
|
|
3288
|
+
tenant_id: Tenant identifier for internal authentication
|
|
3241
3289
|
"""
|
|
3242
3290
|
try:
|
|
3243
3291
|
# Extract opinions from the answer
|
|
@@ -3248,10 +3296,11 @@ Guidelines:
|
|
|
3248
3296
|
from datetime import datetime
|
|
3249
3297
|
|
|
3250
3298
|
current_time = datetime.now(UTC)
|
|
3251
|
-
# Use internal
|
|
3299
|
+
# Use internal context with tenant_id for background authentication
|
|
3300
|
+
# Extension can check internal=True to bypass normal auth
|
|
3252
3301
|
from hindsight_api.models import RequestContext
|
|
3253
3302
|
|
|
3254
|
-
internal_context = RequestContext()
|
|
3303
|
+
internal_context = RequestContext(tenant_id=tenant_id, internal=True)
|
|
3255
3304
|
for opinion in new_opinions:
|
|
3256
3305
|
await self.retain_async(
|
|
3257
3306
|
bank_id=bank_id,
|
|
@@ -3572,7 +3621,7 @@ Guidelines:
|
|
|
3572
3621
|
self,
|
|
3573
3622
|
bank_id: str,
|
|
3574
3623
|
entity_ids: list[str],
|
|
3575
|
-
min_facts: int =
|
|
3624
|
+
min_facts: int | None = None,
|
|
3576
3625
|
conn=None,
|
|
3577
3626
|
request_context: "RequestContext | None" = None,
|
|
3578
3627
|
) -> None:
|
|
@@ -3584,12 +3633,16 @@ Guidelines:
|
|
|
3584
3633
|
Args:
|
|
3585
3634
|
bank_id: Bank identifier
|
|
3586
3635
|
entity_ids: List of entity IDs to process
|
|
3587
|
-
min_facts: Minimum facts required to regenerate observations
|
|
3636
|
+
min_facts: Minimum facts required to regenerate observations (uses config default if None)
|
|
3588
3637
|
conn: Optional database connection (for transactional atomicity)
|
|
3589
3638
|
"""
|
|
3590
3639
|
if not bank_id or not entity_ids:
|
|
3591
3640
|
return
|
|
3592
3641
|
|
|
3642
|
+
# Use config default if min_facts not specified
|
|
3643
|
+
if min_facts is None:
|
|
3644
|
+
min_facts = get_config().observation_min_facts
|
|
3645
|
+
|
|
3593
3646
|
# Convert to UUIDs
|
|
3594
3647
|
entity_uuids = [uuid.UUID(eid) if isinstance(eid, str) else eid for eid in entity_ids]
|
|
3595
3648
|
|
|
@@ -123,7 +123,8 @@ class ReflectResult(BaseModel):
|
|
|
123
123
|
Result from a reflect operation.
|
|
124
124
|
|
|
125
125
|
Contains the formulated answer, the facts it was based on (organized by type),
|
|
126
|
-
|
|
126
|
+
any new opinions that were formed during the reflection process, and optionally
|
|
127
|
+
structured output if a response schema was provided.
|
|
127
128
|
"""
|
|
128
129
|
|
|
129
130
|
model_config = ConfigDict(
|
|
@@ -145,6 +146,7 @@ class ReflectResult(BaseModel):
|
|
|
145
146
|
"opinion": [],
|
|
146
147
|
},
|
|
147
148
|
"new_opinions": ["Machine learning has great potential in healthcare"],
|
|
149
|
+
"structured_output": {"summary": "ML in healthcare", "confidence": 0.9},
|
|
148
150
|
}
|
|
149
151
|
}
|
|
150
152
|
)
|
|
@@ -154,6 +156,10 @@ class ReflectResult(BaseModel):
|
|
|
154
156
|
description="Facts used to formulate the answer, organized by type (world, experience, opinion)"
|
|
155
157
|
)
|
|
156
158
|
new_opinions: list[str] = Field(default_factory=list, description="List of newly formed opinions during reflection")
|
|
159
|
+
structured_output: dict[str, Any] | None = Field(
|
|
160
|
+
default=None,
|
|
161
|
+
description="Structured output parsed according to the provided response schema. Only present when response_schema was provided.",
|
|
162
|
+
)
|
|
157
163
|
|
|
158
164
|
|
|
159
165
|
class Opinion(BaseModel):
|