hindsight-api 0.1.16__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,7 @@ import asyncio
6
6
  import json
7
7
  import logging
8
8
  import os
9
+ import re
9
10
  import time
10
11
  from typing import Any
11
12
 
@@ -15,6 +16,13 @@ from google.genai import errors as genai_errors
15
16
  from google.genai import types as genai_types
16
17
  from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinishReasonError
17
18
 
19
+ from ..config import (
20
+ DEFAULT_LLM_MAX_CONCURRENT,
21
+ DEFAULT_LLM_TIMEOUT,
22
+ ENV_LLM_MAX_CONCURRENT,
23
+ ENV_LLM_TIMEOUT,
24
+ )
25
+
18
26
  # Seed applied to every Groq request for deterministic behavior.
19
27
  DEFAULT_LLM_SEED = 4242
20
28
 
@@ -24,7 +32,9 @@ logger = logging.getLogger(__name__)
24
32
  logging.getLogger("httpx").setLevel(logging.WARNING)
25
33
 
26
34
  # Global semaphore to limit concurrent LLM requests across all instances
27
- _global_llm_semaphore = asyncio.Semaphore(32)
35
+ # Set HINDSIGHT_API_LLM_MAX_CONCURRENT=1 for local LLMs (LM Studio, Ollama)
36
+ _llm_max_concurrent = int(os.getenv(ENV_LLM_MAX_CONCURRENT, str(DEFAULT_LLM_MAX_CONCURRENT)))
37
+ _global_llm_semaphore = asyncio.Semaphore(_llm_max_concurrent)
28
38
 
29
39
 
30
40
  class OutputTooLongError(Exception):
@@ -58,7 +68,7 @@ class LLMProvider:
58
68
  Initialize LLM provider.
59
69
 
60
70
  Args:
61
- provider: Provider name ("openai", "groq", "ollama", "gemini").
71
+ provider: Provider name ("openai", "groq", "ollama", "gemini", "anthropic", "lmstudio").
62
72
  api_key: API key.
63
73
  base_url: Base URL for the API.
64
74
  model: Model name.
@@ -71,7 +81,7 @@ class LLMProvider:
71
81
  self.reasoning_effort = reasoning_effort
72
82
 
73
83
  # Validate provider
74
- valid_providers = ["openai", "groq", "ollama", "gemini"]
84
+ valid_providers = ["openai", "groq", "ollama", "gemini", "anthropic", "lmstudio"]
75
85
  if self.provider not in valid_providers:
76
86
  raise ValueError(f"Invalid LLM provider: {self.provider}. Must be one of: {', '.join(valid_providers)}")
77
87
 
@@ -81,25 +91,48 @@ class LLMProvider:
81
91
  self.base_url = "https://api.groq.com/openai/v1"
82
92
  elif self.provider == "ollama":
83
93
  self.base_url = "http://localhost:11434/v1"
94
+ elif self.provider == "lmstudio":
95
+ self.base_url = "http://localhost:1234/v1"
84
96
 
85
- # Validate API key (not needed for ollama)
86
- if self.provider != "ollama" and not self.api_key:
97
+ # Validate API key (not needed for ollama or lmstudio)
98
+ if self.provider not in ("ollama", "lmstudio") and not self.api_key:
87
99
  raise ValueError(f"API key not found for {self.provider}")
88
100
 
101
+ # Get timeout config (set HINDSIGHT_API_LLM_TIMEOUT for local LLMs that need longer timeouts)
102
+ self.timeout = float(os.getenv(ENV_LLM_TIMEOUT, str(DEFAULT_LLM_TIMEOUT)))
103
+
89
104
  # Create client based on provider
105
+ self._client = None
106
+ self._gemini_client = None
107
+ self._anthropic_client = None
108
+
90
109
  if self.provider == "gemini":
91
110
  self._gemini_client = genai.Client(api_key=self.api_key)
92
- self._client = None
93
- elif self.provider == "ollama":
94
- self._client = AsyncOpenAI(api_key="ollama", base_url=self.base_url, max_retries=0)
95
- self._gemini_client = None
111
+ elif self.provider == "anthropic":
112
+ from anthropic import AsyncAnthropic
113
+
114
+ # Only pass base_url if it's set (Anthropic uses default URL otherwise)
115
+ anthropic_kwargs = {"api_key": self.api_key}
116
+ if self.base_url:
117
+ anthropic_kwargs["base_url"] = self.base_url
118
+ if self.timeout:
119
+ anthropic_kwargs["timeout"] = self.timeout
120
+ self._anthropic_client = AsyncAnthropic(**anthropic_kwargs)
121
+ elif self.provider in ("ollama", "lmstudio"):
122
+ # Use dummy key if not provided for local
123
+ api_key = self.api_key or "local"
124
+ client_kwargs = {"api_key": api_key, "base_url": self.base_url, "max_retries": 0}
125
+ if self.timeout:
126
+ client_kwargs["timeout"] = self.timeout
127
+ self._client = AsyncOpenAI(**client_kwargs)
96
128
  else:
97
129
  # Only pass base_url if it's set (OpenAI uses default URL otherwise)
98
130
  client_kwargs = {"api_key": self.api_key, "max_retries": 0}
99
131
  if self.base_url:
100
132
  client_kwargs["base_url"] = self.base_url
101
- self._client = AsyncOpenAI(**client_kwargs) # type: ignore[invalid-argument-type] - dict kwargs
102
- self._gemini_client = None
133
+ if self.timeout:
134
+ client_kwargs["timeout"] = self.timeout
135
+ self._client = AsyncOpenAI(**client_kwargs)
103
136
 
104
137
  async def verify_connection(self) -> None:
105
138
  """
@@ -135,6 +168,7 @@ class LLMProvider:
135
168
  initial_backoff: float = 1.0,
136
169
  max_backoff: float = 60.0,
137
170
  skip_validation: bool = False,
171
+ strict_schema: bool = False,
138
172
  ) -> Any:
139
173
  """
140
174
  Make an LLM API call with retry logic.
@@ -149,6 +183,7 @@ class LLMProvider:
149
183
  initial_backoff: Initial backoff time in seconds.
150
184
  max_backoff: Maximum backoff time in seconds.
151
185
  skip_validation: Return raw JSON without Pydantic validation.
186
+ strict_schema: Use strict JSON schema enforcement (OpenAI only). Guarantees all required fields.
152
187
 
153
188
  Returns:
154
189
  Parsed response if response_format is provided, otherwise text content.
@@ -166,6 +201,19 @@ class LLMProvider:
166
201
  messages, response_format, max_retries, initial_backoff, max_backoff, skip_validation, start_time
167
202
  )
168
203
 
204
+ # Handle Anthropic provider separately
205
+ if self.provider == "anthropic":
206
+ return await self._call_anthropic(
207
+ messages,
208
+ response_format,
209
+ max_completion_tokens,
210
+ max_retries,
211
+ initial_backoff,
212
+ max_backoff,
213
+ skip_validation,
214
+ start_time,
215
+ )
216
+
169
217
  # Handle Ollama with native API for structured output (better schema enforcement)
170
218
  if self.provider == "ollama" and response_format is not None:
171
219
  return await self._call_ollama_native(
@@ -226,47 +274,93 @@ class LLMProvider:
226
274
  for attempt in range(max_retries + 1):
227
275
  try:
228
276
  if response_format is not None:
229
- # Add schema to system message for JSON mode
277
+ schema = None
230
278
  if hasattr(response_format, "model_json_schema"):
231
279
  schema = response_format.model_json_schema()
232
- schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
233
-
234
- if call_params["messages"] and call_params["messages"][0].get("role") == "system":
235
- call_params["messages"][0]["content"] += schema_msg
236
- elif call_params["messages"]:
237
- call_params["messages"][0]["content"] = (
238
- schema_msg + "\n\n" + call_params["messages"][0]["content"]
239
- )
240
280
 
241
- call_params["response_format"] = {"type": "json_object"}
281
+ if strict_schema and schema is not None:
282
+ # Use OpenAI's strict JSON schema enforcement
283
+ # This guarantees all required fields are returned
284
+ call_params["response_format"] = {
285
+ "type": "json_schema",
286
+ "json_schema": {
287
+ "name": "response",
288
+ "strict": True,
289
+ "schema": schema,
290
+ },
291
+ }
292
+ else:
293
+ # Soft enforcement: add schema to prompt and use json_object mode
294
+ if schema is not None:
295
+ schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
296
+
297
+ if call_params["messages"] and call_params["messages"][0].get("role") == "system":
298
+ call_params["messages"][0]["content"] += schema_msg
299
+ elif call_params["messages"]:
300
+ call_params["messages"][0]["content"] = (
301
+ schema_msg + "\n\n" + call_params["messages"][0]["content"]
302
+ )
303
+ if self.provider not in ("lmstudio", "ollama"):
304
+ # LM Studio and Ollama don't support json_object response format reliably
305
+ # We rely on the schema in the system message instead
306
+ call_params["response_format"] = {"type": "json_object"}
307
+
308
+ logger.debug(f"Sending request to {self.provider}/{self.model} (timeout={self.timeout})")
242
309
  response = await self._client.chat.completions.create(**call_params)
310
+ logger.debug(f"Received response from {self.provider}/{self.model}")
243
311
 
244
312
  content = response.choices[0].message.content
245
313
 
246
- # Log raw LLM response for debugging JSON parse issues
247
- try:
248
- json_data = json.loads(content)
249
- except json.JSONDecodeError as json_err:
250
- # Truncate content for logging (first 500 and last 200 chars)
251
- content_preview = content[:500] if content else "<empty>"
252
- if content and len(content) > 700:
253
- content_preview = f"{content[:500]}...TRUNCATED...{content[-200:]}"
254
- logger.warning(
255
- f"JSON parse error from LLM response (attempt {attempt + 1}/{max_retries + 1}): {json_err}\n"
256
- f" Model: {self.provider}/{self.model}\n"
257
- f" Content length: {len(content) if content else 0} chars\n"
258
- f" Content preview: {content_preview!r}\n"
259
- f" Finish reason: {response.choices[0].finish_reason if response.choices else 'unknown'}"
260
- )
261
- # Retry on JSON parse errors - LLM may return valid JSON on next attempt
262
- if attempt < max_retries:
263
- backoff = min(initial_backoff * (2**attempt), max_backoff)
264
- await asyncio.sleep(backoff)
265
- last_exception = json_err
266
- continue
267
- else:
268
- logger.error(f"JSON parse error after {max_retries + 1} attempts, giving up")
269
- raise
314
+ # Strip reasoning model thinking tags
315
+ # Supports: <think>, <thinking>, <reasoning>, |startthink|/|endthink|
316
+ # for reasoning models that embed thinking in their output (e.g., Qwen3, DeepSeek)
317
+ if content:
318
+ original_len = len(content)
319
+ content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL)
320
+ content = re.sub(r"<thinking>.*?</thinking>", "", content, flags=re.DOTALL)
321
+ content = re.sub(r"<reasoning>.*?</reasoning>", "", content, flags=re.DOTALL)
322
+ content = re.sub(r"\|startthink\|.*?\|endthink\|", "", content, flags=re.DOTALL)
323
+ content = content.strip()
324
+ if len(content) < original_len:
325
+ logger.debug(f"Stripped {original_len - len(content)} chars of reasoning tokens")
326
+
327
+ # For local models, they may wrap JSON in markdown code blocks
328
+ if self.provider in ("lmstudio", "ollama"):
329
+ clean_content = content
330
+ if "```json" in content:
331
+ clean_content = content.split("```json")[1].split("```")[0].strip()
332
+ elif "```" in content:
333
+ clean_content = content.split("```")[1].split("```")[0].strip()
334
+ try:
335
+ json_data = json.loads(clean_content)
336
+ except json.JSONDecodeError:
337
+ # Fallback to parsing raw content
338
+ json_data = json.loads(content)
339
+ else:
340
+ # Log raw LLM response for debugging JSON parse issues
341
+ try:
342
+ json_data = json.loads(content)
343
+ except json.JSONDecodeError as json_err:
344
+ # Truncate content for logging (first 500 and last 200 chars)
345
+ content_preview = content[:500] if content else "<empty>"
346
+ if content and len(content) > 700:
347
+ content_preview = f"{content[:500]}...TRUNCATED...{content[-200:]}"
348
+ logger.warning(
349
+ f"JSON parse error from LLM response (attempt {attempt + 1}/{max_retries + 1}): {json_err}\n"
350
+ f" Model: {self.provider}/{self.model}\n"
351
+ f" Content length: {len(content) if content else 0} chars\n"
352
+ f" Content preview: {content_preview!r}\n"
353
+ f" Finish reason: {response.choices[0].finish_reason if response.choices else 'unknown'}"
354
+ )
355
+ # Retry on JSON parse errors - LLM may return valid JSON on next attempt
356
+ if attempt < max_retries:
357
+ backoff = min(initial_backoff * (2**attempt), max_backoff)
358
+ await asyncio.sleep(backoff)
359
+ last_exception = json_err
360
+ continue
361
+ else:
362
+ logger.error(f"JSON parse error after {max_retries + 1} attempts, giving up")
363
+ raise
270
364
 
271
365
  if skip_validation:
272
366
  result = json_data
@@ -339,6 +433,142 @@ class LLMProvider:
339
433
  raise last_exception
340
434
  raise RuntimeError("LLM call failed after all retries with no exception captured")
341
435
 
436
async def _call_anthropic(
    self,
    messages: list[dict[str, str]],
    response_format: Any | None,
    max_completion_tokens: int | None,
    max_retries: int,
    initial_backoff: float,
    max_backoff: float,
    skip_validation: bool,
    start_time: float,
) -> Any:
    """
    Make an Anthropic Messages API call with retry logic.

    OpenAI-style messages are converted to Anthropic's format: every
    "system" message is folded into a single system prompt and all other
    messages are forwarded unchanged. When ``response_format`` exposes
    ``model_json_schema()``, the schema is appended to the system prompt
    and the reply is parsed as JSON.

    Args:
        messages: OpenAI-style chat messages (dicts with 'role' and 'content').
        response_format: Optional object providing ``model_json_schema()`` and,
            unless ``skip_validation`` is set, ``model_validate()``.
        max_completion_tokens: Value sent as ``max_tokens``; 4096 when None.
        max_retries: Number of retries after the first attempt.
        initial_backoff: Initial backoff time in seconds.
        max_backoff: Maximum backoff time in seconds.
        skip_validation: Return the raw parsed JSON without calling ``model_validate``.
        start_time: ``time.time()`` value used to measure and log slow calls.

    Returns:
        Parsed (and optionally validated) JSON if response_format is provided,
        otherwise the concatenated text content of the response.

    Raises:
        json.JSONDecodeError: The reply was still not valid JSON on the last attempt.
        anthropic.APIConnectionError / RateLimitError / APIStatusError: On auth
            errors (401/403), other non-retryable status errors, or once the
            retries are exhausted.
        RuntimeError: No attempt was made (e.g. negative ``max_retries``).
    """
    # Imported lazily so the anthropic package is only required when this provider is used
    from anthropic import APIConnectionError, APIStatusError, RateLimitError

    # Convert OpenAI-style messages to Anthropic format:
    # the system prompt is a separate parameter rather than a message.
    system_prompt = None
    anthropic_messages = []

    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")

        if role == "system":
            if system_prompt:
                system_prompt += "\n\n" + content
            else:
                system_prompt = content
        else:
            anthropic_messages.append({"role": role, "content": content})

    # Add JSON schema instruction if response_format is provided.
    # This is done once, outside the retry loop, so retries never duplicate it.
    if response_format is not None and hasattr(response_format, "model_json_schema"):
        schema = response_format.model_json_schema()
        schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
        system_prompt = (system_prompt or "") + schema_msg

    # Prepare parameters
    call_params = {
        "model": self.model,
        "messages": anthropic_messages,
        "max_tokens": max_completion_tokens if max_completion_tokens is not None else 4096,
    }

    if system_prompt:
        call_params["system"] = system_prompt

    total_attempts = max_retries + 1
    last_exception = None

    for attempt in range(total_attempts):
        # Reset per attempt so the JSON error handler never reports a stale response
        response = None
        try:
            response = await self._anthropic_client.messages.create(**call_params)

            # Anthropic response content is a list of blocks; keep only the text ones
            content = "".join(block.text for block in response.content if block.type == "text")

            if response_format is not None:
                # Models may wrap JSON in markdown code blocks
                clean_content = content
                if "```json" in content:
                    clean_content = content.split("```json")[1].split("```")[0].strip()
                elif "```" in content:
                    clean_content = content.split("```")[1].split("```")[0].strip()

                try:
                    json_data = json.loads(clean_content)
                except json.JSONDecodeError:
                    # Fallback to parsing raw content if markdown stripping failed;
                    # a failure here propagates to the retry handler below
                    json_data = json.loads(content)

                result = json_data if skip_validation else response_format.model_validate(json_data)
            else:
                result = content

            # Log slow calls
            duration = time.time() - start_time
            if duration > 10.0:
                input_tokens = response.usage.input_tokens
                output_tokens = response.usage.output_tokens
                logger.info(
                    f"slow llm call: model={self.provider}/{self.model}, "
                    f"input_tokens={input_tokens}, output_tokens={output_tokens}, "
                    f"time={duration:.3f}s"
                )

            return result

        except json.JSONDecodeError as e:
            last_exception = e
            # The stop reason helps tell a truncated reply from a malformed one
            stop_reason = getattr(response, "stop_reason", "unknown")
            if attempt < max_retries:
                logger.warning(
                    f"Anthropic returned invalid JSON (attempt {attempt + 1}/{total_attempts}, "
                    f"stop_reason={stop_reason}), retrying..."
                )
                backoff = min(initial_backoff * (2**attempt), max_backoff)
                await asyncio.sleep(backoff)
                continue

            logger.error(
                f"Anthropic returned invalid JSON after {total_attempts} attempts (stop_reason={stop_reason})"
            )
            raise

        except (APIConnectionError, RateLimitError, APIStatusError) as e:
            # Fast fail on 401/403
            if isinstance(e, APIStatusError) and e.status_code in (401, 403):
                logger.error(f"Anthropic auth error (HTTP {e.status_code}), not retrying: {str(e)}")
                raise

            last_exception = e

            # Only connection problems, rate limits and server errors are worth retrying
            should_retry = isinstance(e, (APIConnectionError, RateLimitError)) or (
                isinstance(e, APIStatusError) and e.status_code >= 500
            )

            if not should_retry:
                # Report the real attempt number instead of claiming all retries were used
                logger.error(
                    f"Anthropic API error on attempt {attempt + 1}/{total_attempts}, not retrying: {str(e)}"
                )
                raise

            if attempt < max_retries:
                backoff = min(initial_backoff * (2**attempt), max_backoff)
                # +/-20% jitter derived from the fractional part of the clock
                jitter = backoff * 0.2 * (2 * (time.time() % 1) - 1)
                await asyncio.sleep(backoff + jitter)
                continue

            logger.error(f"Anthropic API error after {total_attempts} attempts: {str(e)}")
            raise

        except Exception as e:
            logger.error(f"Unexpected error during Anthropic call: {type(e).__name__}: {str(e)}")
            raise

    # Only reachable when the loop body never ran or fell through without returning
    if last_exception:
        raise last_exception
    raise RuntimeError("Anthropic call failed after all retries")
571
+
342
572
  async def _call_ollama_native(
343
573
  self,
344
574
  messages: list[dict[str, str]],
@@ -17,6 +17,8 @@ import uuid
17
17
  from datetime import UTC, datetime, timedelta
18
18
  from typing import TYPE_CHECKING, Any
19
19
 
20
+ from ..config import get_config
21
+
20
22
  # Context variable for current schema (async-safe, per-task isolation)
21
23
  _current_schema: contextvars.ContextVar[str] = contextvars.ContextVar("current_schema", default="public")
22
24
 
@@ -372,7 +374,7 @@ class MemoryEngine(MemoryEngineInterface):
372
374
 
373
375
  result = await validation_coro
374
376
  if not result.allowed:
375
- raise OperationValidationError(result.reason or "Operation not allowed")
377
+ raise OperationValidationError(result.reason or "Operation not allowed", result.status_code)
376
378
 
377
379
  async def _authenticate_tenant(self, request_context: "RequestContext | None") -> str:
378
380
  """
@@ -399,7 +401,9 @@ class MemoryEngine(MemoryEngineInterface):
399
401
  if request_context is None:
400
402
  raise AuthenticationError("RequestContext is required when tenant extension is configured")
401
403
 
404
+ # Let AuthenticationError propagate - HTTP layer will convert to 401
402
405
  tenant_context = await self._tenant_extension.authenticate(request_context)
406
+
403
407
  _current_schema.set(tenant_context.schema_name)
404
408
  return tenant_context.schema_name
405
409
 
@@ -2825,13 +2829,16 @@ Guidelines:
2825
2829
  Handler for form opinion tasks.
2826
2830
 
2827
2831
  Args:
2828
- task_dict: Dict with keys: 'bank_id', 'answer_text', 'query'
2832
+ task_dict: Dict with keys: 'bank_id', 'answer_text', 'query', 'tenant_id'
2829
2833
  """
2830
2834
  bank_id = task_dict["bank_id"]
2831
2835
  answer_text = task_dict["answer_text"]
2832
2836
  query = task_dict["query"]
2837
+ tenant_id = task_dict.get("tenant_id")
2833
2838
 
2834
- await self._extract_and_store_opinions_async(bank_id=bank_id, answer_text=answer_text, query=query)
2839
+ await self._extract_and_store_opinions_async(
2840
+ bank_id=bank_id, answer_text=answer_text, query=query, tenant_id=tenant_id
2841
+ )
2835
2842
 
2836
2843
  async def _handle_reinforce_opinion(self, task_dict: dict[str, Any]):
2837
2844
  """
@@ -3076,6 +3083,8 @@ Guidelines:
3076
3083
  *,
3077
3084
  budget: Budget | None = None,
3078
3085
  context: str | None = None,
3086
+ max_tokens: int = 4096,
3087
+ response_schema: dict | None = None,
3079
3088
  request_context: "RequestContext",
3080
3089
  ) -> ReflectResult:
3081
3090
  """
@@ -3087,19 +3096,22 @@ Guidelines:
3087
3096
  3. Retrieves existing opinions (bank's formed perspectives)
3088
3097
  4. Uses LLM to formulate an answer
3089
3098
  5. Extracts and stores any new opinions formed during reflection
3090
- 6. Returns plain text answer and the facts used
3099
+ 6. Optionally generates structured output based on response_schema
3100
+ 7. Returns plain text answer and the facts used
3091
3101
 
3092
3102
  Args:
3093
3103
  bank_id: bank identifier
3094
3104
  query: Question to answer
3095
3105
  budget: Budget level for memory exploration (low=100, mid=300, high=600 units)
3096
3106
  context: Additional context string to include in LLM prompt (not used in recall)
3107
+ response_schema: Optional JSON Schema for structured output
3097
3108
 
3098
3109
  Returns:
3099
3110
  ReflectResult containing:
3100
3111
  - text: Plain text answer (no markdown)
3101
3112
  - based_on: Dict with 'world', 'experience', and 'opinion' fact lists (MemoryFact objects)
3102
3113
  - new_opinions: List of newly formed opinions
3114
+ - structured_output: Optional dict if response_schema was provided
3103
3115
  """
3104
3116
  # Use cached LLM config
3105
3117
  if self._llm_config is None:
@@ -3177,21 +3189,53 @@ Guidelines:
3177
3189
  log_buffer.append(f"[REFLECT {reflect_id}] Prompt: {len(prompt)} chars")
3178
3190
 
3179
3191
  system_message = think_utils.get_system_message(disposition)
3192
+ messages = [{"role": "system", "content": system_message}, {"role": "user", "content": prompt}]
3193
+
3194
+ # Prepare response_format if schema provided
3195
+ response_format = None
3196
+ if response_schema is not None:
3197
+ # Wrapper class to provide Pydantic-like interface for raw JSON schemas
3198
class JsonSchemaWrapper:
    """
    Adapter that gives a raw JSON Schema dict a Pydantic-like interface.

    The LLM layer asks its ``response_format`` for ``model_json_schema()``
    and, when validation is not skipped, for ``model_validate()``. Both are
    provided here so the wrapper works on either path.
    """

    def __init__(self, schema: dict):
        # Stored as-is; the schema is returned by reference, not copied
        self._schema = schema

    def model_json_schema(self) -> dict:
        """Return the wrapped JSON Schema unchanged."""
        return self._schema

    def model_validate(self, data):
        """
        Return ``data`` unchanged.

        No schema validation is performed; this exists only so that callers
        which invoke ``model_validate`` do not fail with AttributeError.
        """
        return data
3204
+
3205
+ response_format = JsonSchemaWrapper(response_schema)
3180
3206
 
3181
3207
  llm_start = time.time()
3182
- answer_text = await self._llm_config.call(
3183
- messages=[{"role": "system", "content": system_message}, {"role": "user", "content": prompt}],
3184
- scope="memory_think",
3185
- temperature=0.9,
3186
- max_completion_tokens=1000,
3208
+ result = await self._llm_config.call(
3209
+ messages=messages,
3210
+ scope="memory_reflect",
3211
+ max_completion_tokens=max_tokens,
3212
+ response_format=response_format,
3213
+ skip_validation=True if response_format else False,
3214
+ # Don't enforce strict_schema - not all providers support it and may retry forever
3215
+ # Soft enforcement (schema in prompt + json_object mode) is sufficient
3216
+ strict_schema=False,
3187
3217
  )
3188
3218
  llm_time = time.time() - llm_start
3189
3219
 
3190
- answer_text = answer_text.strip()
3220
+ # Handle response based on whether structured output was requested
3221
+ if response_schema is not None:
3222
+ structured_output = result
3223
+ answer_text = "" # Empty for backward compatibility
3224
+ log_buffer.append(f"[REFLECT {reflect_id}] Structured output generated")
3225
+ else:
3226
+ structured_output = None
3227
+ answer_text = result.strip()
3191
3228
 
3192
3229
  # Submit form_opinion task for background processing
3230
+ # Pass tenant_id from request context for internal authentication in background task
3193
3231
  await self._task_backend.submit_task(
3194
- {"type": "form_opinion", "bank_id": bank_id, "answer_text": answer_text, "query": query}
3232
+ {
3233
+ "type": "form_opinion",
3234
+ "bank_id": bank_id,
3235
+ "answer_text": answer_text,
3236
+ "query": query,
3237
+ "tenant_id": getattr(request_context, "tenant_id", None) if request_context else None,
3238
+ }
3195
3239
  )
3196
3240
 
3197
3241
  total_time = time.time() - reflect_start
@@ -3205,6 +3249,7 @@ Guidelines:
3205
3249
  text=answer_text,
3206
3250
  based_on={"world": world_results, "experience": agent_results, "opinion": opinion_results},
3207
3251
  new_opinions=[], # Opinions are being extracted asynchronously
3252
+ structured_output=structured_output,
3208
3253
  )
3209
3254
 
3210
3255
  # Call post-operation hook if validator is configured
@@ -3228,7 +3273,9 @@ Guidelines:
3228
3273
 
3229
3274
  return result
3230
3275
 
3231
- async def _extract_and_store_opinions_async(self, bank_id: str, answer_text: str, query: str):
3276
+ async def _extract_and_store_opinions_async(
3277
+ self, bank_id: str, answer_text: str, query: str, tenant_id: str | None = None
3278
+ ):
3232
3279
  """
3233
3280
  Background task to extract and store opinions from think response.
3234
3281
 
@@ -3238,6 +3285,7 @@ Guidelines:
3238
3285
  bank_id: bank identifier
3239
3286
  answer_text: The generated answer text
3240
3287
  query: The original query
3288
+ tenant_id: Tenant identifier for internal authentication
3241
3289
  """
3242
3290
  try:
3243
3291
  # Extract opinions from the answer
@@ -3248,10 +3296,11 @@ Guidelines:
3248
3296
  from datetime import datetime
3249
3297
 
3250
3298
  current_time = datetime.now(UTC)
3251
- # Use internal request context for background tasks
3299
+ # Use internal context with tenant_id for background authentication
3300
+ # Extension can check internal=True to bypass normal auth
3252
3301
  from hindsight_api.models import RequestContext
3253
3302
 
3254
- internal_context = RequestContext()
3303
+ internal_context = RequestContext(tenant_id=tenant_id, internal=True)
3255
3304
  for opinion in new_opinions:
3256
3305
  await self.retain_async(
3257
3306
  bank_id=bank_id,
@@ -3572,7 +3621,7 @@ Guidelines:
3572
3621
  self,
3573
3622
  bank_id: str,
3574
3623
  entity_ids: list[str],
3575
- min_facts: int = 5,
3624
+ min_facts: int | None = None,
3576
3625
  conn=None,
3577
3626
  request_context: "RequestContext | None" = None,
3578
3627
  ) -> None:
@@ -3584,12 +3633,16 @@ Guidelines:
3584
3633
  Args:
3585
3634
  bank_id: Bank identifier
3586
3635
  entity_ids: List of entity IDs to process
3587
- min_facts: Minimum facts required to regenerate observations
3636
+ min_facts: Minimum facts required to regenerate observations (uses config default if None)
3588
3637
  conn: Optional database connection (for transactional atomicity)
3589
3638
  """
3590
3639
  if not bank_id or not entity_ids:
3591
3640
  return
3592
3641
 
3642
+ # Use config default if min_facts not specified
3643
+ if min_facts is None:
3644
+ min_facts = get_config().observation_min_facts
3645
+
3593
3646
  # Convert to UUIDs
3594
3647
  entity_uuids = [uuid.UUID(eid) if isinstance(eid, str) else eid for eid in entity_ids]
3595
3648
 
@@ -123,7 +123,8 @@ class ReflectResult(BaseModel):
123
123
  Result from a reflect operation.
124
124
 
125
125
  Contains the formulated answer, the facts it was based on (organized by type),
126
- and any new opinions that were formed during the reflection process.
126
+ any new opinions that were formed during the reflection process, and optionally
127
+ structured output if a response schema was provided.
127
128
  """
128
129
 
129
130
  model_config = ConfigDict(
@@ -145,6 +146,7 @@ class ReflectResult(BaseModel):
145
146
  "opinion": [],
146
147
  },
147
148
  "new_opinions": ["Machine learning has great potential in healthcare"],
149
+ "structured_output": {"summary": "ML in healthcare", "confidence": 0.9},
148
150
  }
149
151
  }
150
152
  )
@@ -154,6 +156,10 @@ class ReflectResult(BaseModel):
154
156
  description="Facts used to formulate the answer, organized by type (world, experience, opinion)"
155
157
  )
156
158
  new_opinions: list[str] = Field(default_factory=list, description="List of newly formed opinions during reflection")
159
+ structured_output: dict[str, Any] | None = Field(
160
+ default=None,
161
+ description="Structured output parsed according to the provided response schema. Only present when response_schema was provided.",
162
+ )
157
163
 
158
164
 
159
165
  class Opinion(BaseModel):