hindsight-api 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. hindsight_api/admin/__init__.py +1 -0
  2. hindsight_api/admin/cli.py +311 -0
  3. hindsight_api/alembic/versions/f1a2b3c4d5e6_add_memory_links_composite_index.py +44 -0
  4. hindsight_api/alembic/versions/g2a3b4c5d6e7_add_tags_column.py +48 -0
  5. hindsight_api/alembic/versions/h3c4d5e6f7g8_mental_models_v4.py +112 -0
  6. hindsight_api/alembic/versions/i4d5e6f7g8h9_delete_opinions.py +41 -0
  7. hindsight_api/alembic/versions/j5e6f7g8h9i0_mental_model_versions.py +95 -0
  8. hindsight_api/alembic/versions/k6f7g8h9i0j1_add_directive_subtype.py +58 -0
  9. hindsight_api/alembic/versions/l7g8h9i0j1k2_add_worker_columns.py +109 -0
  10. hindsight_api/alembic/versions/m8h9i0j1k2l3_mental_model_id_to_text.py +41 -0
  11. hindsight_api/alembic/versions/n9i0j1k2l3m4_learnings_and_pinned_reflections.py +134 -0
  12. hindsight_api/alembic/versions/o0j1k2l3m4n5_migrate_mental_models_data.py +113 -0
  13. hindsight_api/alembic/versions/p1k2l3m4n5o6_new_knowledge_architecture.py +194 -0
  14. hindsight_api/alembic/versions/q2l3m4n5o6p7_fix_mental_model_fact_type.py +50 -0
  15. hindsight_api/alembic/versions/r3m4n5o6p7q8_add_reflect_response_to_reflections.py +47 -0
  16. hindsight_api/alembic/versions/s4n5o6p7q8r9_add_consolidated_at_to_memory_units.py +53 -0
  17. hindsight_api/alembic/versions/t5o6p7q8r9s0_rename_mental_models_to_observations.py +134 -0
  18. hindsight_api/alembic/versions/u6p7q8r9s0t1_mental_models_text_id.py +41 -0
  19. hindsight_api/alembic/versions/v7q8r9s0t1u2_add_max_tokens_to_mental_models.py +50 -0
  20. hindsight_api/api/http.py +1406 -118
  21. hindsight_api/api/mcp.py +11 -196
  22. hindsight_api/config.py +359 -27
  23. hindsight_api/engine/consolidation/__init__.py +5 -0
  24. hindsight_api/engine/consolidation/consolidator.py +859 -0
  25. hindsight_api/engine/consolidation/prompts.py +69 -0
  26. hindsight_api/engine/cross_encoder.py +706 -88
  27. hindsight_api/engine/db_budget.py +284 -0
  28. hindsight_api/engine/db_utils.py +11 -0
  29. hindsight_api/engine/directives/__init__.py +5 -0
  30. hindsight_api/engine/directives/models.py +37 -0
  31. hindsight_api/engine/embeddings.py +553 -29
  32. hindsight_api/engine/entity_resolver.py +8 -5
  33. hindsight_api/engine/interface.py +40 -17
  34. hindsight_api/engine/llm_wrapper.py +744 -68
  35. hindsight_api/engine/memory_engine.py +2505 -1017
  36. hindsight_api/engine/mental_models/__init__.py +14 -0
  37. hindsight_api/engine/mental_models/models.py +53 -0
  38. hindsight_api/engine/query_analyzer.py +4 -3
  39. hindsight_api/engine/reflect/__init__.py +18 -0
  40. hindsight_api/engine/reflect/agent.py +933 -0
  41. hindsight_api/engine/reflect/models.py +109 -0
  42. hindsight_api/engine/reflect/observations.py +186 -0
  43. hindsight_api/engine/reflect/prompts.py +483 -0
  44. hindsight_api/engine/reflect/tools.py +437 -0
  45. hindsight_api/engine/reflect/tools_schema.py +250 -0
  46. hindsight_api/engine/response_models.py +168 -4
  47. hindsight_api/engine/retain/bank_utils.py +79 -201
  48. hindsight_api/engine/retain/fact_extraction.py +424 -195
  49. hindsight_api/engine/retain/fact_storage.py +35 -12
  50. hindsight_api/engine/retain/link_utils.py +29 -24
  51. hindsight_api/engine/retain/orchestrator.py +24 -43
  52. hindsight_api/engine/retain/types.py +11 -2
  53. hindsight_api/engine/search/graph_retrieval.py +43 -14
  54. hindsight_api/engine/search/link_expansion_retrieval.py +391 -0
  55. hindsight_api/engine/search/mpfp_retrieval.py +362 -117
  56. hindsight_api/engine/search/reranking.py +2 -2
  57. hindsight_api/engine/search/retrieval.py +848 -201
  58. hindsight_api/engine/search/tags.py +172 -0
  59. hindsight_api/engine/search/think_utils.py +42 -141
  60. hindsight_api/engine/search/trace.py +12 -1
  61. hindsight_api/engine/search/tracer.py +26 -6
  62. hindsight_api/engine/search/types.py +21 -3
  63. hindsight_api/engine/task_backend.py +113 -106
  64. hindsight_api/engine/utils.py +1 -152
  65. hindsight_api/extensions/__init__.py +10 -1
  66. hindsight_api/extensions/builtin/tenant.py +5 -1
  67. hindsight_api/extensions/context.py +10 -1
  68. hindsight_api/extensions/operation_validator.py +81 -4
  69. hindsight_api/extensions/tenant.py +26 -0
  70. hindsight_api/main.py +69 -6
  71. hindsight_api/mcp_local.py +12 -53
  72. hindsight_api/mcp_tools.py +494 -0
  73. hindsight_api/metrics.py +433 -48
  74. hindsight_api/migrations.py +141 -1
  75. hindsight_api/models.py +3 -3
  76. hindsight_api/pg0.py +53 -0
  77. hindsight_api/server.py +39 -2
  78. hindsight_api/worker/__init__.py +11 -0
  79. hindsight_api/worker/main.py +296 -0
  80. hindsight_api/worker/poller.py +486 -0
  81. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/METADATA +16 -6
  82. hindsight_api-0.4.0.dist-info/RECORD +112 -0
  83. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/entry_points.txt +2 -0
  84. hindsight_api/engine/retain/observation_regeneration.py +0 -254
  85. hindsight_api/engine/search/observation_utils.py +0 -125
  86. hindsight_api/engine/search/scoring.py +0 -159
  87. hindsight_api-0.2.1.dist-info/RECORD +0 -75
  88. {hindsight_api-0.2.1.dist-info → hindsight_api-0.4.0.dist-info}/WHEEL +0 -0
hindsight_api/engine/reflect/agent.py (new file)
@@ -0,0 +1,933 @@
+ """
+ Reflect agent - agentic loop for reflection with native tool calling.
+
+ Uses hierarchical retrieval:
+ 1. search_mental_models - User-curated summaries (highest quality)
+ 2. search_observations - Consolidated knowledge with freshness
+ 3. recall - Raw facts as ground truth
+ """
+
+ import asyncio
+ import json
+ import logging
+ import re
+ import time
+ from typing import TYPE_CHECKING, Any, Awaitable, Callable
+
+ from .models import DirectiveInfo, LLMCall, ReflectAgentResult, TokenUsageSummary, ToolCall
+ from .prompts import FINAL_SYSTEM_PROMPT, _extract_directive_rules, build_final_prompt, build_system_prompt_for_tools
+ from .tools_schema import get_reflect_tools
+
+
+ def _build_directives_applied(directives: list[dict[str, Any]] | None) -> list[DirectiveInfo]:
+     """Build list of DirectiveInfo from directive mental models.
+
+     Handles multiple directive formats:
+     1. New format: directives have direct 'content' field
+     2. Fallback: directives have 'description' field
+     """
+     if not directives:
+         return []
+
+     result = []
+     for directive in directives:
+         directive_id = directive.get("id", "")
+         directive_name = directive.get("name", "")
+
+         # Get content from 'content' field or fallback to 'description'
+         content = directive.get("content", "") or directive.get("description", "")
+
+         result.append(DirectiveInfo(id=directive_id, name=directive_name, content=content))
+
+     return result
+
+
+ if TYPE_CHECKING:
+     from ..llm_wrapper import LLMProvider
+     from ..response_models import LLMToolCall
+
+ logger = logging.getLogger(__name__)
+
+ DEFAULT_MAX_ITERATIONS = 10
+
+
+ def _normalize_tool_name(name: str) -> str:
+     """Normalize tool name from various LLM output formats.
+
+     Some LLMs output tool names in non-standard formats:
+     - 'functions.done' (OpenAI-style prefix)
+     - 'call=functions.done' (some models)
+     - 'call=done' (some models)
+     - 'done<|channel|>commentary' (malformed special tokens appended)
+
+     Returns the normalized tool name (e.g., 'done', 'recall', etc.)
+     """
+     # Handle 'call=functions.name' or 'call=name' format
+     if name.startswith("call="):
+         name = name[len("call=") :]
+
+     # Handle 'functions.name' format
+     if name.startswith("functions."):
+         name = name[len("functions.") :]
+
+     # Handle malformed special tokens appended to tool name
+     # e.g., 'done<|channel|>commentary' -> 'done'
+     if "<|" in name:
+         name = name.split("<|")[0]
+
+     return name
+
+
+ def _is_done_tool(name: str) -> bool:
+     """Check if the tool name represents the 'done' tool."""
+     return _normalize_tool_name(name) == "done"
+
+
+ # Pattern to match done() call as text - handles done({...}) with nested JSON
+ _DONE_CALL_PATTERN = re.compile(r"done\s*\(\s*\{.*$", re.DOTALL)
+
+ # Patterns for leaked structured output in the answer field
+ _LEAKED_JSON_SUFFIX = re.compile(
+     r'\s*```(?:json)?\s*\{[^}]*(?:"(?:observation_ids|memory_ids|mental_model_ids)"|\})\s*```\s*$',
+     re.DOTALL | re.IGNORECASE,
+ )
+ _LEAKED_JSON_OBJECT = re.compile(
+     r'\s*\{[^{]*"(?:observation_ids|memory_ids|mental_model_ids|answer)"[^}]*\}\s*$', re.DOTALL
+ )
+ _TRAILING_IDS_PATTERN = re.compile(
+     r"\s*(?:observation_ids|memory_ids|mental_model_ids)\s*[=:]\s*\[.*?\]\s*$", re.DOTALL | re.IGNORECASE
+ )
+
+
+ def _clean_answer_text(text: str) -> str:
+     """Clean up answer text by removing any done() tool call syntax.
+
+     Some LLMs output the done() call as text instead of a proper tool call.
+     This strips out patterns like: done({"answer": "...", ...})
+     """
+     # Remove done() call pattern from the end of the text
+     cleaned = _DONE_CALL_PATTERN.sub("", text).strip()
+     return cleaned if cleaned else text
+
+
+ def _clean_done_answer(text: str) -> str:
+     """Clean up the answer field from a done() tool call.
+
+     Some LLMs leak structured output patterns into the answer text, such as:
+     - JSON code blocks with observation_ids/memory_ids at the end
+     - Raw JSON objects with these fields
+     - Plain text like "observation_ids: [...]"
+
+     This cleans those patterns while preserving the actual answer content.
+     """
+     if not text:
+         return text
+
+     cleaned = text
+
+     # Remove leaked JSON in code blocks at the end
+     cleaned = _LEAKED_JSON_SUFFIX.sub("", cleaned).strip()
+
+     # Remove leaked raw JSON objects at the end
+     cleaned = _LEAKED_JSON_OBJECT.sub("", cleaned).strip()
+
+     # Remove trailing ID patterns
+     cleaned = _TRAILING_IDS_PATTERN.sub("", cleaned).strip()
+
+     return cleaned if cleaned else text
+
+
+ async def _generate_structured_output(
+     answer: str,
+     response_schema: dict,
+     llm_config: "LLMProvider",
+     reflect_id: str,
+ ) -> tuple[dict[str, Any] | None, int, int]:
+     """Generate structured output from an answer using the provided JSON schema.
+
+     Args:
+         answer: The text answer to extract structured data from
+         response_schema: JSON Schema for the expected output structure
+         llm_config: LLM provider for making the extraction call
+         reflect_id: Reflect ID for logging
+
+     Returns:
+         Tuple of (structured_output, input_tokens, output_tokens).
+         structured_output is None if generation fails.
+     """
+     try:
+         from typing import Any as TypingAny
+
+         from pydantic import create_model
+
+         def _json_schema_type_to_python(field_schema: dict) -> type:
+             """Map JSON schema type to Python type for better LLM guidance."""
+             json_type = field_schema.get("type", "string")
+             if json_type == "array":
+                 return list
+             elif json_type == "object":
+                 return dict
+             elif json_type == "integer":
+                 return int
+             elif json_type == "number":
+                 return float
+             elif json_type == "boolean":
+                 return bool
+             else:
+                 return str
+
+         # Build fields from JSON schema properties
+         schema_props = response_schema.get("properties", {})
+         required_fields = set(response_schema.get("required", []))
+         fields: dict[str, TypingAny] = {}
+         for field_name, field_schema in schema_props.items():
+             field_type = _json_schema_type_to_python(field_schema)
+             default = ... if field_name in required_fields else None
+             fields[field_name] = (field_type, default)
+
+         if not fields:
+             logger.warning(f"[REFLECT {reflect_id}] No fields found in response_schema, skipping structured output")
+             return None, 0, 0
+
+         DynamicModel = create_model("StructuredResponse", **fields)
+
+         # Include the full schema in the prompt for better LLM guidance
+         schema_str = json.dumps(response_schema, indent=2)
+
+         # Build field descriptions for the prompt
+         field_descriptions = []
+         for field_name, field_schema in schema_props.items():
+             field_type = field_schema.get("type", "string")
+             field_desc = field_schema.get("description", "")
+             is_required = field_name in required_fields
+             req_marker = " (REQUIRED)" if is_required else " (optional)"
+             field_descriptions.append(f"- {field_name} ({field_type}){req_marker}: {field_desc}")
+         fields_text = "\n".join(field_descriptions)
+
+         # Call LLM with the answer to extract structured data
+         structured_prompt = f"""Your task is to extract specific information from the answer below and format it as JSON.
+
+ ANSWER TO EXTRACT FROM:
+ \"\"\"
+ {answer}
+ \"\"\"
+
+ REQUIRED OUTPUT FORMAT - Extract the following fields from the answer above:
+ {fields_text}
+
+ JSON Schema:
+ ```json
+ {schema_str}
+ ```
+
+ INSTRUCTIONS:
+ 1. Read the answer carefully and identify the information that matches each field
+ 2. Extract the ACTUAL content from the answer - do NOT leave fields empty if information is present
+ 3. For string fields: use the exact text or a clear summary from the answer
+ 4. For array fields: return a JSON array (e.g., ["item1", "item2"]), NOT a string
+ 5. For required fields: you MUST provide a value extracted from the answer
+ 6. Return ONLY the JSON object, no explanation
+
+ OUTPUT:"""
+
+         structured_result, usage = await llm_config.call(
+             messages=[
+                 {
+                     "role": "system",
+                     "content": "You are a precise data extraction assistant. Extract information from text and return it as valid JSON matching the provided schema. Always extract actual content - never return empty strings for required fields if information is available.",
+                 },
+                 {"role": "user", "content": structured_prompt},
+             ],
+             response_format=DynamicModel,
+             scope="reflect_structured",
+             skip_validation=True,  # We'll handle the dict ourselves
+             return_usage=True,
+         )
+
+         # Convert to dict
+         if hasattr(structured_result, "model_dump"):
+             structured_output = structured_result.model_dump()
+         elif isinstance(structured_result, dict):
+             structured_output = structured_result
+         else:
+             # Try to parse as JSON
+             structured_output = json.loads(str(structured_result))
+
+         # Validate that required fields have non-empty values
+         for field_name in required_fields:
+             value = structured_output.get(field_name)
+             if value is None or value == "" or value == []:
+                 logger.warning(f"[REFLECT {reflect_id}] Required field '{field_name}' is empty in structured output")
+
+         logger.info(f"[REFLECT {reflect_id}] Generated structured output with {len(structured_output)} fields")
+         return structured_output, usage.input_tokens, usage.output_tokens
+
+     except Exception as e:
+         logger.warning(f"[REFLECT {reflect_id}] Failed to generate structured output: {e}")
+         return None, 0, 0
+
+
+ async def run_reflect_agent(
+     llm_config: "LLMProvider",
+     bank_id: str,
+     query: str,
+     bank_profile: dict[str, Any],
+     search_mental_models_fn: Callable[[str, int], Awaitable[dict[str, Any]]],
+     search_observations_fn: Callable[[str, int], Awaitable[dict[str, Any]]],
+     recall_fn: Callable[[str, int], Awaitable[dict[str, Any]]],
+     expand_fn: Callable[[list[str], str], Awaitable[dict[str, Any]]],
+     context: str | None = None,
+     max_iterations: int = DEFAULT_MAX_ITERATIONS,
+     max_tokens: int | None = None,
+     response_schema: dict | None = None,
+     directives: list[dict[str, Any]] | None = None,
+     has_mental_models: bool = False,
+     budget: str | None = None,
+ ) -> ReflectAgentResult:
+     """
+     Execute the reflect agent loop using native tool calling.
+
+     The agent uses hierarchical retrieval:
+     1. search_mental_models - User-curated summaries (try first)
+     2. search_observations - Consolidated knowledge with freshness
+     3. recall - Raw facts as ground truth
+
+     Args:
+         llm_config: LLM provider for agent calls
+         bank_id: Bank identifier
+         query: Question to answer
+         bank_profile: Bank profile with name and mission
+         search_mental_models_fn: Tool callback for searching mental models (query, max_results) -> result
+         search_observations_fn: Tool callback for searching observations (query, max_results) -> result
+         recall_fn: Tool callback for recall (query, max_tokens) -> result
+         expand_fn: Tool callback for expand (memory_ids, depth) -> result
+         context: Optional additional context
+         max_iterations: Maximum number of iterations before forcing response
+         max_tokens: Maximum tokens for the final response
+         response_schema: Optional JSON Schema for structured output in final response
+         directives: Optional list of directive mental models to inject as hard rules
+
+     Returns:
+         ReflectAgentResult with final answer and metadata
+     """
+     reflect_id = f"{bank_id[:8]}-{int(time.time() * 1000) % 100000}"
+     start_time = time.time()
+
+     # Build directives_applied for the trace
+     directives_applied = _build_directives_applied(directives)
+
+     # Extract directive rules for tool schema (if any)
+     directive_rules = _extract_directive_rules(directives) if directives else None
+
+     # Get tools for this agent (with directive compliance field if directives exist)
+     tools = get_reflect_tools(directive_rules=directive_rules)
+
+     # Build initial messages (directives are injected into system prompt at START and END)
+     system_prompt = build_system_prompt_for_tools(
+         bank_profile, context, directives=directives, has_mental_models=has_mental_models, budget=budget
+     )
+     messages: list[dict[str, Any]] = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": query},
+     ]
+
+     # Tracking
+     total_tools_called = 0
+     tool_trace: list[ToolCall] = []
+     tool_trace_summary: list[dict[str, Any]] = []
+     llm_trace: list[dict[str, Any]] = []
+     context_history: list[dict[str, Any]] = []  # For final prompt fallback
+
+     # Token usage tracking - accumulate across all LLM calls
+     total_input_tokens = 0
+     total_output_tokens = 0
+
+     # Track available IDs for validation (prevents hallucinated citations)
+     available_memory_ids: set[str] = set()
+     available_mental_model_ids: set[str] = set()
+     available_observation_ids: set[str] = set()
+
+     def _get_llm_trace() -> list[LLMCall]:
+         return [
+             LLMCall(
+                 scope=c["scope"],
+                 duration_ms=c["duration_ms"],
+                 input_tokens=c.get("input_tokens", 0),
+                 output_tokens=c.get("output_tokens", 0),
+             )
+             for c in llm_trace
+         ]
+
+     def _get_usage() -> TokenUsageSummary:
+         return TokenUsageSummary(
+             input_tokens=total_input_tokens,
+             output_tokens=total_output_tokens,
+             total_tokens=total_input_tokens + total_output_tokens,
+         )
+
+     def _log_completion(answer: str, iterations: int, forced: bool = False):
+         elapsed_ms = int((time.time() - start_time) * 1000)
+         tools_summary = (
+             ", ".join(
+                 f"{t['tool']}({t['input_summary']})={t['duration_ms']}ms/{t.get('output_chars', 0)}c"
+                 for t in tool_trace_summary
+             )
+             or "none"
+         )
+         llm_summary = ", ".join(f"{c['scope']}={c['duration_ms']}ms" for c in llm_trace) or "none"
+         total_llm_ms = sum(c["duration_ms"] for c in llm_trace)
+         total_tools_ms = sum(t["duration_ms"] for t in tool_trace_summary)
+
+         answer_preview = answer[:100] + "..." if len(answer) > 100 else answer
+         mode = "forced" if forced else "done"
+         logger.info(
+             f"[REFLECT {reflect_id}] {mode} | "
+             f"query='{query[:50]}...' | "
+             f"iterations={iterations} | "
+             f"llm=[{llm_summary}] ({total_llm_ms}ms) | "
+             f"tools=[{tools_summary}] ({total_tools_ms}ms) | "
+             f"answer='{answer_preview}' | "
+             f"total={elapsed_ms}ms"
+         )
+
+     for iteration in range(max_iterations):
+         is_last = iteration == max_iterations - 1
+
+         if is_last:
+             # Force text response on last iteration - no tools
+             prompt = build_final_prompt(query, context_history, bank_profile, context)
+             llm_start = time.time()
+             response, usage = await llm_config.call(
+                 messages=[
+                     {"role": "system", "content": FINAL_SYSTEM_PROMPT},
+                     {"role": "user", "content": prompt},
+                 ],
+                 scope="reflect_agent_final",
+                 max_completion_tokens=max_tokens,
+                 return_usage=True,
+             )
+             llm_duration = int((time.time() - llm_start) * 1000)
+             total_input_tokens += usage.input_tokens
+             total_output_tokens += usage.output_tokens
+             llm_trace.append(
+                 {
+                     "scope": "final",
+                     "duration_ms": llm_duration,
+                     "input_tokens": usage.input_tokens,
+                     "output_tokens": usage.output_tokens,
+                 }
+             )
+             answer = _clean_answer_text(response.strip())
+
+             # Generate structured output if schema provided
+             structured_output = None
+             if response_schema and answer:
+                 structured_output, struct_in, struct_out = await _generate_structured_output(
+                     answer, response_schema, llm_config, reflect_id
+                 )
+                 total_input_tokens += struct_in
+                 total_output_tokens += struct_out
+
+             _log_completion(answer, iteration + 1, forced=True)
+             return ReflectAgentResult(
+                 text=answer,
+                 structured_output=structured_output,
+                 iterations=iteration + 1,
+                 tools_called=total_tools_called,
+                 tool_trace=tool_trace,
+                 llm_trace=_get_llm_trace(),
+                 usage=_get_usage(),
+                 directives_applied=directives_applied,
+             )
+
+         # Call LLM with tools
+         llm_start = time.time()
+
+         try:
+             result = await llm_config.call_with_tools(
+                 messages=messages,
+                 tools=tools,
+                 scope="reflect_agent",
+                 tool_choice="required" if iteration == 0 else "auto",  # Force tool use on first iteration
+             )
+             llm_duration = int((time.time() - llm_start) * 1000)
+             total_input_tokens += result.input_tokens
+             total_output_tokens += result.output_tokens
+             llm_trace.append(
+                 {
+                     "scope": f"agent_{iteration + 1}",
+                     "duration_ms": llm_duration,
+                     "input_tokens": result.input_tokens,
+                     "output_tokens": result.output_tokens,
+                 }
+             )
+
+         except Exception as e:
+             err_duration = int((time.time() - llm_start) * 1000)
+             logger.warning(f"[REFLECT {reflect_id}] LLM error on iteration {iteration + 1}: {e} ({err_duration}ms)")
+             llm_trace.append({"scope": f"agent_{iteration + 1}_err", "duration_ms": err_duration})
+             # Guardrail: If no evidence gathered yet, retry
+             has_gathered_evidence = (
+                 bool(available_memory_ids) or bool(available_mental_model_ids) or bool(available_observation_ids)
+             )
+             if not has_gathered_evidence and iteration < max_iterations - 1:
+                 continue
+             prompt = build_final_prompt(query, context_history, bank_profile, context)
+             llm_start = time.time()
+             response, usage = await llm_config.call(
+                 messages=[
+                     {"role": "system", "content": FINAL_SYSTEM_PROMPT},
+                     {"role": "user", "content": prompt},
+                 ],
+                 scope="reflect_agent_final",
+                 max_completion_tokens=max_tokens,
+                 return_usage=True,
+             )
+             llm_duration = int((time.time() - llm_start) * 1000)
+             total_input_tokens += usage.input_tokens
+             total_output_tokens += usage.output_tokens
+             llm_trace.append(
+                 {
+                     "scope": "final",
+                     "duration_ms": llm_duration,
+                     "input_tokens": usage.input_tokens,
+                     "output_tokens": usage.output_tokens,
+                 }
+             )
+             answer = _clean_answer_text(response.strip())
+
+             # Generate structured output if schema provided
+             structured_output = None
+             if response_schema and answer:
+                 structured_output, struct_in, struct_out = await _generate_structured_output(
+                     answer, response_schema, llm_config, reflect_id
+                 )
+                 total_input_tokens += struct_in
+                 total_output_tokens += struct_out
+
+             _log_completion(answer, iteration + 1, forced=True)
+             return ReflectAgentResult(
+                 text=answer,
+                 structured_output=structured_output,
+                 iterations=iteration + 1,
+                 tools_called=total_tools_called,
+                 tool_trace=tool_trace,
+                 llm_trace=_get_llm_trace(),
+                 usage=_get_usage(),
+                 directives_applied=directives_applied,
+             )
+
+         # No tool calls - LLM wants to respond with text
+         if not result.tool_calls:
+             if result.content:
+                 answer = _clean_answer_text(result.content.strip())
+
+                 # Generate structured output if schema provided
+                 structured_output = None
+                 if response_schema and answer:
+                     structured_output, struct_in, struct_out = await _generate_structured_output(
+                         answer, response_schema, llm_config, reflect_id
+                     )
+                     total_input_tokens += struct_in
+                     total_output_tokens += struct_out
+
+                 _log_completion(answer, iteration + 1)
+                 return ReflectAgentResult(
+                     text=answer,
+                     structured_output=structured_output,
+                     iterations=iteration + 1,
+                     tools_called=total_tools_called,
+                     tool_trace=tool_trace,
+                     llm_trace=_get_llm_trace(),
+                     usage=_get_usage(),
+                     directives_applied=directives_applied,
+                 )
+             # Empty response, force final
+             prompt = build_final_prompt(query, context_history, bank_profile, context)
+             llm_start = time.time()
+             response, usage = await llm_config.call(
+                 messages=[
+                     {"role": "system", "content": FINAL_SYSTEM_PROMPT},
+                     {"role": "user", "content": prompt},
+                 ],
+                 scope="reflect_agent_final",
+                 max_completion_tokens=max_tokens,
+                 return_usage=True,
+             )
+             llm_duration = int((time.time() - llm_start) * 1000)
+             total_input_tokens += usage.input_tokens
+             total_output_tokens += usage.output_tokens
+             llm_trace.append(
+                 {
+                     "scope": "final",
+                     "duration_ms": llm_duration,
+                     "input_tokens": usage.input_tokens,
+                     "output_tokens": usage.output_tokens,
+                 }
+             )
+             answer = _clean_answer_text(response.strip())
+
+             # Generate structured output if schema provided
+             structured_output = None
+             if response_schema and answer:
+                 structured_output, struct_in, struct_out = await _generate_structured_output(
+                     answer, response_schema, llm_config, reflect_id
+                 )
+                 total_input_tokens += struct_in
+                 total_output_tokens += struct_out
+
+             _log_completion(answer, iteration + 1, forced=True)
+             return ReflectAgentResult(
+                 text=answer,
+                 structured_output=structured_output,
+                 iterations=iteration + 1,
+                 tools_called=total_tools_called,
+                 tool_trace=tool_trace,
+                 llm_trace=_get_llm_trace(),
+                 usage=_get_usage(),
+                 directives_applied=directives_applied,
+             )
+
+         # Check for done tool call (handle various LLM output formats)
+         done_call = next((tc for tc in result.tool_calls if _is_done_tool(tc.name)), None)
+         if done_call:
+             # Guardrail: Require evidence before done
+             has_gathered_evidence = (
+                 bool(available_memory_ids) or bool(available_mental_model_ids) or bool(available_observation_ids)
+             )
+             if not has_gathered_evidence and iteration < max_iterations - 1:
+                 # Add assistant message and fake tool result asking for evidence
+                 messages.append(
+                     {
+                         "role": "assistant",
+                         "tool_calls": [_tool_call_to_dict(done_call)],
+                     }
+                 )
+                 messages.append(
+                     {
+                         "role": "tool",
+                         "tool_call_id": done_call.id,
+                         "name": done_call.name,  # Required by Gemini
+                         "content": json.dumps(
+                             {
+                                 "error": "You must search for information first. Use search_mental_models(), search_observations(), or recall() before providing your final answer."
+                             }
+                         ),
+                     }
+                 )
+                 continue
+
+             # Process done tool
+             return await _process_done_tool(
+                 done_call,
+                 available_memory_ids,
+                 available_mental_model_ids,
+                 available_observation_ids,
+                 iteration + 1,
+                 total_tools_called,
+                 tool_trace,
+                 _get_llm_trace(),
+                 _get_usage(),
+                 _log_completion,
+                 reflect_id,
+                 directives_applied=directives_applied,
+                 llm_config=llm_config,
+                 response_schema=response_schema,
+             )
+
+         # Execute other tools in parallel (exclude done tool in all its format variants)
+         other_tools = [tc for tc in result.tool_calls if not _is_done_tool(tc.name)]
+         if other_tools:
+             # Add assistant message with tool calls
+             messages.append(
+                 {
+                     "role": "assistant",
+                     "tool_calls": [_tool_call_to_dict(tc) for tc in other_tools],
+                 }
+             )
+
+             # Execute tools in parallel
+             tool_tasks = [
+                 _execute_tool_with_timing(
+                     tc,
+                     search_mental_models_fn,
+                     search_observations_fn,
+                     recall_fn,
+                     expand_fn,
+                 )
+                 for tc in other_tools
+             ]
+             tool_results = await asyncio.gather(*tool_tasks, return_exceptions=True)
+             total_tools_called += len(other_tools)
+
+             # Process results and add to messages
+             for tc, result_data in zip(other_tools, tool_results):
+                 if isinstance(result_data, Exception):
+                     # Tool execution failed - send error back to LLM so it can try again
+                     logger.warning(f"[REFLECT {reflect_id}] Tool {tc.name} failed with exception: {result_data}")
+                     output = {"error": f"Tool execution failed: {result_data}"}
+                     duration_ms = 0
+                 else:
+                     output, duration_ms = result_data
+
+                 # Normalize tool name for consistent tracking
+                 normalized_tool_name = _normalize_tool_name(tc.name)
+
+                 # Check if tool returned an error response - log but continue (LLM will see the error)
+                 if isinstance(output, dict) and "error" in output:
+                     logger.warning(
+                         f"[REFLECT {reflect_id}] Tool {normalized_tool_name} returned error: {output['error']}"
+                     )
+
+                 # Track available IDs from tool results (only for successful responses)
+                 if (
+                     normalized_tool_name == "search_mental_models"
+                     and isinstance(output, dict)
+                     and "mental_models" in output
+                 ):
+                     for mm in output["mental_models"]:
+                         if "id" in mm:
+                             available_mental_model_ids.add(mm["id"])
+
+                 if (
+                     normalized_tool_name == "search_observations"
+                     and isinstance(output, dict)
+                     and "observations" in output
+                 ):
+                     for obs in output["observations"]:
+                         if "id" in obs:
+                             available_observation_ids.add(obs["id"])
+
+                 if normalized_tool_name == "recall" and isinstance(output, dict) and "memories" in output:
+                     for memory in output["memories"]:
+                         if "id" in memory:
+                             available_memory_ids.add(memory["id"])
+
+                 # Add tool result message
+                 messages.append(
+                     {
+                         "role": "tool",
+                         "tool_call_id": tc.id,
+                         "name": tc.name,  # Required by Gemini
+                         "content": json.dumps(output, default=str),
+                     }
+                 )
+
+                 # Track for logging and context history
+                 input_dict = {"tool": tc.name, **tc.arguments}
+                 input_summary = _summarize_input(tc.name, tc.arguments)
+
+                 # Extract reason from tool arguments (if provided)
+                 tool_reason = tc.arguments.get("reason")
+
+                 tool_trace.append(
+                     ToolCall(
+                         tool=tc.name,
+                         reason=tool_reason,
+                         input=input_dict,
+                         output=output,
+                         duration_ms=duration_ms,
+                         iteration=iteration + 1,
+                     )
+                 )
+
+                 try:
+                     output_chars = len(json.dumps(output))
+                 except (TypeError, ValueError):
+                     output_chars = len(str(output))
+
+                 tool_trace_summary.append(
+                     {
+                         "tool": tc.name,
+                         "input_summary": input_summary,
+                         "duration_ms": duration_ms,
+                         "output_chars": output_chars,
+                     }
+                 )
+
+                 # Keep context history for fallback final prompt
+                 context_history.append({"tool": tc.name, "input": input_dict, "output": output})
+
+     # Should not reach here
+     answer = "I was unable to formulate a complete answer within the iteration limit."
+     _log_completion(answer, max_iterations, forced=True)
+     return ReflectAgentResult(
+         text=answer,
+         iterations=max_iterations,
+         tools_called=total_tools_called,
+         tool_trace=tool_trace,
+         llm_trace=_get_llm_trace(),
+         usage=_get_usage(),
+         directives_applied=directives_applied,
+     )
+
+
+ def _tool_call_to_dict(tc: "LLMToolCall") -> dict[str, Any]:
+     """Convert LLMToolCall to OpenAI message format."""
+     return {
+         "id": tc.id,
+         "type": "function",
+         "function": {
+             "name": tc.name,
+             "arguments": json.dumps(tc.arguments),
+         },
+     }
+
+
+ async def _process_done_tool(
+     done_call: "LLMToolCall",
+     available_memory_ids: set[str],
+     available_mental_model_ids: set[str],
+     available_observation_ids: set[str],
+     iterations: int,
+     total_tools_called: int,
+     tool_trace: list[ToolCall],
+     llm_trace: list[LLMCall],
+     usage: TokenUsageSummary,
+     log_completion: Callable,
+     reflect_id: str,
+     directives_applied: list[DirectiveInfo],
+     llm_config: "LLMProvider | None" = None,
+     response_schema: dict | None = None,
+ ) -> ReflectAgentResult:
+     """Process the done tool call and return the result."""
+     args = done_call.arguments
+
+     # Extract and clean the answer - some LLMs leak structured output into the answer text
+     raw_answer = args.get("answer", "").strip()
+     answer = _clean_done_answer(raw_answer) if raw_answer else ""
+     if not answer:
+         answer = "No answer provided."
+
+     # Validate IDs (only include IDs that were actually retrieved)
+     used_memory_ids = [mid for mid in args.get("memory_ids", []) if mid in available_memory_ids]
+     used_mental_model_ids = [mid for mid in args.get("mental_model_ids", []) if mid in available_mental_model_ids]
+     used_observation_ids = [oid for oid in args.get("observation_ids", []) if oid in available_observation_ids]
+
+     # Generate structured output if schema provided
+     structured_output = None
+     final_usage = usage
+     if response_schema and llm_config and answer:
+         structured_output, struct_in, struct_out = await _generate_structured_output(
+             answer, response_schema, llm_config, reflect_id
+         )
+         # Add structured output tokens to usage
+         final_usage = TokenUsageSummary(
+             input_tokens=usage.input_tokens + struct_in,
+             output_tokens=usage.output_tokens + struct_out,
+             total_tokens=usage.total_tokens + struct_in + struct_out,
+         )
+
+     log_completion(answer, iterations)
+     return ReflectAgentResult(
+         text=answer,
+         structured_output=structured_output,
+         iterations=iterations,
+         tools_called=total_tools_called,
+         tool_trace=tool_trace,
+         llm_trace=llm_trace,
+         usage=final_usage,
+         used_memory_ids=used_memory_ids,
+         used_mental_model_ids=used_mental_model_ids,
+         used_observation_ids=used_observation_ids,
+         directives_applied=directives_applied,
+     )
+
+
+ async def _execute_tool_with_timing(
+     tc: "LLMToolCall",
+     search_mental_models_fn: Callable[[str, int], Awaitable[dict[str, Any]]],
+     search_observations_fn: Callable[[str, int], Awaitable[dict[str, Any]]],
+     recall_fn: Callable[[str, int], Awaitable[dict[str, Any]]],
+     expand_fn: Callable[[list[str], str], Awaitable[dict[str, Any]]],
+ ) -> tuple[dict[str, Any], int]:
+     """Execute a tool call and return result with timing."""
+     start = time.time()
+     result = await _execute_tool(
+         tc.name,
+         tc.arguments,
+         search_mental_models_fn,
+         search_observations_fn,
+         recall_fn,
+         expand_fn,
+     )
+     duration_ms = int((time.time() - start) * 1000)
+     return result, duration_ms
+
+
+ async def _execute_tool(
+     tool_name: str,
+     args: dict[str, Any],
+     search_mental_models_fn: Callable[[str, int], Awaitable[dict[str, Any]]],
+     search_observations_fn: Callable[[str, int], Awaitable[dict[str, Any]]],
+     recall_fn: Callable[[str, int], Awaitable[dict[str, Any]]],
+     expand_fn: Callable[[list[str], str], Awaitable[dict[str, Any]]],
+ ) -> dict[str, Any]:
+     """Execute a single tool by name."""
+     # Normalize tool name for various LLM output formats
+     tool_name = _normalize_tool_name(tool_name)
+
+     if tool_name == "search_mental_models":
+         query = args.get("query")
+         if not query:
+             return {"error": "search_mental_models requires a query parameter"}
+         max_results = args.get("max_results") or 5
+         return await search_mental_models_fn(query, max_results)
+
+     elif tool_name == "search_observations":
+         query = args.get("query")
+         if not query:
+             return {"error": "search_observations requires a query parameter"}
+         max_tokens = max(args.get("max_tokens") or 5000, 1000)  # Default 5000, min 1000
+         return await search_observations_fn(query, max_tokens)
+
+     elif tool_name == "recall":
+         query = args.get("query")
+         if not query:
+             return {"error": "recall requires a query parameter"}
+         max_tokens = max(args.get("max_tokens") or 2048, 1000)  # Default 2048, min 1000
+         return await recall_fn(query, max_tokens)
+
+     elif tool_name == "expand":
+         memory_ids = args.get("memory_ids", [])
+         if not memory_ids:
+             return {"error": "expand requires memory_ids"}
+         depth = args.get("depth", "chunk")
+         return await expand_fn(memory_ids, depth)
+
+     else:
+         return {"error": f"Unknown tool: {tool_name}"}
+
+
+ def _summarize_input(tool_name: str, args: dict[str, Any]) -> str:
+     """Create a summary of tool input for logging, showing all params."""
+     if tool_name == "search_mental_models":
+         query = args.get("query", "")
+         query_preview = f"'{query[:30]}...'" if len(query) > 30 else f"'{query}'"
+         max_results = args.get("max_results") or 5
+         return f"(query={query_preview}, max_results={max_results})"
+     elif tool_name == "search_observations":
+         query = args.get("query", "")
+         query_preview = f"'{query[:30]}...'" if len(query) > 30 else f"'{query}'"
+         max_tokens = max(args.get("max_tokens") or 5000, 1000)
+         return f"(query={query_preview}, max_tokens={max_tokens})"
+     elif tool_name == "recall":
+         query = args.get("query", "")
+         query_preview = f"'{query[:30]}...'" if len(query) > 30 else f"'{query}'"
+         # Show actual value used (default 2048, min 1000)
+         max_tokens = max(args.get("max_tokens") or 2048, 1000)
+         return f"(query={query_preview}, max_tokens={max_tokens})"
+     elif tool_name == "expand":
+         memory_ids = args.get("memory_ids", [])
+         depth = args.get("depth", "chunk")
+         return f"(memory_ids=[{len(memory_ids)} ids], depth={depth})"
+     elif tool_name == "done":
+         answer = args.get("answer", "")
+         answer_preview = f"'{answer[:30]}...'" if len(answer) > 30 else f"'{answer}'"
+         memory_ids = args.get("memory_ids", [])
+         mental_model_ids = args.get("mental_model_ids", [])
+         observation_ids = args.get("observation_ids", [])
+         return (
+             f"(answer={answer_preview}, mem={len(memory_ids)}, mm={len(mental_model_ids)}, obs={len(observation_ids)})"
+         )
+     return str(args)
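
For orientation only: a minimal sketch of how the run_reflect_agent coroutine added in this version might be driven. The stub callbacks, their return payloads, and the get_llm_provider() helper are illustrative assumptions, not part of the package; in the real API the callbacks and the LLMProvider come from the memory engine.

import asyncio

from hindsight_api.engine.reflect.agent import run_reflect_agent


async def search_mental_models(query: str, max_results: int) -> dict:
    # Stub: no user-curated summaries available.
    return {"mental_models": []}


async def search_observations(query: str, max_tokens: int) -> dict:
    # Stub: one consolidated observation with an id the agent can cite.
    return {"observations": [{"id": "obs-1", "text": "User prefers async APIs."}]}


async def recall(query: str, max_tokens: int) -> dict:
    # Stub: one raw memory as ground truth.
    return {"memories": [{"id": "mem-1", "text": "Chose asyncio for the worker process."}]}


async def expand(memory_ids: list[str], depth: str) -> dict:
    # Stub: no source chunks to expand into (payload shape assumed).
    return {"chunks": []}


async def main() -> None:
    llm = get_llm_provider()  # placeholder: obtain a configured LLMProvider however your deployment does
    result = await run_reflect_agent(
        llm_config=llm,
        bank_id="bank-1234",
        query="What API style does the user prefer?",
        bank_profile={"name": "demo", "mission": "remember user preferences"},
        search_mental_models_fn=search_mental_models,
        search_observations_fn=search_observations,
        recall_fn=recall,
        expand_fn=expand,
        max_iterations=5,
    )
    print(result.text)
    print(result.used_memory_ids, result.used_observation_ids)


asyncio.run(main())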