hindsight_api-0.0.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. hindsight_api/__init__.py +38 -0
  2. hindsight_api/api/__init__.py +105 -0
  3. hindsight_api/api/http.py +1872 -0
  4. hindsight_api/api/mcp.py +157 -0
  5. hindsight_api/engine/__init__.py +47 -0
  6. hindsight_api/engine/cross_encoder.py +97 -0
  7. hindsight_api/engine/db_utils.py +93 -0
  8. hindsight_api/engine/embeddings.py +113 -0
  9. hindsight_api/engine/entity_resolver.py +575 -0
  10. hindsight_api/engine/llm_wrapper.py +269 -0
  11. hindsight_api/engine/memory_engine.py +3095 -0
  12. hindsight_api/engine/query_analyzer.py +519 -0
  13. hindsight_api/engine/response_models.py +222 -0
  14. hindsight_api/engine/retain/__init__.py +50 -0
  15. hindsight_api/engine/retain/bank_utils.py +423 -0
  16. hindsight_api/engine/retain/chunk_storage.py +82 -0
  17. hindsight_api/engine/retain/deduplication.py +104 -0
  18. hindsight_api/engine/retain/embedding_processing.py +62 -0
  19. hindsight_api/engine/retain/embedding_utils.py +54 -0
  20. hindsight_api/engine/retain/entity_processing.py +90 -0
  21. hindsight_api/engine/retain/fact_extraction.py +1027 -0
  22. hindsight_api/engine/retain/fact_storage.py +176 -0
  23. hindsight_api/engine/retain/link_creation.py +121 -0
  24. hindsight_api/engine/retain/link_utils.py +651 -0
  25. hindsight_api/engine/retain/orchestrator.py +405 -0
  26. hindsight_api/engine/retain/types.py +206 -0
  27. hindsight_api/engine/search/__init__.py +15 -0
  28. hindsight_api/engine/search/fusion.py +122 -0
  29. hindsight_api/engine/search/observation_utils.py +132 -0
  30. hindsight_api/engine/search/reranking.py +103 -0
  31. hindsight_api/engine/search/retrieval.py +503 -0
  32. hindsight_api/engine/search/scoring.py +161 -0
  33. hindsight_api/engine/search/temporal_extraction.py +64 -0
  34. hindsight_api/engine/search/think_utils.py +255 -0
  35. hindsight_api/engine/search/trace.py +215 -0
  36. hindsight_api/engine/search/tracer.py +447 -0
  37. hindsight_api/engine/search/types.py +160 -0
  38. hindsight_api/engine/task_backend.py +223 -0
  39. hindsight_api/engine/utils.py +203 -0
  40. hindsight_api/metrics.py +227 -0
  41. hindsight_api/migrations.py +163 -0
  42. hindsight_api/models.py +309 -0
  43. hindsight_api/pg0.py +425 -0
  44. hindsight_api/web/__init__.py +12 -0
  45. hindsight_api/web/server.py +143 -0
  46. hindsight_api-0.0.13.dist-info/METADATA +41 -0
  47. hindsight_api-0.0.13.dist-info/RECORD +48 -0
  48. hindsight_api-0.0.13.dist-info/WHEEL +4 -0
hindsight_api/engine/retain/fact_extraction.py
@@ -0,0 +1,1027 @@
1
+ """
2
+ Fact extraction from text using LLM.
3
+
4
+ Extracts semantic facts, entities, and temporal information from text.
5
+ Uses the LLMConfig wrapper for all LLM calls.
6
+ """
7
+ import logging
8
+ import os
9
+ import json
10
+ import re
11
+ import asyncio
12
+ from datetime import datetime, timedelta
13
+ from typing import List, Dict, Optional, Literal
14
+ from openai import AsyncOpenAI
15
+ from pydantic import BaseModel, Field, field_validator, ConfigDict
16
+ from ..llm_wrapper import OutputTooLongError, LLMConfig
17
+
18
+
19
+ class Entity(BaseModel):
20
+ """An entity extracted from text."""
21
+ text: str = Field(
22
+ description="The specific, named entity as it appears in the fact. Must be a proper noun or specific identifier."
23
+ )
24
+
25
+
26
+ class Fact(BaseModel):
27
+ """
28
+ Final fact model for storage - built from lenient parsing of LLM response.
29
+
30
+ This is what fact_extraction returns and what the rest of the pipeline expects.
31
+ Combined fact text format: "what | when | where | who | why"
32
+ """
33
+ # Required fields
34
+ fact: str = Field(description="Combined fact text: what | when | where | who | why")
35
+ fact_type: Literal["world", "bank", "opinion"] = Field(description="Perspective: world/bank/opinion")
36
+
37
+ # Optional temporal fields
38
+ occurred_start: Optional[str] = None
39
+ occurred_end: Optional[str] = None
40
+ mentioned_at: Optional[str] = None
41
+
42
+ # Optional location field
43
+ where: Optional[str] = Field(None, description="WHERE the fact occurred or is about (specific location, place, or area)")
44
+
45
+ # Optional structured data
46
+ entities: Optional[List[Entity]] = None
47
+ causal_relations: Optional[List['CausalRelation']] = None
48
+
49
+
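For orientation, a minimal sketch of a Fact instance as the lenient parser later in this file builds one. Field values are illustrative only; the combined fact text and the entity objects mirror what the parsing loop below produces.

    from hindsight_api.engine.retain.fact_extraction import Entity, Fact

    fact = Fact(
        fact="Emily got married to Sarah at a rooftop garden ceremony "
             "| When: Saturday, June 8, 2024 "
             "| Involving: Emily (user's college roommate), Sarah (Emily's partner) "
             "| User found it romantic and beautiful",
        fact_type="world",
        occurred_start="2024-06-08T00:00:00Z",
        mentioned_at="2024-06-10T12:00:00Z",
        entities=[Entity(text="user"), Entity(text="Emily"), Entity(text="Sarah")],
    )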
50
+ class CausalRelation(BaseModel):
51
+ """Causal relationship between facts."""
52
+ target_fact_index: int = Field(
53
+ description="Index of the related fact in the facts array (0-based). "
54
+ "This creates a directed causal link to another fact in the extraction."
55
+ )
56
+ relation_type: Literal["causes", "caused_by", "enables", "prevents"] = Field(
57
+ description="Type of causal relationship: "
58
+ "'causes' = this fact directly causes the target fact, "
59
+ "'caused_by' = this fact was caused by the target fact, "
60
+ "'enables' = this fact enables/allows the target fact, "
61
+ "'prevents' = this fact prevents/blocks the target fact"
62
+ )
63
+ strength: float = Field(
64
+ description="Strength of causal relationship (0.0 to 1.0). "
65
+ "1.0 = direct/strong causation, 0.5 = moderate, 0.3 = weak/indirect",
66
+ ge=0.0,
67
+ le=1.0,
68
+ default=1.0
69
+ )
70
+
71
+
72
+ class ExtractedFact(BaseModel):
73
+ """A single extracted fact with 5 required dimensions for comprehensive capture."""
74
+
75
+ model_config = ConfigDict(
76
+ json_schema_mode="validation",
77
+ json_schema_extra={
78
+ "required": ["what", "when", "where", "who", "why", "fact_type"]
79
+ }
80
+ )
81
+
82
+ # ==========================================================================
83
+ # FIVE REQUIRED DIMENSIONS - LLM must think about each one
84
+ # ==========================================================================
85
+
86
+ what: str = Field(
87
+ description="WHAT happened - COMPLETE, DETAILED description with ALL specifics. "
88
+ "NEVER summarize or omit details. Include: exact actions, objects, quantities, specifics. "
89
+ "BE VERBOSE - capture every detail that was mentioned. "
90
+ "Example: 'Emily got married to Sarah at a rooftop garden ceremony with 50 guests attending and a live jazz band playing' "
91
+ "NOT: 'A wedding happened' or 'Emily got married'"
92
+ )
93
+
94
+ when: str = Field(
95
+ description="WHEN it happened - ALWAYS include temporal information if mentioned. "
96
+ "Include: specific dates, times, durations, relative time references. "
97
+ "Examples: 'on June 15th, 2024 at 3pm', 'last weekend', 'for the past 3 years', 'every morning at 6am'. "
98
+ "Write 'N/A' ONLY if absolutely no temporal context exists. Prefer converting to absolute dates when possible."
99
+ )
100
+
101
+ where: str = Field(
102
+ description="WHERE it happened or is about - SPECIFIC locations, places, areas, regions if applicable. "
103
+ "Include: cities, neighborhoods, venues, buildings, countries, specific addresses when mentioned. "
104
+ "Examples: 'downtown San Francisco at a rooftop garden venue', 'at the user's home in Brooklyn', 'online via Zoom', 'Paris, France'. "
105
+ "Write 'N/A' ONLY if absolutely no location context exists or if the fact is completely location-agnostic."
106
+ )
107
+
108
+ who: str = Field(
109
+ description="WHO is involved - ALL people/entities with FULL context and relationships. "
110
+ "Include: names, roles, relationships to user, background details. "
111
+ "Resolve coreferences (if 'my roommate' is later named 'Emily', write 'Emily, the user's college roommate'). "
112
+ "BE DETAILED about relationships and roles. "
113
+ "Example: 'Emily (user's college roommate from Stanford, now works at Google), Sarah (Emily's partner of 5 years, software engineer)' "
114
+ "NOT: 'my friend' or 'Emily and Sarah'"
115
+ )
116
+
117
+ why: str = Field(
118
+ description="WHY it matters - ALL emotional, contextual, and motivational details. "
119
+ "Include EVERYTHING: feelings, preferences, motivations, observations, context, background, significance. "
120
+ "BE VERBOSE - capture all the nuance and meaning. "
121
+ "FOR ASSISTANT FACTS: MUST include what the user asked/requested that led to this interaction! "
122
+ "Example (world): 'The user felt thrilled and inspired, has always dreamed of an outdoor ceremony, mentioned wanting a similar garden venue, was particularly moved by the intimate atmosphere and personal vows' "
123
+ "Example (assistant): 'User asked how to fix slow API performance with 1000+ concurrent users, expected 70-80% reduction in database load' "
124
+ "NOT: 'User liked it' or 'To help user'"
125
+ )
126
+
127
+ # ==========================================================================
128
+ # CLASSIFICATION
129
+ # ==========================================================================
130
+
131
+ fact_kind: str = Field(
132
+ default="conversation",
133
+ description="'event' = specific datable occurrence (set occurred dates), 'conversation' = general info (no occurred dates)"
134
+ )
135
+
136
+ # Temporal fields - optional
137
+ occurred_start: Optional[str] = Field(
138
+ default=None,
139
+ description="WHEN the event happened (ISO timestamp). Only for fact_kind='event'. Leave null for conversations."
140
+ )
141
+ occurred_end: Optional[str] = Field(
142
+ default=None,
143
+ description="WHEN the event ended (ISO timestamp). Only for events with duration. Leave null for conversations."
144
+ )
145
+
146
+ # Classification (CRITICAL - required)
147
+ # Note: LLM uses "assistant" but we convert to "bank" for storage
148
+ fact_type: Literal["world", "assistant"] = Field(
149
+ description="'world' = about the user/others (background, experiences). 'assistant' = interactions with the assistant."
150
+ )
151
+
152
+ # Entities - extracted from 'who' field
153
+ entities: Optional[List[Entity]] = Field(
154
+ default=None,
155
+ description="Named entities from 'who': people names, organizations, places. NOT generic relations."
156
+ )
157
+ causal_relations: Optional[List[CausalRelation]] = Field(
158
+ default=None,
159
+ description="Causal links to other facts. Can be null."
160
+ )
161
+
162
+ @field_validator('entities', mode='before')
163
+ @classmethod
164
+ def ensure_entities_list(cls, v):
165
+ """Ensure entities is always a list (convert None to empty list)."""
166
+ if v is None:
167
+ return []
168
+ return v
169
+
170
+ @field_validator('causal_relations', mode='before')
171
+ @classmethod
172
+ def ensure_causal_relations_list(cls, v):
173
+ """Ensure causal_relations is always a list (convert None to empty list)."""
174
+ if v is None:
175
+ return []
176
+ return v
177
+
178
+ def build_fact_text(self) -> str:
179
+ """Combine all dimensions into a single comprehensive fact string."""
180
+ parts = [self.what]
181
+
182
+ # Add 'who' if not N/A
183
+ if self.who and self.who.upper() != 'N/A':
184
+ parts.append(f"Involving: {self.who}")
185
+
186
+ # Add 'why' if not N/A
187
+ if self.why and self.why.upper() != 'N/A':
188
+ parts.append(self.why)
189
+
190
+ if len(parts) == 1:
191
+ return parts[0]
192
+
193
+ return " | ".join(parts)
194
+
195
+
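A small illustration of build_fact_text with hypothetical field values (note that it joins only the what/who/why parts; 'when' and 'where' are carried in their own fields):

    from hindsight_api.engine.retain.fact_extraction import ExtractedFact

    f = ExtractedFact(
        what="User loves Italian food",
        when="N/A",
        where="N/A",
        who="user",
        why="This is a food preference",
        fact_type="world",
    )
    print(f.build_fact_text())
    # User loves Italian food | Involving: user | This is a food preference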
196
+ class FactExtractionResponse(BaseModel):
197
+ """Response containing all extracted facts."""
198
+ facts: List[ExtractedFact] = Field(
199
+ description="List of extracted factual statements"
200
+ )
201
+
202
+
203
+ def chunk_text(text: str, max_chars: int) -> List[str]:
204
+ """
205
+ Split text into chunks, preserving conversation structure when possible.
206
+
207
+ For JSON conversation arrays (user/assistant turns), splits at turn boundaries
208
+ while preserving speaker context. For plain text, uses sentence-aware splitting.
209
+
210
+ Args:
211
+ text: Input text to chunk (plain text or JSON conversation)
212
+ max_chars: Maximum characters per chunk (extract_facts_from_text passes 3000)
213
+
214
+ Returns:
215
+ List of text chunks, roughly under max_chars
216
+ """
217
+ import json
218
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
219
+
220
+ # If text is small enough, return as-is
221
+ if len(text) <= max_chars:
222
+ return [text]
223
+
224
+ # Try to parse as JSON conversation array
225
+ try:
226
+ parsed = json.loads(text)
227
+ if isinstance(parsed, list) and all(isinstance(turn, dict) for turn in parsed):
228
+ # This looks like a conversation - chunk at turn boundaries
229
+ return _chunk_conversation(parsed, max_chars)
230
+ except (json.JSONDecodeError, ValueError):
231
+ pass
232
+
233
+ # Fall back to sentence-aware text splitting
234
+ splitter = RecursiveCharacterTextSplitter(
235
+ chunk_size=max_chars,
236
+ chunk_overlap=0,
237
+ length_function=len,
238
+ is_separator_regex=False,
239
+ separators=[
240
+ "\n\n", # Paragraph breaks
241
+ "\n", # Line breaks
242
+ ". ", # Sentence endings
243
+ "! ", # Exclamations
244
+ "? ", # Questions
245
+ "; ", # Semicolons
246
+ ", ", # Commas
247
+ " ", # Words
248
+ "", # Characters (last resort)
249
+ ],
250
+ )
251
+
252
+ return splitter.split_text(text)
253
+
254
+
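A quick sketch of chunk_text behaviour (assumes langchain-text-splitters is installed, since the import above runs on every call):

    import json
    from hindsight_api.engine.retain.fact_extraction import chunk_text

    # Small inputs come back untouched as a single chunk.
    assert chunk_text("User loves coffee.", max_chars=3000) == ["User loves coffee."]

    # A JSON conversation larger than max_chars is split at turn boundaries,
    # so every chunk is itself a valid JSON list of complete turns.
    turns = [{"role": "user", "content": "x" * 2000},
             {"role": "assistant", "content": "y" * 2000}]
    for chunk in chunk_text(json.dumps(turns), max_chars=3000):
        assert isinstance(json.loads(chunk), list)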
255
+ def _chunk_conversation(turns: List[dict], max_chars: int) -> List[str]:
256
+ """
257
+ Chunk a conversation array at turn boundaries, preserving complete turns.
258
+
259
+ Args:
260
+ turns: List of conversation turn dicts (with 'role' and 'content' keys)
261
+ max_chars: Maximum characters per chunk
262
+
263
+ Returns:
264
+ List of JSON-serialized chunks, each containing complete turns
265
+ """
266
+ import json
267
+
268
+ chunks = []
269
+ current_chunk = []
270
+ current_size = 2 # Account for "[]"
271
+
272
+ for turn in turns:
273
+ # Estimate size of this turn when serialized (with comma separator)
274
+ turn_json = json.dumps(turn, ensure_ascii=False)
275
+ turn_size = len(turn_json) + 1 # +1 for comma
276
+
277
+ # If adding this turn would exceed limit and we have turns, save current chunk
278
+ if current_size + turn_size > max_chars and current_chunk:
279
+ chunks.append(json.dumps(current_chunk, ensure_ascii=False))
280
+ current_chunk = []
281
+ current_size = 2 # Reset to "[]"
282
+
283
+ # Add turn to current chunk
284
+ current_chunk.append(turn)
285
+ current_size += turn_size
286
+
287
+ # Add final chunk if non-empty
288
+ if current_chunk:
289
+ chunks.append(json.dumps(current_chunk, ensure_ascii=False))
290
+
291
+ return chunks if chunks else [json.dumps(turns, ensure_ascii=False)]
292
+
293
+
294
+ async def _extract_facts_from_chunk(
295
+ chunk: str,
296
+ chunk_index: int,
297
+ total_chunks: int,
298
+ event_date: datetime,
299
+ context: str,
300
+ llm_config: 'LLMConfig',
301
+ agent_name: Optional[str] = None,
302
+ extract_opinions: bool = False
303
+ ) -> List[Fact]:
304
+ """
305
+ Extract facts from a single chunk (internal helper for parallel processing).
306
+
307
+ Note: event_date is included in the user message (formatted with the day of week)
308
+ and is used as the mentioned_at timestamp for every extracted fact.
309
+ """
310
+ agent_context = f"\n- Your name: {agent_name}" if agent_name else ""
311
+
312
+ # Determine which fact types to extract based on the flag
313
+ # Note: We use "assistant" in the prompt but convert to "bank" for storage
314
+ if extract_opinions:
315
+ # When extract_opinions is set, restrict this prompt to opinion-type facts only
316
+ fact_types_instruction = "Extract ONLY 'opinion' type facts (formed opinions, beliefs, and perspectives). DO NOT extract 'world' or 'assistant' facts."
317
+ else:
318
+ fact_types_instruction = "Extract ONLY 'world' and 'assistant' type facts. DO NOT extract opinions - those are extracted separately."
319
+
320
+ prompt = f"""Extract facts from text into structured format with FOUR required dimensions - BE EXTREMELY DETAILED.
321
+
322
+ {fact_types_instruction}
323
+
324
+ Context: {context if context else 'none'}{agent_context}
325
+
326
+ ══════════════════════════════════════════════════════════════════════════
327
+ FACT FORMAT - ALL FIVE DIMENSIONS REQUIRED - MAXIMUM VERBOSITY
328
+ ══════════════════════════════════════════════════════════════════════════
329
+
330
+ For EACH fact, CAPTURE ALL DETAILS - NEVER SUMMARIZE OR OMIT:
331
+
332
+ 1. **what**: WHAT happened - COMPLETE description with ALL specifics (objects, actions, quantities, details)
333
+ 2. **when**: WHEN it happened - ALWAYS include temporal info with DAY OF WEEK (e.g., "Monday, June 10, 2024")
334
+ - Always include the day name: Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday
335
+ - Format: "day_name, month day, year" (e.g., "Saturday, June 9, 2024")
336
+ 3. **where**: WHERE it happened or is about - SPECIFIC locations, places, areas, regions (if applicable)
337
+ 4. **who**: WHO is involved - ALL people/entities with FULL relationships and background
338
+ 5. **why**: WHY it matters - ALL emotions, preferences, motivations, significance, nuance
339
+ - For assistant facts: MUST include what the user asked/requested that triggered this!
340
+
341
+ Plus: fact_type, fact_kind, entities, occurred_start/end (for structured dates), where (structured location)
342
+
343
+ VERBOSITY REQUIREMENT: Include EVERY detail mentioned. More detail is ALWAYS better than less.
344
+
345
+ ══════════════════════════════════════════════════════════════════════════
346
+ COREFERENCE RESOLUTION (CRITICAL)
347
+ ══════════════════════════════════════════════════════════════════════════
348
+
349
+ When text uses BOTH a generic relation AND a name for the same person → LINK THEM!
350
+
351
+ Example input: "I went to my college roommate's wedding last June. Emily finally married Sarah after 5 years together."
352
+
353
+ CORRECT output:
354
+ - what: "Emily got married to Sarah at a rooftop garden ceremony"
355
+ - when: "Saturday, June 8, 2024, after dating for 5 years"
356
+ - where: "downtown San Francisco, at a rooftop garden venue"
357
+ - who: "Emily (user's college roommate), Sarah (Emily's partner of 5 years)"
358
+ - why: "User found it romantic and beautiful, dreams of similar outdoor ceremony"
359
+ - where (structured): "San Francisco"
360
+
361
+ WRONG output:
362
+ - what: "User's roommate got married" ← LOSES THE NAME!
363
+ - who: "the roommate" ← WRONG - use the actual name!
364
+ - where: (missing) ← WRONG - include the location!
365
+
366
+ ══════════════════════════════════════════════════════════════════════════
367
+ TEMPORAL HANDLING
368
+ ══════════════════════════════════════════════════════════════════════════
369
+
370
+ For EVENTS (fact_kind="event"):
371
+ - Convert relative dates → absolute WITH DAY OF WEEK: "yesterday" on Saturday March 15 → "Friday, March 14, 2024"
372
+ - Always include the day name (Monday, Tuesday, etc.) in the 'when' field
373
+ - Set occurred_start/occurred_end to WHEN IT HAPPENED (not when mentioned)
374
+
375
+ For CONVERSATIONS (fact_kind="conversation"):
376
+ - General info, preferences, ongoing states → NO occurred dates
377
+ - Examples: "loves coffee", "works as engineer"
378
+
379
+ ══════════════════════════════════════════════════════════════════════════
380
+ FACT TYPE
381
+ ══════════════════════════════════════════════════════════════════════════
382
+
383
+ - **world**: User's life, other people, events (would exist without this conversation)
384
+ - **assistant**: Interactions with assistant (requests, recommendations, help)
385
+ ⚠️ CRITICAL for assistant facts: ALWAYS capture the user's request/question in the fact!
386
+ Include: what the user asked, what problem they wanted solved, what context they provided
387
+
388
+ ══════════════════════════════════════════════════════════════════════════
389
+ USER PREFERENCES (CRITICAL)
390
+ ══════════════════════════════════════════════════════════════════════════
391
+
392
+ ALWAYS extract user preferences as separate facts! Watch for these keywords:
393
+ - "enjoy", "like", "love", "prefer", "hate", "dislike", "favorite", "ideal", "dream", "want"
394
+
395
+ Example: "I love Italian food and prefer outdoor dining"
396
+ → Fact 1: what="User loves Italian food", who="user", why="This is a food preference", entities=["user"]
397
+ → Fact 2: what="User prefers outdoor dining", who="user", why="This is a dining preference", entities=["user"]
398
+
399
+ ══════════════════════════════════════════════════════════════════════════
400
+ ENTITIES - INCLUDE "user" (CRITICAL)
401
+ ══════════════════════════════════════════════════════════════════════════
402
+
403
+ When a fact is ABOUT the user (their preferences, plans, experiences), ALWAYS include "user" in entities!
404
+
405
+ ✅ CORRECT: entities=["user"] for "User loves coffee"
406
+ ✅ CORRECT: entities=["user", "Emily"] for "User attended Emily's wedding"
407
+ ❌ WRONG: entities=[] for facts about the user
408
+
409
+ ══════════════════════════════════════════════════════════════════════════
410
+ EXAMPLES
411
+ ══════════════════════════════════════════════════════════════════════════
412
+
413
+ Example 1 - World Facts (Context: June 10, 2024):
414
+ Input: "I'm planning my wedding and want a small outdoor ceremony. I just got back from my college roommate Emily's wedding - she married Sarah at a rooftop garden, it was so romantic!"
415
+
416
+ Output facts:
417
+
418
+ 1. User's wedding preference
419
+ - what: "User wants a small outdoor ceremony for their wedding"
420
+ - who: "user"
421
+ - why: "User prefers intimate outdoor settings"
422
+ - fact_type: "world", fact_kind: "conversation"
423
+ - entities: ["user"]
424
+
425
+ 2. User planning wedding
426
+ - what: "User is planning their own wedding"
427
+ - who: "user"
428
+ - why: "Inspired by Emily's ceremony"
429
+ - fact_type: "world", fact_kind: "conversation"
430
+ - entities: ["user"]
431
+
432
+ 3. Emily's wedding (THE EVENT)
433
+ - what: "Emily got married to Sarah at a rooftop garden ceremony in the city"
434
+ - who: "Emily (user's college roommate), Sarah (Emily's partner)"
435
+ - why: "User found it romantic and beautiful"
436
+ - fact_type: "world", fact_kind: "event"
437
+ - occurred_start: "2024-06-09T00:00:00Z" (recently, user "just got back")
438
+ - entities: ["user", "Emily", "Sarah"]
439
+
440
+ Example 2 - Assistant Facts (Context: March 5, 2024):
441
+ Input: "User: My API is really slow when we have 1000+ concurrent users. What can I do?
442
+ Assistant: I'd recommend implementing Redis for caching frequently-accessed data, which should reduce your database load by 70-80%."
443
+
444
+ Output fact:
445
+ - what: "Assistant recommended implementing Redis for caching frequently-accessed data to improve API performance"
446
+ - when: "March 5, 2024 during conversation"
447
+ - who: "user, assistant"
448
+ - why: "User asked how to fix slow API performance with 1000+ concurrent users, expected 70-80% reduction in database load"
449
+ - fact_type: "assistant", fact_kind: "conversation"
450
+ - entities: ["user"]
451
+
452
+ Note how the "why" field captures the FULL STORY: what the user asked AND what outcome was expected!
453
+
454
+ ══════════════════════════════════════════════════════════════════════════
455
+ WHAT TO EXTRACT vs SKIP
456
+ ══════════════════════════════════════════════════════════════════════════
457
+
458
+ ✅ EXTRACT: User preferences (ALWAYS as separate facts!), feelings, plans, events, relationships, achievements
459
+ ❌ SKIP: Greetings, filler ("thanks", "cool"), purely structural statements"""
460
+
461
+
462
+
463
+
464
+ import logging
465
+ from openai import BadRequestError
466
+
467
+ logger = logging.getLogger(__name__)
468
+
469
+ # Retry logic for JSON validation errors
470
+ max_retries = 2
471
+ last_error = None
472
+
473
+ # Build user message with metadata and chunk content in a clear format
474
+ # Format event_date with day of week for better temporal reasoning
475
+ event_date_formatted = event_date.strftime('%A, %B %d, %Y') # e.g., "Monday, June 10, 2024"
476
+ user_message = f"""Extract facts from the following text chunk.
477
+
478
+ Chunk: {chunk_index + 1}/{total_chunks}
479
+ Event Date: {event_date_formatted} ({event_date.isoformat()})
480
+ Context: {context if context else 'none'}
481
+
482
+ Text:
483
+ {chunk}"""
484
+
485
+ for attempt in range(max_retries):
486
+ try:
487
+ extraction_response_json = await llm_config.call(
488
+ messages=[
489
+ {
490
+ "role": "system",
491
+ "content": prompt
492
+ },
493
+ {
494
+ "role": "user",
495
+ "content": user_message
496
+ }
497
+ ],
498
+ response_format=FactExtractionResponse,
499
+ scope="memory_extract_facts",
500
+ temperature=0.1,
501
+ max_tokens=65000,
502
+ skip_validation=True, # Get raw JSON, we'll validate leniently
503
+ )
504
+
505
+ # Lenient parsing of facts from raw JSON
506
+ chunk_facts = []
507
+ has_malformed_facts = False
508
+
509
+ # Handle malformed LLM responses
510
+ if not isinstance(extraction_response_json, dict):
511
+ if attempt < max_retries - 1:
512
+ logger.warning(
513
+ f"LLM returned non-dict JSON on attempt {attempt + 1}/{max_retries}: {type(extraction_response_json).__name__}. Retrying..."
514
+ )
515
+ continue
516
+ else:
517
+ logger.warning(
518
+ f"LLM returned non-dict JSON after {max_retries} attempts: {type(extraction_response_json).__name__}. "
519
+ f"Raw: {str(extraction_response_json)[:500]}"
520
+ )
521
+ return []
522
+
523
+ raw_facts = extraction_response_json.get('facts', [])
524
+ if not raw_facts:
525
+ logger.debug(
526
+ f"LLM response missing 'facts' field or returned empty list. "
527
+ f"Response: {extraction_response_json}. "
528
+ f"Input: "
529
+ f"date: {event_date.isoformat()}, "
530
+ f"context: {context if context else 'none'}, "
531
+ f"text: {chunk}"
532
+ )
533
+
534
+ for i, llm_fact in enumerate(raw_facts):
535
+ # Skip non-dict entries but track them for retry
536
+ if not isinstance(llm_fact, dict):
537
+ logger.warning(f"Skipping non-dict fact at index {i}")
538
+ has_malformed_facts = True
539
+ continue
540
+
541
+ # Helper to get non-empty value
542
+ def get_value(field_name):
543
+ value = llm_fact.get(field_name)
544
+ if value and value != '' and value != [] and value != {} and str(value).upper() != 'N/A':
545
+ return value
546
+ return None
547
+
548
+ # NEW FORMAT: what, when, who, why (all required)
549
+ what = get_value('what')
550
+ when = get_value('when')
551
+ who = get_value('who')
552
+ why = get_value('why')
553
+
554
+ # Fallback to old format if new fields not present
555
+ if not what:
556
+ what = get_value('factual_core')
557
+ if not what:
558
+ logger.warning(f"Skipping fact {i}: missing 'what' field")
559
+ continue
560
+
561
+ # Critical field: fact_type
562
+ # LLM uses "assistant" but we convert to "bank" for storage
563
+ fact_type = llm_fact.get('fact_type')
564
+
565
+ # Convert "assistant" → "bank" for storage
566
+ if fact_type == 'assistant':
567
+ fact_type = 'bank'
568
+
569
+ # Validate fact_type (after conversion)
570
+ if fact_type not in ['world', 'bank', 'opinion']:
571
+ # Try to fix common mistakes - check if they swapped fact_type and fact_kind
572
+ fact_kind = llm_fact.get('fact_kind')
573
+ if fact_kind == 'assistant':
574
+ fact_type = 'bank'
575
+ elif fact_kind in ['world', 'bank', 'opinion']:
576
+ fact_type = fact_kind
577
+ else:
578
+ # Default to 'world' if we can't determine
579
+ fact_type = 'world'
580
+ logger.warning(f"Fact {i}: defaulting to fact_type='world'")
581
+
582
+ # Get fact_kind for temporal handling (but don't store it)
583
+ fact_kind = llm_fact.get('fact_kind', 'conversation')
584
+ if fact_kind not in ['conversation', 'event', 'other']:
585
+ fact_kind = 'conversation'
586
+
587
+ # Build combined fact text from the 4 dimensions: what | when | who | why
588
+ fact_data = {}
589
+ combined_parts = [what]
590
+
591
+ if when:
592
+ combined_parts.append(f"When: {when}")
593
+
594
+ if who:
595
+ combined_parts.append(f"Involving: {who}")
596
+
597
+ if why:
598
+ combined_parts.append(why)
599
+
600
+ combined_text = " | ".join(combined_parts)
601
+
602
+ # Add temporal fields
603
+ # For events: occurred_start/occurred_end (when the event happened)
604
+ if fact_kind == 'event':
605
+ occurred_start = get_value('occurred_start')
606
+ occurred_end = get_value('occurred_end')
607
+ if occurred_start:
608
+ fact_data['occurred_start'] = occurred_start
609
+ if occurred_end:
610
+ fact_data['occurred_end'] = occurred_end
611
+
612
+ # Add entities if present (validate as Entity objects)
613
+ # LLM sometimes returns strings instead of {"text": "..."} format
614
+ entities = get_value('entities')
615
+ if entities:
616
+ # Validate and normalize each entity
617
+ validated_entities = []
618
+ for ent in entities:
619
+ if isinstance(ent, str):
620
+ # Normalize string to Entity object
621
+ validated_entities.append(Entity(text=ent))
622
+ elif isinstance(ent, dict) and 'text' in ent:
623
+ try:
624
+ validated_entities.append(Entity.model_validate(ent))
625
+ except Exception as e:
626
+ logger.warning(f"Invalid entity {ent}: {e}")
627
+ if validated_entities:
628
+ fact_data['entities'] = validated_entities
629
+
630
+ # Add causal relations if present (validate as CausalRelation objects)
631
+ # Filter out invalid relations (missing required fields)
632
+ causal_relations = get_value('causal_relations')
633
+ if causal_relations:
634
+ validated_relations = []
635
+ for rel in causal_relations:
636
+ if isinstance(rel, dict) and 'target_fact_index' in rel and 'relation_type' in rel:
637
+ try:
638
+ validated_relations.append(CausalRelation.model_validate(rel))
639
+ except Exception as e:
640
+ logger.warning(f"Invalid causal relation {rel}: {e}")
641
+ if validated_relations:
642
+ fact_data['causal_relations'] = validated_relations
643
+
644
+ # Always set mentioned_at to the event_date (when the conversation/document occurred)
645
+ fact_data['mentioned_at'] = event_date.isoformat()
646
+
647
+ # Build Fact model instance
648
+ try:
649
+ fact = Fact(
650
+ fact=combined_text,
651
+ fact_type=fact_type,
652
+ **fact_data
653
+ )
654
+ chunk_facts.append(fact)
655
+ except Exception as e:
656
+ logger.error(f"Failed to create Fact model for fact {i}: {e}")
657
+ has_malformed_facts = True
658
+ continue
659
+
660
+ # If we got malformed facts and haven't exhausted retries, try again
661
+ if has_malformed_facts and len(chunk_facts) < len(raw_facts) * 0.8 and attempt < max_retries - 1:
662
+ logger.warning(
663
+ f"Got {len(raw_facts) - len(chunk_facts)} malformed facts out of {len(raw_facts)} on attempt {attempt + 1}/{max_retries}. Retrying..."
664
+ )
665
+ continue
666
+
667
+ return chunk_facts
668
+
669
+ except BadRequestError as e:
670
+ last_error = e
671
+ if "json_validate_failed" in str(e):
672
+ logger.warning(f" [1.3.{chunk_index + 1}] Attempt {attempt + 1}/{max_retries} failed with JSON validation error: {e}")
673
+ if attempt < max_retries - 1:
674
+ logger.info(f" [1.3.{chunk_index + 1}] Retrying...")
675
+ continue
676
+ # If it's not a JSON validation error or we're out of retries, re-raise
677
+ raise
678
+
679
+ # If we exhausted all retries, raise the last error
680
+ raise last_error
681
+
682
+
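To make the lenient-parsing loop above concrete, here is roughly what one raw LLM fact dict becomes (illustrative values; the 'assistant' to 'bank' conversion and the entity normalization happen exactly as coded above):

    raw_fact = {
        "what": "Assistant recommended Redis for caching frequently-accessed data",
        "when": "March 5, 2024 during conversation",
        "who": "user, assistant",
        "why": "User asked how to fix slow API performance with 1000+ concurrent users",
        "fact_type": "assistant",   # stored as "bank"
        "fact_kind": "conversation",
        "entities": ["user"],       # bare strings are wrapped as Entity(text="user")
    }
    # Resulting Fact (abridged):
    #   fact="Assistant recommended Redis ... | When: March 5, 2024 during conversation "
    #        "| Involving: user, assistant | User asked how to fix slow API performance ..."
    #   fact_type="bank", mentioned_at=<event_date ISO string>, entities=[Entity(text="user")]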
683
+ async def _extract_facts_with_auto_split(
684
+ chunk: str,
685
+ chunk_index: int,
686
+ total_chunks: int,
687
+ event_date: datetime,
688
+ context: str,
689
+ llm_config: LLMConfig,
690
+ agent_name: Optional[str] = None,
691
+ extract_opinions: bool = False
692
+ ) -> List[Fact]:
693
+ """
694
+ Extract facts from a chunk with automatic splitting if output exceeds token limits.
695
+
696
+ If the LLM output is too long (OutputTooLongError), this function automatically
697
+ splits the chunk in half and processes each half recursively.
698
+
699
+ Args:
700
+ chunk: Text chunk to process
701
+ chunk_index: Index of this chunk in the original list
702
+ total_chunks: Total number of original chunks
703
+ event_date: Reference date for temporal information
704
+ context: Context about the conversation/document
705
+ llm_config: LLM configuration to use
706
+ agent_name: Optional agent name (memory owner)
707
+ extract_opinions: If True, extract ONLY opinions. If False, extract world and assistant ('bank') facts (no opinions)
708
+
709
+ Returns:
710
+ List of Fact model instances extracted from the chunk (possibly from sub-chunks)
711
+ """
712
+ import logging
713
+ logger = logging.getLogger(__name__)
714
+
715
+ try:
716
+ # Try to extract facts from the full chunk
717
+ return await _extract_facts_from_chunk(
718
+ chunk=chunk,
719
+ chunk_index=chunk_index,
720
+ total_chunks=total_chunks,
721
+ event_date=event_date,
722
+ context=context,
723
+ llm_config=llm_config,
724
+ agent_name=agent_name,
725
+ extract_opinions=extract_opinions
726
+ )
727
+ except OutputTooLongError as e:
728
+ # Output exceeded token limits - split the chunk in half and retry
729
+ logger.warning(
730
+ f"Output too long for chunk {chunk_index + 1}/{total_chunks} "
731
+ f"({len(chunk)} chars). Splitting in half and retrying..."
732
+ )
733
+
734
+ # Split at the midpoint, preferring sentence boundaries
735
+ mid_point = len(chunk) // 2
736
+
737
+ # Try to find a sentence boundary near the midpoint
738
+ # Look for ". ", "! ", "? " within 20% of midpoint
739
+ search_range = int(len(chunk) * 0.2)
740
+ search_start = max(0, mid_point - search_range)
741
+ search_end = min(len(chunk), mid_point + search_range)
742
+
743
+ sentence_endings = ['. ', '! ', '? ', '\n\n']
744
+ best_split = mid_point
745
+
746
+ for ending in sentence_endings:
747
+ pos = chunk.rfind(ending, search_start, search_end)
748
+ if pos != -1:
749
+ best_split = pos + len(ending)
750
+ break
751
+
752
+ # Split the chunk
753
+ first_half = chunk[:best_split].strip()
754
+ second_half = chunk[best_split:].strip()
755
+
756
+ logger.info(
757
+ f"Split chunk {chunk_index + 1} into two sub-chunks: "
758
+ f"{len(first_half)} chars and {len(second_half)} chars"
759
+ )
760
+
761
+ # Process both halves recursively (in parallel)
762
+ sub_tasks = [
763
+ _extract_facts_with_auto_split(
764
+ chunk=first_half,
765
+ chunk_index=chunk_index,
766
+ total_chunks=total_chunks,
767
+ event_date=event_date,
768
+ context=context,
769
+ llm_config=llm_config,
770
+ agent_name=agent_name,
771
+ extract_opinions=extract_opinions
772
+ ),
773
+ _extract_facts_with_auto_split(
774
+ chunk=second_half,
775
+ chunk_index=chunk_index,
776
+ total_chunks=total_chunks,
777
+ event_date=event_date,
778
+ context=context,
779
+ llm_config=llm_config,
780
+ agent_name=agent_name,
781
+ extract_opinions=extract_opinions
782
+ )
783
+ ]
784
+
785
+ sub_results = await asyncio.gather(*sub_tasks)
786
+
787
+ # Combine results from both halves
788
+ all_facts = []
789
+ for sub_result in sub_results:
790
+ all_facts.extend(sub_result)
791
+
792
+ logger.info(
793
+ f"Successfully extracted {len(all_facts)} facts from split chunk {chunk_index + 1}"
794
+ )
795
+
796
+ return all_facts
797
+
798
+
799
+ async def extract_facts_from_text(
800
+ text: str,
801
+ event_date: datetime,
802
+ llm_config: LLMConfig,
803
+ agent_name: str,
804
+ context: str = "",
805
+ extract_opinions: bool = False,
806
+ ) -> tuple[List[Fact], List[tuple[str, int]]]:
807
+ """
808
+ Extract semantic facts from conversational or narrative text using LLM.
809
+
810
+ For large texts (>3000 chars), automatically chunks at sentence boundaries
811
+ to avoid hitting output token limits. Processes ALL chunks in PARALLEL for speed.
812
+
813
+ If a chunk produces output that exceeds token limits (OutputTooLongError), it is
814
+ automatically split in half and retried recursively until successful.
815
+
816
+ Args:
817
+ text: Input text (conversation, article, etc.)
818
+ event_date: Reference date for resolving relative times
819
+ context: Context about the conversation/document
820
+ llm_config: LLM configuration to use
821
+ agent_name: Agent name (memory owner)
822
+ extract_opinions: If True, extract ONLY opinions. If False, extract world and bank facts (no opinions)
823
+
824
+ Returns:
825
+ Tuple of (facts, chunks) where:
826
+ - facts: List of Fact model instances
827
+ - chunks: List of tuples (chunk_text, fact_count) for each chunk
828
+ """
829
+ chunks = chunk_text(text, max_chars=3000)
830
+ tasks = [
831
+ _extract_facts_with_auto_split(
832
+ chunk=chunk,
833
+ chunk_index=i,
834
+ total_chunks=len(chunks),
835
+ event_date=event_date,
836
+ context=context,
837
+ llm_config=llm_config,
838
+ agent_name=agent_name,
839
+ extract_opinions=extract_opinions
840
+ )
841
+ for i, chunk in enumerate(chunks)
842
+ ]
843
+ chunk_results = await asyncio.gather(*tasks)
844
+ all_facts = []
845
+ chunk_metadata = [] # [(chunk_text, fact_count), ...]
846
+ for chunk, chunk_facts in zip(chunks, chunk_results):
847
+ all_facts.extend(chunk_facts)
848
+ chunk_metadata.append((chunk, len(chunk_facts)))
849
+ return all_facts, chunk_metadata
850
+
851
+
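A hedged end-to-end usage sketch. An LLMConfig instance is assumed to come from hindsight_api.engine.llm_wrapper; its construction is outside this hunk.

    import asyncio
    from datetime import datetime, timezone
    from hindsight_api.engine.retain.fact_extraction import extract_facts_from_text

    async def demo(llm_config):
        facts, chunks = await extract_facts_from_text(
            text="I just got back from Emily's wedding in San Francisco.",
            event_date=datetime(2024, 6, 10, tzinfo=timezone.utc),
            llm_config=llm_config,
            agent_name="assistant",
            context="casual chat with the user",
        )
        for fact in facts:
            print(fact.fact_type, "|", fact.fact)
        for chunk, fact_count in chunks:
            print(f"chunk of {len(chunk)} chars -> {fact_count} facts")

    # asyncio.run(demo(llm_config))  # requires a configured LLMConfig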
852
+ # ============================================================================
853
+ # ORCHESTRATION LAYER
854
+ # ============================================================================
855
+
856
+ # Import types for the orchestration layer (note: ExtractedFact here is different from the Pydantic model above)
857
+ from .types import RetainContent, ExtractedFact as ExtractedFactType, ChunkMetadata, CausalRelation as CausalRelationType
858
+ from typing import Tuple
859
+
860
+ logger = logging.getLogger(__name__)
861
+
862
+ # Each fact gets 10 seconds offset to preserve ordering within a document
863
+ SECONDS_PER_FACT = 10
864
+
865
+
866
+ async def extract_facts_from_contents(
867
+ contents: List[RetainContent],
868
+ llm_config,
869
+ agent_name: str,
870
+ extract_opinions: bool = False
871
+ ) -> Tuple[List[ExtractedFactType], List[ChunkMetadata]]:
872
+ """
873
+ Extract facts from multiple content items in parallel.
874
+
875
+ This function:
876
+ 1. Extracts facts from all contents in parallel using the LLM
877
+ 2. Tracks which facts came from which chunks
878
+ 3. Adds time offsets to preserve fact ordering within each content
879
+ 4. Returns typed ExtractedFact and ChunkMetadata objects
880
+
881
+ Args:
882
+ contents: List of RetainContent objects to process
883
+ llm_config: LLM configuration for fact extraction
884
+ agent_name: Name of the agent (for agent-related fact detection)
885
+ extract_opinions: If True, extract only opinions; otherwise world/bank facts
886
+
887
+ Returns:
888
+ Tuple of (extracted_facts, chunks_metadata)
889
+ """
890
+ if not contents:
891
+ return [], []
892
+
893
+ # Step 1: Create parallel fact extraction tasks
894
+ fact_extraction_tasks = []
895
+ for item in contents:
896
+ # Call extract_facts_from_text directly (defined earlier in this file)
897
+ # to avoid circular import with utils.extract_facts
898
+ task = extract_facts_from_text(
899
+ text=item.content,
900
+ event_date=item.event_date,
901
+ context=item.context,
902
+ llm_config=llm_config,
903
+ agent_name=agent_name,
904
+ extract_opinions=extract_opinions
905
+ )
906
+ fact_extraction_tasks.append(task)
907
+
908
+ # Step 2: Wait for all fact extractions to complete
909
+ all_fact_results = await asyncio.gather(*fact_extraction_tasks)
910
+
911
+ # Step 3: Flatten and convert to typed objects
912
+ extracted_facts: List[ExtractedFactType] = []
913
+ chunks_metadata: List[ChunkMetadata] = []
914
+
915
+ global_chunk_idx = 0
916
+ global_fact_idx = 0
917
+
918
+ for content_index, (content, (facts_from_llm, chunks_from_llm)) in enumerate(zip(contents, all_fact_results)):
919
+ chunk_start_idx = global_chunk_idx
920
+
921
+ # Convert chunk tuples to ChunkMetadata objects
922
+ for chunk_index_in_content, (chunk_text, chunk_fact_count) in enumerate(chunks_from_llm):
923
+ chunk_metadata = ChunkMetadata(
924
+ chunk_text=chunk_text,
925
+ fact_count=chunk_fact_count,
926
+ content_index=content_index,
927
+ chunk_index=global_chunk_idx
928
+ )
929
+ chunks_metadata.append(chunk_metadata)
930
+ global_chunk_idx += 1
931
+
932
+ # Convert facts to ExtractedFact objects with proper indexing
933
+ fact_idx_in_content = 0
934
+ for chunk_idx_in_content, (chunk_text, chunk_fact_count) in enumerate(chunks_from_llm):
935
+ chunk_global_idx = chunk_start_idx + chunk_idx_in_content
936
+
937
+ for _ in range(chunk_fact_count):
938
+ if fact_idx_in_content < len(facts_from_llm):
939
+ fact_from_llm = facts_from_llm[fact_idx_in_content]
940
+
941
+ # Convert Fact model from LLM to ExtractedFactType dataclass
942
+ # mentioned_at is always the event_date (when the conversation/document occurred)
943
+ extracted_fact = ExtractedFactType(
944
+ fact_text=fact_from_llm.fact,
945
+ fact_type=fact_from_llm.fact_type,
946
+ entities=[e.text for e in (fact_from_llm.entities or [])],
947
+ # occurred_start/end: from LLM only, leave None if not provided
948
+ occurred_start=_parse_datetime(fact_from_llm.occurred_start) if fact_from_llm.occurred_start else None,
949
+ occurred_end=_parse_datetime(fact_from_llm.occurred_end) if fact_from_llm.occurred_end else None,
950
+ causal_relations=_convert_causal_relations(
951
+ fact_from_llm.causal_relations or [],
952
+ global_fact_idx
953
+ ),
954
+ content_index=content_index,
955
+ chunk_index=chunk_global_idx,
956
+ context=content.context,
957
+ # mentioned_at: always the event_date (when the conversation/document occurred)
958
+ mentioned_at=content.event_date,
959
+ metadata=content.metadata
960
+ )
961
+
962
+ extracted_facts.append(extracted_fact)
963
+ global_fact_idx += 1
964
+ fact_idx_in_content += 1
965
+
966
+ # Step 4: Add time offsets to preserve ordering within each content
967
+ _add_temporal_offsets(extracted_facts, contents)
968
+
969
+ return extracted_facts, chunks_metadata
970
+
971
+
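A hypothetical call into the orchestration layer; this assumes RetainContent (defined in types.py, not shown in this hunk) accepts content, event_date, context and metadata as keyword arguments:

    from datetime import datetime, timezone
    from hindsight_api.engine.retain.types import RetainContent
    from hindsight_api.engine.retain.fact_extraction import extract_facts_from_contents

    contents = [
        RetainContent(
            content="I love Italian food and prefer outdoor dining.",
            event_date=datetime(2024, 3, 5, tzinfo=timezone.utc),
            context="preferences chat",
            metadata={},
        )
    ]
    # facts, chunk_meta = await extract_facts_from_contents(contents, llm_config, "assistant")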
972
+ def _parse_datetime(date_str: str):
973
+ """Parse ISO datetime string."""
974
+ from dateutil import parser as date_parser
975
+ try:
976
+ return date_parser.isoparse(date_str)
977
+ except Exception:
978
+ return None
979
+
980
+
981
+ def _convert_causal_relations(relations_from_llm, fact_start_idx: int) -> List[CausalRelationType]:
982
+ """
983
+ Convert causal relations from LLM format to ExtractedFact format.
984
+
985
+ Adjusts target_fact_index from content-relative to global indices.
986
+ """
987
+ causal_relations = []
988
+ for rel in relations_from_llm:
989
+ causal_relation = CausalRelationType(
990
+ relation_type=rel.relation_type,
991
+ target_fact_index=fact_start_idx + rel.target_fact_index,
992
+ strength=rel.strength
993
+ )
994
+ causal_relations.append(causal_relation)
995
+ return causal_relations
996
+
997
+
998
+ def _add_temporal_offsets(facts: List[ExtractedFactType], contents: List[RetainContent]) -> None:
999
+ """
1000
+ Add time offsets to preserve fact ordering within each content.
1001
+
1002
+ This allows retrieval to distinguish between facts that happened earlier vs later
1003
+ in the same conversation, even when the base event_date is the same.
1004
+
1005
+ Modifies facts in place.
1006
+ """
1007
+ # Group facts by content_index
1008
+ current_content_idx = 0
1009
+ content_fact_start = 0
1010
+
1011
+ for i, fact in enumerate(facts):
1012
+ if fact.content_index != current_content_idx:
1013
+ # Moved to next content
1014
+ current_content_idx = fact.content_index
1015
+ content_fact_start = i
1016
+
1017
+ # Calculate position within this content
1018
+ fact_position = i - content_fact_start
1019
+ offset = timedelta(seconds=fact_position * SECONDS_PER_FACT)
1020
+
1021
+ # Apply offset to all temporal fields
1022
+ if fact.occurred_start:
1023
+ fact.occurred_start = fact.occurred_start + offset
1024
+ if fact.occurred_end:
1025
+ fact.occurred_end = fact.occurred_end + offset
1026
+ if fact.mentioned_at:
1027
+ fact.mentioned_at = fact.mentioned_at + offset
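A short worked example of the offset arithmetic above: with SECONDS_PER_FACT = 10, consecutive facts from the same content item are shifted by 0, 10, 20, ... seconds, so their ordering survives even though they share one event_date.

    from datetime import datetime, timedelta, timezone

    SECONDS_PER_FACT = 10
    event_date = datetime(2024, 6, 10, tzinfo=timezone.utc)

    for fact_position in range(3):   # position of the fact within its content item
        offset = timedelta(seconds=fact_position * SECONDS_PER_FACT)
        print((event_date + offset).isoformat())
    # 2024-06-10T00:00:00+00:00
    # 2024-06-10T00:00:10+00:00
    # 2024-06-10T00:00:20+00:00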