hindsight-api 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. hindsight_api/__init__.py +10 -9
  2. hindsight_api/alembic/env.py +5 -8
  3. hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +266 -180
  4. hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +32 -32
  5. hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +11 -11
  6. hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +7 -12
  7. hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +23 -15
  8. hindsight_api/alembic/versions/rename_personality_to_disposition.py +30 -21
  9. hindsight_api/api/__init__.py +10 -10
  10. hindsight_api/api/http.py +575 -593
  11. hindsight_api/api/mcp.py +30 -28
  12. hindsight_api/banner.py +13 -6
  13. hindsight_api/config.py +9 -13
  14. hindsight_api/engine/__init__.py +9 -9
  15. hindsight_api/engine/cross_encoder.py +22 -21
  16. hindsight_api/engine/db_utils.py +5 -4
  17. hindsight_api/engine/embeddings.py +22 -21
  18. hindsight_api/engine/entity_resolver.py +81 -75
  19. hindsight_api/engine/llm_wrapper.py +61 -79
  20. hindsight_api/engine/memory_engine.py +603 -625
  21. hindsight_api/engine/query_analyzer.py +100 -97
  22. hindsight_api/engine/response_models.py +105 -106
  23. hindsight_api/engine/retain/__init__.py +9 -16
  24. hindsight_api/engine/retain/bank_utils.py +34 -58
  25. hindsight_api/engine/retain/chunk_storage.py +4 -12
  26. hindsight_api/engine/retain/deduplication.py +9 -28
  27. hindsight_api/engine/retain/embedding_processing.py +4 -11
  28. hindsight_api/engine/retain/embedding_utils.py +3 -4
  29. hindsight_api/engine/retain/entity_processing.py +7 -17
  30. hindsight_api/engine/retain/fact_extraction.py +155 -165
  31. hindsight_api/engine/retain/fact_storage.py +11 -23
  32. hindsight_api/engine/retain/link_creation.py +11 -39
  33. hindsight_api/engine/retain/link_utils.py +166 -95
  34. hindsight_api/engine/retain/observation_regeneration.py +39 -52
  35. hindsight_api/engine/retain/orchestrator.py +72 -62
  36. hindsight_api/engine/retain/types.py +49 -43
  37. hindsight_api/engine/search/__init__.py +5 -5
  38. hindsight_api/engine/search/fusion.py +6 -15
  39. hindsight_api/engine/search/graph_retrieval.py +22 -23
  40. hindsight_api/engine/search/mpfp_retrieval.py +76 -92
  41. hindsight_api/engine/search/observation_utils.py +9 -16
  42. hindsight_api/engine/search/reranking.py +4 -7
  43. hindsight_api/engine/search/retrieval.py +87 -66
  44. hindsight_api/engine/search/scoring.py +5 -7
  45. hindsight_api/engine/search/temporal_extraction.py +8 -11
  46. hindsight_api/engine/search/think_utils.py +115 -39
  47. hindsight_api/engine/search/trace.py +68 -39
  48. hindsight_api/engine/search/tracer.py +44 -35
  49. hindsight_api/engine/search/types.py +20 -17
  50. hindsight_api/engine/task_backend.py +21 -26
  51. hindsight_api/engine/utils.py +25 -10
  52. hindsight_api/main.py +21 -40
  53. hindsight_api/mcp_local.py +190 -0
  54. hindsight_api/metrics.py +44 -30
  55. hindsight_api/migrations.py +10 -8
  56. hindsight_api/models.py +60 -72
  57. hindsight_api/pg0.py +22 -23
  58. hindsight_api/server.py +3 -6
  59. {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.6.dist-info}/METADATA +2 -2
  60. hindsight_api-0.1.6.dist-info/RECORD +64 -0
  61. {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.6.dist-info}/entry_points.txt +1 -0
  62. hindsight_api-0.1.5.dist-info/RECORD +0 -63
  63. {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.6.dist-info}/WHEEL +0 -0
@@ -4,16 +4,17 @@ Fact extraction from text using LLM.
4
4
  Extracts semantic facts, entities, and temporal information from text.
5
5
  Uses the LLMConfig wrapper for all LLM calls.
6
6
  """
7
- import logging
8
- import os
7
+
8
+ import asyncio
9
9
  import json
10
+ import logging
10
11
  import re
11
- import asyncio
12
12
  from datetime import datetime, timedelta
13
- from typing import List, Dict, Optional, Literal
14
- from openai import AsyncOpenAI
15
- from pydantic import BaseModel, Field, field_validator, ConfigDict
16
- from ..llm_wrapper import OutputTooLongError, LLMConfig
13
+ from typing import Literal
14
+
15
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
16
+
17
+ from ..llm_wrapper import LLMConfig, OutputTooLongError
17
18
 
18
19
 
19
20
  def _sanitize_text(text: str) -> str:
@@ -31,11 +32,12 @@ def _sanitize_text(text: str) -> str:
31
32
  return text
32
33
  # Remove surrogate characters (U+D800 to U+DFFF) using regex
33
34
  # These are invalid in UTF-8 and cause encoding errors
34
- return re.sub(r'[\ud800-\udfff]', '', text)
35
+ return re.sub(r"[\ud800-\udfff]", "", text)
35
36
 
36
37
 
37
38
  class Entity(BaseModel):
38
39
  """An entity extracted from text."""
40
+
39
41
  text: str = Field(
40
42
  description="The specific, named entity as it appears in the fact. Must be a proper noun or specific identifier."
41
43
  )
@@ -48,42 +50,46 @@ class Fact(BaseModel):
48
50
  This is what fact_extraction returns and what the rest of the pipeline expects.
49
51
  Combined fact text format: "what | when | where | who | why"
50
52
  """
53
+
51
54
  # Required fields
52
55
  fact: str = Field(description="Combined fact text: what | when | where | who | why")
53
56
  fact_type: Literal["world", "experience", "opinion"] = Field(description="Perspective: world/experience/opinion")
54
57
 
55
58
  # Optional temporal fields
56
- occurred_start: Optional[str] = None
57
- occurred_end: Optional[str] = None
58
- mentioned_at: Optional[str] = None
59
+ occurred_start: str | None = None
60
+ occurred_end: str | None = None
61
+ mentioned_at: str | None = None
59
62
 
60
63
  # Optional location field
61
- where: Optional[str] = Field(None, description="WHERE the fact occurred or is about (specific location, place, or area)")
64
+ where: str | None = Field(
65
+ None, description="WHERE the fact occurred or is about (specific location, place, or area)"
66
+ )
62
67
 
63
68
  # Optional structured data
64
- entities: Optional[List[Entity]] = None
65
- causal_relations: Optional[List['CausalRelation']] = None
69
+ entities: list[Entity] | None = None
70
+ causal_relations: list["CausalRelation"] | None = None
66
71
 
67
72
 
68
73
  class CausalRelation(BaseModel):
69
74
  """Causal relationship between facts."""
75
+
70
76
  target_fact_index: int = Field(
71
77
  description="Index of the related fact in the facts array (0-based). "
72
- "This creates a directed causal link to another fact in the extraction."
78
+ "This creates a directed causal link to another fact in the extraction."
73
79
  )
74
80
  relation_type: Literal["causes", "caused_by", "enables", "prevents"] = Field(
75
81
  description="Type of causal relationship: "
76
- "'causes' = this fact directly causes the target fact, "
77
- "'caused_by' = this fact was caused by the target fact, "
78
- "'enables' = this fact enables/allows the target fact, "
79
- "'prevents' = this fact prevents/blocks the target fact"
82
+ "'causes' = this fact directly causes the target fact, "
83
+ "'caused_by' = this fact was caused by the target fact, "
84
+ "'enables' = this fact enables/allows the target fact, "
85
+ "'prevents' = this fact prevents/blocks the target fact"
80
86
  )
81
87
  strength: float = Field(
82
88
  description="Strength of causal relationship (0.0 to 1.0). "
83
- "1.0 = direct/strong causation, 0.5 = moderate, 0.3 = weak/indirect",
89
+ "1.0 = direct/strong causation, 0.5 = moderate, 0.3 = weak/indirect",
84
90
  ge=0.0,
85
91
  le=1.0,
86
- default=1.0
92
+ default=1.0,
87
93
  )
88
94
 
89
95
 
@@ -92,9 +98,7 @@ class ExtractedFact(BaseModel):
92
98
 
93
99
  model_config = ConfigDict(
94
100
  json_schema_mode="validation",
95
- json_schema_extra={
96
- "required": ["what", "when", "where", "who", "why", "fact_type"]
97
- }
101
+ json_schema_extra={"required": ["what", "when", "where", "who", "why", "fact_type"]},
98
102
  )
99
103
 
100
104
  # ==========================================================================
@@ -103,43 +107,43 @@ class ExtractedFact(BaseModel):
103
107
 
104
108
  what: str = Field(
105
109
  description="WHAT happened - COMPLETE, DETAILED description with ALL specifics. "
106
- "NEVER summarize or omit details. Include: exact actions, objects, quantities, specifics. "
107
- "BE VERBOSE - capture every detail that was mentioned. "
108
- "Example: 'Emily got married to Sarah at a rooftop garden ceremony with 50 guests attending and a live jazz band playing' "
109
- "NOT: 'A wedding happened' or 'Emily got married'"
110
+ "NEVER summarize or omit details. Include: exact actions, objects, quantities, specifics. "
111
+ "BE VERBOSE - capture every detail that was mentioned. "
112
+ "Example: 'Emily got married to Sarah at a rooftop garden ceremony with 50 guests attending and a live jazz band playing' "
113
+ "NOT: 'A wedding happened' or 'Emily got married'"
110
114
  )
111
115
 
112
116
  when: str = Field(
113
117
  description="WHEN it happened - ALWAYS include temporal information if mentioned. "
114
- "Include: specific dates, times, durations, relative time references. "
115
- "Examples: 'on June 15th, 2024 at 3pm', 'last weekend', 'for the past 3 years', 'every morning at 6am'. "
116
- "Write 'N/A' ONLY if absolutely no temporal context exists. Prefer converting to absolute dates when possible."
118
+ "Include: specific dates, times, durations, relative time references. "
119
+ "Examples: 'on June 15th, 2024 at 3pm', 'last weekend', 'for the past 3 years', 'every morning at 6am'. "
120
+ "Write 'N/A' ONLY if absolutely no temporal context exists. Prefer converting to absolute dates when possible."
117
121
  )
118
122
 
119
123
  where: str = Field(
120
124
  description="WHERE it happened or is about - SPECIFIC locations, places, areas, regions if applicable. "
121
- "Include: cities, neighborhoods, venues, buildings, countries, specific addresses when mentioned. "
122
- "Examples: 'downtown San Francisco at a rooftop garden venue', 'at the user's home in Brooklyn', 'online via Zoom', 'Paris, France'. "
123
- "Write 'N/A' ONLY if absolutely no location context exists or if the fact is completely location-agnostic."
125
+ "Include: cities, neighborhoods, venues, buildings, countries, specific addresses when mentioned. "
126
+ "Examples: 'downtown San Francisco at a rooftop garden venue', 'at the user's home in Brooklyn', 'online via Zoom', 'Paris, France'. "
127
+ "Write 'N/A' ONLY if absolutely no location context exists or if the fact is completely location-agnostic."
124
128
  )
125
129
 
126
130
  who: str = Field(
127
131
  description="WHO is involved - ALL people/entities with FULL context and relationships. "
128
- "Include: names, roles, relationships to user, background details. "
129
- "Resolve coreferences (if 'my roommate' is later named 'Emily', write 'Emily, the user's college roommate'). "
130
- "BE DETAILED about relationships and roles. "
131
- "Example: 'Emily (user's college roommate from Stanford, now works at Google), Sarah (Emily's partner of 5 years, software engineer)' "
132
- "NOT: 'my friend' or 'Emily and Sarah'"
132
+ "Include: names, roles, relationships to user, background details. "
133
+ "Resolve coreferences (if 'my roommate' is later named 'Emily', write 'Emily, the user's college roommate'). "
134
+ "BE DETAILED about relationships and roles. "
135
+ "Example: 'Emily (user's college roommate from Stanford, now works at Google), Sarah (Emily's partner of 5 years, software engineer)' "
136
+ "NOT: 'my friend' or 'Emily and Sarah'"
133
137
  )
134
138
 
135
139
  why: str = Field(
136
140
  description="WHY it matters - ALL emotional, contextual, and motivational details. "
137
- "Include EVERYTHING: feelings, preferences, motivations, observations, context, background, significance. "
138
- "BE VERBOSE - capture all the nuance and meaning. "
139
- "FOR ASSISTANT FACTS: MUST include what the user asked/requested that led to this interaction! "
140
- "Example (world): 'The user felt thrilled and inspired, has always dreamed of an outdoor ceremony, mentioned wanting a similar garden venue, was particularly moved by the intimate atmosphere and personal vows' "
141
- "Example (assistant): 'User asked how to fix slow API performance with 1000+ concurrent users, expected 70-80% reduction in database load' "
142
- "NOT: 'User liked it' or 'To help user'"
141
+ "Include EVERYTHING: feelings, preferences, motivations, observations, context, background, significance. "
142
+ "BE VERBOSE - capture all the nuance and meaning. "
143
+ "FOR ASSISTANT FACTS: MUST include what the user asked/requested that led to this interaction! "
144
+ "Example (world): 'The user felt thrilled and inspired, has always dreamed of an outdoor ceremony, mentioned wanting a similar garden venue, was particularly moved by the intimate atmosphere and personal vows' "
145
+ "Example (assistant): 'User asked how to fix slow API performance with 1000+ concurrent users, expected 70-80% reduction in database load' "
146
+ "NOT: 'User liked it' or 'To help user'"
143
147
  )
144
148
 
145
149
  # ==========================================================================
@@ -148,17 +152,17 @@ class ExtractedFact(BaseModel):
148
152
 
149
153
  fact_kind: str = Field(
150
154
  default="conversation",
151
- description="'event' = specific datable occurrence (set occurred dates), 'conversation' = general info (no occurred dates)"
155
+ description="'event' = specific datable occurrence (set occurred dates), 'conversation' = general info (no occurred dates)",
152
156
  )
153
157
 
154
158
  # Temporal fields - optional
155
- occurred_start: Optional[str] = Field(
159
+ occurred_start: str | None = Field(
156
160
  default=None,
157
- description="WHEN the event happened (ISO timestamp). Only for fact_kind='event'. Leave null for conversations."
161
+ description="WHEN the event happened (ISO timestamp). Only for fact_kind='event'. Leave null for conversations.",
158
162
  )
159
- occurred_end: Optional[str] = Field(
163
+ occurred_end: str | None = Field(
160
164
  default=None,
161
- description="WHEN the event ended (ISO timestamp). Only for events with duration. Leave null for conversations."
165
+ description="WHEN the event ended (ISO timestamp). Only for events with duration. Leave null for conversations.",
162
166
  )
163
167
 
164
168
  # Classification (CRITICAL - required)
@@ -168,16 +172,15 @@ class ExtractedFact(BaseModel):
168
172
  )
169
173
 
170
174
  # Entities - extracted from fact content
171
- entities: Optional[List[Entity]] = Field(
175
+ entities: list[Entity] | None = Field(
172
176
  default=None,
173
- description="Named entities, objects, AND abstract concepts from the fact. Include: people names, organizations, places, significant objects (e.g., 'coffee maker', 'car'), AND abstract concepts/themes (e.g., 'friendship', 'career growth', 'loss', 'celebration'). Extract anything that could help link related facts together."
177
+ description="Named entities, objects, AND abstract concepts from the fact. Include: people names, organizations, places, significant objects (e.g., 'coffee maker', 'car'), AND abstract concepts/themes (e.g., 'friendship', 'career growth', 'loss', 'celebration'). Extract anything that could help link related facts together.",
174
178
  )
175
- causal_relations: Optional[List[CausalRelation]] = Field(
176
- default=None,
177
- description="Causal links to other facts. Can be null."
179
+ causal_relations: list[CausalRelation] | None = Field(
180
+ default=None, description="Causal links to other facts. Can be null."
178
181
  )
179
182
 
180
- @field_validator('entities', mode='before')
183
+ @field_validator("entities", mode="before")
181
184
  @classmethod
182
185
  def ensure_entities_list(cls, v):
183
186
  """Ensure entities is always a list (convert None to empty list)."""
@@ -185,7 +188,7 @@ class ExtractedFact(BaseModel):
185
188
  return []
186
189
  return v
187
190
 
188
- @field_validator('causal_relations', mode='before')
191
+ @field_validator("causal_relations", mode="before")
189
192
  @classmethod
190
193
  def ensure_causal_relations_list(cls, v):
191
194
  """Ensure causal_relations is always a list (convert None to empty list)."""
@@ -198,11 +201,11 @@ class ExtractedFact(BaseModel):
198
201
  parts = [self.what]
199
202
 
200
203
  # Add 'who' if not N/A
201
- if self.who and self.who.upper() != 'N/A':
204
+ if self.who and self.who.upper() != "N/A":
202
205
  parts.append(f"Involving: {self.who}")
203
206
 
204
207
  # Add 'why' if not N/A
205
- if self.why and self.why.upper() != 'N/A':
208
+ if self.why and self.why.upper() != "N/A":
206
209
  parts.append(self.why)
207
210
 
208
211
  if len(parts) == 1:
@@ -213,12 +216,11 @@ class ExtractedFact(BaseModel):
213
216
 
214
217
  class FactExtractionResponse(BaseModel):
215
218
  """Response containing all extracted facts."""
216
- facts: List[ExtractedFact] = Field(
217
- description="List of extracted factual statements"
218
- )
219
+
220
+ facts: list[ExtractedFact] = Field(description="List of extracted factual statements")
219
221
 
220
222
 
221
- def chunk_text(text: str, max_chars: int) -> List[str]:
223
+ def chunk_text(text: str, max_chars: int) -> list[str]:
222
224
  """
223
225
  Split text into chunks, preserving conversation structure when possible.
224
226
 
@@ -232,7 +234,6 @@ def chunk_text(text: str, max_chars: int) -> List[str]:
232
234
  Returns:
233
235
  List of text chunks, roughly under max_chars
234
236
  """
235
- import json
236
237
  from langchain_text_splitters import RecursiveCharacterTextSplitter
237
238
 
238
239
  # If text is small enough, return as-is
@@ -256,21 +257,21 @@ def chunk_text(text: str, max_chars: int) -> List[str]:
256
257
  is_separator_regex=False,
257
258
  separators=[
258
259
  "\n\n", # Paragraph breaks
259
- "\n", # Line breaks
260
- ". ", # Sentence endings
261
- "! ", # Exclamations
262
- "? ", # Questions
263
- "; ", # Semicolons
264
- ", ", # Commas
265
- " ", # Words
266
- "", # Characters (last resort)
260
+ "\n", # Line breaks
261
+ ". ", # Sentence endings
262
+ "! ", # Exclamations
263
+ "? ", # Questions
264
+ "; ", # Semicolons
265
+ ", ", # Commas
266
+ " ", # Words
267
+ "", # Characters (last resort)
267
268
  ],
268
269
  )
269
270
 
270
271
  return splitter.split_text(text)
271
272
 
272
273
 
273
- def _chunk_conversation(turns: List[dict], max_chars: int) -> List[str]:
274
+ def _chunk_conversation(turns: list[dict], max_chars: int) -> list[str]:
274
275
  """
275
276
  Chunk a conversation array at turn boundaries, preserving complete turns.
276
277
 
@@ -281,7 +282,6 @@ def _chunk_conversation(turns: List[dict], max_chars: int) -> List[str]:
281
282
  Returns:
282
283
  List of JSON-serialized chunks, each containing complete turns
283
284
  """
284
- import json
285
285
 
286
286
  chunks = []
287
287
  current_chunk = []
@@ -315,10 +315,10 @@ async def _extract_facts_from_chunk(
315
315
  total_chunks: int,
316
316
  event_date: datetime,
317
317
  context: str,
318
- llm_config: 'LLMConfig',
318
+ llm_config: "LLMConfig",
319
319
  agent_name: str = None,
320
- extract_opinions: bool = False
321
- ) -> List[Dict[str, str]]:
320
+ extract_opinions: bool = False,
321
+ ) -> list[dict[str, str]]:
322
322
  """
323
323
  Extract facts from a single chunk (internal helper for parallel processing).
324
324
 
@@ -333,7 +333,9 @@ async def _extract_facts_from_chunk(
333
333
  # Opinion extraction uses a separate prompt (not this one)
334
334
  fact_types_instruction = "Extract ONLY 'opinion' type facts (formed opinions, beliefs, and perspectives). DO NOT extract 'world' or 'assistant' facts."
335
335
  else:
336
- fact_types_instruction = "Extract ONLY 'world' and 'assistant' type facts. DO NOT extract opinions - those are extracted separately."
336
+ fact_types_instruction = (
337
+ "Extract ONLY 'world' and 'assistant' type facts. DO NOT extract opinions - those are extracted separately."
338
+ )
337
339
 
338
340
  prompt = f"""Extract facts from text into structured format with FOUR required dimensions - BE EXTREMELY DETAILED.
339
341
 
@@ -534,10 +536,8 @@ WHAT TO EXTRACT vs SKIP
534
536
  ✅ EXTRACT: User preferences (ALWAYS as separate facts!), feelings, plans, events, relationships, achievements
535
537
  ❌ SKIP: Greetings, filler ("thanks", "cool"), purely structural statements"""
536
538
 
537
-
538
-
539
-
540
539
  import logging
540
+
541
541
  from openai import BadRequestError
542
542
 
543
543
  logger = logging.getLogger(__name__)
@@ -548,11 +548,11 @@ WHAT TO EXTRACT vs SKIP
548
548
 
549
549
  # Sanitize input text to prevent Unicode encoding errors (e.g., unpaired surrogates)
550
550
  sanitized_chunk = _sanitize_text(chunk)
551
- sanitized_context = _sanitize_text(context) if context else 'none'
551
+ sanitized_context = _sanitize_text(context) if context else "none"
552
552
 
553
553
  # Build user message with metadata and chunk content in a clear format
554
554
  # Format event_date with day of week for better temporal reasoning
555
- event_date_formatted = event_date.strftime('%A, %B %d, %Y') # e.g., "Monday, June 10, 2024"
555
+ event_date_formatted = event_date.strftime("%A, %B %d, %Y") # e.g., "Monday, June 10, 2024"
556
556
  user_message = f"""Extract facts from the following text chunk.
557
557
  {memory_bank_context}
558
558
 
@@ -566,16 +566,7 @@ Text:
566
566
  for attempt in range(max_retries):
567
567
  try:
568
568
  extraction_response_json = await llm_config.call(
569
- messages=[
570
- {
571
- "role": "system",
572
- "content": prompt
573
- },
574
- {
575
- "role": "user",
576
- "content": user_message
577
- }
578
- ],
569
+ messages=[{"role": "system", "content": prompt}, {"role": "user", "content": user_message}],
579
570
  response_format=FactExtractionResponse,
580
571
  scope="memory_extract_facts",
581
572
  temperature=0.1,
@@ -601,7 +592,7 @@ Text:
601
592
  )
602
593
  return []
603
594
 
604
- raw_facts = extraction_response_json.get('facts', [])
595
+ raw_facts = extraction_response_json.get("facts", [])
605
596
  if not raw_facts:
606
597
  logger.debug(
607
598
  f"LLM response missing 'facts' field or returned empty list. "
@@ -622,48 +613,48 @@ Text:
622
613
  # Helper to get non-empty value
623
614
  def get_value(field_name):
624
615
  value = llm_fact.get(field_name)
625
- if value and value != '' and value != [] and value != {} and str(value).upper() != 'N/A':
616
+ if value and value != "" and value != [] and value != {} and str(value).upper() != "N/A":
626
617
  return value
627
618
  return None
628
619
 
629
620
  # NEW FORMAT: what, when, who, why (all required)
630
- what = get_value('what')
631
- when = get_value('when')
632
- who = get_value('who')
633
- why = get_value('why')
621
+ what = get_value("what")
622
+ when = get_value("when")
623
+ who = get_value("who")
624
+ why = get_value("why")
634
625
 
635
626
  # Fallback to old format if new fields not present
636
627
  if not what:
637
- what = get_value('factual_core')
628
+ what = get_value("factual_core")
638
629
  if not what:
639
630
  logger.warning(f"Skipping fact {i}: missing 'what' field")
640
631
  continue
641
632
 
642
633
  # Critical field: fact_type
643
634
  # LLM uses "assistant" but we convert to "experience" for storage
644
- fact_type = llm_fact.get('fact_type')
635
+ fact_type = llm_fact.get("fact_type")
645
636
 
646
637
  # Convert "assistant" → "experience" for storage
647
- if fact_type == 'assistant':
648
- fact_type = 'experience'
638
+ if fact_type == "assistant":
639
+ fact_type = "experience"
649
640
 
650
641
  # Validate fact_type (after conversion)
651
- if fact_type not in ['world', 'experience', 'opinion']:
642
+ if fact_type not in ["world", "experience", "opinion"]:
652
643
  # Try to fix common mistakes - check if they swapped fact_type and fact_kind
653
- fact_kind = llm_fact.get('fact_kind')
654
- if fact_kind == 'assistant':
655
- fact_type = 'experience'
656
- elif fact_kind in ['world', 'experience', 'opinion']:
644
+ fact_kind = llm_fact.get("fact_kind")
645
+ if fact_kind == "assistant":
646
+ fact_type = "experience"
647
+ elif fact_kind in ["world", "experience", "opinion"]:
657
648
  fact_type = fact_kind
658
649
  else:
659
650
  # Default to 'world' if we can't determine
660
- fact_type = 'world'
651
+ fact_type = "world"
661
652
  logger.warning(f"Fact {i}: defaulting to fact_type='world'")
662
653
 
663
654
  # Get fact_kind for temporal handling (but don't store it)
664
- fact_kind = llm_fact.get('fact_kind', 'conversation')
665
- if fact_kind not in ['conversation', 'event', 'other']:
666
- fact_kind = 'conversation'
655
+ fact_kind = llm_fact.get("fact_kind", "conversation")
656
+ if fact_kind not in ["conversation", "event", "other"]:
657
+ fact_kind = "conversation"
667
658
 
668
659
  # Build combined fact text from the 4 dimensions: what | when | who | why
669
660
  fact_data = {}
@@ -682,20 +673,20 @@ Text:
682
673
 
683
674
  # Add temporal fields
684
675
  # For events: occurred_start/occurred_end (when the event happened)
685
- if fact_kind == 'event':
686
- occurred_start = get_value('occurred_start')
687
- occurred_end = get_value('occurred_end')
676
+ if fact_kind == "event":
677
+ occurred_start = get_value("occurred_start")
678
+ occurred_end = get_value("occurred_end")
688
679
  if occurred_start:
689
- fact_data['occurred_start'] = occurred_start
680
+ fact_data["occurred_start"] = occurred_start
690
681
  # For point events: if occurred_end not set, default to occurred_start
691
682
  if occurred_end:
692
- fact_data['occurred_end'] = occurred_end
683
+ fact_data["occurred_end"] = occurred_end
693
684
  else:
694
- fact_data['occurred_end'] = occurred_start
685
+ fact_data["occurred_end"] = occurred_start
695
686
 
696
687
  # Add entities if present (validate as Entity objects)
697
688
  # LLM sometimes returns strings instead of {"text": "..."} format
698
- entities = get_value('entities')
689
+ entities = get_value("entities")
699
690
  if entities:
700
691
  # Validate and normalize each entity
701
692
  validated_entities = []
@@ -703,38 +694,34 @@ Text:
703
694
  if isinstance(ent, str):
704
695
  # Normalize string to Entity object
705
696
  validated_entities.append(Entity(text=ent))
706
- elif isinstance(ent, dict) and 'text' in ent:
697
+ elif isinstance(ent, dict) and "text" in ent:
707
698
  try:
708
699
  validated_entities.append(Entity.model_validate(ent))
709
700
  except Exception as e:
710
701
  logger.warning(f"Invalid entity {ent}: {e}")
711
702
  if validated_entities:
712
- fact_data['entities'] = validated_entities
703
+ fact_data["entities"] = validated_entities
713
704
 
714
705
  # Add causal relations if present (validate as CausalRelation objects)
715
706
  # Filter out invalid relations (missing required fields)
716
- causal_relations = get_value('causal_relations')
707
+ causal_relations = get_value("causal_relations")
717
708
  if causal_relations:
718
709
  validated_relations = []
719
710
  for rel in causal_relations:
720
- if isinstance(rel, dict) and 'target_fact_index' in rel and 'relation_type' in rel:
711
+ if isinstance(rel, dict) and "target_fact_index" in rel and "relation_type" in rel:
721
712
  try:
722
713
  validated_relations.append(CausalRelation.model_validate(rel))
723
714
  except Exception as e:
724
715
  logger.warning(f"Invalid causal relation {rel}: {e}")
725
716
  if validated_relations:
726
- fact_data['causal_relations'] = validated_relations
717
+ fact_data["causal_relations"] = validated_relations
727
718
 
728
719
  # Always set mentioned_at to the event_date (when the conversation/document occurred)
729
- fact_data['mentioned_at'] = event_date.isoformat()
720
+ fact_data["mentioned_at"] = event_date.isoformat()
730
721
 
731
722
  # Build Fact model instance
732
723
  try:
733
- fact = Fact(
734
- fact=combined_text,
735
- fact_type=fact_type,
736
- **fact_data
737
- )
724
+ fact = Fact(fact=combined_text, fact_type=fact_type, **fact_data)
738
725
  chunk_facts.append(fact)
739
726
  except Exception as e:
740
727
  logger.error(f"Failed to create Fact model for fact {i}: {e}")
@@ -753,7 +740,9 @@ Text:
753
740
  except BadRequestError as e:
754
741
  last_error = e
755
742
  if "json_validate_failed" in str(e):
756
- logger.warning(f" [1.3.{chunk_index + 1}] Attempt {attempt + 1}/{max_retries} failed with JSON validation error: {e}")
743
+ logger.warning(
744
+ f" [1.3.{chunk_index + 1}] Attempt {attempt + 1}/{max_retries} failed with JSON validation error: {e}"
745
+ )
757
746
  if attempt < max_retries - 1:
758
747
  logger.info(f" [1.3.{chunk_index + 1}] Retrying...")
759
748
  continue
@@ -772,8 +761,8 @@ async def _extract_facts_with_auto_split(
772
761
  context: str,
773
762
  llm_config: LLMConfig,
774
763
  agent_name: str = None,
775
- extract_opinions: bool = False
776
- ) -> List[Dict[str, str]]:
764
+ extract_opinions: bool = False,
765
+ ) -> list[dict[str, str]]:
777
766
  """
778
767
  Extract facts from a chunk with automatic splitting if output exceeds token limits.
779
768
 
@@ -794,6 +783,7 @@ async def _extract_facts_with_auto_split(
794
783
  List of fact dictionaries extracted from the chunk (possibly from sub-chunks)
795
784
  """
796
785
  import logging
786
+
797
787
  logger = logging.getLogger(__name__)
798
788
 
799
789
  try:
@@ -806,9 +796,9 @@ async def _extract_facts_with_auto_split(
806
796
  context=context,
807
797
  llm_config=llm_config,
808
798
  agent_name=agent_name,
809
- extract_opinions=extract_opinions
799
+ extract_opinions=extract_opinions,
810
800
  )
811
- except OutputTooLongError as e:
801
+ except OutputTooLongError:
812
802
  # Output exceeded token limits - split the chunk in half and retry
813
803
  logger.warning(
814
804
  f"Output too long for chunk {chunk_index + 1}/{total_chunks} "
@@ -824,7 +814,7 @@ async def _extract_facts_with_auto_split(
824
814
  search_start = max(0, mid_point - search_range)
825
815
  search_end = min(len(chunk), mid_point + search_range)
826
816
 
827
- sentence_endings = ['. ', '! ', '? ', '\n\n']
817
+ sentence_endings = [". ", "! ", "? ", "\n\n"]
828
818
  best_split = mid_point
829
819
 
830
820
  for ending in sentence_endings:
@@ -838,8 +828,7 @@ async def _extract_facts_with_auto_split(
838
828
  second_half = chunk[best_split:].strip()
839
829
 
840
830
  logger.info(
841
- f"Split chunk {chunk_index + 1} into two sub-chunks: "
842
- f"{len(first_half)} chars and {len(second_half)} chars"
831
+ f"Split chunk {chunk_index + 1} into two sub-chunks: {len(first_half)} chars and {len(second_half)} chars"
843
832
  )
844
833
 
845
834
  # Process both halves recursively (in parallel)
@@ -852,7 +841,7 @@ async def _extract_facts_with_auto_split(
852
841
  context=context,
853
842
  llm_config=llm_config,
854
843
  agent_name=agent_name,
855
- extract_opinions=extract_opinions
844
+ extract_opinions=extract_opinions,
856
845
  ),
857
846
  _extract_facts_with_auto_split(
858
847
  chunk=second_half,
@@ -862,8 +851,8 @@ async def _extract_facts_with_auto_split(
862
851
  context=context,
863
852
  llm_config=llm_config,
864
853
  agent_name=agent_name,
865
- extract_opinions=extract_opinions
866
- )
854
+ extract_opinions=extract_opinions,
855
+ ),
867
856
  ]
868
857
 
869
858
  sub_results = await asyncio.gather(*sub_tasks)
@@ -873,9 +862,7 @@ async def _extract_facts_with_auto_split(
873
862
  for sub_result in sub_results:
874
863
  all_facts.extend(sub_result)
875
864
 
876
- logger.info(
877
- f"Successfully extracted {len(all_facts)} facts from split chunk {chunk_index + 1}"
878
- )
865
+ logger.info(f"Successfully extracted {len(all_facts)} facts from split chunk {chunk_index + 1}")
879
866
 
880
867
  return all_facts
881
868
 
@@ -887,7 +874,7 @@ async def extract_facts_from_text(
887
874
  agent_name: str,
888
875
  context: str = "",
889
876
  extract_opinions: bool = False,
890
- ) -> tuple[List[Fact], List[tuple[str, int]]]:
877
+ ) -> tuple[list[Fact], list[tuple[str, int]]]:
891
878
  """
892
879
  Extract semantic facts from conversational or narrative text using LLM.
893
880
 
@@ -920,7 +907,7 @@ async def extract_facts_from_text(
920
907
  context=context,
921
908
  llm_config=llm_config,
922
909
  agent_name=agent_name,
923
- extract_opinions=extract_opinions
910
+ extract_opinions=extract_opinions,
924
911
  )
925
912
  for i, chunk in enumerate(chunks)
926
913
  ]
@@ -938,8 +925,10 @@ async def extract_facts_from_text(
938
925
  # ============================================================================
939
926
 
940
927
  # Import types for the orchestration layer (note: ExtractedFact here is different from the Pydantic model above)
941
- from .types import RetainContent, ExtractedFact as ExtractedFactType, ChunkMetadata, CausalRelation as CausalRelationType
942
- from typing import Tuple
928
+
929
+ from .types import CausalRelation as CausalRelationType
930
+ from .types import ChunkMetadata, RetainContent
931
+ from .types import ExtractedFact as ExtractedFactType
943
932
 
944
933
  logger = logging.getLogger(__name__)
945
934
 
@@ -948,11 +937,8 @@ SECONDS_PER_FACT = 10
948
937
 
949
938
 
950
939
  async def extract_facts_from_contents(
951
- contents: List[RetainContent],
952
- llm_config,
953
- agent_name: str,
954
- extract_opinions: bool = False
955
- ) -> Tuple[List[ExtractedFactType], List[ChunkMetadata]]:
940
+ contents: list[RetainContent], llm_config, agent_name: str, extract_opinions: bool = False
941
+ ) -> tuple[list[ExtractedFactType], list[ChunkMetadata]]:
956
942
  """
957
943
  Extract facts from multiple content items in parallel.
958
944
 
@@ -985,7 +971,7 @@ async def extract_facts_from_contents(
985
971
  context=item.context,
986
972
  llm_config=llm_config,
987
973
  agent_name=agent_name,
988
- extract_opinions=extract_opinions
974
+ extract_opinions=extract_opinions,
989
975
  )
990
976
  fact_extraction_tasks.append(task)
991
977
 
@@ -993,8 +979,8 @@ async def extract_facts_from_contents(
993
979
  all_fact_results = await asyncio.gather(*fact_extraction_tasks)
994
980
 
995
981
  # Step 3: Flatten and convert to typed objects
996
- extracted_facts: List[ExtractedFactType] = []
997
- chunks_metadata: List[ChunkMetadata] = []
982
+ extracted_facts: list[ExtractedFactType] = []
983
+ chunks_metadata: list[ChunkMetadata] = []
998
984
 
999
985
  global_chunk_idx = 0
1000
986
  global_fact_idx = 0
@@ -1008,7 +994,7 @@ async def extract_facts_from_contents(
1008
994
  chunk_text=chunk_text,
1009
995
  fact_count=chunk_fact_count,
1010
996
  content_index=content_index,
1011
- chunk_index=global_chunk_idx
997
+ chunk_index=global_chunk_idx,
1012
998
  )
1013
999
  chunks_metadata.append(chunk_metadata)
1014
1000
  global_chunk_idx += 1
@@ -1029,18 +1015,21 @@ async def extract_facts_from_contents(
1029
1015
  fact_type=fact_from_llm.fact_type,
1030
1016
  entities=[e.text for e in (fact_from_llm.entities or [])],
1031
1017
  # occurred_start/end: from LLM only, leave None if not provided
1032
- occurred_start=_parse_datetime(fact_from_llm.occurred_start) if fact_from_llm.occurred_start else None,
1033
- occurred_end=_parse_datetime(fact_from_llm.occurred_end) if fact_from_llm.occurred_end else None,
1018
+ occurred_start=_parse_datetime(fact_from_llm.occurred_start)
1019
+ if fact_from_llm.occurred_start
1020
+ else None,
1021
+ occurred_end=_parse_datetime(fact_from_llm.occurred_end)
1022
+ if fact_from_llm.occurred_end
1023
+ else None,
1034
1024
  causal_relations=_convert_causal_relations(
1035
- fact_from_llm.causal_relations or [],
1036
- global_fact_idx
1025
+ fact_from_llm.causal_relations or [], global_fact_idx
1037
1026
  ),
1038
1027
  content_index=content_index,
1039
1028
  chunk_index=chunk_global_idx,
1040
1029
  context=content.context,
1041
1030
  # mentioned_at: always the event_date (when the conversation/document occurred)
1042
1031
  mentioned_at=content.event_date,
1043
- metadata=content.metadata
1032
+ metadata=content.metadata,
1044
1033
  )
1045
1034
 
1046
1035
  extracted_facts.append(extracted_fact)
@@ -1056,13 +1045,14 @@ async def extract_facts_from_contents(
1056
1045
  def _parse_datetime(date_str: str):
1057
1046
  """Parse ISO datetime string."""
1058
1047
  from dateutil import parser as date_parser
1048
+
1059
1049
  try:
1060
1050
  return date_parser.isoparse(date_str)
1061
1051
  except Exception:
1062
1052
  return None
1063
1053
 
1064
1054
 
1065
- def _convert_causal_relations(relations_from_llm, fact_start_idx: int) -> List[CausalRelationType]:
1055
+ def _convert_causal_relations(relations_from_llm, fact_start_idx: int) -> list[CausalRelationType]:
1066
1056
  """
1067
1057
  Convert causal relations from LLM format to ExtractedFact format.
1068
1058
 
@@ -1073,13 +1063,13 @@ def _convert_causal_relations(relations_from_llm, fact_start_idx: int) -> List[C
1073
1063
  causal_relation = CausalRelationType(
1074
1064
  relation_type=rel.relation_type,
1075
1065
  target_fact_index=fact_start_idx + rel.target_fact_index,
1076
- strength=rel.strength
1066
+ strength=rel.strength,
1077
1067
  )
1078
1068
  causal_relations.append(causal_relation)
1079
1069
  return causal_relations
1080
1070
 
1081
1071
 
1082
- def _add_temporal_offsets(facts: List[ExtractedFactType], contents: List[RetainContent]) -> None:
1072
+ def _add_temporal_offsets(facts: list[ExtractedFactType], contents: list[RetainContent]) -> None:
1083
1073
  """
1084
1074
  Add time offsets to preserve fact ordering within each content.
1085
1075