hindsight-api 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +10 -9
- hindsight_api/alembic/env.py +5 -8
- hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +266 -180
- hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +32 -32
- hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +11 -11
- hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +7 -12
- hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +23 -15
- hindsight_api/alembic/versions/rename_personality_to_disposition.py +30 -21
- hindsight_api/api/__init__.py +10 -10
- hindsight_api/api/http.py +575 -593
- hindsight_api/api/mcp.py +30 -28
- hindsight_api/banner.py +13 -6
- hindsight_api/config.py +9 -13
- hindsight_api/engine/__init__.py +9 -9
- hindsight_api/engine/cross_encoder.py +22 -21
- hindsight_api/engine/db_utils.py +5 -4
- hindsight_api/engine/embeddings.py +22 -21
- hindsight_api/engine/entity_resolver.py +81 -75
- hindsight_api/engine/llm_wrapper.py +61 -79
- hindsight_api/engine/memory_engine.py +603 -625
- hindsight_api/engine/query_analyzer.py +100 -97
- hindsight_api/engine/response_models.py +105 -106
- hindsight_api/engine/retain/__init__.py +9 -16
- hindsight_api/engine/retain/bank_utils.py +34 -58
- hindsight_api/engine/retain/chunk_storage.py +4 -12
- hindsight_api/engine/retain/deduplication.py +9 -28
- hindsight_api/engine/retain/embedding_processing.py +4 -11
- hindsight_api/engine/retain/embedding_utils.py +3 -4
- hindsight_api/engine/retain/entity_processing.py +7 -17
- hindsight_api/engine/retain/fact_extraction.py +155 -165
- hindsight_api/engine/retain/fact_storage.py +11 -23
- hindsight_api/engine/retain/link_creation.py +11 -39
- hindsight_api/engine/retain/link_utils.py +166 -95
- hindsight_api/engine/retain/observation_regeneration.py +39 -52
- hindsight_api/engine/retain/orchestrator.py +72 -62
- hindsight_api/engine/retain/types.py +49 -43
- hindsight_api/engine/search/__init__.py +5 -5
- hindsight_api/engine/search/fusion.py +6 -15
- hindsight_api/engine/search/graph_retrieval.py +22 -23
- hindsight_api/engine/search/mpfp_retrieval.py +76 -92
- hindsight_api/engine/search/observation_utils.py +9 -16
- hindsight_api/engine/search/reranking.py +4 -7
- hindsight_api/engine/search/retrieval.py +87 -66
- hindsight_api/engine/search/scoring.py +5 -7
- hindsight_api/engine/search/temporal_extraction.py +8 -11
- hindsight_api/engine/search/think_utils.py +115 -39
- hindsight_api/engine/search/trace.py +68 -39
- hindsight_api/engine/search/tracer.py +44 -35
- hindsight_api/engine/search/types.py +20 -17
- hindsight_api/engine/task_backend.py +21 -26
- hindsight_api/engine/utils.py +25 -10
- hindsight_api/main.py +21 -40
- hindsight_api/mcp_local.py +190 -0
- hindsight_api/metrics.py +44 -30
- hindsight_api/migrations.py +10 -8
- hindsight_api/models.py +60 -72
- hindsight_api/pg0.py +22 -23
- hindsight_api/server.py +3 -6
- {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.6.dist-info}/METADATA +2 -2
- hindsight_api-0.1.6.dist-info/RECORD +64 -0
- {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.6.dist-info}/entry_points.txt +1 -0
- hindsight_api-0.1.5.dist-info/RECORD +0 -63
- {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.6.dist-info}/WHEEL +0 -0
|
@@ -4,16 +4,17 @@ Fact extraction from text using LLM.
|
|
|
4
4
|
Extracts semantic facts, entities, and temporal information from text.
|
|
5
5
|
Uses the LLMConfig wrapper for all LLM calls.
|
|
6
6
|
"""
|
|
7
|
-
|
|
8
|
-
import
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
9
|
import json
|
|
10
|
+
import logging
|
|
10
11
|
import re
|
|
11
|
-
import asyncio
|
|
12
12
|
from datetime import datetime, timedelta
|
|
13
|
-
from typing import
|
|
14
|
-
|
|
15
|
-
from pydantic import BaseModel, Field, field_validator
|
|
16
|
-
|
|
13
|
+
from typing import Literal
|
|
14
|
+
|
|
15
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
16
|
+
|
|
17
|
+
from ..llm_wrapper import LLMConfig, OutputTooLongError
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
def _sanitize_text(text: str) -> str:
|
|
@@ -31,11 +32,12 @@ def _sanitize_text(text: str) -> str:
|
|
|
31
32
|
return text
|
|
32
33
|
# Remove surrogate characters (U+D800 to U+DFFF) using regex
|
|
33
34
|
# These are invalid in UTF-8 and cause encoding errors
|
|
34
|
-
return re.sub(r
|
|
35
|
+
return re.sub(r"[\ud800-\udfff]", "", text)
|
|
35
36
|
|
|
36
37
|
|
|
37
38
|
class Entity(BaseModel):
|
|
38
39
|
"""An entity extracted from text."""
|
|
40
|
+
|
|
39
41
|
text: str = Field(
|
|
40
42
|
description="The specific, named entity as it appears in the fact. Must be a proper noun or specific identifier."
|
|
41
43
|
)
|
|
@@ -48,42 +50,46 @@ class Fact(BaseModel):
|
|
|
48
50
|
This is what fact_extraction returns and what the rest of the pipeline expects.
|
|
49
51
|
Combined fact text format: "what | when | where | who | why"
|
|
50
52
|
"""
|
|
53
|
+
|
|
51
54
|
# Required fields
|
|
52
55
|
fact: str = Field(description="Combined fact text: what | when | where | who | why")
|
|
53
56
|
fact_type: Literal["world", "experience", "opinion"] = Field(description="Perspective: world/experience/opinion")
|
|
54
57
|
|
|
55
58
|
# Optional temporal fields
|
|
56
|
-
occurred_start:
|
|
57
|
-
occurred_end:
|
|
58
|
-
mentioned_at:
|
|
59
|
+
occurred_start: str | None = None
|
|
60
|
+
occurred_end: str | None = None
|
|
61
|
+
mentioned_at: str | None = None
|
|
59
62
|
|
|
60
63
|
# Optional location field
|
|
61
|
-
where:
|
|
64
|
+
where: str | None = Field(
|
|
65
|
+
None, description="WHERE the fact occurred or is about (specific location, place, or area)"
|
|
66
|
+
)
|
|
62
67
|
|
|
63
68
|
# Optional structured data
|
|
64
|
-
entities:
|
|
65
|
-
causal_relations:
|
|
69
|
+
entities: list[Entity] | None = None
|
|
70
|
+
causal_relations: list["CausalRelation"] | None = None
|
|
66
71
|
|
|
67
72
|
|
|
68
73
|
class CausalRelation(BaseModel):
|
|
69
74
|
"""Causal relationship between facts."""
|
|
75
|
+
|
|
70
76
|
target_fact_index: int = Field(
|
|
71
77
|
description="Index of the related fact in the facts array (0-based). "
|
|
72
|
-
|
|
78
|
+
"This creates a directed causal link to another fact in the extraction."
|
|
73
79
|
)
|
|
74
80
|
relation_type: Literal["causes", "caused_by", "enables", "prevents"] = Field(
|
|
75
81
|
description="Type of causal relationship: "
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
82
|
+
"'causes' = this fact directly causes the target fact, "
|
|
83
|
+
"'caused_by' = this fact was caused by the target fact, "
|
|
84
|
+
"'enables' = this fact enables/allows the target fact, "
|
|
85
|
+
"'prevents' = this fact prevents/blocks the target fact"
|
|
80
86
|
)
|
|
81
87
|
strength: float = Field(
|
|
82
88
|
description="Strength of causal relationship (0.0 to 1.0). "
|
|
83
|
-
|
|
89
|
+
"1.0 = direct/strong causation, 0.5 = moderate, 0.3 = weak/indirect",
|
|
84
90
|
ge=0.0,
|
|
85
91
|
le=1.0,
|
|
86
|
-
default=1.0
|
|
92
|
+
default=1.0,
|
|
87
93
|
)
|
|
88
94
|
|
|
89
95
|
|
|
@@ -92,9 +98,7 @@ class ExtractedFact(BaseModel):
|
|
|
92
98
|
|
|
93
99
|
model_config = ConfigDict(
|
|
94
100
|
json_schema_mode="validation",
|
|
95
|
-
json_schema_extra={
|
|
96
|
-
"required": ["what", "when", "where", "who", "why", "fact_type"]
|
|
97
|
-
}
|
|
101
|
+
json_schema_extra={"required": ["what", "when", "where", "who", "why", "fact_type"]},
|
|
98
102
|
)
|
|
99
103
|
|
|
100
104
|
# ==========================================================================
|
|
@@ -103,43 +107,43 @@ class ExtractedFact(BaseModel):
|
|
|
103
107
|
|
|
104
108
|
what: str = Field(
|
|
105
109
|
description="WHAT happened - COMPLETE, DETAILED description with ALL specifics. "
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
+
"NEVER summarize or omit details. Include: exact actions, objects, quantities, specifics. "
|
|
111
|
+
"BE VERBOSE - capture every detail that was mentioned. "
|
|
112
|
+
"Example: 'Emily got married to Sarah at a rooftop garden ceremony with 50 guests attending and a live jazz band playing' "
|
|
113
|
+
"NOT: 'A wedding happened' or 'Emily got married'"
|
|
110
114
|
)
|
|
111
115
|
|
|
112
116
|
when: str = Field(
|
|
113
117
|
description="WHEN it happened - ALWAYS include temporal information if mentioned. "
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
118
|
+
"Include: specific dates, times, durations, relative time references. "
|
|
119
|
+
"Examples: 'on June 15th, 2024 at 3pm', 'last weekend', 'for the past 3 years', 'every morning at 6am'. "
|
|
120
|
+
"Write 'N/A' ONLY if absolutely no temporal context exists. Prefer converting to absolute dates when possible."
|
|
117
121
|
)
|
|
118
122
|
|
|
119
123
|
where: str = Field(
|
|
120
124
|
description="WHERE it happened or is about - SPECIFIC locations, places, areas, regions if applicable. "
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
125
|
+
"Include: cities, neighborhoods, venues, buildings, countries, specific addresses when mentioned. "
|
|
126
|
+
"Examples: 'downtown San Francisco at a rooftop garden venue', 'at the user's home in Brooklyn', 'online via Zoom', 'Paris, France'. "
|
|
127
|
+
"Write 'N/A' ONLY if absolutely no location context exists or if the fact is completely location-agnostic."
|
|
124
128
|
)
|
|
125
129
|
|
|
126
130
|
who: str = Field(
|
|
127
131
|
description="WHO is involved - ALL people/entities with FULL context and relationships. "
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
132
|
+
"Include: names, roles, relationships to user, background details. "
|
|
133
|
+
"Resolve coreferences (if 'my roommate' is later named 'Emily', write 'Emily, the user's college roommate'). "
|
|
134
|
+
"BE DETAILED about relationships and roles. "
|
|
135
|
+
"Example: 'Emily (user's college roommate from Stanford, now works at Google), Sarah (Emily's partner of 5 years, software engineer)' "
|
|
136
|
+
"NOT: 'my friend' or 'Emily and Sarah'"
|
|
133
137
|
)
|
|
134
138
|
|
|
135
139
|
why: str = Field(
|
|
136
140
|
description="WHY it matters - ALL emotional, contextual, and motivational details. "
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
141
|
+
"Include EVERYTHING: feelings, preferences, motivations, observations, context, background, significance. "
|
|
142
|
+
"BE VERBOSE - capture all the nuance and meaning. "
|
|
143
|
+
"FOR ASSISTANT FACTS: MUST include what the user asked/requested that led to this interaction! "
|
|
144
|
+
"Example (world): 'The user felt thrilled and inspired, has always dreamed of an outdoor ceremony, mentioned wanting a similar garden venue, was particularly moved by the intimate atmosphere and personal vows' "
|
|
145
|
+
"Example (assistant): 'User asked how to fix slow API performance with 1000+ concurrent users, expected 70-80% reduction in database load' "
|
|
146
|
+
"NOT: 'User liked it' or 'To help user'"
|
|
143
147
|
)
|
|
144
148
|
|
|
145
149
|
# ==========================================================================
|
|
@@ -148,17 +152,17 @@ class ExtractedFact(BaseModel):
|
|
|
148
152
|
|
|
149
153
|
fact_kind: str = Field(
|
|
150
154
|
default="conversation",
|
|
151
|
-
description="'event' = specific datable occurrence (set occurred dates), 'conversation' = general info (no occurred dates)"
|
|
155
|
+
description="'event' = specific datable occurrence (set occurred dates), 'conversation' = general info (no occurred dates)",
|
|
152
156
|
)
|
|
153
157
|
|
|
154
158
|
# Temporal fields - optional
|
|
155
|
-
occurred_start:
|
|
159
|
+
occurred_start: str | None = Field(
|
|
156
160
|
default=None,
|
|
157
|
-
description="WHEN the event happened (ISO timestamp). Only for fact_kind='event'. Leave null for conversations."
|
|
161
|
+
description="WHEN the event happened (ISO timestamp). Only for fact_kind='event'. Leave null for conversations.",
|
|
158
162
|
)
|
|
159
|
-
occurred_end:
|
|
163
|
+
occurred_end: str | None = Field(
|
|
160
164
|
default=None,
|
|
161
|
-
description="WHEN the event ended (ISO timestamp). Only for events with duration. Leave null for conversations."
|
|
165
|
+
description="WHEN the event ended (ISO timestamp). Only for events with duration. Leave null for conversations.",
|
|
162
166
|
)
|
|
163
167
|
|
|
164
168
|
# Classification (CRITICAL - required)
|
|
@@ -168,16 +172,15 @@ class ExtractedFact(BaseModel):
|
|
|
168
172
|
)
|
|
169
173
|
|
|
170
174
|
# Entities - extracted from fact content
|
|
171
|
-
entities:
|
|
175
|
+
entities: list[Entity] | None = Field(
|
|
172
176
|
default=None,
|
|
173
|
-
description="Named entities, objects, AND abstract concepts from the fact. Include: people names, organizations, places, significant objects (e.g., 'coffee maker', 'car'), AND abstract concepts/themes (e.g., 'friendship', 'career growth', 'loss', 'celebration'). Extract anything that could help link related facts together."
|
|
177
|
+
description="Named entities, objects, AND abstract concepts from the fact. Include: people names, organizations, places, significant objects (e.g., 'coffee maker', 'car'), AND abstract concepts/themes (e.g., 'friendship', 'career growth', 'loss', 'celebration'). Extract anything that could help link related facts together.",
|
|
174
178
|
)
|
|
175
|
-
causal_relations:
|
|
176
|
-
default=None,
|
|
177
|
-
description="Causal links to other facts. Can be null."
|
|
179
|
+
causal_relations: list[CausalRelation] | None = Field(
|
|
180
|
+
default=None, description="Causal links to other facts. Can be null."
|
|
178
181
|
)
|
|
179
182
|
|
|
180
|
-
@field_validator(
|
|
183
|
+
@field_validator("entities", mode="before")
|
|
181
184
|
@classmethod
|
|
182
185
|
def ensure_entities_list(cls, v):
|
|
183
186
|
"""Ensure entities is always a list (convert None to empty list)."""
|
|
@@ -185,7 +188,7 @@ class ExtractedFact(BaseModel):
|
|
|
185
188
|
return []
|
|
186
189
|
return v
|
|
187
190
|
|
|
188
|
-
@field_validator(
|
|
191
|
+
@field_validator("causal_relations", mode="before")
|
|
189
192
|
@classmethod
|
|
190
193
|
def ensure_causal_relations_list(cls, v):
|
|
191
194
|
"""Ensure causal_relations is always a list (convert None to empty list)."""
|
|
@@ -198,11 +201,11 @@ class ExtractedFact(BaseModel):
|
|
|
198
201
|
parts = [self.what]
|
|
199
202
|
|
|
200
203
|
# Add 'who' if not N/A
|
|
201
|
-
if self.who and self.who.upper() !=
|
|
204
|
+
if self.who and self.who.upper() != "N/A":
|
|
202
205
|
parts.append(f"Involving: {self.who}")
|
|
203
206
|
|
|
204
207
|
# Add 'why' if not N/A
|
|
205
|
-
if self.why and self.why.upper() !=
|
|
208
|
+
if self.why and self.why.upper() != "N/A":
|
|
206
209
|
parts.append(self.why)
|
|
207
210
|
|
|
208
211
|
if len(parts) == 1:
|
|
@@ -213,12 +216,11 @@ class ExtractedFact(BaseModel):
|
|
|
213
216
|
|
|
214
217
|
class FactExtractionResponse(BaseModel):
|
|
215
218
|
"""Response containing all extracted facts."""
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
)
|
|
219
|
+
|
|
220
|
+
facts: list[ExtractedFact] = Field(description="List of extracted factual statements")
|
|
219
221
|
|
|
220
222
|
|
|
221
|
-
def chunk_text(text: str, max_chars: int) ->
|
|
223
|
+
def chunk_text(text: str, max_chars: int) -> list[str]:
|
|
222
224
|
"""
|
|
223
225
|
Split text into chunks, preserving conversation structure when possible.
|
|
224
226
|
|
|
@@ -232,7 +234,6 @@ def chunk_text(text: str, max_chars: int) -> List[str]:
|
|
|
232
234
|
Returns:
|
|
233
235
|
List of text chunks, roughly under max_chars
|
|
234
236
|
"""
|
|
235
|
-
import json
|
|
236
237
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
237
238
|
|
|
238
239
|
# If text is small enough, return as-is
|
|
@@ -256,21 +257,21 @@ def chunk_text(text: str, max_chars: int) -> List[str]:
|
|
|
256
257
|
is_separator_regex=False,
|
|
257
258
|
separators=[
|
|
258
259
|
"\n\n", # Paragraph breaks
|
|
259
|
-
"\n",
|
|
260
|
-
". ",
|
|
261
|
-
"! ",
|
|
262
|
-
"? ",
|
|
263
|
-
"; ",
|
|
264
|
-
", ",
|
|
265
|
-
" ",
|
|
266
|
-
"",
|
|
260
|
+
"\n", # Line breaks
|
|
261
|
+
". ", # Sentence endings
|
|
262
|
+
"! ", # Exclamations
|
|
263
|
+
"? ", # Questions
|
|
264
|
+
"; ", # Semicolons
|
|
265
|
+
", ", # Commas
|
|
266
|
+
" ", # Words
|
|
267
|
+
"", # Characters (last resort)
|
|
267
268
|
],
|
|
268
269
|
)
|
|
269
270
|
|
|
270
271
|
return splitter.split_text(text)
|
|
271
272
|
|
|
272
273
|
|
|
273
|
-
def _chunk_conversation(turns:
|
|
274
|
+
def _chunk_conversation(turns: list[dict], max_chars: int) -> list[str]:
|
|
274
275
|
"""
|
|
275
276
|
Chunk a conversation array at turn boundaries, preserving complete turns.
|
|
276
277
|
|
|
@@ -281,7 +282,6 @@ def _chunk_conversation(turns: List[dict], max_chars: int) -> List[str]:
|
|
|
281
282
|
Returns:
|
|
282
283
|
List of JSON-serialized chunks, each containing complete turns
|
|
283
284
|
"""
|
|
284
|
-
import json
|
|
285
285
|
|
|
286
286
|
chunks = []
|
|
287
287
|
current_chunk = []
|
|
@@ -315,10 +315,10 @@ async def _extract_facts_from_chunk(
|
|
|
315
315
|
total_chunks: int,
|
|
316
316
|
event_date: datetime,
|
|
317
317
|
context: str,
|
|
318
|
-
llm_config:
|
|
318
|
+
llm_config: "LLMConfig",
|
|
319
319
|
agent_name: str = None,
|
|
320
|
-
extract_opinions: bool = False
|
|
321
|
-
) ->
|
|
320
|
+
extract_opinions: bool = False,
|
|
321
|
+
) -> list[dict[str, str]]:
|
|
322
322
|
"""
|
|
323
323
|
Extract facts from a single chunk (internal helper for parallel processing).
|
|
324
324
|
|
|
@@ -333,7 +333,9 @@ async def _extract_facts_from_chunk(
|
|
|
333
333
|
# Opinion extraction uses a separate prompt (not this one)
|
|
334
334
|
fact_types_instruction = "Extract ONLY 'opinion' type facts (formed opinions, beliefs, and perspectives). DO NOT extract 'world' or 'assistant' facts."
|
|
335
335
|
else:
|
|
336
|
-
fact_types_instruction =
|
|
336
|
+
fact_types_instruction = (
|
|
337
|
+
"Extract ONLY 'world' and 'assistant' type facts. DO NOT extract opinions - those are extracted separately."
|
|
338
|
+
)
|
|
337
339
|
|
|
338
340
|
prompt = f"""Extract facts from text into structured format with FOUR required dimensions - BE EXTREMELY DETAILED.
|
|
339
341
|
|
|
@@ -534,10 +536,8 @@ WHAT TO EXTRACT vs SKIP
|
|
|
534
536
|
✅ EXTRACT: User preferences (ALWAYS as separate facts!), feelings, plans, events, relationships, achievements
|
|
535
537
|
❌ SKIP: Greetings, filler ("thanks", "cool"), purely structural statements"""
|
|
536
538
|
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
539
|
import logging
|
|
540
|
+
|
|
541
541
|
from openai import BadRequestError
|
|
542
542
|
|
|
543
543
|
logger = logging.getLogger(__name__)
|
|
@@ -548,11 +548,11 @@ WHAT TO EXTRACT vs SKIP
|
|
|
548
548
|
|
|
549
549
|
# Sanitize input text to prevent Unicode encoding errors (e.g., unpaired surrogates)
|
|
550
550
|
sanitized_chunk = _sanitize_text(chunk)
|
|
551
|
-
sanitized_context = _sanitize_text(context) if context else
|
|
551
|
+
sanitized_context = _sanitize_text(context) if context else "none"
|
|
552
552
|
|
|
553
553
|
# Build user message with metadata and chunk content in a clear format
|
|
554
554
|
# Format event_date with day of week for better temporal reasoning
|
|
555
|
-
event_date_formatted = event_date.strftime(
|
|
555
|
+
event_date_formatted = event_date.strftime("%A, %B %d, %Y") # e.g., "Monday, June 10, 2024"
|
|
556
556
|
user_message = f"""Extract facts from the following text chunk.
|
|
557
557
|
{memory_bank_context}
|
|
558
558
|
|
|
@@ -566,16 +566,7 @@ Text:
|
|
|
566
566
|
for attempt in range(max_retries):
|
|
567
567
|
try:
|
|
568
568
|
extraction_response_json = await llm_config.call(
|
|
569
|
-
messages=[
|
|
570
|
-
{
|
|
571
|
-
"role": "system",
|
|
572
|
-
"content": prompt
|
|
573
|
-
},
|
|
574
|
-
{
|
|
575
|
-
"role": "user",
|
|
576
|
-
"content": user_message
|
|
577
|
-
}
|
|
578
|
-
],
|
|
569
|
+
messages=[{"role": "system", "content": prompt}, {"role": "user", "content": user_message}],
|
|
579
570
|
response_format=FactExtractionResponse,
|
|
580
571
|
scope="memory_extract_facts",
|
|
581
572
|
temperature=0.1,
|
|
@@ -601,7 +592,7 @@ Text:
|
|
|
601
592
|
)
|
|
602
593
|
return []
|
|
603
594
|
|
|
604
|
-
raw_facts = extraction_response_json.get(
|
|
595
|
+
raw_facts = extraction_response_json.get("facts", [])
|
|
605
596
|
if not raw_facts:
|
|
606
597
|
logger.debug(
|
|
607
598
|
f"LLM response missing 'facts' field or returned empty list. "
|
|
@@ -622,48 +613,48 @@ Text:
|
|
|
622
613
|
# Helper to get non-empty value
|
|
623
614
|
def get_value(field_name):
|
|
624
615
|
value = llm_fact.get(field_name)
|
|
625
|
-
if value and value !=
|
|
616
|
+
if value and value != "" and value != [] and value != {} and str(value).upper() != "N/A":
|
|
626
617
|
return value
|
|
627
618
|
return None
|
|
628
619
|
|
|
629
620
|
# NEW FORMAT: what, when, who, why (all required)
|
|
630
|
-
what = get_value(
|
|
631
|
-
when = get_value(
|
|
632
|
-
who = get_value(
|
|
633
|
-
why = get_value(
|
|
621
|
+
what = get_value("what")
|
|
622
|
+
when = get_value("when")
|
|
623
|
+
who = get_value("who")
|
|
624
|
+
why = get_value("why")
|
|
634
625
|
|
|
635
626
|
# Fallback to old format if new fields not present
|
|
636
627
|
if not what:
|
|
637
|
-
what = get_value(
|
|
628
|
+
what = get_value("factual_core")
|
|
638
629
|
if not what:
|
|
639
630
|
logger.warning(f"Skipping fact {i}: missing 'what' field")
|
|
640
631
|
continue
|
|
641
632
|
|
|
642
633
|
# Critical field: fact_type
|
|
643
634
|
# LLM uses "assistant" but we convert to "experience" for storage
|
|
644
|
-
fact_type = llm_fact.get(
|
|
635
|
+
fact_type = llm_fact.get("fact_type")
|
|
645
636
|
|
|
646
637
|
# Convert "assistant" → "experience" for storage
|
|
647
|
-
if fact_type ==
|
|
648
|
-
fact_type =
|
|
638
|
+
if fact_type == "assistant":
|
|
639
|
+
fact_type = "experience"
|
|
649
640
|
|
|
650
641
|
# Validate fact_type (after conversion)
|
|
651
|
-
if fact_type not in [
|
|
642
|
+
if fact_type not in ["world", "experience", "opinion"]:
|
|
652
643
|
# Try to fix common mistakes - check if they swapped fact_type and fact_kind
|
|
653
|
-
fact_kind = llm_fact.get(
|
|
654
|
-
if fact_kind ==
|
|
655
|
-
fact_type =
|
|
656
|
-
elif fact_kind in [
|
|
644
|
+
fact_kind = llm_fact.get("fact_kind")
|
|
645
|
+
if fact_kind == "assistant":
|
|
646
|
+
fact_type = "experience"
|
|
647
|
+
elif fact_kind in ["world", "experience", "opinion"]:
|
|
657
648
|
fact_type = fact_kind
|
|
658
649
|
else:
|
|
659
650
|
# Default to 'world' if we can't determine
|
|
660
|
-
fact_type =
|
|
651
|
+
fact_type = "world"
|
|
661
652
|
logger.warning(f"Fact {i}: defaulting to fact_type='world'")
|
|
662
653
|
|
|
663
654
|
# Get fact_kind for temporal handling (but don't store it)
|
|
664
|
-
fact_kind = llm_fact.get(
|
|
665
|
-
if fact_kind not in [
|
|
666
|
-
fact_kind =
|
|
655
|
+
fact_kind = llm_fact.get("fact_kind", "conversation")
|
|
656
|
+
if fact_kind not in ["conversation", "event", "other"]:
|
|
657
|
+
fact_kind = "conversation"
|
|
667
658
|
|
|
668
659
|
# Build combined fact text from the 4 dimensions: what | when | who | why
|
|
669
660
|
fact_data = {}
|
|
@@ -682,20 +673,20 @@ Text:
|
|
|
682
673
|
|
|
683
674
|
# Add temporal fields
|
|
684
675
|
# For events: occurred_start/occurred_end (when the event happened)
|
|
685
|
-
if fact_kind ==
|
|
686
|
-
occurred_start = get_value(
|
|
687
|
-
occurred_end = get_value(
|
|
676
|
+
if fact_kind == "event":
|
|
677
|
+
occurred_start = get_value("occurred_start")
|
|
678
|
+
occurred_end = get_value("occurred_end")
|
|
688
679
|
if occurred_start:
|
|
689
|
-
fact_data[
|
|
680
|
+
fact_data["occurred_start"] = occurred_start
|
|
690
681
|
# For point events: if occurred_end not set, default to occurred_start
|
|
691
682
|
if occurred_end:
|
|
692
|
-
fact_data[
|
|
683
|
+
fact_data["occurred_end"] = occurred_end
|
|
693
684
|
else:
|
|
694
|
-
fact_data[
|
|
685
|
+
fact_data["occurred_end"] = occurred_start
|
|
695
686
|
|
|
696
687
|
# Add entities if present (validate as Entity objects)
|
|
697
688
|
# LLM sometimes returns strings instead of {"text": "..."} format
|
|
698
|
-
entities = get_value(
|
|
689
|
+
entities = get_value("entities")
|
|
699
690
|
if entities:
|
|
700
691
|
# Validate and normalize each entity
|
|
701
692
|
validated_entities = []
|
|
@@ -703,38 +694,34 @@ Text:
|
|
|
703
694
|
if isinstance(ent, str):
|
|
704
695
|
# Normalize string to Entity object
|
|
705
696
|
validated_entities.append(Entity(text=ent))
|
|
706
|
-
elif isinstance(ent, dict) and
|
|
697
|
+
elif isinstance(ent, dict) and "text" in ent:
|
|
707
698
|
try:
|
|
708
699
|
validated_entities.append(Entity.model_validate(ent))
|
|
709
700
|
except Exception as e:
|
|
710
701
|
logger.warning(f"Invalid entity {ent}: {e}")
|
|
711
702
|
if validated_entities:
|
|
712
|
-
fact_data[
|
|
703
|
+
fact_data["entities"] = validated_entities
|
|
713
704
|
|
|
714
705
|
# Add causal relations if present (validate as CausalRelation objects)
|
|
715
706
|
# Filter out invalid relations (missing required fields)
|
|
716
|
-
causal_relations = get_value(
|
|
707
|
+
causal_relations = get_value("causal_relations")
|
|
717
708
|
if causal_relations:
|
|
718
709
|
validated_relations = []
|
|
719
710
|
for rel in causal_relations:
|
|
720
|
-
if isinstance(rel, dict) and
|
|
711
|
+
if isinstance(rel, dict) and "target_fact_index" in rel and "relation_type" in rel:
|
|
721
712
|
try:
|
|
722
713
|
validated_relations.append(CausalRelation.model_validate(rel))
|
|
723
714
|
except Exception as e:
|
|
724
715
|
logger.warning(f"Invalid causal relation {rel}: {e}")
|
|
725
716
|
if validated_relations:
|
|
726
|
-
fact_data[
|
|
717
|
+
fact_data["causal_relations"] = validated_relations
|
|
727
718
|
|
|
728
719
|
# Always set mentioned_at to the event_date (when the conversation/document occurred)
|
|
729
|
-
fact_data[
|
|
720
|
+
fact_data["mentioned_at"] = event_date.isoformat()
|
|
730
721
|
|
|
731
722
|
# Build Fact model instance
|
|
732
723
|
try:
|
|
733
|
-
fact = Fact(
|
|
734
|
-
fact=combined_text,
|
|
735
|
-
fact_type=fact_type,
|
|
736
|
-
**fact_data
|
|
737
|
-
)
|
|
724
|
+
fact = Fact(fact=combined_text, fact_type=fact_type, **fact_data)
|
|
738
725
|
chunk_facts.append(fact)
|
|
739
726
|
except Exception as e:
|
|
740
727
|
logger.error(f"Failed to create Fact model for fact {i}: {e}")
|
|
@@ -753,7 +740,9 @@ Text:
|
|
|
753
740
|
except BadRequestError as e:
|
|
754
741
|
last_error = e
|
|
755
742
|
if "json_validate_failed" in str(e):
|
|
756
|
-
logger.warning(
|
|
743
|
+
logger.warning(
|
|
744
|
+
f" [1.3.{chunk_index + 1}] Attempt {attempt + 1}/{max_retries} failed with JSON validation error: {e}"
|
|
745
|
+
)
|
|
757
746
|
if attempt < max_retries - 1:
|
|
758
747
|
logger.info(f" [1.3.{chunk_index + 1}] Retrying...")
|
|
759
748
|
continue
|
|
@@ -772,8 +761,8 @@ async def _extract_facts_with_auto_split(
|
|
|
772
761
|
context: str,
|
|
773
762
|
llm_config: LLMConfig,
|
|
774
763
|
agent_name: str = None,
|
|
775
|
-
extract_opinions: bool = False
|
|
776
|
-
) ->
|
|
764
|
+
extract_opinions: bool = False,
|
|
765
|
+
) -> list[dict[str, str]]:
|
|
777
766
|
"""
|
|
778
767
|
Extract facts from a chunk with automatic splitting if output exceeds token limits.
|
|
779
768
|
|
|
@@ -794,6 +783,7 @@ async def _extract_facts_with_auto_split(
|
|
|
794
783
|
List of fact dictionaries extracted from the chunk (possibly from sub-chunks)
|
|
795
784
|
"""
|
|
796
785
|
import logging
|
|
786
|
+
|
|
797
787
|
logger = logging.getLogger(__name__)
|
|
798
788
|
|
|
799
789
|
try:
|
|
@@ -806,9 +796,9 @@ async def _extract_facts_with_auto_split(
|
|
|
806
796
|
context=context,
|
|
807
797
|
llm_config=llm_config,
|
|
808
798
|
agent_name=agent_name,
|
|
809
|
-
extract_opinions=extract_opinions
|
|
799
|
+
extract_opinions=extract_opinions,
|
|
810
800
|
)
|
|
811
|
-
except OutputTooLongError
|
|
801
|
+
except OutputTooLongError:
|
|
812
802
|
# Output exceeded token limits - split the chunk in half and retry
|
|
813
803
|
logger.warning(
|
|
814
804
|
f"Output too long for chunk {chunk_index + 1}/{total_chunks} "
|
|
@@ -824,7 +814,7 @@ async def _extract_facts_with_auto_split(
|
|
|
824
814
|
search_start = max(0, mid_point - search_range)
|
|
825
815
|
search_end = min(len(chunk), mid_point + search_range)
|
|
826
816
|
|
|
827
|
-
sentence_endings = [
|
|
817
|
+
sentence_endings = [". ", "! ", "? ", "\n\n"]
|
|
828
818
|
best_split = mid_point
|
|
829
819
|
|
|
830
820
|
for ending in sentence_endings:
|
|
@@ -838,8 +828,7 @@ async def _extract_facts_with_auto_split(
|
|
|
838
828
|
second_half = chunk[best_split:].strip()
|
|
839
829
|
|
|
840
830
|
logger.info(
|
|
841
|
-
f"Split chunk {chunk_index + 1} into two sub-chunks: "
|
|
842
|
-
f"{len(first_half)} chars and {len(second_half)} chars"
|
|
831
|
+
f"Split chunk {chunk_index + 1} into two sub-chunks: {len(first_half)} chars and {len(second_half)} chars"
|
|
843
832
|
)
|
|
844
833
|
|
|
845
834
|
# Process both halves recursively (in parallel)
|
|
@@ -852,7 +841,7 @@ async def _extract_facts_with_auto_split(
|
|
|
852
841
|
context=context,
|
|
853
842
|
llm_config=llm_config,
|
|
854
843
|
agent_name=agent_name,
|
|
855
|
-
extract_opinions=extract_opinions
|
|
844
|
+
extract_opinions=extract_opinions,
|
|
856
845
|
),
|
|
857
846
|
_extract_facts_with_auto_split(
|
|
858
847
|
chunk=second_half,
|
|
@@ -862,8 +851,8 @@ async def _extract_facts_with_auto_split(
|
|
|
862
851
|
context=context,
|
|
863
852
|
llm_config=llm_config,
|
|
864
853
|
agent_name=agent_name,
|
|
865
|
-
extract_opinions=extract_opinions
|
|
866
|
-
)
|
|
854
|
+
extract_opinions=extract_opinions,
|
|
855
|
+
),
|
|
867
856
|
]
|
|
868
857
|
|
|
869
858
|
sub_results = await asyncio.gather(*sub_tasks)
|
|
@@ -873,9 +862,7 @@ async def _extract_facts_with_auto_split(
|
|
|
873
862
|
for sub_result in sub_results:
|
|
874
863
|
all_facts.extend(sub_result)
|
|
875
864
|
|
|
876
|
-
logger.info(
|
|
877
|
-
f"Successfully extracted {len(all_facts)} facts from split chunk {chunk_index + 1}"
|
|
878
|
-
)
|
|
865
|
+
logger.info(f"Successfully extracted {len(all_facts)} facts from split chunk {chunk_index + 1}")
|
|
879
866
|
|
|
880
867
|
return all_facts
|
|
881
868
|
|
|
@@ -887,7 +874,7 @@ async def extract_facts_from_text(
|
|
|
887
874
|
agent_name: str,
|
|
888
875
|
context: str = "",
|
|
889
876
|
extract_opinions: bool = False,
|
|
890
|
-
) -> tuple[
|
|
877
|
+
) -> tuple[list[Fact], list[tuple[str, int]]]:
|
|
891
878
|
"""
|
|
892
879
|
Extract semantic facts from conversational or narrative text using LLM.
|
|
893
880
|
|
|
@@ -920,7 +907,7 @@ async def extract_facts_from_text(
|
|
|
920
907
|
context=context,
|
|
921
908
|
llm_config=llm_config,
|
|
922
909
|
agent_name=agent_name,
|
|
923
|
-
extract_opinions=extract_opinions
|
|
910
|
+
extract_opinions=extract_opinions,
|
|
924
911
|
)
|
|
925
912
|
for i, chunk in enumerate(chunks)
|
|
926
913
|
]
|
|
@@ -938,8 +925,10 @@ async def extract_facts_from_text(
|
|
|
938
925
|
# ============================================================================
|
|
939
926
|
|
|
940
927
|
# Import types for the orchestration layer (note: ExtractedFact here is different from the Pydantic model above)
|
|
941
|
-
|
|
942
|
-
from
|
|
928
|
+
|
|
929
|
+
from .types import CausalRelation as CausalRelationType
|
|
930
|
+
from .types import ChunkMetadata, RetainContent
|
|
931
|
+
from .types import ExtractedFact as ExtractedFactType
|
|
943
932
|
|
|
944
933
|
logger = logging.getLogger(__name__)
|
|
945
934
|
|
|
@@ -948,11 +937,8 @@ SECONDS_PER_FACT = 10
|
|
|
948
937
|
|
|
949
938
|
|
|
950
939
|
async def extract_facts_from_contents(
|
|
951
|
-
contents:
|
|
952
|
-
|
|
953
|
-
agent_name: str,
|
|
954
|
-
extract_opinions: bool = False
|
|
955
|
-
) -> Tuple[List[ExtractedFactType], List[ChunkMetadata]]:
|
|
940
|
+
contents: list[RetainContent], llm_config, agent_name: str, extract_opinions: bool = False
|
|
941
|
+
) -> tuple[list[ExtractedFactType], list[ChunkMetadata]]:
|
|
956
942
|
"""
|
|
957
943
|
Extract facts from multiple content items in parallel.
|
|
958
944
|
|
|
@@ -985,7 +971,7 @@ async def extract_facts_from_contents(
|
|
|
985
971
|
context=item.context,
|
|
986
972
|
llm_config=llm_config,
|
|
987
973
|
agent_name=agent_name,
|
|
988
|
-
extract_opinions=extract_opinions
|
|
974
|
+
extract_opinions=extract_opinions,
|
|
989
975
|
)
|
|
990
976
|
fact_extraction_tasks.append(task)
|
|
991
977
|
|
|
@@ -993,8 +979,8 @@ async def extract_facts_from_contents(
|
|
|
993
979
|
all_fact_results = await asyncio.gather(*fact_extraction_tasks)
|
|
994
980
|
|
|
995
981
|
# Step 3: Flatten and convert to typed objects
|
|
996
|
-
extracted_facts:
|
|
997
|
-
chunks_metadata:
|
|
982
|
+
extracted_facts: list[ExtractedFactType] = []
|
|
983
|
+
chunks_metadata: list[ChunkMetadata] = []
|
|
998
984
|
|
|
999
985
|
global_chunk_idx = 0
|
|
1000
986
|
global_fact_idx = 0
|
|
@@ -1008,7 +994,7 @@ async def extract_facts_from_contents(
|
|
|
1008
994
|
chunk_text=chunk_text,
|
|
1009
995
|
fact_count=chunk_fact_count,
|
|
1010
996
|
content_index=content_index,
|
|
1011
|
-
chunk_index=global_chunk_idx
|
|
997
|
+
chunk_index=global_chunk_idx,
|
|
1012
998
|
)
|
|
1013
999
|
chunks_metadata.append(chunk_metadata)
|
|
1014
1000
|
global_chunk_idx += 1
|
|
@@ -1029,18 +1015,21 @@ async def extract_facts_from_contents(
|
|
|
1029
1015
|
fact_type=fact_from_llm.fact_type,
|
|
1030
1016
|
entities=[e.text for e in (fact_from_llm.entities or [])],
|
|
1031
1017
|
# occurred_start/end: from LLM only, leave None if not provided
|
|
1032
|
-
occurred_start=_parse_datetime(fact_from_llm.occurred_start)
|
|
1033
|
-
|
|
1018
|
+
occurred_start=_parse_datetime(fact_from_llm.occurred_start)
|
|
1019
|
+
if fact_from_llm.occurred_start
|
|
1020
|
+
else None,
|
|
1021
|
+
occurred_end=_parse_datetime(fact_from_llm.occurred_end)
|
|
1022
|
+
if fact_from_llm.occurred_end
|
|
1023
|
+
else None,
|
|
1034
1024
|
causal_relations=_convert_causal_relations(
|
|
1035
|
-
fact_from_llm.causal_relations or [],
|
|
1036
|
-
global_fact_idx
|
|
1025
|
+
fact_from_llm.causal_relations or [], global_fact_idx
|
|
1037
1026
|
),
|
|
1038
1027
|
content_index=content_index,
|
|
1039
1028
|
chunk_index=chunk_global_idx,
|
|
1040
1029
|
context=content.context,
|
|
1041
1030
|
# mentioned_at: always the event_date (when the conversation/document occurred)
|
|
1042
1031
|
mentioned_at=content.event_date,
|
|
1043
|
-
metadata=content.metadata
|
|
1032
|
+
metadata=content.metadata,
|
|
1044
1033
|
)
|
|
1045
1034
|
|
|
1046
1035
|
extracted_facts.append(extracted_fact)
|
|
@@ -1056,13 +1045,14 @@ async def extract_facts_from_contents(
|
|
|
1056
1045
|
def _parse_datetime(date_str: str):
|
|
1057
1046
|
"""Parse ISO datetime string."""
|
|
1058
1047
|
from dateutil import parser as date_parser
|
|
1048
|
+
|
|
1059
1049
|
try:
|
|
1060
1050
|
return date_parser.isoparse(date_str)
|
|
1061
1051
|
except Exception:
|
|
1062
1052
|
return None
|
|
1063
1053
|
|
|
1064
1054
|
|
|
1065
|
-
def _convert_causal_relations(relations_from_llm, fact_start_idx: int) ->
|
|
1055
|
+
def _convert_causal_relations(relations_from_llm, fact_start_idx: int) -> list[CausalRelationType]:
|
|
1066
1056
|
"""
|
|
1067
1057
|
Convert causal relations from LLM format to ExtractedFact format.
|
|
1068
1058
|
|
|
@@ -1073,13 +1063,13 @@ def _convert_causal_relations(relations_from_llm, fact_start_idx: int) -> List[C
|
|
|
1073
1063
|
causal_relation = CausalRelationType(
|
|
1074
1064
|
relation_type=rel.relation_type,
|
|
1075
1065
|
target_fact_index=fact_start_idx + rel.target_fact_index,
|
|
1076
|
-
strength=rel.strength
|
|
1066
|
+
strength=rel.strength,
|
|
1077
1067
|
)
|
|
1078
1068
|
causal_relations.append(causal_relation)
|
|
1079
1069
|
return causal_relations
|
|
1080
1070
|
|
|
1081
1071
|
|
|
1082
|
-
def _add_temporal_offsets(facts:
|
|
1072
|
+
def _add_temporal_offsets(facts: list[ExtractedFactType], contents: list[RetainContent]) -> None:
|
|
1083
1073
|
"""
|
|
1084
1074
|
Add time offsets to preserve fact ordering within each content.
|
|
1085
1075
|
|