hindsight_api-0.1.5-py3-none-any.whl → hindsight_api-0.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +10 -9
- hindsight_api/alembic/env.py +5 -8
- hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +266 -180
- hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +32 -32
- hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +11 -11
- hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +7 -12
- hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +23 -15
- hindsight_api/alembic/versions/rename_personality_to_disposition.py +30 -21
- hindsight_api/api/__init__.py +10 -10
- hindsight_api/api/http.py +575 -593
- hindsight_api/api/mcp.py +30 -28
- hindsight_api/banner.py +13 -6
- hindsight_api/config.py +9 -13
- hindsight_api/engine/__init__.py +9 -9
- hindsight_api/engine/cross_encoder.py +22 -21
- hindsight_api/engine/db_utils.py +5 -4
- hindsight_api/engine/embeddings.py +22 -21
- hindsight_api/engine/entity_resolver.py +81 -75
- hindsight_api/engine/llm_wrapper.py +61 -79
- hindsight_api/engine/memory_engine.py +603 -625
- hindsight_api/engine/query_analyzer.py +100 -97
- hindsight_api/engine/response_models.py +105 -106
- hindsight_api/engine/retain/__init__.py +9 -16
- hindsight_api/engine/retain/bank_utils.py +34 -58
- hindsight_api/engine/retain/chunk_storage.py +4 -12
- hindsight_api/engine/retain/deduplication.py +9 -28
- hindsight_api/engine/retain/embedding_processing.py +4 -11
- hindsight_api/engine/retain/embedding_utils.py +3 -4
- hindsight_api/engine/retain/entity_processing.py +7 -17
- hindsight_api/engine/retain/fact_extraction.py +155 -165
- hindsight_api/engine/retain/fact_storage.py +11 -23
- hindsight_api/engine/retain/link_creation.py +11 -39
- hindsight_api/engine/retain/link_utils.py +166 -95
- hindsight_api/engine/retain/observation_regeneration.py +39 -52
- hindsight_api/engine/retain/orchestrator.py +72 -62
- hindsight_api/engine/retain/types.py +49 -43
- hindsight_api/engine/search/__init__.py +5 -5
- hindsight_api/engine/search/fusion.py +6 -15
- hindsight_api/engine/search/graph_retrieval.py +22 -23
- hindsight_api/engine/search/mpfp_retrieval.py +76 -92
- hindsight_api/engine/search/observation_utils.py +9 -16
- hindsight_api/engine/search/reranking.py +4 -7
- hindsight_api/engine/search/retrieval.py +87 -66
- hindsight_api/engine/search/scoring.py +5 -7
- hindsight_api/engine/search/temporal_extraction.py +8 -11
- hindsight_api/engine/search/think_utils.py +115 -39
- hindsight_api/engine/search/trace.py +68 -39
- hindsight_api/engine/search/tracer.py +44 -35
- hindsight_api/engine/search/types.py +20 -17
- hindsight_api/engine/task_backend.py +21 -26
- hindsight_api/engine/utils.py +25 -10
- hindsight_api/main.py +21 -40
- hindsight_api/mcp_local.py +190 -0
- hindsight_api/metrics.py +44 -30
- hindsight_api/migrations.py +10 -8
- hindsight_api/models.py +60 -72
- hindsight_api/pg0.py +22 -23
- hindsight_api/server.py +3 -6
- {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.6.dist-info}/METADATA +2 -2
- hindsight_api-0.1.6.dist-info/RECORD +64 -0
- {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.6.dist-info}/entry_points.txt +1 -0
- hindsight_api-0.1.5.dist-info/RECORD +0 -63
- {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.6.dist-info}/WHEEL +0 -0
hindsight_api/engine/retain/observation_regeneration.py

@@ -3,15 +3,14 @@ Observation regeneration for retain pipeline.

 Regenerates entity observations as part of the retain transaction.
 """
+
 import logging
 import time
 import uuid
-from datetime import
-from typing import List, Dict, Optional
+from datetime import UTC, datetime

 from ..search import observation_utils
 from . import embedding_utils
-from ..db_utils import acquire_with_retry
 from .types import EntityLink

 logger = logging.getLogger(__name__)
@@ -19,12 +18,12 @@ logger = logging.getLogger(__name__)

 def utcnow():
     """Get current UTC time."""
-    return datetime.now(
+    return datetime.now(UTC)


 # Simple dataclass-like container for facts (avoid importing from memory_engine)
 class MemoryFactForObservation:
-    def __init__(self, id: str, text: str, fact_type: str, context: str, occurred_start:
+    def __init__(self, id: str, text: str, fact_type: str, context: str, occurred_start: str | None):
         self.id = id
         self.text = text
         self.fact_type = fact_type
@@ -33,12 +32,7 @@ class MemoryFactForObservation:


 async def regenerate_observations_batch(
-    conn,
-    embeddings_model,
-    llm_config,
-    bank_id: str,
-    entity_links: List[EntityLink],
-    log_buffer: List[str] = None
+    conn, embeddings_model, llm_config, bank_id: str, entity_links: list[EntityLink], log_buffer: list[str] = None
 ) -> None:
     """
     Regenerate observations for top entities in this batch.
@@ -61,7 +55,7 @@ async def regenerate_observations_batch(
         return

     # Count mentions per entity in this batch
-    entity_mention_counts:
+    entity_mention_counts: dict[str, int] = {}
     for link in entity_links:
         if link.entity_id:
             entity_id = str(link.entity_id)
@@ -71,11 +65,7 @@ async def regenerate_observations_batch(
         return

     # Sort by mention count descending and take top N
-    sorted_entities = sorted(
-        entity_mention_counts.items(),
-        key=lambda x: x[1],
-        reverse=True
-    )
+    sorted_entities = sorted(entity_mention_counts.items(), key=lambda x: x[1], reverse=True)
     entities_to_process = [e[0] for e in sorted_entities[:TOP_N_ENTITIES]]

     obs_start = time.time()
@@ -89,9 +79,10 @@ async def regenerate_observations_batch(
         SELECT id, canonical_name FROM entities
         WHERE id = ANY($1) AND bank_id = $2
         """,
-        entity_uuids,
+        entity_uuids,
+        bank_id,
     )
-    entity_names = {row[
+    entity_names = {row["id"]: row["canonical_name"] for row in entity_rows}

     # Batch query for fact counts
     fact_counts = await conn.fetch(
@@ -102,9 +93,10 @@ async def regenerate_observations_batch(
         WHERE ue.entity_id = ANY($1) AND mu.bank_id = $2
         GROUP BY ue.entity_id
         """,
-        entity_uuids,
+        entity_uuids,
+        bank_id,
     )
-    entity_fact_counts = {row[
+    entity_fact_counts = {row["entity_id"]: row["cnt"] for row in fact_counts}

     # Filter entities that meet the threshold
     entities_with_names = []
@@ -126,8 +118,7 @@ async def regenerate_observations_batch(
     for entity_id, entity_name in entities_with_names:
         try:
             obs_ids = await _regenerate_entity_observations(
-                conn, embeddings_model, llm_config,
-                bank_id, entity_id, entity_name
+                conn, embeddings_model, llm_config, bank_id, entity_id, entity_name
             )
             total_observations += len(obs_ids)
         except Exception as e:
@@ -135,17 +126,14 @@ async def regenerate_observations_batch(

     obs_time = time.time() - obs_start
     if log_buffer is not None:
-        log_buffer.append(
+        log_buffer.append(
+            f"[11] Observations: {total_observations} observations for {len(entities_with_names)} entities in {obs_time:.3f}s"
+        )


 async def _regenerate_entity_observations(
-    conn,
-    embeddings_model,
-    llm_config,
-    bank_id: str,
-    entity_id: str,
-    entity_name: str
-) -> List[str]:
+    conn, embeddings_model, llm_config, bank_id: str, entity_id: str, entity_name: str
+) -> list[str]:
     """
     Regenerate observations for a single entity.

@@ -176,7 +164,8 @@ async def _regenerate_entity_observations(
         ORDER BY mu.occurred_start DESC
         LIMIT 50
         """,
-        bank_id,
+        bank_id,
+        entity_uuid,
     )

     if not rows:
@@ -185,21 +174,19 @@ async def _regenerate_entity_observations(
     # Convert to fact objects for observation extraction
     facts = []
     for row in rows:
-        occurred_start = row[
-        facts.append(
+        occurred_start = row["occurred_start"].isoformat() if row["occurred_start"] else None
+        facts.append(
+            MemoryFactForObservation(
+                id=str(row["id"]),
+                text=row["text"],
+                fact_type=row["fact_type"],
+                context=row["context"],
+                occurred_start=occurred_start,
+            )
+        )

     # Extract observations using LLM
-    observations = await observation_utils.extract_observations_from_facts(
-        llm_config,
-        entity_name,
-        facts
-    )
+    observations = await observation_utils.extract_observations_from_facts(llm_config, entity_name, facts)

     if not observations:
         return []
@@ -217,13 +204,12 @@ async def _regenerate_entity_observations(
             AND ue.entity_id = $2
         )
         """,
-        bank_id,
+        bank_id,
+        entity_uuid,
     )

     # Generate embeddings for new observations
-    embeddings = await embedding_utils.generate_embeddings_batch(
-        embeddings_model, observations
-    )
+    embeddings = await embedding_utils.generate_embeddings_batch(embeddings_model, observations)

     # Insert new observations
     current_time = utcnow()
@@ -247,9 +233,9 @@ async def _regenerate_entity_observations(
             current_time,
             current_time,
             current_time,
-            current_time
+            current_time,
         )
-        obs_id = str(result[
+        obs_id = str(result["id"])
         created_ids.append(obs_id)

         # Link observation to entity
@@ -258,7 +244,8 @@ async def _regenerate_entity_observations(
             INSERT INTO unit_entities (unit_id, entity_id)
             VALUES ($1, $2)
             """,
-            uuid.UUID(obs_id),
+            uuid.UUID(obs_id),
+            entity_uuid,
         )

     return created_ids
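Most of the churn in this file is mechanical: multi-line argument lists joined onto single lines, long log strings split by the formatter, record fields read as row["id"]-style keys, and typing.List/Dict/Optional replaced by builtin generics and | unions, with utcnow() now returning datetime.now(UTC). The following is an illustrative sketch of the style the new code settles on; it is not code shipped in the wheel and assumes Python 3.11+ (datetime.UTC):

# Illustrative sketch only, not part of hindsight-api; shows the annotation and
# timezone style 0.1.6 standardizes on. Requires Python 3.11+ for datetime.UTC.
from datetime import UTC, datetime


def utcnow() -> datetime:
    """Aware UTC timestamp, as in the regenerated utcnow() helpers."""
    return datetime.now(UTC)


def top_entities(mention_counts: dict[str, int], limit: int = 5) -> list[str]:
    """Builtin generics (PEP 585) replace typing.Dict and typing.List."""
    ranked = sorted(mention_counts.items(), key=lambda kv: kv[1], reverse=True)
    return [entity_id for entity_id, _ in ranked[:limit]]


def maybe_isoformat(occurred_start: datetime | None) -> str | None:
    """PEP 604 unions replace typing.Optional."""
    return occurred_start.isoformat() if occurred_start else None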
hindsight_api/engine/retain/orchestrator.py

@@ -3,31 +3,33 @@ Main orchestrator for the retain pipeline.

 Coordinates all retain pipeline modules to store memories efficiently.
 """
+
 import logging
 import time
 import uuid
-from datetime import
-from typing import
+from datetime import UTC, datetime
+from typing import Any

-from . import bank_utils
 from ..db_utils import acquire_with_retry
+from . import bank_utils


 def utcnow():
     """Get current UTC time."""
-    return datetime.now(
+    return datetime.now(UTC)
+

-from .types import RetainContent, ExtractedFact, ProcessedFact, EntityLink
 from . import (
-    fact_extraction,
-    embedding_processing,
-    deduplication,
     chunk_storage,
-    fact_storage,
+    deduplication,
+    embedding_processing,
     entity_processing,
+    fact_extraction,
+    fact_storage,
     link_creation,
-    observation_regeneration
+    observation_regeneration,
 )
+from .types import ExtractedFact, ProcessedFact, RetainContent

 logger = logging.getLogger(__name__)

@@ -41,12 +43,12 @@ async def retain_batch(
     format_date_fn,
     duplicate_checker_fn,
     bank_id: str,
-    contents_dicts:
-    document_id:
+    contents_dicts: list[dict[str, Any]],
+    document_id: str | None = None,
     is_first_batch: bool = True,
-    fact_type_override:
-    confidence_score:
-) ->
+    fact_type_override: str | None = None,
+    confidence_score: float | None = None,
+) -> list[list[str]]:
     """
     Process a batch of content through the retain pipeline.

@@ -73,10 +75,10 @@ async def retain_batch(

     # Buffer all logs
     log_buffer = []
-    log_buffer.append(f"{'='*60}")
+    log_buffer.append(f"{'=' * 60}")
     log_buffer.append(f"RETAIN_BATCH START: {bank_id}")
     log_buffer.append(f"Batch size: {len(contents_dicts)} content items, {total_chars:,} chars")
-    log_buffer.append(f"{'='*60}")
+    log_buffer.append(f"{'=' * 60}")

     # Get bank profile
     profile = await bank_utils.get_bank_profile(pool, bank_id)
@@ -89,21 +91,20 @@ async def retain_batch(
             content=item["content"],
             context=item.get("context", ""),
             event_date=item.get("event_date") or utcnow(),
-            metadata=item.get("metadata", {})
+            metadata=item.get("metadata", {}),
         )
         contents.append(content)

     # Step 1: Extract facts from all contents
     step_start = time.time()
-    extract_opinions =
+    extract_opinions = fact_type_override == "opinion"

     extracted_facts, chunks = await fact_extraction.extract_facts_from_contents(
-        contents,
-        llm_config,
-        agent_name,
-        extract_opinions
+        contents, llm_config, agent_name, extract_opinions
+    )
+    log_buffer.append(
+        f"[1] Extract facts: {len(extracted_facts)} facts, {len(chunks)} chunks from {len(contents)} contents in {time.time() - step_start:.3f}s"
     )
-    log_buffer.append(f"[1] Extract facts: {len(extracted_facts)} facts, {len(chunks)} chunks from {len(contents)} contents in {time.time() - step_start:.3f}s")

     if not extracted_facts:
         return [[] for _ in contents]
@@ -130,6 +131,7 @@ async def retain_batch(

     # Group contents by document_id for document tracking and chunk storage
     from collections import defaultdict
+
     contents_by_doc = defaultdict(list)
     for idx, content_dict in enumerate(contents_dicts):
         doc_id = content_dict.get("document_id")
@@ -155,7 +157,11 @@ async def retain_batch(
         if first_item.get("context"):
             retain_params["context"] = first_item["context"]
         if first_item.get("event_date"):
-            retain_params["event_date"] =
+            retain_params["event_date"] = (
+                first_item["event_date"].isoformat()
+                if hasattr(first_item["event_date"], "isoformat")
+                else str(first_item["event_date"])
+            )
         if first_item.get("metadata"):
             retain_params["metadata"] = first_item["metadata"]

@@ -195,7 +201,11 @@ async def retain_batch(
         if first_item.get("context"):
             retain_params["context"] = first_item["context"]
         if first_item.get("event_date"):
-            retain_params["event_date"] =
+            retain_params["event_date"] = (
+                first_item["event_date"].isoformat()
+                if hasattr(first_item["event_date"], "isoformat")
+                else str(first_item["event_date"])
+            )
         if first_item.get("metadata"):
             retain_params["metadata"] = first_item["metadata"]

@@ -205,7 +215,9 @@ async def retain_batch(
             document_ids_added.append(actual_doc_id)

     if document_ids_added:
-        log_buffer.append(
+        log_buffer.append(
+            f"[2.5] Document tracking: {len(document_ids_added)} documents in {time.time() - step_start:.3f}s"
+        )

     # Store chunks and map to facts for all documents
     step_start = time.time()
@@ -230,7 +242,9 @@ async def retain_batch(
         for chunk_idx, chunk_id in chunk_id_map.items():
             chunk_id_map_by_doc[(doc_id, chunk_idx)] = chunk_id

-    log_buffer.append(
+    log_buffer.append(
+        f"[3] Store chunks: {len(chunks)} chunks for {len(chunks_by_doc)} documents in {time.time() - step_start:.3f}s"
+    )

     # Map chunk_ids and document_ids to facts
     for fact, processed_fact in zip(extracted_facts, processed_facts):
@@ -265,7 +279,9 @@ async def retain_batch(
         is_duplicate_flags = await deduplication.check_duplicates_batch(
             conn, bank_id, processed_facts, duplicate_checker_fn
         )
-        log_buffer.append(
+        log_buffer.append(
+            f"[4] Deduplication: {sum(is_duplicate_flags)} duplicates in {time.time() - step_start:.3f}s"
+        )

         # Filter out duplicates
         non_duplicate_facts = deduplication.filter_duplicates(processed_facts, is_duplicate_flags)
@@ -293,14 +309,18 @@ async def retain_batch(
         # Create semantic links
         step_start = time.time()
         embeddings_for_links = [fact.embedding for fact in non_duplicate_facts]
-        semantic_link_count = await link_creation.create_semantic_links_batch(
+        semantic_link_count = await link_creation.create_semantic_links_batch(
+            conn, bank_id, unit_ids, embeddings_for_links
+        )
         log_buffer.append(f"[8] Semantic links: {semantic_link_count} links in {time.time() - step_start:.3f}s")

         # Insert entity links
         step_start = time.time()
         if entity_links:
             await entity_processing.insert_entity_links_batch(conn, entity_links)
-            log_buffer.append(
+            log_buffer.append(
+                f"[9] Entity links: {len(entity_links) if entity_links else 0} links in {time.time() - step_start:.3f}s"
+            )

         # Create causal links
         step_start = time.time()
@@ -309,34 +329,22 @@ async def retain_batch(

         # Regenerate observations INSIDE transaction for atomicity
         await observation_regeneration.regenerate_observations_batch(
-            conn,
-            embeddings_model,
-            llm_config,
-            bank_id,
-            entity_links,
-            log_buffer
+            conn, embeddings_model, llm_config, bank_id, entity_links, log_buffer
         )

     # Map results back to original content items
-    result_unit_ids = _map_results_to_contents(
-        contents, extracted_facts, is_duplicate_flags, unit_ids
-    )
+    result_unit_ids = _map_results_to_contents(contents, extracted_facts, is_duplicate_flags, unit_ids)

     # Trigger background tasks AFTER transaction commits (opinion reinforcement only)
-    await _trigger_background_tasks(
-        task_backend,
-        bank_id,
-        unit_ids,
-        non_duplicate_facts
-    )
+    await _trigger_background_tasks(task_backend, bank_id, unit_ids, non_duplicate_facts)

     # Log final summary
     total_time = time.time() - start_time
-    log_buffer.append(f"{'='*60}")
+    log_buffer.append(f"{'=' * 60}")
     log_buffer.append(f"RETAIN_BATCH COMPLETE: {len(unit_ids)} units in {total_time:.3f}s")
     if document_ids_added:
         log_buffer.append(f"Documents: {', '.join(document_ids_added)}")
-    log_buffer.append(f"{'='*60}")
+    log_buffer.append(f"{'=' * 60}")

     logger.info("\n" + "\n".join(log_buffer) + "\n")

@@ -344,11 +352,11 @@ async def retain_batch(


 def _map_results_to_contents(
-    contents:
-    extracted_facts:
-    is_duplicate_flags:
-    unit_ids:
-) ->
+    contents: list[RetainContent],
+    extracted_facts: list[ExtractedFact],
+    is_duplicate_flags: list[bool],
+    unit_ids: list[str],
+) -> list[list[str]]:
     """
     Map created unit IDs back to original content items.

@@ -376,17 +384,19 @@ def _map_results_to_contents(
 async def _trigger_background_tasks(
     task_backend,
     bank_id: str,
-    unit_ids:
-    facts:
+    unit_ids: list[str],
+    facts: list[ProcessedFact],
 ) -> None:
     """Trigger opinion reinforcement as background task (after transaction commits)."""
     # Trigger opinion reinforcement if there are entities
     fact_entities = [[e.name for e in fact.entities] for fact in facts]
     if any(fact_entities):
-        await task_backend.submit_task(
+        await task_backend.submit_task(
+            {
+                "type": "reinforce_opinion",
+                "bank_id": bank_id,
+                "created_unit_ids": unit_ids,
+                "unit_texts": [fact.fact_text for fact in facts],
+                "unit_entities": fact_entities,
+            }
+        )
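Both document-tracking branches in retain_batch now normalize event_date with the same guard: call .isoformat() when the value is datetime-like, otherwise fall back to str(). Pulled out as a standalone sketch (the package inlines this expression; normalize_event_date is a hypothetical helper name, not part of the wheel):

# Hypothetical helper illustrating the event_date guard used in retain_batch.
from datetime import UTC, datetime
from typing import Any


def normalize_event_date(value: Any) -> str:
    """Return an ISO-8601 string for datetime-like values, else a plain str()."""
    return value.isoformat() if hasattr(value, "isoformat") else str(value)


# Both forms end up as strings suitable for the stored retain_params:
print(normalize_event_date(datetime(2024, 1, 2, tzinfo=UTC)))  # 2024-01-02T00:00:00+00:00
print(normalize_event_date("2024-01-02"))                      # 2024-01-02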