hindsight-api 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- hindsight_api/__init__.py +10 -9
- hindsight_api/alembic/env.py +5 -8
- hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +266 -180
- hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +32 -32
- hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +11 -11
- hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +7 -12
- hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +23 -15
- hindsight_api/alembic/versions/rename_personality_to_disposition.py +30 -21
- hindsight_api/api/__init__.py +10 -10
- hindsight_api/api/http.py +575 -593
- hindsight_api/api/mcp.py +30 -28
- hindsight_api/banner.py +13 -6
- hindsight_api/config.py +9 -13
- hindsight_api/engine/__init__.py +9 -9
- hindsight_api/engine/cross_encoder.py +22 -21
- hindsight_api/engine/db_utils.py +5 -4
- hindsight_api/engine/embeddings.py +22 -21
- hindsight_api/engine/entity_resolver.py +81 -75
- hindsight_api/engine/llm_wrapper.py +61 -79
- hindsight_api/engine/memory_engine.py +603 -625
- hindsight_api/engine/query_analyzer.py +100 -97
- hindsight_api/engine/response_models.py +105 -106
- hindsight_api/engine/retain/__init__.py +9 -16
- hindsight_api/engine/retain/bank_utils.py +34 -58
- hindsight_api/engine/retain/chunk_storage.py +4 -12
- hindsight_api/engine/retain/deduplication.py +9 -28
- hindsight_api/engine/retain/embedding_processing.py +4 -11
- hindsight_api/engine/retain/embedding_utils.py +3 -4
- hindsight_api/engine/retain/entity_processing.py +7 -17
- hindsight_api/engine/retain/fact_extraction.py +155 -165
- hindsight_api/engine/retain/fact_storage.py +11 -23
- hindsight_api/engine/retain/link_creation.py +11 -39
- hindsight_api/engine/retain/link_utils.py +166 -95
- hindsight_api/engine/retain/observation_regeneration.py +39 -52
- hindsight_api/engine/retain/orchestrator.py +72 -62
- hindsight_api/engine/retain/types.py +49 -43
- hindsight_api/engine/search/__init__.py +5 -5
- hindsight_api/engine/search/fusion.py +6 -15
- hindsight_api/engine/search/graph_retrieval.py +22 -23
- hindsight_api/engine/search/mpfp_retrieval.py +76 -92
- hindsight_api/engine/search/observation_utils.py +9 -16
- hindsight_api/engine/search/reranking.py +4 -7
- hindsight_api/engine/search/retrieval.py +87 -66
- hindsight_api/engine/search/scoring.py +5 -7
- hindsight_api/engine/search/temporal_extraction.py +8 -11
- hindsight_api/engine/search/think_utils.py +115 -39
- hindsight_api/engine/search/trace.py +68 -39
- hindsight_api/engine/search/tracer.py +44 -35
- hindsight_api/engine/search/types.py +20 -17
- hindsight_api/engine/task_backend.py +21 -26
- hindsight_api/engine/utils.py +25 -10
- hindsight_api/main.py +21 -40
- hindsight_api/mcp_local.py +190 -0
- hindsight_api/metrics.py +44 -30
- hindsight_api/migrations.py +10 -8
- hindsight_api/models.py +60 -72
- hindsight_api/pg0.py +22 -23
- hindsight_api/server.py +3 -6
- hindsight_api-0.1.7.dist-info/METADATA +178 -0
- hindsight_api-0.1.7.dist-info/RECORD +64 -0
- {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.7.dist-info}/entry_points.txt +1 -0
- hindsight_api-0.1.5.dist-info/METADATA +0 -42
- hindsight_api-0.1.5.dist-info/RECORD +0 -63
- {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.7.dist-info}/WHEEL +0 -0
hindsight_api/engine/retain/link_utils.py

```diff
@@ -2,10 +2,9 @@
 Link creation utilities for temporal, semantic, and entity links.
 """
 
-import time
 import logging
-
-from datetime import
+import time
+from datetime import UTC, datetime, timedelta
 from uuid import UUID
 
 from .types import EntityLink
@@ -19,7 +18,7 @@ def _normalize_datetime(dt):
         return None
     if dt.tzinfo is None:
         # Naive datetime - assume UTC
-        return dt.replace(tzinfo=
+        return dt.replace(tzinfo=UTC)
     return dt
 
 
@@ -54,24 +53,26 @@ def compute_temporal_links(
     try:
         time_lower = unit_event_date_norm - timedelta(hours=time_window_hours)
     except OverflowError:
-        time_lower = datetime.min.replace(tzinfo=
+        time_lower = datetime.min.replace(tzinfo=UTC)
     try:
         time_upper = unit_event_date_norm + timedelta(hours=time_window_hours)
     except OverflowError:
-        time_upper = datetime.max.replace(tzinfo=
+        time_upper = datetime.max.replace(tzinfo=UTC)
 
     # Filter candidates within this unit's time window
     matching_neighbors = [
-        (row[
+        (row["id"], row["event_date"])
         for row in candidates
-        if time_lower <= _normalize_datetime(row[
+        if time_lower <= _normalize_datetime(row["event_date"]) <= time_upper
     ][:10]  # Limit to top 10
 
     for recent_id, recent_event_date in matching_neighbors:
         # Calculate temporal proximity weight
-        time_diff_hours = abs(
+        time_diff_hours = abs(
+            (unit_event_date_norm - _normalize_datetime(recent_event_date)).total_seconds() / 3600
+        )
         weight = max(0.3, 1.0 - (time_diff_hours / time_window_hours))
-        links.append((unit_id, str(recent_id),
+        links.append((unit_id, str(recent_id), "temporal", weight, None))
 
     return links
 
@@ -99,17 +100,17 @@ def compute_temporal_query_bounds(
     try:
         min_date = min(all_dates) - timedelta(hours=time_window_hours)
     except OverflowError:
-        min_date = datetime.min.replace(tzinfo=
+        min_date = datetime.min.replace(tzinfo=UTC)
 
     try:
         max_date = max(all_dates) + timedelta(hours=time_window_hours)
     except OverflowError:
-        max_date = datetime.max.replace(tzinfo=
+        max_date = datetime.max.replace(tzinfo=UTC)
 
     return min_date, max_date
 
 
-def _log(log_buffer, message, level=
+def _log(log_buffer, message, level="info"):
     """Helper to log to buffer if available, otherwise use logger.
 
     Args:
@@ -117,7 +118,7 @@ def _log(log_buffer, message, level='info'):
         message: The log message
         level: 'info', 'debug', 'warning', or 'error'. Debug messages are not added to buffer.
     """
-    if level ==
+    if level == "debug":
         # Debug messages only go to logger, not to buffer
         logger.debug(message)
         return
@@ -125,23 +126,23 @@ def _log(log_buffer, message, level='info'):
     if log_buffer is not None:
         log_buffer.append(message)
     else:
-        if level ==
+        if level == "info":
            logger.info(message)
         else:
-            logger.log(logging.WARNING if level ==
+            logger.log(logging.WARNING if level == "warning" else logging.ERROR, message)
 
 
 async def extract_entities_batch_optimized(
     entity_resolver,
     conn,
     bank_id: str,
-    unit_ids:
-    sentences:
+    unit_ids: list[str],
+    sentences: list[str],
     context: str,
-    fact_dates:
-    llm_entities:
-    log_buffer:
-) ->
+    fact_dates: list,
+    llm_entities: list[list[dict]],
+    log_buffer: list[str] = None,
+) -> list[tuple]:
     """
     Process LLM-extracted entities for ALL facts in batch.
 
```
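A note on the temporal-link hunks above: naive event dates are treated as UTC, and each in-window neighbor gets a weight that decays linearly with the time gap but never drops below 0.3. The sketch below restates that arithmetic outside the package; the standalone form and the helper names are illustrative, only the UTC assumption and the weight formula come from the diff.

```python
from datetime import UTC, datetime


def normalize_to_utc(dt: datetime | None) -> datetime | None:
    # Mirrors _normalize_datetime above: naive datetimes are assumed to already be UTC.
    if dt is None:
        return None
    return dt.replace(tzinfo=UTC) if dt.tzinfo is None else dt


def temporal_weight(a: datetime, b: datetime, time_window_hours: int = 24) -> float:
    # Same decay as compute_temporal_links: linear within the window, floored at 0.3.
    diff_hours = abs((normalize_to_utc(a) - normalize_to_utc(b)).total_seconds() / 3600)
    return max(0.3, 1.0 - (diff_hours / time_window_hours))


# Two events six hours apart in a 24-hour window score 0.75.
print(temporal_weight(datetime(2024, 1, 1, 12, tzinfo=UTC), datetime(2024, 1, 1, 18)))
```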
```diff
@@ -171,15 +172,19 @@ async def extract_entities_batch_optimized(
             formatted_entities = []
             for ent in entity_list:
                 # Handle both Entity objects and dicts
-                if hasattr(ent,
+                if hasattr(ent, "text"):
                     # Entity objects only have 'text', default type to 'CONCEPT'
-                    formatted_entities.append({
+                    formatted_entities.append({"text": ent.text, "type": "CONCEPT"})
                 elif isinstance(ent, dict):
-                    formatted_entities.append({
+                    formatted_entities.append({"text": ent.get("text", ""), "type": ent.get("type", "CONCEPT")})
             all_entities.append(formatted_entities)
 
         total_entities = sum(len(ents) for ents in all_entities)
-        _log(
+        _log(
+            log_buffer,
+            f" [6.1] Process LLM entities: {total_entities} entities from {len(sentences)} facts in {time.time() - substep_start:.3f}s",
+            level="debug",
+        )
 
         # Step 2: Resolve entities in BATCH (much faster!)
         substep_start = time.time()
@@ -195,13 +200,19 @@ async def extract_entities_batch_optimized(
                 continue
 
             for local_idx, entity in enumerate(entities):
-                all_entities_flat.append(
-
-
-
-
+                all_entities_flat.append(
+                    {
+                        "text": entity["text"],
+                        "type": entity["type"],
+                        "nearby_entities": entities,
+                    }
+                )
                 entity_to_unit.append((unit_id, local_idx, fact_date))
-        _log(
+        _log(
+            log_buffer,
+            f" [6.2.1] Prepare entities: {len(all_entities_flat)} entities in {time.time() - substep_6_2_1_start:.3f}s",
+            level="debug",
+        )
 
         # Resolve ALL entities in one batch call
         if all_entities_flat:
@@ -210,7 +221,7 @@ async def extract_entities_batch_optimized(
 
             # Add per-entity dates to entity data for batch resolution
             for idx, (unit_id, local_idx, fact_date) in enumerate(entity_to_unit):
-                all_entities_flat[idx][
+                all_entities_flat[idx]["event_date"] = fact_date
 
             # Resolve ALL entities in ONE batch call (much faster than sequential buckets)
             # INSERT ... ON CONFLICT handles any race conditions at the DB level
@@ -219,10 +230,14 @@ async def extract_entities_batch_optimized(
                 entities_data=all_entities_flat,
                 context=context,
                 unit_event_date=None,  # Not used when per-entity dates provided
-                conn=conn  # Use main transaction connection
+                conn=conn,  # Use main transaction connection
             )
 
-            _log(
+            _log(
+                log_buffer,
+                f" [6.2.2] Resolve entities: {len(all_entities_flat)} entities in single batch in {time.time() - substep_6_2_2_start:.3f}s",
+                level="debug",
+            )
 
             # [6.2.3] Create unit-entity links in BATCH
             substep_6_2_3_start = time.time()
@@ -239,12 +254,24 @@ async def extract_entities_batch_optimized(
 
             # Batch insert all unit-entity links (MUCH faster!)
             await entity_resolver.link_units_to_entities_batch(unit_entity_pairs, conn=conn)
-            _log(
+            _log(
+                log_buffer,
+                f" [6.2.3] Create unit-entity links (batched): {len(unit_entity_pairs)} links in {time.time() - substep_6_2_3_start:.3f}s",
+                level="debug",
+            )
 
-            _log(
+            _log(
+                log_buffer,
+                f" [6.2] Entity resolution (batched): {len(all_entities_flat)} entities resolved in {time.time() - step_6_2_start:.3f}s",
+                level="debug",
+            )
         else:
             unit_to_entity_ids = {}
-            _log(
+            _log(
+                log_buffer,
+                f" [6.2] Entity resolution (batched): 0 entities in {time.time() - step_6_2_start:.3f}s",
+                level="debug",
+            )
 
         # Step 3: Create entity links between units that share entities
         substep_start = time.time()
@@ -253,13 +280,14 @@ async def extract_entities_batch_optimized(
         for entity_ids in unit_to_entity_ids.values():
             all_entity_ids.update(entity_ids)
 
-        _log(log_buffer, f" [6.3] Creating entity links for {len(all_entity_ids)} unique entities...", level=
+        _log(log_buffer, f" [6.3] Creating entity links for {len(all_entity_ids)} unique entities...", level="debug")
 
         # Find all units that reference these entities (ONE batched query)
        entity_to_units = {}
        if all_entity_ids:
            query_start = time.time()
            import uuid
+
            entity_id_list = [uuid.UUID(eid) if isinstance(eid, str) else eid for eid in all_entity_ids]
            rows = await conn.fetch(
                """
@@ -267,25 +295,29 @@ async def extract_entities_batch_optimized(
                 FROM unit_entities
                 WHERE entity_id = ANY($1::uuid[])
                 """,
-                entity_id_list
+                entity_id_list,
+            )
+            _log(
+                log_buffer,
+                f" [6.3.1] Query unit_entities: {len(rows)} rows in {time.time() - query_start:.3f}s",
+                level="debug",
             )
-            _log(log_buffer, f" [6.3.1] Query unit_entities: {len(rows)} rows in {time.time() - query_start:.3f}s", level='debug')
 
             # Group by entity_id
             group_start = time.time()
             for row in rows:
-                entity_id = row[
+                entity_id = row["entity_id"]
                 if entity_id not in entity_to_units:
                     entity_to_units[entity_id] = []
-                entity_to_units[entity_id].append(row[
-            _log(log_buffer, f" [6.3.2] Group by entity_id: {time.time() - group_start:.3f}s", level=
+                entity_to_units[entity_id].append(row["unit_id"])
+            _log(log_buffer, f" [6.3.2] Group by entity_id: {time.time() - group_start:.3f}s", level="debug")
 
         # Create bidirectional links between units that share entities
         # OPTIMIZATION: Limit links per entity to avoid N² explosion
         # Only link each new unit to the most recent MAX_LINKS_PER_ENTITY units
         MAX_LINKS_PER_ENTITY = 50  # Limit to prevent explosion when entity appears in many facts
         link_gen_start = time.time()
-        links:
+        links: list[EntityLink] = []
         new_unit_set = set(unit_ids)  # Units from this batch
 
         def to_uuid(val) -> UUID:
@@ -299,27 +331,52 @@ async def extract_entities_batch_optimized(
 
             # Link new units to each other (within batch) - also limited
             # For very common entities, limit within-batch links too
-            new_units_to_link =
+            new_units_to_link = (
+                new_units[-MAX_LINKS_PER_ENTITY:] if len(new_units) > MAX_LINKS_PER_ENTITY else new_units
+            )
             for i, unit_id_1 in enumerate(new_units_to_link):
-                for unit_id_2 in new_units_to_link[i+1:]:
-                    links.append(
-
+                for unit_id_2 in new_units_to_link[i + 1 :]:
+                    links.append(
+                        EntityLink(
+                            from_unit_id=to_uuid(unit_id_1), to_unit_id=to_uuid(unit_id_2), entity_id=entity_uuid
+                        )
+                    )
+                    links.append(
+                        EntityLink(
+                            from_unit_id=to_uuid(unit_id_2), to_unit_id=to_uuid(unit_id_1), entity_id=entity_uuid
+                        )
+                    )
 
             # Link new units to LIMITED existing units (most recent)
             existing_to_link = existing_units[-MAX_LINKS_PER_ENTITY:]  # Take most recent
             for new_unit in new_units:
                 for existing_unit in existing_to_link:
-                    links.append(
-
+                    links.append(
+                        EntityLink(
+                            from_unit_id=to_uuid(new_unit), to_unit_id=to_uuid(existing_unit), entity_id=entity_uuid
+                        )
+                    )
+                    links.append(
+                        EntityLink(
+                            from_unit_id=to_uuid(existing_unit), to_unit_id=to_uuid(new_unit), entity_id=entity_uuid
+                        )
+                    )
 
-        _log(
-
+        _log(
+            log_buffer, f" [6.3.3] Generate {len(links)} links: {time.time() - link_gen_start:.3f}s", level="debug"
+        )
+        _log(
+            log_buffer,
+            f" [6.3] Entity link creation: {len(links)} links for {len(all_entity_ids)} unique entities in {time.time() - substep_start:.3f}s",
+            level="debug",
+        )
 
         return links
 
     except Exception as e:
         logger.error(f"Failed to extract entities in batch: {str(e)}")
         import traceback
+
         traceback.print_exc()
         raise
 
```
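The entity-link hunks above cap link generation per shared entity (MAX_LINKS_PER_ENTITY = 50) and emit every link in both directions. Below is a condensed sketch of that capping pattern; capped_entity_pairs is a hypothetical helper written for illustration, not a function in the package.

```python
from itertools import combinations

MAX_LINKS_PER_ENTITY = 50  # cap from the diff, used to avoid an N^2 link explosion


def capped_entity_pairs(new_units: list[str], existing_units: list[str], cap: int = MAX_LINKS_PER_ENTITY):
    """Yield bidirectional (from_unit, to_unit) pairs for one shared entity, bounded per entity."""
    new_to_link = new_units[-cap:] if len(new_units) > cap else new_units
    # Within-batch links, both directions.
    for a, b in combinations(new_to_link, 2):
        yield a, b
        yield b, a
    # Each new unit is linked only to the most recent `cap` existing units.
    for new_unit in new_units:
        for existing_unit in existing_units[-cap:]:
            yield new_unit, existing_unit
            yield existing_unit, new_unit
```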
```diff
@@ -327,9 +384,9 @@ async def extract_entities_batch_optimized(
 async def create_temporal_links_batch_per_fact(
     conn,
     bank_id: str,
-    unit_ids:
+    unit_ids: list[str],
     time_window_hours: int = 24,
-    log_buffer:
+    log_buffer: list[str] = None,
 ) -> int:
     """
     Create temporal links for multiple units, each with their own event_date.
@@ -361,10 +418,13 @@ async def create_temporal_links_batch_per_fact(
             FROM memory_units
             WHERE id::text = ANY($1)
             """,
-            unit_ids
+            unit_ids,
+        )
+        new_units = {str(row["id"]): row["event_date"] for row in rows}
+        _log(
+            log_buffer,
+            f" [7.1] Fetch event_dates for {len(unit_ids)} units: {time_mod.time() - fetch_dates_start:.3f}s",
         )
-        new_units = {str(row['id']): row['event_date'] for row in rows}
-        _log(log_buffer, f" [7.1] Fetch event_dates for {len(unit_ids)} units: {time_mod.time() - fetch_dates_start:.3f}s")
 
         # Fetch ALL potential temporal neighbors in ONE query (much faster!)
         # Get time range across all units with overflow protection
@@ -383,9 +443,12 @@ async def create_temporal_links_batch_per_fact(
             bank_id,
             min_date,
             max_date,
-            unit_ids
+            unit_ids,
+        )
+        _log(
+            log_buffer,
+            f" [7.2] Fetch {len(all_candidates)} candidate neighbors (1 query): {time_mod.time() - fetch_neighbors_start:.3f}s",
         )
-        _log(log_buffer, f" [7.2] Fetch {len(all_candidates)} candidate neighbors (1 query): {time_mod.time() - fetch_neighbors_start:.3f}s")
 
         # Filter and create links in memory (much faster than N queries)
         link_gen_start = time_mod.time()
@@ -408,8 +471,8 @@ async def create_temporal_links_batch_per_fact(
                 if time_diff_hours <= time_window_hours:
                     weight = max(0.3, 1.0 - (time_diff_hours / time_window_hours))
                     # Create bidirectional links
-                    links.append((unit_id, other_id,
-                    links.append((other_id, unit_id,
+                    links.append((unit_id, other_id, "temporal", weight, None))
+                    links.append((other_id, unit_id, "temporal", weight, None))
 
         _log(log_buffer, f" [7.3] Generate {len(links)} temporal links: {time_mod.time() - link_gen_start:.3f}s")
 
@@ -421,7 +484,7 @@ async def create_temporal_links_batch_per_fact(
                 VALUES ($1, $2, $3, $4, $5)
                 ON CONFLICT (from_unit_id, to_unit_id, link_type, COALESCE(entity_id, '00000000-0000-0000-0000-000000000000'::uuid)) DO NOTHING
                 """,
-                links
+                links,
             )
             _log(log_buffer, f" [7.4] Insert {len(links)} temporal links: {time_mod.time() - insert_start:.3f}s")
 
@@ -430,6 +493,7 @@ async def create_temporal_links_batch_per_fact(
     except Exception as e:
         logger.error(f"Failed to create temporal links: {str(e)}")
         import traceback
+
         traceback.print_exc()
         raise
 
```
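create_temporal_links_batch_per_fact relies on the same overflow guard shown earlier in compute_temporal_query_bounds: window edges that would fall outside the representable datetime range are clamped to datetime.min / datetime.max in UTC. A small illustrative helper follows; the name safe_window is hypothetical, the try/except shape is taken from the diff.

```python
from datetime import UTC, datetime, timedelta


def safe_window(event_date: datetime, time_window_hours: int = 24) -> tuple[datetime, datetime]:
    """Return (lower, upper) bounds, clamped when the window would overflow the datetime range."""
    try:
        lower = event_date - timedelta(hours=time_window_hours)
    except OverflowError:
        lower = datetime.min.replace(tzinfo=UTC)
    try:
        upper = event_date + timedelta(hours=time_window_hours)
    except OverflowError:
        upper = datetime.max.replace(tzinfo=UTC)
    return lower, upper
```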
```diff
@@ -437,11 +501,11 @@ async def create_temporal_links_batch_per_fact(
 async def create_semantic_links_batch(
     conn,
     bank_id: str,
-    unit_ids:
-    embeddings:
+    unit_ids: list[str],
+    embeddings: list[list[float]],
     top_k: int = 5,
     threshold: float = 0.7,
-    log_buffer:
+    log_buffer: list[str] = None,
 ) -> int:
     """
     Create semantic links for multiple units efficiently.
@@ -465,6 +529,7 @@ async def create_semantic_links_batch(
 
     try:
         import time as time_mod
+
         import numpy as np
 
         # Fetch ALL existing units with embeddings in ONE query
@@ -478,9 +543,12 @@ async def create_semantic_links_batch(
             AND id::text != ALL($2)
             """,
             bank_id,
-            unit_ids
+            unit_ids,
+        )
+        _log(
+            log_buffer,
+            f" [8.1] Fetch {len(all_existing)} existing embeddings (1 query): {time_mod.time() - fetch_start:.3f}s",
         )
-        _log(log_buffer, f" [8.1] Fetch {len(all_existing)} existing embeddings (1 query): {time_mod.time() - fetch_start:.3f}s")
 
         # Convert to numpy for vectorized similarity computation
         compute_start = time_mod.time()
@@ -488,15 +556,16 @@ async def create_semantic_links_batch(
 
         if all_existing:
             # Convert existing embeddings to numpy array
-            existing_ids = [str(row[
+            existing_ids = [str(row["id"]) for row in all_existing]
             # Stack embeddings as 2D array: (num_embeddings, embedding_dim)
             embedding_arrays = []
             for row in all_existing:
-                raw_emb = row[
+                raw_emb = row["embedding"]
                 # Handle different pgvector formats
                 if isinstance(raw_emb, str):
                     # Parse string format: "[1.0, 2.0, ...]"
                     import json
+
                     emb = np.array(json.loads(raw_emb), dtype=np.float32)
                 elif isinstance(raw_emb, (list, tuple)):
                     emb = np.array(raw_emb, dtype=np.float32)
@@ -537,7 +606,7 @@ async def create_semantic_links_batch(
                     similar_id = existing_ids[idx]
                     # Clamp to [0, 1] to handle floating point precision issues
                     similarity = float(min(1.0, max(0.0, similarities[idx])))
-                    all_links.append((unit_id, similar_id,
+                    all_links.append((unit_id, similar_id, "semantic", similarity, None))
 
         # Also compute similarities WITHIN the new batch (new units to each other)
         # Apply the same top_k limit per unit as we do for existing units
@@ -565,9 +634,12 @@ async def create_semantic_links_batch(
                     other_id = unit_ids[other_idx]
                     # Clamp to [0, 1] to handle floating point precision issues
                     similarity = float(min(1.0, max(0.0, similarities[local_idx])))
-                    all_links.append((unit_id, other_id,
+                    all_links.append((unit_id, other_id, "semantic", similarity, None))
 
-        _log(
+        _log(
+            log_buffer,
+            f" [8.2] Compute similarities & generate {len(all_links)} semantic links: {time_mod.time() - compute_start:.3f}s",
+        )
 
         if all_links:
             insert_start = time_mod.time()
@@ -577,20 +649,23 @@ async def create_semantic_links_batch(
                 VALUES ($1, $2, $3, $4, $5)
                 ON CONFLICT (from_unit_id, to_unit_id, link_type, COALESCE(entity_id, '00000000-0000-0000-0000-000000000000'::uuid)) DO NOTHING
                 """,
-                all_links
+                all_links,
+            )
+            _log(
+                log_buffer, f" [8.3] Insert {len(all_links)} semantic links: {time_mod.time() - insert_start:.3f}s"
             )
-            _log(log_buffer, f" [8.3] Insert {len(all_links)} semantic links: {time_mod.time() - insert_start:.3f}s")
 
         return len(all_links)
 
     except Exception as e:
         logger.error(f"Failed to create semantic links: {str(e)}")
         import traceback
+
         traceback.print_exc()
         raise
 
 
-async def insert_entity_links_batch(conn, links:
+async def insert_entity_links_batch(conn, links: list[EntityLink], chunk_size: int = 50000):
     """
     Insert all entity links using COPY to temp table + INSERT for maximum speed.
 
```
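The semantic-link hunks only show the edges of the similarity step: stacking pgvector embeddings into a float32 matrix, clamping scores to [0, 1], and honoring the top_k and threshold parameters. The sketch below is one plausible way to fill in the middle with a vectorized cosine similarity; it is an assumption, not the code create_semantic_links_batch actually uses, and the helper name is invented for illustration.

```python
import numpy as np


def semantic_candidates(
    new_embedding: list[float],
    existing_matrix: np.ndarray,   # shape (num_existing, dim), float32
    existing_ids: list[str],
    top_k: int = 5,
    threshold: float = 0.7,
) -> list[tuple[str, float]]:
    """Return up to top_k (unit_id, similarity) pairs whose cosine similarity clears the threshold."""
    q = np.asarray(new_embedding, dtype=np.float32)
    q = q / (np.linalg.norm(q) + 1e-12)
    m = existing_matrix / (np.linalg.norm(existing_matrix, axis=1, keepdims=True) + 1e-12)
    sims = m @ q  # one vectorized pass over all existing units
    candidates = []
    for idx in np.argsort(sims)[::-1][:top_k]:
        similarity = float(min(1.0, max(0.0, sims[idx])))  # clamp, as the diff does
        if similarity >= threshold:
            candidates.append((existing_ids[idx], similarity))
    return candidates
```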
```diff
@@ -606,7 +681,6 @@ async def insert_entity_links_batch(conn, links: List[EntityLink], chunk_size: i
     if not links:
         return
 
-    import uuid as uuid_mod
     import time as time_mod
 
     total_start = time_mod.time()
@@ -633,21 +707,15 @@ async def insert_entity_links_batch(conn, links: List[EntityLink], chunk_size: i
         convert_start = time_mod.time()
         records = []
         for link in links:
-            records.append((
-                link.from_unit_id,
-                link.to_unit_id,
-                link.link_type,
-                link.weight,
-                link.entity_id
-            ))
+            records.append((link.from_unit_id, link.to_unit_id, link.link_type, link.weight, link.entity_id))
         logger.debug(f" [9.3] Convert {len(records)} records: {time_mod.time() - convert_start:.3f}s")
 
         # Bulk load using COPY (fastest method)
         copy_start = time_mod.time()
         await conn.copy_records_to_table(
-
+            "_temp_entity_links",
             records=records,
-            columns=[
+            columns=["from_unit_id", "to_unit_id", "link_type", "weight", "entity_id"],
         )
         logger.debug(f" [9.4] COPY {len(records)} records to temp table: {time_mod.time() - copy_start:.3f}s")
 
```
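insert_entity_links_batch bulk-loads link rows through asyncpg's COPY path (copy_records_to_table into _temp_entity_links) and then inserts from the temp table so that ON CONFLICT can drop duplicates. A minimal sketch of that pattern follows, with assumptions flagged: the temp-table DDL and the memory_links target table name are not visible in this diff and are illustrative.

```python
import asyncpg


async def bulk_insert_links(conn: asyncpg.Connection, records: list[tuple]) -> None:
    """COPY rows into a temp table, then INSERT ... ON CONFLICT DO NOTHING into the real table."""
    # Assumed DDL; the diff only shows the COPY call and its column list.
    await conn.execute(
        """
        CREATE TEMP TABLE IF NOT EXISTS _temp_entity_links (
            from_unit_id uuid, to_unit_id uuid, link_type text, weight float8, entity_id uuid
        )
        """
    )
    await conn.copy_records_to_table(
        "_temp_entity_links",
        records=records,
        columns=["from_unit_id", "to_unit_id", "link_type", "weight", "entity_id"],
    )
    # "memory_links" is a placeholder target table name.
    await conn.execute(
        """
        INSERT INTO memory_links (from_unit_id, to_unit_id, link_type, weight, entity_id)
        SELECT from_unit_id, to_unit_id, link_type, weight, entity_id
        FROM _temp_entity_links
        ON CONFLICT DO NOTHING
        """
    )
```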
```diff
@@ -665,8 +733,8 @@ async def insert_entity_links_batch(conn, links: List[EntityLink], chunk_size: i
 
 async def create_causal_links_batch(
     conn,
-    unit_ids:
-    causal_relations_per_fact:
+    unit_ids: list[str],
+    causal_relations_per_fact: list[list[dict]],
 ) -> int:
     """
     Create causal links between facts based on LLM-extracted causal relationships.
@@ -694,6 +762,7 @@ async def create_causal_links_batch(
 
     try:
         import time as time_mod
+
         create_start = time_mod.time()
 
         # Build links list
@@ -705,12 +774,12 @@ async def create_causal_links_batch(
             from_unit_id = unit_ids[fact_idx]
 
             for relation in causal_relations:
-                target_idx = relation[
-                relation_type = relation[
-                strength = relation.get(
+                target_idx = relation["target_fact_index"]
+                relation_type = relation["relation_type"]
+                strength = relation.get("strength", 1.0)
 
                 # Validate relation_type - must match database constraint
-                valid_types = {
+                valid_types = {"causes", "caused_by", "enables", "prevents"}
                 if relation_type not in valid_types:
                     logger.error(
                         f"Invalid relation_type '{relation_type}' (type: {type(relation_type).__name__}) "
@@ -735,7 +804,6 @@ async def create_causal_links_batch(
                 # weight is the strength of the relationship
                 links.append((from_unit_id, to_unit_id, relation_type, strength, None))
 
-
         if links:
             insert_start = time_mod.time()
             try:
@@ -745,14 +813,16 @@ async def create_causal_links_batch(
                     VALUES ($1, $2, $3, $4, $5)
                     ON CONFLICT (from_unit_id, to_unit_id, link_type, COALESCE(entity_id, '00000000-0000-0000-0000-000000000000'::uuid)) DO NOTHING
                     """,
-                    links
+                    links,
                 )
             except Exception as db_error:
                 # Log the actual data being inserted for debugging
                 logger.error(f"Database insert failed for causal links. Error: {db_error}")
                 logger.error(f"Attempted to insert {len(links)} links. First few:")
                 for i, link in enumerate(links[:3]):
-                    logger.error(
+                    logger.error(
+                        f" Link {i}: from={link[0]}, to={link[1]}, type='{link[2]}' (repr={repr(link[2])}), weight={link[3]}, entity={link[4]}"
+                    )
                 raise
 
         return len(links)
@@ -760,5 +830,6 @@ async def create_causal_links_batch(
     except Exception as e:
         logger.error(f"Failed to create causal links: {str(e)}")
         import traceback
+
         traceback.print_exc()
         raise
```