hindsight-api 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in the public registry.
Files changed (64)
  1. hindsight_api/__init__.py +10 -9
  2. hindsight_api/alembic/env.py +5 -8
  3. hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +266 -180
  4. hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +32 -32
  5. hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +11 -11
  6. hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +7 -12
  7. hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +23 -15
  8. hindsight_api/alembic/versions/rename_personality_to_disposition.py +30 -21
  9. hindsight_api/api/__init__.py +10 -10
  10. hindsight_api/api/http.py +575 -593
  11. hindsight_api/api/mcp.py +30 -28
  12. hindsight_api/banner.py +13 -6
  13. hindsight_api/config.py +9 -13
  14. hindsight_api/engine/__init__.py +9 -9
  15. hindsight_api/engine/cross_encoder.py +22 -21
  16. hindsight_api/engine/db_utils.py +5 -4
  17. hindsight_api/engine/embeddings.py +22 -21
  18. hindsight_api/engine/entity_resolver.py +81 -75
  19. hindsight_api/engine/llm_wrapper.py +61 -79
  20. hindsight_api/engine/memory_engine.py +603 -625
  21. hindsight_api/engine/query_analyzer.py +100 -97
  22. hindsight_api/engine/response_models.py +105 -106
  23. hindsight_api/engine/retain/__init__.py +9 -16
  24. hindsight_api/engine/retain/bank_utils.py +34 -58
  25. hindsight_api/engine/retain/chunk_storage.py +4 -12
  26. hindsight_api/engine/retain/deduplication.py +9 -28
  27. hindsight_api/engine/retain/embedding_processing.py +4 -11
  28. hindsight_api/engine/retain/embedding_utils.py +3 -4
  29. hindsight_api/engine/retain/entity_processing.py +7 -17
  30. hindsight_api/engine/retain/fact_extraction.py +155 -165
  31. hindsight_api/engine/retain/fact_storage.py +11 -23
  32. hindsight_api/engine/retain/link_creation.py +11 -39
  33. hindsight_api/engine/retain/link_utils.py +166 -95
  34. hindsight_api/engine/retain/observation_regeneration.py +39 -52
  35. hindsight_api/engine/retain/orchestrator.py +72 -62
  36. hindsight_api/engine/retain/types.py +49 -43
  37. hindsight_api/engine/search/__init__.py +5 -5
  38. hindsight_api/engine/search/fusion.py +6 -15
  39. hindsight_api/engine/search/graph_retrieval.py +22 -23
  40. hindsight_api/engine/search/mpfp_retrieval.py +76 -92
  41. hindsight_api/engine/search/observation_utils.py +9 -16
  42. hindsight_api/engine/search/reranking.py +4 -7
  43. hindsight_api/engine/search/retrieval.py +87 -66
  44. hindsight_api/engine/search/scoring.py +5 -7
  45. hindsight_api/engine/search/temporal_extraction.py +8 -11
  46. hindsight_api/engine/search/think_utils.py +115 -39
  47. hindsight_api/engine/search/trace.py +68 -39
  48. hindsight_api/engine/search/tracer.py +44 -35
  49. hindsight_api/engine/search/types.py +20 -17
  50. hindsight_api/engine/task_backend.py +21 -26
  51. hindsight_api/engine/utils.py +25 -10
  52. hindsight_api/main.py +21 -40
  53. hindsight_api/mcp_local.py +190 -0
  54. hindsight_api/metrics.py +44 -30
  55. hindsight_api/migrations.py +10 -8
  56. hindsight_api/models.py +60 -72
  57. hindsight_api/pg0.py +22 -23
  58. hindsight_api/server.py +3 -6
  59. hindsight_api-0.1.7.dist-info/METADATA +178 -0
  60. hindsight_api-0.1.7.dist-info/RECORD +64 -0
  61. {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.7.dist-info}/entry_points.txt +1 -0
  62. hindsight_api-0.1.5.dist-info/METADATA +0 -42
  63. hindsight_api-0.1.5.dist-info/RECORD +0 -63
  64. {hindsight_api-0.1.5.dist-info → hindsight_api-0.1.7.dist-info}/WHEEL +0 -0
@@ -2,10 +2,9 @@
  Link creation utilities for temporal, semantic, and entity links.
  """

- import time
  import logging
- from typing import List
- from datetime import timedelta, datetime, timezone
+ import time
+ from datetime import UTC, datetime, timedelta
  from uuid import UUID

  from .types import EntityLink
@@ -19,7 +18,7 @@ def _normalize_datetime(dt):
  return None
  if dt.tzinfo is None:
  # Naive datetime - assume UTC
- return dt.replace(tzinfo=timezone.utc)
+ return dt.replace(tzinfo=UTC)
  return dt

@@ -54,24 +53,26 @@ def compute_temporal_links(
  try:
  time_lower = unit_event_date_norm - timedelta(hours=time_window_hours)
  except OverflowError:
- time_lower = datetime.min.replace(tzinfo=timezone.utc)
+ time_lower = datetime.min.replace(tzinfo=UTC)
  try:
  time_upper = unit_event_date_norm + timedelta(hours=time_window_hours)
  except OverflowError:
- time_upper = datetime.max.replace(tzinfo=timezone.utc)
+ time_upper = datetime.max.replace(tzinfo=UTC)

  # Filter candidates within this unit's time window
  matching_neighbors = [
- (row['id'], row['event_date'])
+ (row["id"], row["event_date"])
  for row in candidates
- if time_lower <= _normalize_datetime(row['event_date']) <= time_upper
+ if time_lower <= _normalize_datetime(row["event_date"]) <= time_upper
  ][:10] # Limit to top 10

  for recent_id, recent_event_date in matching_neighbors:
  # Calculate temporal proximity weight
- time_diff_hours = abs((unit_event_date_norm - _normalize_datetime(recent_event_date)).total_seconds() / 3600)
+ time_diff_hours = abs(
+ (unit_event_date_norm - _normalize_datetime(recent_event_date)).total_seconds() / 3600
+ )
  weight = max(0.3, 1.0 - (time_diff_hours / time_window_hours))
- links.append((unit_id, str(recent_id), 'temporal', weight, None))
+ links.append((unit_id, str(recent_id), "temporal", weight, None))

  return links

@@ -99,17 +100,17 @@ def compute_temporal_query_bounds(
  try:
  min_date = min(all_dates) - timedelta(hours=time_window_hours)
  except OverflowError:
- min_date = datetime.min.replace(tzinfo=timezone.utc)
+ min_date = datetime.min.replace(tzinfo=UTC)

  try:
  max_date = max(all_dates) + timedelta(hours=time_window_hours)
  except OverflowError:
- max_date = datetime.max.replace(tzinfo=timezone.utc)
+ max_date = datetime.max.replace(tzinfo=UTC)

  return min_date, max_date


- def _log(log_buffer, message, level='info'):
+ def _log(log_buffer, message, level="info"):
  """Helper to log to buffer if available, otherwise use logger.

  Args:
@@ -117,7 +118,7 @@ def _log(log_buffer, message, level='info'):
  message: The log message
  level: 'info', 'debug', 'warning', or 'error'. Debug messages are not added to buffer.
  """
- if level == 'debug':
+ if level == "debug":
  # Debug messages only go to logger, not to buffer
  logger.debug(message)
  return
@@ -125,23 +126,23 @@ def _log(log_buffer, message, level='info'):
  if log_buffer is not None:
  log_buffer.append(message)
  else:
- if level == 'info':
+ if level == "info":
  logger.info(message)
  else:
- logger.log(logging.WARNING if level == 'warning' else logging.ERROR, message)
+ logger.log(logging.WARNING if level == "warning" else logging.ERROR, message)


  async def extract_entities_batch_optimized(
  entity_resolver,
  conn,
  bank_id: str,
- unit_ids: List[str],
- sentences: List[str],
+ unit_ids: list[str],
+ sentences: list[str],
  context: str,
- fact_dates: List,
- llm_entities: List[List[dict]],
- log_buffer: List[str] = None,
- ) -> List[tuple]:
+ fact_dates: list,
+ llm_entities: list[list[dict]],
+ log_buffer: list[str] = None,
+ ) -> list[tuple]:
  """
  Process LLM-extracted entities for ALL facts in batch.

@@ -171,15 +172,19 @@ async def extract_entities_batch_optimized(
  formatted_entities = []
  for ent in entity_list:
  # Handle both Entity objects and dicts
- if hasattr(ent, 'text'):
+ if hasattr(ent, "text"):
  # Entity objects only have 'text', default type to 'CONCEPT'
- formatted_entities.append({'text': ent.text, 'type': 'CONCEPT'})
+ formatted_entities.append({"text": ent.text, "type": "CONCEPT"})
  elif isinstance(ent, dict):
- formatted_entities.append({'text': ent.get('text', ''), 'type': ent.get('type', 'CONCEPT')})
+ formatted_entities.append({"text": ent.get("text", ""), "type": ent.get("type", "CONCEPT")})
  all_entities.append(formatted_entities)

  total_entities = sum(len(ents) for ents in all_entities)
- _log(log_buffer, f" [6.1] Process LLM entities: {total_entities} entities from {len(sentences)} facts in {time.time() - substep_start:.3f}s", level='debug')
+ _log(
+ log_buffer,
+ f" [6.1] Process LLM entities: {total_entities} entities from {len(sentences)} facts in {time.time() - substep_start:.3f}s",
+ level="debug",
+ )

  # Step 2: Resolve entities in BATCH (much faster!)
  substep_start = time.time()
@@ -195,13 +200,19 @@ async def extract_entities_batch_optimized(
  continue

  for local_idx, entity in enumerate(entities):
- all_entities_flat.append({
- 'text': entity['text'],
- 'type': entity['type'],
- 'nearby_entities': entities,
- })
+ all_entities_flat.append(
+ {
+ "text": entity["text"],
+ "type": entity["type"],
+ "nearby_entities": entities,
+ }
+ )
  entity_to_unit.append((unit_id, local_idx, fact_date))
- _log(log_buffer, f" [6.2.1] Prepare entities: {len(all_entities_flat)} entities in {time.time() - substep_6_2_1_start:.3f}s", level='debug')
+ _log(
+ log_buffer,
+ f" [6.2.1] Prepare entities: {len(all_entities_flat)} entities in {time.time() - substep_6_2_1_start:.3f}s",
+ level="debug",
+ )

  # Resolve ALL entities in one batch call
  if all_entities_flat:
@@ -210,7 +221,7 @@ async def extract_entities_batch_optimized(

  # Add per-entity dates to entity data for batch resolution
  for idx, (unit_id, local_idx, fact_date) in enumerate(entity_to_unit):
- all_entities_flat[idx]['event_date'] = fact_date
+ all_entities_flat[idx]["event_date"] = fact_date

  # Resolve ALL entities in ONE batch call (much faster than sequential buckets)
  # INSERT ... ON CONFLICT handles any race conditions at the DB level
@@ -219,10 +230,14 @@ async def extract_entities_batch_optimized(
  entities_data=all_entities_flat,
  context=context,
  unit_event_date=None, # Not used when per-entity dates provided
- conn=conn # Use main transaction connection
+ conn=conn, # Use main transaction connection
  )

- _log(log_buffer, f" [6.2.2] Resolve entities: {len(all_entities_flat)} entities in single batch in {time.time() - substep_6_2_2_start:.3f}s", level='debug')
+ _log(
+ log_buffer,
+ f" [6.2.2] Resolve entities: {len(all_entities_flat)} entities in single batch in {time.time() - substep_6_2_2_start:.3f}s",
+ level="debug",
+ )

  # [6.2.3] Create unit-entity links in BATCH
  substep_6_2_3_start = time.time()
@@ -239,12 +254,24 @@ async def extract_entities_batch_optimized(

  # Batch insert all unit-entity links (MUCH faster!)
  await entity_resolver.link_units_to_entities_batch(unit_entity_pairs, conn=conn)
- _log(log_buffer, f" [6.2.3] Create unit-entity links (batched): {len(unit_entity_pairs)} links in {time.time() - substep_6_2_3_start:.3f}s", level='debug')
+ _log(
+ log_buffer,
+ f" [6.2.3] Create unit-entity links (batched): {len(unit_entity_pairs)} links in {time.time() - substep_6_2_3_start:.3f}s",
+ level="debug",
+ )

- _log(log_buffer, f" [6.2] Entity resolution (batched): {len(all_entities_flat)} entities resolved in {time.time() - step_6_2_start:.3f}s", level='debug')
+ _log(
+ log_buffer,
+ f" [6.2] Entity resolution (batched): {len(all_entities_flat)} entities resolved in {time.time() - step_6_2_start:.3f}s",
+ level="debug",
+ )
  else:
  unit_to_entity_ids = {}
- _log(log_buffer, f" [6.2] Entity resolution (batched): 0 entities in {time.time() - step_6_2_start:.3f}s", level='debug')
+ _log(
+ log_buffer,
+ f" [6.2] Entity resolution (batched): 0 entities in {time.time() - step_6_2_start:.3f}s",
+ level="debug",
+ )

  # Step 3: Create entity links between units that share entities
  substep_start = time.time()
@@ -253,13 +280,14 @@ async def extract_entities_batch_optimized(
  for entity_ids in unit_to_entity_ids.values():
  all_entity_ids.update(entity_ids)

- _log(log_buffer, f" [6.3] Creating entity links for {len(all_entity_ids)} unique entities...", level='debug')
+ _log(log_buffer, f" [6.3] Creating entity links for {len(all_entity_ids)} unique entities...", level="debug")

  # Find all units that reference these entities (ONE batched query)
  entity_to_units = {}
  if all_entity_ids:
  query_start = time.time()
  import uuid
+
  entity_id_list = [uuid.UUID(eid) if isinstance(eid, str) else eid for eid in all_entity_ids]
  rows = await conn.fetch(
  """
@@ -267,25 +295,29 @@ async def extract_entities_batch_optimized(
  FROM unit_entities
  WHERE entity_id = ANY($1::uuid[])
  """,
- entity_id_list
+ entity_id_list,
+ )
+ _log(
+ log_buffer,
+ f" [6.3.1] Query unit_entities: {len(rows)} rows in {time.time() - query_start:.3f}s",
+ level="debug",
  )
- _log(log_buffer, f" [6.3.1] Query unit_entities: {len(rows)} rows in {time.time() - query_start:.3f}s", level='debug')

  # Group by entity_id
  group_start = time.time()
  for row in rows:
- entity_id = row['entity_id']
+ entity_id = row["entity_id"]
  if entity_id not in entity_to_units:
  entity_to_units[entity_id] = []
- entity_to_units[entity_id].append(row['unit_id'])
- _log(log_buffer, f" [6.3.2] Group by entity_id: {time.time() - group_start:.3f}s", level='debug')
+ entity_to_units[entity_id].append(row["unit_id"])
+ _log(log_buffer, f" [6.3.2] Group by entity_id: {time.time() - group_start:.3f}s", level="debug")

  # Create bidirectional links between units that share entities
  # OPTIMIZATION: Limit links per entity to avoid N² explosion
  # Only link each new unit to the most recent MAX_LINKS_PER_ENTITY units
  MAX_LINKS_PER_ENTITY = 50 # Limit to prevent explosion when entity appears in many facts
  link_gen_start = time.time()
- links: List[EntityLink] = []
+ links: list[EntityLink] = []
  new_unit_set = set(unit_ids) # Units from this batch

  def to_uuid(val) -> UUID:
299
331
 
300
332
  # Link new units to each other (within batch) - also limited
301
333
  # For very common entities, limit within-batch links too
302
- new_units_to_link = new_units[-MAX_LINKS_PER_ENTITY:] if len(new_units) > MAX_LINKS_PER_ENTITY else new_units
334
+ new_units_to_link = (
335
+ new_units[-MAX_LINKS_PER_ENTITY:] if len(new_units) > MAX_LINKS_PER_ENTITY else new_units
336
+ )
303
337
  for i, unit_id_1 in enumerate(new_units_to_link):
304
- for unit_id_2 in new_units_to_link[i+1:]:
305
- links.append(EntityLink(from_unit_id=to_uuid(unit_id_1), to_unit_id=to_uuid(unit_id_2), entity_id=entity_uuid))
306
- links.append(EntityLink(from_unit_id=to_uuid(unit_id_2), to_unit_id=to_uuid(unit_id_1), entity_id=entity_uuid))
338
+ for unit_id_2 in new_units_to_link[i + 1 :]:
339
+ links.append(
340
+ EntityLink(
341
+ from_unit_id=to_uuid(unit_id_1), to_unit_id=to_uuid(unit_id_2), entity_id=entity_uuid
342
+ )
343
+ )
344
+ links.append(
345
+ EntityLink(
346
+ from_unit_id=to_uuid(unit_id_2), to_unit_id=to_uuid(unit_id_1), entity_id=entity_uuid
347
+ )
348
+ )
307
349
 
308
350
  # Link new units to LIMITED existing units (most recent)
309
351
  existing_to_link = existing_units[-MAX_LINKS_PER_ENTITY:] # Take most recent
310
352
  for new_unit in new_units:
311
353
  for existing_unit in existing_to_link:
312
- links.append(EntityLink(from_unit_id=to_uuid(new_unit), to_unit_id=to_uuid(existing_unit), entity_id=entity_uuid))
313
- links.append(EntityLink(from_unit_id=to_uuid(existing_unit), to_unit_id=to_uuid(new_unit), entity_id=entity_uuid))
354
+ links.append(
355
+ EntityLink(
356
+ from_unit_id=to_uuid(new_unit), to_unit_id=to_uuid(existing_unit), entity_id=entity_uuid
357
+ )
358
+ )
359
+ links.append(
360
+ EntityLink(
361
+ from_unit_id=to_uuid(existing_unit), to_unit_id=to_uuid(new_unit), entity_id=entity_uuid
362
+ )
363
+ )
314
364
 
315
- _log(log_buffer, f" [6.3.3] Generate {len(links)} links: {time.time() - link_gen_start:.3f}s", level='debug')
316
- _log(log_buffer, f" [6.3] Entity link creation: {len(links)} links for {len(all_entity_ids)} unique entities in {time.time() - substep_start:.3f}s", level='debug')
365
+ _log(
366
+ log_buffer, f" [6.3.3] Generate {len(links)} links: {time.time() - link_gen_start:.3f}s", level="debug"
367
+ )
368
+ _log(
369
+ log_buffer,
370
+ f" [6.3] Entity link creation: {len(links)} links for {len(all_entity_ids)} unique entities in {time.time() - substep_start:.3f}s",
371
+ level="debug",
372
+ )
317
373
 
318
374
  return links
319
375
 
320
376
  except Exception as e:
321
377
  logger.error(f"Failed to extract entities in batch: {str(e)}")
322
378
  import traceback
379
+
323
380
  traceback.print_exc()
324
381
  raise
325
382
 
@@ -327,9 +384,9 @@ async def extract_entities_batch_optimized(
  async def create_temporal_links_batch_per_fact(
  conn,
  bank_id: str,
- unit_ids: List[str],
+ unit_ids: list[str],
  time_window_hours: int = 24,
- log_buffer: List[str] = None,
+ log_buffer: list[str] = None,
  ) -> int:
  """
  Create temporal links for multiple units, each with their own event_date.
@@ -361,10 +418,13 @@ async def create_temporal_links_batch_per_fact(
  FROM memory_units
  WHERE id::text = ANY($1)
  """,
- unit_ids
+ unit_ids,
+ )
+ new_units = {str(row["id"]): row["event_date"] for row in rows}
+ _log(
+ log_buffer,
+ f" [7.1] Fetch event_dates for {len(unit_ids)} units: {time_mod.time() - fetch_dates_start:.3f}s",
  )
- new_units = {str(row['id']): row['event_date'] for row in rows}
- _log(log_buffer, f" [7.1] Fetch event_dates for {len(unit_ids)} units: {time_mod.time() - fetch_dates_start:.3f}s")

  # Fetch ALL potential temporal neighbors in ONE query (much faster!)
  # Get time range across all units with overflow protection
@@ -383,9 +443,12 @@ async def create_temporal_links_batch_per_fact(
  bank_id,
  min_date,
  max_date,
- unit_ids
+ unit_ids,
+ )
+ _log(
+ log_buffer,
+ f" [7.2] Fetch {len(all_candidates)} candidate neighbors (1 query): {time_mod.time() - fetch_neighbors_start:.3f}s",
  )
- _log(log_buffer, f" [7.2] Fetch {len(all_candidates)} candidate neighbors (1 query): {time_mod.time() - fetch_neighbors_start:.3f}s")

  # Filter and create links in memory (much faster than N queries)
  link_gen_start = time_mod.time()
@@ -408,8 +471,8 @@ async def create_temporal_links_batch_per_fact(
  if time_diff_hours <= time_window_hours:
  weight = max(0.3, 1.0 - (time_diff_hours / time_window_hours))
  # Create bidirectional links
- links.append((unit_id, other_id, 'temporal', weight, None))
- links.append((other_id, unit_id, 'temporal', weight, None))
+ links.append((unit_id, other_id, "temporal", weight, None))
+ links.append((other_id, unit_id, "temporal", weight, None))

  _log(log_buffer, f" [7.3] Generate {len(links)} temporal links: {time_mod.time() - link_gen_start:.3f}s")

@@ -421,7 +484,7 @@ async def create_temporal_links_batch_per_fact(
  VALUES ($1, $2, $3, $4, $5)
  ON CONFLICT (from_unit_id, to_unit_id, link_type, COALESCE(entity_id, '00000000-0000-0000-0000-000000000000'::uuid)) DO NOTHING
  """,
- links
+ links,
  )
  _log(log_buffer, f" [7.4] Insert {len(links)} temporal links: {time_mod.time() - insert_start:.3f}s")

@@ -430,6 +493,7 @@ async def create_temporal_links_batch_per_fact(
  except Exception as e:
  logger.error(f"Failed to create temporal links: {str(e)}")
  import traceback
+
  traceback.print_exc()
  raise

@@ -437,11 +501,11 @@ async def create_temporal_links_batch_per_fact(
  async def create_semantic_links_batch(
  conn,
  bank_id: str,
- unit_ids: List[str],
- embeddings: List[List[float]],
+ unit_ids: list[str],
+ embeddings: list[list[float]],
  top_k: int = 5,
  threshold: float = 0.7,
- log_buffer: List[str] = None,
+ log_buffer: list[str] = None,
  ) -> int:
  """
  Create semantic links for multiple units efficiently.
@@ -465,6 +529,7 @@ async def create_semantic_links_batch(

  try:
  import time as time_mod
+
  import numpy as np

  # Fetch ALL existing units with embeddings in ONE query
@@ -478,9 +543,12 @@ async def create_semantic_links_batch(
  AND id::text != ALL($2)
  """,
  bank_id,
- unit_ids
+ unit_ids,
+ )
+ _log(
+ log_buffer,
+ f" [8.1] Fetch {len(all_existing)} existing embeddings (1 query): {time_mod.time() - fetch_start:.3f}s",
  )
- _log(log_buffer, f" [8.1] Fetch {len(all_existing)} existing embeddings (1 query): {time_mod.time() - fetch_start:.3f}s")

  # Convert to numpy for vectorized similarity computation
  compute_start = time_mod.time()
@@ -488,15 +556,16 @@ async def create_semantic_links_batch(

  if all_existing:
  # Convert existing embeddings to numpy array
- existing_ids = [str(row['id']) for row in all_existing]
+ existing_ids = [str(row["id"]) for row in all_existing]
  # Stack embeddings as 2D array: (num_embeddings, embedding_dim)
  embedding_arrays = []
  for row in all_existing:
- raw_emb = row['embedding']
+ raw_emb = row["embedding"]
  # Handle different pgvector formats
  if isinstance(raw_emb, str):
  # Parse string format: "[1.0, 2.0, ...]"
  import json
+
  emb = np.array(json.loads(raw_emb), dtype=np.float32)
  elif isinstance(raw_emb, (list, tuple)):
  emb = np.array(raw_emb, dtype=np.float32)
@@ -537,7 +606,7 @@ async def create_semantic_links_batch(
  similar_id = existing_ids[idx]
  # Clamp to [0, 1] to handle floating point precision issues
  similarity = float(min(1.0, max(0.0, similarities[idx])))
- all_links.append((unit_id, similar_id, 'semantic', similarity, None))
+ all_links.append((unit_id, similar_id, "semantic", similarity, None))

  # Also compute similarities WITHIN the new batch (new units to each other)
  # Apply the same top_k limit per unit as we do for existing units
@@ -565,9 +634,12 @@ async def create_semantic_links_batch(
  other_id = unit_ids[other_idx]
  # Clamp to [0, 1] to handle floating point precision issues
  similarity = float(min(1.0, max(0.0, similarities[local_idx])))
- all_links.append((unit_id, other_id, 'semantic', similarity, None))
+ all_links.append((unit_id, other_id, "semantic", similarity, None))

- _log(log_buffer, f" [8.2] Compute similarities & generate {len(all_links)} semantic links: {time_mod.time() - compute_start:.3f}s")
+ _log(
+ log_buffer,
+ f" [8.2] Compute similarities & generate {len(all_links)} semantic links: {time_mod.time() - compute_start:.3f}s",
+ )

  if all_links:
  insert_start = time_mod.time()
577
649
  VALUES ($1, $2, $3, $4, $5)
578
650
  ON CONFLICT (from_unit_id, to_unit_id, link_type, COALESCE(entity_id, '00000000-0000-0000-0000-000000000000'::uuid)) DO NOTHING
579
651
  """,
580
- all_links
652
+ all_links,
653
+ )
654
+ _log(
655
+ log_buffer, f" [8.3] Insert {len(all_links)} semantic links: {time_mod.time() - insert_start:.3f}s"
581
656
  )
582
- _log(log_buffer, f" [8.3] Insert {len(all_links)} semantic links: {time_mod.time() - insert_start:.3f}s")
583
657
 
584
658
  return len(all_links)
585
659
 
586
660
  except Exception as e:
587
661
  logger.error(f"Failed to create semantic links: {str(e)}")
588
662
  import traceback
663
+
589
664
  traceback.print_exc()
590
665
  raise
591
666
 
592
667
 
593
- async def insert_entity_links_batch(conn, links: List[EntityLink], chunk_size: int = 50000):
668
+ async def insert_entity_links_batch(conn, links: list[EntityLink], chunk_size: int = 50000):
594
669
  """
595
670
  Insert all entity links using COPY to temp table + INSERT for maximum speed.
596
671
 
@@ -606,7 +681,6 @@ async def insert_entity_links_batch(conn, links: List[EntityLink], chunk_size: i
  if not links:
  return

- import uuid as uuid_mod
  import time as time_mod

  total_start = time_mod.time()
633
707
  convert_start = time_mod.time()
634
708
  records = []
635
709
  for link in links:
636
- records.append((
637
- link.from_unit_id,
638
- link.to_unit_id,
639
- link.link_type,
640
- link.weight,
641
- link.entity_id
642
- ))
710
+ records.append((link.from_unit_id, link.to_unit_id, link.link_type, link.weight, link.entity_id))
643
711
  logger.debug(f" [9.3] Convert {len(records)} records: {time_mod.time() - convert_start:.3f}s")
644
712
 
645
713
  # Bulk load using COPY (fastest method)
646
714
  copy_start = time_mod.time()
647
715
  await conn.copy_records_to_table(
648
- '_temp_entity_links',
716
+ "_temp_entity_links",
649
717
  records=records,
650
- columns=['from_unit_id', 'to_unit_id', 'link_type', 'weight', 'entity_id']
718
+ columns=["from_unit_id", "to_unit_id", "link_type", "weight", "entity_id"],
651
719
  )
652
720
  logger.debug(f" [9.4] COPY {len(records)} records to temp table: {time_mod.time() - copy_start:.3f}s")
653
721
 
@@ -665,8 +733,8 @@ async def insert_entity_links_batch(conn, links: List[EntityLink], chunk_size: i

  async def create_causal_links_batch(
  conn,
- unit_ids: List[str],
- causal_relations_per_fact: List[List[dict]],
+ unit_ids: list[str],
+ causal_relations_per_fact: list[list[dict]],
  ) -> int:
  """
  Create causal links between facts based on LLM-extracted causal relationships.
@@ -694,6 +762,7 @@ async def create_causal_links_batch(

  try:
  import time as time_mod
+
  create_start = time_mod.time()

  # Build links list
@@ -705,12 +774,12 @@ async def create_causal_links_batch(
  from_unit_id = unit_ids[fact_idx]

  for relation in causal_relations:
- target_idx = relation['target_fact_index']
- relation_type = relation['relation_type']
- strength = relation.get('strength', 1.0)
+ target_idx = relation["target_fact_index"]
+ relation_type = relation["relation_type"]
+ strength = relation.get("strength", 1.0)

  # Validate relation_type - must match database constraint
- valid_types = {'causes', 'caused_by', 'enables', 'prevents'}
+ valid_types = {"causes", "caused_by", "enables", "prevents"}
  if relation_type not in valid_types:
  logger.error(
  f"Invalid relation_type '{relation_type}' (type: {type(relation_type).__name__}) "
@@ -735,7 +804,6 @@ async def create_causal_links_batch(
  # weight is the strength of the relationship
  links.append((from_unit_id, to_unit_id, relation_type, strength, None))

-
  if links:
  insert_start = time_mod.time()
  try:
@@ -745,14 +813,16 @@ async def create_causal_links_batch(
  VALUES ($1, $2, $3, $4, $5)
  ON CONFLICT (from_unit_id, to_unit_id, link_type, COALESCE(entity_id, '00000000-0000-0000-0000-000000000000'::uuid)) DO NOTHING
  """,
- links
+ links,
  )
  except Exception as db_error:
  # Log the actual data being inserted for debugging
  logger.error(f"Database insert failed for causal links. Error: {db_error}")
  logger.error(f"Attempted to insert {len(links)} links. First few:")
  for i, link in enumerate(links[:3]):
- logger.error(f" Link {i}: from={link[0]}, to={link[1]}, type='{link[2]}' (repr={repr(link[2])}), weight={link[3]}, entity={link[4]}")
+ logger.error(
+ f" Link {i}: from={link[0]}, to={link[1]}, type='{link[2]}' (repr={repr(link[2])}), weight={link[3]}, entity={link[4]}"
+ )
  raise

  return len(links)
@@ -760,5 +830,6 @@ async def create_causal_links_batch(
  except Exception as e:
  logger.error(f"Failed to create causal links: {str(e)}")
  import traceback
+
  traceback.print_exc()
  raise