hindsight-api 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +38 -0
- hindsight_api/api/__init__.py +105 -0
- hindsight_api/api/http.py +1872 -0
- hindsight_api/api/mcp.py +157 -0
- hindsight_api/engine/__init__.py +47 -0
- hindsight_api/engine/cross_encoder.py +97 -0
- hindsight_api/engine/db_utils.py +93 -0
- hindsight_api/engine/embeddings.py +113 -0
- hindsight_api/engine/entity_resolver.py +575 -0
- hindsight_api/engine/llm_wrapper.py +269 -0
- hindsight_api/engine/memory_engine.py +3095 -0
- hindsight_api/engine/query_analyzer.py +519 -0
- hindsight_api/engine/response_models.py +222 -0
- hindsight_api/engine/retain/__init__.py +50 -0
- hindsight_api/engine/retain/bank_utils.py +423 -0
- hindsight_api/engine/retain/chunk_storage.py +82 -0
- hindsight_api/engine/retain/deduplication.py +104 -0
- hindsight_api/engine/retain/embedding_processing.py +62 -0
- hindsight_api/engine/retain/embedding_utils.py +54 -0
- hindsight_api/engine/retain/entity_processing.py +90 -0
- hindsight_api/engine/retain/fact_extraction.py +1027 -0
- hindsight_api/engine/retain/fact_storage.py +176 -0
- hindsight_api/engine/retain/link_creation.py +121 -0
- hindsight_api/engine/retain/link_utils.py +651 -0
- hindsight_api/engine/retain/orchestrator.py +405 -0
- hindsight_api/engine/retain/types.py +206 -0
- hindsight_api/engine/search/__init__.py +15 -0
- hindsight_api/engine/search/fusion.py +122 -0
- hindsight_api/engine/search/observation_utils.py +132 -0
- hindsight_api/engine/search/reranking.py +103 -0
- hindsight_api/engine/search/retrieval.py +503 -0
- hindsight_api/engine/search/scoring.py +161 -0
- hindsight_api/engine/search/temporal_extraction.py +64 -0
- hindsight_api/engine/search/think_utils.py +255 -0
- hindsight_api/engine/search/trace.py +215 -0
- hindsight_api/engine/search/tracer.py +447 -0
- hindsight_api/engine/search/types.py +160 -0
- hindsight_api/engine/task_backend.py +223 -0
- hindsight_api/engine/utils.py +203 -0
- hindsight_api/metrics.py +227 -0
- hindsight_api/migrations.py +163 -0
- hindsight_api/models.py +309 -0
- hindsight_api/pg0.py +425 -0
- hindsight_api/web/__init__.py +12 -0
- hindsight_api/web/server.py +143 -0
- hindsight_api-0.0.13.dist-info/METADATA +41 -0
- hindsight_api-0.0.13.dist-info/RECORD +48 -0
- hindsight_api-0.0.13.dist-info/WHEEL +4 -0
hindsight_api/engine/retain/chunk_storage.py
@@ -0,0 +1,82 @@
+"""
+Chunk storage for retain pipeline.
+
+Handles storage of document chunks in the database.
+"""
+import logging
+from typing import List, Dict, Optional
+
+from .types import ChunkMetadata
+
+logger = logging.getLogger(__name__)
+
+
+async def store_chunks_batch(
+    conn,
+    bank_id: str,
+    document_id: str,
+    chunks: List[ChunkMetadata]
+) -> Dict[int, str]:
+    """
+    Store document chunks in the database.
+
+    Args:
+        conn: Database connection
+        bank_id: Bank identifier
+        document_id: Document identifier
+        chunks: List of ChunkMetadata objects
+
+    Returns:
+        Dictionary mapping global chunk index to chunk_id
+    """
+    if not chunks:
+        return {}
+
+    # Prepare chunk data for batch insert
+    chunk_ids = []
+    chunk_texts = []
+    chunk_indices = []
+    chunk_id_map = {}
+
+    for chunk in chunks:
+        chunk_id = f"{bank_id}_{document_id}_{chunk.chunk_index}"
+        chunk_ids.append(chunk_id)
+        chunk_texts.append(chunk.chunk_text)
+        chunk_indices.append(chunk.chunk_index)
+        chunk_id_map[chunk.chunk_index] = chunk_id
+
+    # Batch insert all chunks
+    await conn.execute(
+        """
+        INSERT INTO chunks (chunk_id, document_id, bank_id, chunk_text, chunk_index)
+        SELECT * FROM unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::integer[])
+        """,
+        chunk_ids,
+        [document_id] * len(chunk_texts),
+        [bank_id] * len(chunk_texts),
+        chunk_texts,
+        chunk_indices
+    )
+
+    return chunk_id_map
+
+
+def map_facts_to_chunks(
+    facts_chunk_indices: List[int],
+    chunk_id_map: Dict[int, str]
+) -> List[Optional[str]]:
+    """
+    Map fact chunk indices to chunk IDs.
+
+    Args:
+        facts_chunk_indices: List of chunk indices for each fact
+        chunk_id_map: Dictionary mapping chunk index to chunk_id
+
+    Returns:
+        List of chunk_ids (same length as facts_chunk_indices)
+    """
+    chunk_ids = []
+    for chunk_idx in facts_chunk_indices:
+        chunk_id = chunk_id_map.get(chunk_idx)
+        chunk_ids.append(chunk_id)
+    return chunk_ids
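
As a usage illustration for chunk_storage.py (not part of the package diff), the sketch below wires store_chunks_batch and map_facts_to_chunks together the way a retain caller might: store the chunks in one batch, then resolve each fact's chunk index to its stored chunk_id. The ChunkMetadata constructor call and the shape of the inputs are assumptions; only the chunk_index and chunk_text field names come from the hunk above.

# Hypothetical sketch, not package code: persist chunks, then map facts to chunk IDs.
from hindsight_api.engine.retain.chunk_storage import map_facts_to_chunks, store_chunks_batch
from hindsight_api.engine.retain.types import ChunkMetadata  # assumed to accept these fields


async def persist_chunks_and_link_facts(conn, bank_id, document_id, chunk_texts, facts_chunk_indices):
    # Build ChunkMetadata objects from plain chunk texts (constructor kwargs are assumed).
    chunks = [
        ChunkMetadata(chunk_index=i, chunk_text=text)
        for i, text in enumerate(chunk_texts)
    ]

    # One batched INSERT; returns {chunk_index: chunk_id}.
    chunk_id_map = await store_chunks_batch(conn, bank_id, document_id, chunks)

    # Resolve each fact's chunk index to the stored chunk_id (None if the index is unknown).
    return map_facts_to_chunks(facts_chunk_indices, chunk_id_map)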
hindsight_api/engine/retain/deduplication.py
@@ -0,0 +1,104 @@
+"""
+Deduplication logic for retain pipeline.
+
+Checks for duplicate facts using semantic similarity and temporal proximity.
+"""
+import logging
+from datetime import datetime
+from typing import List
+from collections import defaultdict
+
+from .types import ProcessedFact
+
+logger = logging.getLogger(__name__)
+
+
+async def check_duplicates_batch(
+    conn,
+    bank_id: str,
+    facts: List[ProcessedFact],
+    duplicate_checker_fn
+) -> List[bool]:
+    """
+    Check which facts are duplicates using batched time-window queries.
+
+    Groups facts by 12-hour time buckets to efficiently check for duplicates
+    within a 24-hour window.
+
+    Args:
+        conn: Database connection
+        bank_id: Bank identifier
+        facts: List of ProcessedFact objects to check
+        duplicate_checker_fn: Async function(conn, bank_id, texts, embeddings, date, time_window_hours)
+            that returns List[bool] indicating duplicates
+
+    Returns:
+        List of boolean flags (same length as facts) indicating if each fact is a duplicate
+    """
+    if not facts:
+        return []
+
+    # Group facts by event_date (rounded to 12-hour buckets) for efficient batching
+    time_buckets = defaultdict(list)
+    for idx, fact in enumerate(facts):
+        # Use occurred_start if available, otherwise use mentioned_at
+        # For deduplication purposes, we need a time reference
+        fact_date = fact.occurred_start if fact.occurred_start is not None else fact.mentioned_at
+
+        # Defensive: if both are None (shouldn't happen), use now()
+        if fact_date is None:
+            from datetime import datetime, timezone
+            fact_date = datetime.now(timezone.utc)
+
+        # Round to 12-hour bucket to group similar times
+        bucket_key = fact_date.replace(
+            hour=(fact_date.hour // 12) * 12,
+            minute=0,
+            second=0,
+            microsecond=0
+        )
+        time_buckets[bucket_key].append((idx, fact))
+
+    # Process each bucket in batch
+    all_is_duplicate = [False] * len(facts)
+
+    for bucket_date, bucket_items in time_buckets.items():
+        indices = [item[0] for item in bucket_items]
+        texts = [item[1].fact_text for item in bucket_items]
+        embeddings = [item[1].embedding for item in bucket_items]
+
+        # Check duplicates for this time bucket
+        dup_flags = await duplicate_checker_fn(
+            conn,
+            bank_id,
+            texts,
+            embeddings,
+            bucket_date,
+            time_window_hours=24
+        )
+
+        # Map results back to original indices
+        for idx, is_dup in zip(indices, dup_flags):
+            all_is_duplicate[idx] = is_dup
+
+    return all_is_duplicate
+
+
+def filter_duplicates(
+    facts: List[ProcessedFact],
+    is_duplicate_flags: List[bool]
+) -> List[ProcessedFact]:
+    """
+    Filter out duplicate facts based on duplicate flags.
+
+    Args:
+        facts: List of ProcessedFact objects
+        is_duplicate_flags: Boolean flags indicating which facts are duplicates
+
+    Returns:
+        List of non-duplicate facts
+    """
+    if len(facts) != len(is_duplicate_flags):
+        raise ValueError(f"Mismatch between facts ({len(facts)}) and flags ({len(is_duplicate_flags)})")
+
+    return [fact for fact, is_dup in zip(facts, is_duplicate_flags) if not is_dup]
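
To make the duplicate_checker_fn contract concrete, here is a hypothetical checker with the documented signature, composed with check_duplicates_batch and filter_duplicates. The checker is only a placeholder that flags exact repeats within the submitted batch; the real checker lives elsewhere in the engine and is not shown in this hunk.

# Hypothetical sketch, not package code: a placeholder checker plus composition.
from hindsight_api.engine.retain.deduplication import check_duplicates_batch, filter_duplicates


async def exact_repeat_checker(conn, bank_id, texts, embeddings, date, time_window_hours=24):
    # Placeholder: only flags exact repeats within the submitted batch. The real
    # checker would use conn, embeddings, date and time_window_hours to find
    # semantically similar stored facts for this bank inside the time window.
    seen = set()
    flags = []
    for text in texts:
        flags.append(text in seen)
        seen.add(text)
    return flags


async def deduplicate(conn, bank_id, facts):
    flags = await check_duplicates_batch(conn, bank_id, facts, exact_repeat_checker)
    return filter_duplicates(facts, flags)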
hindsight_api/engine/retain/embedding_processing.py
@@ -0,0 +1,62 @@
+"""
+Embedding processing for retain pipeline.
+
+Handles augmenting fact texts with temporal information and generating embeddings.
+"""
+import logging
+from typing import List
+from datetime import datetime
+
+from . import embedding_utils
+from .types import ExtractedFact
+
+logger = logging.getLogger(__name__)
+
+
+def augment_texts_with_dates(facts: List[ExtractedFact], format_date_fn) -> List[str]:
+    """
+    Augment fact texts with readable dates for better temporal matching.
+
+    This allows queries like "camping in June" to match facts that happened in June.
+
+    Args:
+        facts: List of ExtractedFact objects
+        format_date_fn: Function to format datetime to readable string
+
+    Returns:
+        List of augmented text strings (same length as facts)
+    """
+    augmented_texts = []
+    for fact in facts:
+        # Use occurred_start as the representative date
+        fact_date = fact.occurred_start or fact.mentioned_at
+        readable_date = format_date_fn(fact_date)
+        # Augment text with date for embedding (but store original text in DB)
+        augmented_text = f"{fact.fact_text} (happened in {readable_date})"
+        augmented_texts.append(augmented_text)
+    return augmented_texts
+
+
+async def generate_embeddings_batch(
+    embeddings_model,
+    texts: List[str]
+) -> List[List[float]]:
+    """
+    Generate embeddings for a batch of texts.
+
+    Args:
+        embeddings_model: Embeddings model instance
+        texts: List of text strings to embed
+
+    Returns:
+        List of embedding vectors (same length as texts)
+    """
+    if not texts:
+        return []
+
+    embeddings = await embedding_utils.generate_embeddings_batch(
+        embeddings_model,
+        texts
+    )
+
+    return embeddings
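
A small, hypothetical example of how the two helpers above might be combined: the date formatter and the embeddings model are stand-ins, not part of this diff, and the only ExtractedFact fields used are those referenced in the hunk.

# Hypothetical sketch, not package code: date-augmented embedding of extracted facts.
from hindsight_api.engine.retain.embedding_processing import (
    augment_texts_with_dates,
    generate_embeddings_batch,
)


def format_date(dt):
    # Stand-in formatter producing "June 2024"-style strings for temporal matching.
    return dt.strftime("%B %Y") if dt is not None else "an unknown date"


async def embed_facts(embeddings_model, facts):
    # e.g. "went camping (happened in June 2024)"
    texts = augment_texts_with_dates(facts, format_date)
    return await generate_embeddings_batch(embeddings_model, texts)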
hindsight_api/engine/retain/embedding_utils.py
@@ -0,0 +1,54 @@
+"""
+Embedding generation utilities for memory units.
+"""
+
+import asyncio
+import logging
+from typing import List
+
+logger = logging.getLogger(__name__)
+
+
+def generate_embedding(embeddings_backend, text: str) -> List[float]:
+    """
+    Generate embedding for text using the provided embeddings backend.
+
+    Args:
+        embeddings_backend: Embeddings instance to use for encoding
+        text: Text to embed
+
+    Returns:
+        Embedding vector (dimension depends on embeddings backend)
+    """
+    try:
+        embeddings = embeddings_backend.encode([text])
+        return embeddings[0]
+    except Exception as e:
+        raise Exception(f"Failed to generate embedding: {str(e)}")
+
+
+async def generate_embeddings_batch(embeddings_backend, texts: List[str]) -> List[List[float]]:
+    """
+    Generate embeddings for multiple texts using the provided embeddings backend.
+
+    Runs the embedding generation in a thread pool to avoid blocking the event loop
+    for CPU-bound operations.
+
+    Args:
+        embeddings_backend: Embeddings instance to use for encoding
+        texts: List of texts to embed
+
+    Returns:
+        List of embeddings in same order as input texts
+    """
+    try:
+        # Run embeddings in thread pool to avoid blocking event loop
+        loop = asyncio.get_event_loop()
+        embeddings = await loop.run_in_executor(
+            None,  # Use default thread pool
+            embeddings_backend.encode,
+            texts
+        )
+        return embeddings
+    except Exception as e:
+        raise Exception(f"Failed to generate batch embeddings: {str(e)}")
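
These utilities only require a backend that exposes a synchronous encode(texts) method returning one vector per text; a hypothetical stub satisfying that interface, useful for tests, might look like this.

# Hypothetical stub backend, not package code: implements the encode() interface
# that generate_embedding and generate_embeddings_batch rely on.
from typing import List


class FakeEmbeddings:
    def encode(self, texts: List[str]) -> List[List[float]]:
        # Deterministic, content-derived 3-dimensional vectors; a real backend
        # would return model embeddings of its own dimension.
        return [[len(t) / 100.0, t.count(" ") / 10.0, 1.0] for t in texts]


# Usage (inside an event loop):
#     vectors = await generate_embeddings_batch(FakeEmbeddings(), ["alpha", "beta"])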
hindsight_api/engine/retain/entity_processing.py
@@ -0,0 +1,90 @@
+"""
+Entity processing for retain pipeline.
+
+Handles entity extraction, resolution, and link creation for stored facts.
+"""
+import logging
+from typing import List, Tuple, Dict, Any
+from uuid import UUID
+
+from .types import ProcessedFact, EntityRef
+from . import link_utils
+
+logger = logging.getLogger(__name__)
+
+
+async def process_entities_batch(
+    entity_resolver,
+    conn,
+    bank_id: str,
+    unit_ids: List[str],
+    facts: List[ProcessedFact],
+    log_buffer: List[str] = None
+) -> List[Tuple[str, str, float]]:
+    """
+    Process entities for all facts and create entity links.
+
+    This function:
+    1. Extracts entity mentions from fact texts
+    2. Resolves entity names to canonical entities
+    3. Creates entity records in the database
+    4. Returns entity links ready for insertion
+
+    Args:
+        entity_resolver: EntityResolver instance for entity resolution
+        conn: Database connection
+        bank_id: Bank identifier
+        unit_ids: List of unit IDs (same length as facts)
+        facts: List of ProcessedFact objects
+        log_buffer: Optional buffer for detailed logging
+
+    Returns:
+        List of entity link tuples: (unit_id, entity_id, confidence)
+    """
+    if not unit_ids or not facts:
+        return []
+
+    if len(unit_ids) != len(facts):
+        raise ValueError(f"Mismatch between unit_ids ({len(unit_ids)}) and facts ({len(facts)})")
+
+    # Extract data for link_utils function
+    fact_texts = [fact.fact_text for fact in facts]
+    # Use occurred_start if available, otherwise use mentioned_at for entity timestamps
+    fact_dates = [fact.occurred_start if fact.occurred_start is not None else fact.mentioned_at for fact in facts]
+    # Convert EntityRef objects to dict format expected by link_utils
+    entities_per_fact = [
+        [{'text': entity.name, 'type': 'CONCEPT'} for entity in (fact.entities or [])]
+        for fact in facts
+    ]
+
+    # Use existing link_utils function for entity processing
+    entity_links = await link_utils.extract_entities_batch_optimized(
+        entity_resolver,
+        conn,
+        bank_id,
+        unit_ids,
+        fact_texts,
+        "",  # context (not used in current implementation)
+        fact_dates,
+        entities_per_fact,
+        log_buffer  # Pass log_buffer for detailed logging
+    )
+
+    return entity_links
+
+
+async def insert_entity_links_batch(
+    conn,
+    entity_links: List[Tuple[str, str, float]]
+) -> None:
+    """
+    Insert entity links in batch.
+
+    Args:
+        conn: Database connection
+        entity_links: List of (unit_id, entity_id, confidence) tuples
+    """
+    if not entity_links:
+        return
+
+    await link_utils.insert_entity_links_batch(conn, entity_links)