hindsight_api-0.0.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +38 -0
- hindsight_api/api/__init__.py +105 -0
- hindsight_api/api/http.py +1872 -0
- hindsight_api/api/mcp.py +157 -0
- hindsight_api/engine/__init__.py +47 -0
- hindsight_api/engine/cross_encoder.py +97 -0
- hindsight_api/engine/db_utils.py +93 -0
- hindsight_api/engine/embeddings.py +113 -0
- hindsight_api/engine/entity_resolver.py +575 -0
- hindsight_api/engine/llm_wrapper.py +269 -0
- hindsight_api/engine/memory_engine.py +3095 -0
- hindsight_api/engine/query_analyzer.py +519 -0
- hindsight_api/engine/response_models.py +222 -0
- hindsight_api/engine/retain/__init__.py +50 -0
- hindsight_api/engine/retain/bank_utils.py +423 -0
- hindsight_api/engine/retain/chunk_storage.py +82 -0
- hindsight_api/engine/retain/deduplication.py +104 -0
- hindsight_api/engine/retain/embedding_processing.py +62 -0
- hindsight_api/engine/retain/embedding_utils.py +54 -0
- hindsight_api/engine/retain/entity_processing.py +90 -0
- hindsight_api/engine/retain/fact_extraction.py +1027 -0
- hindsight_api/engine/retain/fact_storage.py +176 -0
- hindsight_api/engine/retain/link_creation.py +121 -0
- hindsight_api/engine/retain/link_utils.py +651 -0
- hindsight_api/engine/retain/orchestrator.py +405 -0
- hindsight_api/engine/retain/types.py +206 -0
- hindsight_api/engine/search/__init__.py +15 -0
- hindsight_api/engine/search/fusion.py +122 -0
- hindsight_api/engine/search/observation_utils.py +132 -0
- hindsight_api/engine/search/reranking.py +103 -0
- hindsight_api/engine/search/retrieval.py +503 -0
- hindsight_api/engine/search/scoring.py +161 -0
- hindsight_api/engine/search/temporal_extraction.py +64 -0
- hindsight_api/engine/search/think_utils.py +255 -0
- hindsight_api/engine/search/trace.py +215 -0
- hindsight_api/engine/search/tracer.py +447 -0
- hindsight_api/engine/search/types.py +160 -0
- hindsight_api/engine/task_backend.py +223 -0
- hindsight_api/engine/utils.py +203 -0
- hindsight_api/metrics.py +227 -0
- hindsight_api/migrations.py +163 -0
- hindsight_api/models.py +309 -0
- hindsight_api/pg0.py +425 -0
- hindsight_api/web/__init__.py +12 -0
- hindsight_api/web/server.py +143 -0
- hindsight_api-0.0.13.dist-info/METADATA +41 -0
- hindsight_api-0.0.13.dist-info/RECORD +48 -0
- hindsight_api-0.0.13.dist-info/WHEEL +4 -0
hindsight_api/engine/search/tracer.py
@@ -0,0 +1,447 @@
"""
Search tracer for collecting detailed search execution traces.

The SearchTracer collects comprehensive information about each step
of the spreading activation search process for debugging and visualization.
"""
import time
from datetime import datetime, timezone
from typing import List, Optional, Dict, Any, Literal

from .trace import (
    SearchTrace,
    QueryInfo,
    EntryPoint,
    NodeVisit,
    WeightComponents,
    LinkInfo,
    PruningDecision,
    SearchSummary,
    SearchPhaseMetrics,
    RetrievalResult,
    RetrievalMethodResults,
    RRFMergeResult,
    RerankedResult,
)


class SearchTracer:
    """
    Tracer for collecting detailed search execution information.

    Usage:
        tracer = SearchTracer(query="Who is Alice?", budget=50, max_tokens=4096)
        tracer.start()

        # During search...
        tracer.record_query_embedding(embedding)
        tracer.add_entry_point(node_id, text, similarity, rank)
        tracer.visit_node(...)
        tracer.prune_node(...)

        # After search...
        trace = tracer.finalize(final_results)
        json_output = trace.to_json()
    """

    def __init__(self, query: str, budget: int, max_tokens: int):
        """
        Initialize tracer.

        Args:
            query: Search query text
            budget: Maximum nodes to explore
            max_tokens: Maximum tokens to return in results
        """
        self.query_text = query
        self.budget = budget
        self.max_tokens = max_tokens

        # Trace data
        self.query_embedding: Optional[List[float]] = None
        self.start_time: Optional[float] = None
        self.entry_points: List[EntryPoint] = []
        self.visits: List[NodeVisit] = []
        self.pruned: List[PruningDecision] = []
        self.phase_metrics: List[SearchPhaseMetrics] = []

        # New 4-way retrieval tracking
        self.retrieval_results: List[RetrievalMethodResults] = []
        self.rrf_merged: List[RRFMergeResult] = []
        self.reranked: List[RerankedResult] = []

        # Tracking state
        self.current_step = 0
        self.nodes_visited_set = set()  # For quick lookups

        # Link statistics
        self.temporal_links_followed = 0
        self.semantic_links_followed = 0
        self.entity_links_followed = 0

    def start(self):
        """Start timing the search."""
        self.start_time = time.time()

    def record_query_embedding(self, embedding: List[float]):
        """Record the query embedding."""
        self.query_embedding = embedding

    def add_entry_point(self, node_id: str, text: str, similarity: float, rank: int):
        """
        Record an entry point.

        Args:
            node_id: Memory unit ID
            text: Memory unit text
            similarity: Cosine similarity to query
            rank: Rank among entry points (1-based)
        """
        # Clamp similarity to [0.0, 1.0] to handle floating-point precision
        similarity = min(1.0, max(0.0, similarity))

        self.entry_points.append(
            EntryPoint(
                node_id=node_id,
                text=text,
                similarity_score=similarity,
                rank=rank,
            )
        )

    def visit_node(
        self,
        node_id: str,
        text: str,
        context: str,
        event_date: datetime,
        access_count: int,
        is_entry_point: bool,
        parent_node_id: Optional[str],
        link_type: Optional[Literal["temporal", "semantic", "entity"]],
        link_weight: Optional[float],
        activation: float,
        semantic_similarity: float,
        recency: float,
        frequency: float,
        final_weight: float,
    ):
        """
        Record visiting a node.

        Args:
            node_id: Memory unit ID
            text: Memory unit text
            context: Memory unit context
            event_date: When the memory occurred
            access_count: Access count before this search
            is_entry_point: Whether this is an entry point
            parent_node_id: Node that led here (None for entry points)
            link_type: Type of link from parent
            link_weight: Weight of link from parent
            activation: Activation score
            semantic_similarity: Semantic similarity to query
            recency: Recency weight
            frequency: Frequency weight
            final_weight: Combined final weight
        """
        self.current_step += 1
        self.nodes_visited_set.add(node_id)

        # Clamp values to handle floating-point precision issues
        # (sometimes normalization produces values like 1.0000005 instead of 1.0)
        semantic_similarity = min(1.0, max(0.0, semantic_similarity))
        recency = min(1.0, max(0.0, recency))
        frequency = min(1.0, max(0.0, frequency))

        # Calculate weight contributions for transparency
        weights = WeightComponents(
            activation=activation,
            semantic_similarity=semantic_similarity,
            recency=recency,
            frequency=frequency,
            final_weight=final_weight,
            activation_contribution=0.3 * activation,
            semantic_contribution=0.3 * semantic_similarity,
            recency_contribution=0.25 * recency,
            frequency_contribution=0.15 * frequency,
        )

        visit = NodeVisit(
            step=self.current_step,
            node_id=node_id,
            text=text,
            context=context,
            event_date=event_date,
            access_count=access_count,
            is_entry_point=is_entry_point,
            parent_node_id=parent_node_id,
            link_type=link_type,
            link_weight=link_weight,
            weights=weights,
            neighbors_explored=[],
            final_rank=None,  # Will be set later
        )

        self.visits.append(visit)

        # Track link statistics
        if link_type == "temporal":
            self.temporal_links_followed += 1
        elif link_type == "semantic":
            self.semantic_links_followed += 1
        elif link_type == "entity":
            self.entity_links_followed += 1

    def add_neighbor_link(
        self,
        from_node_id: str,
        to_node_id: str,
        link_type: Literal["temporal", "semantic", "entity"],
        link_weight: float,
        entity_id: Optional[str],
        new_activation: Optional[float],
        followed: bool,
        prune_reason: Optional[str] = None,
        is_supplementary: bool = False,
    ):
        """
        Record a link to a neighbor (whether followed or not).

        Args:
            from_node_id: Source node
            to_node_id: Target node
            link_type: Type of link
            link_weight: Weight of link
            entity_id: Entity ID if link is entity-based
            new_activation: Activation passed to neighbor (None for supplementary links)
            followed: Whether link was followed
            prune_reason: Why link was not followed (if not followed)
            is_supplementary: Whether this is a supplementary link (multiple connections)
        """
        # Find the visit for the source node
        visit = None
        for v in self.visits:
            if v.node_id == from_node_id:
                visit = v
                break

        if visit is None:
            # Node not found, skip
            return

        link_info = LinkInfo(
            to_node_id=to_node_id,
            link_type=link_type,
            link_weight=link_weight,
            entity_id=entity_id,
            new_activation=new_activation,
            followed=followed,
            prune_reason=prune_reason,
            is_supplementary=is_supplementary,
        )

        visit.neighbors_explored.append(link_info)

    def prune_node(
        self,
        node_id: str,
        reason: Literal["already_visited", "activation_too_low", "budget_exhausted"],
        activation: float,
    ):
        """
        Record a node being pruned (not visited).

        Args:
            node_id: Node that was pruned
            reason: Why it was pruned
            activation: Activation value when pruned
        """
        self.pruned.append(
            PruningDecision(
                node_id=node_id,
                reason=reason,
                activation=activation,
                would_have_been_step=self.current_step + 1,
            )
        )

    def add_phase_metric(self, phase_name: str, duration_seconds: float, details: Optional[Dict[str, Any]] = None):
        """
        Record metrics for a search phase.

        Args:
            phase_name: Name of the phase
            duration_seconds: Time taken
            details: Additional phase-specific details
        """
        self.phase_metrics.append(
            SearchPhaseMetrics(
                phase_name=phase_name,
                duration_seconds=duration_seconds,
                details=details or {},
            )
        )

    def add_retrieval_results(
        self,
        method_name: Literal["semantic", "bm25", "graph", "temporal"],
        results: List[tuple],  # List of (doc_id, data) tuples
        duration_seconds: float,
        score_field: str,  # e.g., "similarity", "bm25_score"
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Record results from a single retrieval method.

        Args:
            method_name: Name of the retrieval method
            results: List of (doc_id, data) tuples from retrieval
            duration_seconds: Time taken for this retrieval
            score_field: Field name containing the score in data dict
            metadata: Optional metadata about this retrieval method
        """
        retrieval_results = []
        for rank, (doc_id, data) in enumerate(results, start=1):
            score = data.get(score_field)
            if score is None:
                score = 0.0
            retrieval_results.append(
                RetrievalResult(
                    rank=rank,
                    node_id=doc_id,
                    text=data.get("text", ""),
                    context=data.get("context", ""),
                    event_date=data.get("event_date"),
                    fact_type=data.get("fact_type"),
                    score=score,
                    score_name=score_field,
                )
            )

        self.retrieval_results.append(
            RetrievalMethodResults(
                method_name=method_name,
                results=retrieval_results,
                duration_seconds=duration_seconds,
                metadata=metadata or {},
            )
        )

    def add_rrf_merged(self, merged_results: List[tuple]):
        """
        Record RRF merged results.

        Args:
            merged_results: List of (doc_id, data, rrf_meta) tuples from RRF merge
        """
        self.rrf_merged = []
        for rank, (doc_id, data, rrf_meta) in enumerate(merged_results, start=1):
            self.rrf_merged.append(
                RRFMergeResult(
                    node_id=doc_id,
                    text=data.get("text", ""),
                    rrf_score=rrf_meta.get("rrf_score", 0.0),
                    source_ranks=rrf_meta.get("source_ranks", {}),
                    final_rrf_rank=rank,
                )
            )

    def add_reranked(self, reranked_results: List[Dict[str, Any]], rrf_merged: List):
        """
        Record reranked results.

        Args:
            reranked_results: List of result dicts after reranking
            rrf_merged: Original RRF merged results for comparison
        """
        # Build map of node_id -> rrf_rank
        rrf_rank_map = {}
        for item in self.rrf_merged:
            rrf_rank_map[item.node_id] = item.final_rrf_rank

        self.reranked = []
        for rank, result in enumerate(reranked_results, start=1):
            node_id = result["id"]
            rrf_rank = rrf_rank_map.get(node_id, len(rrf_merged) + 1)
            rank_change = rrf_rank - rank  # Positive = moved up

            # Extract score components (only include non-None values)
            score_components = {}
            for key in ["semantic_similarity", "bm25_score", "rrf_score", "recency_normalized", "frequency_normalized"]:
                if key in result and result[key] is not None:
                    score_components[key] = result[key]

            self.reranked.append(
                RerankedResult(
                    node_id=node_id,
                    text=result.get("text", ""),
                    rerank_score=result.get("weight", 0.0),
                    rerank_rank=rank,
                    rrf_rank=rrf_rank,
                    rank_change=rank_change,
                    score_components=score_components,
                )
            )

    def finalize(self, final_results: List[Dict[str, Any]]) -> SearchTrace:
        """
        Finalize the trace and return the complete SearchTrace object.

        Args:
            final_results: Final ranked results returned to user

        Returns:
            Complete SearchTrace object
        """
        if self.start_time is None:
            raise ValueError("Tracer not started - call start() first")

        total_duration = time.time() - self.start_time

        # Set final ranks on visits based on results
        for rank, result in enumerate(final_results, 1):
            result_node_id = result["id"]
            for visit in self.visits:
                if visit.node_id == result_node_id:
                    visit.final_rank = rank
                    break

        # Create query info
        query_info = QueryInfo(
            query_text=self.query_text,
            query_embedding=self.query_embedding or [],
            timestamp=datetime.now(timezone.utc),
            budget=self.budget,
            max_tokens=self.max_tokens,
        )

        # Create summary
        summary = SearchSummary(
            total_nodes_visited=len(self.visits),
            total_nodes_pruned=len(self.pruned),
            entry_points_found=len(self.entry_points),
            budget_used=len(self.visits),
            budget_remaining=self.budget - len(self.visits),
            total_duration_seconds=total_duration,
            results_returned=len(final_results),
            temporal_links_followed=self.temporal_links_followed,
            semantic_links_followed=self.semantic_links_followed,
            entity_links_followed=self.entity_links_followed,
            phase_metrics=self.phase_metrics,
        )

        # Create complete trace
        trace = SearchTrace(
            query=query_info,
            retrieval_results=self.retrieval_results,
            rrf_merged=self.rrf_merged,
            reranked=self.reranked,
            entry_points=self.entry_points,
            visits=self.visits,
            pruned=self.pruned,
            summary=summary,
            final_results=final_results,
        )

        return trace
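A minimal usage sketch of the tracer above, following the lifecycle in the class docstring. The node IDs, texts, and scores are invented (the real search engine supplies them from its retrieval and spreading-activation phases); only methods defined in this file, plus the trace.to_json() call shown in the docstring, are assumed:

from datetime import datetime, timezone

from hindsight_api.engine.search.tracer import SearchTracer

tracer = SearchTracer(query="Who is Alice?", budget=50, max_tokens=4096)
tracer.start()

# Entry points come from the initial similarity lookup (values invented).
tracer.add_entry_point(node_id="mu-1", text="Alice joined Acme in 2021.",
                       similarity=0.91, rank=1)

# Each visited node records its weight components; final_weight here uses the
# same fixed 0.3/0.3/0.25/0.15 blend that visit_node() stores per component.
tracer.visit_node(
    node_id="mu-1", text="Alice joined Acme in 2021.", context="onboarding notes",
    event_date=datetime(2021, 3, 1, tzinfo=timezone.utc), access_count=2,
    is_entry_point=True, parent_node_id=None, link_type=None, link_weight=None,
    activation=1.0, semantic_similarity=0.91, recency=0.8, frequency=0.1,
    final_weight=0.3 * 1.0 + 0.3 * 0.91 + 0.25 * 0.8 + 0.15 * 0.1,  # 0.788
)

tracer.prune_node(node_id="mu-2", reason="activation_too_low", activation=0.02)

# finalize() stamps final ranks onto matching visits and assembles the trace.
trace = tracer.finalize(final_results=[{"id": "mu-1"}])
json_output = trace.to_json()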
hindsight_api/engine/search/types.py
@@ -0,0 +1,160 @@
"""
Type definitions for the recall pipeline.

These dataclasses replace Dict[str, Any] types throughout the recall pipeline,
providing type safety and making data flow explicit.
"""

from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any
from datetime import datetime


@dataclass
class RetrievalResult:
    """
    Result from a single retrieval method (semantic, BM25, graph, or temporal).

    This represents a raw result from the database query, before merging or reranking.
    """
    id: str
    text: str
    fact_type: str
    context: Optional[str] = None
    event_date: Optional[datetime] = None
    occurred_start: Optional[datetime] = None
    occurred_end: Optional[datetime] = None
    mentioned_at: Optional[datetime] = None
    document_id: Optional[str] = None
    chunk_id: Optional[str] = None
    access_count: int = 0
    embedding: Optional[List[float]] = None

    # Retrieval-specific scores (only one will be set depending on retrieval method)
    similarity: Optional[float] = None  # Semantic/graph retrieval
    bm25_score: Optional[float] = None  # BM25 retrieval
    temporal_score: Optional[float] = None  # Temporal retrieval
    temporal_proximity: Optional[float] = None  # Temporal retrieval

    @classmethod
    def from_db_row(cls, row: Dict[str, Any]) -> "RetrievalResult":
        """Create from a database row (asyncpg Record converted to dict)."""
        return cls(
            id=str(row["id"]),
            text=row["text"],
            fact_type=row["fact_type"],
            context=row.get("context"),
            event_date=row.get("event_date"),
            occurred_start=row.get("occurred_start"),
            occurred_end=row.get("occurred_end"),
            mentioned_at=row.get("mentioned_at"),
            document_id=row.get("document_id"),
            chunk_id=row.get("chunk_id"),
            access_count=row.get("access_count", 0),
            embedding=row.get("embedding"),
            similarity=row.get("similarity"),
            bm25_score=row.get("bm25_score"),
            temporal_score=row.get("temporal_score"),
            temporal_proximity=row.get("temporal_proximity"),
        )


@dataclass
class MergedCandidate:
    """
    Candidate after RRF merge of multiple retrieval results.

    Contains the original retrieval data plus RRF metadata.
    """
    # Original retrieval data
    retrieval: RetrievalResult

    # RRF metadata
    rrf_score: float
    rrf_rank: int = 0
    source_ranks: Dict[str, int] = field(default_factory=dict)  # method_name -> rank

    @property
    def id(self) -> str:
        """Convenience property to access ID."""
        return self.retrieval.id


@dataclass
class ScoredResult:
    """
    Result after reranking and scoring.

    Contains all retrieval/merge data plus reranking scores and combined score.
    """
    # Original merged candidate
    candidate: MergedCandidate

    # Reranking scores
    cross_encoder_score: float = 0.0
    cross_encoder_score_normalized: float = 0.0

    # Normalized component scores
    rrf_normalized: float = 0.0
    recency: float = 0.5
    temporal: float = 0.5

    # Final combined score
    combined_score: float = 0.0
    weight: float = 0.0  # Final weight used for ranking

    @property
    def id(self) -> str:
        """Convenience property to access ID."""
        return self.candidate.id

    @property
    def retrieval(self) -> RetrievalResult:
        """Convenience property to access retrieval data."""
        return self.candidate.retrieval

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dict for backwards compatibility.

        This is used during the transition period and for serialization.
        """
        # Start with retrieval data
        result = {
            "id": self.retrieval.id,
            "text": self.retrieval.text,
            "fact_type": self.retrieval.fact_type,
            "context": self.retrieval.context,
            "event_date": self.retrieval.event_date,
            "occurred_start": self.retrieval.occurred_start,
            "occurred_end": self.retrieval.occurred_end,
            "mentioned_at": self.retrieval.mentioned_at,
            "document_id": self.retrieval.document_id,
            "chunk_id": self.retrieval.chunk_id,
            "access_count": self.retrieval.access_count,
            "embedding": self.retrieval.embedding,
            "semantic_similarity": self.retrieval.similarity,
            "bm25_score": self.retrieval.bm25_score,
        }

        # Add temporal scores if present
        if self.retrieval.temporal_score is not None:
            result["temporal_score"] = self.retrieval.temporal_score
        if self.retrieval.temporal_proximity is not None:
            result["temporal_proximity"] = self.retrieval.temporal_proximity

        # Add RRF metadata
        result["rrf_score"] = self.candidate.rrf_score
        result["rrf_rank"] = self.candidate.rrf_rank
        result.update(self.candidate.source_ranks)

        # Add reranking scores
        result["cross_encoder_score"] = self.cross_encoder_score
        result["cross_encoder_score_normalized"] = self.cross_encoder_score_normalized
        result["rrf_normalized"] = self.rrf_normalized
        result["recency"] = self.recency
        result["combined_score"] = self.combined_score
        result["weight"] = self.weight
        result["activation"] = self.weight  # Legacy field

        return result
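To make the flow between these dataclasses concrete, here is a hedged sketch of one candidate moving through the three stages. The row values and scores are invented, the source_ranks keys follow the "method_name -> rank" comment above, and the k=60 in the RRF arithmetic is the conventional constant rather than something this module fixes:

from hindsight_api.engine.search.types import (
    RetrievalResult, MergedCandidate, ScoredResult,
)

# Stage 1: a raw row from one retrieval method (semantic here, so only
# `similarity` is set). Values are illustrative.
row = {
    "id": "3f2a9c1e", "text": "Alice joined Acme in 2021.",
    "fact_type": "event", "similarity": 0.91,
}
retrieved = RetrievalResult.from_db_row(row)

# Stage 2: after RRF merge, the candidate keeps its per-method ranks.
merged = MergedCandidate(
    retrieval=retrieved,
    rrf_score=1 / (60 + 1) + 1 / (60 + 3),  # rank 1 semantic + rank 3 bm25
    rrf_rank=1,
    source_ranks={"semantic": 1, "bm25": 3},
)

# Stage 3: reranking wraps the candidate with cross-encoder and combined scores.
scored = ScoredResult(
    candidate=merged,
    cross_encoder_score=4.2, cross_encoder_score_normalized=0.88,
    rrf_normalized=1.0, recency=0.7, combined_score=0.85, weight=0.85,
)

legacy = scored.to_dict()  # flat dict for pre-dataclass consumers
assert legacy["id"] == merged.id
assert legacy["activation"] == legacy["weight"]  # legacy alias kept by to_dict()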