hindsight-api 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hindsight_api/__init__.py +10 -9
- hindsight_api/alembic/env.py +5 -8
- hindsight_api/alembic/versions/5a366d414dce_initial_schema.py +266 -180
- hindsight_api/alembic/versions/b7c4d8e9f1a2_add_chunks_table.py +32 -32
- hindsight_api/alembic/versions/c8e5f2a3b4d1_add_retain_params_to_documents.py +11 -11
- hindsight_api/alembic/versions/d9f6a3b4c5e2_rename_bank_to_interactions.py +7 -12
- hindsight_api/alembic/versions/e0a1b2c3d4e5_disposition_to_3_traits.py +23 -15
- hindsight_api/alembic/versions/rename_personality_to_disposition.py +30 -21
- hindsight_api/api/__init__.py +10 -10
- hindsight_api/api/http.py +575 -593
- hindsight_api/api/mcp.py +31 -33
- hindsight_api/banner.py +13 -6
- hindsight_api/config.py +17 -12
- hindsight_api/engine/__init__.py +9 -9
- hindsight_api/engine/cross_encoder.py +23 -27
- hindsight_api/engine/db_utils.py +5 -4
- hindsight_api/engine/embeddings.py +22 -21
- hindsight_api/engine/entity_resolver.py +81 -75
- hindsight_api/engine/llm_wrapper.py +74 -88
- hindsight_api/engine/memory_engine.py +663 -673
- hindsight_api/engine/query_analyzer.py +100 -97
- hindsight_api/engine/response_models.py +105 -106
- hindsight_api/engine/retain/__init__.py +9 -16
- hindsight_api/engine/retain/bank_utils.py +34 -58
- hindsight_api/engine/retain/chunk_storage.py +4 -12
- hindsight_api/engine/retain/deduplication.py +9 -28
- hindsight_api/engine/retain/embedding_processing.py +4 -11
- hindsight_api/engine/retain/embedding_utils.py +3 -4
- hindsight_api/engine/retain/entity_processing.py +7 -17
- hindsight_api/engine/retain/fact_extraction.py +155 -165
- hindsight_api/engine/retain/fact_storage.py +11 -23
- hindsight_api/engine/retain/link_creation.py +11 -39
- hindsight_api/engine/retain/link_utils.py +166 -95
- hindsight_api/engine/retain/observation_regeneration.py +39 -52
- hindsight_api/engine/retain/orchestrator.py +72 -62
- hindsight_api/engine/retain/types.py +49 -43
- hindsight_api/engine/search/__init__.py +15 -1
- hindsight_api/engine/search/fusion.py +6 -15
- hindsight_api/engine/search/graph_retrieval.py +234 -0
- hindsight_api/engine/search/mpfp_retrieval.py +438 -0
- hindsight_api/engine/search/observation_utils.py +9 -16
- hindsight_api/engine/search/reranking.py +4 -7
- hindsight_api/engine/search/retrieval.py +388 -193
- hindsight_api/engine/search/scoring.py +5 -7
- hindsight_api/engine/search/temporal_extraction.py +8 -11
- hindsight_api/engine/search/think_utils.py +115 -39
- hindsight_api/engine/search/trace.py +68 -38
- hindsight_api/engine/search/tracer.py +49 -35
- hindsight_api/engine/search/types.py +22 -16
- hindsight_api/engine/task_backend.py +21 -26
- hindsight_api/engine/utils.py +25 -10
- hindsight_api/main.py +21 -40
- hindsight_api/mcp_local.py +190 -0
- hindsight_api/metrics.py +44 -30
- hindsight_api/migrations.py +10 -8
- hindsight_api/models.py +60 -72
- hindsight_api/pg0.py +64 -337
- hindsight_api/server.py +3 -6
- {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/METADATA +6 -5
- hindsight_api-0.1.6.dist-info/RECORD +64 -0
- {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/entry_points.txt +1 -0
- hindsight_api-0.1.4.dist-info/RECORD +0 -61
- {hindsight_api-0.1.4.dist-info → hindsight_api-0.1.6.dist-info}/WHEEL +0 -0
hindsight_api/engine/response_models.py
@@ -6,9 +6,9 @@ API response models should be kept separate and convert from these core models t
 API stability even if internal models change.
 """
 
-from typing import …
-from pydantic import BaseModel, Field, ConfigDict
+from typing import Any
 
+from pydantic import BaseModel, ConfigDict, Field
 
 # Valid fact types for recall operations (excludes 'observation' which is internal)
 VALID_RECALL_FACT_TYPES = frozenset(["world", "experience", "opinion"])
@@ -23,17 +23,12 @@ class DispositionTraits(BaseModel):
     - literalism: 1=flexible interpretation, 5=literal interpretation (how strictly to interpret information)
     - empathy: 1=detached, 5=empathetic (how much to consider emotional context)
     """
+
     skepticism: int = Field(ge=1, le=5, description="How skeptical vs trusting (1=trusting, 5=skeptical)")
     literalism: int = Field(ge=1, le=5, description="How literally to interpret information (1=flexible, 5=literal)")
     empathy: int = Field(ge=1, le=5, description="How much to consider emotional context (1=detached, 5=empathetic)")
 
-    model_config = ConfigDict(json_schema_extra={
-        "example": {
-            "skepticism": 3,
-            "literalism": 3,
-            "empathy": 3
-        }
-    })
+    model_config = ConfigDict(json_schema_extra={"example": {"skepticism": 3, "literalism": 3, "empathy": 3}})
 
 
 class MemoryFact(BaseModel):
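Both versions attach the same schema example; 0.1.6 only collapses it onto one line. For readers unfamiliar with the mechanism, a standalone Pydantic v2 check (a minimal sketch, not package code) of how a `json_schema_extra` example surfaces in the generated schema:

    from pydantic import BaseModel, ConfigDict, Field

    class DispositionTraits(BaseModel):
        skepticism: int = Field(ge=1, le=5)
        literalism: int = Field(ge=1, le=5)
        empathy: int = Field(ge=1, le=5)

        model_config = ConfigDict(json_schema_extra={"example": {"skepticism": 3, "literalism": 3, "empathy": 3}})

    # The extra dict is merged into the model's JSON schema:
    print(DispositionTraits.model_json_schema()["example"])
    # {'skepticism': 3, 'literalism': 3, 'empathy': 3}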
@@ -43,38 +38,44 @@ class MemoryFact(BaseModel):
     This represents a unit of information stored in the memory system,
     including both the content and metadata.
     """
-[… 14 lines of the old json_schema_extra example, not rendered in the source diff …]
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "id": "123e4567-e89b-12d3-a456-426614174000",
+                "text": "Alice works at Google on the AI team",
+                "fact_type": "world",
+                "entities": ["Alice", "Google"],
+                "context": "work info",
+                "occurred_start": "2024-01-15T10:30:00Z",
+                "occurred_end": "2024-01-15T10:30:00Z",
+                "mentioned_at": "2024-01-15T10:30:00Z",
+                "document_id": "session_abc123",
+                "metadata": {"source": "slack"},
+                "chunk_id": "bank123_session_abc123_0",
+                "activation": 0.95,
+            }
         }
-[… 1 line not rendered in the source diff …]
+    )
 
     id: str = Field(description="Unique identifier for the memory fact")
     text: str = Field(description="The actual text content of the memory")
     fact_type: str = Field(description="Type of fact: 'world', 'experience', 'opinion', or 'observation'")
-    entities: …
-    context: …
-    occurred_start: …
-    occurred_end: …
-    mentioned_at: …
-    document_id: …
-    metadata: …
-    chunk_id: …
+    entities: list[str] | None = Field(None, description="Entity names mentioned in this fact")
+    context: str | None = Field(None, description="Additional context for the memory")
+    occurred_start: str | None = Field(None, description="ISO format date when the event started occurring")
+    occurred_end: str | None = Field(None, description="ISO format date when the event ended occurring")
+    mentioned_at: str | None = Field(None, description="ISO format date when the fact was mentioned/learned")
+    document_id: str | None = Field(None, description="ID of the document this memory belongs to")
+    metadata: dict[str, str] | None = Field(None, description="User-defined metadata")
+    chunk_id: str | None = Field(
+        None, description="ID of the chunk this fact was extracted from (format: bank_id_document_id_chunk_index)"
+    )
 
 
 class ChunkInfo(BaseModel):
     """Information about a chunk."""
+
     chunk_text: str = Field(description="The raw chunk text")
     chunk_index: int = Field(description="Index of the chunk within the document")
     truncated: bool = Field(default=False, description="Whether the chunk was truncated due to token limits")
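The field rewrites above are the typing migration that runs through this release: `typing` aliases give way to builtin generics and PEP 604 unions. The 0.1.4 right-hand sides are truncated in this diff view, so `Optional[List[str]]` below is an inferred original spelling, not an attested one; the two forms validate identically on Python 3.10+:

    from typing import List, Optional

    from pydantic import BaseModel

    class OldStyle(BaseModel):
        entities: Optional[List[str]] = None  # presumed 0.1.4 spelling

    class NewStyle(BaseModel):
        entities: list[str] | None = None  # 0.1.6 spelling (PEP 604, Python 3.10+)

    assert OldStyle(entities=["Alice"]).entities == NewStyle(entities=["Alice"]).entities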
@@ -87,35 +88,33 @@ class RecallResult(BaseModel):
     Contains a list of matching memory facts and optional trace information
     for debugging and transparency.
     """
-[… 16 lines of the old json_schema_extra example, not rendered in the source diff; the last reads "num_results": 1 …]
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "results": [
+                    {
+                        "id": "123e4567-e89b-12d3-a456-426614174000",
+                        "text": "Alice works at Google on the AI team",
+                        "fact_type": "world",
+                        "context": "work info",
+                        "occurred_start": "2024-01-15T10:30:00Z",
+                        "occurred_end": "2024-01-15T10:30:00Z",
+                        "activation": 0.95,
+                    }
+                ],
+                "trace": {"query": "What did Alice say about machine learning?", "num_results": 1},
             }
         }
-[… 1 line not rendered in the source diff …]
+    )
 
-    results: …
-    trace: …
-    entities: …
-        None,
-        description="Entity states for entities mentioned in results (keyed by canonical name)"
+    results: list[MemoryFact] = Field(description="List of memory facts matching the query")
+    trace: dict[str, Any] | None = Field(None, description="Trace information for debugging")
+    entities: dict[str, "EntityState"] | None = Field(
+        None, description="Entity states for entities mentioned in results (keyed by canonical name)"
     )
-    chunks: …
-        None,
-        description="Chunks for facts, keyed by '{document_id}_{chunk_index}'"
+    chunks: dict[str, ChunkInfo] | None = Field(
+        None, description="Chunks for facts, keyed by '{document_id}_{chunk_index}'"
     )
 
 
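`entities` above is annotated with the quoted name `"EntityState"` because that class is defined further down the module. A minimal standalone sketch (not package code) of why the string annotation works in Pydantic v2: the reference is resolved lazily against the module namespace, and `model_rebuild()` forces resolution once the target class exists.

    from pydantic import BaseModel

    class RecallResult(BaseModel):
        entities: dict[str, "EntityState"] | None = None  # forward reference to a later class

    class EntityState(BaseModel):
        entity_id: str

    RecallResult.model_rebuild()  # resolve the forward reference explicitly
    print(RecallResult(entities={"john": EntityState(entity_id="1")}).entities)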
@@ -126,37 +125,35 @@ class ReflectResult(BaseModel):
     Contains the formulated answer, the facts it was based on (organized by type),
     and any new opinions that were formed during the reflection process.
     """
-[… 20 lines of the old json_schema_extra example, not rendered in the source diff …]
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "text": "Based on my knowledge, machine learning is being actively used in healthcare...",
+                "based_on": {
+                    "world": [
+                        {
+                            "id": "123e4567-e89b-12d3-a456-426614174000",
+                            "text": "Machine learning is used in medical diagnosis",
+                            "fact_type": "world",
+                            "context": "healthcare",
+                            "occurred_start": "2024-01-15T10:30:00Z",
+                            "occurred_end": "2024-01-15T10:30:00Z",
+                        }
+                    ],
+                    "experience": [],
+                    "opinion": [],
+                },
+                "new_opinions": ["Machine learning has great potential in healthcare"],
+            }
         }
-[… 1 line not rendered in the source diff …]
+    )
 
     text: str = Field(description="The formulated answer text")
-    based_on: …
+    based_on: dict[str, list[MemoryFact]] = Field(
         description="Facts used to formulate the answer, organized by type (world, experience, opinion)"
     )
-    new_opinions: …
-        default_factory=list,
-        description="List of newly formed opinions during reflection"
-    )
+    new_opinions: list[str] = Field(default_factory=list, description="List of newly formed opinions during reflection")
 
 
 class Opinion(BaseModel):
@@ -166,12 +163,12 @@ class Opinion(BaseModel):
     Opinions represent the bank's formed perspectives on topics,
     with a confidence level indicating strength of belief.
     """
-[… 4 lines of the old json_schema_extra example, not rendered in the source diff; the last reads "confidence": 0.85 …]
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {"text": "Machine learning has great potential in healthcare", "confidence": 0.85}
         }
-[… 1 line not rendered in the source diff …]
+    )
 
     text: str = Field(description="The opinion text")
     confidence: float = Field(description="Confidence score between 0.0 and 1.0")
@@ -184,15 +181,15 @@ class EntityObservation(BaseModel):
     Observations are objective facts synthesized from multiple memory facts
     about an entity, without personality influence.
     """
-[… 4 lines of the old json_schema_extra example, not rendered in the source diff; the last reads "mentioned_at": "2024-01-15T10:30:00Z" …]
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {"text": "John is detail-oriented and works at Google", "mentioned_at": "2024-01-15T10:30:00Z"}
         }
-[… 1 line not rendered in the source diff …]
+    )
 
     text: str = Field(description="The observation text")
-    mentioned_at: …
+    mentioned_at: str | None = Field(None, description="ISO format date when this observation was created")
 
 
 class EntityState(BaseModel):
@@ -201,20 +198,22 @@ class EntityState(BaseModel):
 
     Contains observations synthesized from facts about the entity.
     """
-[… 8 lines of the old json_schema_extra example, not rendered in the source diff …]
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "entity_id": "123e4567-e89b-12d3-a456-426614174000",
+                "canonical_name": "John",
+                "observations": [
+                    {"text": "John is detail-oriented", "mentioned_at": "2024-01-15T10:30:00Z"},
+                    {"text": "John works at Google on the AI team", "mentioned_at": "2024-01-14T09:00:00Z"},
+                ],
+            }
         }
-[… 1 line not rendered in the source diff …]
+    )
 
     entity_id: str = Field(description="Unique identifier for the entity")
     canonical_name: str = Field(description="Canonical name of the entity")
-    observations: …
-        default_factory=list,
-        description="List of observations about this entity"
+    observations: list[EntityObservation] = Field(
+        default_factory=list, description="List of observations about this entity"
     )
hindsight_api/engine/retain/__init__.py
@@ -12,23 +12,16 @@ This package contains modular components for the retain operation:
 - fact_storage: Handle fact insertion into database
 """
 
-from .…
-[… 7 more lines not rendered in the source diff …]
+from . import (
+    chunk_storage,
+    deduplication,
+    embedding_processing,
+    entity_processing,
+    fact_extraction,
+    fact_storage,
+    link_creation,
 )
-
-from . import fact_extraction
-from . import embedding_processing
-from . import deduplication
-from . import entity_processing
-from . import link_creation
-from . import chunk_storage
-from . import fact_storage
+from .types import CausalRelation, ChunkMetadata, EntityRef, ExtractedFact, ProcessedFact, RetainBatch, RetainContent
 
 __all__ = [
     # Types
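The consolidated import block keeps every submodule reachable under the package while re-exporting the shared types by name. An abridged sketch of the pattern (shortened module list; package-relative imports, so it only runs inside the `retain` package):

    # hindsight_api/engine/retain/__init__.py, abridged sketch
    from . import chunk_storage, deduplication, fact_extraction  # still usable as retain.fact_extraction, etc.
    from .types import ExtractedFact, ProcessedFact  # public types re-exported at package level

    __all__ = ["chunk_storage", "deduplication", "fact_extraction", "ExtractedFact", "ProcessedFact"]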
hindsight_api/engine/retain/bank_utils.py
@@ -5,8 +5,10 @@ bank profile utilities for disposition and background management.
 import json
 import logging
 import re
-from typing import …
+from typing import TypedDict
+
 from pydantic import BaseModel, Field
+
 from ..db_utils import acquire_with_retry
 from ..response_models import DispositionTraits
 
@@ -21,6 +23,7 @@ DEFAULT_DISPOSITION = {
 
 class BankProfile(TypedDict):
     """Type for bank profile data."""
+
     name: str
     disposition: DispositionTraits
     background: str
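`BankProfile` stays a `TypedDict`, now fed by the plain `from typing import TypedDict` added above. A reminder of the semantics, in a trimmed sketch without the `DispositionTraits` field:

    from typing import TypedDict

    class BankProfile(TypedDict):
        name: str
        background: str

    # A TypedDict is an ordinary dict at runtime; the class exists for static checkers.
    profile: BankProfile = {"name": "bank-1", "background": ""}
    assert isinstance(profile, dict)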
@@ -28,6 +31,7 @@ class BankProfile(TypedDict):
 
 class BackgroundMergeResponse(BaseModel):
     """LLM response for background merge with disposition inference."""
+
     background: str = Field(description="Merged background in first person perspective")
     disposition: DispositionTraits = Field(description="Inferred disposition traits (skepticism, literalism, empathy)")
 
@@ -51,7 +55,7 @@ async def get_bank_profile(pool, bank_id: str) -> BankProfile:
             SELECT name, disposition, background
             FROM banks WHERE bank_id = $1
             """,
-            bank_id
+            bank_id,
         )
 
         if row:
@@ -61,9 +65,7 @@ async def get_bank_profile(pool, bank_id: str) -> BankProfile:
                 disposition_data = json.loads(disposition_data)
 
             return BankProfile(
-                name=row["name"],
-                disposition=DispositionTraits(**disposition_data),
-                background=row["background"]
+                name=row["name"], disposition=DispositionTraits(**disposition_data), background=row["background"]
             )
 
         # Bank doesn't exist, create with defaults
@@ -76,21 +78,13 @@ async def get_bank_profile(pool, bank_id: str) -> BankProfile:
             bank_id,
             bank_id,  # Default name is the bank_id
             json.dumps(DEFAULT_DISPOSITION),
-            ""
+            "",
         )
 
-        return BankProfile(
-            name=bank_id,
-            disposition=DispositionTraits(**DEFAULT_DISPOSITION),
-            background=""
-        )
+        return BankProfile(name=bank_id, disposition=DispositionTraits(**DEFAULT_DISPOSITION), background="")
 
 
-async def update_bank_disposition(
-    pool,
-    bank_id: str,
-    disposition: Dict[str, int]
-) -> None:
+async def update_bank_disposition(pool, bank_id: str, disposition: dict[str, int]) -> None:
     """
     Update bank disposition traits.
 
@@ -111,17 +105,11 @@ async def update_bank_disposition(
             WHERE bank_id = $1
             """,
             bank_id,
-            json.dumps(disposition)
+            json.dumps(disposition),
         )
 
 
-async def merge_bank_background(
-    pool,
-    llm_config,
-    bank_id: str,
-    new_info: str,
-    update_disposition: bool = True
-) -> dict:
+async def merge_bank_background(pool, llm_config, bank_id: str, new_info: str, update_disposition: bool = True) -> dict:
     """
     Merge new background information with existing background using LLM.
     Normalizes to first person ("I") and resolves conflicts.
@@ -142,12 +130,7 @@ async def merge_bank_background(
     current_background = profile["background"]
 
     # Use LLM to merge backgrounds and optionally infer disposition
-    result = await _llm_merge_background(
-        llm_config,
-        current_background,
-        new_info,
-        infer_disposition=update_disposition
-    )
+    result = await _llm_merge_background(llm_config, current_background, new_info, infer_disposition=update_disposition)
 
     merged_background = result["background"]
     inferred_disposition = result.get("disposition")
@@ -166,7 +149,7 @@ async def merge_bank_background(
             """,
             bank_id,
             merged_background,
-            json.dumps(inferred_disposition)
+            json.dumps(inferred_disposition),
         )
     else:
         # Update only background
@@ -178,7 +161,7 @@ async def merge_bank_background(
             WHERE bank_id = $1
             """,
             bank_id,
-            merged_background
+            merged_background,
         )
 
     response = {"background": merged_background}
@@ -188,12 +171,7 @@ async def merge_bank_background(
     return response
 
 
-async def _llm_merge_background(
-    llm_config,
-    current: str,
-    new_info: str,
-    infer_disposition: bool = False
-) -> dict:
+async def _llm_merge_background(llm_config, current: str, new_info: str, infer_disposition: bool = False) -> dict:
     """
     Use LLM to intelligently merge background information.
     Optionally infer Big Five disposition traits from the merged background.
@@ -273,25 +251,19 @@ Merged background:"""
                 response_format=BackgroundMergeResponse,
                 scope="bank_background",
                 temperature=0.3,
-                max_completion_tokens=8192
+                max_completion_tokens=8192,
             )
             logger.info(f"Successfully got structured response: background={parsed.background[:100]}")
 
             # Convert Pydantic model to dict format
-            return {
-                "background": parsed.background,
-                "disposition": parsed.disposition.model_dump()
-            }
+            return {"background": parsed.background, "disposition": parsed.disposition.model_dump()}
         except Exception as e:
             logger.warning(f"Structured output failed, falling back to manual parsing: {e}")
             # Fall through to manual parsing below
 
     # Manual parsing fallback or non-disposition merge
     content = await llm_config.call(
-        messages=messages,
-        scope="bank_background",
-        temperature=0.3,
-        max_completion_tokens=8192
+        messages=messages, scope="bank_background", temperature=0.3, max_completion_tokens=8192
     )
 
     logger.info(f"LLM response for background merge (first 500 chars): {content[:500]}")
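The hunks above keep the module's two-stage call intact: try a structured call with `response_format=BackgroundMergeResponse`, and on any failure fall back to a plain-text call plus manual JSON parsing. A self-contained toy of that control flow; `fake_llm` and `MergeResponse` are stand-ins, not the package's API:

    import asyncio
    import json

    from pydantic import BaseModel

    class MergeResponse(BaseModel):
        background: str

    async def fake_llm(messages, response_format=None):
        if response_format is not None:
            raise RuntimeError("structured output unsupported")  # force the fallback branch
        return '{"background": "I am a test bank"}'

    async def merge(messages) -> dict:
        try:
            parsed = await fake_llm(messages, response_format=MergeResponse)
            return parsed.model_dump()  # structured path
        except Exception:
            content = await fake_llm(messages)  # plain-text path
            return json.loads(content)  # manual parsing fallback

    print(asyncio.run(merge([{"role": "user", "content": "merge backgrounds"}])))
    # {'background': 'I am a test bank'}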
@@ -310,7 +282,7 @@ Merged background:"""
     # Method 2: Extract from markdown code blocks
     if result is None:
         # Remove markdown code blocks
-        code_block_match = re.search(r…
+        code_block_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", content, re.DOTALL)
         if code_block_match:
             try:
                 result = json.loads(code_block_match.group(1))
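The regex above is the code-fence extractor, and it can be exercised standalone; the sample `content` string is illustrative only:

    import json
    import re

    content = 'Result:\n```json\n{"background": "I am a research assistant", "disposition": {"skepticism": 3}}\n```'
    code_block_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", content, re.DOTALL)
    if code_block_match:
        result = json.loads(code_block_match.group(1))
        print(result["background"])  # -> I am a research assistant

Note that the non-greedy `\{.*?\}` still captures the nested object: the match can only end at a brace that is followed by the closing fence.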
@@ -321,7 +293,9 @@ Merged background:"""
     # Method 3: Find nested JSON structure
     if result is None:
         # Look for JSON object with nested structure
-        json_match = re.search(…
+        json_match = re.search(
+            r'\{[^{}]*"background"[^{}]*"disposition"[^{}]*\{[^{}]*\}[^{}]*\}', content, re.DOTALL
+        )
         if json_match:
             try:
                 result = json.loads(json_match.group())
@@ -335,7 +309,7 @@ Merged background:"""
         # Fallback: use new_info as background with default disposition
         return {
             "background": new_info if new_info else current if current else "",
-            "disposition": DEFAULT_DISPOSITION.copy()
+            "disposition": DEFAULT_DISPOSITION.copy(),
         }
 
     # Validate disposition values
@@ -401,13 +375,15 @@ async def list_banks(pool) -> list:
         if isinstance(disposition_data, str):
             disposition_data = json.loads(disposition_data)
 
-        result.append(
-[… 7 more lines not rendered in the source diff …]
+        result.append(
+            {
+                "bank_id": row["bank_id"],
+                "name": row["name"],
+                "disposition": disposition_data,
+                "background": row["background"],
+                "created_at": row["created_at"].isoformat() if row["created_at"] else None,
+                "updated_at": row["updated_at"].isoformat() if row["updated_at"] else None,
+            }
+        )
 
     return result
hindsight_api/engine/retain/chunk_storage.py
@@ -3,20 +3,15 @@ Chunk storage for retain pipeline.
 
 Handles storage of document chunks in the database.
 """
+
 import logging
-from typing import List, Dict, Optional
 
 from .types import ChunkMetadata
 
 logger = logging.getLogger(__name__)
 
 
-async def store_chunks_batch(
-    conn,
-    bank_id: str,
-    document_id: str,
-    chunks: List[ChunkMetadata]
-) -> Dict[int, str]:
+async def store_chunks_batch(conn, bank_id: str, document_id: str, chunks: list[ChunkMetadata]) -> dict[int, str]:
     """
     Store document chunks in the database.
 
@@ -55,16 +50,13 @@ async def store_chunks_batch(
         [document_id] * len(chunk_texts),
         [bank_id] * len(chunk_texts),
         chunk_texts,
-        chunk_indices
+        chunk_indices,
     )
 
     return chunk_id_map
 
 
-def map_facts_to_chunks(
-    facts_chunk_indices: List[int],
-    chunk_id_map: Dict[int, str]
-) -> List[Optional[str]]:
+def map_facts_to_chunks(facts_chunk_indices: list[int], chunk_id_map: dict[int, str]) -> list[str | None]:
     """
     Map fact chunk indices to chunk IDs.
 
hindsight_api/engine/retain/deduplication.py
@@ -3,22 +3,17 @@ Deduplication logic for retain pipeline.
 
 Checks for duplicate facts using semantic similarity and temporal proximity.
 """
+
 import logging
-from datetime import datetime
-from typing import List
 from collections import defaultdict
+from datetime import UTC
 
 from .types import ProcessedFact
 
 logger = logging.getLogger(__name__)
 
 
-async def check_duplicates_batch(
-    conn,
-    bank_id: str,
-    facts: List[ProcessedFact],
-    duplicate_checker_fn
-) -> List[bool]:
+async def check_duplicates_batch(conn, bank_id: str, facts: list[ProcessedFact], duplicate_checker_fn) -> list[bool]:
     """
     Check which facts are duplicates using batched time-window queries.
 
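One compatibility note on the import swap above: `datetime.UTC` is an alias of `datetime.timezone.utc` introduced in Python 3.11, so this hunk implies a 3.11+ floor for the module unless it is guarded elsewhere:

    from datetime import UTC, timezone

    assert UTC is timezone.utc  # 3.11+ shorthand for the same singleton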
@@ -47,16 +42,12 @@ async def check_duplicates_batch(
 
         # Defensive: if both are None (shouldn't happen), use now()
         if fact_date is None:
-            from datetime import datetime
-[… 1 line not rendered in the source diff …]
+            from datetime import datetime
+
+            fact_date = datetime.now(UTC)
 
         # Round to 12-hour bucket to group similar times
-        bucket_key = fact_date.replace(
-            hour=(fact_date.hour // 12) * 12,
-            minute=0,
-            second=0,
-            microsecond=0
-        )
+        bucket_key = fact_date.replace(hour=(fact_date.hour // 12) * 12, minute=0, second=0, microsecond=0)
         time_buckets[bucket_key].append((idx, fact))
 
     # Process each bucket in batch
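The collapsed `replace(...)` call is the whole bucketing trick: each fact's timestamp is truncated to the start of its 12-hour window, so facts landing in the same window share one batched duplicate-check query. Standalone:

    from datetime import UTC, datetime

    fact_date = datetime(2024, 1, 15, 17, 42, 9, tzinfo=UTC)
    # 17:42 floors to the 12:00 bucket; 09:15 would floor to 00:00
    bucket_key = fact_date.replace(hour=(fact_date.hour // 12) * 12, minute=0, second=0, microsecond=0)
    print(bucket_key)  # -> 2024-01-15 12:00:00+00:00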
@@ -68,14 +59,7 @@ async def check_duplicates_batch(
         embeddings = [item[1].embedding for item in bucket_items]
 
         # Check duplicates for this time bucket
-        dup_flags = await duplicate_checker_fn(
-            conn,
-            bank_id,
-            texts,
-            embeddings,
-            bucket_date,
-            time_window_hours=24
-        )
+        dup_flags = await duplicate_checker_fn(conn, bank_id, texts, embeddings, bucket_date, time_window_hours=24)
 
         # Map results back to original indices
         for idx, is_dup in zip(indices, dup_flags):
@@ -84,10 +68,7 @@ def filter_duplicates(
     return all_is_duplicate
 
 
-def filter_duplicates(
-    facts: List[ProcessedFact],
-    is_duplicate_flags: List[bool]
-) -> List[ProcessedFact]:
+def filter_duplicates(facts: list[ProcessedFact], is_duplicate_flags: list[bool]) -> list[ProcessedFact]:
     """
     Filter out duplicate facts based on duplicate flags.
 
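For orientation, the flag-based filtering that `filter_duplicates` performs reduces to a zip-and-keep over two parallel lists, along these lines (a sketch, not the package's actual body):

    facts = ["fact-a", "fact-b", "fact-c"]
    is_duplicate_flags = [False, True, False]
    kept = [fact for fact, is_dup in zip(facts, is_duplicate_flags) if not is_dup]
    print(kept)  # -> ['fact-a', 'fact-c']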