markdown-memory-vec 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markdown_memory_vec-0.1.0.dist-info/METADATA +219 -0
- markdown_memory_vec-0.1.0.dist-info/RECORD +13 -0
- markdown_memory_vec-0.1.0.dist-info/WHEEL +4 -0
- markdown_memory_vec-0.1.0.dist-info/entry_points.txt +2 -0
- markdown_memory_vec-0.1.0.dist-info/licenses/LICENSE +21 -0
- memory_vec/__init__.py +73 -0
- memory_vec/__main__.py +109 -0
- memory_vec/embedder.py +137 -0
- memory_vec/indexer.py +307 -0
- memory_vec/interfaces.py +118 -0
- memory_vec/search.py +234 -0
- memory_vec/service.py +326 -0
- memory_vec/store.py +470 -0
memory_vec/indexer.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Markdown-to-vector indexing pipeline.
|
|
3
|
+
|
|
4
|
+
Reads Markdown files, splits them into overlapping chunks, computes
|
|
5
|
+
SHA-256 hashes for deduplication, embeds the text, and stores the
|
|
6
|
+
resulting vectors in a :class:`SqliteVecStore`.
|
|
7
|
+
|
|
8
|
+
Key design principles (following OpenClaw memsearch):
|
|
9
|
+
- Markdown files remain the **source of truth**; the vector index is a
|
|
10
|
+
derived acceleration structure.
|
|
11
|
+
- SHA-256 content hashing ensures we never re-embed unchanged chunks.
|
|
12
|
+
- YAML frontmatter is parsed for metadata (importance, type, tags).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import re
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any, Optional
|
|
21
|
+
|
|
22
|
+
import yaml
|
|
23
|
+
|
|
24
|
+
from .interfaces import IEmbedder
|
|
25
|
+
from .store import SqliteVecStore, content_hash
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Chunking constants (reference: OpenClaw ~400 tokens, 80 overlap)
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
_APPROX_CHARS_PER_TOKEN = 4 # rough heuristic for English text
|
|
33
|
+
_DEFAULT_CHUNK_TOKENS = 400
|
|
34
|
+
_DEFAULT_OVERLAP_TOKENS = 80
|
|
35
|
+
_CHUNK_SIZE = _DEFAULT_CHUNK_TOKENS * _APPROX_CHARS_PER_TOKEN # ~1600 chars
|
|
36
|
+
_OVERLAP_SIZE = _DEFAULT_OVERLAP_TOKENS * _APPROX_CHARS_PER_TOKEN # ~320 chars
|
|
37
|
+
|
|
38
|
+
# Regex for YAML frontmatter delimited by ---
|
|
39
|
+
_FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.DOTALL)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# Frontmatter parsing
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
def _parse_frontmatter(text: str) -> tuple[dict[str, Any], str]:
    """Split *text* into ``(metadata, body)`` using YAML frontmatter.

    The frontmatter block is the leading ``--- ... ---`` section matched by
    ``_FRONTMATTER_RE``. When it is absent, the metadata dict is empty and
    the body is the original text unchanged. Malformed YAML, or YAML that
    parses to something other than a mapping, also yields empty metadata —
    a bad header never aborts indexing.
    """
    match = _FRONTMATTER_RE.match(text)
    if match is None:
        return {}, text

    body = text[match.end() :]
    try:
        parsed = yaml.safe_load(match.group(1))
    except yaml.YAMLError:
        parsed = None

    # Anything that isn't a mapping (list, scalar, None) is treated as "no metadata".
    metadata = parsed if isinstance(parsed, dict) else {}
    return metadata, body
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# Chunking
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
def _chunk_text(
    text: str,
    chunk_size: int = _CHUNK_SIZE,
    overlap_size: int = _OVERLAP_SIZE,
) -> list[str]:
    """Split *text* into overlapping chunks.

    Strategy:
      1. Split on ``\\n\\n`` (paragraph boundaries) first.
      2. Accumulate paragraphs until the chunk would exceed *chunk_size*.
      3. Adjacent chunks share up to *overlap_size* characters of trailing
         paragraphs as context.
      4. Text no longer than *chunk_size* is returned as a single chunk.

    Note: a single paragraph longer than *chunk_size* is emitted whole
    rather than being split mid-paragraph.
    """
    if not text.strip():
        return []

    # Short input: no splitting required.
    if len(text) <= chunk_size:
        return [text.strip()]

    out: list[str] = []
    buf: list[str] = []
    buf_len = 0

    def _flush() -> None:
        # Emit the accumulated paragraphs as one chunk (if non-empty).
        joined = "\n\n".join(buf).strip()
        if joined:
            out.append(joined)

    def _tail_overlap() -> tuple[list[str], int]:
        # Collect whole trailing paragraphs of ``buf`` that fit in overlap_size.
        kept: list[str] = []
        kept_len = 0
        for piece in reversed(buf):
            if kept_len + len(piece) > overlap_size:
                break
            kept.insert(0, piece)
            kept_len += len(piece)
        return kept, kept_len

    for paragraph in (p.strip() for p in text.split("\n\n")):
        if not paragraph:
            continue
        if buf and buf_len + len(paragraph) > chunk_size:
            _flush()
            # Seed the next chunk with trailing context from this one.
            buf, buf_len = _tail_overlap()
        buf.append(paragraph)
        buf_len += len(paragraph)

    # Whatever remains becomes the final chunk.
    if buf:
        _flush()

    return out
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# Indexer
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
class MemoryIndexer:
    """Build and maintain the vector index for Markdown memory files.

    Parameters
    ----------
    store:
        The :class:`SqliteVecStore` to write embeddings to.
    embedder:
        An :class:`IEmbedder` implementation used for text → vector.
    memory_root:
        Root directory of the memory files. When provided, all stored
        ``file_path`` values are converted to paths **relative** to this
        root so that the index is portable and free of duplicates caused
        by mixing absolute / relative paths.
    chunk_size:
        Target chunk size in characters (default ~1600 ≈ 400 tokens).
    overlap_size:
        Overlap between adjacent chunks in characters (default ~320 ≈ 80 tokens).
    """

    def __init__(
        self,
        store: SqliteVecStore,
        embedder: IEmbedder,
        memory_root: Optional[str | Path] = None,
        chunk_size: int = _CHUNK_SIZE,
        overlap_size: int = _OVERLAP_SIZE,
    ) -> None:
        self._store = store
        self._embedder = embedder
        self._memory_root: Optional[Path] = Path(memory_root).resolve() if memory_root else None
        self._chunk_size = chunk_size
        self._overlap_size = overlap_size

    # -- public API ----------------------------------------------------------

    def index_file(self, file_path: str | Path) -> int:
        """Index a single Markdown file.

        Returns the number of *new or updated* chunks that were embedded.
        Chunks whose SHA-256 hash has not changed are skipped.
        """
        path = Path(file_path).resolve()
        if not path.is_file():
            logger.warning("index_file: %s does not exist or is not a file", path)
            return 0

        raw_text = path.read_text(encoding="utf-8")
        meta, body = _parse_frontmatter(raw_text)

        # Frontmatter values are user-authored and may be malformed
        # (e.g. ``importance: high``); fall back to the default rather
        # than aborting the whole indexing run with a ValueError.
        try:
            importance = float(meta.get("importance", 0.5))
        except (TypeError, ValueError):
            importance = 0.5
        memory_type = str(meta.get("type", "semantic"))
        tags: list[str] = meta.get("tags", []) or []
        if not isinstance(tags, list):
            tags = [str(tags)]

        file_key = self._file_key(path)
        existing_hashes = self._store.get_hashes_for_file(file_key)

        chunks = _chunk_text(body, self._chunk_size, self._overlap_size)
        if not chunks:
            # BUGFIX: a file whose body became empty must not leave stale
            # chunks from a previous indexing round behind in the store.
            for old_idx in existing_hashes:
                self._delete_chunk(file_key, old_idx)
            return 0

        new_or_updated = 0
        # Track which chunk indexes we process this round so stale ones
        # (beyond the new chunk count) can be removed afterwards.
        current_indexes: set[int] = set()

        for idx, chunk in enumerate(chunks):
            current_indexes.add(idx)
            chunk_hash = content_hash(chunk)

            old_hash = existing_hashes.get(idx)
            if old_hash == chunk_hash:
                # Unchanged — skip re-embedding.
                continue

            if old_hash is not None:
                # Hash changed — delete the old row(s) before re-inserting.
                self._delete_chunk(file_key, idx)

            embedding = self._embedder.embed(chunk)
            self._store.insert_embedding(
                embedding=embedding,
                file_path=file_key,
                chunk_index=idx,
                chunk_text=chunk,
                hash_value=chunk_hash,
                importance=importance,
                memory_type=memory_type,
                tags=tags,
            )
            new_or_updated += 1

        # Remove stale chunks (old chunks beyond the new chunk count).
        for old_idx in set(existing_hashes) - current_indexes:
            self._delete_chunk(file_key, old_idx)

        if new_or_updated:
            logger.info("Indexed %s: %d chunks embedded (%d total)", path.name, new_or_updated, len(chunks))
        return new_or_updated

    def index_directory(self, dir_path: str | Path) -> int:
        """Recursively index all ``.md`` files under *dir_path*.

        Returns the total number of new/updated chunks.
        """
        root = Path(dir_path)
        if not root.is_dir():
            logger.warning("index_directory: %s is not a directory", root)
            return 0
        # Sorted for deterministic indexing order across runs.
        return sum(self.index_file(md_file) for md_file in sorted(root.rglob("*.md")))

    def reindex_all(self, memory_root: str | Path) -> int:
        """Drop all existing index data and rebuild from scratch.

        Parameters
        ----------
        memory_root:
            Root directory containing Markdown memory files.

        Returns the total number of chunks indexed.
        """
        # Clear everything via the store's public clear() method.
        self._store.clear()
        return self.index_directory(Path(memory_root))

    def remove_file(self, file_path: str | Path) -> int:
        """Remove all indexed chunks for *file_path*.

        Returns the number of rows deleted.
        """
        path = Path(file_path).resolve()
        return self._store.delete_by_file(self._file_key(path))

    # -- internal helpers ----------------------------------------------------

    def _file_key(self, path: Path) -> str:
        """Canonical store key for *path*: relative to ``memory_root`` when
        possible, absolute otherwise.

        Using relative paths avoids duplicate index entries caused by
        mixing absolute and relative spellings of the same file.
        """
        if self._memory_root and path.is_relative_to(self._memory_root):
            return str(path.relative_to(self._memory_root))
        return str(path)

    def _delete_chunk(self, file_path: str, chunk_index: int) -> None:
        """Delete a specific chunk (by file_path + chunk_index) from both tables."""
        conn = self._store.connection
        rows = conn.execute(
            "SELECT id FROM memory_vec_meta WHERE file_path = ? AND chunk_index = ?",
            (file_path, chunk_index),
        ).fetchall()
        for (rowid,) in rows:
            self._store.delete_embedding(rowid)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
# ---------------------------------------------------------------------------
|
|
294
|
+
# Module-level helpers exposed for testing / scripting
|
|
295
|
+
# ---------------------------------------------------------------------------
|
|
296
|
+
def chunk_text(
    text: str,
    chunk_size: int = _CHUNK_SIZE,
    overlap_size: int = _OVERLAP_SIZE,
) -> list[str]:
    """Split *text* into overlapping chunks (public API).

    Thin facade over the module-internal chunker; see :func:`_chunk_text`
    for the chunking strategy and edge-case behavior.
    """
    return _chunk_text(text, chunk_size=chunk_size, overlap_size=overlap_size)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def parse_frontmatter(text: str) -> tuple[dict[str, Any], str]:
    """Extract YAML frontmatter from *text* (public API).

    Thin facade over :func:`_parse_frontmatter`; returns ``(metadata, body)``
    with empty metadata when no valid frontmatter block is present.
    """
    return _parse_frontmatter(text)
|
memory_vec/interfaces.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Interfaces for vector infrastructure components.
|
|
3
|
+
|
|
4
|
+
These are the contracts that concrete implementations (store, embedder) must fulfill.
|
|
5
|
+
This file serves as the integration point between components.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Any, Dict, List, Optional, Sequence
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class VectorSearchResult:
    """Raw result from a vector KNN search.

    Produced by :meth:`ISqliteVecStore.search`; carries the matched record's
    id, the backend-reported distance, and the metadata payload that was
    stored alongside the vector.
    """

    # Identifier of the matched record in the store.
    id: str
    # Raw distance from the backend. NOTE(review): whether lower or higher
    # means "more similar" depends on the configured metric — the original
    # comment ("Lower = more similar (L2) or higher = more similar (cosine)")
    # is ambiguous; confirm against the concrete store's distance metric.
    distance: float
    # Arbitrary metadata stored with the vector (e.g. file path, chunk text).
    metadata: Dict[str, Any]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class VectorRecord:
    """A record stored in the vector store.

    The unit of storage accepted by :meth:`ISqliteVecStore.add`.
    """

    # Unique identifier for the record.
    id: str
    # Dense embedding vector. Presumably its length must match the
    # embedder's ``dimension`` — not enforced by this dataclass.
    embedding: List[float]
    # Arbitrary metadata to round-trip with the vector.
    metadata: Dict[str, Any]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class IEmbedder(ABC):
    """Interface for text embedding models.

    Implementations turn text into fixed-size float vectors; the vector
    length is exposed via :attr:`dimension`.
    """

    @abstractmethod
    def embed(self, text: str) -> List[float]:
        """Embed a single text string into a vector.

        Args:
            text: The text to embed.

        Returns:
            A list of floats representing the embedding vector. Its length
            is expected to equal :attr:`dimension`.
        """
        ...

    @abstractmethod
    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed multiple texts into vectors.

        Args:
            texts: A list of texts to embed.

        Returns:
            A list of embedding vectors, one per input text, in the same
            order as *texts*.
        """
        ...

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Return the dimension of the embedding vectors."""
        ...
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class ISqliteVecStore(ABC):
    """Interface for sqlite-vec based vector storage.

    Concrete implementations use sqlite-vec for KNN search over embedding
    vectors. Records are addressed by string id and carry a free-form
    metadata dict (see :class:`VectorRecord`).
    """

    @abstractmethod
    def add(self, records: Sequence[VectorRecord]) -> None:
        """Add records to the vector store.

        Args:
            records: Sequence of VectorRecord to add.
        """
        ...

    @abstractmethod
    def search(
        self,
        query_embedding: List[float],
        top_k: int = 10,
        filter_metadata: Optional[Dict[str, Any]] = None,
    ) -> List[VectorSearchResult]:
        """Perform KNN search.

        Args:
            query_embedding: The query vector.
            top_k: Number of results to return.
            filter_metadata: Optional metadata filters. NOTE(review): the
                exact matching semantics (equality? AND across keys?) are
                implementation-defined — confirm against the concrete store.

        Returns:
            List of VectorSearchResult sorted by relevance.
        """
        ...

    @abstractmethod
    def delete(self, ids: Sequence[str]) -> None:
        """Delete records by IDs.

        Args:
            ids: IDs of records to delete.
        """
        ...

    @abstractmethod
    def clear(self) -> None:
        """Delete all records from the store."""
        ...

    @abstractmethod
    def count(self) -> int:
        """Return the number of records in the store."""
        ...
|
memory_vec/search.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hybrid search service combining semantic similarity, importance weighting, and temporal decay.
|
|
3
|
+
|
|
4
|
+
The hybrid retrieval formula:
|
|
5
|
+
score = α × semantic_similarity(query, memory) # sqlite-vec KNN
|
|
6
|
+
+ β × importance_weight(memory.importance) # frontmatter
|
|
7
|
+
+ γ × temporal_decay(memory.last_accessed) # frontmatter
|
|
8
|
+
|
|
9
|
+
temporal_decay = exp(-λ × days_since_access)
|
|
10
|
+
Default weights: α=0.6, β=0.2, γ=0.2, λ=0.05
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import math
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from typing import Any, Dict, List, Optional
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class SearchResult:
    """Result of a hybrid search combining semantic, importance, and temporal signals.

    ``hybrid_score`` is the weighted combination
    α × semantic_score + β × importance + γ × temporal_decay computed by
    :class:`HybridSearchService`; the three component scores are kept so
    callers can inspect why a result ranked where it did.
    """

    # Store key of the source Markdown file (relative or absolute path).
    file_path: str
    # The matched chunk's text.
    chunk_text: str
    # Position of the chunk within its file.
    chunk_index: int
    # 0.0-1.0, cosine similarity derived from the KNN distance.
    semantic_score: float
    # 0.0-1.0, importance taken from the memory's frontmatter.
    importance: float
    # 0.0-1.0, exp(-λ × days since last access); 0.5 when access time unknown.
    temporal_decay: float
    # Weighted combination of the three components above.
    hybrid_score: float
    # Optional memory classification (e.g. "semantic", "episodic").
    memory_type: Optional[str] = None
    # Tags from frontmatter; empty when none were recorded.
    tags: list[str] = field(default_factory=list)
    # Parsed last-access timestamp, when present in metadata.
    last_accessed: Optional[datetime] = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class HybridSearchService:
    """
    Hybrid search service that combines:
      1. Semantic similarity (via sqlite-vec KNN search)
      2. Importance weighting (from memory frontmatter)
      3. Temporal decay (based on last access time)

    The combination formula is:
        score = α × semantic + β × importance + γ × temporal_decay

    All imports of vector infrastructure (ISqliteVecStore, IEmbedder) are optional
    to support graceful degradation when sqlite-vec is not installed.
    """

    def __init__(
        self,
        vec_store: Any,  # ISqliteVecStore — typed as Any for optional import safety
        embedder: Any,  # IEmbedder — typed as Any for optional import safety
        alpha: float = 0.6,
        beta: float = 0.2,
        gamma: float = 0.2,
        decay_lambda: float = 0.05,
    ):
        """
        Initialize the hybrid search service.

        Args:
            vec_store: Vector store implementing ISqliteVecStore interface.
            embedder: Embedding model implementing IEmbedder interface.
            alpha: Weight for semantic similarity (default 0.6).
            beta: Weight for importance score (default 0.2).
            gamma: Weight for temporal decay (default 0.2).
            decay_lambda: Decay rate for temporal scoring (default 0.05).

        Raises:
            ValueError: If weights don't sum to approximately 1.0.
        """
        weight_sum = alpha + beta + gamma
        if abs(weight_sum - 1.0) > 1e-6:
            raise ValueError(f"Weights must sum to 1.0, got α={alpha} + β={beta} + γ={gamma} = {weight_sum}")

        self.vec_store = vec_store
        self.embedder = embedder
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.decay_lambda = decay_lambda

    def search(
        self,
        query: str,
        top_k: int = 10,
        memory_type: Optional[str] = None,
        min_score: float = 0.0,
    ) -> list[SearchResult]:
        """
        Perform hybrid search combining semantic, importance, and temporal signals.

        Args:
            query: The search query text.
            top_k: Maximum number of results to return.
            memory_type: Optional filter by memory type (e.g., "semantic", "episodic").
            min_score: Minimum hybrid score threshold (0.0-1.0).

        Returns:
            List of SearchResult sorted by hybrid_score descending.
        """
        if not query.strip():
            return []

        # Step 1: Embed the query. A broken embedder degrades to "no
        # results" instead of propagating into the caller.
        try:
            query_embedding = self.embedder.embed(query)
        except Exception:
            logger.warning("Failed to embed query, returning empty results", exc_info=True)
            return []

        # Step 2: KNN search via vector store.
        # Request more candidates than top_k to allow for filtering and
        # re-ranking — but never fewer than top_k itself.
        # BUGFIX: the previous expression min(top_k * 3, 100) silently
        # capped the candidate pool below top_k whenever callers asked for
        # top_k > 100, truncating the final result list.
        candidate_k = max(top_k, min(top_k * 3, 100))
        filter_metadata: Optional[Dict[str, Any]] = None
        if memory_type:
            filter_metadata = {"memory_type": memory_type}

        try:
            raw_results = self.vec_store.search(
                query_embedding=query_embedding,
                top_k=candidate_k,
                filter_metadata=filter_metadata,
            )
        except Exception:
            logger.warning("Vector search failed, returning empty results", exc_info=True)
            return []

        # Step 3: Compute hybrid scores.
        results: list[SearchResult] = []
        for raw in raw_results:
            metadata = raw.metadata or {}

            # Extract fields from metadata.
            file_path = metadata.get("file_path", "")
            chunk_text = metadata.get("chunk_text", "")
            chunk_index = metadata.get("chunk_index", 0)
            tags = metadata.get("tags", [])
            mem_type = metadata.get("memory_type")
            last_accessed_str = metadata.get("last_accessed")

            # ROBUSTNESS: metadata is user/store supplied; a non-numeric
            # importance must not abort the whole result loop.
            try:
                importance = float(metadata.get("importance", 0.5))
            except (TypeError, ValueError):
                importance = 0.5

            # Parse last_accessed (stored as an ISO-8601 string); bad or
            # missing values fall back to the neutral decay default.
            last_accessed: Optional[datetime] = None
            if last_accessed_str:
                try:
                    last_accessed = datetime.fromisoformat(str(last_accessed_str))
                except (ValueError, TypeError):
                    last_accessed = None

            # Normalize semantic score: convert cosine distance to similarity.
            # sqlite-vec with distance_metric=cosine returns distance in [0, 2]:
            # 0 = identical, 1 = orthogonal, 2 = opposite.
            # Similarity = 1 - distance maps to [-1, 1]; we clamp to [0, 1].
            semantic_score = max(0.0, min(1.0, 1.0 - raw.distance))

            temporal_decay = self.compute_temporal_decay(last_accessed)

            # Clamp importance to [0, 1].
            importance = max(0.0, min(1.0, importance))

            hybrid_score = self.compute_hybrid_score(semantic_score, importance, temporal_decay)

            if hybrid_score >= min_score:
                results.append(
                    SearchResult(
                        file_path=file_path,
                        chunk_text=chunk_text,
                        chunk_index=chunk_index,
                        semantic_score=semantic_score,
                        importance=importance,
                        temporal_decay=temporal_decay,
                        hybrid_score=hybrid_score,
                        memory_type=mem_type,
                        tags=tags if isinstance(tags, list) else [],
                        last_accessed=last_accessed,
                    )
                )

        # Step 4: Sort by hybrid score descending and return top_k.
        results.sort(key=lambda r: r.hybrid_score, reverse=True)
        return results[:top_k]

    def compute_temporal_decay(self, last_accessed: Optional[datetime]) -> float:
        """
        Compute temporal decay factor: exp(-λ × days_since_access).

        Args:
            last_accessed: When the memory was last accessed. If None, returns 0.5
                (neutral — neither penalized nor boosted).

        Returns:
            A float in [0, 1] where 1.0 means "just accessed" and approaches 0.0
            for very old memories.
        """
        if last_accessed is None:
            return 0.5  # Neutral default for memories without access time

        now = datetime.now(timezone.utc)

        # Naive timestamps are treated as UTC so the subtraction is well-defined.
        if last_accessed.tzinfo is None:
            last_accessed = last_accessed.replace(tzinfo=timezone.utc)

        delta = now - last_accessed
        # Clamp at 0 so future-dated timestamps don't boost above 1.0.
        days_since_access = max(0.0, delta.total_seconds() / 86400.0)

        return math.exp(-self.decay_lambda * days_since_access)

    def compute_hybrid_score(
        self,
        semantic_score: float,
        importance: float,
        temporal_decay: float,
    ) -> float:
        """
        Compute the weighted hybrid score.

        Args:
            semantic_score: Cosine similarity score [0, 1].
            importance: Importance weight from frontmatter [0, 1].
            temporal_decay: Temporal decay factor [0, 1].

        Returns:
            Weighted hybrid score = α × semantic + β × importance + γ × temporal_decay.
        """
        return self.alpha * semantic_score + self.beta * importance + self.gamma * temporal_decay
|