PyPI - echo-vector - Versions diffs - 0.1.1__py3-none-any.whl - Mend

echo-vector 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

echo_vector-0.1.1.dist-info/METADATA +288 -0
echo_vector-0.1.1.dist-info/RECORD +38 -0
echo_vector-0.1.1.dist-info/WHEEL +4 -0
echo_vector-0.1.1.dist-info/entry_points.txt +2 -0
echovector/__init__.py +7 -0
echovector/api/__init__.py +1 -0
echovector/api/server.py +144 -0
echovector/audio/__init__.py +12 -0
echovector/audio/chunker.py +71 -0
echovector/audio/metadata.py +58 -0
echovector/audio/processor.py +53 -0
echovector/audio/streaming.py +33 -0
echovector/cli/__init__.py +1 -0
echovector/cli/main.py +165 -0
echovector/core.py +289 -0
echovector/embeddings/__init__.py +15 -0
echovector/embeddings/ast_model.py +41 -0
echovector/embeddings/base.py +43 -0
echovector/embeddings/cache.py +96 -0
echovector/embeddings/clap.py +126 -0
echovector/embeddings/factory.py +78 -0
echovector/embeddings/hubert.py +41 -0
echovector/embeddings/local.py +109 -0
echovector/embeddings/wav2vec2.py +41 -0
echovector/embeddings/whisper_enc.py +44 -0
echovector/evaluation/__init__.py +1 -0
echovector/evaluation/metrics.py +45 -0
echovector/indexing/__init__.py +12 -0
echovector/indexing/base.py +105 -0
echovector/indexing/faiss_index.py +182 -0
echovector/indexing/store.py +165 -0
echovector/search/__init__.py +14 -0
echovector/search/engine.py +82 -0
echovector/search/filters.py +55 -0
echovector/search/results.py +41 -0
echovector/utils/__init__.py +6 -0
echovector/utils/config.py +69 -0
echovector/utils/logging.py +31 -0

echovector/indexing/faiss_index.py ADDED Viewed

@@ -0,0 +1,182 @@
+"""Faiss-based index implementation."""
+import os
+from typing import Any, cast
+import faiss
+import numpy as np
+import numpy.typing as npt
+from .base import BaseIndex
+from .store import SQLiteStore
+class FaissIndex(BaseIndex):
+    """Faiss-based index using IndexFlatIP (Inner Product) for vector search."""
+    def __init__(self, dimension: int, db_path: str = ":memory:") -> None:
+        """Initialize the Faiss index and the metadata store.
+        Args:
+            dimension: Dimensionality of the embeddings.
+            db_path: Path to the SQLite store database.
+        """
+        self.dimension = dimension
+        self.index: faiss.IndexIDMap2 = faiss.IndexIDMap2(faiss.IndexFlatIP(dimension))
+        self.store = SQLiteStore(db_path)
+    def add(
+        self,
+        embeddings: npt.NDArray[np.float32],
+        ids: list[str],
+        metadata: list[dict[str, Any]] | None = None,
+    ) -> None:
+        """Add embeddings, their string IDs, and metadata to the index.
+        Supports batched and incremental indexing.
+        Args:
+            embeddings: A 2D numpy array of embeddings (np.float32).
+            ids: A list of string IDs corresponding to the embeddings.
+            metadata: An optional list of metadata dictionaries.
+        Raises:
+            ValueError: If input dimensions or lengths are invalid.
+        """
+        if embeddings.ndim != 2 or embeddings.shape[1] != self.dimension:
+            raise ValueError(f"Embeddings must be a 2D array with dimension {self.dimension}")
+        num_vectors = embeddings.shape[0]
+        if len(ids) != num_vectors:
+            raise ValueError("Number of IDs must match the number of embeddings.")
+        if metadata is None:
+            metadata = [{} for _ in range(num_vectors)]
+        elif len(metadata) != num_vectors:
+            raise ValueError("Length of metadata must match the number of embeddings.")
+        # Ensure embeddings are contiguous and float32
+        embeddings_f32 = np.ascontiguousarray(embeddings, dtype=np.float32)
+        # Use max stored ID + 1 so IDs remain unique after deletions.
+        start_id = self.store.get_max_int_id() + 1
+        int_ids = list(range(start_id, start_id + num_vectors))
+        # Add to Faiss index with explicit IDs
+        self.index.add_with_ids(embeddings_f32, np.array(int_ids, dtype=np.int64))
+        # Add to SQLite store
+        self.store.add(int_ids, ids, metadata)
+    def search(
+        self, query_embeddings: npt.NDArray[np.float32], k: int = 10
+    ) -> tuple[
+        npt.NDArray[np.float32],
+        list[list[str | None]],
+        list[list[dict[str, Any] | None]],
+    ]:
+        """Search for the k nearest neighbors.
+        Args:
+            query_embeddings: 2D numpy array of query vectors.
+            k: Number of nearest neighbors to retrieve.
+        Returns:
+            A tuple of (distances, string_ids, metadata).
+        Raises:
+            ValueError: If query embeddings dimensions are invalid.
+        """
+        if query_embeddings.ndim != 2 or query_embeddings.shape[1] != self.dimension:
+            raise ValueError(
+                f"Query embeddings must be a 2D array with dimension {self.dimension}"
+            )
+        if self.index.ntotal == 0:
+            return (
+                np.array([], dtype=np.float32).reshape(query_embeddings.shape[0], 0),
+                [[] for _ in range(query_embeddings.shape[0])],
+                [[] for _ in range(query_embeddings.shape[0])],
+            )
+        query_f32 = np.ascontiguousarray(query_embeddings, dtype=np.float32)
+        # Ensure k is not larger than index size
+        actual_k = min(k, self.index.ntotal)
+        distances, int_indices = self.index.search(query_f32, actual_k)
+        all_string_ids: list[list[str | None]] = []
+        all_metadata: list[list[dict[str, Any] | None]] = []
+        for indices in int_indices:
+            # -1 is returned by Faiss if not enough results are found
+            valid_indices = [int(idx) for idx in indices if idx != -1]
+            if not valid_indices:
+                all_string_ids.append([])
+                all_metadata.append([])
+                continue
+            str_ids, meta = self.store.get_by_int_ids(valid_indices)
+            # Reconstruct list to handle -1
+            row_str_ids: list[str | None] = []
+            row_meta: list[dict[str, Any] | None] = []
+            valid_idx_ptr = 0
+            for idx in indices:
+                if idx == -1:
+                    row_str_ids.append(None)
+                    row_meta.append(None)
+                else:
+                    row_str_ids.append(str_ids[valid_idx_ptr])
+                    row_meta.append(meta[valid_idx_ptr])
+                    valid_idx_ptr += 1
+            all_string_ids.append(row_str_ids)
+            all_metadata.append(row_meta)
+        return distances, all_string_ids, all_metadata
+    def remove_int_ids(self, int_ids: list[int]) -> None:
+        """Remove vectors by their integer IDs from the FAISS index and metadata store.
+        Args:
+            int_ids: List of integer IDs to remove.
+        """
+        if not int_ids:
+            return
+        ids_array = np.array(int_ids, dtype=np.int64)
+        self.index.remove_ids(faiss.IDSelectorBatch(ids_array))
+        placeholders = ",".join("?" for _ in int_ids)
+        delete_query = f"DELETE FROM metadata WHERE int_id IN ({placeholders})"  # noqa: S608
+        self.store._conn.execute(delete_query, int_ids)
+        self.store._conn.commit()
+    def save(self, index_path: str) -> None:
+        """Save the Faiss index to disk.
+        Args:
+            index_path: The file path to save the index to.
+        """
+        parent = os.path.dirname(index_path)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        faiss.write_index(self.index, index_path)
+    def load(self, index_path: str) -> None:
+        """Load the Faiss index from disk.
+        Args:
+            index_path: The file path to load the index from.
+        Raises:
+            FileNotFoundError: If the index file does not exist.
+            ValueError: If the loaded index dimension does not match.
+        """
+        if not os.path.exists(index_path):
+            raise FileNotFoundError(f"Index file {index_path} not found.")
+        self.index = cast("faiss.IndexIDMap2", faiss.read_index(index_path))
+        if self.index.d != self.dimension:
+            raise ValueError(
+                f"Loaded index dimension ({self.index.d}) does not match "
+                f"expected dimension ({self.dimension})"
+            )

echovector/indexing/store.py ADDED Viewed

@@ -0,0 +1,165 @@
+"""SQLite-based store for metadata persistence."""
+import contextlib
+import json
+import sqlite3
+from typing import Any
+from .base import BaseStore
+class SQLiteStore(BaseStore):
+    """SQLite-based store for metadata and string ID persistence."""
+    def __init__(self, db_path: str = ":memory:") -> None:
+        """Initialize the SQLite store.
+        Args:
+            db_path: Path to the SQLite database file.
+        """
+        self.db_path = db_path
+        self._conn = sqlite3.connect(self.db_path, check_same_thread=False)
+        self.initialize()
+    def initialize(self) -> None:
+        """Initialize the database schema."""
+        cursor = self._conn.cursor()
+        cursor.execute(
+            """
+            CREATE TABLE IF NOT EXISTS metadata (
+                int_id INTEGER PRIMARY KEY,
+                string_id TEXT UNIQUE NOT NULL,
+                metadata_json TEXT
+            )
+            """
+        )
+        self._conn.commit()
+    def add(
+        self, int_ids: list[int], string_ids: list[str], metadata_list: list[dict[str, Any]]
+    ) -> None:
+        """Add metadata and ID mappings to the store.
+        Args:
+            int_ids: List of integer IDs assigned by the index.
+            string_ids: List of original string IDs.
+            metadata_list: List of metadata dictionaries.
+        Raises:
+            ValueError: If lengths of input lists do not match.
+        """
+        if not (len(int_ids) == len(string_ids) == len(metadata_list)):
+            raise ValueError("Mismatched lengths for IDs and metadata.")
+        cursor = self._conn.cursor()
+        data = [
+            (i_id, s_id, json.dumps(meta))
+            for i_id, s_id, meta in zip(int_ids, string_ids, metadata_list, strict=True)
+        ]
+        cursor.executemany(
+            """
+            INSERT OR REPLACE INTO metadata (int_id, string_id, metadata_json)
+            VALUES (?, ?, ?)
+            """,
+            data,
+        )
+        self._conn.commit()
+    def get_by_int_ids(
+        self, int_ids: list[int]
+    ) -> tuple[list[str | None], list[dict[str, Any] | None]]:
+        """Retrieve string IDs and metadata for a list of integer IDs.
+        Args:
+            int_ids: List of integer IDs to query.
+        Returns:
+            A tuple containing a list of string IDs and a list of metadata dictionaries.
+        """
+        if not int_ids:
+            return [], []
+        cursor = self._conn.cursor()
+        placeholders = ",".join("?" for _ in int_ids)
+        cols = "int_id, string_id, metadata_json"
+        query = f"SELECT {cols} FROM metadata WHERE int_id IN ({placeholders})"  # noqa: S608
+        cursor.execute(query, int_ids)
+        rows = cursor.fetchall()
+        row_dict = {row[0]: (row[1], json.loads(row[2])) for row in rows}
+        string_ids: list[str | None] = []
+        metadata: list[dict[str, Any] | None] = []
+        for i_id in int_ids:
+            if i_id in row_dict:
+                string_ids.append(row_dict[i_id][0])
+                metadata.append(row_dict[i_id][1])
+            else:
+                string_ids.append(None)
+                metadata.append(None)
+        return string_ids, metadata
+    def get_max_int_id(self) -> int:
+        """Get the maximum integer ID currently in the store.
+        Returns:
+            The maximum integer ID, or -1 if empty.
+        """
+        cursor = self._conn.cursor()
+        cursor.execute("SELECT MAX(int_id) FROM metadata")
+        row = cursor.fetchone()
+        return int(row[0]) if row[0] is not None else -1
+    def has_filepath(self, filepath: str) -> bool:
+        """Return True if any chunk from filepath is already stored.
+        Args:
+            filepath: Absolute or relative path of the source audio file.
+        """
+        cursor = self._conn.cursor()
+        cursor.execute(
+            "SELECT 1 FROM metadata WHERE string_id LIKE ? LIMIT 1",
+            (filepath + "#%",),
+        )
+        return cursor.fetchone() is not None
+    def get_int_ids_for_filepath(self, filepath: str) -> list[int]:
+        """Return all integer IDs belonging to chunks of filepath.
+        Args:
+            filepath: Source audio file path.
+        """
+        cursor = self._conn.cursor()
+        cursor.execute(
+            "SELECT int_id FROM metadata WHERE string_id LIKE ?",
+            (filepath + "#%",),
+        )
+        return [row[0] for row in cursor.fetchall()]
+    def delete_by_filepath(self, filepath: str) -> list[int]:
+        """Delete all chunks for filepath and return their integer IDs.
+        Args:
+            filepath: Source audio file path.
+        Returns:
+            List of integer IDs that were removed.
+        """
+        int_ids = self.get_int_ids_for_filepath(filepath)
+        if int_ids:
+            placeholders = ",".join("?" for _ in int_ids)
+            delete_query = f"DELETE FROM metadata WHERE int_id IN ({placeholders})"  # noqa: S608
+            self._conn.execute(delete_query, int_ids)
+            self._conn.commit()
+        return int_ids
+    def close(self) -> None:
+        """Close the database connection."""
+        self._conn.close()
+    def __del__(self) -> None:
+        """Ensure connection is closed on GC to avoid Python 3.13 sqlite3 finalizer bug."""
+        with contextlib.suppress(Exception):
+            self._conn.close()

echovector/search/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Search module for EchoVector."""
+from echovector.search.engine import Embedder, SearchEngine, VectorIndex
+from echovector.search.filters import SearchFilter
+from echovector.search.results import SearchResult, TimestampRange
+# Public names re-exported from the engine, filters and results submodules.
+__all__ = [
+    "Embedder",
+    "SearchEngine",
+    "SearchFilter",
+    "SearchResult",
+    "TimestampRange",
+    "VectorIndex",
+]

echovector/search/engine.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Search engine implementation."""
+from typing import Any, Protocol
+from echovector.search.filters import SearchFilter
+from echovector.search.results import SearchResult, TimestampRange
+class Embedder(Protocol):
+    """Protocol for text embedders."""
+    def embed_text(self, text: str) -> list[float]:
+        """Embed a text query into a vector."""
+        ...
+class VectorIndex(Protocol):
+    """Protocol for vector indices."""
+    def search(self, vector: list[float], top_k: int) -> list[dict[str, Any]]:
+        """Search the index.
+        Expected to return a list of dictionaries, each containing:
+        - 'filepath': str
+        - 'start': float
+        - 'end': float
+        - 'score': float
+        - 'metadata': dict (optional)
+        """
+        ...
+class SearchEngine:
+    """Engine for executing searches against an index."""
+    def __init__(self, index: VectorIndex, embedder: Embedder) -> None:
+        """Initialize the search engine.
+        Args:
+            index: The vector index to search against.
+            embedder: The embedder to use for queries.
+        """
+        self._index = index
+        self._embedder = embedder
+    def search(
+        self, query: str, top_k: int = 10, filters: SearchFilter | None = None
+    ) -> list[SearchResult]:
+        """Search the index for a given query.
+        Args:
+            query: The text query.
+            top_k: Number of results to return.
+            filters: Optional filters to apply.
+        Returns:
+            A list of hydrated SearchResult objects.
+        """
+        vector = self._embedder.embed_text(query)
+        # If filtering, fetch more to ensure we have enough post-filter
+        fetch_k = top_k * 5 if filters else top_k
+        raw_results = self._index.search(vector, fetch_k)
+        results = []
+        for raw in raw_results:
+            # Safely get metadata
+            metadata = raw.get("metadata", {})
+            result = SearchResult(
+                filepath=raw["filepath"],
+                timestamp_range=TimestampRange(start=raw["start"], end=raw["end"]),
+                score=raw["score"],
+                metadata=metadata,
+            )
+            results.append(result)
+        if filters:
+            results = filters.apply(results)
+        return results[:top_k]

echovector/search/filters.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""Filtering logic for search results."""
+from typing import Any
+from echovector.search.results import SearchResult
+class SearchFilter:
+    """Filter parameters for search queries.
+    Attributes:
+        filepaths: Optional list of allowed file paths.
+        min_score: Optional minimum score threshold.
+        metadata_filters: Optional exact match metadata filters.
+    """
+    def __init__(
+        self,
+        filepaths: list[str] | None = None,
+        min_score: float | None = None,
+        metadata_filters: dict[str, Any] | None = None,
+    ) -> None:
+        """Initialize the search filter.
+        Args:
+            filepaths: List of valid file paths.
+            min_score: Minimum required score.
+            metadata_filters: Key-value pairs that must match exactly.
+        """
+        self.filepaths = filepaths
+        self.min_score = min_score
+        self.metadata_filters = metadata_filters or {}
+    def apply(self, results: list[SearchResult]) -> list[SearchResult]:
+        """Apply filters to a list of results.
+        Args:
+            results: List of SearchResult objects.
+        Returns:
+            Filtered list of SearchResult objects.
+        """
+        filtered = results
+        if self.min_score is not None:
+            filtered = [r for r in filtered if r.score >= self.min_score]
+        if self.filepaths is not None:
+            valid_paths = set(self.filepaths)
+            filtered = [r for r in filtered if r.filepath in valid_paths]
+        if self.metadata_filters:
+            for key, val in self.metadata_filters.items():
+                filtered = [r for r in filtered if r.metadata and r.metadata.get(key) == val]
+        return filtered

echovector/search/results.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Models for search results."""
+from dataclasses import dataclass, field
+from typing import Any
+@dataclass(frozen=True)
+class TimestampRange:
+    """Represents a time range in an audio file.
+    Attributes:
+        start: Start time in seconds.
+        end: End time in seconds.
+    """
+    start: float
+    end: float
+    def __post_init__(self) -> None:
+        """Validate timestamp range."""
+        if self.start < 0:
+            raise ValueError("Start time cannot be negative.")
+        if self.end < self.start:
+            raise ValueError("End time cannot be less than start time.")
+@dataclass(frozen=True)
+class SearchResult:
+    """A hydrated search result from the engine.
+    Attributes:
+        filepath: Path to the audio file.
+        timestamp_range: The time range within the audio file.
+        score: The search score (e.g., cosine similarity).
+        metadata: Optional metadata dictionary.
+    """
+    filepath: str
+    timestamp_range: TimestampRange
+    score: float
+    metadata: dict[str, Any] = field(default_factory=dict)

echovector/utils/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Utility modules for EchoVector."""
+from echovector.utils.config import Config
+from echovector.utils.logging import logger, setup_logger
+# Public names re-exported from the config and logging submodules.
+__all__ = ["Config", "logger", "setup_logger"]

echovector/utils/config.py ADDED Viewed

@@ -0,0 +1,69 @@
+"""Configuration management for EchoVector."""
+import json
+from pathlib import Path
+from typing import Any
+class Config:
+    """Configuration class to manage settings for EchoVector."""
+    def __init__(self, config_dict: dict[str, Any] | None = None) -> None:
+        """Initialize the Config object.
+        Args:
+            config_dict: Dictionary containing configuration parameters.
+        """
+        self._config = config_dict or {}
+    @classmethod
+    def from_json(cls, file_path: str | Path) -> "Config":
+        """Load configuration from a JSON file.
+        Args:
+            file_path: Path to the JSON configuration file.
+        Returns:
+            Config instance populated with data from the JSON file.
+        """
+        with open(file_path, encoding="utf-8") as f:
+            data = json.load(f)
+        return cls(data)
+    def to_json(self, file_path: str | Path) -> None:
+        """Save current configuration to a JSON file.
+        Args:
+            file_path: Path where the JSON configuration will be saved.
+        """
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump(self._config, f, indent=4)
+    def get(self, key: str, default: Any = None) -> Any:
+        """Get a configuration value.
+        Args:
+            key: Configuration key.
+            default: Default value if key is not found.
+        Returns:
+            The value for the specified key or the default value.
+        """
+        return self._config.get(key, default)
+    def set(self, key: str, value: Any) -> None:
+        """Set a configuration value.
+        Args:
+            key: Configuration key.
+            value: Configuration value.
+        """
+        self._config[key] = value
+    def update(self, other_config: dict[str, Any]) -> None:
+        """Update configuration with another dictionary.
+        Args:
+            other_config: Dictionary to update current configuration with.
+        """
+        self._config.update(other_config)

echovector/utils/logging.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Logging configuration for EchoVector."""
+import logging
+import sys
+def setup_logger(name: str = "echovector", level: int = logging.INFO) -> logging.Logger:
+    """Set up and return a logger with the specified name and level.
+    Args:
+        name: Name of the logger.
+        level: Logging level.
+    Returns:
+        Configured logger instance.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+    if not logger.handlers:
+        formatter = logging.Formatter(
+            fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+        )
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+    return logger
+# Module-level logger created at import time with the default name and level.
+logger = setup_logger()