echo-vector 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ """CLAP (Contrastive Language-Audio Pretraining) embedding backend."""
2
+
3
+ from typing import cast
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import numpy.typing as npt
8
+
9
+ from echovector.embeddings.base import EmbeddingBackend
10
+
11
+ try:
12
+ import torch
13
+ from transformers import ClapModel, ClapProcessor
14
+
15
+ _CLAP_AVAILABLE = True
16
+ except ImportError: # pragma: no cover
17
+ _CLAP_AVAILABLE = False
18
+
19
+
20
class ClapBackend(EmbeddingBackend):
    """Embedding backend using the CLAP model from Hugging Face transformers.

    Audio clips and text queries are projected by the same model and
    L2-normalized, so both kinds of embedding can be compared directly.
    """

    def __init__(
        self,
        model_name: str = "laion/clap-htsat-unfused",
        device: str | None = None,
    ) -> None:
        """Initialize the CLAP backend.

        Args:
            model_name: The Hugging Face model identifier.
            device: Device to run the model on (e.g., 'cpu', 'cuda'). When
                None, 'cuda' is chosen if torch reports it as available,
                otherwise 'cpu'.

        Raises:
            ImportError: If torch or transformers are not installed.
        """
        if not _CLAP_AVAILABLE:
            raise ImportError(
                "CLAP backend requires torch and transformers. "
                "Install them with: pip install 'echo_vector[clap]'"
            )
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        self.processor = ClapProcessor.from_pretrained(model_name)
        self.model = ClapModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

        # Determine embedding dimension from model config
        self._embedding_dim = self.model.config.projection_dim

    @property
    def embedding_dim(self) -> int:
        """Return the dimensionality of the generated embeddings."""
        return cast("int", self._embedding_dim)

    def _load_and_resample(self, path: str, target_sr: int) -> np.ndarray:
        """Load audio as mono and resample to the target sample rate."""
        audio_array, _ = librosa.load(path, sr=target_sr, mono=True)
        return audio_array

    def _empty_batch(self) -> npt.NDArray[np.float32]:
        """Return a (0, embedding_dim) float32 array for an empty input batch."""
        return np.zeros((0, self.embedding_dim), dtype=np.float32)

    def _to_unit_numpy(self, features: "torch.Tensor") -> npt.NDArray[np.float32]:
        """L2-normalize a batch of features and convert it to float32 numpy."""
        # Clamp the norm so an all-zero feature vector yields zeros, not NaN.
        norms = features.norm(dim=-1, keepdim=True).clamp_min(1e-12)
        unit = features / norms
        return cast(
            "npt.NDArray[np.float32]",
            unit.cpu().numpy().astype(np.float32),
        )

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of audio files into the CLAP semantic space.

        Args:
            audio_paths: List of file paths to audio files.

        Returns:
            A numpy array of shape (batch_size, embedding_dim). An empty
            input list yields an array of shape (0, embedding_dim).
        """
        if not audio_paths:
            return self._empty_batch()

        # Use the sampling rate declared by the processor's feature extractor,
        # falling back to 48 kHz (default for CLAP) when none is exposed.
        feature_extractor = getattr(self.processor, "feature_extractor", None)
        target_sr = int(getattr(feature_extractor, "sampling_rate", 48000))

        audios = [self._load_and_resample(path, target_sr) for path in audio_paths]

        inputs = self.processor(
            audios=audios,
            return_tensors="pt",
            sampling_rate=target_sr,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            audio_features = self.model.get_audio_features(**inputs)
            # Normalize to ensure audio and text embeddings share the unit hypersphere
            return self._to_unit_numpy(audio_features)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of text queries into the CLAP semantic space.

        Args:
            texts: List of text strings.

        Returns:
            A numpy array of shape (batch_size, embedding_dim). An empty
            input list yields an array of shape (0, embedding_dim).
        """
        if not texts:
            return self._empty_batch()

        inputs = self.processor(
            text=texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)
            # Normalize to ensure audio and text embeddings share the unit hypersphere
            return self._to_unit_numpy(text_features)
@@ -0,0 +1,78 @@
1
+ """Factory for creating embedding backends."""
2
+
3
+ from typing import Any, ClassVar
4
+
5
+ from echovector.embeddings.base import EmbeddingBackend
6
+
7
# Map backend names to their (module, class) for lazy import.
# EmbeddingFactory.create imports a backend's module only the first time that
# backend name is requested. Keys are lowercase because create() lowercases
# the requested name before looking it up here.
_BACKEND_LOCATIONS: dict[str, tuple[str, str]] = {
    "clap": ("echovector.embeddings.clap", "ClapBackend"),
    "whisper": ("echovector.embeddings.whisper_enc", "WhisperBackend"),
    "wav2vec2": ("echovector.embeddings.wav2vec2", "Wav2Vec2Backend"),
    "hubert": ("echovector.embeddings.hubert", "HubertBackend"),
    "ast": ("echovector.embeddings.ast_model", "ASTBackend"),
    "local": ("echovector.embeddings.local", "LocalFeatureBackend"),
    # "smoke" is an alias: it resolves to the same class as "local".
    "smoke": ("echovector.embeddings.local", "LocalFeatureBackend"),
}
17
+
18
+
19
class EmbeddingFactory:
    """Factory to instantiate embedding backends by name.

    Backend names are case-insensitive: they are lowercased both when a
    backend is registered and when one is requested. Explicitly registered
    classes take precedence; otherwise the class is imported on first use
    from the location listed in ``_BACKEND_LOCATIONS`` and cached.
    """

    _registry: ClassVar[dict[str, type[EmbeddingBackend]]] = {}

    @classmethod
    def register_backend(cls, name: str, backend_cls: type[EmbeddingBackend]) -> None:
        """Register a new backend class.

        Args:
            name: The name to register the backend under (case-insensitive).
            backend_cls: The backend class.
        """
        # Store the lowercased name: create() looks names up lowercased, so a
        # mixed-case key would otherwise never be found.
        cls._registry[name.lower()] = backend_cls

    @classmethod
    def create(cls, name: str, **kwargs: Any) -> EmbeddingBackend:
        """Create an embedding backend instance.

        Args:
            name: The name of the backend (e.g., 'clap', 'whisper').
            **kwargs: Additional keyword arguments to pass to the backend constructor.

        Returns:
            An instance of the requested EmbeddingBackend.

        Raises:
            ValueError: If the backend name is not registered.
        """
        import importlib

        name_lower = name.lower()

        backend_cls = cls._registry.get(name_lower)
        if backend_cls is None:
            if name_lower not in _BACKEND_LOCATIONS:
                # List built-in names first, then any custom registrations,
                # without duplicates.
                valid_names = ", ".join(dict.fromkeys([*_BACKEND_LOCATIONS, *cls._registry]))
                raise ValueError(
                    f"Unknown embedding backend: '{name}'. Valid options are: {valid_names}"
                )
            module_path, class_name = _BACKEND_LOCATIONS[name_lower]
            module = importlib.import_module(module_path)
            backend_cls = getattr(module, class_name)
            # Cache the resolved class so later calls skip the import.
            cls._registry[name_lower] = backend_cls

        return backend_cls(**kwargs)
63
+
64
+
65
def get_embedding_model(
    model_type: str = "clap",
    **kwargs: Any,
) -> EmbeddingBackend:
    """Build an embedding backend by delegating to ``EmbeddingFactory.create``.

    Args:
        model_type: Name of the backend to instantiate; defaults to "clap".
        **kwargs: Keyword arguments forwarded unchanged to the backend
            constructor.

    Returns:
        The EmbeddingBackend instance produced by the factory.
    """
    backend = EmbeddingFactory.create(model_type, **kwargs)
    return backend
@@ -0,0 +1,41 @@
1
+ """HuBERT embedding backend stub."""
2
+
3
+ import numpy as np
4
+ import numpy.typing as npt
5
+
6
+ from echovector.embeddings.base import EmbeddingBackend
7
+
8
+
9
class HubertBackend(EmbeddingBackend):
    """Placeholder HuBERT backend; both embedding methods raise for now."""

    def __init__(self, model_name: str = "facebook/hubert-base-ls960") -> None:
        """Remember the model identifier without loading any model.

        Args:
            model_name: Identifier of the HuBERT model this stub stands for.
        """
        self.model_name = model_name

    @property
    def embedding_dim(self) -> int:
        """Return the fixed placeholder embedding size (768)."""
        return 768

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of audio files (not available in this stub).

        Args:
            audio_paths: List of file paths to audio files.

        Raises:
            NotImplementedError: Always, because this backend is a stub.
        """
        message = "HuBERT audio embedding not implemented."
        raise NotImplementedError(message)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of text queries (unsupported by this backend).

        Args:
            texts: List of text strings.

        Raises:
            NotImplementedError: Always; this backend has no text embeddings.
        """
        message = "HuBERT backend does not support text embeddings."
        raise NotImplementedError(message)
@@ -0,0 +1,109 @@
1
+ """Deterministic local embedding backend for offline smoke tests."""
2
+
3
+ from typing import cast
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import numpy.typing as npt
8
+ import soundfile as sf
9
+
10
+ from echovector.embeddings.base import EmbeddingBackend
11
+
12
+
13
class LocalFeatureBackend(EmbeddingBackend):
    """Small dependency-only backend that avoids model downloads.

    This backend is intended for CI, Kaggle smoke tests, and demos. It is not a
    replacement for CLAP when semantic text/audio alignment matters.

    Each embedding is an 8-element, L2-normalized vector. Audio features are,
    in order: duration, RMS level, zero-crossing rate, spectral centroid,
    spectral bandwidth, spectral flatness, a silence flag, and an inverted
    centroid ("lowness"). Text queries are mapped onto the same slots by
    keyword matching.
    """

    # (keywords, feature index): a query containing any keyword as a substring
    # sets that feature slot to 1.0.
    _KEYWORD_FEATURES = (
        (("long", "duration", "extended"), 0),
        (("loud", "strong", "alarm", "alert"), 1),
        (("noisy", "buzz", "speech", "busy"), 2),
        (("high", "treble", "bright", "alarm", "alert"), 3),
        (("wide", "broadband", "noise"), 4),
        (("flat", "noise", "static"), 5),
        (("silent", "silence", "quiet"), 6),
    )
    _LOW_KEYWORDS = ("low", "bass", "deep")

    def __init__(self, sample_rate: int = 16_000) -> None:
        """Initialize the backend.

        Args:
            sample_rate: Sample rate audio is resampled to before feature
                extraction.
        """
        self.sample_rate = sample_rate

    @property
    def embedding_dim(self) -> int:
        """Return the feature vector size."""
        return 8

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Embed audio files using simple acoustic descriptors.

        Args:
            audio_paths: List of file paths to audio files.

        Returns:
            A float32 array of shape (len(audio_paths), embedding_dim); an
            empty list yields shape (0, embedding_dim).
        """
        if not audio_paths:
            # np.vstack raises on an empty sequence, so handle it explicitly.
            return np.zeros((0, self.embedding_dim), dtype=np.float32)
        return np.vstack([self._embed_audio_file(path) for path in audio_paths]).astype(np.float32)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed text queries with a keyword-weighted acoustic proxy.

        Args:
            texts: List of text strings.

        Returns:
            A float32 array of shape (len(texts), embedding_dim); an empty
            list yields shape (0, embedding_dim).
        """
        if not texts:
            return np.zeros((0, self.embedding_dim), dtype=np.float32)
        return np.vstack([self._embed_text(text) for text in texts]).astype(np.float32)

    def _embed_audio_file(self, path: str) -> npt.NDArray[np.float32]:
        """Compute the normalized acoustic feature vector for one audio file."""
        audio, sample_rate = sf.read(path, dtype="float32", always_2d=False)
        if audio.ndim > 1:
            # Down-mix multi-channel audio by averaging across axis 1.
            audio = np.mean(audio, axis=1)
        # Only resample non-empty audio; an empty clip goes straight to the
        # zero-vector fallback below.
        if len(audio) > 0 and sample_rate != self.sample_rate:
            audio = librosa.resample(
                y=audio,
                orig_sr=int(sample_rate),
                target_sr=self.sample_rate,
                axis=0,
            )

        if len(audio) == 0:
            return self._normalize(np.zeros(self.embedding_dim, dtype=np.float32))

        nyquist = self.sample_rate / 2
        duration = len(audio) / self.sample_rate
        rms = float(np.sqrt(np.mean(np.square(audio))))
        zcr = float(np.mean(librosa.feature.zero_crossing_rate(y=audio)[0]))
        centroid = float(
            np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate)[0]) / nyquist
        )
        bandwidth = float(
            np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=self.sample_rate)[0]) / nyquist
        )
        flatness = float(np.mean(librosa.feature.spectral_flatness(y=audio)[0]))

        # Each descriptor is scaled and capped at 1.0 before normalization.
        features = np.array(
            [
                min(duration / 30.0, 1.0),
                min(rms * 4.0, 1.0),
                min(zcr * 8.0, 1.0),
                min(centroid, 1.0),
                min(bandwidth, 1.0),
                min(flatness * 10.0, 1.0),
                1.0 if rms < 0.01 else 0.0,
                max(1.0 - min(centroid, 1.0), 0.0),
            ],
            dtype=np.float32,
        )
        return self._normalize(features)

    def _embed_text(self, text: str) -> npt.NDArray[np.float32]:
        """Map a text query onto the feature slots via keyword matching."""
        lowered = text.lower()
        # Every slot starts at a small baseline of 0.05.
        features = np.full(self.embedding_dim, 0.05, dtype=np.float32)

        for keywords, index in self._KEYWORD_FEATURES:
            if any(term in lowered for term in keywords):
                features[index] = 1.0

        # "Low" terms are applied last so they override a "high" match in
        # slot 3.
        if any(term in lowered for term in self._LOW_KEYWORDS):
            features[3] = 0.0
            features[4] = max(features[4], 0.2)
            features[7] = 1.0

        return self._normalize(features)

    def _normalize(self, vector: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
        """Scale a vector to unit L2 norm; a zero vector is returned unchanged."""
        norm = float(np.linalg.norm(vector))
        if norm == 0.0:
            return vector
        return cast("npt.NDArray[np.float32]", (vector / norm).astype(np.float32))
@@ -0,0 +1,41 @@
1
+ """Wav2Vec2 embedding backend stub."""
2
+
3
+ import numpy as np
4
+ import numpy.typing as npt
5
+
6
+ from echovector.embeddings.base import EmbeddingBackend
7
+
8
+
9
class Wav2Vec2Backend(EmbeddingBackend):
    """Placeholder Wav2Vec2 backend; both embedding methods raise for now."""

    def __init__(self, model_name: str = "facebook/wav2vec2-base") -> None:
        """Remember the model identifier without loading any model.

        Args:
            model_name: Identifier of the Wav2Vec2 model this stub stands for.
        """
        self.model_name = model_name

    @property
    def embedding_dim(self) -> int:
        """Return the fixed placeholder embedding size (768)."""
        return 768

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of audio files (not available in this stub).

        Args:
            audio_paths: List of file paths to audio files.

        Raises:
            NotImplementedError: Always, because this backend is a stub.
        """
        message = "Wav2Vec2 audio embedding not implemented."
        raise NotImplementedError(message)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of text queries (unsupported by this backend).

        Args:
            texts: List of text strings.

        Raises:
            NotImplementedError: Always; this backend has no text embeddings.
        """
        message = "Wav2Vec2 backend does not support text embeddings."
        raise NotImplementedError(message)
@@ -0,0 +1,44 @@
1
+ """Whisper encoder embedding backend stub."""
2
+
3
+ import numpy as np
4
+ import numpy.typing as npt
5
+
6
+ from echovector.embeddings.base import EmbeddingBackend
7
+
8
+
9
class WhisperBackend(EmbeddingBackend):
    """Placeholder Whisper backend.

    Intended to use the encoder of Whisper models to extract audio features;
    at present both embedding methods raise NotImplementedError.
    """

    def __init__(self, model_name: str = "openai/whisper-base") -> None:
        """Remember the model identifier without loading any model.

        Args:
            model_name: Identifier of the Whisper model this stub stands for.
        """
        self.model_name = model_name

    @property
    def embedding_dim(self) -> int:
        """Return the fixed placeholder embedding size (512)."""
        return 512

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of audio files (not available in this stub).

        Args:
            audio_paths: List of file paths to audio files.

        Raises:
            NotImplementedError: Always, because this backend is a stub.
        """
        message = "Whisper audio embedding not implemented."
        raise NotImplementedError(message)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of text queries (unsupported by this backend).

        Args:
            texts: List of text strings.

        Raises:
            NotImplementedError: Always; this backend has no text embeddings.
        """
        message = "Whisper backend does not support text embeddings."
        raise NotImplementedError(message)
@@ -0,0 +1 @@
1
+ """Evaluation module initialization."""
@@ -0,0 +1,45 @@
1
+ """Evaluation metrics for comparing audio vectors and embeddings."""
2
+
3
+ import numpy as np
4
+ import numpy.typing as npt
5
+
6
+
7
def cosine_similarity(vec1: npt.NDArray[np.float32], vec2: npt.NDArray[np.float32]) -> float:
    """Calculate the cosine similarity between two vectors.

    Multi-dimensional inputs are flattened before comparison.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        Cosine similarity score between -1.0 and 1.0, or 0.0 if either
        vector has zero norm.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    if vec1.shape != vec2.shape:
        raise ValueError("Vectors must have the same shape.")

    # Accumulate in float64 so float32 rounding does not distort the ratio.
    flat1 = np.asarray(vec1, dtype=np.float64).ravel()
    flat2 = np.asarray(vec2, dtype=np.float64).ravel()

    norm1 = float(np.linalg.norm(flat1))
    norm2 = float(np.linalg.norm(flat2))

    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0

    similarity = float(np.dot(flat1, flat2)) / (norm1 * norm2)
    # Rounding can still land a hair outside [-1, 1]; clip to honor the
    # documented range (np.clip leaves NaN untouched).
    return float(np.clip(similarity, -1.0, 1.0))
29
+
30
+
31
def euclidean_distance(vec1: npt.NDArray[np.float32], vec2: npt.NDArray[np.float32]) -> float:
    """Return the Euclidean (L2) distance between two equally shaped arrays.

    Both inputs are flattened, so multi-dimensional arrays are compared
    element by element.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        Euclidean distance.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    same_shape = vec1.shape == vec2.shape
    if not same_shape:
        raise ValueError("Vectors must have the same shape.")

    delta = vec1.flatten() - vec2.flatten()
    return float(np.linalg.norm(delta))
@@ -0,0 +1,12 @@
1
+ """EchoVector indexing module."""
2
+
3
+ from .base import BaseIndex, BaseStore
4
+ from .faiss_index import FaissIndex
5
+ from .store import SQLiteStore
6
+
7
+ __all__ = [
8
+ "BaseIndex",
9
+ "BaseStore",
10
+ "FaissIndex",
11
+ "SQLiteStore",
12
+ ]
@@ -0,0 +1,105 @@
1
+ """Base interfaces for the indexing module."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+ import numpy.typing as npt
8
+
9
+
10
+ class BaseStore(ABC):
11
+ """Abstract base class for metadata storage."""
12
+
13
+ @abstractmethod
14
+ def initialize(self) -> None:
15
+ """Initialize the store (e.g., create tables)."""
16
+
17
+ @abstractmethod
18
+ def add(
19
+ self, int_ids: list[int], string_ids: list[str], metadata_list: list[dict[str, Any]]
20
+ ) -> None:
21
+ """Add metadata and ID mappings to the store.
22
+
23
+ Args:
24
+ int_ids: List of integer IDs assigned by the index.
25
+ string_ids: List of original string IDs.
26
+ metadata_list: List of metadata dictionaries.
27
+ """
28
+
29
+ @abstractmethod
30
+ def get_by_int_ids(
31
+ self, int_ids: list[int]
32
+ ) -> tuple[list[str | None], list[dict[str, Any] | None]]:
33
+ """Retrieve string IDs and metadata for a list of integer IDs.
34
+
35
+ Args:
36
+ int_ids: List of integer IDs to query.
37
+
38
+ Returns:
39
+ A tuple containing a list of string IDs and a list of metadata dictionaries.
40
+ """
41
+
42
+ @abstractmethod
43
+ def get_max_int_id(self) -> int:
44
+ """Get the maximum integer ID currently in the store.
45
+
46
+ Returns:
47
+ The maximum integer ID, or -1 if empty.
48
+ """
49
+
50
+ @abstractmethod
51
+ def close(self) -> None:
52
+ """Close any open connections."""
53
+
54
+
55
+ class BaseIndex(ABC):
56
+ """Abstract base class for vector indices."""
57
+
58
+ @abstractmethod
59
+ def add(
60
+ self,
61
+ embeddings: npt.NDArray[np.float32],
62
+ ids: list[str],
63
+ metadata: list[dict[str, Any]] | None = None,
64
+ ) -> None:
65
+ """Add embeddings, IDs, and metadata to the index.
66
+
67
+ Args:
68
+ embeddings: A 2D numpy array of embeddings.
69
+ ids: A list of string IDs corresponding to the embeddings.
70
+ metadata: An optional list of metadata dictionaries.
71
+ """
72
+
73
+ @abstractmethod
74
+ def search(
75
+ self, query_embeddings: npt.NDArray[np.float32], k: int = 10
76
+ ) -> tuple[
77
+ npt.NDArray[np.float32],
78
+ list[list[str | None]],
79
+ list[list[dict[str, Any] | None]],
80
+ ]:
81
+ """Search for the k nearest neighbors for each query.
82
+
83
+ Args:
84
+ query_embeddings: A 2D numpy array of query embeddings.
85
+ k: The number of nearest neighbors to retrieve.
86
+
87
+ Returns:
88
+ A tuple of (distances, string_ids, metadata).
89
+ """
90
+
91
+ @abstractmethod
92
+ def save(self, index_path: str) -> None:
93
+ """Save the index to disk.
94
+
95
+ Args:
96
+ index_path: The file path to save the index to.
97
+ """
98
+
99
+ @abstractmethod
100
+ def load(self, index_path: str) -> None:
101
+ """Load the index from disk.
102
+
103
+ Args:
104
+ index_path: The file path to load the index from.
105
+ """