echo-vector 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- echo_vector-0.1.1.dist-info/METADATA +288 -0
- echo_vector-0.1.1.dist-info/RECORD +38 -0
- echo_vector-0.1.1.dist-info/WHEEL +4 -0
- echo_vector-0.1.1.dist-info/entry_points.txt +2 -0
- echovector/__init__.py +7 -0
- echovector/api/__init__.py +1 -0
- echovector/api/server.py +144 -0
- echovector/audio/__init__.py +12 -0
- echovector/audio/chunker.py +71 -0
- echovector/audio/metadata.py +58 -0
- echovector/audio/processor.py +53 -0
- echovector/audio/streaming.py +33 -0
- echovector/cli/__init__.py +1 -0
- echovector/cli/main.py +165 -0
- echovector/core.py +289 -0
- echovector/embeddings/__init__.py +15 -0
- echovector/embeddings/ast_model.py +41 -0
- echovector/embeddings/base.py +43 -0
- echovector/embeddings/cache.py +96 -0
- echovector/embeddings/clap.py +126 -0
- echovector/embeddings/factory.py +78 -0
- echovector/embeddings/hubert.py +41 -0
- echovector/embeddings/local.py +109 -0
- echovector/embeddings/wav2vec2.py +41 -0
- echovector/embeddings/whisper_enc.py +44 -0
- echovector/evaluation/__init__.py +1 -0
- echovector/evaluation/metrics.py +45 -0
- echovector/indexing/__init__.py +12 -0
- echovector/indexing/base.py +105 -0
- echovector/indexing/faiss_index.py +182 -0
- echovector/indexing/store.py +165 -0
- echovector/search/__init__.py +14 -0
- echovector/search/engine.py +82 -0
- echovector/search/filters.py +55 -0
- echovector/search/results.py +41 -0
- echovector/utils/__init__.py +6 -0
- echovector/utils/config.py +69 -0
- echovector/utils/logging.py +31 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""CLAP (Contrastive Language-Audio Pretraining) embedding backend."""
|
|
2
|
+
|
|
3
|
+
from typing import cast
|
|
4
|
+
|
|
5
|
+
import librosa
|
|
6
|
+
import numpy as np
|
|
7
|
+
import numpy.typing as npt
|
|
8
|
+
|
|
9
|
+
from echovector.embeddings.base import EmbeddingBackend
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import torch
|
|
13
|
+
from transformers import ClapModel, ClapProcessor
|
|
14
|
+
|
|
15
|
+
_CLAP_AVAILABLE = True
|
|
16
|
+
except ImportError: # pragma: no cover
|
|
17
|
+
_CLAP_AVAILABLE = False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ClapBackend(EmbeddingBackend):
    """Embedding backend using the CLAP model from Hugging Face transformers.

    Supports both audio and text embeddings in the same semantic space. Every
    returned row is L2-normalized, so audio and text vectors can be compared
    directly.
    """

    # Sample rate used when the processor's feature extractor does not report one.
    _DEFAULT_SAMPLE_RATE = 48000
    # Floor applied to vector norms before dividing, so an all-zero feature row
    # stays zero instead of turning into NaN.
    _NORM_FLOOR = 1e-12

    def __init__(
        self,
        model_name: str = "laion/clap-htsat-unfused",
        device: str | None = None,
    ) -> None:
        """Initialize the CLAP backend.

        Args:
            model_name: The Hugging Face model identifier.
            device: Device to run the model on (e.g., 'cpu', 'cuda'). When
                omitted, 'cuda' is used if torch reports it as available,
                otherwise 'cpu'.

        Raises:
            ImportError: If torch or transformers are not installed.
        """
        if not _CLAP_AVAILABLE:
            raise ImportError(
                "CLAP backend requires torch and transformers. "
                "Install them with: pip install 'echo_vector[clap]'"
            )
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        self.processor = ClapProcessor.from_pretrained(model_name)
        self.model = ClapModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

        # Determine embedding dimension from model config
        self._embedding_dim = self.model.config.projection_dim

    @property
    def embedding_dim(self) -> int:
        """Return the dimensionality of the generated embeddings."""
        return cast("int", self._embedding_dim)

    def _load_and_resample(self, path: str, target_sr: int) -> np.ndarray:
        """Load audio as mono and resample to the target sample rate."""
        audio_array, _ = librosa.load(path, sr=target_sr, mono=True)
        return audio_array

    def _target_sample_rate(self) -> int:
        """Return the processor's sampling rate, or the default if it has none."""
        feature_extractor = getattr(self.processor, "feature_extractor", None)
        sampling_rate = getattr(feature_extractor, "sampling_rate", None)
        if sampling_rate is None:
            return self._DEFAULT_SAMPLE_RATE
        return int(sampling_rate)

    def _empty_batch(self) -> npt.NDArray[np.float32]:
        """Return a (0, embedding_dim) float32 array for an empty input batch."""
        return np.empty((0, self.embedding_dim), dtype=np.float32)

    def _to_unit_numpy(self, features: "torch.Tensor") -> npt.NDArray[np.float32]:
        """L2-normalize each row of ``features`` and return it as float32 numpy."""
        features = features.detach()
        # Clamp the norm so a zero row divides by the floor rather than by 0.
        norms = features.norm(dim=-1, keepdim=True).clamp_min(self._NORM_FLOOR)
        unit = features / norms
        return cast(
            "npt.NDArray[np.float32]",
            unit.cpu().numpy().astype(np.float32),
        )

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of audio files into the CLAP semantic space.

        Args:
            audio_paths: List of file paths to audio files.

        Returns:
            A numpy array of shape (batch_size, embedding_dim). An empty
            input list yields an array of shape (0, embedding_dim).
        """
        if not audio_paths:
            # Nothing to embed; skip the processor and model entirely.
            return self._empty_batch()

        target_sr = self._target_sample_rate()
        audios = [self._load_and_resample(path, target_sr) for path in audio_paths]

        inputs = self.processor(
            audios=audios,
            return_tensors="pt",
            sampling_rate=target_sr,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            audio_features = self.model.get_audio_features(**inputs)

        # Normalize to ensure audio and text embeddings share the unit hypersphere
        return self._to_unit_numpy(audio_features)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of text queries into the CLAP semantic space.

        Args:
            texts: List of text strings.

        Returns:
            A numpy array of shape (batch_size, embedding_dim). An empty
            input list yields an array of shape (0, embedding_dim).
        """
        if not texts:
            # Nothing to embed; skip the processor and model entirely.
            return self._empty_batch()

        inputs = self.processor(
            text=texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)

        # Normalize to ensure audio and text embeddings share the unit hypersphere
        return self._to_unit_numpy(text_features)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Factory for creating embedding backends."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, ClassVar
|
|
4
|
+
|
|
5
|
+
from echovector.embeddings.base import EmbeddingBackend
|
|
6
|
+
|
|
7
|
+
# Map backend names to their (module, class) for lazy import.
# "smoke" is an alias of "local": both resolve to LocalFeatureBackend.
_BACKEND_LOCATIONS: dict[str, tuple[str, str]] = {
    "clap": ("echovector.embeddings.clap", "ClapBackend"),
    "whisper": ("echovector.embeddings.whisper_enc", "WhisperBackend"),
    "wav2vec2": ("echovector.embeddings.wav2vec2", "Wav2Vec2Backend"),
    "hubert": ("echovector.embeddings.hubert", "HubertBackend"),
    "ast": ("echovector.embeddings.ast_model", "ASTBackend"),
    "local": ("echovector.embeddings.local", "LocalFeatureBackend"),
    "smoke": ("echovector.embeddings.local", "LocalFeatureBackend"),
}


class EmbeddingFactory:
    """Factory to instantiate embedding backends by name.

    Backend names are case-insensitive: they are lowercased both when a
    backend is registered and when one is created.
    """

    # Lowercased backend name -> backend class. Filled by register_backend()
    # and, lazily, by create() the first time a built-in backend is requested.
    _registry: ClassVar[dict[str, type[EmbeddingBackend]]] = {}

    @classmethod
    def register_backend(cls, name: str, backend_cls: type[EmbeddingBackend]) -> None:
        """Register a new backend class.

        Args:
            name: The name to register the backend under (case-insensitive).
            backend_cls: The backend class.
        """
        # create() looks names up lowercased, so store them the same way;
        # otherwise a name containing uppercase letters could never be found.
        cls._registry[name.lower()] = backend_cls

    @classmethod
    def create(cls, name: str, **kwargs: Any) -> EmbeddingBackend:
        """Create an embedding backend instance.

        Args:
            name: The name of the backend (e.g., 'clap', 'whisper').
            **kwargs: Additional keyword arguments to pass to the backend constructor.

        Returns:
            An instance of the requested EmbeddingBackend.

        Raises:
            ValueError: If the backend name is not registered.
        """
        import importlib

        name_lower = name.lower()
        backend_cls = cls._registry.get(name_lower)

        if backend_cls is None:
            location = _BACKEND_LOCATIONS.get(name_lower)
            if location is None:
                # List built-in names first, then any custom registrations,
                # without repeating built-ins that were already lazily loaded.
                known = dict.fromkeys([*_BACKEND_LOCATIONS, *cls._registry])
                valid_names = ", ".join(known)
                raise ValueError(
                    f"Unknown embedding backend: '{name}'. Valid options are: {valid_names}"
                )
            # Import the backend module only now, so optional heavy
            # dependencies are loaded only for the backend actually used.
            module_path, class_name = location
            module = importlib.import_module(module_path)
            backend_cls = getattr(module, class_name)
            cls._registry[name_lower] = backend_cls

        return backend_cls(**kwargs)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_embedding_model(model_type: str = "clap", **kwargs: Any) -> EmbeddingBackend:
    """Build an embedding backend through ``EmbeddingFactory``.

    Args:
        model_type: Name of the backend to build; defaults to ``"clap"``.
        **kwargs: Keyword arguments forwarded to the backend constructor.

    Returns:
        The backend instance produced by ``EmbeddingFactory.create``.
    """
    backend = EmbeddingFactory.create(model_type, **kwargs)
    return backend
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""HuBERT embedding backend stub."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import numpy.typing as npt
|
|
5
|
+
|
|
6
|
+
from echovector.embeddings.base import EmbeddingBackend
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class HubertBackend(EmbeddingBackend):
    """Placeholder HuBERT backend: it records a model name but embeds nothing."""

    def __init__(self, model_name: str = "facebook/hubert-base-ls960") -> None:
        """Remember the model identifier; no model is loaded."""
        self.model_name = model_name

    @property
    def embedding_dim(self) -> int:
        """Report the fixed placeholder embedding size."""
        stub_dim = 768
        return stub_dim

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Audio embedding entry point (not implemented in this stub).

        Args:
            audio_paths: Paths of the audio files that would be embedded.

        Raises:
            NotImplementedError: Always.
        """
        message = "HuBERT audio embedding not implemented."
        raise NotImplementedError(message)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Text embedding entry point (unsupported by this backend).

        Args:
            texts: Text strings that would be embedded.

        Raises:
            NotImplementedError: Always.
        """
        message = "HuBERT backend does not support text embeddings."
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Deterministic local embedding backend for offline smoke tests."""
|
|
2
|
+
|
|
3
|
+
from typing import cast
|
|
4
|
+
|
|
5
|
+
import librosa
|
|
6
|
+
import numpy as np
|
|
7
|
+
import numpy.typing as npt
|
|
8
|
+
import soundfile as sf
|
|
9
|
+
|
|
10
|
+
from echovector.embeddings.base import EmbeddingBackend
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class LocalFeatureBackend(EmbeddingBackend):
    """Small dependency-only backend that avoids model downloads.

    This backend is intended for CI, Kaggle smoke tests, and demos. It is not a
    replacement for CLAP when semantic text/audio alignment matters.

    Audio embeddings are 8 clipped acoustic descriptors, in this order:
    duration (as a fraction of 30 s), RMS level, zero-crossing rate, spectral
    centroid and spectral bandwidth (both relative to the Nyquist frequency),
    spectral flatness, a "near-silent" flag (RMS below 0.01), and
    ``1 - centroid``. Text embeddings fill the same slots from keyword matches.
    All vectors are L2-normalized.
    """

    # (feature slot, trigger substrings), applied in order by _embed_text.
    # Matching is by substring of the lowercased query.
    _KEYWORD_SLOTS: tuple[tuple[int, tuple[str, ...]], ...] = (
        (0, ("long", "duration", "extended")),
        (1, ("loud", "strong", "alarm", "alert")),
        (2, ("noisy", "buzz", "speech", "busy")),
        (3, ("high", "treble", "bright", "alarm", "alert")),
        (4, ("wide", "broadband", "noise")),
        (5, ("flat", "noise", "static")),
        (6, ("silent", "silence", "quiet")),
    )
    # Handled after the table above because these terms also override slot 3.
    _LOW_PITCH_TERMS: tuple[str, ...] = ("low", "bass", "deep")

    def __init__(self, sample_rate: int = 16_000) -> None:
        """Initialize the backend.

        Args:
            sample_rate: Rate (Hz) that audio is resampled to before the
                descriptors are computed.
        """
        self.sample_rate = sample_rate

    @property
    def embedding_dim(self) -> int:
        """Return the feature vector size."""
        return 8

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Embed audio files using simple acoustic descriptors.

        Args:
            audio_paths: List of file paths to audio files.

        Returns:
            A float32 array of shape (len(audio_paths), embedding_dim); an
            empty input list yields shape (0, embedding_dim).
        """
        if not audio_paths:
            # np.vstack rejects an empty sequence, so build the result directly.
            return np.empty((0, self.embedding_dim), dtype=np.float32)
        return np.vstack([self._embed_audio_file(path) for path in audio_paths]).astype(np.float32)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed text queries with a keyword-weighted acoustic proxy.

        Args:
            texts: List of text strings.

        Returns:
            A float32 array of shape (len(texts), embedding_dim); an empty
            input list yields shape (0, embedding_dim).
        """
        if not texts:
            # np.vstack rejects an empty sequence, so build the result directly.
            return np.empty((0, self.embedding_dim), dtype=np.float32)
        return np.vstack([self._embed_text(text) for text in texts]).astype(np.float32)

    def _embed_audio_file(self, path: str) -> npt.NDArray[np.float32]:
        """Read one audio file and return its normalized descriptor vector."""
        audio, sample_rate = sf.read(path, dtype="float32", always_2d=False)
        if audio.ndim > 1:
            # Multi-channel data: average the channels down to mono.
            audio = np.mean(audio, axis=1)

        # Check for an empty signal before resampling so the resampler and
        # the feature extractors are never handed a zero-length array.
        if len(audio) == 0:
            return self._normalize(np.zeros(self.embedding_dim, dtype=np.float32))

        if sample_rate != self.sample_rate:
            audio = librosa.resample(
                y=audio,
                orig_sr=int(sample_rate),
                target_sr=self.sample_rate,
                axis=0,
            )

        if len(audio) == 0:
            return self._normalize(np.zeros(self.embedding_dim, dtype=np.float32))

        nyquist = self.sample_rate / 2
        duration = len(audio) / self.sample_rate
        rms = float(np.sqrt(np.mean(np.square(audio))))
        zcr = float(np.mean(librosa.feature.zero_crossing_rate(y=audio)[0]))
        centroid = float(
            np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate)[0])
            / nyquist
        )
        bandwidth = float(
            np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=self.sample_rate)[0])
            / nyquist
        )
        flatness = float(np.mean(librosa.feature.spectral_flatness(y=audio)[0]))

        # Every descriptor is scaled and clipped so it lands in [0, 1].
        features = np.array(
            [
                min(duration / 30.0, 1.0),
                min(rms * 4.0, 1.0),
                min(zcr * 8.0, 1.0),
                min(centroid, 1.0),
                min(bandwidth, 1.0),
                min(flatness * 10.0, 1.0),
                1.0 if rms < 0.01 else 0.0,
                max(1.0 - min(centroid, 1.0), 0.0),
            ],
            dtype=np.float32,
        )
        return self._normalize(features)

    def _embed_text(self, text: str) -> npt.NDArray[np.float32]:
        """Map keywords in ``text`` onto the audio feature slots."""
        lowered = text.lower()
        # Small baseline so a query with no keyword still has a non-zero vector.
        features = np.full(self.embedding_dim, 0.05, dtype=np.float32)

        for slot, terms in self._KEYWORD_SLOTS:
            if any(term in lowered for term in terms):
                features[slot] = 1.0

        if any(term in lowered for term in self._LOW_PITCH_TERMS):
            # Low-pitch wording cancels any "high" match in the centroid slot.
            features[3] = 0.0
            features[4] = max(features[4], 0.2)
            features[7] = 1.0

        return self._normalize(features)

    def _normalize(self, vector: npt.NDArray[np.float32]) -> npt.NDArray[np.float32]:
        """Scale ``vector`` to unit L2 norm; a zero vector is returned unchanged."""
        norm = float(np.linalg.norm(vector))
        if norm == 0.0:
            return vector
        return cast("npt.NDArray[np.float32]", (vector / norm).astype(np.float32))
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Wav2Vec2 embedding backend stub."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import numpy.typing as npt
|
|
5
|
+
|
|
6
|
+
from echovector.embeddings.base import EmbeddingBackend
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Wav2Vec2Backend(EmbeddingBackend):
    """Placeholder Wav2Vec2 backend: it records a model name but embeds nothing."""

    def __init__(self, model_name: str = "facebook/wav2vec2-base") -> None:
        """Remember the model identifier; no model is loaded."""
        self.model_name = model_name

    @property
    def embedding_dim(self) -> int:
        """Report the fixed placeholder embedding size."""
        stub_dim = 768
        return stub_dim

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Audio embedding entry point (not implemented in this stub).

        Args:
            audio_paths: Paths of the audio files that would be embedded.

        Raises:
            NotImplementedError: Always.
        """
        message = "Wav2Vec2 audio embedding not implemented."
        raise NotImplementedError(message)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Text embedding entry point (unsupported by this backend).

        Args:
            texts: Text strings that would be embedded.

        Raises:
            NotImplementedError: Always.
        """
        message = "Wav2Vec2 backend does not support text embeddings."
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Whisper encoder embedding backend stub."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import numpy.typing as npt
|
|
5
|
+
|
|
6
|
+
from echovector.embeddings.base import EmbeddingBackend
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class WhisperBackend(EmbeddingBackend):
    """Placeholder for a Whisper-based embedding backend.

    Intended to use the encoder of Whisper models to extract audio features;
    this stub only records a model name.
    """

    def __init__(self, model_name: str = "openai/whisper-base") -> None:
        """Remember the model identifier; no model is loaded."""
        self.model_name = model_name

    @property
    def embedding_dim(self) -> int:
        """Report the fixed placeholder embedding size."""
        stub_dim = 512
        return stub_dim

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Audio embedding entry point (not implemented in this stub).

        Args:
            audio_paths: Paths of the audio files that would be embedded.

        Raises:
            NotImplementedError: Always.
        """
        message = "Whisper audio embedding not implemented."
        raise NotImplementedError(message)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Text embedding entry point (unsupported by this backend).

        Args:
            texts: Text strings that would be embedded.

        Raises:
            NotImplementedError: Always.
        """
        message = "Whisper backend does not support text embeddings."
        raise NotImplementedError(message)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Evaluation module initialization."""
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Evaluation metrics for comparing audio vectors and embeddings."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import numpy.typing as npt
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def cosine_similarity(vec1: npt.NDArray[np.float32], vec2: npt.NDArray[np.float32]) -> float:
    """Calculate the cosine similarity between two vectors.

    Multi-dimensional inputs are flattened before comparison.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        Cosine similarity score between -1.0 and 1.0, or 0.0 when either
        vector has zero norm.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    if vec1.shape != vec2.shape:
        raise ValueError("Vectors must have the same shape.")

    # Work in float64: squaring small float32 components can underflow to 0,
    # which would make a non-zero vector look like a zero vector.
    flat1 = np.asarray(vec1, dtype=np.float64).ravel()
    flat2 = np.asarray(vec2, dtype=np.float64).ravel()

    norm1 = float(np.linalg.norm(flat1))
    norm2 = float(np.linalg.norm(flat2))

    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0

    similarity = float(np.dot(flat1, flat2)) / (norm1 * norm2)
    # Rounding can land a hair outside [-1, 1]; clip to the documented range.
    # np.clip (unlike min/max) lets NaN from non-finite inputs pass through.
    return float(np.clip(similarity, -1.0, 1.0))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def euclidean_distance(vec1: npt.NDArray[np.float32], vec2: npt.NDArray[np.float32]) -> float:
    """Return the straight-line (L2) distance between two equally shaped arrays.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        Euclidean distance, computed over the flattened inputs.
    """
    shapes_match = vec1.shape == vec2.shape
    if not shapes_match:
        raise ValueError("Vectors must have the same shape.")

    delta = vec1.ravel() - vec2.ravel()
    return float(np.linalg.norm(delta))
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Base interfaces for the indexing module."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import numpy.typing as npt
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseStore(ABC):
|
|
11
|
+
"""Abstract base class for metadata storage."""
|
|
12
|
+
|
|
13
|
+
@abstractmethod
|
|
14
|
+
def initialize(self) -> None:
|
|
15
|
+
"""Initialize the store (e.g., create tables)."""
|
|
16
|
+
|
|
17
|
+
@abstractmethod
|
|
18
|
+
def add(
|
|
19
|
+
self, int_ids: list[int], string_ids: list[str], metadata_list: list[dict[str, Any]]
|
|
20
|
+
) -> None:
|
|
21
|
+
"""Add metadata and ID mappings to the store.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
int_ids: List of integer IDs assigned by the index.
|
|
25
|
+
string_ids: List of original string IDs.
|
|
26
|
+
metadata_list: List of metadata dictionaries.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
@abstractmethod
|
|
30
|
+
def get_by_int_ids(
|
|
31
|
+
self, int_ids: list[int]
|
|
32
|
+
) -> tuple[list[str | None], list[dict[str, Any] | None]]:
|
|
33
|
+
"""Retrieve string IDs and metadata for a list of integer IDs.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
int_ids: List of integer IDs to query.
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
A tuple containing a list of string IDs and a list of metadata dictionaries.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
@abstractmethod
|
|
43
|
+
def get_max_int_id(self) -> int:
|
|
44
|
+
"""Get the maximum integer ID currently in the store.
|
|
45
|
+
|
|
46
|
+
Returns:
|
|
47
|
+
The maximum integer ID, or -1 if empty.
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
@abstractmethod
|
|
51
|
+
def close(self) -> None:
|
|
52
|
+
"""Close any open connections."""
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class BaseIndex(ABC):
|
|
56
|
+
"""Abstract base class for vector indices."""
|
|
57
|
+
|
|
58
|
+
@abstractmethod
|
|
59
|
+
def add(
|
|
60
|
+
self,
|
|
61
|
+
embeddings: npt.NDArray[np.float32],
|
|
62
|
+
ids: list[str],
|
|
63
|
+
metadata: list[dict[str, Any]] | None = None,
|
|
64
|
+
) -> None:
|
|
65
|
+
"""Add embeddings, IDs, and metadata to the index.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
embeddings: A 2D numpy array of embeddings.
|
|
69
|
+
ids: A list of string IDs corresponding to the embeddings.
|
|
70
|
+
metadata: An optional list of metadata dictionaries.
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
@abstractmethod
|
|
74
|
+
def search(
|
|
75
|
+
self, query_embeddings: npt.NDArray[np.float32], k: int = 10
|
|
76
|
+
) -> tuple[
|
|
77
|
+
npt.NDArray[np.float32],
|
|
78
|
+
list[list[str | None]],
|
|
79
|
+
list[list[dict[str, Any] | None]],
|
|
80
|
+
]:
|
|
81
|
+
"""Search for the k nearest neighbors for each query.
|
|
82
|
+
|
|
83
|
+
Args:
|
|
84
|
+
query_embeddings: A 2D numpy array of query embeddings.
|
|
85
|
+
k: The number of nearest neighbors to retrieve.
|
|
86
|
+
|
|
87
|
+
Returns:
|
|
88
|
+
A tuple of (distances, string_ids, metadata).
|
|
89
|
+
"""
|
|
90
|
+
|
|
91
|
+
@abstractmethod
|
|
92
|
+
def save(self, index_path: str) -> None:
|
|
93
|
+
"""Save the index to disk.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
index_path: The file path to save the index to.
|
|
97
|
+
"""
|
|
98
|
+
|
|
99
|
+
@abstractmethod
|
|
100
|
+
def load(self, index_path: str) -> None:
|
|
101
|
+
"""Load the index from disk.
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
index_path: The file path to load the index from.
|
|
105
|
+
"""
|