echo-vector 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ from collections.abc import Generator
2
+
3
+ import numpy as np
4
+ import numpy.typing as npt
5
+ import soundfile as sf
6
+
7
+
8
class AudioStreamer:
    """Reads an audio file lazily, one fixed-size block at a time."""

    def __init__(self, block_size: int = 4096) -> None:
        """Create a streamer.

        Args:
            block_size: How many frames each yielded block should hold.
        """
        self.block_size = block_size

    def stream(self, file_path: str) -> Generator[npt.NDArray[np.float32], None, None]:
        """Yield the contents of ``file_path`` block by block.

        Blocks are requested from ``soundfile`` as ``float32``. Any block with
        more than one dimension is reduced to one dimension by averaging along
        axis 1 (presumably the channel axis in soundfile's layout).

        Args:
            file_path: Location of the audio file to read.

        Yields:
            One-dimensional ``float32`` arrays of audio data.
        """
        reader = sf.blocks(
            file_path,
            blocksize=self.block_size,
            dtype="float32",
            always_2d=False,
        )
        for frames in reader:
            # 1-D blocks pass through untouched; anything else is averaged over axis 1.
            yield frames if frames.ndim <= 1 else np.mean(frames, axis=1, dtype=np.float32)
@@ -0,0 +1 @@
1
+ """CLI module for EchoVector."""
echovector/cli/main.py ADDED
@@ -0,0 +1,165 @@
1
+ """Command-line interface for EchoVector."""
2
+
3
+ from pathlib import Path
4
+ from typing import Annotated
5
+
6
+ import typer
7
+ from rich.console import Console
8
+ from rich.table import Table
9
+
10
+ from echovector import EchoVector
11
+
12
+ app = typer.Typer(
13
+ name="echovector",
14
+ help="EchoVector CLI for semantic text search over audio files.",
15
+ add_completion=False,
16
+ )
17
+ console = Console()
18
+
19
+
20
def _open_engine(store_dir: Path, backend: str, device: str | None) -> EchoVector:
    """Build an ``EchoVector`` for ``store_dir``, forwarding ``device`` only when it is set."""
    # An empty/None device is omitted entirely so the constructor is called without it.
    device_kwargs: dict[str, str] = {"device": device} if device else {}
    return EchoVector(store_dir=store_dir, backend=backend, **device_kwargs)
24
+
25
+
26
@app.command()
def index(
    files: Annotated[
        list[Path],
        typer.Argument(help="Audio files or directories to index."),
    ],
    recursive: Annotated[
        bool,
        typer.Option("--recursive", "-r", help="Recursively search directories for audio files."),
    ] = False,
    store_dir: Annotated[
        Path,
        typer.Option("--store-dir", help="Directory for index files."),
    ] = Path(".echovector"),
    backend: Annotated[str, typer.Option("--backend", help="Embedding backend to use.")] = "clap",
    device: Annotated[
        str | None,
        typer.Option("--device", help="Model device, e.g. cpu or cuda."),
    ] = None,
    reset: Annotated[
        bool,
        typer.Option("--reset", help="Clear the existing index before indexing."),
    ] = False,
    chunk_seconds: Annotated[
        float,
        typer.Option("--chunk-seconds", help="Audio chunk size to embed."),
    ] = 10.0,
    overlap_seconds: Annotated[
        float,
        typer.Option("--overlap-seconds", help="Overlap between adjacent chunks."),
    ] = 1.0,
) -> None:
    """Index audio files for search."""
    # Flow: build the engine, optionally reset it, index ``files``, then print a
    # summary. Any exception from those steps is reported and turned into exit code 1.
    # Imported here so the command brings its own dependency into scope.
    from rich.markup import escape

    # ``device`` is forwarded only when supplied; otherwise the engine is built without it.
    device_kwargs = {"device": device} if device else {}

    try:
        engine = EchoVector(
            store_dir=store_dir,
            backend=backend,
            chunk_seconds=chunk_seconds,
            overlap_seconds=overlap_seconds,
            **device_kwargs,
        )
        if reset:
            engine.reset()
        count = engine.index(files, recursive=recursive)
    except Exception as exc:
        # Exception text is arbitrary (it often embeds file paths). Escape it so
        # square brackets are printed literally instead of being parsed as markup.
        console.print(f"[bold red]Indexing failed:[/bold red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc

    console.print(f"[bold green]Indexing complete.[/bold green] Indexed {count} chunk(s).")
    # The store path is user input as well, so it gets the same escaping.
    console.print(f"[dim]Store: {escape(str(store_dir))}[/dim]")
84
+
85
+
86
@app.command()
def search(
    query: Annotated[str, typer.Argument(help="Text query to search for.")],
    top_k: Annotated[
        int,
        typer.Option("--top-k", "-k", help="Number of top results to return."),
    ] = 5,
    store_dir: Annotated[
        Path,
        typer.Option("--store-dir", help="Directory containing the index."),
    ] = Path(".echovector"),
    backend: Annotated[
        str,
        typer.Option("--backend", help="Embedding backend used for the index."),
    ] = "clap",
    device: Annotated[
        str | None,
        typer.Option("--device", help="Model device, e.g. cpu or cuda."),
    ] = None,
) -> None:
    """Search an existing vector index without scanning audio files."""
    # Flow: open the engine, run the query, then print either a results table or a
    # "no results" notice. Any exception is reported and turned into exit code 1.
    # Imported here so the command brings its own dependency into scope.
    from rich.markup import escape

    try:
        engine = _open_engine(store_dir=store_dir, backend=backend, device=device)
        results = engine.search(query, top_k=top_k)
    except Exception as exc:
        # Escape the message so brackets in it are not interpreted as Rich markup.
        console.print(f"[bold red]Search failed:[/bold red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc

    if not results:
        console.print("[yellow]No results found.[/yellow]")
        return

    # The query and file names are free-form text; escape them before they are
    # embedded in strings that Rich renders as markup.
    table = Table(title=f"Top {top_k} Results for '{escape(query)}'")
    table.add_column("Score", justify="right", style="cyan", no_wrap=True)
    table.add_column("File", style="magenta")
    table.add_column("Timestamp", justify="right", style="green")

    for result in results:
        span = result.timestamp_range
        table.add_row(
            f"{result.score:.4f}",
            escape(str(result.metadata.get("filename", result.filepath))),
            f"{span.start:.1f}s-{span.end:.1f}s",
        )

    console.print(table)
130
+
131
+
132
@app.command()
def stats(
    store_dir: Annotated[
        Path,
        typer.Option("--store-dir", help="Directory containing the index."),
    ] = Path(".echovector"),
    backend: Annotated[
        str,
        typer.Option("--backend", help="Embedding backend used for the index."),
    ] = "clap",
    device: Annotated[
        str | None,
        typer.Option("--device", help="Model device, e.g. cpu or cuda."),
    ] = None,
) -> None:
    """Display statistics about the current index."""
    # Flow: open the engine, fetch its stats mapping, and print one table row per
    # entry. Any exception is reported and turned into exit code 1.
    # Imported here so the command brings its own dependency into scope.
    from rich.markup import escape

    try:
        stats_data = _open_engine(store_dir=store_dir, backend=backend, device=device).stats()
    except Exception as exc:
        # Escape the message so brackets in it are not interpreted as Rich markup.
        console.print(f"[bold red]Stats failed:[/bold red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc

    table = Table(title="Index Statistics", show_header=False)
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="magenta")

    for key, value in stats_data.items():
        # Values include filesystem paths, which may legitimately contain brackets.
        table.add_row(escape(str(key)), escape(str(value)))

    console.print(table)
162
+
163
+
164
+ if __name__ == "__main__":
165
+ app()
echovector/core.py ADDED
@@ -0,0 +1,289 @@
1
+ """Core public API for EchoVector."""
2
+
3
+ from collections.abc import Sequence
4
+ from pathlib import Path
5
+ from tempfile import TemporaryDirectory
6
+ from typing import Any, cast
7
+
8
+ import numpy as np
9
+ import numpy.typing as npt
10
+ import soundfile as sf
11
+
12
+ from echovector.audio.metadata import extract_metadata
13
+ from echovector.audio.processor import AudioProcessor as FileAudioProcessor
14
+ from echovector.embeddings.base import EmbeddingBackend
15
+ from echovector.embeddings.factory import get_embedding_model
16
+ from echovector.indexing.faiss_index import FaissIndex
17
+ from echovector.search.results import SearchResult, TimestampRange
18
+
19
+ AUDIO_EXTENSIONS = {".wav", ".mp3", ".flac", ".m4a", ".ogg", ".aiff", ".aif"}
20
+
21
+
22
class EchoVector:
    """High-level interface for indexing and searching audio files.

    Audio is cut into fixed-length, optionally overlapping chunks. Each chunk is
    embedded by the configured backend, L2-normalized, and added to a
    ``FaissIndex`` whose files live under ``store_dir``. Text queries are embedded
    with the same backend and matched against the stored chunk vectors.
    """

    def __init__(
        self,
        store_dir: str | Path = ".echovector",
        backend: str | EmbeddingBackend = "clap",
        recursive: bool = True,
        chunk_seconds: float = 10.0,
        overlap_seconds: float = 1.0,
        sample_rate: int = 48_000,
        **backend_kwargs: Any,
    ) -> None:
        """Initialize EchoVector.

        Args:
            store_dir: Directory holding ``index.faiss`` and ``metadata.sqlite``;
                created if missing.
            backend: Backend name resolved through ``get_embedding_model``, or an
                already-constructed embedding backend.
            recursive: Default for whether directories are searched recursively.
            chunk_seconds: Length of each audio chunk in seconds.
            overlap_seconds: Overlap between consecutive chunks in seconds.
            sample_rate: Sample rate used for loading audio and writing chunks.
            **backend_kwargs: Forwarded to ``get_embedding_model`` when ``backend``
                is a string.

        Raises:
            ValueError: If ``chunk_seconds`` or ``sample_rate`` is not positive,
                ``overlap_seconds`` is negative, or ``overlap_seconds`` is not
                smaller than ``chunk_seconds``.
        """
        if chunk_seconds <= 0.0:
            raise ValueError("chunk_seconds must be positive.")
        if overlap_seconds < 0.0:
            raise ValueError("overlap_seconds cannot be negative.")
        if overlap_seconds >= chunk_seconds:
            raise ValueError("overlap_seconds must be smaller than chunk_seconds.")
        # Chunk timestamps divide by the sample rate, so reject it up front instead
        # of failing later with a ZeroDivisionError in the middle of indexing.
        if sample_rate <= 0:
            raise ValueError("sample_rate must be positive.")

        self.store_dir = Path(store_dir)
        self.store_dir.mkdir(parents=True, exist_ok=True)
        self.index_path = self.store_dir / "index.faiss"
        self.db_path = self.store_dir / "metadata.sqlite"
        self.recursive = recursive
        self.chunk_seconds = chunk_seconds
        self.overlap_seconds = overlap_seconds
        self.sample_rate = sample_rate
        self.audio_processor = FileAudioProcessor(target_sample_rate=sample_rate, mono=True)

        self.embedder = (
            get_embedding_model(backend, **backend_kwargs) if isinstance(backend, str) else backend
        )
        self.index_backend = FaissIndex(
            dimension=self.embedder.embedding_dim,
            db_path=str(self.db_path),
        )

        # Pick up a previously saved index, if there is one.
        if self.index_path.exists():
            self.index_backend.load(str(self.index_path))

    def index(
        self,
        targets: str | Path | Sequence[str | Path],
        recursive: bool | None = None,
        batch_size: int = 16,
        force: bool = False,
    ) -> int:
        """Index audio chunks from paths or directories.

        Args:
            targets: One or more file paths or directories to index.
            recursive: Override the instance-level recursive setting.
            batch_size: Number of chunks to embed per batch.
            force: If True, remove and re-index files that are already stored.
                If False (default), already-indexed files are skipped.

        Returns:
            Number of new chunks added to the index.

        Raises:
            FileNotFoundError: If a target path does not exist.
        """
        files = self._resolve_audio_files(
            targets,
            self.recursive if recursive is None else recursive,
        )
        if not files:
            return 0

        if force:
            # Drop stale entries first so re-indexing does not duplicate chunks.
            for file_path in files:
                int_ids = self.index_backend.store.delete_by_filepath(str(file_path))
                if int_ids:
                    self.index_backend.remove_int_ids(int_ids)
        else:
            files = [f for f in files if not self.index_backend.store.has_filepath(str(f))]
            if not files:
                return 0

        indexed_chunks = 0
        with TemporaryDirectory(prefix="echovector-chunks-") as temp_dir:
            temp_root = Path(temp_dir)
            chunk_paths: list[Path] = []
            chunk_ids: list[str] = []
            chunk_metadata: list[dict[str, Any]] = []

            for file_path in files:
                audio = self.audio_processor.load_audio(str(file_path))
                chunks = self._iter_chunks(audio)
                # File-level metadata is the same for every chunk of a file, so it
                # is read at most once per file (and not at all for empty audio).
                file_metadata: Any | None = None
                for chunk_number, start_seconds, end_seconds, chunk_audio in chunks:
                    if file_metadata is None:
                        file_metadata = extract_metadata(str(file_path))

                    # The embedder takes file paths, so each chunk is written to disk.
                    chunk_path = temp_root / f"chunk-{indexed_chunks:08d}.wav"
                    sf.write(chunk_path, chunk_audio, self.sample_rate)
                    chunk_paths.append(chunk_path)
                    chunk_ids.append(f"{file_path}#{start_seconds:.3f}-{end_seconds:.3f}")
                    chunk_metadata.append(
                        self._metadata_for_chunk(
                            file_path,
                            chunk_number,
                            start_seconds,
                            end_seconds,
                            file_metadata=file_metadata,
                        )
                    )
                    indexed_chunks += 1

                    if len(chunk_paths) >= batch_size:
                        self._flush_chunk_batch(chunk_paths, chunk_ids, chunk_metadata)

            # Embed whatever is left over from the last, partially filled batch.
            if chunk_paths:
                self._flush_chunk_batch(chunk_paths, chunk_ids, chunk_metadata)

        self.index_backend.save(str(self.index_path))
        return indexed_chunks

    def search(self, query: str, top_k: int = 5) -> list[SearchResult]:
        """Search indexed audio with a text query.

        Args:
            query: Text to embed and match against the indexed chunks.
            top_k: Maximum number of results; ``0`` returns an empty list.

        Returns:
            One ``SearchResult`` per hit returned by the index backend.

        Raises:
            ValueError: If ``top_k`` is negative.
        """
        if top_k < 0:
            raise ValueError("top_k must be non-negative.")
        if top_k == 0:
            return []

        query_embedding = self._normalize_rows(self.embedder.embed_text([query]))
        distances, ids, metadata_rows = self.index_backend.search(query_embedding, k=top_k)

        results: list[SearchResult] = []
        # Only one query was submitted, so only row 0 of each output is relevant.
        for offset, string_id in enumerate(ids[0] if ids else []):
            if string_id is None:
                continue
            metadata = metadata_rows[0][offset] if metadata_rows and metadata_rows[0] else {}
            metadata = metadata or {}
            results.append(
                SearchResult(
                    filepath=str(metadata.get("filepath", string_id)),
                    timestamp_range=TimestampRange(
                        start=float(metadata.get("start", 0.0)),
                        # Without a chunk end, fall back to the file duration.
                        end=float(metadata.get("end", metadata.get("duration", 0.0))),
                    ),
                    score=float(distances[0][offset]),
                    metadata=metadata,
                )
            )

        return results

    def stats(self) -> dict[str, Any]:
        """Return basic index statistics.

        ``chunks`` and ``vectors`` both report the backend index's ``ntotal``.
        """
        total = int(self.index_backend.index.ntotal)
        return {
            "store_dir": str(self.store_dir),
            "index_path": str(self.index_path),
            "metadata_path": str(self.db_path),
            "embedding_dim": self.embedder.embedding_dim,
            "chunks": total,
            "vectors": total,
            "chunk_seconds": self.chunk_seconds,
            "overlap_seconds": self.overlap_seconds,
        }

    def reset(self) -> None:
        """Clear the current on-disk and in-memory index."""
        if self.index_path.exists():
            self.index_path.unlink()
        if self.db_path.exists():
            # Close the metadata store before deleting the file it is backed by.
            self.index_backend.store.close()
            self.db_path.unlink()
        self.index_backend = FaissIndex(
            dimension=self.embedder.embedding_dim,
            db_path=str(self.db_path),
        )

    def _resolve_audio_files(
        self,
        targets: str | Path | Sequence[str | Path],
        recursive: bool,
    ) -> list[Path]:
        """Expand ``targets`` into a sorted, de-duplicated list of audio files.

        Directories are scanned (recursively when ``recursive`` is True) for files
        whose suffix is in ``AUDIO_EXTENSIONS``. Existing files with any other
        suffix are skipped.

        Raises:
            FileNotFoundError: If a target path does not exist.
        """
        target_list = [targets] if isinstance(targets, (str, Path)) else targets
        files: list[Path] = []

        for target in target_list:
            path = Path(target).expanduser()
            if path.is_dir():
                iterator = path.rglob("*") if recursive else path.glob("*")
                files.extend(
                    candidate
                    for candidate in iterator
                    if candidate.is_file() and candidate.suffix.lower() in AUDIO_EXTENSIONS
                )
            elif path.is_file() and path.suffix.lower() in AUDIO_EXTENSIONS:
                files.append(path)
            elif not path.exists():
                raise FileNotFoundError(f"Audio path not found: {path}")

        return sorted(dict.fromkeys(files))

    def _metadata_for_chunk(
        self,
        path: Path,
        chunk_number: int,
        start_seconds: float,
        end_seconds: float,
        file_metadata: Any | None = None,
    ) -> dict[str, Any]:
        """Build the metadata row stored alongside one chunk.

        Args:
            path: Source audio file of the chunk.
            chunk_number: Position of the chunk within its file.
            start_seconds: Chunk start offset in seconds.
            end_seconds: Chunk end offset in seconds.
            file_metadata: Result of ``extract_metadata`` for ``path``, if the
                caller already has it; extracted here when omitted.
        """
        metadata = extract_metadata(str(path)) if file_metadata is None else file_metadata
        return {
            "filepath": str(path),
            "filename": path.name,
            "chunk_id": chunk_number,
            "start": start_seconds,
            "end": end_seconds,
            "chunk_duration": end_seconds - start_seconds,
            "duration": metadata.duration,
            "sample_rate": metadata.sample_rate,
            "channels": metadata.channels,
            "format": metadata.format,
            "file_size": metadata.file_size,
        }

    def _iter_chunks(
        self,
        audio: npt.NDArray[np.float32],
    ) -> list[tuple[int, float, float, npt.NDArray[np.float32]]]:
        """Split ``audio`` into overlapping chunks.

        Returns:
            A list of ``(chunk_number, start_seconds, end_seconds, samples)``
            tuples. The last chunk may be shorter than ``chunk_seconds``. Empty
            audio yields an empty list.
        """
        # Rounding can collapse a tiny chunk length to 0 samples, or push the
        # overlap up to the chunk length. Either would make the step 0 and the
        # loop below never advance, so clamp to a step of at least one sample.
        chunk_samples = max(1, round(self.chunk_seconds * self.sample_rate))
        overlap_samples = min(round(self.overlap_seconds * self.sample_rate), chunk_samples - 1)
        step_samples = max(1, chunk_samples - overlap_samples)

        total_samples = len(audio)
        chunks: list[tuple[int, float, float, npt.NDArray[np.float32]]] = []
        for chunk_number, start_sample in enumerate(range(0, total_samples, step_samples)):
            end_sample = min(start_sample + chunk_samples, total_samples)
            chunks.append(
                (
                    chunk_number,
                    start_sample / self.sample_rate,
                    end_sample / self.sample_rate,
                    audio[start_sample:end_sample],
                )
            )
            # Once a chunk reaches the end of the audio, later starts would only
            # produce fragments fully contained in it.
            if end_sample == total_samples:
                break

        return chunks

    def _flush_chunk_batch(
        self,
        chunk_paths: list[Path],
        chunk_ids: list[str],
        chunk_metadata: list[dict[str, Any]],
    ) -> None:
        """Embed and store the pending batch, then delete its files and empty the buffers."""
        self._add_chunk_batch(chunk_paths, chunk_ids, chunk_metadata)
        # The embeddings are already computed, so the temporary WAVs can go now.
        # This keeps disk usage bounded by one batch rather than the whole corpus.
        for chunk_path in chunk_paths:
            chunk_path.unlink(missing_ok=True)
        chunk_paths.clear()
        chunk_ids.clear()
        chunk_metadata.clear()

    def _add_chunk_batch(
        self,
        chunk_paths: Sequence[Path],
        chunk_ids: Sequence[str],
        chunk_metadata: Sequence[dict[str, Any]],
    ) -> None:
        """Embed the chunk files and add the normalized vectors to the index backend."""
        embeddings = self.embedder.embed_audio([str(path) for path in chunk_paths])
        self.index_backend.add(
            self._normalize_rows(embeddings),
            list(chunk_ids),
            list(chunk_metadata),
        )

    def _normalize_rows(
        self,
        embeddings: npt.NDArray[np.float32],
    ) -> npt.NDArray[np.float32]:
        """Return ``embeddings`` as float32 with every row scaled to unit L2 norm.

        All-zero rows are returned unchanged rather than dividing by zero.
        """
        embeddings_f32 = np.asarray(embeddings, dtype=np.float32)
        norms = np.linalg.norm(embeddings_f32, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        return cast("npt.NDArray[np.float32]", embeddings_f32 / norms)
@@ -0,0 +1,15 @@
1
+ """EchoVector embeddings module.
2
+
3
+ Contains backends for generating embeddings from audio and text.
4
+ """
5
+
6
+ from echovector.embeddings.base import EmbeddingBackend
7
+ from echovector.embeddings.cache import EmbeddingCache
8
+ from echovector.embeddings.factory import EmbeddingFactory, get_embedding_model
9
+
10
+ __all__ = [
11
+ "EmbeddingBackend",
12
+ "EmbeddingCache",
13
+ "EmbeddingFactory",
14
+ "get_embedding_model",
15
+ ]
@@ -0,0 +1,41 @@
1
+ """AST (Audio Spectrogram Transformer) embedding backend stub."""
2
+
3
+ import numpy as np
4
+ import numpy.typing as npt
5
+
6
+ from echovector.embeddings.base import EmbeddingBackend
7
+
8
+
9
class ASTBackend(EmbeddingBackend):
    """Placeholder backend for the Audio Spectrogram Transformer (AST).

    Only the model name and a fixed embedding size are provided. Both embedding
    methods raise ``NotImplementedError``.
    """

    def __init__(self, model_name: str = "MIT/ast-finetuned-audioset-10-10-0.4593") -> None:
        """Record which AST checkpoint this stub stands in for.

        Args:
            model_name: Identifier of the AST model.
        """
        self.model_name = model_name

    @property
    def embedding_dim(self) -> int:
        """Size of the embeddings this stub reports (fixed at 768)."""
        return 768

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Placeholder for audio embedding.

        Args:
            audio_paths: Paths of the audio files that would be embedded.

        Raises:
            NotImplementedError: Always; the stub has no audio embedding.
        """
        message = "AST audio embedding not implemented."
        raise NotImplementedError(message)

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Placeholder for text embedding.

        Args:
            texts: Text strings that would be embedded.

        Raises:
            NotImplementedError: Always; this backend has no text embedding.
        """
        message = "AST backend does not support text embeddings."
        raise NotImplementedError(message)
@@ -0,0 +1,43 @@
1
+ """Base protocols and types for EchoVector embedding backends."""
2
+
3
+ from typing import Protocol
4
+
5
+ import numpy as np
6
+ import numpy.typing as npt
7
+
8
+
9
class EmbeddingBackend(Protocol):
    """Structural interface every embedding model must satisfy."""

    @property
    def embedding_dim(self) -> int:
        """Size of each embedding vector the backend produces.

        Returns:
            The embedding dimensionality as an integer.
        """

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Compute embeddings for a batch of audio files.

        Args:
            audio_paths: Paths of the audio files to embed.

        Returns:
            An array of shape (batch_size, embedding_dim).
        """

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Compute embeddings for a batch of text queries.

        Backends without text support raise NotImplementedError.

        Args:
            texts: Text strings to embed.

        Returns:
            An array of shape (batch_size, embedding_dim).
        """
@@ -0,0 +1,96 @@
1
+ """Content-addressed caching for audio embeddings."""
2
+
3
+ import hashlib
4
+ from pathlib import Path
5
+ from typing import cast
6
+
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+
10
+
11
+ class EmbeddingCache:
12
+ """Content-addressed storage for embeddings.
13
+
14
+ Embeddings are cached based on the hash of the file content or text.
15
+ """
16
+
17
+ def __init__(self, cache_dir: str | Path = ".cache/echovector") -> None:
18
+ """Initialize the embedding cache.
19
+
20
+ Args:
21
+ cache_dir: Directory to store the cache files.
22
+ """
23
+ self.cache_dir = Path(cache_dir)
24
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
25
+
26
+ def _compute_file_hash(self, filepath: str) -> str:
27
+ """Compute SHA-256 hash of a file's contents."""
28
+ hasher = hashlib.sha256()
29
+ with open(filepath, "rb") as f:
30
+ while chunk := f.read(8192):
31
+ hasher.update(chunk)
32
+ return hasher.hexdigest()
33
+
34
+ def _compute_text_hash(self, text: str) -> str:
35
+ """Compute SHA-256 hash of a text string."""
36
+ hasher = hashlib.sha256()
37
+ hasher.update(text.encode("utf-8"))
38
+ return hasher.hexdigest()
39
+
40
+ def _get_cache_path(self, hash_key: str) -> Path:
41
+ """Get the file path for a cached embedding."""
42
+ return self.cache_dir / f"{hash_key}.npy"
43
+
44
+ def get_audio_embedding(self, filepath: str) -> npt.NDArray[np.float32] | None:
45
+ """Retrieve cached embedding for an audio file.
46
+
47
+ Args:
48
+ filepath: Path to the audio file.
49
+
50
+ Returns:
51
+ The embedding array if cached, else None.
52
+ """
53
+ hash_key = self._compute_file_hash(filepath)
54
+ cache_path = self._get_cache_path(hash_key)
55
+
56
+ if cache_path.exists():
57
+ return cast("npt.NDArray[np.float32]", np.load(cache_path))
58
+ return None
59
+
60
+ def put_audio_embedding(self, filepath: str, embedding: npt.NDArray[np.float32]) -> None:
61
+ """Store embedding for an audio file in cache.
62
+
63
+ Args:
64
+ filepath: Path to the audio file.
65
+ embedding: The embedding array.
66
+ """
67
+ hash_key = self._compute_file_hash(filepath)
68
+ cache_path = self._get_cache_path(hash_key)
69
+ np.save(cache_path, embedding)
70
+
71
+ def get_text_embedding(self, text: str) -> npt.NDArray[np.float32] | None:
72
+ """Retrieve cached embedding for a text string.
73
+
74
+ Args:
75
+ text: The text string.
76
+
77
+ Returns:
78
+ The embedding array if cached, else None.
79
+ """
80
+ hash_key = self._compute_text_hash(text)
81
+ cache_path = self._get_cache_path(hash_key)
82
+
83
+ if cache_path.exists():
84
+ return cast("npt.NDArray[np.float32]", np.load(cache_path))
85
+ return None
86
+
87
+ def put_text_embedding(self, text: str, embedding: npt.NDArray[np.float32]) -> None:
88
+ """Store embedding for a text string in cache.
89
+
90
+ Args:
91
+ text: The text string.
92
+ embedding: The embedding array.
93
+ """
94
+ hash_key = self._compute_text_hash(text)
95
+ cache_path = self._get_cache_path(hash_key)
96
+ np.save(cache_path, embedding)