echo-vector 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- echo_vector-0.1.1.dist-info/METADATA +288 -0
- echo_vector-0.1.1.dist-info/RECORD +38 -0
- echo_vector-0.1.1.dist-info/WHEEL +4 -0
- echo_vector-0.1.1.dist-info/entry_points.txt +2 -0
- echovector/__init__.py +7 -0
- echovector/api/__init__.py +1 -0
- echovector/api/server.py +144 -0
- echovector/audio/__init__.py +12 -0
- echovector/audio/chunker.py +71 -0
- echovector/audio/metadata.py +58 -0
- echovector/audio/processor.py +53 -0
- echovector/audio/streaming.py +33 -0
- echovector/cli/__init__.py +1 -0
- echovector/cli/main.py +165 -0
- echovector/core.py +289 -0
- echovector/embeddings/__init__.py +15 -0
- echovector/embeddings/ast_model.py +41 -0
- echovector/embeddings/base.py +43 -0
- echovector/embeddings/cache.py +96 -0
- echovector/embeddings/clap.py +126 -0
- echovector/embeddings/factory.py +78 -0
- echovector/embeddings/hubert.py +41 -0
- echovector/embeddings/local.py +109 -0
- echovector/embeddings/wav2vec2.py +41 -0
- echovector/embeddings/whisper_enc.py +44 -0
- echovector/evaluation/__init__.py +1 -0
- echovector/evaluation/metrics.py +45 -0
- echovector/indexing/__init__.py +12 -0
- echovector/indexing/base.py +105 -0
- echovector/indexing/faiss_index.py +182 -0
- echovector/indexing/store.py +165 -0
- echovector/search/__init__.py +14 -0
- echovector/search/engine.py +82 -0
- echovector/search/filters.py +55 -0
- echovector/search/results.py +41 -0
- echovector/utils/__init__.py +6 -0
- echovector/utils/config.py +69 -0
- echovector/utils/logging.py +31 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import numpy.typing as npt
|
|
5
|
+
import soundfile as sf
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AudioStreamer:
    """Read an audio file incrementally as fixed-size blocks of samples."""

    def __init__(self, block_size: int = 4096) -> None:
        """Create a streamer.

        Args:
            block_size: How many frames each yielded block should contain.
        """
        self.block_size = block_size

    def stream(self, file_path: str) -> Generator[npt.NDArray[np.float32], None, None]:
        """Lazily iterate over the audio stored in ``file_path``.

        Args:
            file_path: Location of the audio file to read.

        Yields:
            One float32 numpy array per block. Blocks with more than one
            dimension are averaged over axis 1 before being yielded.
        """
        reader = sf.blocks(
            file_path,
            blocksize=self.block_size,
            dtype="float32",
            always_2d=False,
        )
        for frames in reader:
            # Multi-dimensional blocks are collapsed by averaging along axis 1.
            yield frames if frames.ndim <= 1 else np.mean(frames, axis=1, dtype=np.float32)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""CLI module for EchoVector."""
|
echovector/cli/main.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Command-line interface for EchoVector."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Annotated
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
|
|
10
|
+
from echovector import EchoVector
|
|
11
|
+
|
|
12
|
+
app = typer.Typer(
|
|
13
|
+
name="echovector",
|
|
14
|
+
help="EchoVector CLI for semantic text search over audio files.",
|
|
15
|
+
add_completion=False,
|
|
16
|
+
)
|
|
17
|
+
console = Console()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _open_engine(store_dir: Path, backend: str, device: str | None) -> EchoVector:
    """Build an ``EchoVector`` for the given store directory and backend.

    ``device`` is only forwarded when it is truthy, so that omitting it on the
    command line leaves the choice to ``EchoVector`` itself.
    """
    if not device:
        return EchoVector(store_dir=store_dir, backend=backend)
    return EchoVector(store_dir=store_dir, backend=backend, device=device)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@app.command()
def index(
    files: Annotated[
        list[Path],
        typer.Argument(help="Audio files or directories to index."),
    ],
    recursive: Annotated[
        bool,
        typer.Option("--recursive", "-r", help="Recursively search directories for audio files."),
    ] = False,
    store_dir: Annotated[
        Path,
        typer.Option("--store-dir", help="Directory for index files."),
    ] = Path(".echovector"),
    backend: Annotated[str, typer.Option("--backend", help="Embedding backend to use.")] = "clap",
    device: Annotated[
        str | None,
        typer.Option("--device", help="Model device, e.g. cpu or cuda."),
    ] = None,
    reset: Annotated[
        bool,
        typer.Option("--reset", help="Clear the existing index before indexing."),
    ] = False,
    chunk_seconds: Annotated[
        float,
        typer.Option("--chunk-seconds", help="Audio chunk size to embed."),
    ] = 10.0,
    overlap_seconds: Annotated[
        float,
        typer.Option("--overlap-seconds", help="Overlap between adjacent chunks."),
    ] = 1.0,
) -> None:
    """Index audio files for search.

    Builds an ``EchoVector`` engine for ``store_dir``, optionally resets it,
    indexes ``files`` and prints the number of chunks reported by the engine.
    Any exception raised while doing so is printed and converted into exit
    code 1.
    """
    # Local import: keeps this command self-contained without touching the
    # module-level import block.
    from rich.markup import escape

    # Forward ``device`` only when it was supplied, so EchoVector's own default
    # applies otherwise; this also avoids duplicating the constructor call.
    device_kwargs: dict[str, str] = {"device": device} if device else {}

    try:
        engine = EchoVector(
            store_dir=store_dir,
            backend=backend,
            chunk_seconds=chunk_seconds,
            overlap_seconds=overlap_seconds,
            **device_kwargs,
        )
        if reset:
            engine.reset()
        count = engine.index(files, recursive=recursive)
    except Exception as exc:
        # Exception text often contains file paths; escape it so square
        # brackets are printed literally instead of being parsed as Rich markup.
        console.print(f"[bold red]Indexing failed:[/bold red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc

    console.print(f"[bold green]Indexing complete.[/bold green] Indexed {count} chunk(s).")
    console.print(f"[dim]Store: {escape(str(store_dir))}[/dim]")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@app.command()
def search(
    query: Annotated[str, typer.Argument(help="Text query to search for.")],
    top_k: Annotated[
        int,
        typer.Option("--top-k", "-k", help="Number of top results to return."),
    ] = 5,
    store_dir: Annotated[
        Path,
        typer.Option("--store-dir", help="Directory containing the index."),
    ] = Path(".echovector"),
    backend: Annotated[
        str,
        typer.Option("--backend", help="Embedding backend used for the index."),
    ] = "clap",
    device: Annotated[
        str | None,
        typer.Option("--device", help="Model device, e.g. cpu or cuda."),
    ] = None,
) -> None:
    """Search an existing vector index without scanning audio files.

    Prints a table with one row per result (score, file, timestamp range), or
    a "No results found." notice when the engine returns nothing. Any exception
    raised while opening the engine or searching is printed and converted into
    exit code 1.
    """
    # Local import: keeps this command self-contained without touching the
    # module-level import block.
    from rich.markup import escape

    try:
        engine = _open_engine(store_dir=store_dir, backend=backend, device=device)
        results = engine.search(query, top_k=top_k)
    except Exception as exc:
        # Escape so brackets in the message are not parsed as Rich markup.
        console.print(f"[bold red]Search failed:[/bold red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc

    if not results:
        console.print("[yellow]No results found.[/yellow]")
        return

    # The query is free-form user text and file names may contain brackets;
    # both are escaped before Rich renders them.
    table = Table(title=f"Top {top_k} Results for '{escape(query)}'")
    table.add_column("Score", justify="right", style="cyan", no_wrap=True)
    table.add_column("File", style="magenta")
    table.add_column("Timestamp", justify="right", style="green")

    for result in results:
        span = result.timestamp_range
        table.add_row(
            f"{result.score:.4f}",
            escape(str(result.metadata.get("filename", result.filepath))),
            f"{span.start:.1f}s-{span.end:.1f}s",
        )

    console.print(table)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@app.command()
def stats(
    store_dir: Annotated[
        Path,
        typer.Option("--store-dir", help="Directory containing the index."),
    ] = Path(".echovector"),
    backend: Annotated[
        str,
        typer.Option("--backend", help="Embedding backend used for the index."),
    ] = "clap",
    device: Annotated[
        str | None,
        typer.Option("--device", help="Model device, e.g. cpu or cuda."),
    ] = None,
) -> None:
    """Display statistics about the current index.

    Prints a two-column table with one row per entry of the mapping returned
    by the engine's ``stats()``. Any exception raised while opening the engine
    or collecting statistics is printed and converted into exit code 1.
    """
    # Local import: keeps this command self-contained without touching the
    # module-level import block.
    from rich.markup import escape

    try:
        stats_data = _open_engine(store_dir=store_dir, backend=backend, device=device).stats()
    except Exception as exc:
        # Escape so brackets in the message are not parsed as Rich markup.
        console.print(f"[bold red]Stats failed:[/bold red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc

    table = Table(title="Index Statistics", show_header=False)
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="magenta")

    for key, value in stats_data.items():
        # Values include filesystem paths, which may legitimately contain
        # square brackets; print them literally.
        table.add_row(key, escape(str(value)))

    console.print(table)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
if __name__ == "__main__":
|
|
165
|
+
app()
|
echovector/core.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
"""Core public API for EchoVector."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from tempfile import TemporaryDirectory
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import numpy.typing as npt
|
|
10
|
+
import soundfile as sf
|
|
11
|
+
|
|
12
|
+
from echovector.audio.metadata import extract_metadata
|
|
13
|
+
from echovector.audio.processor import AudioProcessor as FileAudioProcessor
|
|
14
|
+
from echovector.embeddings.base import EmbeddingBackend
|
|
15
|
+
from echovector.embeddings.factory import get_embedding_model
|
|
16
|
+
from echovector.indexing.faiss_index import FaissIndex
|
|
17
|
+
from echovector.search.results import SearchResult, TimestampRange
|
|
18
|
+
|
|
19
|
+
AUDIO_EXTENSIONS = {".wav", ".mp3", ".flac", ".m4a", ".ogg", ".aiff", ".aif"}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class EchoVector:
|
|
23
|
+
"""High-level interface for indexing and searching audio files."""
|
|
24
|
+
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
store_dir: str | Path = ".echovector",
|
|
28
|
+
backend: str | EmbeddingBackend = "clap",
|
|
29
|
+
recursive: bool = True,
|
|
30
|
+
chunk_seconds: float = 10.0,
|
|
31
|
+
overlap_seconds: float = 1.0,
|
|
32
|
+
sample_rate: int = 48_000,
|
|
33
|
+
**backend_kwargs: Any,
|
|
34
|
+
) -> None:
|
|
35
|
+
"""Initialize EchoVector."""
|
|
36
|
+
if chunk_seconds <= 0.0:
|
|
37
|
+
raise ValueError("chunk_seconds must be positive.")
|
|
38
|
+
if overlap_seconds < 0.0:
|
|
39
|
+
raise ValueError("overlap_seconds cannot be negative.")
|
|
40
|
+
if overlap_seconds >= chunk_seconds:
|
|
41
|
+
raise ValueError("overlap_seconds must be smaller than chunk_seconds.")
|
|
42
|
+
|
|
43
|
+
self.store_dir = Path(store_dir)
|
|
44
|
+
self.store_dir.mkdir(parents=True, exist_ok=True)
|
|
45
|
+
self.index_path = self.store_dir / "index.faiss"
|
|
46
|
+
self.db_path = self.store_dir / "metadata.sqlite"
|
|
47
|
+
self.recursive = recursive
|
|
48
|
+
self.chunk_seconds = chunk_seconds
|
|
49
|
+
self.overlap_seconds = overlap_seconds
|
|
50
|
+
self.sample_rate = sample_rate
|
|
51
|
+
self.audio_processor = FileAudioProcessor(target_sample_rate=sample_rate, mono=True)
|
|
52
|
+
|
|
53
|
+
self.embedder = (
|
|
54
|
+
get_embedding_model(backend, **backend_kwargs) if isinstance(backend, str) else backend
|
|
55
|
+
)
|
|
56
|
+
self.index_backend = FaissIndex(
|
|
57
|
+
dimension=self.embedder.embedding_dim,
|
|
58
|
+
db_path=str(self.db_path),
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
if self.index_path.exists():
|
|
62
|
+
self.index_backend.load(str(self.index_path))
|
|
63
|
+
|
|
64
|
+
def index(
|
|
65
|
+
self,
|
|
66
|
+
targets: str | Path | Sequence[str | Path],
|
|
67
|
+
recursive: bool | None = None,
|
|
68
|
+
batch_size: int = 16,
|
|
69
|
+
force: bool = False,
|
|
70
|
+
) -> int:
|
|
71
|
+
"""Index audio chunks from paths or directories.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
targets: One or more file paths or directories to index.
|
|
75
|
+
recursive: Override the instance-level recursive setting.
|
|
76
|
+
batch_size: Number of chunks to embed per batch.
|
|
77
|
+
force: If True, remove and re-index files that are already stored.
|
|
78
|
+
If False (default), already-indexed files are skipped.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
Number of new chunks added to the index.
|
|
82
|
+
"""
|
|
83
|
+
files = self._resolve_audio_files(
|
|
84
|
+
targets,
|
|
85
|
+
self.recursive if recursive is None else recursive,
|
|
86
|
+
)
|
|
87
|
+
if not files:
|
|
88
|
+
return 0
|
|
89
|
+
|
|
90
|
+
if force:
|
|
91
|
+
for file_path in files:
|
|
92
|
+
int_ids = self.index_backend.store.delete_by_filepath(str(file_path))
|
|
93
|
+
if int_ids:
|
|
94
|
+
self.index_backend.remove_int_ids(int_ids)
|
|
95
|
+
else:
|
|
96
|
+
files = [f for f in files if not self.index_backend.store.has_filepath(str(f))]
|
|
97
|
+
if not files:
|
|
98
|
+
return 0
|
|
99
|
+
|
|
100
|
+
indexed_chunks = 0
|
|
101
|
+
with TemporaryDirectory(prefix="echovector-chunks-") as temp_dir:
|
|
102
|
+
chunk_paths: list[Path] = []
|
|
103
|
+
chunk_ids: list[str] = []
|
|
104
|
+
chunk_metadata: list[dict[str, Any]] = []
|
|
105
|
+
|
|
106
|
+
for file_path in files:
|
|
107
|
+
audio = self.audio_processor.load_audio(str(file_path))
|
|
108
|
+
for (
|
|
109
|
+
chunk_number,
|
|
110
|
+
start_seconds,
|
|
111
|
+
end_seconds,
|
|
112
|
+
chunk_audio,
|
|
113
|
+
) in self._iter_chunks(audio):
|
|
114
|
+
chunk_path = Path(temp_dir) / f"chunk-{indexed_chunks:08d}.wav"
|
|
115
|
+
sf.write(chunk_path, chunk_audio, self.sample_rate)
|
|
116
|
+
chunk_paths.append(chunk_path)
|
|
117
|
+
chunk_ids.append(f"{file_path}#{start_seconds:.3f}-{end_seconds:.3f}")
|
|
118
|
+
chunk_metadata.append(
|
|
119
|
+
self._metadata_for_chunk(
|
|
120
|
+
file_path,
|
|
121
|
+
chunk_number,
|
|
122
|
+
start_seconds,
|
|
123
|
+
end_seconds,
|
|
124
|
+
)
|
|
125
|
+
)
|
|
126
|
+
indexed_chunks += 1
|
|
127
|
+
|
|
128
|
+
if len(chunk_paths) >= batch_size:
|
|
129
|
+
self._add_chunk_batch(chunk_paths, chunk_ids, chunk_metadata)
|
|
130
|
+
chunk_paths = []
|
|
131
|
+
chunk_ids = []
|
|
132
|
+
chunk_metadata = []
|
|
133
|
+
|
|
134
|
+
if chunk_paths:
|
|
135
|
+
self._add_chunk_batch(chunk_paths, chunk_ids, chunk_metadata)
|
|
136
|
+
|
|
137
|
+
self.index_backend.save(str(self.index_path))
|
|
138
|
+
return indexed_chunks
|
|
139
|
+
|
|
140
|
+
def search(self, query: str, top_k: int = 5) -> list[SearchResult]:
|
|
141
|
+
"""Search indexed audio with a text query."""
|
|
142
|
+
if top_k < 0:
|
|
143
|
+
raise ValueError("top_k must be non-negative.")
|
|
144
|
+
if top_k == 0:
|
|
145
|
+
return []
|
|
146
|
+
|
|
147
|
+
query_embedding = self._normalize_rows(self.embedder.embed_text([query]))
|
|
148
|
+
distances, ids, metadata_rows = self.index_backend.search(query_embedding, k=top_k)
|
|
149
|
+
|
|
150
|
+
results: list[SearchResult] = []
|
|
151
|
+
for offset, string_id in enumerate(ids[0] if ids else []):
|
|
152
|
+
if string_id is None:
|
|
153
|
+
continue
|
|
154
|
+
metadata = metadata_rows[0][offset] if metadata_rows and metadata_rows[0] else {}
|
|
155
|
+
metadata = metadata or {}
|
|
156
|
+
results.append(
|
|
157
|
+
SearchResult(
|
|
158
|
+
filepath=str(metadata.get("filepath", string_id)),
|
|
159
|
+
timestamp_range=TimestampRange(
|
|
160
|
+
start=float(metadata.get("start", 0.0)),
|
|
161
|
+
end=float(metadata.get("end", metadata.get("duration", 0.0))),
|
|
162
|
+
),
|
|
163
|
+
score=float(distances[0][offset]),
|
|
164
|
+
metadata=metadata,
|
|
165
|
+
)
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
return results
|
|
169
|
+
|
|
170
|
+
def stats(self) -> dict[str, Any]:
|
|
171
|
+
"""Return basic index statistics."""
|
|
172
|
+
return {
|
|
173
|
+
"store_dir": str(self.store_dir),
|
|
174
|
+
"index_path": str(self.index_path),
|
|
175
|
+
"metadata_path": str(self.db_path),
|
|
176
|
+
"embedding_dim": self.embedder.embedding_dim,
|
|
177
|
+
"chunks": int(self.index_backend.index.ntotal),
|
|
178
|
+
"vectors": int(self.index_backend.index.ntotal),
|
|
179
|
+
"chunk_seconds": self.chunk_seconds,
|
|
180
|
+
"overlap_seconds": self.overlap_seconds,
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
def reset(self) -> None:
|
|
184
|
+
"""Clear the current on-disk and in-memory index."""
|
|
185
|
+
if self.index_path.exists():
|
|
186
|
+
self.index_path.unlink()
|
|
187
|
+
if self.db_path.exists():
|
|
188
|
+
self.index_backend.store.close()
|
|
189
|
+
self.db_path.unlink()
|
|
190
|
+
self.index_backend = FaissIndex(
|
|
191
|
+
dimension=self.embedder.embedding_dim,
|
|
192
|
+
db_path=str(self.db_path),
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
def _resolve_audio_files(
|
|
196
|
+
self,
|
|
197
|
+
targets: str | Path | Sequence[str | Path],
|
|
198
|
+
recursive: bool,
|
|
199
|
+
) -> list[Path]:
|
|
200
|
+
target_list = [targets] if isinstance(targets, (str, Path)) else targets
|
|
201
|
+
files: list[Path] = []
|
|
202
|
+
|
|
203
|
+
for target in target_list:
|
|
204
|
+
path = Path(target).expanduser()
|
|
205
|
+
if path.is_dir():
|
|
206
|
+
iterator = path.rglob("*") if recursive else path.glob("*")
|
|
207
|
+
files.extend(
|
|
208
|
+
candidate
|
|
209
|
+
for candidate in iterator
|
|
210
|
+
if candidate.is_file() and candidate.suffix.lower() in AUDIO_EXTENSIONS
|
|
211
|
+
)
|
|
212
|
+
elif path.is_file() and path.suffix.lower() in AUDIO_EXTENSIONS:
|
|
213
|
+
files.append(path)
|
|
214
|
+
elif not path.exists():
|
|
215
|
+
raise FileNotFoundError(f"Audio path not found: {path}")
|
|
216
|
+
|
|
217
|
+
return sorted(dict.fromkeys(files))
|
|
218
|
+
|
|
219
|
+
def _metadata_for_chunk(
|
|
220
|
+
self,
|
|
221
|
+
path: Path,
|
|
222
|
+
chunk_number: int,
|
|
223
|
+
start_seconds: float,
|
|
224
|
+
end_seconds: float,
|
|
225
|
+
) -> dict[str, Any]:
|
|
226
|
+
metadata = extract_metadata(str(path))
|
|
227
|
+
return {
|
|
228
|
+
"filepath": str(path),
|
|
229
|
+
"filename": path.name,
|
|
230
|
+
"chunk_id": chunk_number,
|
|
231
|
+
"start": start_seconds,
|
|
232
|
+
"end": end_seconds,
|
|
233
|
+
"chunk_duration": end_seconds - start_seconds,
|
|
234
|
+
"duration": metadata.duration,
|
|
235
|
+
"sample_rate": metadata.sample_rate,
|
|
236
|
+
"channels": metadata.channels,
|
|
237
|
+
"format": metadata.format,
|
|
238
|
+
"file_size": metadata.file_size,
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
def _iter_chunks(
|
|
242
|
+
self,
|
|
243
|
+
audio: npt.NDArray[np.float32],
|
|
244
|
+
) -> list[tuple[int, float, float, npt.NDArray[np.float32]]]:
|
|
245
|
+
chunk_samples = round(self.chunk_seconds * self.sample_rate)
|
|
246
|
+
overlap_samples = round(self.overlap_seconds * self.sample_rate)
|
|
247
|
+
step_samples = chunk_samples - overlap_samples
|
|
248
|
+
|
|
249
|
+
chunks: list[tuple[int, float, float, npt.NDArray[np.float32]]] = []
|
|
250
|
+
if len(audio) == 0:
|
|
251
|
+
return chunks
|
|
252
|
+
|
|
253
|
+
start_sample = 0
|
|
254
|
+
chunk_number = 0
|
|
255
|
+
while start_sample < len(audio):
|
|
256
|
+
end_sample = min(start_sample + chunk_samples, len(audio))
|
|
257
|
+
chunk = audio[start_sample:end_sample]
|
|
258
|
+
start_seconds = start_sample / self.sample_rate
|
|
259
|
+
end_seconds = end_sample / self.sample_rate
|
|
260
|
+
chunks.append((chunk_number, start_seconds, end_seconds, chunk))
|
|
261
|
+
|
|
262
|
+
if end_sample == len(audio):
|
|
263
|
+
break
|
|
264
|
+
start_sample += step_samples
|
|
265
|
+
chunk_number += 1
|
|
266
|
+
|
|
267
|
+
return chunks
|
|
268
|
+
|
|
269
|
+
def _add_chunk_batch(
|
|
270
|
+
self,
|
|
271
|
+
chunk_paths: Sequence[Path],
|
|
272
|
+
chunk_ids: Sequence[str],
|
|
273
|
+
chunk_metadata: Sequence[dict[str, Any]],
|
|
274
|
+
) -> None:
|
|
275
|
+
embeddings = self.embedder.embed_audio([str(path) for path in chunk_paths])
|
|
276
|
+
self.index_backend.add(
|
|
277
|
+
self._normalize_rows(embeddings),
|
|
278
|
+
list(chunk_ids),
|
|
279
|
+
list(chunk_metadata),
|
|
280
|
+
)
|
|
281
|
+
|
|
282
|
+
def _normalize_rows(
|
|
283
|
+
self,
|
|
284
|
+
embeddings: npt.NDArray[np.float32],
|
|
285
|
+
) -> npt.NDArray[np.float32]:
|
|
286
|
+
embeddings_f32 = np.asarray(embeddings, dtype=np.float32)
|
|
287
|
+
norms = np.linalg.norm(embeddings_f32, axis=1, keepdims=True)
|
|
288
|
+
norms[norms == 0.0] = 1.0
|
|
289
|
+
return cast("npt.NDArray[np.float32]", embeddings_f32 / norms)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""EchoVector embeddings module.
|
|
2
|
+
|
|
3
|
+
Contains backends for generating embeddings from audio and text.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from echovector.embeddings.base import EmbeddingBackend
|
|
7
|
+
from echovector.embeddings.cache import EmbeddingCache
|
|
8
|
+
from echovector.embeddings.factory import EmbeddingFactory, get_embedding_model
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"EmbeddingBackend",
|
|
12
|
+
"EmbeddingCache",
|
|
13
|
+
"EmbeddingFactory",
|
|
14
|
+
"get_embedding_model",
|
|
15
|
+
]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""AST (Audio Spectrogram Transformer) embedding backend stub."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import numpy.typing as npt
|
|
5
|
+
|
|
6
|
+
from echovector.embeddings.base import EmbeddingBackend
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ASTBackend(EmbeddingBackend):
    """Placeholder backend for the Audio Spectrogram Transformer (AST).

    Only the model name and a fixed embedding dimension are provided; both
    embedding methods raise ``NotImplementedError``.
    """

    def __init__(self, model_name: str = "MIT/ast-finetuned-audioset-10-10-0.4593") -> None:
        """Remember which model this stub stands in for.

        Args:
            model_name: Identifier of the AST model.
        """
        self.model_name = model_name

    @property
    def embedding_dim(self) -> int:
        """Return the fixed placeholder embedding size (768)."""
        return 768

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of audio files (not available in this stub).

        Args:
            audio_paths: Paths of the audio files to embed.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("AST audio embedding not implemented.")

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Embed a batch of text queries (not supported by this backend).

        Args:
            texts: Text strings to embed.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("AST backend does not support text embeddings.")
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Base protocols and types for EchoVector embedding backends."""
|
|
2
|
+
|
|
3
|
+
from typing import Protocol
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import numpy.typing as npt
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class EmbeddingBackend(Protocol):
    """Structural interface that every embedding model must satisfy."""

    @property
    def embedding_dim(self) -> int:
        """Size of each embedding vector produced by the backend.

        Returns:
            The number of components per embedding.
        """
        ...

    def embed_audio(self, audio_paths: list[str]) -> npt.NDArray[np.float32]:
        """Compute embeddings for a batch of audio files.

        Args:
            audio_paths: Paths of the audio files to embed.

        Returns:
            An array of shape (batch_size, embedding_dim).
        """
        ...

    def embed_text(self, texts: list[str]) -> npt.NDArray[np.float32]:
        """Compute embeddings for a batch of text queries.

        Backends without text support raise NotImplementedError.

        Args:
            texts: Text strings to embed.

        Returns:
            An array of shape (batch_size, embedding_dim).
        """
        ...
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Content-addressed caching for audio embeddings."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import cast
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import numpy.typing as npt
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EmbeddingCache:
|
|
12
|
+
"""Content-addressed storage for embeddings.
|
|
13
|
+
|
|
14
|
+
Embeddings are cached based on the hash of the file content or text.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
def __init__(self, cache_dir: str | Path = ".cache/echovector") -> None:
|
|
18
|
+
"""Initialize the embedding cache.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
cache_dir: Directory to store the cache files.
|
|
22
|
+
"""
|
|
23
|
+
self.cache_dir = Path(cache_dir)
|
|
24
|
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
25
|
+
|
|
26
|
+
def _compute_file_hash(self, filepath: str) -> str:
|
|
27
|
+
"""Compute SHA-256 hash of a file's contents."""
|
|
28
|
+
hasher = hashlib.sha256()
|
|
29
|
+
with open(filepath, "rb") as f:
|
|
30
|
+
while chunk := f.read(8192):
|
|
31
|
+
hasher.update(chunk)
|
|
32
|
+
return hasher.hexdigest()
|
|
33
|
+
|
|
34
|
+
def _compute_text_hash(self, text: str) -> str:
|
|
35
|
+
"""Compute SHA-256 hash of a text string."""
|
|
36
|
+
hasher = hashlib.sha256()
|
|
37
|
+
hasher.update(text.encode("utf-8"))
|
|
38
|
+
return hasher.hexdigest()
|
|
39
|
+
|
|
40
|
+
def _get_cache_path(self, hash_key: str) -> Path:
|
|
41
|
+
"""Get the file path for a cached embedding."""
|
|
42
|
+
return self.cache_dir / f"{hash_key}.npy"
|
|
43
|
+
|
|
44
|
+
def get_audio_embedding(self, filepath: str) -> npt.NDArray[np.float32] | None:
|
|
45
|
+
"""Retrieve cached embedding for an audio file.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
filepath: Path to the audio file.
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
The embedding array if cached, else None.
|
|
52
|
+
"""
|
|
53
|
+
hash_key = self._compute_file_hash(filepath)
|
|
54
|
+
cache_path = self._get_cache_path(hash_key)
|
|
55
|
+
|
|
56
|
+
if cache_path.exists():
|
|
57
|
+
return cast("npt.NDArray[np.float32]", np.load(cache_path))
|
|
58
|
+
return None
|
|
59
|
+
|
|
60
|
+
def put_audio_embedding(self, filepath: str, embedding: npt.NDArray[np.float32]) -> None:
|
|
61
|
+
"""Store embedding for an audio file in cache.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
filepath: Path to the audio file.
|
|
65
|
+
embedding: The embedding array.
|
|
66
|
+
"""
|
|
67
|
+
hash_key = self._compute_file_hash(filepath)
|
|
68
|
+
cache_path = self._get_cache_path(hash_key)
|
|
69
|
+
np.save(cache_path, embedding)
|
|
70
|
+
|
|
71
|
+
def get_text_embedding(self, text: str) -> npt.NDArray[np.float32] | None:
|
|
72
|
+
"""Retrieve cached embedding for a text string.
|
|
73
|
+
|
|
74
|
+
Args:
|
|
75
|
+
text: The text string.
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
The embedding array if cached, else None.
|
|
79
|
+
"""
|
|
80
|
+
hash_key = self._compute_text_hash(text)
|
|
81
|
+
cache_path = self._get_cache_path(hash_key)
|
|
82
|
+
|
|
83
|
+
if cache_path.exists():
|
|
84
|
+
return cast("npt.NDArray[np.float32]", np.load(cache_path))
|
|
85
|
+
return None
|
|
86
|
+
|
|
87
|
+
def put_text_embedding(self, text: str, embedding: npt.NDArray[np.float32]) -> None:
|
|
88
|
+
"""Store embedding for a text string in cache.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
text: The text string.
|
|
92
|
+
embedding: The embedding array.
|
|
93
|
+
"""
|
|
94
|
+
hash_key = self._compute_text_hash(text)
|
|
95
|
+
cache_path = self._get_cache_path(hash_key)
|
|
96
|
+
np.save(cache_path, embedding)
|