echo-vector 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,288 @@
1
+ Metadata-Version: 2.4
2
+ Name: echo_vector
3
+ Version: 0.1.1
4
+ Summary: Semantic text search over audio files without full transcription
5
+ Project-URL: Homepage, https://github.com/ahron-maslin/echo_vector
6
+ Project-URL: Documentation, https://github.com/ahron-maslin/echo_vector#readme
7
+ Project-URL: Repository, https://github.com/ahron-maslin/echo_vector
8
+ Project-URL: Issues, https://github.com/ahron-maslin/echo_vector/issues
9
+ Project-URL: Changelog, https://github.com/ahron-maslin/echo_vector/blob/main/CHANGELOG.md
10
+ Author: EchoVector Contributors
11
+ License-Expression: MIT
12
+ Keywords: CLAP,FAISS,audio,embeddings,search,semantic,vector
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Analysis
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Text Processing :: Indexing
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.12
26
+ Requires-Dist: faiss-cpu<2,>=1.7
27
+ Requires-Dist: librosa<1,>=0.10
28
+ Requires-Dist: numpy<3,>=1.26
29
+ Requires-Dist: pydantic<3,>=2.5
30
+ Requires-Dist: pydub<1,>=0.25
31
+ Requires-Dist: rich<14,>=13.7
32
+ Requires-Dist: soundfile<1,>=0.12
33
+ Requires-Dist: tqdm<5,>=4.66
34
+ Requires-Dist: typer[all]<1,>=0.12
35
+ Provides-Extra: all
36
+ Requires-Dist: fastapi<1,>=0.109; extra == 'all'
37
+ Requires-Dist: httpx2<3,>=2.0; extra == 'all'
38
+ Requires-Dist: hypothesis<7,>=6.92; extra == 'all'
39
+ Requires-Dist: mkdocs-gen-files<1,>=0.5; extra == 'all'
40
+ Requires-Dist: mkdocs-literate-nav<1,>=0.6; extra == 'all'
41
+ Requires-Dist: mkdocs-material<10,>=9.5; extra == 'all'
42
+ Requires-Dist: mkdocstrings[python]<1,>=0.24; extra == 'all'
43
+ Requires-Dist: mutmut<3,>=2.4; extra == 'all'
44
+ Requires-Dist: mypy<2,>=1.8; extra == 'all'
45
+ Requires-Dist: pre-commit<4,>=3.6; extra == 'all'
46
+ Requires-Dist: pytest-asyncio<1,>=0.23; extra == 'all'
47
+ Requires-Dist: pytest-cov<6,>=5.0; extra == 'all'
48
+ Requires-Dist: pytest-xdist<4,>=3.5; extra == 'all'
49
+ Requires-Dist: pytest<9,>=8.0; extra == 'all'
50
+ Requires-Dist: ruff<1,>=0.15; extra == 'all'
51
+ Requires-Dist: torch<3,>=2.1; extra == 'all'
52
+ Requires-Dist: transformers<5,>=4.36; extra == 'all'
53
+ Requires-Dist: uvicorn[standard]<1,>=0.27; extra == 'all'
54
+ Provides-Extra: api
55
+ Requires-Dist: fastapi<1,>=0.109; extra == 'api'
56
+ Requires-Dist: uvicorn[standard]<1,>=0.27; extra == 'api'
57
+ Provides-Extra: clap
58
+ Requires-Dist: torch<3,>=2.1; extra == 'clap'
59
+ Requires-Dist: transformers<5,>=4.36; extra == 'clap'
60
+ Provides-Extra: dev
61
+ Requires-Dist: httpx2<3,>=2.0; extra == 'dev'
62
+ Requires-Dist: hypothesis<7,>=6.92; extra == 'dev'
63
+ Requires-Dist: mutmut<3,>=2.4; extra == 'dev'
64
+ Requires-Dist: mypy<2,>=1.8; extra == 'dev'
65
+ Requires-Dist: pre-commit<4,>=3.6; extra == 'dev'
66
+ Requires-Dist: pytest-asyncio<1,>=0.23; extra == 'dev'
67
+ Requires-Dist: pytest-cov<6,>=5.0; extra == 'dev'
68
+ Requires-Dist: pytest-xdist<4,>=3.5; extra == 'dev'
69
+ Requires-Dist: pytest<9,>=8.0; extra == 'dev'
70
+ Requires-Dist: ruff<1,>=0.15; extra == 'dev'
71
+ Provides-Extra: docs
72
+ Requires-Dist: mkdocs-gen-files<1,>=0.5; extra == 'docs'
73
+ Requires-Dist: mkdocs-literate-nav<1,>=0.6; extra == 'docs'
74
+ Requires-Dist: mkdocs-material<10,>=9.5; extra == 'docs'
75
+ Requires-Dist: mkdocstrings[python]<1,>=0.24; extra == 'docs'
76
+ Description-Content-Type: text/markdown
77
+
78
+ # 🔊 EchoVector
79
+
80
+ > **Semantic text search over audio files — without full transcription.**
81
+
82
+ [![CI](https://github.com/ahron-maslin/echo_vector/actions/workflows/test.yml/badge.svg)](https://github.com/ahron-maslin/echo_vector/actions/workflows/test.yml)
83
+ [![Coverage](https://img.shields.io/badge/coverage-%3E95%25-brightgreen)](.)
84
+ [![Python](https://img.shields.io/badge/python-3.12%2B-blue)](.)
85
+ [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
86
+
87
+ ---
88
+
89
+ ## What is EchoVector?
90
+
91
+ EchoVector indexes audio files by generating **semantic embeddings directly from audio waveforms**, then lets you search them with natural language text queries — all without transcribing a single word.
92
+
93
+ ### Traditional approach (slow & expensive)
94
+
95
+ ```
96
+ Audio → Full Transcription → Text Embeddings → Text Search
97
+ ```
98
+
99
+ ### EchoVector approach (fast & efficient)
100
+
101
+ ```
102
+ Audio → Audio Chunks → Audio Embeddings ─┐
103
+ ├─► ANN Search → Results
104
+ Text Query → Text Embedding ──────────────┘
105
+ ```
106
+
107
+ ## Features
108
+
109
+ - 🎵 **Multi-format support** — MP3, WAV, FLAC, M4A
110
+ - 🧠 **Direct audio embeddings** — No transcription needed
111
+ - 🔍 **Semantic search** — Query with natural language
112
+ - ⚡ **FAISS-powered** — Approximate nearest neighbor search
113
+ - 🔌 **Pluggable backends** — CLAP, Whisper, wav2vec2, HuBERT, AST
114
+ - 🧪 **Offline smoke backend** — `local` backend for CI/Kaggle tests without model downloads
115
+ - 📊 **Rich CLI** — Progress bars, colors, benchmarking mode
116
+ - 🌐 **REST API** — Optional FastAPI server
117
+ - 📦 **Production-ready** — Typed, tested, documented
118
+
119
+ ## Quick Start
120
+
121
+ ### Installation
122
+
123
+ ```bash
124
+ pip install echo-vector
125
+ ```
126
+
127
+ Or with uv:
128
+
129
+ ```bash
130
+ uv add echo-vector
131
+ ```
132
+
133
+ ### CLI Usage
134
+
135
+ ```bash
136
+ # One-time indexing: split audio into timestamped chunks and embed each chunk
137
+ echovector index ./meetings
138
+
139
+ # Fast repeated search: embed only the text query and search the saved FAISS index
140
+ echovector search "discussion about transformers"
141
+
142
+ # Search with options
143
+ echovector search "pricing strategy" --top-k 10
144
+
145
+ # View index statistics
146
+ echovector stats
147
+ ```
148
+
149
+ For a no-download smoke test, use the deterministic local backend:
150
+
151
+ ```bash
152
+ echovector index ./meetings --backend local --store-dir ./ev-index
153
+ echovector search "high alarm tone" --backend local --store-dir ./ev-index
154
+ echovector stats --backend local --store-dir ./ev-index
155
+ ```
156
+
157
+ The search command does not reopen or scan the audio files. All expensive audio processing happens
158
+ during `index`; `search` loads the saved vector index, embeds the short text query, and returns the
159
+ nearest timestamped chunks.
160
+
161
+ ### Python API
162
+
163
+ ```python
164
+ from echovector import EchoVector
165
+
166
+ ev = EchoVector()
167
+
168
+ # Index audio files
169
+ ev.index("./meetings")
170
+
171
+ # Search with natural language
172
+ results = ev.search("conversation about CUDA kernels")
173
+
174
+ for r in results:
175
+ print(
176
+ f"{r.filepath} "
177
+ f"[{r.timestamp_range.start:.1f}s - {r.timestamp_range.end:.1f}s] "
178
+ f"score={r.score:.4f}"
179
+ )
180
+ ```
181
+
182
+ ## Testing on Kaggle
183
+
184
+ Kaggle is useful for GPU-backed CLAP tests, but first check the runtime Python version:
185
+
186
+ ```python
187
+ import sys
188
+ print(sys.version)
189
+ ```
190
+
191
+ EchoVector currently declares `Python >=3.12`. If the Kaggle image is older, install and test in a
192
+ Python 3.12-capable environment instead, or relax the project requirement only after validating the
193
+ test suite on that Python version.
194
+
195
+ ### Notebook smoke test without internet/model downloads
196
+
197
+ Upload this repository as a Kaggle dataset, attach it to a notebook, then run:
198
+
199
+ ```python
200
+ %cd /kaggle/input/<your-echo-vector-dataset>
201
+ !pip install -e . --no-deps
202
+ !pip install numpy soundfile librosa faiss-cpu typer rich pydantic
203
+ !python -m pytest tests/ -q
204
+ ```
205
+
206
+ Create a tiny audio corpus and test the real CLI/index path:
207
+
208
+ ```python
209
+ import os
210
+ import numpy as np
211
+ import soundfile as sf
212
+
213
+ audio_dir = "/kaggle/working/ev-audio"
214
+ index_dir = "/kaggle/working/ev-index"
215
+ os.makedirs(audio_dir, exist_ok=True)
216
+
217
+ sr = 16000
218
+ t = np.linspace(0, 1.0, sr, endpoint=False)
219
+ sf.write(f"{audio_dir}/high_tone.wav", 0.25 * np.sin(2 * np.pi * 880 * t), sr)
220
+ sf.write(f"{audio_dir}/low_tone.wav", 0.25 * np.sin(2 * np.pi * 110 * t), sr)
221
+ ```
222
+
223
+ ```python
224
+ !echovector index /kaggle/working/ev-audio --backend local --store-dir /kaggle/working/ev-index --reset
225
+ !echovector search "high alarm tone" --backend local --store-dir /kaggle/working/ev-index --top-k 2
226
+ !echovector stats --backend local --store-dir /kaggle/working/ev-index
227
+ ```
228
+
229
+ This validates packaging, audio loading, FAISS persistence, metadata storage, and the CLI without
230
+ depending on Hugging Face downloads.
231
+
232
+ ### CLAP semantic test
233
+
234
+ For actual semantic text-to-audio search, enable internet in the notebook settings and use a GPU
235
+ runtime if available:
236
+
237
+ ```python
238
+ !pip install transformers torch faiss-cpu librosa soundfile
239
+ !echovector index /kaggle/input/<audio-dataset> --backend clap --device cuda --store-dir /kaggle/working/clap-index --recursive --reset
240
+ !echovector search "people discussing pricing strategy" --backend clap --device cuda --store-dir /kaggle/working/clap-index --top-k 10
241
+ ```
242
+
243
+ If GPU is unavailable, replace `--device cuda` with `--device cpu`; it will be slower. Keep indexes
244
+ under `/kaggle/working` so they are writable during the notebook session.
245
+
246
+ ## Architecture
247
+
248
+ ```
249
+ echovector/
250
+ ├── audio/ # Audio loading, chunking, streaming, metadata
251
+ ├── embeddings/ # Pluggable embedding backends (CLAP, Whisper, etc.)
252
+ ├── indexing/ # Vector index backends (FAISS, with pluggable design)
253
+ ├── search/ # Search engine, filtering, result hydration
254
+ ├── cli/ # Typer-based CLI with Rich output
255
+ ├── api/ # Optional FastAPI server
256
+ ├── evaluation/ # Metrics (recall@k, throughput)
257
+ ├── benchmarks/ # Reproducible benchmark harness
258
+ └── utils/ # Config, logging, helpers
259
+ ```
260
+
261
+ ## Supported Embedding Backends
262
+
263
+ | Backend | Text+Audio Aligned | Notes |
264
+ |---------|-------------------|-------|
265
+ | **CLAP** (default) | ✅ | Best for text→audio search |
266
+ | Whisper Encoder | ❌ | Audio-only embeddings |
267
+ | wav2vec2 | ❌ | Audio-only, good for speech |
268
+ | HuBERT | ❌ | Audio-only, self-supervised |
269
+ | Audio Spectrogram Transformer | ❌ | Audio-only, classification-focused |
270
+
271
+ ## Development
272
+
273
+ ```bash
274
+ # Clone and install
275
+ git clone https://github.com/ahron-maslin/echo_vector.git
276
+ cd echo_vector
277
+ uv sync --all-extras
278
+
279
+ # Run checks
280
+ make lint
281
+ make typecheck
282
+ make test
283
+ make coverage
284
+ ```
285
+
286
+ ## License
287
+
288
+ MIT
@@ -0,0 +1,38 @@
1
+ echovector/__init__.py,sha256=bNOH--O4ZZ2JGLk2WBmuR-LDJwxrHH9NFPVJGmNlUys,154
2
+ echovector/core.py,sha256=WgTis_6d1SNbmT8iE26SZjk2xeT5U_Ju6_iOh53czDc,10857
3
+ echovector/api/__init__.py,sha256=UXSa3IbGYl8Ld7hU8YHtIDB3RWr-Jr47xerJpd-JKGI,33
4
+ echovector/api/server.py,sha256=fU4780Am7V9ZoNpuOmWKaObEudHm98e4QTG6koR8rKc,4059
5
+ echovector/audio/__init__.py,sha256=FLz5_kYf5l1VpWIFXZcKuDV6BDft-bxugyNSqEaAEm0,300
6
+ echovector/audio/chunker.py,sha256=oA2XUkhBoQBsdIIFJmX9WkNUMMeTtHaCX-P9jlFCKk8,2340
7
+ echovector/audio/metadata.py,sha256=YWP6A7HS_mK0oC8cXoF5uiQ-3NQcFNgE1mN38X_osU4,1474
8
+ echovector/audio/processor.py,sha256=rxNz72dc3WvM-U8TyCNC3wUGDWZZOEDiP-SgoP3_peE,1589
9
+ echovector/audio/streaming.py,sha256=sm6-XGdVn4NrgviDdpqOulQVd1ZgYdJxw-eug1MjSaE,893
10
+ echovector/cli/__init__.py,sha256=I_dYESPy1z4et6onBmUZUf9QGk-mWz34codFENej_0U,33
11
+ echovector/cli/main.py,sha256=TTJNzPgBO86YdgIMhmLe8FOuGoQeQysqXZUT5bfY9b0,5302
12
+ echovector/embeddings/__init__.py,sha256=HJ3yRohOHxLYiVI1SePdiPIfY4Pxd-Fum6rbm4_WbSw,407
13
+ echovector/embeddings/ast_model.py,sha256=0ZulFEx3jschoBft2cJn1UX3ZYUcINn03guqMINbekg,1277
14
+ echovector/embeddings/base.py,sha256=3zYaxqamBvj2BEPwQWMm2czcqT-jocKFLjkRvVsTWp8,1080
15
+ echovector/embeddings/cache.py,sha256=3yMxEQhQsHELyj7ZYdeT3Kwm4f0EuVRbt5WLvicOeCM,3120
16
+ echovector/embeddings/clap.py,sha256=h7i1hyzuOHegLq0bHT2MnNHPkiNKholGHRP4UbaPbcA,4311
17
+ echovector/embeddings/factory.py,sha256=j0CWYZq4NVYU8igdxOKL2pKkE_3FBQFVXtnYCmnXPxs,2683
18
+ echovector/embeddings/hubert.py,sha256=Imnmw6m9ilfRPAFCQVP8Ff_vdS-uHJ4Q6633PQr6Sag,1253
19
+ echovector/embeddings/local.py,sha256=jBDrWXMTyTCEDZVmFeNd8C4AoBL2mv0H3m_hk-EhpwE,4277
20
+ echovector/embeddings/wav2vec2.py,sha256=g8JGc6ji3_MDxTIUgQkT565OeZMkb09Ly72lyPsKEZk,1263
21
+ echovector/embeddings/whisper_enc.py,sha256=weFspd31TzRE83tEnnioESas-e0t33i112DiCrfBdhQ,1333
22
+ echovector/evaluation/__init__.py,sha256=3vsJte5hUM4D9lwLouVyqLHKuZ6Xlqwyzoi7UAXw8FE,40
23
+ echovector/evaluation/metrics.py,sha256=jXNWoko4T3TZIgZKBiL5eoVrjEWfPGcT15fAepM4IfQ,1227
24
+ echovector/indexing/__init__.py,sha256=x8D0AvoJFce5_LU-Evdvi01JGmgr8bvemu3ZJRpw-4g,227
25
+ echovector/indexing/base.py,sha256=MXHVxxr2T0o2GWo4jO_8ZNnSRMyJoi9LrEL7EiDp3lY,2880
26
+ echovector/indexing/faiss_index.py,sha256=hWvuKwD4oLEFOknp_OxZWAUI_cpVNJNeIamVcRmbLhM,6615
27
+ echovector/indexing/store.py,sha256=C3zVDSjOGZNIu49e_XXVHFzDCQpWf51VpIHbz1TXP_w,5357
28
+ echovector/search/__init__.py,sha256=qy3p-vIXtIbzw8ratKf434IUJFOLtxPLjT1DsAwaDIE,360
29
+ echovector/search/engine.py,sha256=48IjaLjUTWoD11zI-MCJl9K_yoccxeJrP4qdkqeUZbU,2319
30
+ echovector/search/filters.py,sha256=AHUbq9OzyTnQa7Ci2bjzzpeNhWYtNqBN-7ypmPk5Hq4,1712
31
+ echovector/search/results.py,sha256=3qDgF_VDXTlxbfa0RWCQz_iTEhTG7CCMkBBnlDQmheE,1063
32
+ echovector/utils/__init__.py,sha256=78OxBC7wC9q5JALB4u38dJTU73xbqlThp_33E5LidzU,188
33
+ echovector/utils/config.py,sha256=CN2i1p0k_hbq2XA2mLGwgEFS2vfSI5XShT9CmtBeu24,2002
34
+ echovector/utils/logging.py,sha256=XdnP3hL9FKm0lZCG4k_Kz8kgLwxpNvbGxCFJdyusRrI,778
35
+ echo_vector-0.1.1.dist-info/METADATA,sha256=wM-6TNcq5clyP85Je2qN3mvNZcCXwl_bplJgCeD5xfI,10058
36
+ echo_vector-0.1.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
37
+ echo_vector-0.1.1.dist-info/entry_points.txt,sha256=5ckspm1n74MtWsT9mfNvLymeDG9JMPVVsh21DMN7B-g,55
38
+ echo_vector-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ echovector = echovector.cli.main:app
echovector/__init__.py ADDED
@@ -0,0 +1,7 @@
1
"""EchoVector: Audio vector embedding and processing library."""

from echovector.core import EchoVector

# Must match the distribution version declared in the package metadata
# (this wheel is published as 0.1.1; the previous value "0.1.0" was stale).
__version__ = "0.1.1"

__all__ = ["EchoVector"]
@@ -0,0 +1 @@
1
+ """API module for EchoVector."""
@@ -0,0 +1,144 @@
1
+ """FastAPI server for EchoVector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from fastapi import Depends, FastAPI
8
+ from pydantic import BaseModel, Field
9
+
10
+ if TYPE_CHECKING:
11
+ from echovector.core import EchoVector
12
+
13
# ASGI application object; the route decorators in this module register on it.
app = FastAPI(
    version="0.1.0",
    title="EchoVector API",
    description="Semantic search over audio files.",
)

# Engine shared by all requests. It stays None until configure_engine() is
# called; tests can bypass it through app.dependency_overrides instead.
_default_engine: EchoVector | None = None
21
+
22
+
23
def get_engine() -> EchoVector:
    """Provide the engine that request handlers should use.

    Returns:
        The engine previously stored in the module-level ``_default_engine``.

    Raises:
        RuntimeError: If no engine has been configured yet.
    """
    engine = _default_engine
    if engine is not None:
        return engine
    raise RuntimeError(
        "No EchoVector engine configured. Call configure_engine() before starting the server."
    )
30
+
31
+
32
def configure_engine(engine: EchoVector) -> None:
    """Install ``engine`` as the module-wide default engine.

    Args:
        engine: The engine that ``get_engine()`` will return from now on.
    """
    # Rebinds the module global, so the declaration must precede the assignment.
    global _default_engine
    _default_engine = engine
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Request / response models
40
+ # ---------------------------------------------------------------------------
41
+
42
+
43
class IndexRequest(BaseModel):
    """Paths to index."""

    # Required: no default is supplied, so the request body must include it.
    paths: list[str] = Field(description="Audio file or directory paths to index.")
    # Optional flag, off unless the client sets it.
    force: bool = Field(default=False, description="Re-index files that are already stored.")
48
+
49
+
50
class IndexResponse(BaseModel):
    """Result of an index operation."""

    # Value returned by the engine's index() call for the request.
    chunks_added: int
    # Number of requested paths the /index endpoint counted as not indexed.
    files_skipped: int
55
+
56
+
57
class SearchRequest(BaseModel):
    """Text query for audio search."""

    # Required: no default is supplied, so the request body must include it.
    query: str = Field(description="Natural language query.")
    # Defaults to 5; values below 1 are rejected by the ge=1 constraint.
    top_k: int = Field(default=5, ge=1, description="Maximum results to return.")
62
+
63
+
64
class SearchResultItem(BaseModel):
    """Single search result."""

    # Path of the audio file the matching chunk came from.
    filepath: str
    # Start and end of the matching chunk, copied from the result's
    # timestamp_range (presumably seconds; confirm against the search results type).
    start: float
    end: float
    # Similarity score reported by the engine for this chunk.
    score: float
    # Extra per-result data; a fresh empty dict when none is provided.
    metadata: dict[str, Any] = Field(default_factory=dict)
72
+
73
+
74
class SearchResponse(BaseModel):
    """Search results."""

    # One item per result returned by the engine, in the engine's order.
    results: list[SearchResultItem]
78
+
79
+
80
class StatsResponse(BaseModel):
    """Index statistics."""

    # Each field mirrors the same-named key of the mapping returned by the
    # engine's stats() call, coerced to the annotated type by the /stats endpoint.
    chunks: int
    embedding_dim: int
    store_dir: str
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # Endpoints
90
+ # ---------------------------------------------------------------------------
91
+
92
+
93
@app.post("/index", response_model=IndexResponse)
def index_audio(
    request: IndexRequest,
    engine: EchoVector = Depends(get_engine),
) -> IndexResponse:
    """Index audio files or directories."""
    # Args:
    #     request: Paths to index plus the ``force`` re-index flag.
    #     engine: Engine injected through the ``get_engine`` dependency.
    # Returns:
    #     The number of chunks the engine reports as added and a skipped count.
    #
    # The earlier version took two extra engine.stats() snapshots to compute a
    # files_skipped value that was overwritten on the very next statement; that
    # dead computation is removed and the response values are unchanged.
    chunks_added = engine.index(request.paths, force=request.force)

    # engine.index() returns a single total, so per-path outcomes are not
    # available here. Heuristic: if anything was added, count exactly one path
    # as indexed and report the rest as skipped.
    paths_counted_as_indexed = 1 if chunks_added > 0 else 0
    files_skipped = max(0, len(request.paths) - paths_counted_as_indexed)
    return IndexResponse(chunks_added=chunks_added, files_skipped=files_skipped)
106
+
107
+
108
@app.post("/search", response_model=SearchResponse)
def search_audio(
    request: SearchRequest,
    engine: EchoVector = Depends(get_engine),
) -> SearchResponse:
    """Search indexed audio with a text query."""
    # Convert each engine result into the flat response item shape, keeping
    # the order in which the engine returned them.
    items: list[SearchResultItem] = []
    for hit in engine.search(request.query, top_k=request.top_k):
        items.append(
            SearchResultItem(
                filepath=hit.filepath,
                start=hit.timestamp_range.start,
                end=hit.timestamp_range.end,
                score=hit.score,
                # Falsy metadata (None or empty) is normalised to an empty dict.
                metadata=hit.metadata or {},
            )
        )
    return SearchResponse(results=items)
127
+
128
+
129
@app.get("/stats", response_model=StatsResponse)
def get_stats(engine: EchoVector = Depends(get_engine)) -> StatsResponse:
    """Return index statistics."""
    # Read the three reported values and coerce each to the response field type.
    stats = engine.stats()
    chunk_count = int(stats["chunks"])
    dimension = int(stats["embedding_dim"])
    location = str(stats["store_dir"])
    return StatsResponse(chunks=chunk_count, embedding_dim=dimension, store_dir=location)
138
+
139
+
140
@app.post("/reset")
def reset_index(engine: EchoVector = Depends(get_engine)) -> dict[str, str]:
    """Clear the index."""
    # Delegate to the engine, then acknowledge with a fixed status payload.
    engine.reset()
    acknowledgement = {"status": "ok"}
    return acknowledgement
@@ -0,0 +1,12 @@
1
+ from .chunker import SilenceAwareChunker
2
+ from .metadata import AudioMetadata, extract_metadata
3
+ from .processor import AudioProcessor
4
+ from .streaming import AudioStreamer
5
+
6
+ __all__ = [
7
+ "AudioMetadata",
8
+ "AudioProcessor",
9
+ "AudioStreamer",
10
+ "SilenceAwareChunker",
11
+ "extract_metadata",
12
+ ]
@@ -0,0 +1,71 @@
1
+ import numpy as np
2
+ import numpy.typing as npt
3
+
4
+
5
class SilenceAwareChunker:
    """Chunks audio based on silence.

    A sample counts as silent when its magnitude is at or below a threshold
    placed ``top_db`` decibels under the loudest sample of the signal. Runs of
    consecutive non-silent samples become intervals, and each interval is cut
    into pieces of at most ``max_chunk_length`` seconds.

    NOTE(review): the threshold is applied per sample, so any single sample
    under it, including a zero crossing of an otherwise loud waveform, ends an
    interval. Confirm this granularity is intended.
    """

    def __init__(
        self,
        top_db: float = 60.0,
        min_chunk_length: float = 1.0,
        max_chunk_length: float = 10.0,
        sample_rate: int = 16000,
    ) -> None:
        """Initialize the chunker.

        Args:
            top_db: The threshold (in decibels) below reference to consider as silence.
            min_chunk_length: Minimum length of a chunk in seconds.
            max_chunk_length: Maximum length of a chunk in seconds.
            sample_rate: The sample rate of the audio.

        Raises:
            ValueError: If ``max_chunk_length * sample_rate`` is less than one sample.
        """
        self.top_db = top_db
        self.min_chunk_length = min_chunk_length
        self.max_chunk_length = max_chunk_length
        self.sample_rate = sample_rate
        # Fail fast: a configuration that cannot hold one sample per chunk
        # would otherwise make chunk() loop forever.
        self._max_samples()

    def _max_samples(self) -> int:
        """Return the maximum chunk size in samples, rejecting sizes below one."""
        max_samples = int(self.max_chunk_length * self.sample_rate)
        if max_samples < 1:
            raise ValueError(
                "max_chunk_length * sample_rate must be at least one sample, got "
                f"max_chunk_length={self.max_chunk_length!r} and "
                f"sample_rate={self.sample_rate!r}"
            )
        return max_samples

    def chunk(self, audio: npt.NDArray[np.float32]) -> list[npt.NDArray[np.float32]]:
        """Split audio into chunks based on silence.

        Args:
            audio: The audio signal to chunk.

        Returns:
            A list of audio chunks. Empty when the input is empty or has no
            sample above the silence threshold.

        Raises:
            ValueError: If the attributes were changed so that
                ``max_chunk_length * sample_rate`` is less than one sample.
        """
        # Re-validated here because the attributes are public and may have been
        # changed since construction; the splitting loop below only terminates
        # when max_samples >= 1.
        max_samples = self._max_samples()
        min_samples = int(self.min_chunk_length * self.sample_rate)

        if len(audio) == 0:
            return []

        max_amplitude = float(np.max(np.abs(audio)))
        if max_amplitude == 0.0:
            return []

        # Convert the dB offset into a linear amplitude relative to the peak.
        threshold = max_amplitude * (10.0 ** (-self.top_db / 20.0))
        non_silent = np.flatnonzero(np.abs(audio) > threshold)
        if len(non_silent) == 0:
            return []

        # A jump of more than one between neighbouring indices marks the start
        # of a new run of non-silent samples.
        breaks = np.where(np.diff(non_silent) > 1)[0] + 1
        intervals = [
            (int(group[0]), int(group[-1]) + 1)
            for group in np.split(non_silent, breaks)
            if len(group) > 0
        ]

        chunks: list[npt.NDArray[np.float32]] = []
        for start, end in intervals:
            interval_audio: npt.NDArray[np.float32] = audio[start:end]

            # Peel off full-size pieces; these are kept regardless of min_samples.
            while len(interval_audio) > max_samples:
                chunks.append(interval_audio[:max_samples])
                interval_audio = interval_audio[max_samples:]

            # The remainder is kept when it is long enough, or when nothing has
            # been kept yet, so audible input always yields at least one chunk.
            keep_short_first = len(interval_audio) > 0 and len(chunks) == 0
            if len(interval_audio) >= min_samples or keep_short_first:
                chunks.append(interval_audio)

        return chunks
@@ -0,0 +1,58 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+
4
+ import librosa
5
+ import soundfile as sf
6
+
7
+
8
@dataclass
class AudioMetadata:
    """Descriptive properties of a single audio file.

    Attributes:
        duration: Length of the audio as reported by the probing library
            (presumably seconds; confirm against soundfile/librosa).
        sample_rate: Sample rate of the file as stored.
        channels: Number of audio channels.
        format: Container format name, or the file extension when the
            fallback probe was used.
        file_size: Size of the file as returned by ``os.path.getsize``.
        file_path: The path the metadata was read from.
    """

    duration: float
    sample_rate: int
    channels: int
    format: str
    file_size: int
    file_path: str
18
+
19
+
20
def extract_metadata(file_path: str) -> AudioMetadata:
    """Read basic properties of an audio file.

    soundfile's header probe is tried first. If it raises for any reason the
    same values are collected through librosa instead, which includes loading
    the file to count its channels.

    Args:
        file_path: Path to the audio file.

    Returns:
        An AudioMetadata instance describing the file.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    size_on_disk = os.path.getsize(file_path)

    try:
        header = sf.info(file_path)
        length, rate, channel_count, container = (
            float(header.duration),
            int(header.samplerate),
            int(header.channels),
            str(header.format),
        )
    except Exception:
        # Fallback to librosa
        length = float(librosa.get_duration(path=file_path))
        rate = int(librosa.get_samplerate(file_path))
        samples, _ = librosa.load(file_path, sr=None, mono=False)
        # A one-dimensional result means a single channel; otherwise the first
        # axis is taken as the channel count.
        channel_count = 1 if samples.ndim <= 1 else int(samples.shape[0])
        _, extension = os.path.splitext(file_path)
        container = str(extension.lstrip("."))

    return AudioMetadata(
        duration=length,
        sample_rate=rate,
        channels=channel_count,
        format=container,
        file_size=size_on_disk,
        file_path=file_path,
    )
@@ -0,0 +1,53 @@
1
+ import os
2
+
3
+ import librosa
4
+ import numpy as np
5
+ import numpy.typing as npt
6
+ import soundfile as sf
7
+
8
+
9
class AudioProcessor:
    """Processes audio files for vectorization."""

    def __init__(self, target_sample_rate: int = 16000, mono: bool = True) -> None:
        """Initialize the AudioProcessor.

        Args:
            target_sample_rate: The sample rate to convert audio to.
            mono: Whether to convert audio to mono.
        """
        self.target_sample_rate = target_sample_rate
        self.mono = mono

    def load_audio(self, file_path: str) -> npt.NDArray[np.float32]:
        """Load an audio file into a numpy array.

        The file is read with soundfile first; if libsndfile cannot read it,
        librosa is used instead. The samples are then optionally averaged down
        to mono and resampled to ``target_sample_rate``.

        Args:
            file_path: Path to the audio file (mp3/wav/flac/m4a).

        Returns:
            The loaded audio as a float32 numpy array, with time along axis 0.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Audio file not found: {file_path}")

        try:
            audio, sample_rate = sf.read(file_path, dtype="float32", always_2d=False)
        except sf.LibsndfileError:
            audio, sample_rate = librosa.load(file_path, sr=None, mono=False)
            if audio.ndim > 1:
                # librosa returns multi-channel audio as (channels, samples),
                # whereas soundfile returns (frames, channels). Transpose so the
                # channel average and the resample below use the same axes for
                # both loaders; without this the fallback averaged over time.
                audio = audio.T

        if self.mono and audio.ndim > 1:
            # Average across the channel axis to get one value per frame.
            audio = np.mean(audio, axis=1)

        if sample_rate != self.target_sample_rate:
            audio = librosa.resample(
                y=audio,
                orig_sr=int(sample_rate),
                target_sr=self.target_sample_rate,
                axis=0,
            )

        return np.asarray(audio, dtype=np.float32)