pytest-audioeval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ """AudioEval facade — main fixture interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+
7
+ from pytest_audioeval.samples.registry import SampleRegistry
8
+ from pytest_audioeval.stt import STTClient
9
+ from pytest_audioeval.tts import TTSClient
10
+
11
+
12
class AudioEval:
    """Facade exposed to tests: ``audioeval.stt`` / ``audioeval.tts`` / ``audioeval.samples``.

    A client attribute stays ``None`` when the matching service URL was not
    configured, so tests can skip on missing infrastructure.
    """

    __slots__ = ("samples", "stt", "tts")

    def __init__(self, *, stt_url: str | None = None, tts_url: str | None = None) -> None:
        self.samples = SampleRegistry()
        self.stt: STTClient | None = None
        self.tts: TTSClient | None = None
        if stt_url:
            self.stt = STTClient(url=stt_url)
        if tts_url:
            self.tts = TTSClient(url=tts_url)

    async def aclose(self) -> None:
        """Close whichever clients were created, tolerating RuntimeError from teardown."""
        for client in (self.stt, self.tts):
            if client is None:
                continue
            # e.g. "event loop is closed" during late session teardown
            with contextlib.suppress(RuntimeError):
                await client.aclose()
@@ -0,0 +1,28 @@
1
+ """Perceptual audio quality via PESQ."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Self
7
+
8
+ import numpy as np
9
+ from pesq import pesq as pesq_score
10
+
11
+
12
+ @dataclass(slots=True, frozen=True)
13
+ class AudioMetrics:
14
+ """PESQ MOS (1-5 scale)."""
15
+
16
+ mos: float
17
+ sample_rate: int
18
+
19
+ @classmethod
20
+ def compute(cls, reference: np.ndarray, hypothesis: np.ndarray, *, sample_rate: int = 16_000) -> Self:
21
+ """PESQ wideband comparison."""
22
+ score = pesq_score(sample_rate, reference, hypothesis, "wb")
23
+ return cls(mos=float(score), sample_rate=sample_rate)
24
+
25
+ def assert_quality(self, *, min_mos: float = 3.0) -> None:
26
+ """Raise AssertionError if MOS below threshold."""
27
+ if self.mos < min_mos:
28
+ raise AssertionError(f"PESQ MOS {self.mos:.2f} < {min_mos}")
@@ -0,0 +1,45 @@
1
+ """Word/character-level transcription quality metrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Self
7
+
8
+ from jiwer import cer, process_words, wer
9
+
10
+
11
+ @dataclass(slots=True, frozen=True)
12
+ class TextMetrics:
13
+ """WER/CER transcription quality. O(n) via jiwer."""
14
+
15
+ wer: float
16
+ cer: float
17
+ substitutions: int
18
+ insertions: int
19
+ deletions: int
20
+
21
+ @classmethod
22
+ def compute(cls, reference: str, hypothesis: str) -> Self:
23
+ """Compute all text metrics in a single pass."""
24
+ output = process_words(reference, hypothesis)
25
+ return cls(
26
+ wer=wer(reference, hypothesis),
27
+ cer=cer(reference, hypothesis),
28
+ substitutions=output.substitutions,
29
+ insertions=output.insertions,
30
+ deletions=output.deletions,
31
+ )
32
+
33
+ def assert_quality(self, *, max_wer: float = 0.2, max_cer: float = 0.15) -> None:
34
+ """Raise AssertionError with detailed breakdown on failure."""
35
+ violations: list[str] = []
36
+ if self.wer > max_wer:
37
+ violations.append(f"WER {self.wer:.3f} > {max_wer}")
38
+ if self.cer > max_cer:
39
+ violations.append(f"CER {self.cer:.3f} > {max_cer}")
40
+ if violations:
41
+ msg = (
42
+ f"Audio quality assertion failed: {', '.join(violations)} | "
43
+ f"subs={self.substitutions} ins={self.insertions} del={self.deletions}"
44
+ )
45
+ raise AssertionError(msg)
@@ -0,0 +1,41 @@
1
+ """pytest entry point — fixtures and hooks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import AsyncIterator
6
+ from typing import Any
7
+
8
+ import pytest
9
+
10
+ from pytest_audioeval.client import AudioEval
11
+
12
+
13
def pytest_addoption(parser: Any) -> None:
    """Register CLI options for audioeval.

    Adds the service URLs and quality thresholds to a dedicated
    ``audioeval`` option group so they show up together in ``pytest -h``.
    """
    group = parser.getgroup("audioeval", "Audio evaluation options")
    option_specs: tuple[tuple[str, dict[str, Any]], ...] = (
        ("--stt-url", {"default": None, "help": "STT service WebSocket URL"}),
        ("--tts-url", {"default": None, "help": "TTS service HTTP URL"}),
        ("--audioeval-wer", {"type": float, "default": 0.2, "help": "Max WER threshold"}),
        ("--audioeval-cer", {"type": float, "default": 0.15, "help": "Max CER threshold"}),
        ("--audioeval-mos", {"type": float, "default": 3.0, "help": "Min PESQ MOS threshold"}),
    )
    for flag, params in option_specs:
        group.addoption(flag, **params)
21
+
22
+
23
+ @pytest.fixture(scope="session")
24
+ async def audioeval(request: pytest.FixtureRequest) -> AsyncIterator[AudioEval]:
25
+ """Session-scoped AudioEval fixture with CLI-driven configuration."""
26
+ stt_url = request.config.getoption("--stt-url")
27
+ tts_url = request.config.getoption("--tts-url")
28
+
29
+ ae = AudioEval(stt_url=stt_url, tts_url=tts_url)
30
+ yield ae
31
+ await ae.aclose()
32
+
33
+
34
@pytest.fixture
def audioeval_thresholds(request: pytest.FixtureRequest) -> dict[str, float]:
    """Quality thresholds taken from the CLI, as keyword-ready dict entries."""
    read_option = request.config.getoption
    return {
        "max_wer": read_option("--audioeval-wer"),
        "max_cer": read_option("--audioeval-cer"),
        "min_mos": read_option("--audioeval-mos"),
    }
File without changes
@@ -0,0 +1 @@
1
+ One two three four five six seven eight nine ten.
@@ -0,0 +1 @@
1
+ Hello world.
@@ -0,0 +1 @@
1
+ The quick brown fox jumps over the lazy dog.
@@ -0,0 +1,114 @@
1
+ """Sample catalog and ground-truth registry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from enum import StrEnum, auto
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ import soundfile as sf
11
+
12
+
13
+ class SampleLang(StrEnum):
14
+ """Supported sample languages."""
15
+
16
+ EN = auto()
17
+ ES = auto()
18
+ FR = auto()
19
+ DE = auto()
20
+
21
+
22
+ @dataclass(slots=True, frozen=True)
23
+ class AudioSample:
24
+ """Ground-truth pair: audio + expected transcription."""
25
+
26
+ name: str
27
+ lang: SampleLang
28
+ reference_text: str
29
+ audio_path: Path
30
+ sample_rate: int = 16_000
31
+ duration_ms: int = 0
32
+
33
+ def audio_bytes(self) -> bytes:
34
+ """Raw file bytes."""
35
+ return self.audio_path.read_bytes()
36
+
37
+ def audio_numpy(self) -> np.ndarray:
38
+ """Load as float32 numpy array."""
39
+ data, _ = sf.read(self.audio_path, dtype="float32")
40
+ return data
41
+
42
+ def chunks(self, chunk_ms: int = 200) -> list[bytes]:
43
+ """Split audio into float32 PCM chunks for streaming. O(n/chunk_size)."""
44
+ data = self.audio_numpy()
45
+ samples_per_chunk = (self.sample_rate * chunk_ms) // 1000
46
+ return [data[i : i + samples_per_chunk].tobytes() for i in range(0, len(data), samples_per_chunk)]
47
+
48
+
49
# Root of the embedded fixtures shipped inside the package:
# samples/audio/{lang}/{name}.wav with a sibling {name}.txt transcript.
_SAMPLES_DIR = Path(__file__).resolve().parent / "audio"
50
+
51
+
52
class SampleRegistry:
    """Catalog of embedded audio fixtures. O(1) lookup by name.

    Samples are auto-discovered from ``{lang}/{name}.wav`` + ``{name}.txt``
    pairs under the packaged audio directory and keyed as ``{lang}_{name}``
    (e.g. ``en_hello_world``), enabling attribute-style access.
    """

    __slots__ = ("_catalog",)

    def __init__(self) -> None:
        self._catalog: dict[str, AudioSample] = {}
        self._discover()

    def _discover(self) -> None:
        """Auto-register samples from directory convention: {lang}/{name}.wav + .txt."""
        if not _SAMPLES_DIR.exists():
            return
        for lang_dir in sorted(_SAMPLES_DIR.iterdir()):
            if not lang_dir.is_dir():
                continue
            try:
                lang = SampleLang(lang_dir.name)
            except ValueError:
                # Unknown language directory — skip rather than fail discovery.
                continue
            for wav_path in sorted(lang_dir.glob("*.wav")):
                txt_path = wav_path.with_suffix(".txt")
                if not txt_path.exists():
                    # A .wav without ground truth cannot drive metrics — skip it.
                    continue
                name = wav_path.stem
                info = sf.info(wav_path)
                self._catalog[f"{lang}_{name}"] = AudioSample(
                    name=name,
                    lang=lang,
                    # Explicit encoding: transcripts are UTF-8 regardless of the
                    # host locale (read_text() otherwise uses the locale default).
                    reference_text=txt_path.read_text(encoding="utf-8").strip(),
                    audio_path=wav_path,
                    sample_rate=int(info.samplerate),
                    duration_ms=int(info.duration * 1000),
                )

    def __getattr__(self, name: str) -> AudioSample:
        """Attribute-style access: ``samples.en_hello_world``.

        Raises AttributeError with the list of available keys on a miss.
        """
        if name in self._catalog:
            return self._catalog[name]
        available = ", ".join(sorted(self._catalog))
        raise AttributeError(f"Sample '{name}' not found. Available: {available}")

    def all(self) -> list[AudioSample]:
        """All registered samples, in insertion (discovery) order."""
        return list(self._catalog.values())

    def by_lang(self, lang: SampleLang) -> list[AudioSample]:
        """Filter samples by language. O(n)."""
        return [s for s in self._catalog.values() if s.lang == lang]

    def register(self, sample: AudioSample) -> None:
        """Register a custom project-specific sample under ``{lang}_{name}``."""
        self._catalog[f"{sample.lang}_{sample.name}"] = sample

    def __len__(self) -> int:
        return len(self._catalog)

    def __contains__(self, name: str) -> bool:
        return name in self._catalog

    def __repr__(self) -> str:
        return f"SampleRegistry({len(self._catalog)} samples)"
@@ -0,0 +1,135 @@
1
+ """STT evaluation client — httpx + httpx-ws + httpx-sse under the hood."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import time
7
+ from collections.abc import AsyncIterator
8
+ from contextlib import asynccontextmanager
9
+ from dataclasses import dataclass, field
10
+ from typing import Any, Self
11
+
12
+ import httpx
13
+ from httpx_sse import EventSource
14
+ from httpx_ws import AsyncWebSocketClient, AsyncWebSocketSession
15
+
16
+ from pytest_audioeval.metrics.text import TextMetrics
17
+ from pytest_audioeval.samples.registry import AudioSample
18
+
19
+
20
+ @dataclass(slots=True)
21
+ class STTResult:
22
+ """STT evaluation result with optional metrics."""
23
+
24
+ hypothesis_text: str = ""
25
+ text_metrics: TextMetrics | None = None
26
+ latency_ms: float = 0.0
27
+ chunks_received: int = 0
28
+ fragments: list[str] = field(default_factory=list)
29
+
30
+ def assert_quality(self, *, max_wer: float = 0.2, max_cer: float = 0.15) -> Self:
31
+ """Assert STT quality. Chainable."""
32
+ if self.text_metrics is None:
33
+ raise AssertionError("No text metrics — call compute_metrics() first or provide a sample")
34
+ self.text_metrics.assert_quality(max_wer=max_wer, max_cer=max_cer)
35
+ return self
36
+
37
+ def compute_metrics(self, reference: str) -> Self:
38
+ """Compute WER/CER against reference. Chainable."""
39
+ self.text_metrics = TextMetrics.compute(reference, self.hypothesis_text)
40
+ return self
41
+
42
+
43
class STTSession:
    """One live WebSocket exchange with an STT service.

    Wraps the raw session, timestamps the run, and accumulates received
    text frames into an STTResult.
    """

    __slots__ = ("_result", "_sample", "_session", "_t0")

    def __init__(self, *, session: AsyncWebSocketSession, sample: AudioSample | None) -> None:
        self._session = session
        self._sample = sample
        self._t0 = time.perf_counter()
        self._result = STTResult()

    async def send_bytes(self, data: bytes) -> None:
        """Forward binary audio to the server."""
        await self._session.send_bytes(data)

    async def send_text(self, data: str) -> None:
        """Forward a text frame (JSON config, END_OF_AUDIO marker, ...)."""
        await self._session.send_text(data)

    async def send_sample(self, sample: AudioSample, *, chunk_ms: int = 200) -> None:
        """Stream *sample* chunk by chunk, pacing sends like a live microphone."""
        pause = chunk_ms / 1000
        for piece in sample.chunks(chunk_ms):
            await self._session.send_bytes(piece)
            await asyncio.sleep(pause)

    async def receive_text(self, *, timeout: float | None = None) -> str:
        """Read one text frame and record it as a transcription fragment."""
        fragment = await self._session.receive_text(timeout=timeout)
        accumulated = self._result
        accumulated.fragments.append(fragment)
        accumulated.chunks_received += 1
        return fragment

    async def receive_bytes(self, *, timeout: float | None = None) -> bytes:
        """Read one binary frame as-is."""
        return await self._session.receive_bytes(timeout=timeout)

    def result(self) -> STTResult:
        """Finalize latency, join fragments, and return the STTResult.

        When a ground-truth sample was attached and any text arrived,
        WER/CER metrics are computed automatically.
        """
        outcome = self._result
        outcome.latency_ms = (time.perf_counter() - self._t0) * 1000
        outcome.hypothesis_text = " ".join(outcome.fragments)
        if self._sample and outcome.hypothesis_text:
            outcome.compute_metrics(self._sample.reference_text)
        return outcome
86
+
87
+
88
class STTClient:
    """STT evaluation client — HTTP batch plus WebSocket streaming.

    Each call builds its own short-lived httpx client, so the object holds
    no connection state between calls.
    """

    __slots__ = ("_timeout", "_url")

    def __init__(self, *, url: str, timeout: float = 30.0) -> None:
        self._url = url
        self._timeout = timeout

    async def post(self, *, data: bytes | None = None, **kwargs: Any) -> httpx.Response:
        """One-shot POST of audio bytes (e.g. OpenAI Whisper API); returns the raw httpx.Response."""
        async with httpx.AsyncClient(timeout=self._timeout) as client:
            resp = await client.post(self._url, content=data, **kwargs)
            resp.raise_for_status()
            return resp

    @asynccontextmanager
    async def stream(self, *, data: bytes | None = None, **kwargs: Any) -> AsyncIterator[httpx.Response]:
        """Chunked streaming POST; the yielded response supports aiter_bytes/aiter_lines."""
        async with httpx.AsyncClient(timeout=self._timeout) as client:
            async with client.stream("POST", self._url, content=data, **kwargs) as resp:
                resp.raise_for_status()
                yield resp

    @asynccontextmanager
    async def sse(self, *, data: bytes | None = None, **kwargs: Any) -> AsyncIterator[EventSource]:
        """SSE streaming POST; yields an EventSource for aiter_sse()."""
        headers = kwargs.pop("headers", {})
        headers["Accept"] = "text/event-stream"
        async with httpx.AsyncClient(timeout=self._timeout) as client:
            async with client.stream("POST", self._url, content=data, headers=headers, **kwargs) as resp:
                resp.raise_for_status()
                yield EventSource(resp)

    @asynccontextmanager
    async def ws(self, *, sample: AudioSample | None = None, **kwargs: Any) -> AsyncIterator[STTSession]:
        """Open a WebSocket STT session (e.g. WhisperLive), wrapped in STTSession."""
        async with httpx.AsyncClient() as client:
            socket_client = AsyncWebSocketClient(client, keepalive_ping_interval_seconds=None)
            async with socket_client.connect(self._url, **kwargs) as session:
                yield STTSession(session=session, sample=sample)

    async def aclose(self) -> None:
        """No-op — each call builds and disposes its own httpx client."""
@@ -0,0 +1,61 @@
1
+ """TTS evaluation client — httpx + httpx-sse + httpx-ws under the hood."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import AsyncIterator
6
+ from contextlib import asynccontextmanager
7
+ from typing import Any
8
+
9
+ import httpx
10
+ from httpx_sse import EventSource
11
+ from httpx_ws import AsyncWebSocketClient, AsyncWebSocketSession
12
+
13
+
14
class TTSClient:
    """TTS evaluation client — HTTP batch, chunked streaming, SSE, and WebSocket.

    Each call builds its own short-lived httpx client, so the object holds
    no connection state between calls.
    """

    __slots__ = ("_timeout", "_url")

    def __init__(self, *, url: str, timeout: float = 30.0) -> None:
        self._url = url
        self._timeout = timeout

    async def post(self, *, json: dict[str, Any] | None = None, **kwargs: Any) -> httpx.Response:
        """One-shot synthesis POST; returns the raw httpx.Response."""
        async with httpx.AsyncClient(timeout=self._timeout) as client:
            resp = await client.post(self._url, json=json, **kwargs)
            resp.raise_for_status()
            return resp

    @asynccontextmanager
    async def stream(self, *, json: dict[str, Any] | None = None, **kwargs: Any) -> AsyncIterator[httpx.Response]:
        """Chunked streaming POST; the yielded response supports aiter_bytes/aiter_lines."""
        async with httpx.AsyncClient(timeout=self._timeout) as client:
            async with client.stream("POST", self._url, json=json, **kwargs) as resp:
                resp.raise_for_status()
                yield resp

    @asynccontextmanager
    async def sse(self, *, json: dict[str, Any] | None = None, **kwargs: Any) -> AsyncIterator[EventSource]:
        """SSE streaming POST; yields an EventSource for aiter_sse()."""
        headers = kwargs.pop("headers", {})
        headers["Accept"] = "text/event-stream"
        async with httpx.AsyncClient(timeout=self._timeout) as client:
            async with client.stream("POST", self._url, json=json, headers=headers, **kwargs) as resp:
                resp.raise_for_status()
                yield EventSource(resp)

    @asynccontextmanager
    async def ws(self, **kwargs: Any) -> AsyncIterator[AsyncWebSocketSession]:
        """Open a raw WebSocket session for WebSocket-based TTS servers."""
        async with httpx.AsyncClient() as client:
            socket_client = AsyncWebSocketClient(client, keepalive_ping_interval_seconds=None)
            async with socket_client.connect(self._url, **kwargs) as session:
                yield session

    async def aclose(self) -> None:
        """No-op — each call builds and disposes its own httpx client."""
@@ -0,0 +1,282 @@
1
+ Metadata-Version: 2.4
2
+ Name: pytest-audioeval
3
+ Version: 0.1.0
4
+ Summary: Pytest plugin for STT/TTS integration testing with httpx, metrics, and embedded audio samples.
5
+ Project-URL: Homepage, https://damvolkov.github.io/pytest-audioeval
6
+ Project-URL: Documentation, https://damvolkov.github.io/pytest-audioeval
7
+ Project-URL: Repository, https://github.com/damvolkov/pytest-audioeval
8
+ Project-URL: Changelog, https://damvolkov.github.io/pytest-audioeval/changelog/
9
+ Author-email: damvolkov <damvolkovv@gmail.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: audio,evaluation,metrics,pesq,pytest,speech,stt,testing,tts,wer
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Framework :: Pytest
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.13
23
+ Requires-Dist: httpx-sse>=0.4.3
24
+ Requires-Dist: httpx-ws>=0.8.2
25
+ Requires-Dist: httpx>=0.27
26
+ Requires-Dist: jiwer>=3.0
27
+ Requires-Dist: numpy>=2.0
28
+ Requires-Dist: pesq>=0.0.4
29
+ Requires-Dist: pytest>=8.0
30
+ Requires-Dist: soundfile>=0.12
31
+ Description-Content-Type: text/markdown
32
+
33
+ # pytest-audioeval
34
+
35
+ Pytest plugin for STT/TTS integration testing. Built on the httpx ecosystem (`httpx`, `httpx-ws`, `httpx-sse`) with built-in metrics, embedded ground-truth audio samples, and chainable assertions.
36
+
37
+ ## Features
38
+
39
+ - **STT via WebSocket** — `audioeval.stt.ws()` streams audio, collects transcription
40
+ - **TTS via HTTP** — `audioeval.tts.post()` batch, `.stream()` chunked, `.sse()` Server-Sent Events
41
+ - **Text metrics** — WER, CER, substitutions, insertions, deletions (via `jiwer`)
42
+ - **Audio metrics** — PESQ MOS 1–5 scale (via `pesq`)
43
+ - **Embedded samples** — ground-truth audio + reference text pairs, multi-language ready
44
+ - **Chainable assertions** — `result.compute_metrics(ref).assert_quality(max_wer=0.2)`
45
+ - **CLI thresholds** — `--audioeval-wer`, `--audioeval-cer`, `--audioeval-mos`
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ uv add pytest-audioeval
51
+ ```
52
+
53
+ ## Quick Start
54
+
55
+ ### STT — WebSocket
56
+
57
+ ```python
58
+ import asyncio
59
+ import uuid
60
+ import orjson as json
61
+ from pytest_audioeval.client import AudioEval
62
+
63
+
64
+ async def test_user_stt_ws(audioeval: AudioEval) -> None:
65
+ sample = audioeval.samples.en_hello_world
66
+
67
+ async with audioeval.stt.ws(sample=sample) as session:
68
+ config = json.dumps(
69
+ {"uid": str(uuid.uuid4()), "language": "en", "task": "transcribe",
70
+ "model": "large-v3-turbo", "use_vad": True}
71
+ ).decode()
72
+ await session.send_text(config)
73
+
74
+ ready = await session.receive_text()
75
+ assert "SERVER_READY" in ready
76
+
77
+ await session.send_sample(sample, chunk_ms=200)
78
+ await asyncio.sleep(2)
79
+ await session.send_text("END_OF_AUDIO")
80
+
81
+ # Collect transcription segments...
82
+ ```
83
+
84
+ ### TTS — Batch POST
85
+
86
+ ```python
87
+ import io
88
+ import soundfile as sf
89
+ from pytest_audioeval.client import AudioEval
90
+
91
+
92
+ async def test_user_tts_batch(audioeval: AudioEval) -> None:
93
+ response = await audioeval.tts.post(
94
+ json={"input": "Hello world.", "model": "kokoro",
95
+ "voice": "af_heart", "response_format": "wav", "stream": False},
96
+ )
97
+ data, rate = sf.read(io.BytesIO(response.content), dtype="float32")
98
+ assert rate == 24_000
99
+ assert len(data) > 0
100
+ ```
101
+
102
+ ### TTS — Chunked Streaming
103
+
104
+ ```python
105
+ async def test_user_tts_streaming(audioeval: AudioEval) -> None:
106
+ chunks = []
107
+ async with audioeval.tts.stream(json={"input": "Hello.", ...}) as response:
108
+ async for chunk in response.aiter_bytes():
109
+ chunks.append(chunk)
110
+ assert len(chunks) > 0
111
+ ```
112
+
113
+ ### TTS — Server-Sent Events
114
+
115
+ ```python
116
+ async def test_user_tts_sse(audioeval: AudioEval) -> None:
117
+ async with audioeval.tts.sse(json={"input": "Hello.", ...}) as event_source:
118
+ async for sse in event_source.aiter_sse():
119
+ print(sse.data)
120
+ ```
121
+
122
+ ### Text Metrics
123
+
124
+ ```python
125
+ from pytest_audioeval.metrics.text import TextMetrics
126
+
127
+
128
+ async def test_user_metrics_text() -> None:
129
+ metrics = TextMetrics.compute(
130
+ reference="the quick brown fox jumps over the lazy dog",
131
+ hypothesis="the quick brown fox jumps over the lazy dock",
132
+ )
133
+ assert metrics.wer < 0.15
134
+ assert metrics.substitutions == 1
135
+ ```
136
+
137
+ ### STT Result — Chainable Assertions
138
+
139
+ ```python
140
+ from pytest_audioeval.stt import STTResult
141
+
142
+
143
+ async def test_user_stt_result() -> None:
144
+ result = STTResult(hypothesis_text="Hello world.")
145
+ result.compute_metrics("Hello world.")
146
+ result.assert_quality(max_wer=0.2, max_cer=0.15)
147
+ ```
148
+
149
+ ### Sample Registry
150
+
151
+ ```python
152
+ from pytest_audioeval.samples.registry import SampleLang
153
+
154
+
155
+ async def test_user_samples_browse(audioeval: AudioEval) -> None:
156
+ # All samples
157
+ assert len(audioeval.samples) >= 3
158
+
159
+ # Filter by language
160
+ en_samples = audioeval.samples.by_lang(SampleLang.EN)
161
+
162
+ # Attribute access: {lang}_{name}
163
+ sample = audioeval.samples.en_hello_world
164
+ assert sample.reference_text == "Hello world."
165
+
166
+ # Audio access
167
+ audio_f32 = sample.audio_numpy() # numpy float32 array
168
+ audio_raw = sample.audio_bytes() # raw bytes
169
+ chunks = sample.chunks(chunk_ms=200) # chunked for streaming
170
+ ```
171
+
172
+ ### CLI Thresholds
173
+
174
+ ```python
175
+ async def test_user_thresholds(audioeval_thresholds: dict[str, float]) -> None:
176
+ assert audioeval_thresholds["max_wer"] == 0.2
177
+ assert audioeval_thresholds["max_cer"] == 0.15
178
+ assert audioeval_thresholds["min_mos"] == 3.0
179
+ ```
180
+
181
+ ## CLI Options
182
+
183
+ ```bash
184
+ pytest --stt-url=ws://localhost:45120 --tts-url=http://localhost:45130/v1/audio/speech
185
+ pytest --audioeval-wer=0.15 --audioeval-cer=0.10 --audioeval-mos=3.5
186
+ ```
187
+
188
+ | Option | Default | Description |
189
+ |---|---|---|
190
+ | `--stt-url` | `None` | STT service WebSocket URL |
191
+ | `--tts-url` | `None` | TTS service HTTP URL |
192
+ | `--audioeval-wer` | `0.2` | Max WER threshold |
193
+ | `--audioeval-cer` | `0.15` | Max CER threshold |
194
+ | `--audioeval-mos` | `3.0` | Min PESQ MOS threshold |
195
+
196
+ ## Fixtures
197
+
198
+ | Fixture | Scope | Type | Description |
199
+ |---|---|---|---|
200
+ | `audioeval` | session | `AudioEval` | Main facade — `audioeval.stt`, `audioeval.tts`, `audioeval.samples` |
201
+ | `audioeval_thresholds` | function | `dict[str, float]` | CLI-driven threshold dict |
202
+
203
+ ## Architecture
204
+
205
+ ```
206
+ src/pytest_audioeval/
207
+ ├── plugin.py # pytest entry point (fixtures, CLI options)
208
+ ├── client.py # AudioEval facade
209
+ ├── stt.py # STTClient (httpx-ws), STTSession, STTResult
210
+ ├── tts.py # TTSClient (httpx + httpx-sse)
211
+ ├── metrics/
212
+ │ ├── text.py # TextMetrics — WER, CER via jiwer
213
+ │ └── audio.py # AudioMetrics — PESQ MOS via pesq
214
+ └── samples/
215
+ ├── registry.py # SampleRegistry + AudioSample + SampleLang
216
+ └── audio/en/ # Embedded ground-truth WAV + TXT pairs
217
+ ```
218
+
219
+ ### Clients
220
+
221
+ | Client | Transport | Methods |
222
+ |---|---|---|
223
+ | `STTClient` | `httpx-ws` | `.ws()` — WebSocket context manager yielding `STTSession` |
224
+ | `TTSClient` | `httpx` + `httpx-sse` | `.post()` batch, `.stream()` chunked, `.sse()` SSE |
225
+
226
+ ### Metrics
227
+
228
+ | Metric | Class | Source | Range |
229
+ |---|---|---|---|
230
+ | Word Error Rate (WER) | `TextMetrics` | `jiwer` | 0.0 – 1.0+ |
231
+ | Character Error Rate (CER) | `TextMetrics` | `jiwer` | 0.0 – 1.0+ |
232
+ | Substitutions / Insertions / Deletions | `TextMetrics` | `jiwer` | 0 – N |
233
+ | PESQ MOS | `AudioMetrics` | `pesq` | 1.0 – 5.0 |
234
+
235
+ ### Samples
236
+
237
+ Embedded ground-truth audio with reference transcriptions:
238
+
239
+ ```
240
+ samples/audio/
241
+ └── en/ # English (16kHz, float32)
242
+ ├── hello_world.wav # "Hello world."
243
+ ├── quick_brown_fox.wav
244
+     └── counting.wav     # "One two three four five six seven eight nine ten."
245
+ ```
246
+
247
+ Access: `audioeval.samples.en_hello_world`, `audioeval.samples.en_counting`, etc.
248
+
249
+ ## Infrastructure
250
+
251
+ Integration tests require GPU-accelerated TTS/STT services:
252
+
253
+ ```bash
254
+ make infra-up # Start TTS (Kokoro) + STT (WhisperLive)
255
+ make infra-status # Check health
256
+ make infra-logs # View logs
257
+ make infra-down # Stop services
258
+ ```
259
+
260
+ | Service | Image | Port | Protocol |
261
+ |---|---|---|---|
262
+ | TTS (Kokoro) | `ghcr.io/remsky/kokoro-fastapi-gpu` | `45130` | HTTP |
263
+ | STT (WhisperLive) | `ghcr.io/collabora/whisperlive-gpu` | `45120` | WebSocket |
264
+
265
+ ## Development
266
+
267
+ ```bash
268
+ make install # uv sync --dev
269
+ make lint # ruff check + format
270
+ make test-unit # unit tests (no services)
271
+ make test-integration # integration tests (requires services)
272
+ make coverage # coverage report (>90%)
273
+ ```
274
+
275
+ ## Requirements
276
+
277
+ - Python >= 3.13
278
+ - NVIDIA GPU + Docker with nvidia-container-toolkit (for integration tests)
279
+
280
+ ## License
281
+
282
+ MIT
@@ -0,0 +1,19 @@
1
+ pytest_audioeval/client.py,sha256=q2-Wc4t-Y973mMKAjcx4i63LoKm51ItrPxm2AbgKBas,939
2
+ pytest_audioeval/plugin.py,sha256=GQsmvWjgJGkVDVg63WJpXIg3bzEy_VAzy3SH6UBYT5Y,1545
3
+ pytest_audioeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ pytest_audioeval/stt.py,sha256=swdlUwfk6nbxULzpp9Ln5J1FDxma3rjSxV4xeFZp_Qk,5484
5
+ pytest_audioeval/tts.py,sha256=YWETOLXsAjFSV0DOZ5XjD5btelLUF7Ze5nmz7WYeyy0,2531
6
+ pytest_audioeval/metrics/audio.py,sha256=4XWYXd6YRoAiYnlFDjTFdmg6vtLmhaG56fMYcC1x2cg,842
7
+ pytest_audioeval/metrics/text.py,sha256=Koisq7M8D4k-AUA_OiCj9TG0cb6EawWL8OJbHsNMD8w,1489
8
+ pytest_audioeval/samples/registry.py,sha256=j_L8sVONaY_syx_8KX9iqcNCNkkLA1hcKxX3EqBAJmM,3690
9
+ pytest_audioeval/samples/audio/en/counting.txt,sha256=Drz5VehysxuVt0jTmqDmGI-3mBIXNEIZPy8VXdLrCww,49
10
+ pytest_audioeval/samples/audio/en/counting.wav,sha256=ReH23MoTEk5-2Wg7vq6PQJ-dPNjQBN5nR9nQL8GvEoc,191056
11
+ pytest_audioeval/samples/audio/en/hello_world.txt,sha256=qj7BbmrMgJ2LKBhmInYlar_S8bRBy1FXSTPz1L0RXRE,12
12
+ pytest_audioeval/samples/audio/en/hello_world.wav,sha256=1podm-hax7z8VE-nIRW2dQZgGSvh83F9DfJTLjQJ_8U,73256
13
+ pytest_audioeval/samples/audio/en/quick_brown_fox.txt,sha256=71N_JciVv6eCUmUpqbY9l6pjFWTV14nCt2VEjIY1-2w,44
14
+ pytest_audioeval/samples/audio/en/quick_brown_fox.wav,sha256=x8yH2Da4BggWU04pxsXWpDrmLu5pmy8dFVmLhmra_tY,184952
15
+ pytest_audioeval-0.1.0.dist-info/METADATA,sha256=qnbfw-erpOwHqz3esGgx5umMGuNJc9PYbR0ThBuHDPg,8922
16
+ pytest_audioeval-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
17
+ pytest_audioeval-0.1.0.dist-info/entry_points.txt,sha256=yDBwGzawDne6_1gZ6zS6Mv6jUVm8IHdI084rWB2nlcY,47
18
+ pytest_audioeval-0.1.0.dist-info/licenses/LICENSE,sha256=4K5VRrESrMWeDdFrydaJe4ZtvcVCfF3FSMdxK6893cE,1070
19
+ pytest_audioeval-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [pytest11]
2
+ audioeval = pytest_audioeval.plugin
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Damien Volkov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.