PyPI - mate-workload-stt - Versions diffs - 0.1.0__tar.gz - Mend

mate-workload-stt 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

mate_workload_stt-0.1.0/.gitignore +30 -0
mate_workload_stt-0.1.0/PKG-INFO +50 -0
mate_workload_stt-0.1.0/README.md +33 -0
mate_workload_stt-0.1.0/pyproject.toml +35 -0
mate_workload_stt-0.1.0/src/mate_workload_stt/__init__.py +123 -0
mate_workload_stt-0.1.0/src/mate_workload_stt/_download.py +56 -0
mate_workload_stt-0.1.0/src/mate_workload_stt/_measure.py +109 -0
mate_workload_stt-0.1.0/src/mate_workload_stt/_profiles.py +47 -0
mate_workload_stt-0.1.0/src/mate_workload_stt/py.typed +0 -0
mate_workload_stt-0.1.0/tests/fixtures.py +23 -0
mate_workload_stt-0.1.0/tests/test_measure.py +86 -0

mate_workload_stt-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,30 @@
+# Python
+.venv/
+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+.pytest_cache/
+# uv
+uv.lock
+# Node / Cloudflare Worker
+worker/node_modules/
+worker/.wrangler/
+worker/dist/
+# Results (local benchmark output)
+results/
+# Secrets / local config
+.env
+*.env.local
+# OS
+.DS_Store
+Thumbs.db
+# Internal planning notes
+BENCH_NOTES.md

mate_workload_stt-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,50 @@
+Metadata-Version: 2.4
+Name: mate-workload-stt
+Version: 0.1.0
+Summary: Speech-to-text workload plugin for mate-bench
+Project-URL: Homepage, https://github.com/T0nd3/mate-bench
+Project-URL: Repository, https://github.com/T0nd3/mate-bench
+Author-email: Benjamin Fäuster <benjamin.faeuster@web.de>
+License: MIT
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: System :: Benchmark
+Requires-Python: >=3.11
+Requires-Dist: mate-bench<0.2,>=0.1
+Description-Content-Type: text/markdown
+# mate-workload-stt
+Speech-to-text workload plugin for [mate-bench](https://github.com/T0nd3/mate-bench).
+Benchmarks STT transcription speed and accuracy using LibriSpeech test-clean audio clips
+with [faster-whisper](https://github.com/SYSTRAN/faster-whisper).
+## Metrics
+| Metric | Description |
+|--------|-------------|
+| `rtf` | Real-Time Factor — `processing_time / audio_duration` (lower is better) |
+| `wer` | Word Error Rate — edit distance / total reference words (lower is better) |
+| `total_audio_seconds` | Total audio processed per run |
+## Profiles
+| Profile | Clips | Audio | Model |
+|---------|-------|-------|-------|
+| `quick` | 5 | ~50 s | whisper-large-v3 |
+| `standard` | 20 | ~200 s | whisper-large-v3 |
+## Usage
+```bash
+mate-bench run stt --profile quick
+mate-bench run stt --profile standard
+```
+## Test data
+[LibriSpeech test-clean](https://openslr.org/12) (Panayotov et al., 2015).
+Licensed [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/).

mate_workload_stt-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,33 @@
+# mate-workload-stt
+Speech-to-text workload plugin for [mate-bench](https://github.com/T0nd3/mate-bench).
+Benchmarks STT transcription speed and accuracy using LibriSpeech test-clean audio clips
+with [faster-whisper](https://github.com/SYSTRAN/faster-whisper).
+## Metrics
+| Metric | Description |
+|--------|-------------|
+| `rtf` | Real-Time Factor — `processing_time / audio_duration` (lower is better) |
+| `wer` | Word Error Rate — edit distance / total reference words (lower is better) |
+| `total_audio_seconds` | Total audio processed per run |
+## Profiles
+| Profile | Clips | Audio | Model |
+|---------|-------|-------|-------|
+| `quick` | 5 | ~50 s | whisper-large-v3 |
+| `standard` | 20 | ~200 s | whisper-large-v3 |
+## Usage
+```bash
+mate-bench run stt --profile quick
+mate-bench run stt --profile standard
+```
+## Test data
+[LibriSpeech test-clean](https://openslr.org/12) (Panayotov et al., 2015).
+Licensed [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/).

mate_workload_stt-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,35 @@
+[project]
+name = "mate-workload-stt"
+version = "0.1.0"
+description = "Speech-to-text workload plugin for mate-bench"
+readme = "README.md"
+requires-python = ">=3.11"
+license = {text = "MIT"}
+authors = [{name = "Benjamin Fäuster", email = "benjamin.faeuster@web.de"}]
+classifiers = [
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: System :: Benchmark",
+]
+dependencies = [
+    "mate-bench>=0.1,<0.2",
+]
+[project.urls]
+Homepage = "https://github.com/T0nd3/mate-bench"
+Repository = "https://github.com/T0nd3/mate-bench"
+[project.entry-points."mate_bench.workload"]
+stt = "mate_workload_stt:SttWorkload"
+[dependency-groups]
+dev = ["pytest>=8.0", "ruff>=0.4"]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["src/mate_workload_stt"]

mate_workload_stt-0.1.0/src/mate_workload_stt/__init__.py ADDED Viewed

@@ -0,0 +1,123 @@
+"""Speech-to-text workload plugin for mate-bench.
+Benchmarks STT transcription speed (RTF) and accuracy (WER) using
+LibriSpeech test-clean audio clips with a faster-whisper backend.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any
+from mate_bench._utils import sha256_file
+from mate_bench.paths import TEST_SETS_DIR
+from mate_bench.plugin import (
+    EnginePlugin,
+    Measurement,
+    Mode,
+    PluginManifest,
+    ProfileConfig,
+    TestSetSpec,
+)
+from ._download import fetch_clips, fetch_manifest
+from ._measure import measure
+from ._profiles import PROFILES, TEST_SETS
+__all__ = ["SttWorkload"]  # the workload class is the only public name of this package
+_STT_DIR = TEST_SETS_DIR / "stt"  # cache root; holds the "manifests" and "clips" subdirectories
+class SttWorkload:
+    name = "stt"
+    manifest = PluginManifest(requires_mate_bench=">=0.1,<0.2", api_version=1)
+    profiles: dict[str, ProfileConfig] = PROFILES
+    test_sets: dict[str, TestSetSpec] = TEST_SETS
+    def estimate_download(self, profile: str) -> int:
+        return PROFILES[profile].download_size_bytes
+    def estimate_vram(self, profile: str) -> int:
+        return int(PROFILES[profile].vram_required_gb * 1024**3)
+    def estimate_runtime(self, profile: str) -> int:
+        return PROFILES[profile].estimated_runtime_seconds
+    def required_models(self, profile: str) -> list[str]:
+        return [PROFILES[profile].reference_engine_config["model"]]
+    def _manifest_path(self, test_set_id: str) -> Path:
+        spec = TEST_SETS[test_set_id]
+        name = spec.url.rsplit("/", 1)[-1]
+        return _STT_DIR / "manifests" / name
+    def setup_closed(self, profile: str) -> None:
+        spec = TEST_SETS[PROFILES[profile].test_set_id]
+        cached_manifest = fetch_manifest(spec.url, spec.sha256, _STT_DIR)
+        fetch_clips(cached_manifest, _STT_DIR / "clips")
+    def setup_open(self, profile: str, user_inputs: dict[str, Any]) -> None:
+        raise NotImplementedError("open mode not supported for stt workload")
+    def _load_clips_and_paths(self, profile: str) -> tuple[list[dict], dict[str, Path]]:
+        test_set_id = PROFILES[profile].test_set_id
+        spec = TEST_SETS[test_set_id]
+        cached_manifest = fetch_manifest(spec.url, spec.sha256, _STT_DIR)
+        clip_paths = fetch_clips(cached_manifest, _STT_DIR / "clips")
+        return cached_manifest["clips"], clip_paths
+    def run(
+        self,
+        profile: str,
+        mode: Mode,
+        engine: EnginePlugin,
+        runs: int,
+        warmup_runs: int,
+    ) -> Measurement:
+        if mode != Mode.CLOSED:
+            raise NotImplementedError("open mode not supported for stt workload")
+        cfg = PROFILES[profile].reference_engine_config
+        model = cfg["model"]
+        clips, clip_paths = self._load_clips_and_paths(profile)
+        median_stats, std_dev_stats, throttling_detected = measure(
+            engine,  # type: ignore[arg-type]
+            model,
+            clips,
+            clip_paths,
+            runs,
+            warmup_runs,
+        )
+        return Measurement(
+            runs=runs,
+            warmup_runs=warmup_runs,
+            median=median_stats,
+            std_dev=std_dev_stats,
+            vram_peak_gb=0.0,
+            throttling_detected=throttling_detected,
+        )
+    def test_set_hash(self, profile: str) -> str:
+        path = self._manifest_path(PROFILES[profile].test_set_id)
+        if not path.exists():
+            self.setup_closed(profile)
+        return sha256_file(path)
+    def cleanup(self, profile: str) -> None:
+        test_set_id = PROFILES[profile].test_set_id
+        spec = TEST_SETS[test_set_id]
+        name = spec.url.rsplit("/", 1)[-1]
+        manifest_path = _STT_DIR / "manifests" / name
+        if manifest_path.exists():
+            with manifest_path.open() as f:
+                manifest = json.load(f)
+            for clip in manifest.get("clips", []):
+                fname = clip["url"].rsplit("/", 1)[-1]
+                p = _STT_DIR / "clips" / fname
+                if p.exists():
+                    p.unlink()
+            manifest_path.unlink()

mate_workload_stt-0.1.0/src/mate_workload_stt/_download.py ADDED Viewed

@@ -0,0 +1,56 @@
+from __future__ import annotations
+import hashlib
+import json
+import urllib.request
+from pathlib import Path
+def _sha256_file(path: Path) -> str:
+    h = hashlib.sha256()
+    with path.open("rb") as f:
+        for chunk in iter(lambda: f.read(65536), b""):
+            h.update(chunk)
+    return f"sha256:{h.hexdigest()}"
+def _download(url: str, dst: Path) -> None:
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    with urllib.request.urlopen(url) as resp, dst.open("wb") as f:
+        while chunk := resp.read(65536):
+            f.write(chunk)
+def fetch_manifest(url: str, expected_sha256: str, cache_dir: Path) -> dict:
+    """Download and cache the test-set manifest JSON; verify hash."""
+    name = url.rsplit("/", 1)[-1]
+    dst = cache_dir / "manifests" / name
+    if not dst.exists():
+        _download(url, dst)
+    if expected_sha256 != "sha256:PENDING":
+        actual = _sha256_file(dst)
+        if actual != expected_sha256:
+            dst.unlink()
+            raise ValueError(f"Manifest hash mismatch: expected {expected_sha256}, got {actual}")
+    with dst.open() as f:
+        return json.load(f)
+def fetch_clips(manifest: dict, clips_dir: Path) -> dict[str, Path]:
+    """Download all clips referenced in the manifest; return {clip_id: local_path}."""
+    clips_dir.mkdir(parents=True, exist_ok=True)
+    result: dict[str, Path] = {}
+    for clip in manifest["clips"]:
+        clip_id = clip["id"]
+        fname = clip["url"].rsplit("/", 1)[-1]
+        dst = clips_dir / fname
+        if not dst.exists():
+            _download(clip["url"], dst)
+        actual = _sha256_file(dst)
+        if actual != clip["sha256"]:
+            dst.unlink()
+            raise ValueError(
+                f"Clip {clip_id} hash mismatch: expected {clip['sha256']}, got {actual}"
+            )
+        result[clip_id] = dst
+    return result

mate_workload_stt-0.1.0/src/mate_workload_stt/_measure.py ADDED Viewed

@@ -0,0 +1,109 @@
+from __future__ import annotations
+import statistics
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Protocol
+class _TranscribeResult(Protocol):
+    """Structural type for what an engine's ``transcribe`` returns: the fields this module reads."""
+    text: str  # transcript; used as the hypothesis when computing WER
+    audio_duration_s: float  # summed into the run's total audio (presumably seconds, per the suffix)
+    processing_time_s: float  # summed into the run's total processing time (presumably seconds)
+class _SttEngine(Protocol):
+    """Structural type for engines accepted by ``measure``: anything with a matching ``transcribe``."""
+    def transcribe(self, audio_path: Path, model: str) -> _TranscribeResult: ...
+@dataclass
+class _RunStats:
+    """Aggregate of one full pass over the clip list, as built by ``_aggregate_run``."""
+    total_audio_s: float  # sum of the per-clip audio_duration_s values
+    total_processing_s: float  # sum of the per-clip processing_time_s values
+    rtf: float  # total_processing_s / total_audio_s, or 0.0 when total audio is not positive
+    wer: float  # corpus-level word error rate of the pass
+def _word_edit_distance(ref: list[str], hyp: list[str]) -> int:
+    m, n = len(ref), len(hyp)
+    dp = list(range(n + 1))
+    for i in range(1, m + 1):
+        prev = dp[:]
+        dp[0] = i
+        for j in range(1, n + 1):
+            if ref[i - 1] == hyp[j - 1]:
+                dp[j] = prev[j - 1]
+            else:
+                dp[j] = 1 + min(prev[j], dp[j - 1], prev[j - 1])
+    return dp[n]
+def corpus_wer(references: list[str], hypotheses: list[str]) -> float:
+    """Word Error Rate over a corpus (lower is better)."""
+    total_errors = 0
+    total_words = 0
+    for ref, hyp in zip(references, hypotheses, strict=False):
+        r = ref.upper().split()
+        h = hyp.upper().split()
+        total_errors += _word_edit_distance(r, h)
+        total_words += len(r)
+    return total_errors / max(total_words, 1)
+def _aggregate_run(
+    results: list[_TranscribeResult],
+    references: list[str],
+) -> _RunStats:
+    total_audio = sum(r.audio_duration_s for r in results)
+    total_proc = sum(r.processing_time_s for r in results)
+    rtf = total_proc / total_audio if total_audio > 0 else 0.0
+    wer = corpus_wer(references, [r.text for r in results])
+    return _RunStats(
+        total_audio_s=total_audio,
+        total_processing_s=total_proc,
+        rtf=rtf,
+        wer=wer,
+    )
+def measure(
+    engine: _SttEngine,
+    model: str,
+    clips: list[dict[str, Any]],
+    clip_paths: dict[str, Path],
+    runs: int,
+    warmup_runs: int,
+) -> tuple[dict[str, Any], dict[str, Any], bool]:
+    """Run the STT benchmark loop; return (median, std_dev, throttling_detected)."""
+    references = [c["reference"] for c in clips]
+    all_stats: list[_RunStats] = []
+    for i in range(warmup_runs + runs):
+        run_results: list[_TranscribeResult] = []
+        for clip in clips:
+            path = clip_paths[clip["id"]]
+            result = engine.transcribe(path, model)
+            run_results.append(result)
+        if i >= warmup_runs:
+            all_stats.append(_aggregate_run(run_results, references))
+    rtf_values = [s.rtf for s in all_stats]
+    wer_values = [s.wer for s in all_stats]
+    median_rtf = statistics.median(rtf_values) if rtf_values else 0.0
+    median_wer = statistics.median(wer_values) if wer_values else 0.0
+    std_rtf = statistics.stdev(rtf_values) if len(rtf_values) > 1 else 0.0
+    std_wer = statistics.stdev(wer_values) if len(wer_values) > 1 else 0.0
+    cv = std_rtf / median_rtf if median_rtf > 0 else 0.0
+    throttling_detected = cv > 0.15
+    median_stats = {
+        "rtf": median_rtf,
+        "wer": median_wer,
+        "total_audio_seconds": all_stats[0].total_audio_s if all_stats else 0.0,
+    }
+    std_dev_stats = {"rtf": std_rtf, "wer": std_wer}
+    return median_stats, std_dev_stats, throttling_detected

mate_workload_stt-0.1.0/src/mate_workload_stt/_profiles.py ADDED Viewed

@@ -0,0 +1,47 @@
+from __future__ import annotations
+from mate_bench.plugin import ProfileConfig, TestSetSpec
+# Base URL of the public R2 CDN bucket; the test-set manifest URLs below are built from it.
+CDN_BASE = "https://pub-f27eb09940c14a8dac6ae7fe10e789f3.r2.dev"
+# Test-set registry, keyed by test-set id (each key repeats the spec's own ``id``).
+# SHA256 values are filled in by scripts/prepare_stt_testset.py after upload.
+# The manifest fetcher skips verification for the placeholder value "sha256:PENDING".
+TEST_SETS: dict[str, TestSetSpec] = {
+    "stt-librispeech-quick-v1": TestSetSpec(
+        id="stt-librispeech-quick-v1",
+        url=f"{CDN_BASE}/stt/stt-librispeech-quick-v1.json",
+        sha256="sha256:420a49113d77e4e72a1740eeb9fff9b1918def6821c994b1d2c7e59990b8adc3",
+        size_bytes=754300,
+        license="CC BY 4.0",
+        source="LibriSpeech test-clean (Panayotov et al., 2015) — openslr.org/12",
+    ),
+    "stt-librispeech-standard-v1": TestSetSpec(
+        id="stt-librispeech-standard-v1",
+        url=f"{CDN_BASE}/stt/stt-librispeech-standard-v1.json",
+        sha256="sha256:66a200da99544095c68b90fff5287ed4460b61c854ef29a29524373973a2915b",
+        size_bytes=3184083,
+        license="CC BY 4.0",
+        source="LibriSpeech test-clean (Panayotov et al., 2015) — openslr.org/12",
+    ),
+}
+# Benchmark profiles, keyed by profile name. Both use the same reference engine and model and
+# declare the same VRAM and download size; they differ in test set and runtime estimate.
+PROFILES: dict[str, ProfileConfig] = {
+    "quick": ProfileConfig(
+        name="quick",
+        description="5 LibriSpeech clips (~50 s audio) with whisper-large-v3 (~3 min, ~3 GB VRAM)",
+        test_set_id="stt-librispeech-quick-v1",
+        reference_engine_config={"engine": "faster-whisper", "model": "large-v3"},
+        vram_required_gb=3.0,
+        download_size_bytes=1_500_000_000,  # large-v3 ~1.5 GB
+        estimated_runtime_seconds=180,
+    ),
+    "standard": ProfileConfig(
+        name="standard",
+        description="20 LibriSpeech clips (~200 s audio) with whisper-large-v3 (~8 min, ~3 GB VRAM)",
+        test_set_id="stt-librispeech-standard-v1",
+        reference_engine_config={"engine": "faster-whisper", "model": "large-v3"},
+        vram_required_gb=3.0,
+        download_size_bytes=1_500_000_000,
+        estimated_runtime_seconds=480,
+    ),
+}

mate_workload_stt-0.1.0/src/mate_workload_stt/py.typed ADDED Viewed

File without changes

mate_workload_stt-0.1.0/tests/fixtures.py ADDED Viewed

@@ -0,0 +1,23 @@
+from pathlib import Path
+# Two clip entries with the keys the measuring code reads ("id", "reference") plus the
+# manifest-style "url", "sha256" and "duration_s". The sha256 values are short placeholders,
+# not real digests.
+MOCK_CLIPS = [
+    {
+        "id": "1089-134686-0000",
+        "url": "https://example.com/stt/clips/1089-134686-0000.flac",
+        "sha256": "sha256:aabbcc",
+        "duration_s": 11.4,
+        "reference": "HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS",
+    },
+    {
+        "id": "1221-135766-0001",
+        "url": "https://example.com/stt/clips/1221-135766-0001.flac",
+        "sha256": "sha256:ddeeff",
+        "duration_s": 6.3,
+        "reference": "AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP",
+    },
+]
+# Local path for each mock clip id. NOTE(review): presumably these files never need to exist
+# because the tests pass the paths to a mocked engine — confirm if a real engine is ever used.
+MOCK_CLIP_PATHS: dict[str, Path] = {
+    "1089-134686-0000": Path("/tmp/1089-134686-0000.flac"),
+    "1221-135766-0001": Path("/tmp/1221-135766-0001.flac"),
+}

mate_workload_stt-0.1.0/tests/test_measure.py ADDED Viewed

@@ -0,0 +1,86 @@
+from __future__ import annotations
+from unittest.mock import MagicMock
+import pytest
+from mate_workload_stt._measure import corpus_wer, measure
+from .fixtures import MOCK_CLIP_PATHS, MOCK_CLIPS
+# ── corpus_wer ────────────────────────────────────────────────────────────────
+class TestCorpusWer:
+    def test_perfect_match(self):
+        assert corpus_wer(["HELLO WORLD"], ["HELLO WORLD"]) == 0.0
+    def test_one_substitution_of_two(self):
+        result = corpus_wer(["HELLO WORLD"], ["HELLO EARTH"])
+        assert result == pytest.approx(0.5)
+    def test_full_deletion(self):
+        result = corpus_wer(["ONE TWO THREE"], [""])
+        assert result == pytest.approx(1.0)
+    def test_empty_reference(self):
+        assert corpus_wer([""], [""]) == 0.0
+    def test_case_insensitive(self):
+        assert corpus_wer(["hello world"], ["HELLO WORLD"]) == 0.0
+    def test_multi_clip_corpus(self):
+        refs = ["ONE TWO", "THREE FOUR"]
+        hyps = ["ONE TWO", "THREE FIVE"]
+        # 0 errors + 1 error over 4 total words = 0.25
+        assert corpus_wer(refs, hyps) == pytest.approx(0.25)
+# ── measure ───────────────────────────────────────────────────────────────────
+def _make_engine(processing_time: float, text: str) -> MagicMock:
+    result = MagicMock()
+    result.text = text
+    result.audio_duration_s = 10.0
+    result.processing_time_s = processing_time
+    engine = MagicMock()
+    engine.transcribe.return_value = result
+    return engine
+class TestMeasure:
+    def test_rtf_is_correct(self):
+        engine = _make_engine(
+            processing_time=1.0, text="HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS"
+        )
+        median, _std_dev, _throttling = measure(
+            engine, "large-v3", MOCK_CLIPS, MOCK_CLIP_PATHS, runs=1, warmup_runs=0
+        )
+        # processing_time=1.0, audio_duration=10.0 per clip → RTF = 0.1
+        assert median["rtf"] == pytest.approx(0.1)
+    def test_warmup_excluded(self):
+        call_count = 0
+        engine = MagicMock()
+        def side_effect(path, model):
+            nonlocal call_count
+            call_count += 1
+            r = MagicMock()
+            r.text = "HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS"
+            r.audio_duration_s = 10.0
+            r.processing_time_s = 1.0
+            return r
+        engine.transcribe.side_effect = side_effect
+        measure(engine, "large-v3", MOCK_CLIPS, MOCK_CLIP_PATHS, runs=1, warmup_runs=2)
+        # 2 warmup + 1 run, each with 2 clips → 6 total calls
+        assert call_count == 6
+    def test_no_throttling_stable_run(self):
+        engine = _make_engine(1.0, "HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS")
+        _, _, throttling = measure(
+            engine, "large-v3", MOCK_CLIPS, MOCK_CLIP_PATHS, runs=3, warmup_runs=0
+        )
+        assert throttling is False