open-asr-server 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 codyw912
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,196 @@
1
+ Metadata-Version: 2.3
2
+ Name: open-asr-server
3
+ Version: 0.1.2
4
+ Summary: OpenAI-compatible ASR server with pluggable local backends.
5
+ Keywords: asr,speech-to-text,openai,fastapi,whisper
6
+ Author: codyw912
7
+ Author-email: codyw912 <32690983+codyw912@users.noreply.github.com>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2025 codyw912
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ Classifier: License :: OSI Approved :: MIT License
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.11
32
+ Classifier: Programming Language :: Python :: 3.12
33
+ Classifier: Programming Language :: Python :: 3.13
34
+ Classifier: Programming Language :: Python :: 3.14
35
+ Classifier: Programming Language :: Python :: Implementation :: CPython
36
+ Requires-Dist: fastapi>=0.110
37
+ Requires-Dist: huggingface-hub>=0.23
38
+ Requires-Dist: pydantic>=2.0
39
+ Requires-Dist: python-multipart>=0.0.9
40
+ Requires-Dist: typer>=0.12
41
+ Requires-Dist: uvicorn>=0.29
42
+ Requires-Dist: pytest>=7.4 ; extra == 'dev'
43
+ Requires-Dist: pytest-cov>=4.1 ; extra == 'dev'
44
+ Requires-Dist: httpx>=0.25 ; extra == 'dev'
45
+ Requires-Dist: lightning-whisper-mlx ; python_full_version < '3.12' and extra == 'lightning-whisper'
46
+ Requires-Dist: parakeet-mlx ; extra == 'parakeet'
47
+ Requires-Dist: mlx-whisper ; python_full_version < '3.12' and extra == 'whisper'
48
+ Requires-Python: >=3.11
49
+ Project-URL: Homepage, https://github.com/codyw912/open-asr-server
50
+ Project-URL: Repository, https://github.com/codyw912/open-asr-server
51
+ Project-URL: Issues, https://github.com/codyw912/open-asr-server/issues
52
+ Provides-Extra: dev
53
+ Provides-Extra: lightning-whisper
54
+ Provides-Extra: parakeet
55
+ Provides-Extra: whisper
56
+ Description-Content-Type: text/markdown
57
+
58
+ # Open ASR Server
59
+
60
+ [![CI](https://github.com/codyw912/open-asr-server/actions/workflows/ci.yml/badge.svg)](https://github.com/codyw912/open-asr-server/actions/workflows/ci.yml)
61
+ [![PyPI](https://img.shields.io/pypi/v/open-asr-server)](https://pypi.org/project/open-asr-server/)
62
+
63
+ OpenAI-compatible ASR server with pluggable local transcription backends.
64
+
65
+ ## Install
66
+
67
+ Base install includes the API server and shared models/formatters:
68
+
69
+ ```bash
70
+ uv tool install "open-asr-server"
71
+ ```
72
+
73
+ Add backend extras as needed:
74
+
75
+ ```bash
76
+ uv tool install "open-asr-server[parakeet]"
77
+ uv tool install "open-asr-server[whisper]"
78
+ uv tool install "open-asr-server[lightning-whisper]"
79
+ ```
80
+
81
+ Note: the Whisper extras currently require Python 3.11 (tiktoken build constraints on 3.12+).
82
+
83
+ ### Whisper troubleshooting
84
+
85
+ If `uv run --extra whisper` fails on Python 3.12+, use a 3.11 interpreter for now:
86
+
87
+ ```bash
88
+ uv run --python 3.11 --extra whisper -- open-asr-server serve --host 127.0.0.1 --port 8000
89
+ ```
90
+
91
+ ## Run
92
+
93
+ Install at least one backend extra before running (the default model uses
94
+ Parakeet MLX):
95
+
96
+ ```bash
97
+ uv tool install "open-asr-server[parakeet]"
98
+ ```
99
+
100
+ Then start the server:
101
+
102
+ ```bash
103
+ uv tool run open-asr-server serve --host 127.0.0.1 --port 8000
104
+ ```
105
+
106
+ Environment variables:
107
+
108
+ - `OPEN_ASR_SERVER_DEFAULT_MODEL`: default model ID for requests
109
+ - `OPEN_ASR_SERVER_PRELOAD`: comma-separated models to preload at startup
110
+ - `OPEN_ASR_SERVER_API_KEY`: optional shared secret for requests
111
+ - `OPEN_ASR_SERVER_ALLOWED_MODELS`: comma-separated allowed model IDs or patterns
112
+ - `OPEN_ASR_SERVER_MAX_UPLOAD_BYTES`: max upload size in bytes (default: 26214400, i.e. 25 MiB)
113
+ - `OPEN_ASR_SERVER_RATE_LIMIT_PER_MINUTE`: optional per-client request limit (off by default)
114
+ - `OPEN_ASR_SERVER_TRANSCRIBE_TIMEOUT_SECONDS`: optional transcription timeout (off by default)
115
+ - `OPEN_ASR_SERVER_TRANSCRIBE_WORKERS`: optional thread pool size for transcriptions
116
+ - `OPEN_ASR_SERVER_MODEL_DIR`: override the Hugging Face cache location for this server
117
+ - `OPEN_ASR_SERVER_HF_TOKEN`: optional Hugging Face token for gated/private models
118
+
119
+ Models default to the Hugging Face cache unless a local path is provided. Use
120
+ `OPEN_ASR_SERVER_MODEL_DIR` if you want a dedicated cache without changing your
121
+ global HF environment. Use `OPEN_ASR_SERVER_HF_TOKEN` to authenticate downloads
122
+ without setting global HF environment variables.
123
+
124
+ Use `OPEN_ASR_SERVER_TRANSCRIBE_TIMEOUT_SECONDS` to bound long transcriptions.
125
+ If you set `OPEN_ASR_SERVER_TRANSCRIBE_WORKERS`, transcriptions run in a
126
+ background thread pool instead of the event loop.
127
+
128
+ ## Sample audio
129
+
130
+ Two short clips are included in `samples/` for quick smoke tests:
131
+
132
+ - `samples/jfk_0_5.flac`
133
+ - `samples/jfk_5_10.flac`
134
+
135
+ They are derived from `tests/jfk.flac` in the OpenAI Whisper repo (MIT); the
136
+ original JFK speech is public domain.
137
+
138
+ ```bash
139
+ uv run --extra parakeet scripts/smoke_parakeet.py samples/jfk_0_5.flac
140
+ uv run --python 3.11 --extra whisper scripts/smoke_whisper.py samples/jfk_0_5.flac
141
+ uv run --python 3.11 --extra lightning-whisper scripts/smoke_lightning.py samples/jfk_0_5.flac
142
+ ```
143
+
144
+ ## Backend options
145
+
146
+ All backends are MLX-based today (Apple Silicon/macOS). Non-MLX backends are
147
+ planned, but not yet supported.
148
+
149
+ Model IDs determine which backend is used:
150
+
151
+ - Parakeet MLX: `mlx-community/parakeet-tdt-0.6b-v3` (default) or `parakeet-*`
152
+ - MLX Whisper: `whisper-large-v3-turbo` or `mlx-community/whisper-large-v3-turbo`
153
+ - Lightning Whisper MLX: `lightning-whisper-distil-large-v3`
154
+
155
+ ## API compatibility
156
+
157
+ The server implements:
158
+
159
+ - `POST /v1/audio/transcriptions`
160
+ - `GET /v1/models`
161
+
162
+ Example:
163
+
164
+ ```bash
165
+ curl -s -X POST "http://127.0.0.1:8000/v1/audio/transcriptions" \
166
+ -F "file=@audio.wav" \
167
+ -F "model=whisper-large-v3-turbo"
168
+ ```
169
+
170
+ ## Security
171
+
172
+ This server is designed for trusted networks. If you expose it publicly, enable
173
+ `OPEN_ASR_SERVER_API_KEY` and front it with a reverse proxy that provides
174
+ TLS and rate limiting. `OPEN_ASR_SERVER_RATE_LIMIT_PER_MINUTE` offers a simple
175
+ in-process limiter for single-instance use, but it is not a substitute for
176
+ production-grade rate limiting.
177
+
178
+ API key headers:
179
+
180
+ - `Authorization: Bearer <token>`
181
+ - `X-API-Key: <token>`
182
+
183
+ Use `OPEN_ASR_SERVER_ALLOWED_MODELS` to limit which model IDs can be loaded
184
+ and prevent unbounded downloads. Avoid logging request bodies or filenames if
185
+ those may contain sensitive data, and review reverse-proxy access logs for any
186
+ retention concerns.
187
+
188
+ ## Release
189
+
190
+ ```bash
191
+ uv version --bump patch
192
+ uv run --extra dev pytest
193
+ uv build --no-sources
194
+ uv publish --index testpypi --token "$UV_PUBLISH_TOKEN"
195
+ uv publish --token "$UV_PUBLISH_TOKEN"
196
+ ```
@@ -0,0 +1,139 @@
1
+ # Open ASR Server
2
+
3
+ [![CI](https://github.com/codyw912/open-asr-server/actions/workflows/ci.yml/badge.svg)](https://github.com/codyw912/open-asr-server/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/open-asr-server)](https://pypi.org/project/open-asr-server/)
5
+
6
+ OpenAI-compatible ASR server with pluggable local transcription backends.
7
+
8
+ ## Install
9
+
10
+ Base install includes the API server and shared models/formatters:
11
+
12
+ ```bash
13
+ uv tool install "open-asr-server"
14
+ ```
15
+
16
+ Add backend extras as needed:
17
+
18
+ ```bash
19
+ uv tool install "open-asr-server[parakeet]"
20
+ uv tool install "open-asr-server[whisper]"
21
+ uv tool install "open-asr-server[lightning-whisper]"
22
+ ```
23
+
24
+ Note: the Whisper extras currently require Python 3.11 (tiktoken build constraints on 3.12+).
25
+
26
+ ### Whisper troubleshooting
27
+
28
+ If `uv run --extra whisper` fails on Python 3.12+, use a 3.11 interpreter for now:
29
+
30
+ ```bash
31
+ uv run --python 3.11 --extra whisper -- open-asr-server serve --host 127.0.0.1 --port 8000
32
+ ```
33
+
34
+ ## Run
35
+
36
+ Install at least one backend extra before running (the default model uses
37
+ Parakeet MLX):
38
+
39
+ ```bash
40
+ uv tool install "open-asr-server[parakeet]"
41
+ ```
42
+
43
+ Then start the server:
44
+
45
+ ```bash
46
+ uv tool run open-asr-server serve --host 127.0.0.1 --port 8000
47
+ ```
48
+
49
+ Environment variables:
50
+
51
+ - `OPEN_ASR_SERVER_DEFAULT_MODEL`: default model ID for requests
52
+ - `OPEN_ASR_SERVER_PRELOAD`: comma-separated models to preload at startup
53
+ - `OPEN_ASR_SERVER_API_KEY`: optional shared secret for requests
54
+ - `OPEN_ASR_SERVER_ALLOWED_MODELS`: comma-separated allowed model IDs or patterns
55
+ - `OPEN_ASR_SERVER_MAX_UPLOAD_BYTES`: max upload size in bytes (default: 26214400, i.e. 25 MiB)
56
+ - `OPEN_ASR_SERVER_RATE_LIMIT_PER_MINUTE`: optional per-client request limit (off by default)
57
+ - `OPEN_ASR_SERVER_TRANSCRIBE_TIMEOUT_SECONDS`: optional transcription timeout (off by default)
58
+ - `OPEN_ASR_SERVER_TRANSCRIBE_WORKERS`: optional thread pool size for transcriptions
59
+ - `OPEN_ASR_SERVER_MODEL_DIR`: override the Hugging Face cache location for this server
60
+ - `OPEN_ASR_SERVER_HF_TOKEN`: optional Hugging Face token for gated/private models
61
+
62
+ Models default to the Hugging Face cache unless a local path is provided. Use
63
+ `OPEN_ASR_SERVER_MODEL_DIR` if you want a dedicated cache without changing your
64
+ global HF environment. Use `OPEN_ASR_SERVER_HF_TOKEN` to authenticate downloads
65
+ without setting global HF environment variables.
66
+
67
+ Use `OPEN_ASR_SERVER_TRANSCRIBE_TIMEOUT_SECONDS` to bound long transcriptions.
68
+ If you set `OPEN_ASR_SERVER_TRANSCRIBE_WORKERS`, transcriptions run in a
69
+ background thread pool instead of the event loop.
70
+
71
+ ## Sample audio
72
+
73
+ Two short clips are included in `samples/` for quick smoke tests:
74
+
75
+ - `samples/jfk_0_5.flac`
76
+ - `samples/jfk_5_10.flac`
77
+
78
+ They are derived from `tests/jfk.flac` in the OpenAI Whisper repo (MIT); the
79
+ original JFK speech is public domain.
80
+
81
+ ```bash
82
+ uv run --extra parakeet scripts/smoke_parakeet.py samples/jfk_0_5.flac
83
+ uv run --python 3.11 --extra whisper scripts/smoke_whisper.py samples/jfk_0_5.flac
84
+ uv run --python 3.11 --extra lightning-whisper scripts/smoke_lightning.py samples/jfk_0_5.flac
85
+ ```
86
+
87
+ ## Backend options
88
+
89
+ All backends are MLX-based today (Apple Silicon/macOS). Non-MLX backends are
90
+ planned, but not yet supported.
91
+
92
+ Model IDs determine which backend is used:
93
+
94
+ - Parakeet MLX: `mlx-community/parakeet-tdt-0.6b-v3` (default) or `parakeet-*`
95
+ - MLX Whisper: `whisper-large-v3-turbo` or `mlx-community/whisper-large-v3-turbo`
96
+ - Lightning Whisper MLX: `lightning-whisper-distil-large-v3`
97
+
98
+ ## API compatibility
99
+
100
+ The server implements:
101
+
102
+ - `POST /v1/audio/transcriptions`
103
+ - `GET /v1/models`
104
+
105
+ Example:
106
+
107
+ ```bash
108
+ curl -s -X POST "http://127.0.0.1:8000/v1/audio/transcriptions" \
109
+ -F "file=@audio.wav" \
110
+ -F "model=whisper-large-v3-turbo"
111
+ ```
112
+
113
+ ## Security
114
+
115
+ This server is designed for trusted networks. If you expose it publicly, enable
116
+ `OPEN_ASR_SERVER_API_KEY` and front it with a reverse proxy that provides
117
+ TLS and rate limiting. `OPEN_ASR_SERVER_RATE_LIMIT_PER_MINUTE` offers a simple
118
+ in-process limiter for single-instance use, but it is not a substitute for
119
+ production-grade rate limiting.
120
+
121
+ API key headers:
122
+
123
+ - `Authorization: Bearer <token>`
124
+ - `X-API-Key: <token>`
125
+
126
+ Use `OPEN_ASR_SERVER_ALLOWED_MODELS` to limit which model IDs can be loaded
127
+ and prevent unbounded downloads. Avoid logging request bodies or filenames if
128
+ those may contain sensitive data, and review reverse-proxy access logs for any
129
+ retention concerns.
130
+
131
+ ## Release
132
+
133
+ ```bash
134
+ uv version --bump patch
135
+ uv run --extra dev pytest
136
+ uv build --no-sources
137
+ uv publish --index testpypi --token "$UV_PUBLISH_TOKEN"
138
+ uv publish --token "$UV_PUBLISH_TOKEN"
139
+ ```
@@ -0,0 +1,59 @@
1
+ [project]
2
+ name = "open-asr-server"
3
+ version = "0.1.2"
4
+ description = "OpenAI-compatible ASR server with pluggable local backends."
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "codyw912", email = "32690983+codyw912@users.noreply.github.com" }
8
+ ]
9
+ license = { file = "LICENSE" }
10
+ requires-python = ">=3.11"
11
+ classifiers = [
12
+ "License :: OSI Approved :: MIT License",
13
+ "Programming Language :: Python :: 3",
14
+ "Programming Language :: Python :: 3.11",
15
+ "Programming Language :: Python :: 3.12",
16
+ "Programming Language :: Python :: 3.13",
17
+ "Programming Language :: Python :: 3.14",
18
+ "Programming Language :: Python :: Implementation :: CPython",
19
+ ]
20
+ keywords = ["asr", "speech-to-text", "openai", "fastapi", "whisper"]
21
+ dependencies = [
22
+ "fastapi>=0.110",
23
+ "huggingface-hub>=0.23",
24
+ "pydantic>=2.0",
25
+ "python-multipart>=0.0.9",
26
+ "typer>=0.12",
27
+ "uvicorn>=0.29",
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/codyw912/open-asr-server"
32
+ Repository = "https://github.com/codyw912/open-asr-server"
33
+ Issues = "https://github.com/codyw912/open-asr-server/issues"
34
+
35
+ [project.optional-dependencies]
36
+ parakeet = [
37
+ "parakeet-mlx",
38
+ ]
39
+ whisper = [
40
+ "mlx-whisper; python_version < '3.12'",
41
+ ]
42
+ lightning-whisper = [
43
+ "lightning-whisper-mlx; python_version < '3.12'",
44
+ ]
45
+ dev = [
46
+ "pytest>=7.4",
47
+ "pytest-cov>=4.1",
48
+ "httpx>=0.25",
49
+ ]
50
+
51
+ [project.scripts]
52
+ open-asr-server = "open_asr_server.cli:main"
53
+
54
+ [tool.pytest.ini_options]
55
+ pythonpath = ["src"]
56
+
57
+ [build-system]
58
+ requires = ["uv_build>=0.9.24,<0.10.0"]
59
+ build-backend = "uv_build"
@@ -0,0 +1,22 @@
1
+ """OpenAI-compatible ASR server for local transcription."""
2
+
3
+ from .config import ServerConfig
4
+
5
# Resolve the version from the installed distribution metadata so it cannot
# drift from pyproject.toml (the previous literal still said "0.1.0" in the
# 0.1.2 release). The literal fallback covers running from a source tree where
# the distribution is not installed.
from importlib.metadata import PackageNotFoundError as _PackageNotFoundError
from importlib.metadata import version as _distribution_version

try:
    __version__ = _distribution_version("open-asr-server")
except _PackageNotFoundError:
    __version__ = "0.1.2"
6
+
7
def create_app(config: ServerConfig | None = None):
    """Build the FastAPI application for this package.

    The real factory lives in the ``.app`` submodule and is imported only when
    this function is called, not when the package itself is imported.

    Args:
        config: Optional server configuration, forwarded unchanged to the
            underlying factory.

    Returns:
        Whatever the ``.app`` factory returns for ``config``.
    """
    # Import the factory by name: the package-level ``app`` attribute is
    # rebound to the application instance further down this module, so it
    # cannot be relied on to reach the submodule.
    from .app import create_app as _build_app

    application = _build_app(config)
    return application
12
+
13
+
14
# Build a module-level application when the package is imported. If FastAPI
# itself is not installed, expose ``app = None`` instead of failing the import;
# any other missing module is re-raised unchanged.
try:
    app = create_app()
except ModuleNotFoundError as exc:
    if exc.name != "fastapi":
        raise
    app = None

__all__ = ["create_app", "ServerConfig", "app", "__version__"]
@@ -0,0 +1,57 @@
1
+ """FastAPI application factory."""
2
+
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from contextlib import asynccontextmanager
5
+
6
+ from fastapi import FastAPI
7
+
8
+ from .backends import preload_backend
9
+ from .config import ServerConfig
10
+ from .routes import router
11
+ from .utils.rate_limit import RateLimiter
12
+
13
+
14
def create_app(config: ServerConfig | None = None) -> FastAPI:
    """Create and configure the FastAPI application.

    Args:
        config: Server configuration. When omitted, it is loaded with
            ``ServerConfig.from_env()``.

    Returns:
        Configured FastAPI application. ``config``, ``rate_limiter`` and
        ``transcribe_executor`` are stored on ``app.state`` and the API router
        is included.
    """
    # Function-scope import so this block does not depend on a file-level
    # import being added elsewhere.
    from importlib.metadata import PackageNotFoundError, version

    config = config or ServerConfig.from_env()

    # Report the installed distribution version instead of a hard-coded
    # literal (the old "0.1.0" was stale in the 0.1.2 release). Fall back to
    # the release literal when running from an uninstalled source tree.
    try:
        app_version = version("open-asr-server")
    except PackageNotFoundError:
        app_version = "0.1.2"

    # Only create a thread pool when a worker count is configured.
    transcribe_executor = (
        ThreadPoolExecutor(max_workers=config.transcribe_workers)
        if config.transcribe_workers
        else None
    )

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        try:
            # Startup: preload models if configured
            for model in config.preload_models:
                preload_backend(model)
            yield
        finally:
            # Shutdown: close the executor if one was created. Using
            # ``finally`` ensures this also runs when preloading raises or an
            # exception is propagated into the lifespan context.
            if transcribe_executor is not None:
                transcribe_executor.shutdown(wait=False)

    app = FastAPI(
        title="OpenAI-Compatible ASR Server",
        description="Local transcription server with OpenAI Whisper API compatibility",
        version=app_version,
        lifespan=lifespan,
    )

    app.state.config = config
    # The in-process rate limiter is only created when a limit is configured.
    app.state.rate_limiter = (
        RateLimiter(config.rate_limit_per_minute)
        if config.rate_limit_per_minute
        else None
    )
    app.state.transcribe_executor = transcribe_executor
    app.include_router(router)

    return app
@@ -0,0 +1,66 @@
1
+ """Backend registry for transcription engines."""
2
+
3
+ import fnmatch
4
+ from typing import Callable
5
+
6
+ from .base import TranscriptionBackend
7
+
8
# Cache of backend instances created so far, keyed by the model ID that was
# passed to get_backend().
_backends: dict[str, TranscriptionBackend] = {}
# Registered factories, keyed by the model pattern given to register_backend().
_backend_factories: dict[str, Callable[[str], TranscriptionBackend]] = {}
10
+
11
+
12
def register_backend(
    model_pattern: str, factory: Callable[[str], TranscriptionBackend]
) -> None:
    """Associate a backend factory with a model-name pattern.

    Registering the same pattern again replaces the previously stored factory.

    Args:
        model_pattern: Glob pattern matched against model names
            (e.g., "parakeet-*").
        factory: Callable invoked with a model ID; returns the backend
            instance for it.
    """
    _backend_factories[model_pattern] = factory
22
+
23
+
24
def get_backend(model: str) -> TranscriptionBackend | None:
    """Return the backend for ``model``, creating it on first use.

    Args:
        model: Model identifier to look up.

    Returns:
        The cached or newly created backend instance, or None when no
        registered pattern matches ``model``.
    """
    # Fast path: this model ID has already been resolved.
    if model in _backends:
        return _backends[model]

    # The first registered pattern that matches (as a glob or literally) wins.
    for registered_pattern, make_backend in _backend_factories.items():
        is_match = model == registered_pattern or fnmatch.fnmatch(
            model, registered_pattern
        )
        if is_match:
            backend = make_backend(model)
            _backends[model] = backend
            return backend

    return None
39
+
40
+
41
def preload_backend(model: str) -> TranscriptionBackend | None:
    """Load the backend for ``model`` ahead of the first request.

    This delegates to ``get_backend``, so the instance is created and cached
    if it does not exist yet.

    Args:
        model: Model identifier to preload.

    Returns:
        Backend instance, or None if no matching factory found.
    """
    backend = get_backend(model)
    return backend
51
+
52
+
53
def list_registered_patterns() -> list[str]:
    """Return a new list of every model pattern that has a factory."""
    return [*_backend_factories]
56
+
57
+
58
def list_loaded_models() -> list[str]:
    """Return a new list of the model IDs that have a cached backend."""
    return [*_backends]
61
+
62
+
63
+ # Import backends to trigger registration
64
+ from . import parakeet as _parakeet # noqa: F401, E402
65
+ from . import whisper as _whisper # noqa: F401, E402
66
+ from . import lightning_whisper as _lightning_whisper # noqa: F401, E402
@@ -0,0 +1,71 @@
1
+ """Base types and protocol for transcription backends."""
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Protocol, runtime_checkable
6
+
7
+
8
@dataclass
class WordSegment:
    """One recognized word together with its start and end timestamps.

    Attributes:
        word: The word text.
        start: Timestamp at which the word begins.
        end: Timestamp at which the word ends.
    """

    word: str
    start: float
    end: float
15
+
16
+
17
+ @dataclass
18
+ class Segment:
19
+ """A segment (sentence/phrase) with timestamps."""
20
+
21
+ id: int
22
+ start: float
23
+ end: float
24
+ text: str
25
+ confidence: float | None = None
26
+
27
+
28
@dataclass
class TranscriptionResult:
    """Common result shape returned by every backend.

    Attributes:
        text: Full transcribed text.
        language: Language of the transcript, or None.
        duration: Duration value reported for the audio.
        words: Optional word-level entries; None when not provided.
        segments: Optional segment-level entries; None when not provided.
    """

    text: str
    language: str | None
    duration: float
    words: list[WordSegment] | None = None
    segments: list[Segment] | None = None
37
+
38
+
39
@runtime_checkable
class TranscriptionBackend(Protocol):
    """Structural interface that every transcription backend satisfies.

    A new ASR engine is supported by providing an object with these members.
    """

    def transcribe(
        self,
        audio_path: Path,
        language: str | None = None,
        temperature: float = 0.0,
        word_timestamps: bool = False,
        prompt: str | None = None,
    ) -> TranscriptionResult:
        """Transcribe one audio file into the unified result type.

        Args:
            audio_path: Location of the audio file to transcribe.
            language: Optional ISO-639-1 language code hint.
            temperature: Sampling temperature (0.0-1.0).
            word_timestamps: Whether word-level timestamps are requested.
            prompt: Optional prompt to guide decoding, if supported.

        Returns:
            TranscriptionResult holding the text and any optional timestamps.
        """
        ...

    @property
    def supported_languages(self) -> list[str] | None:
        """Supported language codes, or None if the backend only auto-detects."""
        ...