whisper-smith 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- whisper_smith-0.1.0/.gitignore +28 -0
- whisper_smith-0.1.0/.python-version +1 -0
- whisper_smith-0.1.0/.readthedocs.yaml +17 -0
- whisper_smith-0.1.0/LICENSE +21 -0
- whisper_smith-0.1.0/Makefile +27 -0
- whisper_smith-0.1.0/PKG-INFO +236 -0
- whisper_smith-0.1.0/README.md +199 -0
- whisper_smith-0.1.0/data/.gitkeep +0 -0
- whisper_smith-0.1.0/docs/Makefile +18 -0
- whisper_smith-0.1.0/docs/requirements.txt +3 -0
- whisper_smith-0.1.0/docs/source/aligned-json.rst +47 -0
- whisper_smith-0.1.0/docs/source/api/align.rst +6 -0
- whisper_smith-0.1.0/docs/source/api/diarize.rst +6 -0
- whisper_smith-0.1.0/docs/source/api/exporters.rst +6 -0
- whisper_smith-0.1.0/docs/source/api/index.rst +11 -0
- whisper_smith-0.1.0/docs/source/api/models.rst +6 -0
- whisper_smith-0.1.0/docs/source/api/transcribe.rst +6 -0
- whisper_smith-0.1.0/docs/source/cli.rst +68 -0
- whisper_smith-0.1.0/docs/source/conf.py +27 -0
- whisper_smith-0.1.0/docs/source/index.rst +27 -0
- whisper_smith-0.1.0/docs/source/installation.rst +48 -0
- whisper_smith-0.1.0/docs/source/python-api.rst +55 -0
- whisper_smith-0.1.0/docs/source/troubleshooting.rst +33 -0
- whisper_smith-0.1.0/pyproject.toml +67 -0
- whisper_smith-0.1.0/src/whisper_smith/__init__.py +11 -0
- whisper_smith-0.1.0/src/whisper_smith/align.py +47 -0
- whisper_smith-0.1.0/src/whisper_smith/cli.py +332 -0
- whisper_smith-0.1.0/src/whisper_smith/diarize.py +200 -0
- whisper_smith-0.1.0/src/whisper_smith/exporters/__init__.py +176 -0
- whisper_smith-0.1.0/src/whisper_smith/exporters/json.py +14 -0
- whisper_smith-0.1.0/src/whisper_smith/exporters/markdown.py +20 -0
- whisper_smith-0.1.0/src/whisper_smith/exporters/subtitles.py +53 -0
- whisper_smith-0.1.0/src/whisper_smith/exporters/text.py +16 -0
- whisper_smith-0.1.0/src/whisper_smith/models.py +31 -0
- whisper_smith-0.1.0/src/whisper_smith/transcribe.py +317 -0
- whisper_smith-0.1.0/tests/test_align.py +72 -0
- whisper_smith-0.1.0/tests/test_cli_args.py +318 -0
- whisper_smith-0.1.0/tests/test_cli_output_path.py +383 -0
- whisper_smith-0.1.0/tests/test_diarize.py +319 -0
- whisper_smith-0.1.0/tests/test_exporters.py +132 -0
- whisper_smith-0.1.0/tests/test_models.py +22 -0
- whisper_smith-0.1.0/tests/test_package_import.py +4 -0
- whisper_smith-0.1.0/tests/test_transcribe.py +316 -0
- whisper_smith-0.1.0/uv.lock +3949 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
|
|
12
|
+
# Environment files
|
|
13
|
+
.env
|
|
14
|
+
.env.*
|
|
15
|
+
!.env.example
|
|
16
|
+
|
|
17
|
+
# IDE
|
|
18
|
+
.idea/
|
|
19
|
+
.nova/
|
|
20
|
+
|
|
21
|
+
.ruff_cache/
|
|
22
|
+
|
|
23
|
+
# Data directory
|
|
24
|
+
data/
|
|
25
|
+
!data/.gitkeep
|
|
26
|
+
|
|
27
|
+
# macOS
|
|
28
|
+
.DS_Store
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.10
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# .readthedocs.yaml
|
|
2
|
+
version: 2
|
|
3
|
+
|
|
4
|
+
build:
|
|
5
|
+
os: ubuntu-24.04
|
|
6
|
+
tools:
|
|
7
|
+
python: "3.12"
|
|
8
|
+
|
|
9
|
+
sphinx:
|
|
10
|
+
configuration: docs/source/conf.py
|
|
11
|
+
fail_on_warning: true
|
|
12
|
+
|
|
13
|
+
python:
|
|
14
|
+
install:
|
|
15
|
+
- requirements: docs/requirements.txt
|
|
16
|
+
- method: pip
|
|
17
|
+
path: .
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Eiichi YAMAMOTO
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
21
|
+
IN THE SOFTWARE.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
.PHONY: help clean
|
|
2
|
+
|
|
3
|
+
# --- Variables ---
|
|
4
|
+
|
|
5
|
+
.DEFAULT_GOAL := help
|
|
6
|
+
|
|
7
|
+
# --- General Targets ---
|
|
8
|
+
|
|
9
|
+
help: ## Show this help message
|
|
10
|
+
@echo "Usage: make [target]"
|
|
11
|
+
@echo ""
|
|
12
|
+
@echo "Targets:"
|
|
13
|
+
@printf " \033[36m%-28s\033[0m %s\n" "help" "Show this help message"
|
|
14
|
+
@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) \
|
|
15
|
+
| grep -v '^help:' \
|
|
16
|
+
| sort \
|
|
17
|
+
| awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-28s\033[0m %s\n", $$1, $$2}'
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
clean: ## Remove cache files and generated local artifacts
|
|
21
|
+
@echo "Cleaning up..."
|
|
22
|
+
@find . -type d -name "__pycache__" -not -path "*/.venv/*" -exec rm -rf {} +
|
|
23
|
+
@find . -type d -name ".pytest_cache" -not -path "*/.venv/*" -exec rm -rf {} +
|
|
24
|
+
@find . -type d -name ".ruff_cache" -not -path "*/.venv/*" -exec rm -rf {} +
|
|
25
|
+
@rm -rf .coverage htmlcov/
|
|
26
|
+
@echo "Cleanup complete."
|
|
27
|
+
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: whisper-smith
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A small Python transcription helper using OpenAI speech-to-text APIs.
|
|
5
|
+
Project-URL: Documentation, https://whisper-smith.readthedocs.io/
|
|
6
|
+
Project-URL: Homepage, https://github.com/yeiichi/whisper-smith
|
|
7
|
+
Project-URL: Repository, https://github.com/yeiichi/whisper-smith
|
|
8
|
+
Project-URL: Issues, https://github.com/yeiichi/whisper-smith/issues
|
|
9
|
+
Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: audio,diarization,openai,speech-to-text,transcription
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: imageio-ffmpeg
|
|
26
|
+
Requires-Dist: openai
|
|
27
|
+
Requires-Dist: python-dotenv
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=9.0.3; extra == 'dev'
|
|
30
|
+
Provides-Extra: diarize
|
|
31
|
+
Requires-Dist: huggingface-hub<1; extra == 'diarize'
|
|
32
|
+
Requires-Dist: numpy<2; extra == 'diarize'
|
|
33
|
+
Requires-Dist: pyannote-audio; extra == 'diarize'
|
|
34
|
+
Requires-Dist: torch<2.9; extra == 'diarize'
|
|
35
|
+
Requires-Dist: torchaudio<2.9; extra == 'diarize'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# whisper-smith
|
|
39
|
+
|
|
40
|
+
[](https://pypi.org/project/whisper-smith/)
|
|
41
|
+
[](https://pypi.org/project/whisper-smith/)
|
|
42
|
+
[](https://github.com/yeiichi/whisper-smith/blob/main/LICENSE)
|
|
43
|
+
[](https://whisper-smith.readthedocs.io/)
|
|
44
|
+
|
|
45
|
+
`whisper-smith` is a small Python CLI/app helper for transcribing audio files with OpenAI speech-to-text models.
|
|
46
|
+
|
|
47
|
+
## Features
|
|
48
|
+
|
|
49
|
+
- Transcribe local audio files
|
|
50
|
+
- CLI-first workflow for quick terminal use
|
|
51
|
+
- Output as `txt`, `json`, `srt`, or `vtt`
|
|
52
|
+
- Automatically infer output format from output file extension
|
|
53
|
+
- Load environment variables from `.env`
|
|
54
|
+
|
|
55
|
+
## Requirements
|
|
56
|
+
|
|
57
|
+
- Python `3.10+`
|
|
58
|
+
- An OpenAI API key (`OPENAI_API_KEY`)
|
|
59
|
+
- For large-file fallback: either system `ffmpeg` in `PATH`, or Python package `imageio-ffmpeg`
|
|
60
|
+
- For optional speaker diarization: a Hugging Face token (`HUGGINGFACE_TOKEN`) and `pyannote.audio`
|
|
61
|
+
|
|
62
|
+
## Installation
|
|
63
|
+
|
|
64
|
+
### Option 1: uv (recommended)
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
uv sync
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Option 2: pip
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install -e .
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Optional speaker diarization dependencies
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
uv sync --extra diarize
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
or:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pip install -e ".[diarize]"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Configuration
|
|
89
|
+
|
|
90
|
+
Set your API key in the environment or in a `.env` file:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
export OPENAI_API_KEY="your_api_key_here"
|
|
94
|
+
export HUGGINGFACE_TOKEN="your_huggingface_token_here"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Or create `.env` in project root:
|
|
98
|
+
|
|
99
|
+
```env
|
|
100
|
+
OPENAI_API_KEY=your_api_key_here
|
|
101
|
+
HUGGINGFACE_TOKEN=your_huggingface_token_here
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## CLI Usage Guide
|
|
105
|
+
|
|
106
|
+
Basic command:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
whisper-smith <audio_path>
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Show help:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
whisper-smith --help
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### 1) Print transcript to terminal (default `txt`)
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
whisper-smith data/sample.m4a
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### 2) Save transcript to a file
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
whisper-smith data/sample.m4a --output data/sample.txt
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### 3) Choose output format explicitly
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
whisper-smith data/sample.m4a --format json --output data/sample.json
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Supported CLI formats: `txt`, `json`, `srt`, `vtt`
|
|
137
|
+
|
|
138
|
+
### 4) Let format be inferred from output extension
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
whisper-smith data/sample.m4a --output data/sample.srt
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### 5) Overwrite existing file
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
whisper-smith data/sample.m4a --output data/sample.txt --overwrite
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### 6) Run speaker diarization
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
whisper-smith data/sample.m4a --diarize --output data/sample.diarization.json
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Diarization currently supports JSON output only. Optional speaker hints:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
whisper-smith data/sample.m4a --diarize --format json --num-speakers 2
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### 7) Create speaker-aligned transcript JSON
|
|
163
|
+
|
|
164
|
+
Run the full pipeline from one audio file:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
whisper-smith data/sample.m4a --align --output data/sample.aligned.json
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
This writes the main aligned transcript JSON to `data/sample.aligned.json` and
|
|
171
|
+
also writes intermediate artifacts beside it:
|
|
172
|
+
|
|
173
|
+
```text
|
|
174
|
+
data/sample.transcript.json
|
|
175
|
+
data/sample.diarization.json
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
To put the intermediate artifacts in a separate directory:
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
whisper-smith data/sample.m4a --align --output data/sample.aligned.json --artifacts-dir data/artifacts
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## Python Usage
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from pathlib import Path
|
|
188
|
+
from whisper_smith.transcribe import transcribe_audio
|
|
189
|
+
from whisper_smith.exporters import export_transcript
|
|
190
|
+
|
|
191
|
+
result = transcribe_audio(Path("data/sample.m4a"))
|
|
192
|
+
print(result.text)
|
|
193
|
+
|
|
194
|
+
srt = export_transcript(result, "srt")
|
|
195
|
+
Path("data/sample.srt").write_text(srt, encoding="utf-8")
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Speaker diarization
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
from pathlib import Path
|
|
202
|
+
from whisper_smith.diarize import diarize_audio
|
|
203
|
+
|
|
204
|
+
result = diarize_audio(Path("data/sample.m4a"))
|
|
205
|
+
|
|
206
|
+
for segment in result.segments:
|
|
207
|
+
print(segment.start, segment.end, segment.speaker)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
`diarize_audio` uses `HUGGINGFACE_TOKEN` from the environment, or accepts
|
|
211
|
+
`hf_token="..."` explicitly.
|
|
212
|
+
|
|
213
|
+
The default local model is `pyannote/speaker-diarization-3.1`, which is compatible
|
|
214
|
+
with the Intel macOS dependency set. You may pass a different model explicitly
|
|
215
|
+
from Python when running on a newer platform.
|
|
216
|
+
|
|
217
|
+
## Notes
|
|
218
|
+
|
|
219
|
+
- If `--output` is omitted, transcript is printed to stdout.
|
|
220
|
+
- If `--format` is omitted, format is inferred from `--output` extension when possible.
|
|
221
|
+
- If an output file already exists, add `--overwrite` to replace it.
|
|
222
|
+
- Transcription uses a timestamp-capable OpenAI model by default so JSON, SRT,
|
|
223
|
+
and VTT outputs have segment timestamps.
|
|
224
|
+
- For large audio files, `whisper-smith` automatically splits audio into chunks and
|
|
225
|
+
merges transcript text.
|
|
226
|
+
- If diarization fails with `torchaudio` missing `AudioMetaData`, refresh the
|
|
227
|
+
optional diarization dependencies with `uv lock --upgrade-package torch
|
|
228
|
+
--upgrade-package torchaudio` and then `uv sync --extra diarize`.
|
|
229
|
+
|
|
230
|
+
## Development
|
|
231
|
+
|
|
232
|
+
Run tests:
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
pytest
|
|
236
|
+
```
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# whisper-smith
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/whisper-smith/)
|
|
4
|
+
[Supported Python versions](https://pypi.org/project/whisper-smith/)
|
|
5
|
+
[License: MIT](https://github.com/yeiichi/whisper-smith/blob/main/LICENSE)
|
|
6
|
+
[Documentation](https://whisper-smith.readthedocs.io/)
|
|
7
|
+
|
|
8
|
+
`whisper-smith` is a small Python CLI/app helper for transcribing audio files with OpenAI speech-to-text models.
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
- Transcribe local audio files
|
|
13
|
+
- CLI-first workflow for quick terminal use
|
|
14
|
+
- Output as `txt`, `json`, `srt`, or `vtt`
|
|
15
|
+
- Automatically infer output format from output file extension
|
|
16
|
+
- Load environment variables from `.env`
|
|
17
|
+
|
|
18
|
+
## Requirements
|
|
19
|
+
|
|
20
|
+
- Python `3.10+`
|
|
21
|
+
- An OpenAI API key (`OPENAI_API_KEY`)
|
|
22
|
+
- For large-file fallback: either system `ffmpeg` in `PATH`, or Python package `imageio-ffmpeg`
|
|
23
|
+
- For optional speaker diarization: a Hugging Face token (`HUGGINGFACE_TOKEN`) and `pyannote.audio`
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
### Option 1: uv (recommended)
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv sync
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Option 2: pip
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Optional speaker diarization dependencies
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uv sync --extra diarize
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
or:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install -e ".[diarize]"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Configuration
|
|
52
|
+
|
|
53
|
+
Set your OpenAI API key (and, for speaker diarization, your Hugging Face token) in the environment or in a `.env` file:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
export OPENAI_API_KEY="your_api_key_here"
|
|
57
|
+
export HUGGINGFACE_TOKEN="your_huggingface_token_here"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Or create a `.env` file in the project root:
|
|
61
|
+
|
|
62
|
+
```env
|
|
63
|
+
OPENAI_API_KEY=your_api_key_here
|
|
64
|
+
HUGGINGFACE_TOKEN=your_huggingface_token_here
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## CLI Usage Guide
|
|
68
|
+
|
|
69
|
+
Basic command:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
whisper-smith <audio_path>
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Show help:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
whisper-smith --help
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### 1) Print transcript to terminal (default `txt`)
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
whisper-smith data/sample.m4a
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 2) Save transcript to a file
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
whisper-smith data/sample.m4a --output data/sample.txt
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### 3) Choose output format explicitly
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
whisper-smith data/sample.m4a --format json --output data/sample.json
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Supported CLI formats: `txt`, `json`, `srt`, `vtt`
|
|
100
|
+
|
|
101
|
+
### 4) Let format be inferred from output extension
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
whisper-smith data/sample.m4a --output data/sample.srt
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### 5) Overwrite existing file
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
whisper-smith data/sample.m4a --output data/sample.txt --overwrite
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### 6) Run speaker diarization
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
whisper-smith data/sample.m4a --diarize --output data/sample.diarization.json
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Diarization currently supports JSON output only. Optional speaker hints:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
whisper-smith data/sample.m4a --diarize --format json --num-speakers 2
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### 7) Create speaker-aligned transcript JSON
|
|
126
|
+
|
|
127
|
+
Run the full pipeline from one audio file:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
whisper-smith data/sample.m4a --align --output data/sample.aligned.json
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
This writes the main aligned transcript JSON to `data/sample.aligned.json` and
|
|
134
|
+
also writes intermediate artifacts beside it:
|
|
135
|
+
|
|
136
|
+
```text
|
|
137
|
+
data/sample.transcript.json
|
|
138
|
+
data/sample.diarization.json
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
To put the intermediate artifacts in a separate directory:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
whisper-smith data/sample.m4a --align --output data/sample.aligned.json --artifacts-dir data/artifacts
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Python Usage
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from pathlib import Path
|
|
151
|
+
from whisper_smith.transcribe import transcribe_audio
|
|
152
|
+
from whisper_smith.exporters import export_transcript
|
|
153
|
+
|
|
154
|
+
result = transcribe_audio(Path("data/sample.m4a"))
|
|
155
|
+
print(result.text)
|
|
156
|
+
|
|
157
|
+
srt = export_transcript(result, "srt")
|
|
158
|
+
Path("data/sample.srt").write_text(srt, encoding="utf-8")
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Speaker diarization
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from pathlib import Path
|
|
165
|
+
from whisper_smith.diarize import diarize_audio
|
|
166
|
+
|
|
167
|
+
result = diarize_audio(Path("data/sample.m4a"))
|
|
168
|
+
|
|
169
|
+
for segment in result.segments:
|
|
170
|
+
print(segment.start, segment.end, segment.speaker)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
`diarize_audio` uses `HUGGINGFACE_TOKEN` from the environment, or accepts
|
|
174
|
+
`hf_token="..."` explicitly.
|
|
175
|
+
|
|
176
|
+
The default local model is `pyannote/speaker-diarization-3.1`, which is compatible
|
|
177
|
+
with the Intel macOS dependency set. You may pass a different model explicitly
|
|
178
|
+
from Python when running on a newer platform.
|
|
179
|
+
|
|
180
|
+
## Notes
|
|
181
|
+
|
|
182
|
+
- If `--output` is omitted, the transcript is printed to stdout.
|
|
183
|
+
- If `--format` is omitted, the format is inferred from the `--output` file extension when possible.
|
|
184
|
+
- If an output file already exists, add `--overwrite` to replace it.
|
|
185
|
+
- Transcription uses a timestamp-capable OpenAI model by default so JSON, SRT,
|
|
186
|
+
and VTT outputs have segment timestamps.
|
|
187
|
+
- For large audio files, `whisper-smith` automatically splits audio into chunks and
|
|
188
|
+
merges transcript text.
|
|
189
|
+
- If diarization fails with `torchaudio` missing `AudioMetaData`, refresh the
|
|
190
|
+
optional diarization dependencies with `uv lock --upgrade-package torch
|
|
191
|
+
--upgrade-package torchaudio` and then `uv sync --extra diarize`.
|
|
192
|
+
|
|
193
|
+
## Development
|
|
194
|
+
|
|
195
|
+
Run tests:
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
pytest
|
|
199
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
.PHONY: help clean html
|
|
2
|
+
|
|
3
|
+
SPHINXOPTS ?=
|
|
4
|
+
SPHINXBUILD ?= sphinx-build
|
|
5
|
+
SOURCEDIR = source
|
|
6
|
+
BUILDDIR = build
|
|
7
|
+
|
|
8
|
+
help:
|
|
9
|
+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
|
|
10
|
+
|
|
11
|
+
clean:
|
|
12
|
+
@rm -rf "$(BUILDDIR)"
|
|
13
|
+
|
|
14
|
+
html:
|
|
15
|
+
@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
|
|
16
|
+
|
|
17
|
+
%:
|
|
18
|
+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
Aligned JSON Workflow
|
|
2
|
+
=====================
|
|
3
|
+
|
|
4
|
+
The aligned JSON workflow is the main product flow for combining transcription
|
|
5
|
+
and diarization.
|
|
6
|
+
|
|
7
|
+
Run the pipeline
|
|
8
|
+
----------------
|
|
9
|
+
|
|
10
|
+
.. code-block:: bash
|
|
11
|
+
|
|
12
|
+
whisper-smith data/sample.m4a --align --output data/sample.aligned.json
|
|
13
|
+
|
|
14
|
+
Outputs:
|
|
15
|
+
|
|
16
|
+
.. code-block:: text
|
|
17
|
+
|
|
18
|
+
data/sample.aligned.json
|
|
19
|
+
data/sample.transcript.json
|
|
20
|
+
data/sample.diarization.json
|
|
21
|
+
|
|
22
|
+
Output shape
|
|
23
|
+
------------
|
|
24
|
+
|
|
25
|
+
Each aligned transcript segment contains timestamps, text, and the assigned
|
|
26
|
+
speaker label:
|
|
27
|
+
|
|
28
|
+
.. code-block:: json
|
|
29
|
+
|
|
30
|
+
{
|
|
31
|
+
"segments": [
|
|
32
|
+
{
|
|
33
|
+
"start": 0.0,
|
|
34
|
+
"end": 7.08,
|
|
35
|
+
"text": "Hello world.",
|
|
36
|
+
"speaker": "SPEAKER_01"
|
|
37
|
+
}
|
|
38
|
+
],
|
|
39
|
+
"text": "Hello world."
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
How speakers are assigned
|
|
43
|
+
-------------------------
|
|
44
|
+
|
|
45
|
+
``assign_speakers`` compares each transcript segment with diarization segments
|
|
46
|
+
and chooses the speaker with the largest time overlap. If no diarization segment
|
|
47
|
+
overlaps, the transcript segment keeps its existing speaker value.
|