whisper-smith 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. whisper_smith-0.1.0/.gitignore +28 -0
  2. whisper_smith-0.1.0/.python-version +1 -0
  3. whisper_smith-0.1.0/.readthedocs.yaml +17 -0
  4. whisper_smith-0.1.0/LICENSE +21 -0
  5. whisper_smith-0.1.0/Makefile +27 -0
  6. whisper_smith-0.1.0/PKG-INFO +236 -0
  7. whisper_smith-0.1.0/README.md +199 -0
  8. whisper_smith-0.1.0/data/.gitkeep +0 -0
  9. whisper_smith-0.1.0/docs/Makefile +18 -0
  10. whisper_smith-0.1.0/docs/requirements.txt +3 -0
  11. whisper_smith-0.1.0/docs/source/aligned-json.rst +47 -0
  12. whisper_smith-0.1.0/docs/source/api/align.rst +6 -0
  13. whisper_smith-0.1.0/docs/source/api/diarize.rst +6 -0
  14. whisper_smith-0.1.0/docs/source/api/exporters.rst +6 -0
  15. whisper_smith-0.1.0/docs/source/api/index.rst +11 -0
  16. whisper_smith-0.1.0/docs/source/api/models.rst +6 -0
  17. whisper_smith-0.1.0/docs/source/api/transcribe.rst +6 -0
  18. whisper_smith-0.1.0/docs/source/cli.rst +68 -0
  19. whisper_smith-0.1.0/docs/source/conf.py +27 -0
  20. whisper_smith-0.1.0/docs/source/index.rst +27 -0
  21. whisper_smith-0.1.0/docs/source/installation.rst +48 -0
  22. whisper_smith-0.1.0/docs/source/python-api.rst +55 -0
  23. whisper_smith-0.1.0/docs/source/troubleshooting.rst +33 -0
  24. whisper_smith-0.1.0/pyproject.toml +67 -0
  25. whisper_smith-0.1.0/src/whisper_smith/__init__.py +11 -0
  26. whisper_smith-0.1.0/src/whisper_smith/align.py +47 -0
  27. whisper_smith-0.1.0/src/whisper_smith/cli.py +332 -0
  28. whisper_smith-0.1.0/src/whisper_smith/diarize.py +200 -0
  29. whisper_smith-0.1.0/src/whisper_smith/exporters/__init__.py +176 -0
  30. whisper_smith-0.1.0/src/whisper_smith/exporters/json.py +14 -0
  31. whisper_smith-0.1.0/src/whisper_smith/exporters/markdown.py +20 -0
  32. whisper_smith-0.1.0/src/whisper_smith/exporters/subtitles.py +53 -0
  33. whisper_smith-0.1.0/src/whisper_smith/exporters/text.py +16 -0
  34. whisper_smith-0.1.0/src/whisper_smith/models.py +31 -0
  35. whisper_smith-0.1.0/src/whisper_smith/transcribe.py +317 -0
  36. whisper_smith-0.1.0/tests/test_align.py +72 -0
  37. whisper_smith-0.1.0/tests/test_cli_args.py +318 -0
  38. whisper_smith-0.1.0/tests/test_cli_output_path.py +383 -0
  39. whisper_smith-0.1.0/tests/test_diarize.py +319 -0
  40. whisper_smith-0.1.0/tests/test_exporters.py +132 -0
  41. whisper_smith-0.1.0/tests/test_models.py +22 -0
  42. whisper_smith-0.1.0/tests/test_package_import.py +4 -0
  43. whisper_smith-0.1.0/tests/test_transcribe.py +316 -0
  44. whisper_smith-0.1.0/uv.lock +3949 -0
@@ -0,0 +1,28 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Environment files
13
+ .env
14
+ .env.*
15
+ !.env.example
16
+
17
+ # IDE
18
+ .idea/
19
+ .nova/
20
+
21
+ .ruff_cache/
22
+
23
+ # Data directory
24
+ data/
25
+ !data/.gitkeep
26
+
27
+ # macOS
28
+ .DS_Store
@@ -0,0 +1 @@
1
+ 3.10
@@ -0,0 +1,17 @@
1
+ # .readthedocs.yaml
2
+ version: 2
3
+
4
+ build:
5
+ os: ubuntu-24.04
6
+ tools:
7
+ python: "3.12"
8
+
9
+ sphinx:
10
+ configuration: docs/source/conf.py
11
+ fail_on_warning: true
12
+
13
+ python:
14
+ install:
15
+ - requirements: docs/requirements.txt
16
+ - method: pip
17
+ path: .
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Eiichi YAMAMOTO
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
+ IN THE SOFTWARE.
@@ -0,0 +1,27 @@
1
+ .PHONY: help clean
2
+
3
+ # --- Variables ---
4
+
5
+ .DEFAULT_GOAL := help
6
+
7
+ # --- General Targets ---
8
+
9
+ help: ## Show this help message
10
+ @echo "Usage: make [target]"
11
+ @echo ""
12
+ @echo "Targets:"
13
+ @printf " \033[36m%-28s\033[0m %s\n" "help" "Show this help message"
14
+ @grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) \
15
+ | grep -v '^help:' \
16
+ | sort \
17
+ | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-28s\033[0m %s\n", $$1, $$2}'
18
+
19
+
20
+ clean: ## Remove cache files and generated local artifacts
21
+ @echo "Cleaning up..."
22
+ @find . -type d -name "__pycache__" -not -path "*/.venv/*" -exec rm -rf {} +
23
+ @find . -type d -name ".pytest_cache" -not -path "*/.venv/*" -exec rm -rf {} +
24
+ @find . -type d -name ".ruff_cache" -not -path "*/.venv/*" -exec rm -rf {} +
25
+ @rm -rf .coverage htmlcov/
26
+ @echo "Cleanup complete."
27
+
@@ -0,0 +1,236 @@
1
+ Metadata-Version: 2.4
2
+ Name: whisper-smith
3
+ Version: 0.1.0
4
+ Summary: A small Python transcription helper using OpenAI speech-to-text APIs.
5
+ Project-URL: Documentation, https://whisper-smith.readthedocs.io/
6
+ Project-URL: Homepage, https://github.com/yeiichi/whisper-smith
7
+ Project-URL: Repository, https://github.com/yeiichi/whisper-smith
8
+ Project-URL: Issues, https://github.com/yeiichi/whisper-smith/issues
9
+ Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: audio,diarization,openai,speech-to-text,transcription
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: End Users/Desktop
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
23
+ Classifier: Topic :: Utilities
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: imageio-ffmpeg
26
+ Requires-Dist: openai
27
+ Requires-Dist: python-dotenv
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=9.0.3; extra == 'dev'
30
+ Provides-Extra: diarize
31
+ Requires-Dist: huggingface-hub<1; extra == 'diarize'
32
+ Requires-Dist: numpy<2; extra == 'diarize'
33
+ Requires-Dist: pyannote-audio; extra == 'diarize'
34
+ Requires-Dist: torch<2.9; extra == 'diarize'
35
+ Requires-Dist: torchaudio<2.9; extra == 'diarize'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # whisper-smith
39
+
40
+ [![PyPI version](https://img.shields.io/pypi/v/whisper-smith)](https://pypi.org/project/whisper-smith/)
41
+ [![Python versions](https://img.shields.io/pypi/pyversions/whisper-smith)](https://pypi.org/project/whisper-smith/)
42
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/yeiichi/whisper-smith/blob/main/LICENSE)
43
+ [![Docs](https://readthedocs.org/projects/whisper-smith/badge/?version=latest)](https://whisper-smith.readthedocs.io/)
44
+
45
+ `whisper-smith` is a small Python CLI/app helper for transcribing audio files with OpenAI speech-to-text models.
46
+
47
+ ## Features
48
+
49
+ - Transcribe local audio files
50
+ - CLI-first workflow for quick terminal use
51
+ - Output as `txt`, `json`, `srt`, or `vtt`
52
+ - Automatically infer output format from output file extension
53
+ - Load environment variables from `.env`
54
+
55
+ ## Requirements
56
+
57
+ - Python `3.10+`
58
+ - An OpenAI API key (`OPENAI_API_KEY`)
59
+ - For large-file fallback: either system `ffmpeg` in `PATH`, or Python package `imageio-ffmpeg`
60
+ - For optional speaker diarization: a Hugging Face token (`HUGGINGFACE_TOKEN`) and `pyannote.audio`
61
+
62
+ ## Installation
63
+
64
+ ### Option 1: uv (recommended)
65
+
66
+ ```bash
67
+ uv sync
68
+ ```
69
+
70
+ ### Option 2: pip
71
+
72
+ ```bash
73
+ pip install -e .
74
+ ```
75
+
76
+ ### Optional speaker diarization dependencies
77
+
78
+ ```bash
79
+ uv sync --extra diarize
80
+ ```
81
+
82
+ or:
83
+
84
+ ```bash
85
+ pip install -e ".[diarize]"
86
+ ```
87
+
88
+ ## Configuration
89
+
90
+ Set your OpenAI API key (and, for optional speaker diarization, your Hugging Face token) in the environment or in a `.env` file:
91
+
92
+ ```bash
93
+ export OPENAI_API_KEY="your_api_key_here"
94
+ export HUGGINGFACE_TOKEN="your_huggingface_token_here"
95
+ ```
96
+
97
+ Or create a `.env` file in the project root:
98
+
99
+ ```env
100
+ OPENAI_API_KEY=your_api_key_here
101
+ HUGGINGFACE_TOKEN=your_huggingface_token_here
102
+ ```
103
+
104
+ ## CLI Usage Guide
105
+
106
+ Basic command:
107
+
108
+ ```bash
109
+ whisper-smith <audio_path>
110
+ ```
111
+
112
+ Show help:
113
+
114
+ ```bash
115
+ whisper-smith --help
116
+ ```
117
+
118
+ ### 1) Print transcript to terminal (default `txt`)
119
+
120
+ ```bash
121
+ whisper-smith data/sample.m4a
122
+ ```
123
+
124
+ ### 2) Save transcript to a file
125
+
126
+ ```bash
127
+ whisper-smith data/sample.m4a --output data/sample.txt
128
+ ```
129
+
130
+ ### 3) Choose output format explicitly
131
+
132
+ ```bash
133
+ whisper-smith data/sample.m4a --format json --output data/sample.json
134
+ ```
135
+
136
+ Supported CLI formats: `txt`, `json`, `srt`, `vtt`
137
+
138
+ ### 4) Let format be inferred from output extension
139
+
140
+ ```bash
141
+ whisper-smith data/sample.m4a --output data/sample.srt
142
+ ```
143
+
144
+ ### 5) Overwrite existing file
145
+
146
+ ```bash
147
+ whisper-smith data/sample.m4a --output data/sample.txt --overwrite
148
+ ```
149
+
150
+ ### 6) Run speaker diarization
151
+
152
+ ```bash
153
+ whisper-smith data/sample.m4a --diarize --output data/sample.diarization.json
154
+ ```
155
+
156
+ Diarization currently supports JSON output only. Optional speaker hints:
157
+
158
+ ```bash
159
+ whisper-smith data/sample.m4a --diarize --format json --num-speakers 2
160
+ ```
161
+
162
+ ### 7) Create speaker-aligned transcript JSON
163
+
164
+ Run the full pipeline from one audio file:
165
+
166
+ ```bash
167
+ whisper-smith data/sample.m4a --align --output data/sample.aligned.json
168
+ ```
169
+
170
+ This writes the main aligned transcript JSON to `data/sample.aligned.json` and
171
+ also writes intermediate artifacts beside it:
172
+
173
+ ```text
174
+ data/sample.transcript.json
175
+ data/sample.diarization.json
176
+ ```
177
+
178
+ To put the intermediate artifacts in a separate directory:
179
+
180
+ ```bash
181
+ whisper-smith data/sample.m4a --align --output data/sample.aligned.json --artifacts-dir data/artifacts
182
+ ```
183
+
184
+ ## Python Usage
185
+
186
+ ```python
187
+ from pathlib import Path
188
+ from whisper_smith.transcribe import transcribe_audio
189
+ from whisper_smith.exporters import export_transcript
190
+
191
+ result = transcribe_audio(Path("data/sample.m4a"))
192
+ print(result.text)
193
+
194
+ srt = export_transcript(result, "srt")
195
+ Path("data/sample.srt").write_text(srt, encoding="utf-8")
196
+ ```
197
+
198
+ ### Speaker diarization
199
+
200
+ ```python
201
+ from pathlib import Path
202
+ from whisper_smith.diarize import diarize_audio
203
+
204
+ result = diarize_audio(Path("data/sample.m4a"))
205
+
206
+ for segment in result.segments:
207
+ print(segment.start, segment.end, segment.speaker)
208
+ ```
209
+
210
+ `diarize_audio` uses `HUGGINGFACE_TOKEN` from the environment, or accepts
211
+ `hf_token="..."` explicitly.
212
+
213
+ The default local model is `pyannote/speaker-diarization-3.1`, which is compatible
214
+ with the Intel macOS dependency set. You may pass a different model explicitly
215
+ from Python when running on a newer platform.
216
+
217
+ ## Notes
218
+
219
+ - If `--output` is omitted, the transcript is printed to stdout.
220
+ - If `--format` is omitted, the format is inferred from the `--output` file extension when possible.
221
+ - If an output file already exists, add `--overwrite` to replace it.
222
+ - Transcription uses a timestamp-capable OpenAI model by default so JSON, SRT,
223
+ and VTT outputs have segment timestamps.
224
+ - For large audio files, `whisper-smith` automatically splits audio into chunks and
225
+ merges transcript text.
226
+ - If diarization fails with `torchaudio` missing `AudioMetaData`, refresh the
227
+ optional diarization dependencies with `uv lock --upgrade-package torch
228
+ --upgrade-package torchaudio` and then `uv sync --extra diarize`.
229
+
230
+ ## Development
231
+
232
+ Run tests:
233
+
234
+ ```bash
235
+ pytest
236
+ ```
@@ -0,0 +1,199 @@
1
+ # whisper-smith
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/whisper-smith)](https://pypi.org/project/whisper-smith/)
4
+ [![Python versions](https://img.shields.io/pypi/pyversions/whisper-smith)](https://pypi.org/project/whisper-smith/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/yeiichi/whisper-smith/blob/main/LICENSE)
6
+ [![Docs](https://readthedocs.org/projects/whisper-smith/badge/?version=latest)](https://whisper-smith.readthedocs.io/)
7
+
8
+ `whisper-smith` is a small Python CLI/app helper for transcribing audio files with OpenAI speech-to-text models.
9
+
10
+ ## Features
11
+
12
+ - Transcribe local audio files
13
+ - CLI-first workflow for quick terminal use
14
+ - Output as `txt`, `json`, `srt`, or `vtt`
15
+ - Automatically infer output format from output file extension
16
+ - Load environment variables from `.env`
17
+
18
+ ## Requirements
19
+
20
+ - Python `3.10+`
21
+ - An OpenAI API key (`OPENAI_API_KEY`)
22
+ - For large-file fallback: either system `ffmpeg` in `PATH`, or Python package `imageio-ffmpeg`
23
+ - For optional speaker diarization: a Hugging Face token (`HUGGINGFACE_TOKEN`) and `pyannote.audio`
24
+
25
+ ## Installation
26
+
27
+ ### Option 1: uv (recommended)
28
+
29
+ ```bash
30
+ uv sync
31
+ ```
32
+
33
+ ### Option 2: pip
34
+
35
+ ```bash
36
+ pip install -e .
37
+ ```
38
+
39
+ ### Optional speaker diarization dependencies
40
+
41
+ ```bash
42
+ uv sync --extra diarize
43
+ ```
44
+
45
+ or:
46
+
47
+ ```bash
48
+ pip install -e ".[diarize]"
49
+ ```
50
+
51
+ ## Configuration
52
+
53
+ Set your OpenAI API key (and, for optional speaker diarization, your Hugging Face token) in the environment or in a `.env` file:
54
+
55
+ ```bash
56
+ export OPENAI_API_KEY="your_api_key_here"
57
+ export HUGGINGFACE_TOKEN="your_huggingface_token_here"
58
+ ```
59
+
60
+ Or create a `.env` file in the project root:
61
+
62
+ ```env
63
+ OPENAI_API_KEY=your_api_key_here
64
+ HUGGINGFACE_TOKEN=your_huggingface_token_here
65
+ ```
66
+
67
+ ## CLI Usage Guide
68
+
69
+ Basic command:
70
+
71
+ ```bash
72
+ whisper-smith <audio_path>
73
+ ```
74
+
75
+ Show help:
76
+
77
+ ```bash
78
+ whisper-smith --help
79
+ ```
80
+
81
+ ### 1) Print transcript to terminal (default `txt`)
82
+
83
+ ```bash
84
+ whisper-smith data/sample.m4a
85
+ ```
86
+
87
+ ### 2) Save transcript to a file
88
+
89
+ ```bash
90
+ whisper-smith data/sample.m4a --output data/sample.txt
91
+ ```
92
+
93
+ ### 3) Choose output format explicitly
94
+
95
+ ```bash
96
+ whisper-smith data/sample.m4a --format json --output data/sample.json
97
+ ```
98
+
99
+ Supported CLI formats: `txt`, `json`, `srt`, `vtt`
100
+
101
+ ### 4) Let format be inferred from output extension
102
+
103
+ ```bash
104
+ whisper-smith data/sample.m4a --output data/sample.srt
105
+ ```
106
+
107
+ ### 5) Overwrite existing file
108
+
109
+ ```bash
110
+ whisper-smith data/sample.m4a --output data/sample.txt --overwrite
111
+ ```
112
+
113
+ ### 6) Run speaker diarization
114
+
115
+ ```bash
116
+ whisper-smith data/sample.m4a --diarize --output data/sample.diarization.json
117
+ ```
118
+
119
+ Diarization currently supports JSON output only. Optional speaker hints:
120
+
121
+ ```bash
122
+ whisper-smith data/sample.m4a --diarize --format json --num-speakers 2
123
+ ```
124
+
125
+ ### 7) Create speaker-aligned transcript JSON
126
+
127
+ Run the full pipeline from one audio file:
128
+
129
+ ```bash
130
+ whisper-smith data/sample.m4a --align --output data/sample.aligned.json
131
+ ```
132
+
133
+ This writes the main aligned transcript JSON to `data/sample.aligned.json` and
134
+ also writes intermediate artifacts beside it:
135
+
136
+ ```text
137
+ data/sample.transcript.json
138
+ data/sample.diarization.json
139
+ ```
140
+
141
+ To put the intermediate artifacts in a separate directory:
142
+
143
+ ```bash
144
+ whisper-smith data/sample.m4a --align --output data/sample.aligned.json --artifacts-dir data/artifacts
145
+ ```
146
+
147
+ ## Python Usage
148
+
149
+ ```python
150
+ from pathlib import Path
151
+ from whisper_smith.transcribe import transcribe_audio
152
+ from whisper_smith.exporters import export_transcript
153
+
154
+ result = transcribe_audio(Path("data/sample.m4a"))
155
+ print(result.text)
156
+
157
+ srt = export_transcript(result, "srt")
158
+ Path("data/sample.srt").write_text(srt, encoding="utf-8")
159
+ ```
160
+
161
+ ### Speaker diarization
162
+
163
+ ```python
164
+ from pathlib import Path
165
+ from whisper_smith.diarize import diarize_audio
166
+
167
+ result = diarize_audio(Path("data/sample.m4a"))
168
+
169
+ for segment in result.segments:
170
+ print(segment.start, segment.end, segment.speaker)
171
+ ```
172
+
173
+ `diarize_audio` uses `HUGGINGFACE_TOKEN` from the environment, or accepts
174
+ `hf_token="..."` explicitly.
175
+
176
+ The default local model is `pyannote/speaker-diarization-3.1`, which is compatible
177
+ with the Intel macOS dependency set. You may pass a different model explicitly
178
+ from Python when running on a newer platform.
179
+
180
+ ## Notes
181
+
182
+ - If `--output` is omitted, the transcript is printed to stdout.
183
+ - If `--format` is omitted, the format is inferred from the `--output` file extension when possible.
184
+ - If an output file already exists, add `--overwrite` to replace it.
185
+ - Transcription uses a timestamp-capable OpenAI model by default so JSON, SRT,
186
+ and VTT outputs have segment timestamps.
187
+ - For large audio files, `whisper-smith` automatically splits audio into chunks and
188
+ merges transcript text.
189
+ - If diarization fails with `torchaudio` missing `AudioMetaData`, refresh the
190
+ optional diarization dependencies with `uv lock --upgrade-package torch
191
+ --upgrade-package torchaudio` and then `uv sync --extra diarize`.
192
+
193
+ ## Development
194
+
195
+ Run tests:
196
+
197
+ ```bash
198
+ pytest
199
+ ```
File without changes
@@ -0,0 +1,18 @@
1
+ .PHONY: help clean html
2
+
3
+ SPHINXOPTS ?=
4
+ SPHINXBUILD ?= sphinx-build
5
+ SOURCEDIR = source
6
+ BUILDDIR = build
7
+
8
+ help:
9
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
10
+
11
+ clean:
12
+ @rm -rf "$(BUILDDIR)"
13
+
14
+ html:
15
+ @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
16
+
17
+ %:
18
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
@@ -0,0 +1,3 @@
1
+ sphinx>=7.4
2
+ furo>=2024.8.6
3
+ myst-parser>=4.0.0
@@ -0,0 +1,47 @@
1
+ Aligned JSON Workflow
2
+ =====================
3
+
4
+ The aligned JSON workflow is the main product flow for combining transcription
5
+ and diarization.
6
+
7
+ Run the pipeline
8
+ ----------------
9
+
10
+ .. code-block:: bash
11
+
12
+ whisper-smith data/sample.m4a --align --output data/sample.aligned.json
13
+
14
+ Outputs:
15
+
16
+ .. code-block:: text
17
+
18
+ data/sample.aligned.json
19
+ data/sample.transcript.json
20
+ data/sample.diarization.json
21
+
22
+ Output shape
23
+ ------------
24
+
25
+ Each aligned transcript segment contains timestamps, text, and the assigned
26
+ speaker label:
27
+
28
+ .. code-block:: json
29
+
30
+ {
31
+ "segments": [
32
+ {
33
+ "start": 0.0,
34
+ "end": 7.08,
35
+ "text": "Hello world.",
36
+ "speaker": "SPEAKER_01"
37
+ }
38
+ ],
39
+ "text": "Hello world."
40
+ }
41
+
42
+ How speakers are assigned
43
+ -------------------------
44
+
45
+ ``assign_speakers`` compares each transcript segment with diarization segments
46
+ and chooses the speaker with the largest time overlap. If no diarization segment
47
+ overlaps, the transcript segment keeps its existing speaker value.
@@ -0,0 +1,6 @@
1
+ Alignment
2
+ =========
3
+
4
+ .. automodule:: whisper_smith.align
5
+ :members: assign_speakers
6
+ :undoc-members:
@@ -0,0 +1,6 @@
1
+ Diarization
2
+ ===========
3
+
4
+ .. automodule:: whisper_smith.diarize
5
+ :members: diarize_audio, diarize_file, from_pyannote_output
6
+ :undoc-members:
@@ -0,0 +1,6 @@
1
+ Exporters
2
+ =========
3
+
4
+ .. automodule:: whisper_smith.exporters
5
+ :members:
6
+ :undoc-members:
@@ -0,0 +1,11 @@
1
+ API Reference
2
+ =============
3
+
4
+ .. toctree::
5
+ :maxdepth: 1
6
+
7
+ models
8
+ transcribe
9
+ diarize
10
+ align
11
+ exporters
@@ -0,0 +1,6 @@
1
+ Models
2
+ ======
3
+
4
+ .. automodule:: whisper_smith.models
5
+ :members:
6
+ :undoc-members:
@@ -0,0 +1,6 @@
1
+ Transcription
2
+ =============
3
+
4
+ .. automodule:: whisper_smith.transcribe
5
+ :members: transcribe_audio, transcribe_file, from_openai_response
6
+ :undoc-members: