opensono 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
opensono-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 OpenSono
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,201 @@
1
+ Metadata-Version: 2.4
2
+ Name: opensono
3
+ Version: 0.1.0
4
+ Summary: Open-source audio transcription with speaker diarization
5
+ Author-email: OpenSono <hello@opensono.ai>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://opensono.ai
8
+ Project-URL: Repository, https://github.com/penkow/opensono
9
+ Project-URL: Issues, https://github.com/penkow/opensono/issues
10
+ Keywords: transcription,whisper,diarization,speech-to-text,nemo
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: faster-whisper
24
+ Requires-Dist: nemo_toolkit[asr]
25
+ Requires-Dist: soundfile
26
+ Requires-Dist: librosa
27
+ Requires-Dist: numpy
28
+ Dynamic: license-file
29
+
30
+ # OpenSono
31
+
32
+ **Open-source audio transcription with speaker diarization.**
33
+
34
+ Transcribe audio files with word-level timestamps and automatic speaker identification using [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) and [NVIDIA NeMo Sortformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/speaker_diarization/intro.html).
35
+
36
+ > This is the Python CLI companion to [OpenSono WebApp](https://opensono.vercel.app) — the free, browser-based transcription tool.
37
+
38
+ ## Features
39
+
40
+ - **Accurate transcription** — Powered by Whisper large-v3
41
+ - **Speaker diarization** — Automatically identifies up to 4 speakers using NVIDIA Sortformer
42
+ - **Word-level timestamps** — Precise timing for every word
43
+ - **Multiple output formats** — Plain text, VTT subtitles, or JSON
44
+ - **Auto language detection** — Supports 99+ languages
45
+ - **Colored terminal output** — Speaker-coded output for easy reading
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install opensono
51
+ ```
52
+
53
+ > **Note:** The NeMo toolkit has additional system dependencies. See the [NeMo installation guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/getting-started.html) for details.
54
+
55
+ Requires Python 3.10+. A CUDA-capable GPU is recommended but not required (see the CPU-only usage below).
56
+
57
+ ### From source
58
+
59
+ ```bash
60
+ git clone https://github.com/penkow/opensono
61
+ cd opensono
62
+ pip install .
63
+ ```
64
+
65
+ ## Usage
66
+
67
+ After installing, the `opensono` command is available anywhere in your terminal.
68
+
69
+ ### Basic transcription with speaker diarization
70
+
71
+ ```bash
72
+ opensono meeting.wav
73
+ ```
74
+
75
+ ### Transcription only (no diarization)
76
+
77
+ ```bash
78
+ opensono interview.mp3 --no-diarize
79
+ ```
80
+
81
+ ### Export as VTT subtitles
82
+
83
+ ```bash
84
+ opensono podcast.wav -f vtt -o subtitles.vtt
85
+ ```
86
+
87
+ ### Export as JSON
88
+
89
+ ```bash
90
+ opensono recording.wav -f json -o transcript.json
91
+ ```
92
+
93
+ ### Specify language (skip auto-detection)
94
+
95
+ ```bash
96
+ opensono audio.wav --language en
97
+ ```
98
+
99
+ ### Use a smaller/faster model
100
+
101
+ ```bash
102
+ opensono audio.wav --model-size base
103
+ ```
104
+
105
+ ### CPU-only
106
+
107
+ ```bash
108
+ opensono audio.wav --device cpu --compute-type int8
109
+ ```
110
+
111
+ ### Check version
112
+
113
+ ```bash
114
+ opensono --version
115
+ ```
116
+
117
+ You can also run it as a Python module:
118
+
119
+ ```bash
120
+ python -m opensono audio.wav
121
+ ```
122
+
123
+ ## Options
124
+
125
+ | Flag | Default | Description |
126
+ |------|---------|-------------|
127
+ | `--model-size` | `large-v3` | Whisper model size (`tiny`, `base`, `small`, `medium`, `large-v3`) |
128
+ | `--device` | `cuda` | Compute device (`cuda` or `cpu`) |
129
+ | `--compute-type` | `float16` | Precision (`float16`, `int8`, `float32`) |
130
+ | `--language` | auto-detect | Language code (e.g. `en`, `fr`, `de`) |
131
+ | `--format`, `-f` | `text` | Output format (`text`, `vtt`, `json`) |
132
+ | `--output`, `-o` | stdout | Output file path |
133
+ | `--no-diarize` | off | Skip speaker diarization |
134
+
135
+ ## Output formats
136
+
137
+ ### Text (default)
138
+
139
+ ```
140
+ Speaker 0 [0:00 - 0:03]
141
+ Hello, welcome to the meeting.
142
+
143
+ Speaker 1 [0:03 - 0:07]
144
+ Thanks for having me. Let's get started.
145
+ ```
146
+
147
+ ### VTT
148
+
149
+ ```
150
+ WEBVTT
151
+
152
+ 00:00:00.000 --> 00:00:03.500
153
+ <v Speaker 0>Hello, welcome to the meeting.
154
+
155
+ 00:00:03.500 --> 00:00:07.200
156
+ <v Speaker 1>Thanks for having me. Let's get started.
157
+ ```
158
+
159
+ ### JSON
160
+
161
+ ```json
162
+ [
163
+ {
164
+ "text": "Hello, welcome to the meeting.",
165
+ "start_time": 0.0,
166
+ "end_time": 3.5,
167
+ "speaker_id": 0
168
+ }
169
+ ]
170
+ ```
171
+
172
+ ## How it works
173
+
174
+ 1. **Audio preprocessing** — Converts input to 16 kHz mono WAV
175
+ 2. **Transcription** — Faster Whisper produces word-level timestamps
176
+ 3. **Diarization** — NeMo Sortformer identifies speaker segments
177
+ 4. **Merging** — Each word is assigned to the speaker whose segment contains the word's midpoint (or to the nearest segment if none does)
178
+ 5. **Grouping** — Consecutive words from the same speaker are combined into chunks
179
+
180
+ ## Models
181
+
182
+ | Component | Model | Size |
183
+ |-----------|-------|------|
184
+ | Transcription | [Faster Whisper large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) | ~3 GB |
185
+ | Diarization | [NVIDIA Sortformer 4spk v2.1](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_streaming_sortformer_4spk-v2.1) | ~100 MB |
186
+
187
+ Models are downloaded automatically on first run and cached locally.
188
+
189
+ ## Requirements
190
+
191
+ - Python 3.10+
192
+ - CUDA-capable GPU (recommended) or CPU
193
+ - ~4 GB VRAM for GPU inference with large-v3
194
+
195
+ ## Browser version
196
+
197
+ Don't want to install anything? Use [OpenSono WebApp](https://opensono.vercel.app) — the same transcription engine running entirely in your browser. No uploads, no sign-up, completely private.
198
+
199
+ ## License
200
+
201
+ MIT — see [LICENSE](LICENSE) for details.
@@ -0,0 +1,172 @@
1
+ # OpenSono
2
+
3
+ **Open-source audio transcription with speaker diarization.**
4
+
5
+ Transcribe audio files with word-level timestamps and automatic speaker identification using [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) and [NVIDIA NeMo Sortformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/speaker_diarization/intro.html).
6
+
7
+ > This is the Python CLI companion to [OpenSono WebApp](https://opensono.vercel.app) — the free, browser-based transcription tool.
8
+
9
+ ## Features
10
+
11
+ - **Accurate transcription** — Powered by Whisper large-v3
12
+ - **Speaker diarization** — Automatically identifies up to 4 speakers using NVIDIA Sortformer
13
+ - **Word-level timestamps** — Precise timing for every word
14
+ - **Multiple output formats** — Plain text, VTT subtitles, or JSON
15
+ - **Auto language detection** — Supports 99+ languages
16
+ - **Colored terminal output** — Speaker-coded output for easy reading
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ pip install opensono
22
+ ```
23
+
24
+ > **Note:** The NeMo toolkit has additional system dependencies. See the [NeMo installation guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/getting-started.html) for details.
25
+
26
+ Requires Python 3.10+. A CUDA-capable GPU is recommended but not required (see the CPU-only usage below).
27
+
28
+ ### From source
29
+
30
+ ```bash
31
+ git clone https://github.com/penkow/opensono
32
+ cd opensono
33
+ pip install .
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ After installing, the `opensono` command is available anywhere in your terminal.
39
+
40
+ ### Basic transcription with speaker diarization
41
+
42
+ ```bash
43
+ opensono meeting.wav
44
+ ```
45
+
46
+ ### Transcription only (no diarization)
47
+
48
+ ```bash
49
+ opensono interview.mp3 --no-diarize
50
+ ```
51
+
52
+ ### Export as VTT subtitles
53
+
54
+ ```bash
55
+ opensono podcast.wav -f vtt -o subtitles.vtt
56
+ ```
57
+
58
+ ### Export as JSON
59
+
60
+ ```bash
61
+ opensono recording.wav -f json -o transcript.json
62
+ ```
63
+
64
+ ### Specify language (skip auto-detection)
65
+
66
+ ```bash
67
+ opensono audio.wav --language en
68
+ ```
69
+
70
+ ### Use a smaller/faster model
71
+
72
+ ```bash
73
+ opensono audio.wav --model-size base
74
+ ```
75
+
76
+ ### CPU-only
77
+
78
+ ```bash
79
+ opensono audio.wav --device cpu --compute-type int8
80
+ ```
81
+
82
+ ### Check version
83
+
84
+ ```bash
85
+ opensono --version
86
+ ```
87
+
88
+ You can also run it as a Python module:
89
+
90
+ ```bash
91
+ python -m opensono audio.wav
92
+ ```
93
+
94
+ ## Options
95
+
96
+ | Flag | Default | Description |
97
+ |------|---------|-------------|
98
+ | `--model-size` | `large-v3` | Whisper model size (`tiny`, `base`, `small`, `medium`, `large-v3`) |
99
+ | `--device` | `cuda` | Compute device (`cuda` or `cpu`) |
100
+ | `--compute-type` | `float16` | Precision (`float16`, `int8`, `float32`) |
101
+ | `--language` | auto-detect | Language code (e.g. `en`, `fr`, `de`) |
102
+ | `--format`, `-f` | `text` | Output format (`text`, `vtt`, `json`) |
103
+ | `--output`, `-o` | stdout | Output file path |
104
+ | `--no-diarize` | off | Skip speaker diarization |
105
+
106
+ ## Output formats
107
+
108
+ ### Text (default)
109
+
110
+ ```
111
+ Speaker 0 [0:00 - 0:03]
112
+ Hello, welcome to the meeting.
113
+
114
+ Speaker 1 [0:03 - 0:07]
115
+ Thanks for having me. Let's get started.
116
+ ```
117
+
118
+ ### VTT
119
+
120
+ ```
121
+ WEBVTT
122
+
123
+ 00:00:00.000 --> 00:00:03.500
124
+ <v Speaker 0>Hello, welcome to the meeting.
125
+
126
+ 00:00:03.500 --> 00:00:07.200
127
+ <v Speaker 1>Thanks for having me. Let's get started.
128
+ ```
129
+
130
+ ### JSON
131
+
132
+ ```json
133
+ [
134
+ {
135
+ "text": "Hello, welcome to the meeting.",
136
+ "start_time": 0.0,
137
+ "end_time": 3.5,
138
+ "speaker_id": 0
139
+ }
140
+ ]
141
+ ```
142
+
143
+ ## How it works
144
+
145
+ 1. **Audio preprocessing** — Converts input to 16 kHz mono WAV
146
+ 2. **Transcription** — Faster Whisper produces word-level timestamps
147
+ 3. **Diarization** — NeMo Sortformer identifies speaker segments
148
+ 4. **Merging** — Each word is assigned to the speaker whose segment contains the word's midpoint (or to the nearest segment if none does)
149
+ 5. **Grouping** — Consecutive words from the same speaker are combined into chunks
150
+
151
+ ## Models
152
+
153
+ | Component | Model | Size |
154
+ |-----------|-------|------|
155
+ | Transcription | [Faster Whisper large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) | ~3 GB |
156
+ | Diarization | [NVIDIA Sortformer 4spk v2.1](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_streaming_sortformer_4spk-v2.1) | ~100 MB |
157
+
158
+ Models are downloaded automatically on first run and cached locally.
159
+
160
+ ## Requirements
161
+
162
+ - Python 3.10+
163
+ - CUDA-capable GPU (recommended) or CPU
164
+ - ~4 GB VRAM for GPU inference with large-v3
165
+
166
+ ## Browser version
167
+
168
+ Don't want to install anything? Use [OpenSono WebApp](https://opensono.vercel.app) — the same transcription engine running entirely in your browser. No uploads, no sign-up, completely private.
169
+
170
+ ## License
171
+
172
+ MIT — see [LICENSE](LICENSE) for details.
@@ -0,0 +1,3 @@
1
+ """OpenSono — open-source audio transcription with speaker diarization."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,6 @@
1
+ """Entry point for `python -m opensono`."""
2
+
3
from opensono.core import main

# Run the CLI only when executed as `python -m opensono`, never on plain import.
if __name__ == "__main__":
    main()
@@ -0,0 +1,437 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Transcribe and diarize audio files using Faster Whisper + NeMo Sortformer.
4
+
5
+ Produces speaker-attributed transcription with word-level timestamps.
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import sys
11
+ import tempfile
12
+ import os
13
+ from dataclasses import dataclass, asdict
14
+ from pathlib import Path
15
+
16
+ import numpy as np
17
+ import soundfile as sf
18
+ from faster_whisper import WhisperModel
19
+
20
+
21
@dataclass
class WordTimestamp:
    """One transcribed word, its time span, and the speaker it is attributed to."""

    # Word text exactly as emitted by the transcriber.
    text: str
    # Start of the word on the audio timeline (seconds).
    start: float
    # End of the word on the audio timeline (seconds).
    end: float
    # Speaker index; stays 0 until diarization results are merged in.
    speaker_id: int = 0
27
+
28
+
29
@dataclass
class SpeakerSegment:
    """A time span that diarization attributes to a single speaker."""

    # Segment start on the audio timeline (seconds).
    start: float
    # Segment end on the audio timeline (seconds).
    end: float
    # Integer speaker index parsed from the diarizer's label.
    speaker_id: int
34
+
35
+
36
@dataclass
class TranscriptChunk:
    """A run of consecutive words spoken by one speaker.

    The field names are also the keys of the JSON output, because chunks are
    serialized with ``dataclasses.asdict``.
    """

    # Joined text of all words in the chunk.
    text: str
    # Start time of the first word in the chunk (seconds).
    start_time: float
    # End time of the last word in the chunk (seconds).
    end_time: float
    # Speaker index shared by every word in the chunk.
    speaker_id: int
42
+
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Diarization
46
+ # ---------------------------------------------------------------------------
47
+
48
def load_diarization_model():
    """Load the pretrained NeMo Sortformer diarizer and configure it for use.

    NeMo is imported inside the function, so the dependency is only loaded
    when diarization is actually requested.

    Returns:
        The ``SortformerEncLabelModel`` instance, switched to eval mode and
        with its chunked-processing settings applied.
    """
    from nemo.collections.asr.models import SortformerEncLabelModel

    diarizer = SortformerEncLabelModel.from_pretrained(
        "nvidia/diar_streaming_sortformer_4spk-v2.1"
    )
    diarizer.eval()

    # Settings for chunked / streaming-style processing, applied in order.
    # NOTE(review): units of these values are not visible here (presumably
    # model frames) — confirm against the NeMo Sortformer documentation.
    streaming_settings = (
        ("chunk_len", 340),
        ("chunk_right_context", 40),
        ("fifo_len", 40),
        ("spkcache_update_period", 300),
    )
    for setting_name, setting_value in streaming_settings:
        setattr(diarizer.sortformer_modules, setting_name, setting_value)

    return diarizer
64
+
65
+
66
def diarize_audio(diar_model, audio_path: str) -> list[SpeakerSegment]:
    """Run diarization on one audio file and return its speaker segments.

    Args:
        diar_model: Object exposing ``diarize(audio=[...], batch_size=1)``,
            as returned by ``load_diarization_model``.
        audio_path: Path of the audio file to diarize.

    Returns:
        One ``SpeakerSegment`` per segment reported for the file, in the
        order the model returned them. Empty if nothing was predicted.

    Raises:
        ValueError: If a string segment has fewer than three fields or a
            field cannot be converted to a number.
    """
    predicted = diar_model.diarize(audio=[audio_path], batch_size=1)
    if not predicted:
        return []

    segments: list[SpeakerSegment] = []
    for seg in predicted[0]:
        if isinstance(seg, str):
            # String segments come as three whitespace-separated fields:
            # bare values ("0.00 1.50 speaker_0") or key=value pairs
            # ("start=0.00 end=1.50 speaker=speaker_0"). Taking the text
            # after the last "=" handles both.
            # NOTE(review): confirm the exact format against the installed
            # NeMo version.
            fields = seg.strip().split()
            if not fields:
                continue  # blank line, nothing to parse
            if len(fields) < 3:
                raise ValueError(f"Unrecognized diarization segment: {seg!r}")
            start_txt, end_txt, speaker_label = (
                field.split("=")[-1] for field in fields[:3]
            )
            start = float(start_txt)
            end = float(end_txt)
        else:
            # Object form: expects .start and .end, plus .speaker or .speaker_id.
            start = float(seg.start)
            end = float(seg.end)
            speaker_label = (
                str(seg.speaker) if hasattr(seg, "speaker") else str(seg.speaker_id)
            )

        # "speaker_3" -> 3; a bare "3" splits to ["3"] and is parsed the same way.
        speaker_id = int(speaker_label.split("_")[-1])
        segments.append(SpeakerSegment(start=start, end=end, speaker_id=speaker_id))

    return segments
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Transcription
95
+ # ---------------------------------------------------------------------------
96
+
97
def transcribe_audio(
    whisper_model: WhisperModel,
    audio_path: str,
    language: str | None = None,
) -> tuple[list[WordTimestamp], str]:
    """Transcribe an audio file into word-level timestamps.

    Args:
        whisper_model: Loaded Faster Whisper model.
        audio_path: Path of the audio file to transcribe.
        language: Language code to force; ``None`` lets the model detect it.

    Returns:
        ``(words, detected_language)``: every word of every segment as a
        ``WordTimestamp``, and the language reported by the model.
    """
    segment_stream, info = whisper_model.transcribe(
        audio_path,
        beam_size=5,
        word_timestamps=True,
        language=language,
    )

    detected_lang = info.language
    # Progress goes to stderr so stdout carries only the transcript.
    print(
        f"Detected language: {detected_lang} "
        f"(probability {info.language_probability:.2f})",
        file=sys.stderr,
    )

    # Segments without word data are skipped.
    words: list[WordTimestamp] = [
        WordTimestamp(text=word.word, start=word.start, end=word.end)
        for segment in segment_stream
        if segment.words
        for word in segment.words
    ]

    return words, detected_lang
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # Merging (mirrors the web app logic)
130
+ # ---------------------------------------------------------------------------
131
+
132
def merge_speakers_with_words(
    speaker_segments: list[SpeakerSegment],
    words: list[WordTimestamp],
) -> list[WordTimestamp]:
    """Attribute every word to a speaker using the diarization segments.

    A word goes to the earliest-starting segment that contains the word's
    midpoint. If none does, it goes to the segment whose own midpoint is
    closest. With no segments at all, every word is assigned speaker 0.

    Returns:
        New ``WordTimestamp`` objects in the input order; inputs are not
        modified.
    """
    if not words:
        return []
    if not speaker_segments:
        return [WordTimestamp(w.text, w.start, w.end, 0) for w in words]

    ordered = sorted(speaker_segments, key=lambda seg: seg.start)

    def _speaker_for(mid: float) -> int:
        # First segment (by start time) that contains the midpoint, if any.
        containing = next(
            (seg for seg in ordered if seg.start <= mid <= seg.end),
            None,
        )
        if containing is not None:
            return containing.speaker_id
        # Otherwise fall back to the segment with the nearest centre.
        closest = min(
            ordered,
            key=lambda seg: abs(mid - (seg.start + seg.end) / 2),
        )
        return closest.speaker_id

    return [
        WordTimestamp(w.text, w.start, w.end, _speaker_for((w.start + w.end) / 2))
        for w in words
    ]
167
+
168
+
169
def group_words_into_chunks(
    words: list[WordTimestamp],
) -> list[TranscriptChunk]:
    """Group consecutive words by speaker into transcript chunks.

    A new chunk starts whenever ``speaker_id`` changes. Each chunk spans
    from the start of its first word to the end of its last word.

    Word texts are stripped before joining. Faster Whisper word tokens
    usually carry their own leading space, so joining them unstripped put
    two spaces between words.

    Args:
        words: Words in timeline order, each with a ``speaker_id``.

    Returns:
        Chunks in timeline order; empty if ``words`` is empty.
    """
    if not words:
        return []

    def _join(tokens: list[str]) -> str:
        # Drop per-token padding and empty tokens, then single-space join.
        stripped = (token.strip() for token in tokens)
        return " ".join(token for token in stripped if token)

    chunks: list[TranscriptChunk] = []
    current_words: list[str] = [words[0].text]
    current_start = words[0].start
    current_end = words[0].end
    current_speaker = words[0].speaker_id

    for w in words[1:]:
        if w.speaker_id != current_speaker:
            # Speaker changed: close the running chunk and start a new one.
            chunks.append(TranscriptChunk(
                text=_join(current_words),
                start_time=current_start,
                end_time=current_end,
                speaker_id=current_speaker,
            ))
            current_words = [w.text]
            current_start = w.start
            current_end = w.end
            current_speaker = w.speaker_id
        else:
            current_words.append(w.text)
            current_end = w.end

    # Flush the final chunk.
    chunks.append(TranscriptChunk(
        text=_join(current_words),
        start_time=current_start,
        end_time=current_end,
        speaker_id=current_speaker,
    ))

    return chunks
206
+
207
+
208
def merge_consecutive_chunks(
    chunks: list[TranscriptChunk],
    gap_threshold: float = 1.0,
) -> list[TranscriptChunk]:
    """Fuse neighbouring chunks of the same speaker separated by a short gap.

    Args:
        chunks: Chunks in timeline order.
        gap_threshold: A chunk is merged into its predecessor only if the
            gap between them is strictly below this many seconds.

    Returns:
        A new list of new chunk objects. When ``chunks`` has zero or one
        element, the input list itself is returned.
    """
    if len(chunks) <= 1:
        return chunks

    first, *rest = chunks
    combined = [
        TranscriptChunk(first.text, first.start_time, first.end_time, first.speaker_id)
    ]

    for nxt in rest:
        last = combined[-1]
        silence = nxt.start_time - last.end_time
        if nxt.speaker_id == last.speaker_id and silence < gap_threshold:
            # Same speaker, short pause: extend the previous chunk in place.
            combined[-1] = TranscriptChunk(
                text=f"{last.text} {nxt.text}",
                start_time=last.start_time,
                end_time=max(last.end_time, nxt.end_time),
                speaker_id=last.speaker_id,
            )
            continue
        combined.append(
            TranscriptChunk(nxt.text, nxt.start_time, nxt.end_time, nxt.speaker_id)
        )

    return combined
237
+
238
+
239
+ # ---------------------------------------------------------------------------
240
+ # Audio helpers
241
+ # ---------------------------------------------------------------------------
242
+
243
def ensure_wav_16k_mono(audio_path: str) -> str:
    """Return a path to a 16 kHz mono WAV version of ``audio_path``.

    If the input is already a ``.wav`` file at 16 kHz with a single channel,
    its path is returned unchanged. Otherwise the audio is downmixed by
    averaging channels, resampled with librosa when the rate differs, and
    written to a new temporary ``.wav`` file.

    Args:
        audio_path: Path of the source audio file.

    Returns:
        ``audio_path`` itself, or the path of a temporary file that the
        caller must delete.
    """
    data, sr = sf.read(audio_path)

    is_mono = data.ndim <= 1
    if sr == 16000 and is_mono and audio_path.lower().endswith(".wav"):
        return audio_path

    if not is_mono:
        data = data.mean(axis=1)

    if sr != 16000:
        # Imported lazily: only needed when resampling is required.
        import librosa
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)

    # Reserve a unique path, then close our handle before soundfile writes to
    # it. An open handle leaks a descriptor and blocks reopening on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name

    try:
        sf.write(out_path, data, 16000)
    except BaseException:
        # Do not leave a stray temp file behind if the write fails.
        if os.path.exists(out_path):
            os.unlink(out_path)
        raise

    return out_path
270
+
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # Output formatters
274
+ # ---------------------------------------------------------------------------
275
+
276
def format_time(seconds: float) -> str:
    """Format a duration as ``M:SS``, dropping any fractional second.

    Minutes are not wrapped into hours, so 3600 seconds renders as ``60:00``.
    """
    whole_seconds = int(seconds)
    return f"{whole_seconds // 60}:{whole_seconds % 60:02d}"
279
+
280
+
281
def format_vtt_time(seconds: float) -> str:
    """Format a time offset as a WebVTT timestamp ``HH:MM:SS.mmm``.

    The value is rounded to the nearest millisecond first and then split
    with integer arithmetic. Truncating the fractional part separately lost
    a millisecond to float error (1.001 rendered as ``00:00:01.000``).

    Args:
        seconds: Non-negative offset in seconds.

    Returns:
        The zero-padded timestamp string.
    """
    total_ms = int(round(seconds * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
287
+
288
+
289
def output_text(chunks: list[TranscriptChunk]) -> str:
    """Render chunks as plain text.

    Each chunk produces three lines: a ``Speaker N [start - end]`` header,
    the chunk text, and an empty separator line.
    """
    rendered: list[str] = []
    for chunk in chunks:
        span = f"[{format_time(chunk.start_time)} - {format_time(chunk.end_time)}]"
        rendered.extend((
            f"Speaker {chunk.speaker_id} {span}",
            f" {chunk.text}",
            "",
        ))
    return "\n".join(rendered)
297
+
298
+
299
def output_vtt(chunks: list[TranscriptChunk]) -> str:
    """Render chunks as a WebVTT document.

    The output starts with the ``WEBVTT`` header and a blank line. Each
    chunk then becomes one cue: a timing line, the text tagged with a
    ``<v Speaker N>`` voice span, and a blank line.
    """
    rendered = ["WEBVTT", ""]
    for chunk in chunks:
        cue_start = format_vtt_time(chunk.start_time)
        cue_end = format_vtt_time(chunk.end_time)
        rendered += [
            f"{cue_start} --> {cue_end}",
            f"<v Speaker {chunk.speaker_id}>{chunk.text}",
            "",
        ]
    return "\n".join(rendered)
308
+
309
+
310
def output_json(chunks: list[TranscriptChunk]) -> str:
    """Serialize chunks as a JSON array of objects, indented by two spaces.

    Each object carries the ``TranscriptChunk`` field names as keys.
    """
    records = [asdict(chunk) for chunk in chunks]
    return json.dumps(records, indent=2)
312
+
313
+
314
+ # ---------------------------------------------------------------------------
315
+ # Main
316
+ # ---------------------------------------------------------------------------
317
+
318
# ANSI bright foreground colour codes, cycled by speaker index in print_colored.
SPEAKER_COLORS = [
    "\033[94m",  # bright blue
    "\033[92m",  # bright green
    "\033[95m",  # bright magenta
    "\033[93m",  # bright yellow
    "\033[91m",  # bright red
]
# ANSI sequence that restores the terminal's default attributes.
RESET = "\033[0m"
326
+
327
+
328
def print_colored(chunks: list[TranscriptChunk]) -> None:
    """Print chunks to stdout with a colour-coded header per speaker.

    The colour is chosen by ``speaker_id`` modulo the palette size, so
    speaker indices beyond the palette reuse colours.
    """
    palette_size = len(SPEAKER_COLORS)
    for chunk in chunks:
        tint = SPEAKER_COLORS[chunk.speaker_id % palette_size]
        span = f"[{format_time(chunk.start_time)} - {format_time(chunk.end_time)}]"
        print(f"{tint}Speaker {chunk.speaker_id} {span}{RESET}")
        print(f" {chunk.text}")
        print()
335
+
336
+
337
def main():
    """Command-line entry point: transcribe, optionally diarize, and emit output.

    Steps: parse arguments, convert the input to 16 kHz mono WAV if needed,
    transcribe with Faster Whisper, diarize with NeMo Sortformer unless
    ``--no-diarize`` is given, merge words with speakers into chunks, then
    write the result to ``--output`` or print it to stdout.

    Progress messages go to stderr so stdout carries only the transcript.
    Exits with status 1 if the input file does not exist. Any temporary WAV
    file created during preparation is removed on the way out.
    """
    from opensono import __version__

    parser = argparse.ArgumentParser(
        description="Transcribe and diarize audio using Faster Whisper + NeMo Sortformer",
    )
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    parser.add_argument("audio", help="Path to audio file")
    parser.add_argument(
        "--model-size", default="large-v3",
        help="Whisper model size (default: large-v3)",
    )
    parser.add_argument(
        "--device", default="cuda", choices=["cuda", "cpu"],
        help="Device to run Whisper on (default: cuda)",
    )
    parser.add_argument(
        "--compute-type", default="float16",
        help="Compute type for Whisper (default: float16, use int8 for CPU)",
    )
    parser.add_argument(
        "--language", default=None,
        help="Language code (e.g. en). Auto-detected if not set.",
    )
    parser.add_argument(
        "--output", "-o", default=None,
        help="Output file path. Prints to stdout if not set.",
    )
    parser.add_argument(
        "--format", "-f", default="text",
        choices=["text", "vtt", "json"],
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--no-diarize", action="store_true",
        help="Skip diarization (transcription only)",
    )

    args = parser.parse_args()

    audio_path = str(Path(args.audio).resolve())
    if not Path(audio_path).exists():
        print(f"Error: file not found: {audio_path}", file=sys.stderr)
        sys.exit(1)

    # Prepare audio
    print("Preparing audio...", file=sys.stderr)
    wav_path = ensure_wav_16k_mono(audio_path)
    # A different path means a temporary file was created and must be removed.
    tmp_created = wav_path != audio_path

    try:
        # Load Whisper
        print(f"Loading Whisper model ({args.model_size})...", file=sys.stderr)
        whisper = WhisperModel(
            args.model_size, device=args.device, compute_type=args.compute_type
        )

        # Transcribe
        print("Transcribing...", file=sys.stderr)
        words, detected_lang = transcribe_audio(whisper, wav_path, args.language)
        print(f" {len(words)} words transcribed", file=sys.stderr)

        # Diarize; with --no-diarize the list stays empty and every word
        # ends up attributed to speaker 0.
        speaker_segments: list[SpeakerSegment] = []
        if not args.no_diarize:
            print("Loading diarization model...", file=sys.stderr)
            diar_model = load_diarization_model()
            print("Diarizing...", file=sys.stderr)
            speaker_segments = diarize_audio(diar_model, wav_path)
            print(f" {len(speaker_segments)} speaker segments found", file=sys.stderr)

        # Merge
        words_with_speakers = merge_speakers_with_words(speaker_segments, words)
        chunks = group_words_into_chunks(words_with_speakers)
        chunks = merge_consecutive_chunks(chunks)

        # Output
        if args.format == "vtt":
            result = output_vtt(chunks)
        elif args.format == "json":
            result = output_json(chunks)
        else:
            result = output_text(chunks)

        if args.output:
            # Write UTF-8 explicitly: the locale default can fail on or
            # corrupt non-ASCII transcripts, and WebVTT files must be UTF-8.
            Path(args.output).write_text(result, encoding="utf-8")
            print(f"Saved to {args.output}", file=sys.stderr)
        else:
            # Use colored output for terminal text format
            if args.format == "text" and sys.stdout.isatty():
                print_colored(chunks)
            else:
                print(result)

    finally:
        if tmp_created and os.path.exists(wav_path):
            os.unlink(wav_path)
434
+
435
+
436
+ if __name__ == "__main__":
437
+ main()
@@ -0,0 +1,201 @@
1
+ Metadata-Version: 2.4
2
+ Name: opensono
3
+ Version: 0.1.0
4
+ Summary: Open-source audio transcription with speaker diarization
5
+ Author-email: OpenSono <hello@opensono.ai>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://opensono.ai
8
+ Project-URL: Repository, https://github.com/penkow/opensono
9
+ Project-URL: Issues, https://github.com/penkow/opensono/issues
10
+ Keywords: transcription,whisper,diarization,speech-to-text,nemo
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: faster-whisper
24
+ Requires-Dist: nemo_toolkit[asr]
25
+ Requires-Dist: soundfile
26
+ Requires-Dist: librosa
27
+ Requires-Dist: numpy
28
+ Dynamic: license-file
29
+
30
+ # OpenSono
31
+
32
+ **Open-source audio transcription with speaker diarization.**
33
+
34
+ Transcribe audio files with word-level timestamps and automatic speaker identification using [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) and [NVIDIA NeMo Sortformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/speaker_diarization/intro.html).
35
+
36
+ > This is the Python CLI companion to [OpenSono WebApp](https://opensono.vercel.app) — the free, browser-based transcription tool.
37
+
38
+ ## Features
39
+
40
+ - **Accurate transcription** — Powered by Whisper large-v3
41
+ - **Speaker diarization** — Automatically identifies up to 4 speakers using NVIDIA Sortformer
42
+ - **Word-level timestamps** — Precise timing for every word
43
+ - **Multiple output formats** — Plain text, VTT subtitles, or JSON
44
+ - **Auto language detection** — Supports 99+ languages
45
+ - **Colored terminal output** — Speaker-coded output for easy reading
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install opensono
51
+ ```
52
+
53
+ > **Note:** The NeMo toolkit has additional system dependencies. See the [NeMo installation guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/getting-started.html) for details.
54
+
55
+ Requires Python 3.10+. A CUDA-capable GPU is recommended but not required (see the CPU-only usage below).
56
+
57
+ ### From source
58
+
59
+ ```bash
60
+ git clone https://github.com/penkow/opensono
61
+ cd opensono
62
+ pip install .
63
+ ```
64
+
65
+ ## Usage
66
+
67
+ After installing, the `opensono` command is available anywhere in your terminal.
68
+
69
+ ### Basic transcription with speaker diarization
70
+
71
+ ```bash
72
+ opensono meeting.wav
73
+ ```
74
+
75
+ ### Transcription only (no diarization)
76
+
77
+ ```bash
78
+ opensono interview.mp3 --no-diarize
79
+ ```
80
+
81
+ ### Export as VTT subtitles
82
+
83
+ ```bash
84
+ opensono podcast.wav -f vtt -o subtitles.vtt
85
+ ```
86
+
87
+ ### Export as JSON
88
+
89
+ ```bash
90
+ opensono recording.wav -f json -o transcript.json
91
+ ```
92
+
93
+ ### Specify language (skip auto-detection)
94
+
95
+ ```bash
96
+ opensono audio.wav --language en
97
+ ```
98
+
99
+ ### Use a smaller/faster model
100
+
101
+ ```bash
102
+ opensono audio.wav --model-size base
103
+ ```
104
+
105
+ ### CPU-only
106
+
107
+ ```bash
108
+ opensono audio.wav --device cpu --compute-type int8
109
+ ```
110
+
111
+ ### Check version
112
+
113
+ ```bash
114
+ opensono --version
115
+ ```
116
+
117
+ You can also run it as a Python module:
118
+
119
+ ```bash
120
+ python -m opensono audio.wav
121
+ ```
122
+
123
+ ## Options
124
+
125
+ | Flag | Default | Description |
126
+ |------|---------|-------------|
127
+ | `--model-size` | `large-v3` | Whisper model size (`tiny`, `base`, `small`, `medium`, `large-v3`) |
128
+ | `--device` | `cuda` | Compute device (`cuda` or `cpu`) |
129
+ | `--compute-type` | `float16` | Precision (`float16`, `int8`, `float32`) |
130
+ | `--language` | auto-detect | Language code (e.g. `en`, `fr`, `de`) |
131
+ | `--format`, `-f` | `text` | Output format (`text`, `vtt`, `json`) |
132
+ | `--output`, `-o` | stdout | Output file path |
133
+ | `--no-diarize` | off | Skip speaker diarization |
134
+
135
+ ## Output formats
136
+
137
+ ### Text (default)
138
+
139
+ ```
140
+ Speaker 0 [0:00 - 0:03]
141
+ Hello, welcome to the meeting.
142
+
143
+ Speaker 1 [0:03 - 0:07]
144
+ Thanks for having me. Let's get started.
145
+ ```
146
+
147
+ ### VTT
148
+
149
+ ```
150
+ WEBVTT
151
+
152
+ 00:00:00.000 --> 00:00:03.500
153
+ <v Speaker 0>Hello, welcome to the meeting.
154
+
155
+ 00:00:03.500 --> 00:00:07.200
156
+ <v Speaker 1>Thanks for having me. Let's get started.
157
+ ```
158
+
159
+ ### JSON
160
+
161
+ ```json
162
+ [
163
+ {
164
+ "text": "Hello, welcome to the meeting.",
165
+ "start_time": 0.0,
166
+ "end_time": 3.5,
167
+ "speaker_id": 0
168
+ }
169
+ ]
170
+ ```
171
+
172
+ ## How it works
173
+
174
+ 1. **Audio preprocessing** — Converts input to 16 kHz mono WAV
175
+ 2. **Transcription** — Faster Whisper produces word-level timestamps
176
+ 3. **Diarization** — NeMo Sortformer identifies speaker segments
177
+ 4. **Merging** — Each word is assigned to the speaker whose segment contains the word's midpoint (or to the nearest segment if none does)
178
+ 5. **Grouping** — Consecutive words from the same speaker are combined into chunks
179
+
180
+ ## Models
181
+
182
+ | Component | Model | Size |
183
+ |-----------|-------|------|
184
+ | Transcription | [Faster Whisper large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) | ~3 GB |
185
+ | Diarization | [NVIDIA Sortformer 4spk v2.1](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_streaming_sortformer_4spk-v2.1) | ~100 MB |
186
+
187
+ Models are downloaded automatically on first run and cached locally.
188
+
189
+ ## Requirements
190
+
191
+ - Python 3.10+
192
+ - CUDA-capable GPU (recommended) or CPU
193
+ - ~4 GB VRAM for GPU inference with large-v3
194
+
195
+ ## Browser version
196
+
197
+ Don't want to install anything? Use [OpenSono WebApp](https://opensono.vercel.app) — the same transcription engine running entirely in your browser. No uploads, no sign-up, completely private.
198
+
199
+ ## License
200
+
201
+ MIT — see [LICENSE](LICENSE) for details.
@@ -0,0 +1,12 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ opensono/__init__.py
5
+ opensono/__main__.py
6
+ opensono/core.py
7
+ opensono.egg-info/PKG-INFO
8
+ opensono.egg-info/SOURCES.txt
9
+ opensono.egg-info/dependency_links.txt
10
+ opensono.egg-info/entry_points.txt
11
+ opensono.egg-info/requires.txt
12
+ opensono.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ opensono = opensono.core:main
@@ -0,0 +1,5 @@
1
+ faster-whisper
2
+ nemo_toolkit[asr]
3
+ soundfile
4
+ librosa
5
+ numpy
@@ -0,0 +1 @@
1
+ opensono
@@ -0,0 +1,45 @@
1
[build-system]
# The [project] table uses the PEP 639 string form `license = "MIT"`.
# setuptools accepts that form from 77.0.3 onward; older releases reject it.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"
4
+
5
[project]
name = "opensono"
# Keep in sync with __version__ in opensono/__init__.py.
version = "0.1.0"
description = "Open-source audio transcription with speaker diarization"
readme = "README.md"
# SPDX license expression (PEP 639 string form).
license = "MIT"
requires-python = ">=3.10"
authors = [{ name = "OpenSono", email = "hello@opensono.ai" }]
keywords = [
    "transcription",
    "whisper",
    "diarization",
    "speech-to-text",
    "nemo",
]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Multimedia :: Sound/Audio :: Speech",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
# Runtime dependencies, currently without version bounds.
dependencies = [
    "faster-whisper",
    "nemo_toolkit[asr]",
    "soundfile",
    "librosa",
    "numpy",
]
34
+
35
# Links published as Project-URL entries in the package metadata.
[project.urls]
Homepage = "https://opensono.ai"
Repository = "https://github.com/penkow/opensono"
Issues = "https://github.com/penkow/opensono/issues"
39
+
40
# Installs the `opensono` console command, which calls main() in opensono/core.py.
[project.scripts]
opensono = "opensono.core:main"
42
+
43
# Package discovery: search the project root and pick up only `opensono` and
# its subpackages.
[tool.setuptools.packages.find]
where = ["."]
include = ["opensono*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+