opensono 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opensono-0.1.0/LICENSE +21 -0
- opensono-0.1.0/PKG-INFO +201 -0
- opensono-0.1.0/README.md +172 -0
- opensono-0.1.0/opensono/__init__.py +3 -0
- opensono-0.1.0/opensono/__main__.py +6 -0
- opensono-0.1.0/opensono/core.py +437 -0
- opensono-0.1.0/opensono.egg-info/PKG-INFO +201 -0
- opensono-0.1.0/opensono.egg-info/SOURCES.txt +12 -0
- opensono-0.1.0/opensono.egg-info/dependency_links.txt +1 -0
- opensono-0.1.0/opensono.egg-info/entry_points.txt +2 -0
- opensono-0.1.0/opensono.egg-info/requires.txt +5 -0
- opensono-0.1.0/opensono.egg-info/top_level.txt +1 -0
- opensono-0.1.0/pyproject.toml +45 -0
- opensono-0.1.0/setup.cfg +4 -0
opensono-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 OpenSono
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
opensono-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: opensono
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Open-source audio transcription with speaker diarization
|
|
5
|
+
Author-email: OpenSono <hello@opensono.ai>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://opensono.ai
|
|
8
|
+
Project-URL: Repository, https://github.com/penkow/opensono
|
|
9
|
+
Project-URL: Issues, https://github.com/penkow/opensono/issues
|
|
10
|
+
Keywords: transcription,whisper,diarization,speech-to-text,nemo
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: faster-whisper
|
|
24
|
+
Requires-Dist: nemo_toolkit[asr]
|
|
25
|
+
Requires-Dist: soundfile
|
|
26
|
+
Requires-Dist: librosa
|
|
27
|
+
Requires-Dist: numpy
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# OpenSono
|
|
31
|
+
|
|
32
|
+
**Open-source audio transcription with speaker diarization.**
|
|
33
|
+
|
|
34
|
+
Transcribe audio files with word-level timestamps and automatic speaker identification using [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) and [NVIDIA NeMo Sortformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/speaker_diarization/intro.html).
|
|
35
|
+
|
|
36
|
+
> This is the Python CLI companion to [OpenSono WebApp](https://opensono.vercel.app) — the free, browser-based transcription tool.
|
|
37
|
+
|
|
38
|
+
## Features
|
|
39
|
+
|
|
40
|
+
- **Accurate transcription** — Powered by Whisper large-v3
|
|
41
|
+
- **Speaker diarization** — Automatically identifies up to 4 speakers using NVIDIA Sortformer
|
|
42
|
+
- **Word-level timestamps** — Precise timing for every word
|
|
43
|
+
- **Multiple output formats** — Plain text, VTT subtitles, or JSON
|
|
44
|
+
- **Auto language detection** — Supports 99+ languages
|
|
45
|
+
- **Colored terminal output** — Speaker-coded output for easy reading
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install opensono
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
> **Note:** The NeMo toolkit has additional system dependencies. See the [NeMo installation guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/getting-started.html) for details.
|
|
54
|
+
|
|
55
|
+
Requires Python 3.10+. A CUDA-capable GPU is recommended but not required; CPU-only inference is supported.
|
|
56
|
+
|
|
57
|
+
### From source
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/penkow/opensono
|
|
61
|
+
cd opensono
|
|
62
|
+
pip install .
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
|
|
67
|
+
After installing, the `opensono` command is available anywhere in your terminal.
|
|
68
|
+
|
|
69
|
+
### Basic transcription with speaker diarization
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
opensono meeting.wav
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Transcription only (no diarization)
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
opensono interview.mp3 --no-diarize
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Export as VTT subtitles
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
opensono podcast.wav -f vtt -o subtitles.vtt
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Export as JSON
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
opensono recording.wav -f json -o transcript.json
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Specify language (skip auto-detection)
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
opensono audio.wav --language en
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Use a smaller/faster model
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
opensono audio.wav --model-size base
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### CPU-only
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
opensono audio.wav --device cpu --compute-type int8
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Check version
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
opensono --version
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
You can also run it as a Python module:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
python -m opensono audio.wav
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Options
|
|
124
|
+
|
|
125
|
+
| Flag | Default | Description |
|
|
126
|
+
|------|---------|-------------|
|
|
127
|
+
| `--model-size` | `large-v3` | Whisper model size (`tiny`, `base`, `small`, `medium`, `large-v3`) |
|
|
128
|
+
| `--device` | `cuda` | Compute device (`cuda` or `cpu`) |
|
|
129
|
+
| `--compute-type` | `float16` | Precision (`float16`, `int8`, `float32`) |
|
|
130
|
+
| `--language` | auto-detect | Language code (e.g. `en`, `fr`, `de`) |
|
|
131
|
+
| `--format`, `-f` | `text` | Output format (`text`, `vtt`, `json`) |
|
|
132
|
+
| `--output`, `-o` | stdout | Output file path |
|
|
133
|
+
| `--no-diarize` | off | Skip speaker diarization |
|
|
134
|
+
|
|
135
|
+
## Output formats
|
|
136
|
+
|
|
137
|
+
### Text (default)
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
Speaker 0 [0:00 - 0:03]
|
|
141
|
+
Hello, welcome to the meeting.
|
|
142
|
+
|
|
143
|
+
Speaker 1 [0:03 - 0:07]
|
|
144
|
+
Thanks for having me. Let's get started.
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### VTT
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
WEBVTT
|
|
151
|
+
|
|
152
|
+
00:00:00.000 --> 00:00:03.500
|
|
153
|
+
<v Speaker 0>Hello, welcome to the meeting.
|
|
154
|
+
|
|
155
|
+
00:00:03.500 --> 00:00:07.200
|
|
156
|
+
<v Speaker 1>Thanks for having me. Let's get started.
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### JSON
|
|
160
|
+
|
|
161
|
+
```json
|
|
162
|
+
[
|
|
163
|
+
{
|
|
164
|
+
"text": "Hello, welcome to the meeting.",
|
|
165
|
+
"start_time": 0.0,
|
|
166
|
+
"end_time": 3.5,
|
|
167
|
+
"speaker_id": 0
|
|
168
|
+
}
|
|
169
|
+
]
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## How it works
|
|
173
|
+
|
|
174
|
+
1. **Audio preprocessing** — Converts input to 16 kHz mono WAV
|
|
175
|
+
2. **Transcription** — Faster Whisper produces word-level timestamps
|
|
176
|
+
3. **Diarization** — NeMo Sortformer identifies speaker segments
|
|
177
|
+
4. **Merging** — Each word is assigned to a speaker based on temporal overlap
|
|
178
|
+
5. **Grouping** — Consecutive words from the same speaker are combined into chunks
|
|
179
|
+
|
|
180
|
+
## Models
|
|
181
|
+
|
|
182
|
+
| Component | Model | Size |
|
|
183
|
+
|-----------|-------|------|
|
|
184
|
+
| Transcription | [Faster Whisper large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) | ~3 GB |
|
|
185
|
+
| Diarization | [NVIDIA Sortformer 4spk v2.1](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_streaming_sortformer_4spk-v2.1) | ~100 MB |
|
|
186
|
+
|
|
187
|
+
Models are downloaded automatically on first run and cached locally.
|
|
188
|
+
|
|
189
|
+
## Requirements
|
|
190
|
+
|
|
191
|
+
- Python 3.10+
|
|
192
|
+
- CUDA-capable GPU (recommended) or CPU
|
|
193
|
+
- ~4 GB VRAM for GPU inference with large-v3
|
|
194
|
+
|
|
195
|
+
## Browser version
|
|
196
|
+
|
|
197
|
+
Don't want to install anything? Use [OpenSono WebApp](https://opensono.vercel.app) — the same transcription engine running entirely in your browser. No uploads, no sign-up, completely private.
|
|
198
|
+
|
|
199
|
+
## License
|
|
200
|
+
|
|
201
|
+
MIT — see [LICENSE](LICENSE) for details.
|
opensono-0.1.0/README.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# OpenSono
|
|
2
|
+
|
|
3
|
+
**Open-source audio transcription with speaker diarization.**
|
|
4
|
+
|
|
5
|
+
Transcribe audio files with word-level timestamps and automatic speaker identification using [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) and [NVIDIA NeMo Sortformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/speaker_diarization/intro.html).
|
|
6
|
+
|
|
7
|
+
> This is the Python CLI companion to [OpenSono WebApp](https://opensono.vercel.app) — the free, browser-based transcription tool.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Accurate transcription** — Powered by Whisper large-v3
|
|
12
|
+
- **Speaker diarization** — Automatically identifies up to 4 speakers using NVIDIA Sortformer
|
|
13
|
+
- **Word-level timestamps** — Precise timing for every word
|
|
14
|
+
- **Multiple output formats** — Plain text, VTT subtitles, or JSON
|
|
15
|
+
- **Auto language detection** — Supports 99+ languages
|
|
16
|
+
- **Colored terminal output** — Speaker-coded output for easy reading
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install opensono
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
> **Note:** The NeMo toolkit has additional system dependencies. See the [NeMo installation guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/getting-started.html) for details.
|
|
25
|
+
|
|
26
|
+
Requires Python 3.10+. A CUDA-capable GPU is recommended but not required; CPU-only inference is supported.
|
|
27
|
+
|
|
28
|
+
### From source
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
git clone https://github.com/penkow/opensono
|
|
32
|
+
cd opensono
|
|
33
|
+
pip install .
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
After installing, the `opensono` command is available anywhere in your terminal.
|
|
39
|
+
|
|
40
|
+
### Basic transcription with speaker diarization
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
opensono meeting.wav
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Transcription only (no diarization)
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
opensono interview.mp3 --no-diarize
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Export as VTT subtitles
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
opensono podcast.wav -f vtt -o subtitles.vtt
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Export as JSON
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
opensono recording.wav -f json -o transcript.json
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Specify language (skip auto-detection)
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
opensono audio.wav --language en
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Use a smaller/faster model
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
opensono audio.wav --model-size base
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### CPU-only
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
opensono audio.wav --device cpu --compute-type int8
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Check version
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
opensono --version
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
You can also run it as a Python module:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
python -m opensono audio.wav
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Options
|
|
95
|
+
|
|
96
|
+
| Flag | Default | Description |
|
|
97
|
+
|------|---------|-------------|
|
|
98
|
+
| `--model-size` | `large-v3` | Whisper model size (`tiny`, `base`, `small`, `medium`, `large-v3`) |
|
|
99
|
+
| `--device` | `cuda` | Compute device (`cuda` or `cpu`) |
|
|
100
|
+
| `--compute-type` | `float16` | Precision (`float16`, `int8`, `float32`) |
|
|
101
|
+
| `--language` | auto-detect | Language code (e.g. `en`, `fr`, `de`) |
|
|
102
|
+
| `--format`, `-f` | `text` | Output format (`text`, `vtt`, `json`) |
|
|
103
|
+
| `--output`, `-o` | stdout | Output file path |
|
|
104
|
+
| `--no-diarize` | off | Skip speaker diarization |
|
|
105
|
+
|
|
106
|
+
## Output formats
|
|
107
|
+
|
|
108
|
+
### Text (default)
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
Speaker 0 [0:00 - 0:03]
|
|
112
|
+
Hello, welcome to the meeting.
|
|
113
|
+
|
|
114
|
+
Speaker 1 [0:03 - 0:07]
|
|
115
|
+
Thanks for having me. Let's get started.
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### VTT
|
|
119
|
+
|
|
120
|
+
```
|
|
121
|
+
WEBVTT
|
|
122
|
+
|
|
123
|
+
00:00:00.000 --> 00:00:03.500
|
|
124
|
+
<v Speaker 0>Hello, welcome to the meeting.
|
|
125
|
+
|
|
126
|
+
00:00:03.500 --> 00:00:07.200
|
|
127
|
+
<v Speaker 1>Thanks for having me. Let's get started.
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### JSON
|
|
131
|
+
|
|
132
|
+
```json
|
|
133
|
+
[
|
|
134
|
+
{
|
|
135
|
+
"text": "Hello, welcome to the meeting.",
|
|
136
|
+
"start_time": 0.0,
|
|
137
|
+
"end_time": 3.5,
|
|
138
|
+
"speaker_id": 0
|
|
139
|
+
}
|
|
140
|
+
]
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## How it works
|
|
144
|
+
|
|
145
|
+
1. **Audio preprocessing** — Converts input to 16 kHz mono WAV
|
|
146
|
+
2. **Transcription** — Faster Whisper produces word-level timestamps
|
|
147
|
+
3. **Diarization** — NeMo Sortformer identifies speaker segments
|
|
148
|
+
4. **Merging** — Each word is assigned to a speaker based on temporal overlap
|
|
149
|
+
5. **Grouping** — Consecutive words from the same speaker are combined into chunks
|
|
150
|
+
|
|
151
|
+
## Models
|
|
152
|
+
|
|
153
|
+
| Component | Model | Size |
|
|
154
|
+
|-----------|-------|------|
|
|
155
|
+
| Transcription | [Faster Whisper large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) | ~3 GB |
|
|
156
|
+
| Diarization | [NVIDIA Sortformer 4spk v2.1](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_streaming_sortformer_4spk-v2.1) | ~100 MB |
|
|
157
|
+
|
|
158
|
+
Models are downloaded automatically on first run and cached locally.
|
|
159
|
+
|
|
160
|
+
## Requirements
|
|
161
|
+
|
|
162
|
+
- Python 3.10+
|
|
163
|
+
- CUDA-capable GPU (recommended) or CPU
|
|
164
|
+
- ~4 GB VRAM for GPU inference with large-v3
|
|
165
|
+
|
|
166
|
+
## Browser version
|
|
167
|
+
|
|
168
|
+
Don't want to install anything? Use [OpenSono WebApp](https://opensono.vercel.app) — the same transcription engine running entirely in your browser. No uploads, no sign-up, completely private.
|
|
169
|
+
|
|
170
|
+
## License
|
|
171
|
+
|
|
172
|
+
MIT — see [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Transcribe and diarize audio files using Faster Whisper + NeMo Sortformer.
|
|
4
|
+
|
|
5
|
+
Produces speaker-attributed transcription with word-level timestamps.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import json
|
|
10
|
+
import sys
|
|
11
|
+
import tempfile
|
|
12
|
+
import os
|
|
13
|
+
from dataclasses import dataclass, asdict
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
import soundfile as sf
|
|
18
|
+
from faster_whisper import WhisperModel
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class WordTimestamp:
    """One transcribed word together with its timing and speaker label."""

    # Word text as supplied by the transcriber.
    text: str
    # Start and end of the word on the audio timeline.
    start: float
    end: float
    # Speaker index; stays 0 unless a diarization result overrides it.
    speaker_id: int = 0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class SpeakerSegment:
    """A span of audio attributed to a single speaker."""

    # Span boundaries on the audio timeline.
    start: float
    end: float
    # Integer index of the speaker active during the span.
    speaker_id: int
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class TranscriptChunk:
    """A run of text spoken by one speaker, with its time span."""

    # Text of the chunk.
    text: str
    # Start of the first word and end of the last word in the chunk.
    start_time: float
    end_time: float
    # Integer index of the speaker the chunk is attributed to.
    speaker_id: int
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Diarization
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
def load_diarization_model():
    """Fetch the pretrained NeMo Sortformer diarizer and put it in eval mode.

    NeMo is imported inside the function, so it is only loaded when this
    function is actually called.

    Returns:
        The configured ``SortformerEncLabelModel`` instance.
    """
    from nemo.collections.asr.models import SortformerEncLabelModel

    diarizer = SortformerEncLabelModel.from_pretrained(
        "nvidia/diar_streaming_sortformer_4spk-v2.1"
    )
    diarizer.eval()

    # Settings for chunked / streaming-style processing.
    streaming = diarizer.sortformer_modules
    streaming.chunk_len = 340
    streaming.chunk_right_context = 40
    streaming.fifo_len = 40
    streaming.spkcache_update_period = 300

    return diarizer
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def diarize_audio(diar_model, audio_path: str) -> list[SpeakerSegment]:
    """Run diarization on one file and return its speaker segments.

    Args:
        diar_model: Model object exposing ``diarize(audio=[...], batch_size=1)``.
        audio_path: Path of the audio file passed to the model.

    Returns:
        One ``SpeakerSegment`` per entry of the first result returned by the
        model, in the order the model produced them.

    Raises:
        ValueError: If a non-blank string segment has fewer than three
            fields, or a field cannot be converted to a number.
    """
    def _field(token: str) -> str:
        # Accept both "key=value" and bare "value" tokens.
        return token.split("=", 1)[-1]

    def _speaker_index(label: str) -> int:
        # "speaker_3" -> 3; a bare "3" -> 3.
        return int(label.split("_")[-1])

    predicted = diar_model.diarize(audio=[audio_path], batch_size=1)

    segments: list[SpeakerSegment] = []
    for seg in predicted[0]:
        if isinstance(seg, str):
            # Two string layouts are handled:
            #   "start=0.00 end=1.50 speaker=speaker_0"
            #   "0.00 1.50 speaker_0"
            # NOTE(review): the bare layout is believed to be what NeMo
            # Sortformer emits -- confirm against the installed version.
            parts = seg.split()
            if not parts:
                continue  # a blank entry carries no segment
            if len(parts) < 3:
                raise ValueError(f"Unrecognized diarization segment: {seg!r}")
            start = float(_field(parts[0]))
            end = float(_field(parts[1]))
            speaker_id = _speaker_index(_field(parts[2]))
        else:
            # Assume an object with .start, .end and .speaker (or .speaker_id).
            start = float(seg.start)
            end = float(seg.end)
            speaker_label = str(seg.speaker) if hasattr(seg, "speaker") else str(seg.speaker_id)
            speaker_id = _speaker_index(speaker_label)

        segments.append(SpeakerSegment(start=start, end=end, speaker_id=speaker_id))

    return segments
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
# Transcription
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
|
|
97
|
+
def transcribe_audio(
    whisper_model: WhisperModel,
    audio_path: str,
    language: str | None = None,
) -> tuple[list[WordTimestamp], str]:
    """Transcribe a file and collect its words with timestamps.

    Args:
        whisper_model: Loaded Faster Whisper model.
        audio_path: Path of the audio file to transcribe.
        language: Language code passed through to the model; ``None`` is
            forwarded unchanged.

    Returns:
        ``(words, detected_lang)`` where ``words`` holds one
        ``WordTimestamp`` per word of every segment that has words, and
        ``detected_lang`` is the language reported by the model.
    """
    segments_iter, info = whisper_model.transcribe(
        audio_path,
        beam_size=5,
        word_timestamps=True,
        language=language,
    )

    detected_lang = info.language
    message = (
        f"Detected language: {detected_lang} "
        f"(probability {info.language_probability:.2f})"
    )
    # Progress information goes to stderr so stdout stays clean for results.
    print(message, file=sys.stderr)

    words: list[WordTimestamp] = [
        WordTimestamp(text=w.word, start=w.start, end=w.end)
        for segment in segments_iter
        if segment.words
        for w in segment.words
    ]

    return words, detected_lang
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# Merging (mirrors the web app logic)
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
def merge_speakers_with_words(
    speaker_segments: list[SpeakerSegment],
    words: list[WordTimestamp],
) -> list[WordTimestamp]:
    """Return copies of ``words`` with ``speaker_id`` taken from diarization.

    Each word is matched by its temporal midpoint: the first segment (in
    start order) that contains the midpoint wins; if none contains it, the
    segment whose centre is closest to the midpoint is used. With no
    segments at all, every word is assigned speaker 0.
    """
    if not words:
        return []
    if not speaker_segments:
        return [WordTimestamp(w.text, w.start, w.end, 0) for w in words]

    ordered = sorted(speaker_segments, key=lambda s: s.start)

    def _pick_segment(mid: float):
        # First segment containing the midpoint, if any.
        containing = next((s for s in ordered if s.start <= mid <= s.end), None)
        if containing:
            return containing
        # Otherwise fall back to the segment with the nearest centre.
        return min(ordered, key=lambda s: abs(mid - (s.start + s.end) / 2))

    labelled: list[WordTimestamp] = []
    for word in words:
        chosen = _pick_segment((word.start + word.end) / 2)
        labelled.append(
            WordTimestamp(word.text, word.start, word.end, chosen.speaker_id)
        )
    return labelled
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def group_words_into_chunks(
    words: list[WordTimestamp],
) -> list[TranscriptChunk]:
    """Group consecutive words by speaker into transcript chunks.

    A new chunk starts whenever ``speaker_id`` changes from one word to the
    next. Each chunk spans from the start of its first word to the end of
    its last word.

    Args:
        words: Words in playback order, already carrying ``speaker_id``.

    Returns:
        The chunks in order; an empty list when ``words`` is empty.
    """
    if not words:
        return []

    def _make_chunk(tokens: list[str], start: float, end: float, speaker: int) -> TranscriptChunk:
        # split()/join collapses the doubled spaces that a plain " ".join
        # yields when tokens already carry their own leading or trailing
        # whitespace, and trims both ends.
        return TranscriptChunk(
            text=" ".join(" ".join(tokens).split()),
            start_time=start,
            end_time=end,
            speaker_id=speaker,
        )

    chunks: list[TranscriptChunk] = []
    first = words[0]
    current_words: list[str] = [first.text]
    current_start = first.start
    current_end = first.end
    current_speaker = first.speaker_id

    for w in words[1:]:
        if w.speaker_id != current_speaker:
            # Speaker changed: close the running chunk and start a new one.
            chunks.append(
                _make_chunk(current_words, current_start, current_end, current_speaker)
            )
            current_words = [w.text]
            current_start = w.start
            current_end = w.end
            current_speaker = w.speaker_id
        else:
            current_words.append(w.text)
            current_end = w.end

    # Flush the chunk still being accumulated when the input ends.
    chunks.append(
        _make_chunk(current_words, current_start, current_end, current_speaker)
    )

    return chunks
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def merge_consecutive_chunks(
    chunks: list[TranscriptChunk],
    gap_threshold: float = 1.0,
) -> list[TranscriptChunk]:
    """Fuse neighbouring chunks of one speaker separated by a short gap.

    Two adjacent chunks are combined when they share ``speaker_id`` and the
    gap between them is strictly below ``gap_threshold``. Inputs with zero
    or one chunk are returned as-is (the same list object).
    """
    if len(chunks) <= 1:
        return chunks

    head = chunks[0]
    merged = [
        TranscriptChunk(head.text, head.start_time, head.end_time, head.speaker_id)
    ]

    for nxt in chunks[1:]:
        last = merged[-1]
        gap = nxt.start_time - last.end_time
        mergeable = nxt.speaker_id == last.speaker_id and gap < gap_threshold
        if not mergeable:
            merged.append(
                TranscriptChunk(nxt.text, nxt.start_time, nxt.end_time, nxt.speaker_id)
            )
            continue
        # Extend the previous chunk in place of appending a new one.
        merged[-1] = TranscriptChunk(
            text=last.text + " " + nxt.text,
            start_time=last.start_time,
            end_time=max(last.end_time, nxt.end_time),
            speaker_id=last.speaker_id,
        )

    return merged
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
# ---------------------------------------------------------------------------
|
|
240
|
+
# Audio helpers
|
|
241
|
+
# ---------------------------------------------------------------------------
|
|
242
|
+
|
|
243
|
+
def ensure_wav_16k_mono(audio_path: str) -> str:
    """Convert audio to 16 kHz mono WAV if needed.

    Args:
        audio_path: Path of the input audio file.

    Returns:
        ``audio_path`` itself when the file already is a 16 kHz,
        single-channel file with a ``.wav`` extension; otherwise the path of
        a newly created temporary WAV file. The caller is responsible for
        deleting that temporary file.
    """
    data, sr = sf.read(audio_path)

    multichannel = data.ndim > 1
    if sr == 16000 and not multichannel and audio_path.lower().endswith(".wav"):
        return audio_path

    if multichannel:
        # Down-mix by averaging across axis 1.
        data = data.mean(axis=1)

    if sr != 16000:
        import librosa
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)

    # mkstemp + close: only the path is needed. Leaving the handle open leaks
    # a descriptor and, on Windows, blocks the write below.
    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(tmp_path, data, 16000)
    except BaseException:
        # Do not leave an orphaned temp file behind when the write fails.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise
    return tmp_path
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# ---------------------------------------------------------------------------
|
|
273
|
+
# Output formatters
|
|
274
|
+
# ---------------------------------------------------------------------------
|
|
275
|
+
|
|
276
|
+
def format_time(seconds: float) -> str:
    """Render a time offset as ``M:SS``, dropping any fractional second.

    Minutes are not wrapped into hours, so 3725 seconds renders as ``62:05``.
    """
    whole = int(seconds)
    minutes = whole // 60
    remainder = whole % 60
    return f"{minutes}:{remainder:02d}"
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def format_vtt_time(seconds: float) -> str:
    """Format a time offset as a WebVTT timestamp ``HH:MM:SS.mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields. Truncating ``(seconds % 1) * 1000`` instead loses a millisecond
    to binary float error (1.001 would render as ``.000``). Negative inputs
    are clamped to zero.

    Args:
        seconds: Offset in seconds.

    Returns:
        The zero-padded timestamp string.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    h, remainder = divmod(total_seconds, 3600)
    m, s = divmod(remainder, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def output_text(chunks: list[TranscriptChunk]) -> str:
    """Render chunks as plain text.

    Each chunk becomes a ``Speaker N [start - end]`` header line, a line
    with the chunk text, and a blank line.
    """
    rendered: list[str] = []
    for chunk in chunks:
        begin = format_time(chunk.start_time)
        finish = format_time(chunk.end_time)
        ts = f"[{begin} - {finish}]"
        rendered.extend([
            f"Speaker {chunk.speaker_id} {ts}",
            f" {chunk.text}",
            "",
        ])
    return "\n".join(rendered)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def output_vtt(chunks: list[TranscriptChunk]) -> str:
    """Render chunks as a WebVTT document.

    The output starts with the ``WEBVTT`` header and a blank line. Each
    chunk then contributes a timing line, a ``<v Speaker N>`` voice-tagged
    text line, and a blank line.
    """
    document = ["WEBVTT", ""]
    for chunk in chunks:
        begin = format_vtt_time(chunk.start_time)
        finish = format_vtt_time(chunk.end_time)
        document += [
            f"{begin} --> {finish}",
            f"<v Speaker {chunk.speaker_id}>{chunk.text}",
            "",
        ]
    return "\n".join(document)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def output_json(chunks: list[TranscriptChunk]) -> str:
|
|
311
|
+
return json.dumps([asdict(c) for c in chunks], indent=2)
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# ---------------------------------------------------------------------------
|
|
315
|
+
# Main
|
|
316
|
+
# ---------------------------------------------------------------------------
|
|
317
|
+
|
|
318
|
+
SPEAKER_COLORS = [
|
|
319
|
+
"\033[94m", # blue
|
|
320
|
+
"\033[92m", # green
|
|
321
|
+
"\033[95m", # purple
|
|
322
|
+
"\033[93m", # orange/yellow
|
|
323
|
+
"\033[91m", # red/pink
|
|
324
|
+
]
|
|
325
|
+
RESET = "\033[0m"
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def print_colored(chunks: list[TranscriptChunk]) -> None:
    """Print chunks to stdout with an ANSI-coloured header per speaker.

    The colour is chosen from ``SPEAKER_COLORS`` by ``speaker_id`` modulo
    the palette size, so speakers beyond the palette reuse colours.
    """
    for chunk in chunks:
        tint = SPEAKER_COLORS[chunk.speaker_id % len(SPEAKER_COLORS)]
        begin = format_time(chunk.start_time)
        finish = format_time(chunk.end_time)
        ts = f"[{begin} - {finish}]"
        print(f"{tint}Speaker {chunk.speaker_id} {ts}{RESET}")
        print(f" {chunk.text}")
        print()
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _build_arg_parser(version: str) -> argparse.ArgumentParser:
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser(
        description="Transcribe and diarize audio using Faster Whisper + NeMo Sortformer",
    )
    parser.add_argument("--version", action="version", version=f"%(prog)s {version}")
    parser.add_argument("audio", help="Path to audio file")
    parser.add_argument(
        "--model-size", default="large-v3",
        help="Whisper model size (default: large-v3)",
    )
    parser.add_argument(
        "--device", default="cuda", choices=["cuda", "cpu"],
        help="Device to run Whisper on (default: cuda)",
    )
    parser.add_argument(
        "--compute-type", default="float16",
        help="Compute type for Whisper (default: float16, use int8 for CPU)",
    )
    parser.add_argument(
        "--language", default=None,
        help="Language code (e.g. en). Auto-detected if not set.",
    )
    parser.add_argument(
        "--output", "-o", default=None,
        help="Output file path. Prints to stdout if not set.",
    )
    parser.add_argument(
        "--format", "-f", default="text",
        choices=["text", "vtt", "json"],
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--no-diarize", action="store_true",
        help="Skip diarization (transcription only)",
    )
    return parser


def main():
    """Command-line entry point: transcribe, optionally diarize, and emit.

    Progress messages go to stderr; the transcript goes to the ``--output``
    file or to stdout. Exits with status 1 when the input file is missing.
    Any temporary WAV created during preprocessing is removed on the way
    out, including when a later step fails.
    """
    from opensono import __version__

    args = _build_arg_parser(__version__).parse_args()

    audio_path = str(Path(args.audio).resolve())
    if not Path(audio_path).exists():
        print(f"Error: file not found: {audio_path}", file=sys.stderr)
        sys.exit(1)

    # Prepare audio
    print("Preparing audio...", file=sys.stderr)
    wav_path = ensure_wav_16k_mono(audio_path)
    # A different path back means a temporary file was written for us.
    tmp_created = wav_path != audio_path

    try:
        # Load Whisper
        print(f"Loading Whisper model ({args.model_size})...", file=sys.stderr)
        whisper = WhisperModel(
            args.model_size, device=args.device, compute_type=args.compute_type
        )

        # Transcribe
        print("Transcribing...", file=sys.stderr)
        words, detected_lang = transcribe_audio(whisper, wav_path, args.language)
        print(f" {len(words)} words transcribed", file=sys.stderr)

        # Diarize
        speaker_segments: list[SpeakerSegment] = []
        if not args.no_diarize:
            print("Loading diarization model...", file=sys.stderr)
            diar_model = load_diarization_model()
            print("Diarizing...", file=sys.stderr)
            speaker_segments = diarize_audio(diar_model, wav_path)
            print(f" {len(speaker_segments)} speaker segments found", file=sys.stderr)

        # Merge
        words_with_speakers = merge_speakers_with_words(speaker_segments, words)
        chunks = group_words_into_chunks(words_with_speakers)
        chunks = merge_consecutive_chunks(chunks)

        # Output
        if args.format == "vtt":
            result = output_vtt(chunks)
        elif args.format == "json":
            result = output_json(chunks)
        else:
            result = output_text(chunks)

        if args.output:
            # Write UTF-8 explicitly: the locale default codec can fail on
            # (or mis-encode) transcripts containing non-ASCII text.
            Path(args.output).write_text(result, encoding="utf-8")
            print(f"Saved to {args.output}", file=sys.stderr)
        else:
            # Use colored output for terminal text format
            if args.format == "text" and sys.stdout.isatty():
                print_colored(chunks)
            else:
                print(result)

    finally:
        if tmp_created and os.path.exists(wav_path):
            os.unlink(wav_path)
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
if __name__ == "__main__":
|
|
437
|
+
main()
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: opensono
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Open-source audio transcription with speaker diarization
|
|
5
|
+
Author-email: OpenSono <hello@opensono.ai>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://opensono.ai
|
|
8
|
+
Project-URL: Repository, https://github.com/penkow/opensono
|
|
9
|
+
Project-URL: Issues, https://github.com/penkow/opensono/issues
|
|
10
|
+
Keywords: transcription,whisper,diarization,speech-to-text,nemo
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: faster-whisper
|
|
24
|
+
Requires-Dist: nemo_toolkit[asr]
|
|
25
|
+
Requires-Dist: soundfile
|
|
26
|
+
Requires-Dist: librosa
|
|
27
|
+
Requires-Dist: numpy
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# OpenSono
|
|
31
|
+
|
|
32
|
+
**Open-source audio transcription with speaker diarization.**
|
|
33
|
+
|
|
34
|
+
Transcribe audio files with word-level timestamps and automatic speaker identification using [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) and [NVIDIA NeMo Sortformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/speaker_diarization/intro.html).
|
|
35
|
+
|
|
36
|
+
> This is the Python CLI companion to [OpenSono WebApp](https://opensono.vercel.app) — the free, browser-based transcription tool.
|
|
37
|
+
|
|
38
|
+
## Features
|
|
39
|
+
|
|
40
|
+
- **Accurate transcription** — Powered by Whisper large-v3 by default (other model sizes selectable with `--model-size`)
|
|
41
|
+
- **Speaker diarization** — Automatically identifies up to 4 speakers using NVIDIA Sortformer
|
|
42
|
+
- **Word-level timestamps** — Precise timing for every word
|
|
43
|
+
- **Multiple output formats** — Plain text, VTT subtitles, or JSON
|
|
44
|
+
- **Auto language detection** — Supports 99+ languages
|
|
45
|
+
- **Colored terminal output** — Speaker-coded output for easy reading
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install opensono
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
> **Note:** The NeMo toolkit has additional system dependencies. See the [NeMo installation guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/getting-started.html) for details.
|
|
54
|
+
|
|
55
|
+
Requires Python 3.10+. A CUDA-capable GPU is recommended but not required; CPU-only inference is also supported.
|
|
56
|
+
|
|
57
|
+
### From source
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/penkow/opensono
|
|
61
|
+
cd opensono
|
|
62
|
+
pip install .
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
|
|
67
|
+
After installing, the `opensono` command is available anywhere in your terminal.
|
|
68
|
+
|
|
69
|
+
### Basic transcription with speaker diarization
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
opensono meeting.wav
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Transcription only (no diarization)
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
opensono interview.mp3 --no-diarize
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Export as VTT subtitles
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
opensono podcast.wav -f vtt -o subtitles.vtt
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Export as JSON
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
opensono recording.wav -f json -o transcript.json
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Specify language (skip auto-detection)
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
opensono audio.wav --language en
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Use a smaller/faster model
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
opensono audio.wav --model-size base
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### CPU-only
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
opensono audio.wav --device cpu --compute-type int8
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Check version
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
opensono --version
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
You can also run it as a Python module:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
python -m opensono audio.wav
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Options
|
|
124
|
+
|
|
125
|
+
| Flag | Default | Description |
|
|
126
|
+
|------|---------|-------------|
|
|
127
|
+
| `--model-size` | `large-v3` | Whisper model size (`tiny`, `base`, `small`, `medium`, `large-v3`) |
|
|
128
|
+
| `--device` | `cuda` | Compute device (`cuda` or `cpu`) |
|
|
129
|
+
| `--compute-type` | `float16` | Precision (`float16`, `int8`, `float32`) |
|
|
130
|
+
| `--language` | auto-detect | Language code (e.g. `en`, `fr`, `de`) |
|
|
131
|
+
| `--format`, `-f` | `text` | Output format (`text`, `vtt`, `json`) |
|
|
132
|
+
| `--output`, `-o` | stdout | Output file path |
|
|
133
|
+
| `--no-diarize` | off | Skip speaker diarization |
|
|
134
|
+
|
|
135
|
+
## Output formats
|
|
136
|
+
|
|
137
|
+
### Text (default)
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
Speaker 0 [0:00 - 0:03]
|
|
141
|
+
Hello, welcome to the meeting.
|
|
142
|
+
|
|
143
|
+
Speaker 1 [0:03 - 0:07]
|
|
144
|
+
Thanks for having me. Let's get started.
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### VTT
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
WEBVTT
|
|
151
|
+
|
|
152
|
+
00:00:00.000 --> 00:00:03.500
|
|
153
|
+
<v Speaker 0>Hello, welcome to the meeting.
|
|
154
|
+
|
|
155
|
+
00:00:03.500 --> 00:00:07.200
|
|
156
|
+
<v Speaker 1>Thanks for having me. Let's get started.
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### JSON
|
|
160
|
+
|
|
161
|
+
```json
|
|
162
|
+
[
|
|
163
|
+
{
|
|
164
|
+
"text": "Hello, welcome to the meeting.",
|
|
165
|
+
"start_time": 0.0,
|
|
166
|
+
"end_time": 3.5,
|
|
167
|
+
"speaker_id": 0
|
|
168
|
+
}
|
|
169
|
+
]
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## How it works
|
|
173
|
+
|
|
174
|
+
1. **Audio preprocessing** — Converts input to 16 kHz mono WAV
|
|
175
|
+
2. **Transcription** — Faster Whisper produces word-level timestamps
|
|
176
|
+
3. **Diarization** — NeMo Sortformer identifies speaker segments
|
|
177
|
+
4. **Merging** — Each word is assigned to a speaker based on temporal overlap
|
|
178
|
+
5. **Grouping** — Consecutive words from the same speaker are combined into chunks
|
|
179
|
+
|
|
180
|
+
## Models
|
|
181
|
+
|
|
182
|
+
| Component | Model | Size |
|
|
183
|
+
|-----------|-------|------|
|
|
184
|
+
| Transcription | [Faster Whisper large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) | ~3 GB |
|
|
185
|
+
| Diarization | [NVIDIA Sortformer 4spk v2.1](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_streaming_sortformer_4spk-v2.1) | ~100 MB |
|
|
186
|
+
|
|
187
|
+
Models are downloaded automatically on first run and cached locally.
|
|
188
|
+
|
|
189
|
+
## Requirements
|
|
190
|
+
|
|
191
|
+
- Python 3.10+
|
|
192
|
+
- CUDA-capable GPU (recommended) or CPU
|
|
193
|
+
- ~4 GB VRAM for GPU inference with large-v3
|
|
194
|
+
|
|
195
|
+
## Browser version
|
|
196
|
+
|
|
197
|
+
Don't want to install anything? Use [OpenSono WebApp](https://opensono.vercel.app) — the same transcription engine running entirely in your browser. No uploads, no sign-up, completely private.
|
|
198
|
+
|
|
199
|
+
## License
|
|
200
|
+
|
|
201
|
+
MIT — see [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
opensono/__init__.py
|
|
5
|
+
opensono/__main__.py
|
|
6
|
+
opensono/core.py
|
|
7
|
+
opensono.egg-info/PKG-INFO
|
|
8
|
+
opensono.egg-info/SOURCES.txt
|
|
9
|
+
opensono.egg-info/dependency_links.txt
|
|
10
|
+
opensono.egg-info/entry_points.txt
|
|
11
|
+
opensono.egg-info/requires.txt
|
|
12
|
+
opensono.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
opensono
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.3", "wheel"]  # the SPDX string form `license = "MIT"` (PEP 639) needs setuptools >= 77
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "opensono"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Open-source audio transcription with speaker diarization"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "OpenSono", email = "hello@opensono.ai" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["transcription", "whisper", "diarization", "speech-to-text", "nemo"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
26
|
+
]
|
|
27
|
+
dependencies = [
|
|
28
|
+
"faster-whisper",
|
|
29
|
+
"nemo_toolkit[asr]",
|
|
30
|
+
"soundfile",
|
|
31
|
+
"librosa",
|
|
32
|
+
"numpy",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://opensono.ai"
|
|
37
|
+
Repository = "https://github.com/penkow/opensono"
|
|
38
|
+
Issues = "https://github.com/penkow/opensono/issues"
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
opensono = "opensono.core:main"
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.packages.find]
|
|
44
|
+
where = ["."]
|
|
45
|
+
include = ["opensono*"]
|
opensono-0.1.0/setup.cfg
ADDED