speech-mine 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speech_mine-0.1.0/.github/workflows/publish.yml +43 -0
- speech_mine-0.1.0/.gitignore +210 -0
- speech_mine-0.1.0/.python-version +1 -0
- speech_mine-0.1.0/PKG-INFO +76 -0
- speech_mine-0.1.0/README.md +53 -0
- speech_mine-0.1.0/docs/chunk.md +81 -0
- speech_mine-0.1.0/docs/extract.md +109 -0
- speech_mine-0.1.0/docs/format.md +100 -0
- speech_mine-0.1.0/docs/index.md +30 -0
- speech_mine-0.1.0/docs/installation.md +47 -0
- speech_mine-0.1.0/docs/models.md +33 -0
- speech_mine-0.1.0/docs/output-format.md +49 -0
- speech_mine-0.1.0/docs/search.md +97 -0
- speech_mine-0.1.0/docs/troubleshooting.md +46 -0
- speech_mine-0.1.0/examples/example_chunk_config.yaml +26 -0
- speech_mine-0.1.0/examples/example_extract_metadata.json +13 -0
- speech_mine-0.1.0/examples/example_extract_output.csv +454 -0
- speech_mine-0.1.0/examples/example_format_output.txt +63 -0
- speech_mine-0.1.0/mkdocs.yml +47 -0
- speech_mine-0.1.0/pyproject.toml +45 -0
- speech_mine-0.1.0/src/speech_mine/__init__.py +13 -0
- speech_mine-0.1.0/src/speech_mine/access.py +354 -0
- speech_mine-0.1.0/src/speech_mine/cli.py +409 -0
- speech_mine-0.1.0/src/speech_mine/diarizer/__init__.py +12 -0
- speech_mine-0.1.0/src/speech_mine/diarizer/cli.py +107 -0
- speech_mine-0.1.0/src/speech_mine/diarizer/cli_extract.py +159 -0
- speech_mine-0.1.0/src/speech_mine/diarizer/cli_format.py +107 -0
- speech_mine-0.1.0/src/speech_mine/diarizer/formatter.py +330 -0
- speech_mine-0.1.0/src/speech_mine/diarizer/models.py +14 -0
- speech_mine-0.1.0/src/speech_mine/diarizer/processor.py +414 -0
- speech_mine-0.1.0/src/speech_mine/fuzz.py +94 -0
- speech_mine-0.1.0/src/speech_mine/models.py +32 -0
- speech_mine-0.1.0/src/speech_mine/pickaxe/__init__.py +10 -0
- speech_mine-0.1.0/src/speech_mine/pickaxe/chunk.py +212 -0
- speech_mine-0.1.0/src/speech_mine/pickaxe/cli_chunk.py +137 -0
- speech_mine-0.1.0/tests/test_chunk.py +253 -0
- speech_mine-0.1.0/tests/test_diary_access.py +520 -0
- speech_mine-0.1.0/tests/test_speech_fuzz.py +221 -0
- speech_mine-0.1.0/uv.lock +2932 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
name: Build distribution
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- name: Install uv
|
|
16
|
+
uses: astral-sh/setup-uv@v5
|
|
17
|
+
|
|
18
|
+
- name: Build package
|
|
19
|
+
run: uv build
|
|
20
|
+
|
|
21
|
+
- name: Upload build artifacts
|
|
22
|
+
uses: actions/upload-artifact@v4
|
|
23
|
+
with:
|
|
24
|
+
name: dist
|
|
25
|
+
path: dist/
|
|
26
|
+
|
|
27
|
+
publish:
|
|
28
|
+
name: Publish to PyPI
|
|
29
|
+
needs: build
|
|
30
|
+
runs-on: ubuntu-latest
|
|
31
|
+
environment: pypi
|
|
32
|
+
permissions:
|
|
33
|
+
id-token: write # required for OIDC trusted publishing
|
|
34
|
+
|
|
35
|
+
steps:
|
|
36
|
+
- name: Download build artifacts
|
|
37
|
+
uses: actions/download-artifact@v4
|
|
38
|
+
with:
|
|
39
|
+
name: dist
|
|
40
|
+
path: dist/
|
|
41
|
+
|
|
42
|
+
- name: Publish to PyPI
|
|
43
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
# .vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
# PyPI configuration file
|
|
195
|
+
.pypirc
|
|
196
|
+
|
|
197
|
+
# Cursor
|
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
201
|
+
.cursorignore
|
|
202
|
+
.cursorindexingignore
|
|
203
|
+
|
|
204
|
+
# Marimo
|
|
205
|
+
marimo/_static/
|
|
206
|
+
marimo/_lsp/
|
|
207
|
+
__marimo__/
|
|
208
|
+
|
|
209
|
+
# macOS files
|
|
210
|
+
.DS_Store
site/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.11
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: speech-mine
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A powerful tool for extracting and analyzing speech data from audio files with known speaker counts and contents.
|
|
5
|
+
Project-URL: Homepage, https://github.com/your-org/speech-mine
|
|
6
|
+
Project-URL: Repository, https://github.com/your-org/speech-mine
|
|
7
|
+
Project-URL: Issues, https://github.com/your-org/speech-mine/issues
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Requires-Dist: faster-whisper>=1.2.0
|
|
15
|
+
Requires-Dist: pandas>=2.3.2
|
|
16
|
+
Requires-Dist: pyannote-audio>=3.3.2
|
|
17
|
+
Requires-Dist: pydub>=0.25.1
|
|
18
|
+
Requires-Dist: pytest>=8.4.2
|
|
19
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
20
|
+
Requires-Dist: rapidfuzz>=3.14.1
|
|
21
|
+
Requires-Dist: tqdm>=4.67.1
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# speech-mine
|
|
26
|
+
|
|
27
|
+
Speech diarization and transcript analysis toolkit. Extract speaker-labeled transcripts from audio, format them into readable scripts, search them with fuzzy matching, and pre-process audio with chunking.
|
|
28
|
+
|
|
29
|
+
## Modules
|
|
30
|
+
|
|
31
|
+
| Module | Description | Docs |
|
|
32
|
+
|--------|-------------|------|
|
|
33
|
+
| `extract` | Transcribe audio with speaker diarization | [→](docs/extract.md) |
|
|
34
|
+
| `format` | Format CSV transcripts into readable scripts | [→](docs/format.md) |
|
|
35
|
+
| `chunk` | Split audio into segments via YAML config | [→](docs/chunk.md) |
|
|
36
|
+
| `search` | Fuzzy search transcripts by word or phrase | [→](docs/search.md) |
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
42
|
+
git clone <repository-url>
|
|
43
|
+
cd speech-mine
|
|
44
|
+
uv sync
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
See [docs/installation.md](docs/installation.md) for library dependency setup and HuggingFace token configuration.
|
|
48
|
+
|
|
49
|
+
## Quick Start
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# 1. Extract a transcript
|
|
53
|
+
uv run speech-mine extract interview.mp3 output.csv \
|
|
54
|
+
--hf-token YOUR_TOKEN \
|
|
55
|
+
--num-speakers 2 \
|
|
56
|
+
--compute-type float32
|
|
57
|
+
|
|
58
|
+
# 2. Format into a readable script
|
|
59
|
+
uv run speech-mine format output.csv script.txt
|
|
60
|
+
|
|
61
|
+
# 3. Search it
|
|
62
|
+
uv run speech-mine search "topic of interest" output.csv --pretty
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Documentation
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Serve docs locally
|
|
69
|
+
uv run mkdocs serve
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Or browse the `docs/` folder directly.
|
|
73
|
+
|
|
74
|
+
## License
|
|
75
|
+
|
|
76
|
+
TBD
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
|
|
2
|
+
# speech-mine
|
|
3
|
+
|
|
4
|
+
Speech diarization and transcript analysis toolkit. Extract speaker-labeled transcripts from audio, format them into readable scripts, search them with fuzzy matching, and pre-process audio with chunking.
|
|
5
|
+
|
|
6
|
+
## Modules
|
|
7
|
+
|
|
8
|
+
| Module | Description | Docs |
|
|
9
|
+
|--------|-------------|------|
|
|
10
|
+
| `extract` | Transcribe audio with speaker diarization | [→](docs/extract.md) |
|
|
11
|
+
| `format` | Format CSV transcripts into readable scripts | [→](docs/format.md) |
|
|
12
|
+
| `chunk` | Split audio into segments via YAML config | [→](docs/chunk.md) |
|
|
13
|
+
| `search` | Fuzzy search transcripts by word or phrase | [→](docs/search.md) |
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
19
|
+
git clone <repository-url>
|
|
20
|
+
cd speech-mine
|
|
21
|
+
uv sync
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
See [docs/installation.md](docs/installation.md) for library dependency setup and HuggingFace token configuration.
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# 1. Extract a transcript
|
|
30
|
+
uv run speech-mine extract interview.mp3 output.csv \
|
|
31
|
+
--hf-token YOUR_TOKEN \
|
|
32
|
+
--num-speakers 2 \
|
|
33
|
+
--compute-type float32
|
|
34
|
+
|
|
35
|
+
# 2. Format into a readable script
|
|
36
|
+
uv run speech-mine format output.csv script.txt
|
|
37
|
+
|
|
38
|
+
# 3. Search it
|
|
39
|
+
uv run speech-mine search "topic of interest" output.csv --pretty
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Documentation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# Serve docs locally
|
|
46
|
+
uv run mkdocs serve
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Or browse the `docs/` folder directly.
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
TBD
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# chunk — Audio Chunking
|
|
2
|
+
|
|
3
|
+
Splits a `.wav` file into smaller segments based on a YAML configuration defining time boundaries. Useful for pre-processing long recordings before running `extract`.
|
|
4
|
+
|
|
5
|
+
!!! note
|
|
6
|
+
Only `.wav` input files are supported. Output chunks are also `.wav`.
|
|
7
|
+
|
|
8
|
+
## CLI
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
uv run speech-mine chunk <audio.wav> <config.yaml> <output_dir/> [options]
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
### Options
|
|
15
|
+
|
|
16
|
+
| Flag | Default | Description |
|
|
17
|
+
|------|---------|-------------|
|
|
18
|
+
| `--fade-in MS` | `0` | Fade in duration in milliseconds |
|
|
19
|
+
| `--fade-out MS` | `0` | Fade out duration in milliseconds |
|
|
20
|
+
| `--padding MS` | `0` | Silence padding added to both ends (ms) |
|
|
21
|
+
| `--verbose` | — | Print file sizes for each chunk |
|
|
22
|
+
|
|
23
|
+
### Examples
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# Basic chunking
|
|
27
|
+
uv run speech-mine chunk recording.wav config.yaml chunks/
|
|
28
|
+
|
|
29
|
+
# With fade effects and padding
|
|
30
|
+
uv run speech-mine chunk recording.wav config.yaml chunks/ \
|
|
31
|
+
--fade-in 500 \
|
|
32
|
+
--fade-out 500 \
|
|
33
|
+
--padding 100 \
|
|
34
|
+
--verbose
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Library
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from speech_mine.pickaxe.chunk import chunk_audio_file, AudioChunker
|
|
41
|
+
|
|
42
|
+
# Convenience function
|
|
43
|
+
output_files = chunk_audio_file(
|
|
44
|
+
audio_path="recording.wav",
|
|
45
|
+
config_path="config.yaml",
|
|
46
|
+
output_dir="chunks/",
|
|
47
|
+
fade_in=500,
|
|
48
|
+
fade_out=500,
|
|
49
|
+
silence_padding=100,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Or use the class directly
|
|
53
|
+
chunker = AudioChunker(fade_in_duration=500, fade_out_duration=500, silence_padding=100)
|
|
54
|
+
output_files = chunker.process_audio_file("recording.wav", "config.yaml", "chunks/")
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## YAML config format
|
|
58
|
+
|
|
59
|
+
See [examples/example_chunk_config.yaml](https://github.com/your-org/speech-mine/blob/main/examples/example_chunk_config.yaml) for a full example.
|
|
60
|
+
|
|
61
|
+
```yaml
|
|
62
|
+
chunks:
|
|
63
|
+
- start: 0.0
|
|
64
|
+
end: 30.0
|
|
65
|
+
name: "intro" # optional — included in output filename
|
|
66
|
+
- start: 30.0
|
|
67
|
+
end: 120.0
|
|
68
|
+
name: "discussion"
|
|
69
|
+
- start: 120.0
|
|
70
|
+
end: 300.0
|
|
71
|
+
# no name — output will be "2.wav"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Output filenames follow the pattern `{index}.{name}.wav` or `{index}.wav` if no name is set. Chunks are sorted by start time before indexing.
|
|
75
|
+
|
|
76
|
+
Validation rules:
|
|
77
|
+
|
|
78
|
+
- `start` and `end` are required for every chunk
|
|
79
|
+
- `end` must be greater than `start`
|
|
80
|
+
- `end` cannot exceed the audio file duration
|
|
81
|
+
- Start times must be unique across chunks
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# extract — Transcription + Speaker Diarization
|
|
2
|
+
|
|
3
|
+
Transcribes audio and labels each segment with the speaker who said it. Uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) for transcription and [pyannote](https://github.com/pyannote/pyannote-audio) for speaker diarization.
|
|
4
|
+
|
|
5
|
+
**Supported audio formats:** `.wav`, `.mp3`, `.ogg`, `.flac`
|
|
6
|
+
|
|
7
|
+
## CLI
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
uv run speech-mine extract <audio> <output.csv> --hf-token TOKEN [options]
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
### Options
|
|
14
|
+
|
|
15
|
+
| Flag | Default | Description |
|
|
16
|
+
|------|---------|-------------|
|
|
17
|
+
| `--hf-token TOKEN` | *(required)* | HuggingFace access token |
|
|
18
|
+
| `--model SIZE` | `large-v3` | Whisper model size |
|
|
19
|
+
| `--device` | `auto` | `auto`, `cpu`, or `cuda` |
|
|
20
|
+
| `--compute-type` | `float16` | `float16` (GPU), `float32` (CPU), `int8` |
|
|
21
|
+
| `--num-speakers N` | — | Exact speaker count (best accuracy when known) |
|
|
22
|
+
| `--min-speakers N` | `1` | Minimum expected speakers |
|
|
23
|
+
| `--max-speakers N` | — | Maximum expected speakers |
|
|
24
|
+
| `--verbose` | — | Enable verbose logging |
|
|
25
|
+
|
|
26
|
+
### Examples
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Basic (CPU)
|
|
30
|
+
uv run speech-mine extract interview.mp3 output.csv \
|
|
31
|
+
--hf-token YOUR_TOKEN \
|
|
32
|
+
--compute-type float32
|
|
33
|
+
|
|
34
|
+
# 2-person interview with known speaker count
|
|
35
|
+
uv run speech-mine extract interview.wav output.csv \
|
|
36
|
+
--hf-token YOUR_TOKEN \
|
|
37
|
+
--num-speakers 2 \
|
|
38
|
+
--compute-type float32
|
|
39
|
+
|
|
40
|
+
# GPU with best accuracy model
|
|
41
|
+
uv run speech-mine extract meeting.wav output.csv \
|
|
42
|
+
--hf-token YOUR_TOKEN \
|
|
43
|
+
--model large-v3 \
|
|
44
|
+
--device cuda \
|
|
45
|
+
--compute-type float16 \
|
|
46
|
+
--num-speakers 4
|
|
47
|
+
|
|
48
|
+
# Speaker range when count is unknown
|
|
49
|
+
uv run speech-mine extract conference.wav output.csv \
|
|
50
|
+
--hf-token YOUR_TOKEN \
|
|
51
|
+
--min-speakers 2 \
|
|
52
|
+
--max-speakers 8 \
|
|
53
|
+
--compute-type float32
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
!!! warning
|
|
57
|
+
Always use `--compute-type float32` when running on CPU. The default (`float16`) requires a GPU and will raise an error on CPU.
|
|
58
|
+
|
|
59
|
+
## Library
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from speech_mine.diarizer.processor import SpeechDiarizationProcessor
|
|
63
|
+
|
|
64
|
+
processor = SpeechDiarizationProcessor(
|
|
65
|
+
hf_token="YOUR_TOKEN",
|
|
66
|
+
num_speakers=2,
|
|
67
|
+
whisper_model_size="large-v3",
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Full pipeline in one call
|
|
71
|
+
processor.process_audio_file("interview.mp3", "output.csv")
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Individual pipeline steps
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
# Step 1: Transcribe
|
|
78
|
+
segments, info = processor.transcribe_audio("interview.mp3")
|
|
79
|
+
|
|
80
|
+
# Step 2: Diarize
|
|
81
|
+
diarization = processor.perform_speaker_diarization("interview.mp3")
|
|
82
|
+
|
|
83
|
+
# Step 3: Align
|
|
84
|
+
aligned = processor.align_transcription_with_speakers(segments, diarization)
|
|
85
|
+
|
|
86
|
+
# Step 4: Save
|
|
87
|
+
processor.save_to_csv(aligned, "output.csv", info)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Output
|
|
91
|
+
|
|
92
|
+
Two files are written:
|
|
93
|
+
|
|
94
|
+
- `output.csv` — segment and word-level transcript data ([see example](https://github.com/your-org/speech-mine/blob/main/examples/example_extract_output.csv))
|
|
95
|
+
- `output_metadata.json` — language, duration, speaker list, processing info ([see example](https://github.com/your-org/speech-mine/blob/main/examples/example_extract_metadata.json))
|
|
96
|
+
|
|
97
|
+
See [Output Format](output-format.md) for full column/field reference.
|
|
98
|
+
|
|
99
|
+
## Speaker Count Tips
|
|
100
|
+
|
|
101
|
+
Specifying `--num-speakers` when you know the exact count improves diarization accuracy by 15–30%.
|
|
102
|
+
|
|
103
|
+
| Parameter | When to use |
|
|
104
|
+
|-----------|-------------|
|
|
105
|
+
| `--num-speakers N` | You know exactly how many people speak |
|
|
106
|
+
| `--min-speakers N` | You know there are at least N speakers |
|
|
107
|
+
| `--max-speakers N` | You want to cap false speaker detection |
|
|
108
|
+
|
|
109
|
+
See [Model Options](models.md) for whisper model and compute type guidance.
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# format — Script Formatting
|
|
2
|
+
|
|
3
|
+
Converts the CSV output from `extract` into a human-readable, movie-style script.
|
|
4
|
+
|
|
5
|
+
## CLI
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
uv run speech-mine format <input.csv> <output.txt> [options]
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
### Options
|
|
12
|
+
|
|
13
|
+
| Flag | Description |
|
|
14
|
+
|------|-------------|
|
|
15
|
+
| `--speakers FILE` | JSON file mapping `SPEAKER_00` → custom name |
|
|
16
|
+
| `--create-template` | Generate a speaker names template JSON from the CSV |
|
|
17
|
+
|
|
18
|
+
### Examples
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# Basic formatting
|
|
22
|
+
uv run speech-mine format output.csv script.txt
|
|
23
|
+
|
|
24
|
+
# Generate a speaker names template
|
|
25
|
+
uv run speech-mine format output.csv script.txt --create-template
|
|
26
|
+
|
|
27
|
+
# Format with custom speaker names
|
|
28
|
+
uv run speech-mine format output.csv script.txt --speakers output_speaker_names.json
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Custom speaker names workflow
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
# 1. Generate template — creates output_speaker_names.json
|
|
35
|
+
uv run speech-mine format output.csv script.txt --create-template
|
|
36
|
+
|
|
37
|
+
# 2. Edit the template
|
|
38
|
+
# {"SPEAKER_00": "Alice", "SPEAKER_01": "Bob"}
|
|
39
|
+
|
|
40
|
+
# 3. Format with names applied
|
|
41
|
+
uv run speech-mine format output.csv final_script.txt --speakers output_speaker_names.json
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Library
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from speech_mine.diarizer.formatter import ScriptFormatter
|
|
48
|
+
|
|
49
|
+
# Basic formatting
|
|
50
|
+
formatter = ScriptFormatter()
|
|
51
|
+
formatter.format_script("output.csv", "script.txt")
|
|
52
|
+
|
|
53
|
+
# With custom speaker names
|
|
54
|
+
formatter = ScriptFormatter(custom_speakers={"SPEAKER_00": "Alice", "SPEAKER_01": "Bob"})
|
|
55
|
+
formatter.format_script("output.csv", "script.txt")
|
|
56
|
+
|
|
57
|
+
# Generate a speaker names template from a CSV
|
|
58
|
+
template_path = ScriptFormatter.create_custom_speakers_template("output.csv")
|
|
59
|
+
|
|
60
|
+
# Load speaker names from a JSON file
|
|
61
|
+
speakers = ScriptFormatter.load_custom_speakers("names.json")
|
|
62
|
+
formatter = ScriptFormatter(custom_speakers=speakers)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Output format
|
|
66
|
+
|
|
67
|
+
See [examples/example_format_output.txt](https://github.com/your-org/speech-mine/blob/main/examples/example_format_output.txt) for a full sample.
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
================================================================================
|
|
71
|
+
TRANSCRIPT
|
|
72
|
+
================================================================================
|
|
73
|
+
|
|
74
|
+
RECORDING DETAILS:
|
|
75
|
+
----------------------------------------
|
|
76
|
+
File: interview.mp3
|
|
77
|
+
Duration: 08:47
|
|
78
|
+
Language: EN (confidence: 99.0%)
|
|
79
|
+
Speakers: 2
|
|
80
|
+
Processed: 2026-03-05 22:00:00
|
|
81
|
+
|
|
82
|
+
CAST:
|
|
83
|
+
----------------------------------------
|
|
84
|
+
SPEAKER A
|
|
85
|
+
SPEAKER B
|
|
86
|
+
|
|
87
|
+
TRANSCRIPT:
|
|
88
|
+
----------------------------------------
|
|
89
|
+
|
|
90
|
+
[00:00 - 00:05] SPEAKER A:
|
|
91
|
+
So tell me about your background.
|
|
92
|
+
|
|
93
|
+
[00:06 - 00:12] SPEAKER B:
|
|
94
|
+
Sure, I started out in radio back in the eighties.
|
|
95
|
+
|
|
96
|
+
[...5:30 pause...]
|
|
97
|
+
|
|
98
|
+
[05:42 - 05:50] SPEAKER A:
|
|
99
|
+
And how did that shape your career?
|
|
100
|
+
```
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# speech-mine
|
|
2
|
+
|
|
3
|
+
Speech diarization and transcript analysis toolkit. Extract speaker-labeled transcripts from audio, format them into readable scripts, search them with fuzzy matching, and pre-process audio with chunking.
|
|
4
|
+
|
|
5
|
+
## Modules
|
|
6
|
+
|
|
7
|
+
| Module | Description |
|
|
8
|
+
|--------|-------------|
|
|
9
|
+
| [`extract`](extract.md) | Transcribe audio with speaker diarization |
|
|
10
|
+
| [`format`](format.md) | Format CSV transcripts into readable scripts |
|
|
11
|
+
| [`chunk`](chunk.md) | Split audio into segments via YAML config |
|
|
12
|
+
| [`search`](search.md) | Fuzzy search transcripts by word or phrase |
|
|
13
|
+
|
|
14
|
+
## Quick Start
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
# 1. Extract a transcript
|
|
18
|
+
uv run speech-mine extract interview.mp3 output.csv \
|
|
19
|
+
--hf-token YOUR_TOKEN \
|
|
20
|
+
--num-speakers 2 \
|
|
21
|
+
--compute-type float32
|
|
22
|
+
|
|
23
|
+
# 2. Format it into a readable script
|
|
24
|
+
uv run speech-mine format output.csv script.txt
|
|
25
|
+
|
|
26
|
+
# 3. Search it
|
|
27
|
+
uv run speech-mine search "topic of interest" output.csv --pretty
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
See [Installation](installation.md) to get started.
|