scribe-cli 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scribe_cli-0.3.0/.github/workflows/pypi.yml +35 -0
- scribe_cli-0.3.0/.gitignore +5 -0
- scribe_cli-0.3.0/LICENSE +29 -0
- scribe_cli-0.3.0/PKG-INFO +135 -0
- scribe_cli-0.3.0/README.md +79 -0
- scribe_cli-0.3.0/pyproject.toml +50 -0
- scribe_cli-0.3.0/scribe/__init__.py +1 -0
- scribe_cli-0.3.0/scribe/_version.py +16 -0
- scribe_cli-0.3.0/scribe/audio.py +51 -0
- scribe_cli-0.3.0/scribe/install_desktop.py +43 -0
- scribe_cli-0.3.0/scribe/keyboard.py +19 -0
- scribe_cli-0.3.0/scribe/models.py +133 -0
- scribe_cli-0.3.0/scribe/models.toml +31 -0
- scribe_cli-0.3.0/scribe/saverecording.py +62 -0
- scribe_cli-0.3.0/scribe/streamer.py +215 -0
- scribe_cli-0.3.0/scribe/testpynput.py +142 -0
- scribe_cli-0.3.0/scribe/util.py +185 -0
- scribe_cli-0.3.0/scribe_cli.egg-info/PKG-INFO +135 -0
- scribe_cli-0.3.0/scribe_cli.egg-info/SOURCES.txt +24 -0
- scribe_cli-0.3.0/scribe_cli.egg-info/dependency_links.txt +1 -0
- scribe_cli-0.3.0/scribe_cli.egg-info/entry_points.txt +3 -0
- scribe_cli-0.3.0/scribe_cli.egg-info/requires.txt +18 -0
- scribe_cli-0.3.0/scribe_cli.egg-info/top_level.txt +1 -0
- scribe_cli-0.3.0/setup.cfg +4 -0
- scribe_cli-0.3.0/share/icon.jpg +0 -0
- scribe_cli-0.3.0/templates/scribe.desktop +8 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- '*'
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build-and-publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment:
|
|
12
|
+
name: pypi
|
|
13
|
+
permissions:
|
|
14
|
+
id-token: write # This is required for OIDC
|
|
15
|
+
contents: read
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- name: Checkout code
|
|
19
|
+
uses: actions/checkout@v2
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
uses: actions/setup-python@v2
|
|
23
|
+
with:
|
|
24
|
+
python-version: '3.x'
|
|
25
|
+
|
|
26
|
+
- name: Install dependencies
|
|
27
|
+
run: |
|
|
28
|
+
python -m pip install --upgrade pip
|
|
29
|
+
pip install setuptools setuptools-scm[toml] wheel build
|
|
30
|
+
|
|
31
|
+
- name: Build distribution
|
|
32
|
+
run: python -m build
|
|
33
|
+
|
|
34
|
+
- name: Publish to PyPI
|
|
35
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
scribe_cli-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Mahé Perrette
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
Note: This project relies on external packages that may have more restrictive
|
|
26
|
+
licenses. For example, the `pynput` package is licensed under LGPLv3, which
|
|
27
|
+
has different requirements compared to the MIT License. Please review the
|
|
28
|
+
licenses of all dependencies before using or distributing this software to
|
|
29
|
+
ensure compliance with their respective terms.
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: scribe-cli
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
|
|
5
|
+
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2024 Mahé Perrette
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
Note: This project relies on external packages that may have more restrictive
|
|
31
|
+
licenses. For example, the `pynput` package is licensed under LGPLv3, which
|
|
32
|
+
has different requirements compared to the MIT License. Please review the
|
|
33
|
+
licenses of all dependencies before using or distributing this software to
|
|
34
|
+
ensure compliance with their respective terms.
|
|
35
|
+
Project-URL: Homepage, https://github.com/perrette/scribe
|
|
36
|
+
Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
|
|
37
|
+
Classifier: Programming Language :: Python :: 3
|
|
38
|
+
Classifier: Operating System :: OS Independent
|
|
39
|
+
Requires-Python: >=3.9
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
License-File: LICENSE
|
|
42
|
+
Requires-Dist: numpy
|
|
43
|
+
Requires-Dist: sounddevice
|
|
44
|
+
Requires-Dist: tqdm
|
|
45
|
+
Requires-Dist: requests
|
|
46
|
+
Provides-Extra: keyboard
|
|
47
|
+
Requires-Dist: pynput; extra == "keyboard"
|
|
48
|
+
Provides-Extra: whisper
|
|
49
|
+
Requires-Dist: openai-whisper; extra == "whisper"
|
|
50
|
+
Provides-Extra: vosk
|
|
51
|
+
Requires-Dist: vosk; extra == "vosk"
|
|
52
|
+
Provides-Extra: all
|
|
53
|
+
Requires-Dist: pynput; extra == "all"
|
|
54
|
+
Requires-Dist: openai-whisper; extra == "all"
|
|
55
|
+
Requires-Dist: vosk; extra == "all"
|
|
56
|
+
|
|
57
|
+
# Scribe
|
|
58
|
+
|
|
59
|
+
`scribe` is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
Install PortAudio library. E.g. on Ubuntu:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
sudo apt-get install portaudio19-dev
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
The python dependencies should be dealt with automatically:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install "scribe-cli[all]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
(note the `-cli` suffix for client)
|
|
76
|
+
|
|
77
|
+
or for local development:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
git clone https://github.com/perrette/scribe.git
|
|
81
|
+
cd scribe
|
|
82
|
+
pip install -e .[all]
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
You can leave the optional dependencies (leave out `[all]`) but must install at least one of `vosk` or `openai-whisper` packages (see Usage below).
|
|
86
|
+
|
|
87
|
+
The `vosk` language models will download on-the-fly.
|
|
88
|
+
The default data folder is `$HOME/.local/share/vosk/language-models`.
|
|
89
|
+
This can be modified.
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
## Usage
|
|
93
|
+
|
|
94
|
+
Just type in the terminal:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
scribe
|
|
98
|
+
```
|
|
99
|
+
and the script will guide you through the choice of backend (`whisper` or `vosk`) and the specific language model.
|
|
100
|
+
After this, you will be prompted to start recording your microphone and print the transcribed text in real-time (`vosk`)
|
|
101
|
+
or until after recording is complete (`whisper`).
|
|
102
|
+
You can interrupt the recording via Ctrl + C and start again or change model.
|
|
103
|
+
|
|
104
|
+
The default (`whisper`) is excellent at transcribing full-length audio sequences in [many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages). It is really impressive,
|
|
105
|
+
but it cannot do real-time out of the box, and depending on the model can have relatively long execution time, especially with the `turbo` model (at least on my laptop with CPU only). The `small` model is also excellent and runs much faster. It is selected as default in `scribe` for that reason.
|
|
106
|
+
With the `whisper` backend you need to stop the recording manually before the transcription occurs (Ctrl + C), though after
|
|
107
|
+
60 seconds it will stop automatically (and try to continue afterward).
|
|
108
|
+
|
|
109
|
+
The `vosk` backend is good at
|
|
110
|
+
doing real-time transcription for one language, but tended to make more mistakes in my tests and it does not do punctuation.
|
|
111
|
+
There are many [vosk models](https://alphacephei.com/vosk/models) available, and here a few are associated to [a handful of languages](scribe/models.toml) `en`, `fr`, `it`, `de` (so far).
|
|
112
|
+
|
|
113
|
+
To skip the initial selection menu you can do:
|
|
114
|
+
```bash
|
|
115
|
+
scribe --backend whisper --model small --no-prompt
|
|
116
|
+
```
|
|
117
|
+
where `--no-prompt` jumps right to the recording (after the first interruption, you can still choose to change the backend and model).
|
|
118
|
+
|
|
119
|
+
### Advanced usage as keyboard replacement
|
|
120
|
+
|
|
121
|
+
With the `--keyboard` option `scribe` will attempt to simulate a keyboard and send transcribed characters to the application under focus:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
scribe --keyboard
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
It relies on the optional `pynput` dependency (installed together with `scribe` if you used the `[all]` or `[keyboard]` option).
|
|
128
|
+
|
|
129
|
+
`pynput` may require [some configuration](https://pynput.readthedocs.io/en/latest/limitations.html) (I *think* I got it to work with `xhost +SI:localuser:$(whoami)` as far as the display is concerned). It has [limitations](https://pynput.readthedocs.io/en/latest/limitations.html). On my Ubuntu + Wayland system it works in Chromium-based applications (including VS Code) but not in Firefox, Sublime Text or anything else (not even in a terminal!).
|
|
130
|
+
Workarounds include using the Xorg version of GNOME... Suggestions welcome.
|
|
131
|
+
|
|
132
|
+
### Start as an application in Ubuntu
|
|
133
|
+
|
|
134
|
+
If you run Ubuntu (or else?) with GNOME, the script `scribe-install [...]` will create a `scribe.desktop` file and place it under `$HOME/.local/share/applications`
|
|
135
|
+
to make it available from the quick launch menu. Any option will be passed on to `scribe`.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Scribe
|
|
2
|
+
|
|
3
|
+
`scribe` is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Install PortAudio library. E.g. on Ubuntu:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
sudo apt-get install portaudio19-dev
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
The python dependencies should be dealt with automatically:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install "scribe-cli[all]"
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
(note the `-cli` suffix for client)
|
|
20
|
+
|
|
21
|
+
or for local development:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
git clone https://github.com/perrette/scribe.git
|
|
25
|
+
cd scribe
|
|
26
|
+
pip install -e .[all]
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
You can leave the optional dependencies (leave out `[all]`) but must install at least one of `vosk` or `openai-whisper` packages (see Usage below).
|
|
30
|
+
|
|
31
|
+
The `vosk` language models will download on-the-fly.
|
|
32
|
+
The default data folder is `$HOME/.local/share/vosk/language-models`.
|
|
33
|
+
This can be modified.
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
Just type in the terminal:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
scribe
|
|
42
|
+
```
|
|
43
|
+
and the script will guide you through the choice of backend (`whisper` or `vosk`) and the specific language model.
|
|
44
|
+
After this, you will be prompted to start recording your microphone and print the transcribed text in real-time (`vosk`)
|
|
45
|
+
or until after recording is complete (`whisper`).
|
|
46
|
+
You can interrupt the recording via Ctrl + C and start again or change model.
|
|
47
|
+
|
|
48
|
+
The default (`whisper`) is excellent at transcribing full-length audio sequences in [many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages). It is really impressive,
|
|
49
|
+
but it cannot do real-time out of the box, and depending on the model can have relatively long execution time, especially with the `turbo` model (at least on my laptop with CPU only). The `small` model is also excellent and runs much faster. It is selected as default in `scribe` for that reason.
|
|
50
|
+
With the `whisper` backend you need to stop the recording manually before the transcription occurs (Ctrl + C), though after
|
|
51
|
+
60 seconds it will stop automatically (and try to continue afterward).
|
|
52
|
+
|
|
53
|
+
The `vosk` backend is good at
|
|
54
|
+
doing real-time transcription for one language, but tended to make more mistakes in my tests and it does not do punctuation.
|
|
55
|
+
There are many [vosk models](https://alphacephei.com/vosk/models) available, and here a few are associated to [a handful of languages](scribe/models.toml) `en`, `fr`, `it`, `de` (so far).
|
|
56
|
+
|
|
57
|
+
To skip the initial selection menu you can do:
|
|
58
|
+
```bash
|
|
59
|
+
scribe --backend whisper --model small --no-prompt
|
|
60
|
+
```
|
|
61
|
+
where `--no-prompt` jumps right to the recording (after the first interruption, you can still choose to change the backend and model).
|
|
62
|
+
|
|
63
|
+
### Advanced usage as keyboard replacement
|
|
64
|
+
|
|
65
|
+
With the `--keyboard` option `scribe` will attempt to simulate a keyboard and send transcribed characters to the application under focus:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
scribe --keyboard
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
It relies on the optional `pynput` dependency (installed together with `scribe` if you used the `[all]` or `[keyboard]` option).
|
|
72
|
+
|
|
73
|
+
`pynput` may require [some configuration](https://pynput.readthedocs.io/en/latest/limitations.html) (I *think* I got it to work with `xhost +SI:localuser:$(whoami)` as far as the display is concerned). It has [limitations](https://pynput.readthedocs.io/en/latest/limitations.html). On my Ubuntu + Wayland system it works in Chromium-based applications (including VS Code) but not in Firefox, Sublime Text or anything else (not even in a terminal!).
|
|
74
|
+
Workarounds include using the Xorg version of GNOME... Suggestions welcome.
|
|
75
|
+
|
|
76
|
+
### Start as an application in Ubuntu
|
|
77
|
+
|
|
78
|
+
If you run Ubuntu (or else?) with GNOME, the script `scribe-install [...]` will create a `scribe.desktop` file and place it under `$HOME/.local/share/applications`
|
|
79
|
+
to make it available from the quick launch menu. Any option will be passed on to `scribe`.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "setuptools_scm[toml]>=6.2", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "scribe-cli"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI."
|
|
9
|
+
authors = [
|
|
10
|
+
{ name="Mahé Perrette", email="mahe.perrette@gmail.com" }
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = { file="LICENSE" }
|
|
14
|
+
requires-python = ">=3.9"
|
|
15
|
+
dependencies = [
|
|
16
|
+
"numpy",
|
|
17
|
+
"sounddevice",
|
|
18
|
+
"tqdm",
|
|
19
|
+
"requests",
|
|
20
|
+
]
|
|
21
|
+
optional-dependencies = { keyboard = ["pynput"], whisper = ["openai-whisper"], vosk = ["vosk"], all = ["pynput", "openai-whisper", "vosk"] }
|
|
22
|
+
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Operating System :: OS Independent",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
keywords = [
|
|
29
|
+
"speech recognition",
|
|
30
|
+
"transcription",
|
|
31
|
+
"AI",
|
|
32
|
+
"language",
|
|
33
|
+
"vosk",
|
|
34
|
+
"whisper",
|
|
35
|
+
"openai",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[tool.setuptools]
|
|
39
|
+
packages = [ "scribe" ]
|
|
40
|
+
|
|
41
|
+
[tool.setuptools_scm]
|
|
42
|
+
write_to = "scribe/_version.py"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
Homepage = "https://github.com/perrette/scribe"
|
|
47
|
+
|
|
48
|
+
[project.scripts]
|
|
49
|
+
scribe = "scribe.streamer:main"
|
|
50
|
+
scribe-install = "scribe.install_desktop:main"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from ._version import __version__
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# file generated by setuptools_scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
TYPE_CHECKING = False
|
|
4
|
+
if TYPE_CHECKING:
|
|
5
|
+
from typing import Tuple, Union
|
|
6
|
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
7
|
+
else:
|
|
8
|
+
VERSION_TUPLE = object
|
|
9
|
+
|
|
10
|
+
version: str
|
|
11
|
+
__version__: str
|
|
12
|
+
__version_tuple__: VERSION_TUPLE
|
|
13
|
+
version_tuple: VERSION_TUPLE
|
|
14
|
+
|
|
15
|
+
__version__ = version = '0.3.0'
|
|
16
|
+
__version_tuple__ = version_tuple = (0, 3, 0)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import sounddevice as sd
|
|
2
|
+
import queue
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_duration(audio_length_bytes, sampling_rate=16000, num_channels=1, sample_width=2):
    """Return the playing time, in seconds, of a raw PCM buffer.

    Args:
        audio_length_bytes: Size of the audio buffer in bytes.
        sampling_rate: Frames per second (Hz).
        num_channels: Number of interleaved channels (1 = mono).
        sample_width: Bytes per single-channel sample (2 = 16-bit audio).

    Returns:
        The duration in seconds as a float.
    """
    # One frame holds one sample for every channel.
    bytes_per_frame = num_channels * sample_width
    frame_count = audio_length_bytes / bytes_per_frame
    return frame_count / sampling_rate
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Microphone:
    """Audio input source that pushes raw audio chunks onto a queue.

    The stream returned by :meth:`open_stream` invokes :meth:`callback`
    for every captured chunk; consumers read the chunks from ``self.q``.

    Args:
        samplerate: Sampling rate in Hz (Vosk models typically use 16 kHz).
        channels: Number of input channels (1 = mono).
        device: Input device passed to sounddevice, ``None`` for the default.
        dtype: Sample format name (Vosk models typically use 16-bit audio).
    """

    # Bytes per single-channel sample for each supported sample format name.
    _SAMPLE_WIDTHS = {
        'int8': 1,
        'uint8': 1,
        'int16': 2,
        'int24': 3,
        'int32': 4,
        'float32': 4,
    }

    def __init__(self,
                 samplerate=16000,
                 channels=1,
                 device=None,
                 dtype='int16',
                 ):
        self.q = queue.Queue()
        self.samplerate = samplerate
        self.channels = channels
        self.device = device
        self.dtype = dtype

    def callback(self, indata, frames, time, status):
        """Stream callback: report any status flag and enqueue the chunk as bytes."""
        if status:
            print(status)
        self.q.put(bytes(indata))

    def _clear_queue(self):
        """Drop every pending chunk while holding the queue's own lock."""
        # The callback may be putting data concurrently, so guard the
        # underlying deque with the queue mutex instead of clearing it bare.
        with self.q.mutex:
            self.q.queue.clear()

    def open_stream(self):
        """Discard stale chunks and return a new ``sd.InputStream`` wired to :meth:`callback`."""
        self._clear_queue()
        return sd.InputStream(samplerate=self.samplerate, device=self.device,
                              channels=self.channels, callback=self.callback,
                              dtype=self.dtype)

    def device_info(self):
        """Return ``sd.query_devices`` information for this input device."""
        return sd.query_devices(self.device, 'input')

    def get_duration(self, audio_length_bytes):
        """Return the duration in seconds of ``audio_length_bytes`` bytes of captured audio.

        Raises:
            KeyError: If ``self.dtype`` is not a known sample format name.
        """
        sample_width = self._SAMPLE_WIDTHS[self.dtype]
        return get_duration(audio_length_bytes, self.samplerate, self.channels, sample_width)

    def get_duraction(self, audio_length_bytes):
        """Misspelt historical name of :meth:`get_duration`, kept for existing callers."""
        return self.get_duration(audio_length_bytes)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import os, sys, platform, shutil, sysconfig
|
|
2
|
+
import argparse
|
|
3
|
+
|
|
4
|
+
def _find_resource(relative_path):
    """Return a usable path for a resource shipped with the source tree.

    The path relative to the current directory is preferred (the historical
    behaviour); otherwise the directory above this module's package is tried,
    so the script also works when launched from another directory with a
    source checkout / editable install.
    """
    if os.path.exists(relative_path):
        return relative_path
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    candidate = os.path.join(project_root, relative_path)
    # Fall back to the original relative path so that the eventual error
    # message names the file the user would expect.
    return candidate if os.path.exists(candidate) else relative_path


def main():
    """Install the icon and a ``scribe.desktop`` launcher into the user's XDG data folders.

    Any command-line argument given to this script is appended to the
    command line stored in the desktop file (the ``options`` template field).
    On non-Linux platforms a message is printed to stderr and the process
    exits with status 0 without installing anything.
    """
    # Check if the current platform is Linux
    if platform.system() != "Linux":
        print("This package is only supported on Linux systems.", file=sys.stderr)
        sys.exit(0)

    # The text must be passed as `description`: the first positional
    # parameter of ArgumentParser is `prog`, the program name.
    parser = argparse.ArgumentParser(
        description="Install the desktop file for the scribe package. Any arguments to this script will be passed on to `scribe`.")
    o, rest = parser.parse_known_args()
    o.arguments = rest

    PACKAGE_NAME = 'scribe'

    HOME = os.environ.get('HOME', os.path.expanduser('~'))
    XDG_SHARE = os.environ.get('XDG_DATA_HOME', os.path.join(HOME, '.local', 'share'))
    XDG_APP_DATA = os.path.join(XDG_SHARE, 'applications')
    XDG_SCRIBE_DATA = os.path.join(XDG_SHARE, PACKAGE_NAME)

    # Create the target directories if they don't exist
    os.makedirs(XDG_SCRIBE_DATA, exist_ok=True)
    os.makedirs(XDG_APP_DATA, exist_ok=True)

    # Copy the icon to the application data folder
    print("Copying files to", XDG_SCRIBE_DATA)
    shutil.copy(_find_resource('share/icon.jpg'), XDG_SCRIBE_DATA)

    with open(_find_resource('templates/scribe.desktop')) as f:
        template = f.read()

    # Folder where the interpreter's installation scheme places scripts
    bin_folder = sysconfig.get_path("scripts")
    desktop_file = template.format(XDG_SCRIBE_DATA=XDG_SCRIBE_DATA, bin_folder=bin_folder,
                                   options=' '.join(o.arguments))

    print("Writing desktop file to", XDG_APP_DATA)
    with open(os.path.join(XDG_APP_DATA, 'scribe.desktop'), "w") as f:
        f.write(desktop_file)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
if __name__ == "__main__":
|
|
43
|
+
main()
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""This module handles typing characters as if they were typed on a keyboard.
|
|
2
|
+
"""
|
|
3
|
+
try:
    from pynput.keyboard import Controller

except ImportError:
    # Tell the user how to get the optional dependency, then let the
    # original ImportError propagate to the caller.
    print("Please install pynput to use the keyboard feature.")
    print("Alternatively specify the [keyboard] optional dependency of scribe-cli, e.g. `pip install \"scribe-cli[keyboard]\"`")
    raise

# Module-level keyboard controller used by type_text
keyboard = Controller()
|
|
14
|
+
|
|
15
|
+
def type_text(text, interval=0):
    """Send ``text`` through the module-level keyboard controller.

    Args:
        text: The string to type.
        interval: Pause in seconds after each character. With the default
            of 0 the whole string is handed to the controller in one call.
    """
    if not interval or interval <= 0:
        keyboard.type(text)
        return

    import time

    # Type one character at a time so the requested pause can be inserted.
    for character in text:
        keyboard.type(character)
        time.sleep(interval)
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import numpy as np
|
|
4
|
+
from scribe.util import download_model
|
|
5
|
+
|
|
6
|
+
# Default folder for downloaded Vosk language models. Fall back to
# expanduser("~") when HOME is not set, so importing this module does not
# fail with a TypeError from os.path.join(None, ...).
VOSK_MODELS_FOLDER = os.path.join(os.environ.get("HOME", os.path.expanduser("~")),
                                  ".local/share/vosk/language-models")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AbstractTranscriber:
    """Base class for speech-to-text backends.

    Subclasses implement :meth:`transcribe_audio`,
    :meth:`transcribe_realtime_audio` and :meth:`finalize`.

    Args:
        model: The loaded backend model object.
        model_name: Name of the model.
        language: Language passed on to the backend, if any.
        samplerate: Sampling rate of the incoming audio in Hz.
        model_kwargs: Extra options associated with the model; a fresh dict
            is created when omitted.
    """
    backend = None

    def __init__(self, model, model_name=None, language=None, samplerate=16000, model_kwargs=None):
        self.model_name = model_name
        self.language = language
        self.model = model
        # Never share one default dict between instances.
        self.model_kwargs = {} if model_kwargs is None else model_kwargs
        self.samplerate = samplerate

    def transcribe_audio(self, audio_data):
        """Transcribe a complete audio buffer (backend specific)."""
        raise NotImplementedError()

    def transcribe_realtime_audio(self, audio_data):
        """Process one chunk of streamed audio (backend specific)."""
        raise NotImplementedError()

    def finalize(self):
        """Return the final result once recording stops (backend specific)."""
        raise NotImplementedError()

    def start_recording(self, microphone,
                        start_message="Recording... Press Ctrl+C to stop.",
                        stop_message="Stopped recording."):
        """Stream results while recording from ``microphone``.

        Yields the value of :meth:`transcribe_realtime_audio` for every chunk
        read from ``microphone.q``. When the loop ends (Ctrl+C or an error)
        the value of :meth:`finalize` is yielded last and the queue is
        emptied. If the consumer closes the generator, :meth:`finalize` is
        still called but its result is discarded.
        """
        from queue import Empty

        with microphone.open_stream():
            print(start_message)

            closed_by_consumer = False
            try:
                while True:
                    # Block briefly instead of spinning on q.empty(), which
                    # would keep a CPU core busy while waiting for audio.
                    try:
                        data = microphone.q.get(timeout=0.1)
                    except Empty:
                        continue
                    yield self.transcribe_realtime_audio(data)

            except KeyboardInterrupt:
                pass

            except GeneratorExit:
                # Yielding while the generator is being closed raises
                # RuntimeError, so remember to skip the final yield.
                closed_by_consumer = True
                raise

            finally:
                result = self.finalize()
                microphone.q.queue.clear()
                if not closed_by_consumer:
                    yield result

        print(stop_message)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_vosk_model(model, data_folder=None, url=None):
    """Load a Vosk model, downloading it first if it is not on disk.

    Args:
        model: Name of the model, used as the sub-folder of ``data_folder``.
        data_folder: Folder holding the models; defaults to
            ``VOSK_MODELS_FOLDER``.
        url: Download location; defaults to the zip archive named after the
            model on alphacephei.com.

    Returns:
        A ``vosk.Model`` loaded from the model folder.

    Raises:
        FileNotFoundError: If the model folder is still missing after the
            download.
    """
    import vosk
    if data_folder is None:
        data_folder = VOSK_MODELS_FOLDER
    model_path = os.path.join(data_folder, model)
    if not os.path.exists(model_path):
        if url is None:
            url = f"https://alphacephei.com/vosk/models/{model}.zip"
        download_model(url, data_folder)
        # Explicit check rather than `assert`, which is removed under -O.
        if not os.path.exists(model_path):
            raise FileNotFoundError(
                f"Vosk model not found at {model_path} after downloading {url}")

    return vosk.Model(model_path)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_vosk_recognizer(model, samplerate=16000):
    """Create a ``vosk.KaldiRecognizer`` for ``model`` at ``samplerate`` Hz."""
    from vosk import KaldiRecognizer

    recognizer = KaldiRecognizer(model, samplerate)
    return recognizer
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class VoskTranscriber(AbstractTranscriber):
    """Transcriber backed by a Vosk recognizer.

    Args:
        model_name: Name of the Vosk model.
        model: An already loaded model; when omitted it is obtained with
            ``get_vosk_model(model_name, **model_kwargs)``.
        model_kwargs: Keyword arguments forwarded to ``get_vosk_model``.
        **kwargs: Forwarded to ``AbstractTranscriber.__init__``.
    """
    backend = "vosk"

    def __init__(self, model_name, model=None, model_kwargs=None, **kwargs):
        # Avoid a mutable default shared between instances.
        model_kwargs = {} if model_kwargs is None else model_kwargs
        if model is None:
            model = get_vosk_model(model_name, **model_kwargs)
        super().__init__(model, model_name, model_kwargs=model_kwargs, **kwargs)
        self.recognizer = get_vosk_recognizer(model, self.samplerate)

    def transcribe_realtime_audio(self, audio_bytes=b"", finalize=False):
        """Feed one chunk to the recognizer and return its decoded JSON result.

        When ``AcceptWaveform`` returns a truthy value, the recognizer's
        ``Result()`` is returned unchanged. Otherwise ``PartialResult()`` is
        used: with ``finalize`` set, its "partial" entry is moved to "text";
        without it, any "text" entry is removed.
        """
        final = self.recognizer.AcceptWaveform(audio_bytes)
        raw_result = self.recognizer.Result() if final else self.recognizer.PartialResult()
        result_dict = json.loads(raw_result)

        if final:
            return result_dict

        if finalize:
            result_dict["text"] = result_dict.pop("partial", "")
        else:
            result_dict.pop("text", None)
        return result_dict

    def transcribe_audio(self, audio_data=None):
        """Process ``audio_data`` and report whatever was recognised as "text"."""
        # The signature defaults to None; treat that as "no additional audio"
        # rather than passing None on to the recognizer.
        if audio_data is None:
            audio_data = b""
        return self.transcribe_realtime_audio(audio_data, finalize=True)

    def finalize(self):
        """Return the pending result without feeding any additional audio."""
        return self.transcribe_audio(b"")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class WhisperTranscriber(AbstractTranscriber):
    """Transcriber backed by an openai-whisper model.

    Audio chunks are accumulated in ``self.audio_buffer`` and transcribed in
    one go, either when :meth:`finalize` is called or when the buffer reaches
    the maximum duration.

    Args:
        model_name: Name of the whisper model.
        language: Language passed to the model's ``transcribe`` call.
        model: An already loaded model; when omitted it is loaded with
            ``whisper.load_model(model_name)``.
        model_kwargs: Stored on the instance by the base class.
        **kwargs: Forwarded to ``AbstractTranscriber.__init__``.
    """
    backend = "whisper"

    def __init__(self, model_name, language=None, model=None, model_kwargs=None, **kwargs):
        # Avoid a mutable default shared between instances.
        model_kwargs = {} if model_kwargs is None else model_kwargs
        if model is None:
            # Import lazily: a caller supplying a ready model does not need
            # the whisper package to be importable here.
            import whisper
            model = whisper.load_model(model_name)
        super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
        self.audio_buffer = b''

    def transcribe_realtime_audio(self, audio_bytes=b"", max_duration=60):
        """Buffer ``audio_bytes``; transcribe once ``max_duration`` seconds are held.

        Returns a ``{"partial": ...}`` progress message while the buffer is
        shorter than ``max_duration`` seconds, otherwise the result of
        :meth:`finalize`.
        """
        self.audio_buffer += audio_bytes

        one_second = self.samplerate * 2  # 16-bit audio, 1 channel ~ 32000 bytes
        if len(self.audio_buffer) < max_duration * one_second:
            return {"partial": f"{len(self.audio_buffer)} bytes received (duration: {len(self.audio_buffer) / one_second:.2f} seconds)"}

        return self.finalize()

    def transcribe_audio(self, audio_bytes):
        """Transcribe a raw 16-bit integer audio buffer with the model."""
        print("\nTranscribing...")
        # Reinterpret the bytes as int16 samples and scale them to float32 in [-1, 1).
        audio_array = np.frombuffer(audio_bytes, dtype=np.int16).flatten().astype(np.float32) / 32768.0
        return self.model.transcribe(audio_array, fp16=False, language=self.language)

    def finalize(self):
        """Transcribe the buffered audio, empty the buffer and return the result."""
        if not self.audio_buffer:
            return {"text": ""}
        result = self.transcribe_audio(self.audio_buffer)
        self.audio_buffer = b''
        return result
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[vosk.en]
|
|
2
|
+
model = "vosk-model-en-us-0.42-gigaspeech"
|
|
3
|
+
|
|
4
|
+
[vosk.fr]
|
|
5
|
+
model = "vosk-model-fr-0.22"
|
|
6
|
+
|
|
7
|
+
[vosk.de]
|
|
8
|
+
model = "vosk-model-de-tuda-0.6-900k"
|
|
9
|
+
|
|
10
|
+
[vosk.it]
|
|
11
|
+
model = "vosk-model-it-0.22"
|
|
12
|
+
|
|
13
|
+
[_meta.en]
|
|
14
|
+
language = "English (US)"
|
|
15
|
+
start_message = "Listening... Press Ctrl+C to stop."
|
|
16
|
+
stop_message = "Recording stopped."
|
|
17
|
+
|
|
18
|
+
[_meta.fr]
|
|
19
|
+
language = "French"
|
|
20
|
+
start_message = "En écoute... Appuyez sur Ctrl+C pour arrêter."
|
|
21
|
+
stop_message = "Écoute arrêtée."
|
|
22
|
+
|
|
23
|
+
[_meta.de]
|
|
24
|
+
language = "German"
|
|
25
|
+
start_message = "Hören... Drücken Sie Strg+C, um zu stoppen."
|
|
26
|
+
stop_message = "Aufnahme gestoppt."
|
|
27
|
+
|
|
28
|
+
[_meta.it]
|
|
29
|
+
language = "Italian"
|
|
30
|
+
start_message = "In ascolto... Premere Ctrl+C per interrompere."
|
|
31
|
+
stop_message = "Registrazione interrotta."
|