scribe-cli 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - '*'
7
+
8
+ jobs:
9
+ build-and-publish:
10
+ runs-on: ubuntu-latest
11
+ environment:
12
+ name: pypi
13
+ permissions:
14
+ id-token: write # This is required for OIDC
15
+ contents: read
16
+
17
+ steps:
18
+ - name: Checkout code
19
+ uses: actions/checkout@v2
20
+
21
+ - name: Set up Python
22
+ uses: actions/setup-python@v2
23
+ with:
24
+ python-version: '3.x'
25
+
26
+ - name: Install dependencies
27
+ run: |
28
+ python -m pip install --upgrade pip
29
+ pip install setuptools setuptools-scm[toml] wheel build
30
+
31
+ - name: Build distribution
32
+ run: python -m build
33
+
34
+ - name: Publish to PyPI
35
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,5 @@
1
+ __pycache__
2
+ *.pyc
3
+ .venv
4
+ build
5
+ dist
@@ -0,0 +1,29 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Mahé Perrette
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ ---
24
+
25
+ Note: This project relies on external packages that may have more restrictive
26
+ licenses. For example, the `pynput` package is licensed under LGPLv3, which
27
+ has different requirements compared to the MIT License. Please review the
28
+ licenses of all dependencies before using or distributing this software to
29
+ ensure compliance with their respective terms.
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.2
2
+ Name: scribe-cli
3
+ Version: 0.3.0
4
+ Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
5
+ Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2024 Mahé Perrette
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ ---
29
+
30
+ Note: This project relies on external packages that may have more restrictive
31
+ licenses. For example, the `pynput` package is licensed under LGPLv3, which
32
+ has different requirements compared to the MIT License. Please review the
33
+ licenses of all dependencies before using or distributing this software to
34
+ ensure compliance with their respective terms.
35
+ Project-URL: Homepage, https://github.com/perrette/scribe
36
+ Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
37
+ Classifier: Programming Language :: Python :: 3
38
+ Classifier: Operating System :: OS Independent
39
+ Requires-Python: >=3.9
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: numpy
43
+ Requires-Dist: sounddevice
44
+ Requires-Dist: tqdm
45
+ Requires-Dist: requests
46
+ Provides-Extra: keyboard
47
+ Requires-Dist: pynput; extra == "keyboard"
48
+ Provides-Extra: whisper
49
+ Requires-Dist: openai-whisper; extra == "whisper"
50
+ Provides-Extra: vosk
51
+ Requires-Dist: vosk; extra == "vosk"
52
+ Provides-Extra: all
53
+ Requires-Dist: pynput; extra == "all"
54
+ Requires-Dist: openai-whisper; extra == "all"
55
+ Requires-Dist: vosk; extra == "all"
56
+
57
+ # Scribe
58
+
59
+ `scribe` is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
60
+
61
+ ## Installation
62
+
63
+ Install PortAudio library. E.g. on Ubuntu:
64
+
65
+ ```bash
66
+ sudo apt-get install portaudio19-dev
67
+ ```
68
+
69
+ The python dependencies should be dealt with automatically:
70
+
71
+ ```bash
72
+ pip install "scribe-cli[all]"
73
+ ```
74
+
75
+ (note the `-cli` suffix for client)
76
+
77
+ or for local development:
78
+
79
+ ```bash
80
+ git clone https://github.com/perrette/scribe.git
81
+ cd scribe
82
+ pip install -e .[all]
83
+ ```
84
+
85
+ You can leave out the optional dependencies (omit `[all]`) but you must install at least one of the `vosk` or `openai-whisper` packages (see Usage below).
86
+
87
+ The `vosk` language models will download on-the-fly.
88
+ The default data folder is `$HOME/.local/share/vosk/language-models`.
89
+ This can be modified.
90
+
91
+
92
+ ## Usage
93
+
94
+ Just type in the terminal:
95
+
96
+ ```bash
97
+ scribe
98
+ ```
99
+ and the script will guide you through the choice of backend (`whisper` or `vosk`) and the specific language model.
100
+ After this, you will be prompted to start recording your microphone and print the transcribed text in real-time (`vosk`)
101
+ or until after recording is complete (`whisper`).
102
+ You can interrupt the recording via Ctrl + C and start again or change model.
103
+
104
+ The default (`whisper`) is excellent at transcribing full-length audio sequences in [many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages). It is really impressive,
105
+ but it cannot do real-time out of the box, and depending on the model can have relatively long execution time, especially with the `turbo` model (at least on my laptop with CPU only). The `small` model is also excellent and runs much faster. It is selected as default in `scribe` for that reason.
106
+ With the `whisper` backend you need to stop the recording manually (Ctrl + C) before the transcription occurs, though after
107
+ 60 seconds it will stop automatically (and try to continue afterward).
108
+
109
+ The `vosk` backend is good at
110
+ doing real-time transcription for one language, but tended to make more mistakes in my tests and it does not do punctuation.
111
+ There are many [vosk models](https://alphacephei.com/vosk/models) available, and here a few are associated to [a handful of languages](scribe/models.toml) `en`, `fr`, `it`, `de` (so far).
112
+
113
+ To skip the initial selection menu you can do:
114
+ ```bash
115
+ scribe --backend whisper --model small --no-prompt
116
+ ```
117
+ where `--no-prompt` jumps right to the recording (after the first interruption, you can still choose to change the backend and model).
118
+
119
+ ### Advanced usage as keyboard replacement
120
+
121
+ With the `--keyboard` option `scribe` will attempt to simulate a keyboard and send transcribed characters to the application under focus:
122
+
123
+ ```bash
124
+ scribe --keyboard
125
+ ```
126
+
127
+ It relies on the optional `pynput` dependency (installed together with `scribe` if you used the `[all]` or `[keyboard]` option).
128
+
129
+ `pynput` may require [some configuration](https://pynput.readthedocs.io/en/latest/limitations.html) (I *think* I got it to work with `xhost +SI:localuser:$(whoami)` as far as the display is concerned). It has [limitations](https://pynput.readthedocs.io/en/latest/limitations.html). In my Ubuntu + Wayland system it works in chromium based applications (including vscode) but it does not in firefox and sublime text and any of the rest (not even in a terminal !).
130
+ Workarounds include using the Xorg version of GNOME... Suggestions welcome.
131
+
132
+ ### Start as an application in Ubuntu
133
+
134
+ If you run Ubuntu (or else?) with GNOME, the script `scribe-install [...]` will create a `scribe.desktop` file and place it under `$HOME/.local/share/applications`
135
+ to make it available from the quick launch menu. Any option will be passed on to `scribe`.
@@ -0,0 +1,79 @@
1
+ # Scribe
2
+
3
+ `scribe` is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
4
+
5
+ ## Installation
6
+
7
+ Install PortAudio library. E.g. on Ubuntu:
8
+
9
+ ```bash
10
+ sudo apt-get install portaudio19-dev
11
+ ```
12
+
13
+ The python dependencies should be dealt with automatically:
14
+
15
+ ```bash
16
+ pip install "scribe-cli[all]"
17
+ ```
18
+
19
+ (note the `-cli` suffix for client)
20
+
21
+ or for local development:
22
+
23
+ ```bash
24
+ git clone https://github.com/perrette/scribe.git
25
+ cd scribe
26
+ pip install -e .[all]
27
+ ```
28
+
29
+ You can leave out the optional dependencies (omit `[all]`) but you must install at least one of the `vosk` or `openai-whisper` packages (see Usage below).
30
+
31
+ The `vosk` language models will download on-the-fly.
32
+ The default data folder is `$HOME/.local/share/vosk/language-models`.
33
+ This can be modified.
34
+
35
+
36
+ ## Usage
37
+
38
+ Just type in the terminal:
39
+
40
+ ```bash
41
+ scribe
42
+ ```
43
+ and the script will guide you through the choice of backend (`whisper` or `vosk`) and the specific language model.
44
+ After this, you will be prompted to start recording your microphone and print the transcribed text in real-time (`vosk`)
45
+ or until after recording is complete (`whisper`).
46
+ You can interrupt the recording via Ctrl + C and start again or change model.
47
+
48
+ The default (`whisper`) is excellent at transcribing full-length audio sequences in [many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages). It is really impressive,
49
+ but it cannot do real-time out of the box, and depending on the model can have relatively long execution time, especially with the `turbo` model (at least on my laptop with CPU only). The `small` model is also excellent and runs much faster. It is selected as default in `scribe` for that reason.
50
+ With the `whisper` backend you need to stop the recording manually (Ctrl + C) before the transcription occurs, though after
51
+ 60 seconds it will stop automatically (and try to continue afterward).
52
+
53
+ The `vosk` backend is good at
54
+ doing real-time transcription for one language, but tended to make more mistakes in my tests and it does not do punctuation.
55
+ There are many [vosk models](https://alphacephei.com/vosk/models) available, and here a few are associated to [a handful of languages](scribe/models.toml) `en`, `fr`, `it`, `de` (so far).
56
+
57
+ To skip the initial selection menu you can do:
58
+ ```bash
59
+ scribe --backend whisper --model small --no-prompt
60
+ ```
61
+ where `--no-prompt` jumps right to the recording (after the first interruption, you can still choose to change the backend and model).
62
+
63
+ ### Advanced usage as keyboard replacement
64
+
65
+ With the `--keyboard` option `scribe` will attempt to simulate a keyboard and send transcribed characters to the application under focus:
66
+
67
+ ```bash
68
+ scribe --keyboard
69
+ ```
70
+
71
+ It relies on the optional `pynput` dependency (installed together with `scribe` if you used the `[all]` or `[keyboard]` option).
72
+
73
+ `pynput` may require [some configuration](https://pynput.readthedocs.io/en/latest/limitations.html) (I *think* I got it to work with `xhost +SI:localuser:$(whoami)` as far as the display is concerned). It has [limitations](https://pynput.readthedocs.io/en/latest/limitations.html). In my Ubuntu + Wayland system it works in chromium based applications (including vscode) but it does not in firefox and sublime text and any of the rest (not even in a terminal !).
74
+ Workarounds include using the Xorg version of GNOME... Suggestions welcome.
75
+
76
+ ### Start as an application in Ubuntu
77
+
78
+ If you run Ubuntu (or else?) with GNOME, the script `scribe-install [...]` will create a `scribe.desktop` file and place it under `$HOME/.local/share/applications`
79
+ to make it available from the quick launch menu. Any option will be passed on to `scribe`.
@@ -0,0 +1,50 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "setuptools_scm[toml]>=6.2", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "scribe-cli"
7
+ dynamic = ["version"]
8
+ description = "scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI."
9
+ authors = [
10
+ { name="Mahé Perrette", email="mahe.perrette@gmail.com" }
11
+ ]
12
+ readme = "README.md"
13
+ license = { file="LICENSE" }
14
+ requires-python = ">=3.9"
15
+ dependencies = [
16
+ "numpy",
17
+ "sounddevice",
18
+ "tqdm",
19
+ "requests",
20
+ ]
21
+ optional-dependencies = { keyboard = ["pynput"], whisper = ["openai-whisper"], vosk = ["vosk"], all = ["pynput", "openai-whisper", "vosk"] }
22
+
23
+ classifiers = [
24
+ "Programming Language :: Python :: 3",
25
+ "Operating System :: OS Independent",
26
+ ]
27
+
28
+ keywords = [
29
+ "speech recognition",
30
+ "transcription",
31
+ "AI",
32
+ "language",
33
+ "vosk",
34
+ "whisper",
35
+ "openai",
36
+ ]
37
+
38
+ [tool.setuptools]
39
+ packages = [ "scribe" ]
40
+
41
+ [tool.setuptools_scm]
42
+ write_to = "scribe/_version.py"
43
+
44
+
45
+ [project.urls]
46
+ Homepage = "https://github.com/perrette/scribe"
47
+
48
+ [project.scripts]
49
+ scribe = "scribe.streamer:main"
50
+ scribe-install = "scribe.install_desktop:main"
@@ -0,0 +1 @@
1
+ from ._version import __version__
@@ -0,0 +1,16 @@
1
+ # file generated by setuptools_scm
2
+ # don't change, don't track in version control
3
+ TYPE_CHECKING = False
4
+ if TYPE_CHECKING:
5
+ from typing import Tuple, Union
6
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
7
+ else:
8
+ VERSION_TUPLE = object
9
+
10
+ version: str
11
+ __version__: str
12
+ __version_tuple__: VERSION_TUPLE
13
+ version_tuple: VERSION_TUPLE
14
+
15
+ __version__ = version = '0.3.0'
16
+ __version_tuple__ = version_tuple = (0, 3, 0)
@@ -0,0 +1,51 @@
1
+ import sounddevice as sd
2
+ import queue
3
+
4
+
5
def get_duration(audio_length_bytes,     # bytes
                 sampling_rate = 16000,  # Hz
                 num_channels = 1,       # Mono
                 sample_width = 2,       # 16-bit audio
                 ):
    """Return the playing time, in seconds, of a raw audio buffer.

    Parameters
    ----------
    audio_length_bytes : size of the buffer in bytes
    sampling_rate : samples per second and per channel
    num_channels : number of interleaved channels
    sample_width : size of one sample in bytes

    Returns
    -------
    The duration in seconds (a float, since true division is used).
    """
    # One frame holds one sample for every channel.
    bytes_per_frame = num_channels * sample_width
    frame_count = audio_length_bytes / bytes_per_frame
    return frame_count / sampling_rate
18
+
19
+
20
class Microphone:
    """Audio input source that pushes raw captured chunks onto a queue.

    The chunks delivered to ``callback`` are converted to ``bytes`` and stored
    in ``self.q`` so that a consumer can read them from another piece of code.
    """

    # Size in bytes of one sample for the sample formats accepted by
    # ``sounddevice.InputStream``; used to convert a byte count to a duration.
    _SAMPLE_WIDTHS = {
        'int8': 1,
        'uint8': 1,
        'int16': 2,
        'int32': 4,
        'float32': 4,
    }

    def __init__(self,
                 samplerate = 16000,  # Vosk models typically use a 16kHz sample rate
                 channels = 1,        # Mono audio
                 device = None,       # Default device
                 dtype = 'int16',     # Vosk models typically use 16-bit audio
                 ):
        self.q = queue.Queue()
        self.samplerate = samplerate
        self.channels = channels
        self.device = device
        self.dtype = dtype

    # Callback function used to process the audio chunks
    def callback(self, indata, frames, time, status):
        """Stream callback: report a non-empty ``status`` and enqueue the chunk as bytes."""
        if status:
            print(status)
        self.q.put(bytes(indata))
        # if frames > 1000:  # Adjust this value to try different chunk sizes
        #     rec.AcceptWaveform(bytes(indata))

    def open_stream(self):
        """Empty the queue and return a new ``sd.InputStream`` wired to ``callback``.

        The stream is only constructed here, not entered or started.
        """
        self.q.queue.clear()
        return sd.InputStream(samplerate=self.samplerate, device=self.device,
                              channels=self.channels, callback=self.callback, dtype=self.dtype)

    def device_info(self):
        """Return ``sd.query_devices`` information for this device, as an input device."""
        return sd.query_devices(self.device, 'input')

    def get_duration(self, audio_length_bytes):
        """Return the duration in seconds of ``audio_length_bytes`` bytes of captured audio.

        Uses this microphone's sample rate, channel count and sample format.

        Raises
        ------
        KeyError
            If ``self.dtype`` is not one of the known sample formats.
        """
        # str() lets a dtype object whose string form is e.g. 'int16' be used as well as a plain name.
        sample_width = self._SAMPLE_WIDTHS[str(self.dtype)]
        return get_duration(audio_length_bytes, self.samplerate, self.channels, sample_width)

    # Original (misspelled) method name, kept so existing callers keep working.
    get_duraction = get_duration
@@ -0,0 +1,43 @@
1
+ import os, sys, platform, shutil, sysconfig
2
+ import argparse
3
+
4
def _find_resource(relative_path):
    """Return the path of a resource file shipped with the project.

    The current working directory is tried first (the historical behaviour),
    then the directory that contains the package directory of this module.
    NOTE(review): the second location assumes a source checkout where
    ``share/`` and ``templates/`` sit next to the package -- confirm.

    Raises
    ------
    FileNotFoundError
        If the file exists in neither location.
    """
    module_parent = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    candidates = [relative_path, os.path.join(module_parent, relative_path)]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(
        "Could not find {!r}; looked in: {}".format(
            relative_path, ", ".join(os.path.abspath(c) for c in candidates)))


def main():
    """Install a ``scribe.desktop`` launcher for the current user.

    Copies the icon into ``$XDG_DATA_HOME/scribe`` and writes the desktop file
    rendered from ``templates/scribe.desktop`` into
    ``$XDG_DATA_HOME/applications`` (``XDG_DATA_HOME`` defaults to
    ``$HOME/.local/share``). Command-line arguments not known to this script
    are joined with spaces and substituted for ``{options}`` in the template.
    On a platform other than Linux a message is printed and the process exits.
    """

    # Check if the current platform is Linux
    if platform.system() != "Linux":
        print("This package is only supported on Linux systems.", file=sys.stderr)
        sys.exit(0)

    # The text must be given as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, the program name shown in usage.
    parser = argparse.ArgumentParser(
        description="Install the desktop file for the scribe package. Any arguments to this script will be passed on to `scribe`.")
    o, rest = parser.parse_known_args()
    o.arguments = rest

    PACKAGE_NAME = 'scribe'

    HOME = os.environ.get('HOME', os.path.expanduser('~'))
    XDG_SHARE = os.environ.get('XDG_DATA_HOME', os.path.join(HOME, '.local', 'share'))
    XDG_APP_DATA = os.path.join(XDG_SHARE, 'applications')
    XDG_SCRIBE_DATA = os.path.join(XDG_SHARE, PACKAGE_NAME)

    # Resolve the inputs before touching the file system, so that a missing
    # resource is reported without leaving freshly created directories behind.
    icon_path = _find_resource('share/icon.jpg')
    template_path = _find_resource('templates/scribe.desktop')

    # Create the directory if it doesn't exist
    os.makedirs(XDG_SCRIBE_DATA, exist_ok=True)
    os.makedirs(XDG_APP_DATA, exist_ok=True)

    # Copy your files to the desired location
    print("Copying files to", XDG_SCRIBE_DATA)
    shutil.copy(icon_path, XDG_SCRIBE_DATA)

    with open(template_path) as f:
        template = f.read()

    # Directory where the scripts of the running Python environment are installed.
    bin_folder = sysconfig.get_path("scripts")
    desktop_file = template.format(XDG_SCRIBE_DATA=XDG_SCRIBE_DATA, bin_folder=bin_folder, options=' '.join(o.arguments))

    print("Writing desktop file to", XDG_APP_DATA)
    with open(os.path.join(XDG_APP_DATA, 'scribe.desktop'), "w") as f:
        f.write(desktop_file)


if __name__ == "__main__":
    main()
@@ -0,0 +1,19 @@
1
+ """This module handles typing characters as if they were typed on a keyboard.
2
+ """
3
try:
    # import pyautogui
    from pynput.keyboard import Controller

except ImportError:
    import sys
    # Explain how to obtain the optional dependency, on stderr so the hint
    # does not mix with regular output, then let the ImportError propagate.
    print("Please install pynput to use the keyboard feature.", file=sys.stderr)
    print("Alternatively specify the [keyboard] optional dependency of scribe-cli, "
          "e.g. `pip install scribe-cli[keyboard]` (or `pip install -e .[keyboard]` from a source checkout)",
          file=sys.stderr)
    raise

# Create a keyboard controller
keyboard = Controller()
14
+
15
def type_text(text, interval=0):
    """Type ``text`` through the module-level keyboard controller.

    Parameters
    ----------
    text : the string to type
    interval : pause in seconds between two consecutive characters; with the
        default ``0`` the whole string is sent in a single call.
    """
    # Simulate typing a string
    # import subprocess
    # subprocess.run(["ydotool", "type", text])
    if not interval:
        keyboard.type(text)
        return

    import time  # only needed on the paced path

    for index, character in enumerate(text):
        if index:
            time.sleep(interval)
        keyboard.type(character)
@@ -0,0 +1,133 @@
1
+ import os
2
+ import json
3
+ import numpy as np
4
+ from scribe.util import download_model
5
+
6
# Default folder where the vosk language models are stored.
# expanduser("~") is used instead of os.environ.get("HOME"): the latter is None
# when HOME is unset, which made os.path.join raise TypeError at import time.
VOSK_MODELS_FOLDER = os.path.join(os.path.expanduser("~"),
                                  ".local/share/vosk/language-models")
8
+
9
+
10
class AbstractTranscriber:
    """Base class of the transcription backends.

    Subclasses set ``backend`` and implement ``transcribe_audio``,
    ``transcribe_realtime_audio`` and ``finalize``; ``start_recording`` drives
    them from a microphone object.
    """
    backend = None

    def __init__(self, model, model_name=None, language=None, samplerate=16000, model_kwargs=None):
        self.model_name = model_name
        self.language = language
        self.model = model
        # A new dict per instance: a shared ``{}`` default would let a change
        # made through one transcriber show up in all the others.
        self.model_kwargs = {} if model_kwargs is None else model_kwargs
        self.samplerate = samplerate

    def transcribe_audio(self, audio_data):
        """Transcribe a complete piece of audio. Implemented by subclasses."""
        raise NotImplementedError()

    def transcribe_realtime_audio(self, audio_data):
        """Process one chunk of audio while recording. Implemented by subclasses."""
        raise NotImplementedError()

    def finalize(self):
        """Return the last result when a recording ends. Implemented by subclasses."""
        raise NotImplementedError()

    def start_recording(self, microphone,
                        start_message="Recording... Press Ctrl+C to stop.",
                        stop_message="Stopped recording."):
        """Generator yielding one result per audio chunk, then the final result.

        ``microphone`` must provide ``open_stream()`` returning a context
        manager, and a queue ``q`` of audio chunks. Each chunk is passed to
        ``transcribe_realtime_audio``. When a ``KeyboardInterrupt`` is raised
        inside the loop, the loop ends, the result of ``finalize()`` is
        yielded and ``stop_message`` is printed. If the consumer closes the
        generator, ``finalize()`` is still called but nothing more is yielded.
        """
        from queue import Empty  # local import: this block adds no module-level dependency

        with microphone.open_stream():
            print(start_message)

            consumer_closed = False
            try:
                while True:
                    # Block briefly instead of spinning on an empty queue; the
                    # timeout keeps the wait responsive to Ctrl+C.
                    try:
                        data = microphone.q.get(timeout=0.1)
                    except Empty:
                        continue
                    yield self.transcribe_realtime_audio(data)

            except KeyboardInterrupt:
                pass

            except GeneratorExit:
                # The consumer stopped iterating: yielding again would raise
                # "RuntimeError: generator ignored GeneratorExit".
                consumer_closed = True
                raise

            finally:
                result = self.finalize()
                microphone.q.queue.clear()
                if not consumer_closed:
                    yield result

        print(stop_message)
47
+
48
+
49
def get_vosk_model(model, data_folder=None, url=None):
    """Load the Vosk model named ``model``, fetching it first if it is missing.

    Parameters
    ----------
    model : name of the model; also the name of its folder inside ``data_folder``
    data_folder : folder holding the models (default: ``VOSK_MODELS_FOLDER``)
    url : location of the model archive; by default
        ``https://alphacephei.com/vosk/models/{model}.zip``

    Returns
    -------
    A ``vosk.Model`` built from the model folder.

    Raises
    ------
    FileNotFoundError
        If the model folder still does not exist after ``download_model`` ran.
    """
    import vosk
    if data_folder is None:
        data_folder = VOSK_MODELS_FOLDER
    model_path = os.path.join(data_folder, model)
    if not os.path.exists(model_path):
        if url is None:
            url = f"https://alphacephei.com/vosk/models/{model}.zip"
        # presumably fetches and unpacks the archive into data_folder -- see scribe.util
        download_model(url, data_folder)
        # Explicit check rather than ``assert``: asserts are removed under
        # ``python -O`` and give no hint about what went wrong.
        if not os.path.exists(model_path):
            raise FileNotFoundError(
                f"Vosk model {model!r} not found at {model_path} after downloading {url}")

    return vosk.Model(model_path)
62
+
63
+
64
def get_vosk_recognizer(model, samplerate=16000):
    """Build a ``vosk.KaldiRecognizer`` from ``model`` and ``samplerate``.

    ``vosk`` is imported inside the function, so the module can be loaded
    without it being installed.
    """
    from vosk import KaldiRecognizer
    recognizer = KaldiRecognizer(model, samplerate)
    return recognizer
67
+
68
+
69
class VoskTranscriber(AbstractTranscriber):
    """Transcriber backed by a Vosk recognizer, giving results while recording."""
    backend = "vosk"

    def __init__(self, model_name, model=None, model_kwargs=None, **kwargs):
        """Load the model ``model_name`` unless ``model`` is given, and create the recognizer.

        ``model_kwargs`` are passed to ``get_vosk_model``; the remaining
        keyword arguments go to ``AbstractTranscriber.__init__``.
        """
        # A new dict per call instead of a shared mutable default.
        if model_kwargs is None:
            model_kwargs = {}
        if model is None:
            model = get_vosk_model(model_name, **model_kwargs)
        super().__init__(model, model_name, model_kwargs=model_kwargs, **kwargs)
        self.recognizer = get_vosk_recognizer(model, self.samplerate)

    def transcribe_realtime_audio(self, audio_bytes=b"", finalize=False):
        """Feed ``audio_bytes`` to the recognizer and return its result as a dict.

        * When the recognizer reports a completed result, the decoded
          ``Result()`` is returned.
        * Otherwise, with ``finalize=True`` the decoded ``FinalResult()`` is
          returned.
        * Otherwise the decoded ``PartialResult()`` is returned, without any
          ``"text"`` key.
        """
        if audio_bytes is None:
            # transcribe_audio() defaults to None; send "no new audio" instead.
            audio_bytes = b""

        if self.recognizer.AcceptWaveform(audio_bytes):
            return json.loads(self.recognizer.Result())

        if finalize:
            # FinalResult() is used rather than relabelling the partial text,
            # so the recognizer does not carry this recording's pending
            # hypothesis into the next one.
            return json.loads(self.recognizer.FinalResult())

        result_dict = json.loads(self.recognizer.PartialResult())
        result_dict.pop("text", None)
        return result_dict

    def transcribe_audio(self, audio_data=None):
        """Feed ``audio_data`` (if any) and return the final result for what was heard."""
        return self.transcribe_realtime_audio(audio_data, finalize=True)

    def finalize(self):
        """Return the final result without feeding any new audio."""
        return self.transcribe_audio(b"")
101
+
102
+
103
class WhisperTranscriber(AbstractTranscriber):
    """Transcriber backed by a whisper model.

    Audio chunks are accumulated in ``self.audio_buffer`` and transcribed in
    one go, either by ``finalize()`` or once the buffer reaches the maximum
    duration given to ``transcribe_realtime_audio``.
    """
    backend = "whisper"

    def __init__(self, model_name, language=None, model=None, model_kwargs=None, **kwargs):
        """Load the whisper model ``model_name`` unless ``model`` is given.

        The remaining keyword arguments go to ``AbstractTranscriber.__init__``.
        """
        import whisper
        # A new dict per instance: the value is stored on the instance, so a
        # shared ``{}`` default would be visible to every transcriber.
        if model_kwargs is None:
            model_kwargs = {}
        if model is None:
            model = whisper.load_model(model_name)
        super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
        self.audio_buffer = b''

    def transcribe_realtime_audio(self, audio_bytes=b"", max_duration=60):
        """Append ``audio_bytes`` to the buffer and report progress.

        Returns a ``{"partial": ...}`` progress message while the buffer holds
        less than ``max_duration`` seconds of audio, and the result of
        ``finalize()`` once that limit is reached.
        """
        self.audio_buffer += audio_bytes

        one_second = self.samplerate * 2  # 16-bit audio, 1 channel ~ 32000 bytes
        if len(self.audio_buffer) < max_duration * one_second:
            return {"partial": f"{len(self.audio_buffer)} bytes received (duration: {len(self.audio_buffer) / one_second:.2f} seconds)"}

        else:
            return self.finalize()

    def transcribe_audio(self, audio_bytes):
        """Transcribe ``audio_bytes``, read as 16-bit integer samples, with the model."""
        print("\nTranscribing...")
        # Scale the int16 samples to float32 values in [-1, 1).
        audio_array = np.frombuffer(audio_bytes, dtype=np.int16).flatten().astype(np.float32) / 32768.0
        return self.model.transcribe(audio_array, fp16=False, language=self.language)

    def finalize(self):
        """Transcribe the buffered audio, empty the buffer and return the result.

        Returns ``{"text": ""}`` when nothing has been buffered.
        """
        if len(self.audio_buffer) == 0:
            return {"text": ""}
        result = self.transcribe_audio(self.audio_buffer)
        self.audio_buffer = b''
        return result
@@ -0,0 +1,31 @@
1
+ [vosk.en]
2
+ model = "vosk-model-en-us-0.42-gigaspeech"
3
+
4
+ [vosk.fr]
5
+ model = "vosk-model-fr-0.22"
6
+
7
+ [vosk.de]
8
+ model = "vosk-model-de-tuda-0.6-900k"
9
+
10
+ [vosk.it]
11
+ model = "vosk-model-it-0.22"
12
+
13
+ [_meta.en]
14
+ language = "English (US)"
15
+ start_message = "Listening... Press Ctrl+C to stop."
16
+ stop_message = "Recording stopped."
17
+
18
+ [_meta.fr]
19
+ language = "French"
20
+ start_message = "En écoute... Appuyez sur Ctrl+C pour arrêter."
21
+ stop_message = "Écoute arrêtée."
22
+
23
+ [_meta.de]
24
+ language = "German"
25
+ start_message = "Hören... Drücken Sie Strg+C, um zu stoppen."
26
+ stop_message = "Aufnahme gestoppt."
27
+
28
+ [_meta.it]
29
+ language = "Italian"
30
+ start_message = "In ascolto... Premere Ctrl+C per interrompere."
31
+ stop_message = "Registrazione interrotta."