scribe-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scribe/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from ._version import __version__
scribe/_version.py ADDED
@@ -0,0 +1,16 @@
1
+ # file generated by setuptools_scm
2
+ # don't change, don't track in version control
3
+ TYPE_CHECKING = False
4
+ if TYPE_CHECKING:
5
+ from typing import Tuple, Union
6
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
7
+ else:
8
+ VERSION_TUPLE = object
9
+
10
+ version: str
11
+ __version__: str
12
+ __version_tuple__: VERSION_TUPLE
13
+ version_tuple: VERSION_TUPLE
14
+
15
+ __version__ = version = '0.3.0'
16
+ __version_tuple__ = version_tuple = (0, 3, 0)
scribe/audio.py ADDED
@@ -0,0 +1,51 @@
1
+ import sounddevice as sd
2
+ import queue
3
+
4
+
5
def get_duration(audio_length_bytes,  # bytes
                 sampling_rate=16000,  # Hz
                 num_channels=1,  # Mono
                 sample_width=2,  # 16-bit audio
                 ):
    """Return the duration in seconds of a raw audio buffer.

    The byte length is divided by the size of one frame
    (``num_channels * sample_width`` bytes) to get a sample count,
    which is then divided by ``sampling_rate``.
    """
    bytes_per_frame = num_channels * sample_width
    frame_count = audio_length_bytes / bytes_per_frame
    return frame_count / sampling_rate
18
+
19
+
20
class Microphone:
    """Audio input wrapper that pushes raw captured chunks onto a queue.

    ``open_stream`` returns a ``sounddevice.InputStream`` whose callback
    converts every incoming block to ``bytes`` and stores it in ``self.q``.
    """

    # Bytes per sample for the dtype names this class knows how to size.
    _SAMPLE_WIDTHS = {
        'int8': 1,
        'uint8': 1,
        'int16': 2,
        'int32': 4,
        'float32': 4,
    }

    def __init__(self,
                 samplerate=16000,  # Vosk models typically use a 16kHz sample rate
                 channels=1,  # Mono audio
                 device=None,  # Default device
                 dtype='int16',  # Vosk models typically use 16-bit audio
                 ):
        self.q = queue.Queue()
        self.samplerate = samplerate
        self.channels = channels
        self.device = device
        self.dtype = dtype

    @property
    def sample_width(self):
        """Number of bytes used by one sample of ``self.dtype``.

        Raises:
            ValueError: if the dtype is not one of the known names.
        """
        try:
            return self._SAMPLE_WIDTHS[self.dtype]
        except KeyError:
            raise ValueError(
                f"Unsupported dtype {self.dtype!r}; expected one of "
                f"{sorted(self._SAMPLE_WIDTHS)}") from None

    # Callback function that processes the incoming audio chunks
    def callback(self, indata, frames, time, status):
        """Stream callback: print any status flag and enqueue the chunk as bytes."""
        if status:
            print(status)
        self.q.put(bytes(indata))

    def open_stream(self):
        """Drop any queued audio and return a new input stream bound to ``callback``."""
        # Clear under the queue's own lock: the callback may be putting
        # data from another thread while we reset the underlying deque.
        with self.q.mutex:
            self.q.queue.clear()
        return sd.InputStream(samplerate=self.samplerate, device=self.device,
                              channels=self.channels, callback=self.callback, dtype=self.dtype)

    def device_info(self):
        """Return ``sounddevice.query_devices`` information for the input device."""
        return sd.query_devices(self.device, 'input')

    def get_duraction(self, audio_length_bytes):
        """Return the duration in seconds of ``audio_length_bytes`` bytes of audio.

        The misspelled name is kept for backward compatibility; see the
        ``get_duration`` alias below.
        """
        # Inside a method body this name resolves to the module-level
        # get_duration() helper, not to the class attribute of the same name.
        return get_duration(audio_length_bytes, self.samplerate, self.channels, self.sample_width)

    # Correctly spelled alias of the historical method name.
    get_duration = get_duraction
@@ -0,0 +1,43 @@
1
+ import os, sys, platform, shutil, sysconfig
2
+ import argparse
3
+
4
def main():
    """Install the scribe desktop entry and icon into the user's XDG data folders.

    Unknown command-line arguments are collected and written into the
    generated desktop file through the template's ``options`` field.
    """
    # Check if the current platform is Linux
    if platform.system() != "Linux":
        print("This package is only supported on Linux systems.", file=sys.stderr)
        sys.exit(0)

    # The text must be passed as `description`: the first positional
    # parameter of ArgumentParser is `prog`, not the description.
    parser = argparse.ArgumentParser(
        description="Install the desktop file for the scribe package. Any arguments to this script will be passed on to `scribe`.")
    o, rest = parser.parse_known_args()
    o.arguments = rest

    PACKAGE_NAME = 'scribe'

    HOME = os.environ.get('HOME', os.path.expanduser('~'))
    XDG_SHARE = os.environ.get('XDG_DATA_HOME', os.path.join(HOME, '.local', 'share'))
    XDG_APP_DATA = os.path.join(XDG_SHARE, 'applications')
    XDG_SCRIBE_DATA = os.path.join(XDG_SHARE, PACKAGE_NAME)

    # Create the directories if they don't exist
    os.makedirs(XDG_SCRIBE_DATA, exist_ok=True)
    os.makedirs(XDG_APP_DATA, exist_ok=True)

    # NOTE(review): 'share/icon.jpg' and 'templates/scribe.desktop' are
    # resolved against the current working directory -- confirm the script
    # is always launched from the directory that contains them.
    print("Copying files to", XDG_SCRIBE_DATA)
    shutil.copy('share/icon.jpg', XDG_SCRIBE_DATA)

    with open('templates/scribe.desktop', encoding="utf-8") as f:
        template = f.read()

    bin_folder = sysconfig.get_path("scripts")
    desktop_file = template.format(XDG_SCRIBE_DATA=XDG_SCRIBE_DATA, bin_folder=bin_folder, options=' '.join(o.arguments))

    print("Writing desktop file to", XDG_APP_DATA)
    with open(os.path.join(XDG_APP_DATA, 'scribe.desktop'), "w", encoding="utf-8") as f:
        f.write(desktop_file)
40
+
41
+
42
+ if __name__ == "__main__":
43
+ main()
scribe/keyboard.py ADDED
@@ -0,0 +1,19 @@
1
+ """This module handles typing characters as if they were typed on a keyboard.
2
+ """
3
try:
    # import pyautogui
    from pynput.keyboard import Controller

except ImportError:
    # Tell the user how to get the optional dependency, then re-raise so
    # that the caller can decide how to react to the missing feature.
    print("Please install pynput to use the keyboard feature.")
    print("Alternatively specify [keyboard] optional dependency to scribe-cli, e.g. `pip install scribe-cli[keyboard]`")
    raise
11
+
12
+ # Create a keyboard controller
13
+ keyboard = Controller()
14
+
15
def type_text(text, interval=0):
    """Type ``text`` through the module-level keyboard controller.

    Args:
        text: string to send as simulated keystrokes.
        interval: pause in seconds after each character. With the default
            of 0 the whole string is sent in a single call.
    """
    if not interval or interval <= 0:
        keyboard.type(text)
        return

    import time

    # Send one character at a time so the requested latency is honoured.
    for character in text:
        keyboard.type(character)
        time.sleep(interval)
scribe/models.py ADDED
@@ -0,0 +1,133 @@
1
+ import os
2
+ import json
3
+ import numpy as np
4
+ from scribe.util import download_model
5
+
6
# Default folder where Vosk language models are stored. expanduser("~") is
# used instead of os.environ.get("HOME") so that importing this module does
# not fail with a TypeError when the HOME variable is not set.
VOSK_MODELS_FOLDER = os.path.join(os.path.expanduser("~"),
                                  ".local/share/vosk/language-models")
8
+
9
+
10
class AbstractTranscriber:
    """Base class shared by the speech-to-text backends.

    Subclasses implement ``transcribe_audio``, ``transcribe_realtime_audio``
    and ``finalize``; ``start_recording`` drives them from a microphone.
    """
    backend = None

    def __init__(self, model, model_name=None, language=None, samplerate=16000, model_kwargs=None):
        self.model_name = model_name
        self.language = language
        self.model = model
        # A fresh dict per instance: a `{}` default would be shared by
        # every transcriber created without explicit model_kwargs.
        self.model_kwargs = {} if model_kwargs is None else model_kwargs
        self.samplerate = samplerate

    def transcribe_audio(self, audio_data):
        """Transcribe a complete audio buffer (implemented by subclasses)."""
        raise NotImplementedError()

    def transcribe_realtime_audio(self, audio_data):
        """Process one chunk of streamed audio (implemented by subclasses)."""
        raise NotImplementedError()

    def finalize(self):
        """Return the result for any pending audio (implemented by subclasses)."""
        raise NotImplementedError()

    def start_recording(self, microphone,
                        start_message="Recording... Press Ctrl+C to stop.",
                        stop_message="Stopped recording."):
        """Yield a result for every audio chunk until Ctrl+C is pressed.

        Each chunk read from ``microphone.q`` is passed to
        ``transcribe_realtime_audio``. When the loop ends, the value of
        ``finalize()`` is yielded as the last item.
        """
        import queue

        with microphone.open_stream():
            print(start_message)

            try:
                while True:
                    # Block briefly instead of spinning on q.empty(), which
                    # would keep a CPU core busy while waiting for audio.
                    try:
                        data = microphone.q.get(timeout=0.1)
                    except queue.Empty:
                        continue
                    yield self.transcribe_realtime_audio(data)

            except KeyboardInterrupt:
                pass

            finally:
                result = self.finalize()
                # Clear under the queue's lock: the stream callback may
                # still be adding chunks from another thread.
                with microphone.q.mutex:
                    microphone.q.queue.clear()
                yield result

        print(stop_message)
47
+
48
+
49
def get_vosk_model(model, data_folder=None, url=None):
    """Load a Vosk model by name, downloading it first when it is missing.

    Args:
        model: name of the model directory inside ``data_folder``.
        data_folder: folder holding the models (``VOSK_MODELS_FOLDER`` when None).
        url: archive to download when the model is absent; defaults to
            ``https://alphacephei.com/vosk/models/{model}.zip``.

    Raises:
        FileNotFoundError: if the model directory still does not exist
            after the download and extraction step.
    """
    import vosk
    if data_folder is None:
        data_folder = VOSK_MODELS_FOLDER
    model_path = os.path.join(data_folder, model)
    if not os.path.exists(model_path):
        if url is None:
            url = f"https://alphacephei.com/vosk/models/{model}.zip"
        download_model(url, data_folder)
        # An explicit exception rather than `assert`, which is removed
        # when Python runs with -O.
        if not os.path.exists(model_path):
            raise FileNotFoundError(
                f"Model folder {model_path} not found after downloading {url}")

    return vosk.Model(model_path)
62
+
63
+
64
def get_vosk_recognizer(model, samplerate=16000):
    """Build a ``vosk.KaldiRecognizer`` for ``model`` at the given sample rate."""
    # vosk is imported lazily so the module loads without it installed.
    from vosk import KaldiRecognizer
    recognizer = KaldiRecognizer(model, samplerate)
    return recognizer
67
+
68
+
69
class VoskTranscriber(AbstractTranscriber):
    """Transcriber backed by a Vosk ``KaldiRecognizer``."""
    backend = "vosk"

    def __init__(self, model_name, model=None, model_kwargs=None, **kwargs):
        # Avoid a shared mutable default for model_kwargs.
        model_kwargs = {} if model_kwargs is None else model_kwargs
        if model is None:
            model = get_vosk_model(model_name, **model_kwargs)
        super().__init__(model, model_name, model_kwargs=model_kwargs, **kwargs)
        self.recognizer = get_vosk_recognizer(model, self.samplerate)

    def transcribe_realtime_audio(self, audio_bytes=b"", finalize=False):
        """Feed a chunk to the recognizer and return the decoded JSON result.

        Returns a dict with a ``"text"`` entry when the recognizer reports a
        completed utterance or when ``finalize`` is true; otherwise the
        partial result with any ``"text"`` key removed.
        """
        # Empty or missing input has nothing to feed to the recognizer.
        if audio_bytes and self.recognizer.AcceptWaveform(audio_bytes):
            return json.loads(self.recognizer.Result())

        if finalize:
            # FinalResult() is the recognizer's end-of-stream call: it
            # decodes the pending audio so that it is not carried over
            # into the next recording session.
            result_dict = json.loads(self.recognizer.FinalResult())
            result_dict.setdefault("text", "")
            return result_dict

        result_dict = json.loads(self.recognizer.PartialResult())
        result_dict.pop("text", None)
        return result_dict

    def transcribe_audio(self, audio_data=None):
        """Process ``audio_data`` and return a result that always has ``"text"``."""
        return self.transcribe_realtime_audio(audio_data or b"", finalize=True)

    def finalize(self):
        """Return the text of whatever audio is still pending."""
        return self.transcribe_audio(b"")
101
+
102
+
103
class WhisperTranscriber(AbstractTranscriber):
    """Transcriber that buffers audio and decodes it with a whisper model."""
    backend = "whisper"

    def __init__(self, model_name, language=None, model=None, model_kwargs=None, **kwargs):
        import whisper
        if model is None:
            model = whisper.load_model(model_name)
        super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
        # bytearray grows in place; `bytes +=` would copy the whole
        # recording again for every incoming chunk.
        self.audio_buffer = bytearray()

    def transcribe_realtime_audio(self, audio_bytes=b"", max_duration=60):
        """Buffer a chunk; transcribe once ``max_duration`` seconds are stored.

        Until then a dict with a ``"partial"`` progress message is returned.
        """
        self.audio_buffer += audio_bytes

        one_second = self.samplerate * 2  # 16-bit audio, 1 channel ~ 32000 bytes
        buffered = len(self.audio_buffer)
        if buffered < max_duration * one_second:
            return {"partial": f"{buffered} bytes received (duration: {buffered / one_second:.2f} seconds)"}

        return self.finalize()

    def transcribe_audio(self, audio_bytes):
        """Convert 16-bit PCM bytes to float32 in [-1, 1) and run the model."""
        print("\nTranscribing...")
        audio_array = np.frombuffer(audio_bytes, dtype=np.int16).flatten().astype(np.float32) / 32768.0
        return self.model.transcribe(audio_array, fp16=False, language=self.language)

    def finalize(self):
        """Transcribe the buffered audio (if any) and reset the buffer."""
        if not self.audio_buffer:
            return {"text": ""}
        result = self.transcribe_audio(self.audio_buffer)
        # Rebind instead of clearing in place so that no array view of
        # the old buffer can block the reset.
        self.audio_buffer = bytearray()
        return result
scribe/models.toml ADDED
@@ -0,0 +1,31 @@
1
+ [vosk.en]
2
+ model = "vosk-model-en-us-0.42-gigaspeech"
3
+
4
+ [vosk.fr]
5
+ model = "vosk-model-fr-0.22"
6
+
7
+ [vosk.de]
8
+ model = "vosk-model-de-tuda-0.6-900k"
9
+
10
+ [vosk.it]
11
+ model = "vosk-model-it-0.22"
12
+
13
+ [_meta.en]
14
+ language = "English (US)"
15
+ start_message = "Listening... Press Ctrl+C to stop."
16
+ stop_message = "Recording stopped."
17
+
18
+ [_meta.fr]
19
+ language = "French"
20
+ start_message = "En écoute... Appuyez sur Ctrl+C pour arrêter."
21
+ stop_message = "Écoute arrêtée."
22
+
23
+ [_meta.de]
24
+ language = "German"
25
+ start_message = "Hören... Drücken Sie Strg+C, um zu stoppen."
26
+ stop_message = "Aufnahme gestoppt."
27
+
28
+ [_meta.it]
29
+ language = "Italian"
30
+ start_message = "In ascolto... Premere Ctrl+C per interrompere."
31
+ stop_message = "Registrazione interrotta."
@@ -0,0 +1,62 @@
1
+ import sounddevice as sd
2
+ import time
3
+ import numpy as np
4
+ from pydub import AudioSegment
5
+ import io
6
+
7
+ # Set up the audio parameters
8
+ samplerate = 16000 # Vosk models typically use 16 kHz
9
+ channels = 1
10
+ duration = 5 # in seconds
11
+ chunk_size = 1024 # Number of samples per chunk (adjust this based on your needs)
12
+
13
+ # Create a numpy array to store the full audio buffer
14
+ full_audio = np.zeros((0,), dtype=np.float32) # Use float32 initially for better precision
15
+
16
+ # Callback function to collect audio data into the buffer
17
def callback(indata, frames, time, status):
    """Stream callback: append the first channel of ``indata`` to ``full_audio``."""
    global full_audio
    if status:
        print(status)
    # Keep only channel 0 and extend the module-level recording buffer.
    first_channel = indata[:, 0]
    full_audio = np.concatenate((full_audio, first_channel))
23
+
24
+ # Function to record audio for a fixed duration
25
def record_for_duration(duration):
    """Record from the input device for ``duration`` seconds, then save an MP3.

    Ctrl-C ends the recording early; the audio captured so far is still saved.
    """
    try:
        with sd.InputStream(samplerate=samplerate, channels=channels, callback=callback):
            print(f"Recording for {duration} seconds or interrupt with Ctrl-C")
            # sd.sleep takes milliseconds; the stream records while we wait.
            milliseconds = duration * 1000
            sd.sleep(milliseconds)
    except KeyboardInterrupt:
        pass
    print("Recording finished.")
    save_audio_as_mp3()
34
+
35
+ # Function to save the recorded audio as an MP3 file
36
def save_audio_as_mp3():
    """Encode the module-level ``full_audio`` buffer and write ``recording.mp3``."""
    # Convert the float32 buffer to 16-bit PCM. Samples are clipped to
    # [-1, 1] first: scaling an out-of-range value would otherwise wrap
    # around when cast to int16.
    clipped = np.clip(full_audio, -1.0, 1.0)
    audio_16bit = np.int16(clipped * 32767)  # Scale to the 16-bit range

    # Create an in-memory audio file using io.BytesIO
    audio_data = io.BytesIO()

    # Wrap the raw PCM bytes in a pydub AudioSegment
    audio_segment = AudioSegment(
        audio_16bit.tobytes(),
        frame_rate=samplerate,
        sample_width=2,  # 16-bit audio
        channels=1
    )

    # Export as MP3
    audio_segment.export(audio_data, format="mp3")

    # Save to a file
    with open("recording.mp3", "wb") as f:
        f.write(audio_data.getvalue())
    print("Recording saved as recording.mp3")
59
+
60
+ # Example: record for 5 minutes (5 * 60 seconds)
61
+ record_for_duration(5*60)
62
+
scribe/streamer.py ADDED
@@ -0,0 +1,215 @@
1
+ from pathlib import Path
2
+ import tomllib
3
+ import argparse
4
+ from scribe.audio import Microphone
5
+ from scribe.util import print_partial, clear_line, prompt_choices, check_dependencies, ansi_link, colored
6
+ from scribe.models import VoskTranscriber, WhisperTranscriber
7
+
8
+ with open(Path(__file__).parent / "models.toml", "rb") as f:
9
+ language_config_default = tomllib.load(f)
10
+
11
+ language_config = language_config_default.copy()
12
+
13
+
14
# Start recording
def start_recording(micro, transcriber, keyboard=False, latency=0):
    """Run a recording session, printing results and optionally typing them.

    Args:
        micro: microphone passed on to ``transcriber.start_recording``.
        transcriber: object yielding result dicts with ``text``/``partial``.
        keyboard: when true, every final text is also sent through
            ``scribe.keyboard.type_text``.
        latency: value forwarded to ``type_text`` as ``interval``.
    """
    if keyboard:
        try:
            from scribe.keyboard import type_text
        except ImportError:
            # SystemExit is raised directly: the bare `exit` name is only
            # injected by the `site` module and may be missing.
            raise SystemExit(1) from None

    # Only the start/stop messages of the language metadata are forwarded.
    meta = language_config["_meta"].get(transcriber.language, {})
    greetings = {k: v for k, v in meta.items()
                 if v is not None and k.startswith(("start", "stop"))}

    for result in transcriber.start_recording(micro, **greetings):
        text = result.get('text')
        if text:
            clear_line()
            print(text)
            if keyboard:
                type_text(text + " ", interval=latency)  # Simulate typing
        else:
            print_partial(result.get('partial', ''))
37
+
38
+
39
def get_default_backend():
    """Return the first importable backend, trying "vosk" and then "whisper".

    Raises:
        ImportError: if neither package can be imported.
    """
    for candidate in ("vosk", "whisper"):
        try:
            __import__(candidate)
        except ImportError:
            continue
        return candidate
    raise ImportError("Please install either vosk or whisper to use this script.")
49
+
50
+ BACKENDS = ["whisper", "vosk"]
51
+ UNAVAILABLE_BACKENDS = []
52
+
53
+
54
def pick_specialist_model(model, language, backend):
    """Return the English-only (".en") whisper variant of ``model`` when one applies.

    The model is returned unchanged unless the backend is "whisper", the
    language is "en"/"english" (case-insensitive) and ``model + ".en"`` is
    one of the known names.
    """
    if backend != "whisper" or not language:
        return model
    if language.lower() not in ("en", "english"):
        return model

    known_english = ("tiny.en", "base.en", "small.en", "medium.en", "large", "turbo")
    candidate = model + ".en"
    return candidate if candidate in known_english else model
63
+
64
+
65
def get_transcriber(o, prompt=True):
    """Select a backend and a model from the parsed options and build a transcriber.

    Args:
        o: namespace with ``backend``, ``model``, ``language``,
            ``samplerate`` and ``data_folder`` attributes.
        prompt: when true, missing choices are asked interactively;
            otherwise defaults are used.

    Returns:
        A ``VoskTranscriber`` or ``WhisperTranscriber``. The process exits
        with status 1 when the backend or the model cannot be used.
    """
    vosk_models_url = 'https://alphacephei.com/vosk/models'

    if o.backend:
        if not check_dependencies(o.backend):
            print(f"Backend {o.backend} is not available.")
            raise SystemExit(1)
        backend = o.backend

    elif not prompt:
        # No backend requested and no prompt allowed: use the first
        # installed one (previously an unassigned name was read here).
        try:
            backend = get_default_backend()
        except ImportError as error:
            print(error)
            raise SystemExit(1) from None

    else:
        checked_backend = False
        while not checked_backend:
            backend = prompt_choices(BACKENDS, o.backend, "backend", UNAVAILABLE_BACKENDS)
            # raise an error if the user has explicitly selected a backend that is not available
            checked_backend = check_dependencies(backend, raise_error=backend == o.backend)
            if not checked_backend:
                # Report the backend that was just picked, not o.backend
                # (which is empty in this branch).
                print(f"Backend {backend} is not available.")
                UNAVAILABLE_BACKENDS.append(backend)

    print(f"Selected backend: {backend}")

    if o.model:
        model = pick_specialist_model(o.model, o.language, backend)

    else:

        if backend == "vosk":
            available_languages = list(language_config[backend])
            if o.language:
                if o.language not in available_languages:
                    print(f"Language '{o.language}' is not pre-defined (yet) for backend '{backend}'.")
                    print("Yet it may actually exist.")
                    print(f"Please choose the model explictly from {ansi_link(vosk_models_url)}.")
                    print("Or pick one of the pre-defined languages: ", " ".join(available_languages))
                    raise SystemExit(1)
                choices = [language_config[backend][o.language]["model"]]
                default_model = choices[0]

            else:
                available_models = [language_config[backend][lang]["model"] for lang in available_languages]
                choices = list(zip(available_models, available_languages)) + [f" * [Any model from {ansi_link(vosk_models_url)}]"]
                default_model = choices[0]

            print(f"For information about vosk models see: {ansi_link(vosk_models_url)}")
            if prompt:
                model = prompt_choices(choices, default=default_model, label="model")
            elif isinstance(default_model, (tuple, list)):
                # (model, language) pair: keep only the model name, as
                # prompt_choices does for interactive selections.
                model = default_model[0]
            else:
                model = default_model

        elif backend == "whisper":

            models = ["tiny", "base", "small", "medium", "large", "turbo"]
            english_models = ["tiny.en", "base.en", "small.en", "medium.en"]
            default_model = "small"

            print("Some models have a specialized English version (.en) which will be selected as default is `-l en` was requested, but can also be requested explicitly below (option not listed). See [documentation](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages).")
            if prompt:
                model = prompt_choices(models, default=default_model, label="model",
                                       hidden_models=english_models)
            else:
                model = default_model

        model = pick_specialist_model(model, o.language, backend)

    print(f"Selected model: {model}")

    if backend == "vosk":
        try:
            transcriber = VoskTranscriber(model_name=model,
                                          language=o.language,
                                          samplerate=o.samplerate,
                                          model_kwargs={"data_folder": o.data_folder})
        except Exception as error:
            print(error)
            print(f"Failed to (down)load model {model}.")
            raise SystemExit(1)

    elif backend == "whisper":
        transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate)

    else:
        raise ValueError(f"Unknown backend: {backend}")

    return transcriber
152
+
153
def get_parser():
    """Build the command-line argument parser of the scribe streamer."""
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # Backend and model selection
    add("--backend", choices=BACKENDS,
        help="Choose the backend to use for speech recognition (will be prompted otherwise).")

    add("--model",
        help="""For vosk, any model from https://alphacephei.com/vosk/models,
        e.g. 'vosk-model-small-en-us-0.15'.
        For whisper, see https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages""")

    predefined_languages = list(language_config["vosk"])
    add("-l", "--language", choices=predefined_languages,
        help="An alias for preselected models when using the vosk backend, or 'en' for the English version of whisper models.")

    add("--no-prompt", action="store_false", dest="prompt",
        help="Disable prompts for backend and model selection and jump to recording")

    # Audio and keyboard output
    add("--samplerate", default=16000, type=int, help=argparse.SUPPRESS)
    add("--keyboard", action="store_true")
    add("--latency", default=0, type=float, help="keyboard latency")

    add("--data-folder", help="Folder to store Vosk models.")

    return parser
176
+
177
+
178
def main(args=None):
    """Command-line entry point: choose a model, then record in a loop.

    Args:
        args: argument list for the parser (``sys.argv[1:]`` when None).
    """
    parser = get_parser()
    o = parser.parse_args(args)

    # Set up the microphone for recording
    micro = Microphone(samplerate=o.samplerate)

    transcriber = None

    while True:
        if transcriber is None:
            transcriber = get_transcriber(o, prompt=o.prompt)
        print(f"[ Model {transcriber.model_name} from {transcriber.backend} selected. ]")
        if o.prompt:
            print("Choose any of the following actions:")
            print("[q] quit")
            print("[e] change model")
            print(colored("Press [Enter] or any other key to start recording.", "BOLD"))

            key = input()
            if key == "q":
                # SystemExit instead of the bare `exit`, which only exists
                # when the `site` module has been loaded.
                raise SystemExit(0)
            if key == "e":
                transcriber = None
                continue
        start_recording(micro, transcriber, keyboard=o.keyboard, latency=o.latency)

        # if we arrived so far, that means we pressed Ctrl + C anyway, and need Enter to move on.
        # So we leave the wider range of options to change the model.
        o.prompt = True
        o.backend = None
        o.model = None
        o.language = None
213
+
214
+ if __name__ == "__main__":
215
+ main()
scribe/testpynput.py ADDED
@@ -0,0 +1,142 @@
1
+ import subprocess
2
+
3
def type_text(text):
    """Type ``text`` by running the external ``ydotool type`` command."""
    command = ["ydotool", "type", text]
    subprocess.run(command)
5
+
6
+ # Example usage
7
+ type_text("Hello, World!")
8
+
9
+
10
+ import pyautogui
11
+ import time
12
+
13
+ # Simulate typing a string
14
+ pyautogui.write('Hello, pyautogui!', interval=0.1)
15
+
16
+
17
+
18
+ # # Simulate pressing and releasing the Enter key
19
+ # pyautogui.press('enter')
20
+
21
+ # # Simulate pressing and releasing the 'A' key
22
+ # pyautogui.press('a')
23
+
24
+ # # Simulate a hotkey combination (e.g., Ctrl+C)
25
+ # pyautogui.hotkey('ctrl', 'c')
26
+
27
+ # Hello, World!
28
+
29
+ # # import os
30
+ # # import subprocess
31
+ # # from evdev import UInput, ecodes as e
32
+ # # # Function to create and initialize a virtual keyboard
33
+ # # def create_virtual_keyboard():
34
+ # # capabilities = {
35
+ # # e.EV_KEY: [e.KEY_A, e.KEY_B, e.KEY_C, e.KEY_1, e.KEY_2, e.KEY_3]
36
+ # # }
37
+ # # return UInput(capabilities)
38
+ # # # Function to send a keystroke
39
+ # # def send_keystroke(virtual_keyboard, key):
40
+ # # virtual_keyboard.write(e.EV_KEY, key, 1) # Key down
41
+ # # virtual_keyboard.write(e.EV_KEY, key, 0) # Key up
42
+ # # virtual_keyboard.syn()
43
+
44
+ # # # Main script execution
45
+ # # if __name__ == "__main__":
46
+ # # try:
47
+ # # vk = create_virtual_keyboard()
48
+ # # print("Virtual keyboard initialized.")
49
+ # # send_keystroke(vk, e.KEY_A) # Sends the 'A' key
50
+ # # send_keystroke(vk, e.KEY_B) # Sends the 'B' key
51
+ # # finally:
52
+ # # vk.close()
53
+ # # print("Virtual keyboard closed.")
54
+
55
+
56
+ # import os
57
+ # import subprocess
58
+ # from evdev import UInput, ecodes as e
59
+
60
+ # # Function to create and initialize a virtual keyboard
61
+ # def create_virtual_keyboard():
62
+ # capabilities = {
63
+ # e.EV_KEY: [e.KEY_A, e.KEY_B, e.KEY_C, e.KEY_1, e.KEY_2, e.KEY_3]
64
+ # }
65
+ # try:
66
+ # ui = UInput(capabilities, name="virtual-keyboard")
67
+ # print("Virtual keyboard created successfully.")
68
+ # return ui
69
+ # except Exception as ex:
70
+ # print(f"Failed to create virtual keyboard: {ex}")
71
+ # return None
72
+
73
+ # # Function to send a keystroke
74
+ # def send_keystroke(virtual_keyboard, key):
75
+ # try:
76
+ # virtual_keyboard.write(e.EV_KEY, key, 1) # Key down
77
+ # virtual_keyboard.write(e.EV_KEY, key, 0) # Key up
78
+ # virtual_keyboard.syn()
79
+ # print(f"Sent keystroke: {key}")
80
+ # except Exception as ex:
81
+ # print(f"Failed to send keystroke: {ex}")
82
+
83
+ # # Main script execution
84
+ # if __name__ == "__main__":
85
+ # vk = create_virtual_keyboard()
86
+ # if vk:
87
+ # try:
88
+ # send_keystroke(vk, e.KEY_A) # Sends the 'A' key
89
+ # send_keystroke(vk, e.KEY_B) # Sends the 'B' key
90
+ # finally:
91
+ # vk.close()
92
+ # print("Virtual keyboard closed.")
93
+
94
+
95
+ from pynput.keyboard import Controller, Key
96
+ import time
97
+
98
+ # Create a keyboard controller
99
+ keyboard = Controller()
100
+
101
+ # Simulate typing a string
102
+ keyboard.type("Hello, World!")
103
+
104
+ # Simulate pressing and releasing a key
105
+ keyboard.press(Key.enter)
106
+ keyboard.release(Key.enter)
107
+
108
+ # Simulate pressing and releasing the 'A' key
109
+ keyboard.press('a')
110
+ keyboard.release('a')
111
+
112
+ # Simulate pressing and releasing the 'Ctrl' key
113
+ with keyboard.pressed(Key.ctrl):
114
+ keyboard.press('c')
115
+ keyboard.release('c')
116
+
117
+
118
+
119
+ # import time
120
+ # import subprocess
121
+
122
+ # def send_key_event(event_device, key_code):
123
+ # # Press the key
124
+ # subprocess.run(["sudo", "evemu-event", event_device, "--type", "EV_KEY", "--code", key_code, "--value", "1", "--sync"])
125
+ # # Release the key
126
+ # subprocess.run(["sudo", "evemu-event", event_device, "--type", "EV_KEY", "--code", key_code, "--value", "0", "--sync"])
127
+
128
+ # # Example usage
129
+ # event_device = "/dev/input/event3" # Your keyboard device
130
+ # send_key_event(event_device, "KEY_A")
131
+
132
+ # for letter in "Hello, World!":
133
+ # if letter:
134
+ # if letter == " ":
135
+ # send_key_event(event_device, "KEY_SPACE")
136
+ # elif letter == ",":
137
+ # send_key_event(event_device, "KEY_COMMA")
138
+ # elif letter == "!":
139
+ # send_key_event(event_device, "KEY_1")
140
+ # else:
141
+ # send_key_event(event_device, f"KEY_{letter.upper()}")
142
+ # time.sleep(0.1)
scribe/util.py ADDED
@@ -0,0 +1,185 @@
1
+ import os
2
+ import re
3
+ import tqdm
4
+ import shutil
5
+ from functools import partial
6
+
7
+
8
class bcolors:
    # ANSI escape sequences for terminal styling.
    # Source: https://stackoverflow.com/a/287944/2192272
    # Every public attribute is treated as a colour code by strip_colors().

    # Foreground colours
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'

    # Reset and text attributes
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
18
+
19
def strip_colors(s):
    """Return ``s`` with every escape code defined on ``bcolors`` removed."""
    # Public attributes only: names starting with "_" are class internals.
    codes = [code for attr, code in vars(bcolors).items() if not attr.startswith("_")]
    for code in codes:
        s = s.replace(code, '')
    return s
25
+
26
+
27
def ansi_link(uri, label=None):
    """Wrap ``label`` (default: the URI itself) in an OSC 8 terminal hyperlink.

    See https://stackoverflow.com/a/71309268/2192272
    """
    text = uri if label is None else label
    parameters = ''

    # Layout: OSC 8 ; params ; URI ST <name> OSC 8 ;; ST
    osc8 = '\033]8;'
    st = '\033\\'
    return f"{osc8}{parameters};{uri}{st}{text}{osc8};{st}"
38
+
39
def colored(text, color):
    """Surround ``text`` with a colour code and the reset code.

    ``color`` is either the name of a ``bcolors`` attribute or a raw
    escape sequence, which is then used as given.
    """
    code = getattr(bcolors, color, color)
    return f"{code}{text}{bcolors.ENDC}"
43
+
44
+
45
# Matches one OSC 8 hyperlink and captures its parameter, URI and label.
ANSI_LINK_RE = re.compile(r'(?P<ansi_sequence>\033]8;(?P<parameter>.*?);(?P<uri>.*?)\033\\(?P<label>.*?)\033]8;;\033\\)')


def strip_ansi_link(s):
    """Replace every OSC 8 hyperlink in ``s`` by its visible label."""
    # finditer scans the string as it was on entry; rebinding ``s`` in the
    # loop does not affect the matches still to come.
    for match in ANSI_LINK_RE.finditer(s):
        whole = match.group("ansi_sequence")
        visible = match.group("label")
        s = s.replace(whole, visible)
    return s
51
+
52
+
53
def strip_all(s):
    """Remove colour codes first, then hyperlink sequences, from ``s``."""
    return strip_ansi_link(strip_colors(s))
57
+
58
+
59
+ # Function to clear the terminal line
60
def clear_line():
    """Blank out the current terminal line and move the cursor to its start."""
    columns = shutil.get_terminal_size().columns
    blank = " " * columns
    print(f"\r{blank}", end="")  # overwrite the whole line with spaces
    print("\r", end="")  # carriage return: back to column 0
65
+
66
+
67
def print_partial(msg):
    """Print ``msg`` on the current line after a "[...] " prefix.

    Messages too long for the terminal are cut from the left so that
    their end stays visible.
    """
    columns = shutil.get_terminal_size().columns
    # 7 columns are reserved for the prefix.
    overflow = len(msg) + 7 - columns
    visible = msg[max(0, overflow):]
    print(f"\r[...] {visible}", end="")
72
+
73
def check_status_code(status_code):
    """Return True when ``status_code`` is in the range 200-299."""
    return 200 <= status_code < 300
78
+
79
def download_model(url, data_folder):
    """Download a zip archive from ``url`` and extract it into ``data_folder``.

    Raises:
        RuntimeError: if the HTTP status is not 2xx, or if fewer bytes
            were received than the ``content-length`` header announced.
    """
    import tempfile
    import zipfile

    import requests

    os.makedirs(data_folder, exist_ok=True)

    print(f"Downloading model from {url}...")
    block_size = 1024 * 1024  # 1 MiB per chunk

    # Context managers close the connection, the progress bar and the
    # temporary file even when an error interrupts the download.
    with requests.get(url, stream=True) as response:

        # check the URL was correct:
        if not check_status_code(response.status_code):
            raise RuntimeError(f"Model download failed with error {response.status_code}")

        total_size = int(response.headers.get('content-length', 0))

        # Spool to a temporary file on disk: buffering the archive in
        # memory is not viable for models that weigh several gigabytes.
        with tempfile.TemporaryFile() as temp_file:
            with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True) as progress:
                for data in response.iter_content(block_size):
                    progress.update(len(data))
                    temp_file.write(data)
                downloaded = progress.n

            # check the file was downloaded correctly
            if total_size != 0 and downloaded != total_size:
                raise RuntimeError("Model download size is 0 or less than stated size")

            temp_file.seek(0)
            with zipfile.ZipFile(temp_file) as z:
                z.extractall(data_folder)

    print(f"Model downloaded and unpacked to {data_folder}")
112
+
113
+
114
def format_choice(enum, default=None, unavailable=None):
    """Format one ``(index, value)`` pair as a line of a selection menu.

    The default entry (or the first one when ``default`` is None) is shown
    in bold, unavailable entries are flagged, and all others are numbered.
    """
    index, value = enum
    number = index + 1

    # A tuple/list entry is shown as "name (alias | alias ...)".
    if type(value) in (tuple, list):
        aliases = ' | '.join(value[1:])
        value_str = f" {value[0]} ({aliases})"
    else:
        value_str = value

    if default is not None:
        is_default = value == default
    else:
        is_default = index == 0

    if is_default:
        return ' ' + colored(f'({number}) {value_str} [Press Enter]', 'BOLD')
    if unavailable and value in unavailable:
        return ' ' + colored(f'  {value_str} -> unavailable !!', 'FAIL')
    return f' ({number}) {value_str}'
127
+
128
def is_integer(value):
    """Return True when ``int(value)`` succeeds, False on ValueError/TypeError."""
    try:
        int(value)
    except (ValueError, TypeError):
        return False
    else:
        return True
134
+
135
def prompt_choices(choices, default=None, label="value", unavailable_choices=None, hidden_models=None):
    """Ask the user to pick one entry of ``choices`` and return it.

    Args:
        choices: list of strings or (name, alias, ...) tuples. An entry
            containing "*" is a wildcard: it cannot be selected itself but
            allows any free-text answer.
        default: value used when the user just presses Enter (the first
            available choice when None).
        label: word used in the prompt and in error messages.
        unavailable_choices: entries listed in the menu but not selectable.
        hidden_models: extra accepted answers that are not listed.

    Returns:
        The chosen value; for a tuple/list choice, its first element.
    """
    if unavailable_choices is None:
        unavailable_choices = []
        available_choices = choices
    else:
        available_choices = [c for c in choices if c not in unavailable_choices]

    accepted = available_choices + (hidden_models or [])
    wildcard = any("*" in choice for choice in available_choices)

    # The menu is built outside the f-string: a backslash ('\n') inside an
    # f-string expression is a SyntaxError before Python 3.12.
    menu = '\n'.join(map(partial(format_choice, default=default, unavailable=unavailable_choices),
                         enumerate(available_choices + unavailable_choices)))
    question = f"Please choose a {label}:\n{menu}\n(type number or any name or alias or press [Enter])...\n"

    value = None
    while (value not in accepted) or ("*" in value):
        if value:
            print(f"Invalid {label}: {value}")
        value = input(question)
        if value == "":
            value = default or available_choices[0]

        if is_integer(value):
            index = int(value) - 1
            # Reject 0 and negative numbers explicitly: Python indexing
            # would otherwise silently count from the end of the list.
            if not 0 <= index < len(available_choices):
                continue
            value = available_choices[index]

        if "*" in value:
            continue

        # can match any other choice so we break
        if wildcard:
            break

    return value[0] if isinstance(value, (list, tuple)) else value
171
+
172
+
173
def check_dependencies(backend, dependencies=None, raise_error=False):
    """Return True when all modules required by ``backend`` can be imported.

    Args:
        backend: backend name, also used as the module to import when
            ``dependencies`` is empty or None.
        dependencies: optional list of module names to import instead.
        raise_error: when true, re-raise the ImportError instead of
            returning False.
    """
    from importlib import import_module

    try:
        for module in dependencies or [backend]:
            import_module(module)
    except ImportError:
        # if requested by the user, raise an Exception
        if raise_error:
            raise
        return False
    return True
@@ -0,0 +1,29 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Mahé Perrette
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ ---
24
+
25
+ Note: This project relies on external packages that may have more restrictive
26
+ licenses. For example, the `pynput` package is licensed under LGPLv3, which
27
+ has different requirements compared to the MIT License. Please review the
28
+ licenses of all dependencies before using or distributing this software to
29
+ ensure compliance with their respective terms.
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.2
2
+ Name: scribe-cli
3
+ Version: 0.3.0
4
+ Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
5
+ Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2024 Mahé Perrette
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ ---
29
+
30
+ Note: This project relies on external packages that may have more restrictive
31
+ licenses. For example, the `pynput` package is licensed under LGPLv3, which
32
+ has different requirements compared to the MIT License. Please review the
33
+ licenses of all dependencies before using or distributing this software to
34
+ ensure compliance with their respective terms.
35
+ Project-URL: Homepage, https://github.com/perrette/scribe
36
+ Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
37
+ Classifier: Programming Language :: Python :: 3
38
+ Classifier: Operating System :: OS Independent
39
+ Requires-Python: >=3.9
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: numpy
43
+ Requires-Dist: sounddevice
44
+ Requires-Dist: tqdm
45
+ Requires-Dist: requests
46
+ Provides-Extra: keyboard
47
+ Requires-Dist: pynput; extra == "keyboard"
48
+ Provides-Extra: whisper
49
+ Requires-Dist: openai-whisper; extra == "whisper"
50
+ Provides-Extra: vosk
51
+ Requires-Dist: vosk; extra == "vosk"
52
+ Provides-Extra: all
53
+ Requires-Dist: pynput; extra == "all"
54
+ Requires-Dist: openai-whisper; extra == "all"
55
+ Requires-Dist: vosk; extra == "all"
56
+
57
+ # Scribe
58
+
59
+ `scribe` is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
60
+
61
+ ## Installation
62
+
63
+ Install PortAudio library. E.g. on Ubuntu:
64
+
65
+ ```bash
66
+ sudo apt-get install portaudio19-dev
67
+ ```
68
+
69
+ The python dependencies should be dealt with automatically:
70
+
71
+ ```bash
72
+ pip install "scribe-cli[all]"
73
+ ```
74
+
75
+ (note the `-cli` suffix for client)
76
+
77
+ or for local development:
78
+
79
+ ```bash
80
+ git clone https://github.com/perrette/scribe.git
81
+ cd scribe
82
+ pip install -e .[all]
83
+ ```
84
+
85
+ You can leave out the optional dependencies (omit `[all]`) but must install at least one of the `vosk` or `openai-whisper` packages (see Usage below).
86
+
87
+ The `vosk` language models will download on-the-fly.
88
+ The default data folder is `$HOME/.local/share/vosk/language-models`.
89
+ This can be modified.
90
+
91
+
92
+ ## Usage
93
+
94
+ Just type in the terminal:
95
+
96
+ ```bash
97
+ scribe
98
+ ```
99
+ and the script will guide you through the choice of backend (`whisper` or `vosk`) and the specific language model.
100
+ After this, you will be prompted to start recording from your microphone; the transcribed text is printed in real time (`vosk`)
101
+ or once the recording is complete (`whisper`).
102
+ You can interrupt the recording via Ctrl + C and start again or change model.
103
+
104
+ The default (`whisper`) is excellent at transcribing full-length audio sequences in [many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages). It is really impressive,
105
+ but it cannot do real-time out of the box, and depending on the model can have relatively long execution time, especially with the `turbo` model (at least on my laptop with CPU only). The `small` model is also excellent and runs much faster. It is selected as default in `scribe` for that reason.
106
+ With the `whisper` backend you need to stop the recording manually before the transcription occurs (Ctrl + C), though after
107
+ 60 seconds it will stop automatically (and try to continue afterward).
108
+
109
+ The `vosk` backend is good at
110
+ doing real-time transcription for one language, but tended to make more mistakes in my tests and it does not do punctuation.
111
+ There are many [vosk models](https://alphacephei.com/vosk/models) available, and here a few are associated to [a handful of languages](scribe/models.toml) `en`, `fr`, `it`, `de` (so far).
112
+
113
+ To skip the initial selection menu you can do:
114
+ ```bash
115
+ scribe --backend whisper --model small --no-prompt
116
+ ```
117
+ where `--no-prompt` jumps right to the recording (after the first interruption, you can still choose to change the backend and model).
118
+
119
+ ### Advanced usage as keyboard replacement
120
+
121
+ With the `--keyboard` option `scribe` will attempt to simulate a keyboard and send transcribed characters to the application under focus:
122
+
123
+ ```bash
124
+ scribe --keyboard
125
+ ```
126
+
127
+ It relies on the optional `pynput` dependency (installed together with `scribe` if you used the `[all]` or `[keyboard]` option).
128
+
129
+ `pynput` may require [some configuration](https://pynput.readthedocs.io/en/latest/limitations.html) (I *think* I got it to work with `xhost +SI:localuser:$(whoami)` as far as the display is concerned). It has [limitations](https://pynput.readthedocs.io/en/latest/limitations.html). In my Ubuntu + Wayland system it works in chromium based applications (including vscode) but it does not in firefox and sublime text and any of the rest (not even in a terminal !).
130
+ Workarounds include using the Xorg version of GNOME... Suggestions welcome.
131
+
132
+ ### Start as an application in Ubuntu
133
+
134
+ If you run Ubuntu (or else?) with GNOME, the script `scribe-install [...]` will create a `scribe.desktop` file and place it under `$HOME/.local/share/applications`
135
+ to make it available from the quick launch menu. Any option will be passed on to `scribe`.
@@ -0,0 +1,17 @@
1
+ scribe/__init__.py,sha256=WjDmvecyDIyJLYp4rCV9vsSYbQDc4L1EpYqORvEXliI,33
2
+ scribe/_version.py,sha256=Jk2iAU7m-7Vx9XV1TtdD9ZoJraIncDq_4_Wd-qtUotg,411
3
+ scribe/audio.py,sha256=PXtkRwVlyEf-m3eVIH61nlCpfiiZvFVJQWdW4mhL6bY,1716
4
+ scribe/install_desktop.py,sha256=iKZjIddi_Y-xmg-mrrdZvwrec4gwS2TiNvdg1Uu2YqQ,1491
5
+ scribe/keyboard.py,sha256=jhiPv927Pydnta1Q8QaoPWNCQpojDZmvuAZ8UOo1TyM,581
6
+ scribe/models.py,sha256=bJVp_2t_UG996NBWh2bQf5_44ny4Mqr-6kCyS0w7qNA,4466
7
+ scribe/models.toml,sha256=H5IvYx03QbaZpgPuyl08RC_t24FqN-ZKfS5bnCMVLSc,715
8
+ scribe/saverecording.py,sha256=20QyJNMb1kzGkLXajBnBxIXMMji0Bp3Do69bHZD8S7g,2137
9
+ scribe/streamer.py,sha256=-4hWI6lX3Kz1qvc8NSqtofIlOkOVN2mS-JqCC4eHCzg,8223
10
+ scribe/testpynput.py,sha256=J1GlX7ns2yMeEfhoq_BVKHQ6REcTKhS8OMt2aak1RfY,4062
11
+ scribe/util.py,sha256=ep-5W99tMltG4SAbdCMIB3qI98zVyb3M170OOlSXR_c,5322
12
+ scribe_cli-0.3.0.dist-info/LICENSE,sha256=76NFkiJg6-f0qj4s4YSIkHcWho3eckzmOKIbEFMOM-U,1426
13
+ scribe_cli-0.3.0.dist-info/METADATA,sha256=y3PMQDX44HnAKEAcFjpOTsr8zUbgkXfsegMuvmAjk1Y,6316
14
+ scribe_cli-0.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
15
+ scribe_cli-0.3.0.dist-info/entry_points.txt,sha256=I76vox2zTEO8vr5b1UU6bYfKCjkviEyXMq4cndW8yQc,93
16
+ scribe_cli-0.3.0.dist-info/top_level.txt,sha256=N57j6gzwa6unDew22CFvM3KKQQL0H2itDsc166HHINg,7
17
+ scribe_cli-0.3.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ scribe = scribe.streamer:main
3
+ scribe-install = scribe.install_desktop:main
@@ -0,0 +1 @@
1
+ scribe