scribe-cli 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scribe/__init__.py +1 -0
- scribe/_version.py +16 -0
- scribe/audio.py +51 -0
- scribe/install_desktop.py +43 -0
- scribe/keyboard.py +19 -0
- scribe/models.py +133 -0
- scribe/models.toml +31 -0
- scribe/saverecording.py +62 -0
- scribe/streamer.py +215 -0
- scribe/testpynput.py +142 -0
- scribe/util.py +185 -0
- scribe_cli-0.3.0.dist-info/LICENSE +29 -0
- scribe_cli-0.3.0.dist-info/METADATA +135 -0
- scribe_cli-0.3.0.dist-info/RECORD +17 -0
- scribe_cli-0.3.0.dist-info/WHEEL +5 -0
- scribe_cli-0.3.0.dist-info/entry_points.txt +3 -0
- scribe_cli-0.3.0.dist-info/top_level.txt +1 -0
scribe/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from ._version import __version__
|
scribe/_version.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# file generated by setuptools_scm
# don't change, don't track in version control
TYPE_CHECKING = False
if not TYPE_CHECKING:
    # Runtime path: a plain placeholder keeps the annotations below valid
    # without importing `typing`.
    VERSION_TUPLE = object
else:
    from typing import Tuple, Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

# Release identifiers, exposed under both the dunder and the plain names.
version = '0.3.0'
__version__ = version
version_tuple = (0, 3, 0)
__version_tuple__ = version_tuple
|
scribe/audio.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import sounddevice as sd
|
|
2
|
+
import queue
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_duration(audio_length_bytes,  # bytes
                 sampling_rate=16000,  # Hz
                 num_channels=1,  # Mono
                 sample_width=2,  # 16-bit audio
                 ):
    """Return the playing time, in seconds, of a raw PCM buffer.

    Args:
        audio_length_bytes: size of the audio buffer in bytes.
        sampling_rate: samples per second.
        num_channels: number of interleaved channels.
        sample_width: bytes used by one sample of one channel.
    """
    bytes_per_frame = num_channels * sample_width
    frame_count = audio_length_bytes / bytes_per_frame
    return frame_count / sampling_rate
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Microphone:
    """Collect raw audio chunks from a sounddevice input stream into a queue.

    The stream callback pushes each chunk, converted to ``bytes``, onto
    ``self.q``; consumers read the chunks from that queue.
    """

    # Bytes per sample for each supported sample format.
    _SAMPLE_WIDTHS = {'uint8': 1, 'int8': 1, 'int16': 2, 'int32': 4, 'float32': 4}

    def __init__(self,
                 samplerate=16000,  # Vosk models typically use a 16kHz sample rate
                 channels=1,  # Mono audio
                 device=None,  # Default device
                 dtype='int16',  # Vosk models typically use 16-bit audio
                 ):
        self.q = queue.Queue()
        self.samplerate = samplerate
        self.channels = channels
        self.device = device
        self.dtype = dtype

    def callback(self, indata, frames, time, status):
        """Stream callback: report any status flag and enqueue the chunk as bytes."""
        if status:
            print(status)
        self.q.put(bytes(indata))

    def open_stream(self):
        """Drop stale chunks and return a new input stream feeding ``self.q``."""
        # Hold the queue's own lock so a concurrent put() cannot interleave
        # with the clear.
        with self.q.mutex:
            self.q.queue.clear()
        return sd.InputStream(samplerate=self.samplerate, device=self.device,
                              channels=self.channels, callback=self.callback,
                              dtype=self.dtype)

    def device_info(self):
        """Return sounddevice's description of the configured input device."""
        return sd.query_devices(self.device, 'input')

    @property
    def sample_width(self):
        """Number of bytes used by one sample of ``self.dtype``.

        Raises:
            ValueError: if the sample format is not one of the known formats.
        """
        try:
            return self._SAMPLE_WIDTHS[self.dtype]
        except (KeyError, TypeError):
            raise ValueError(f"Unsupported sample format: {self.dtype!r}") from None

    def get_duration(self, audio_length_bytes):
        """Return the duration in seconds of ``audio_length_bytes`` bytes of audio."""
        # Inside a method body the bare name resolves to the module-level
        # get_duration() helper, not to this method.
        return get_duration(audio_length_bytes, self.samplerate, self.channels,
                            self.sample_width)

    # Original (misspelled) public name, kept so existing callers keep working.
    get_duraction = get_duration
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import os, sys, platform, shutil, sysconfig
|
|
2
|
+
import argparse
|
|
3
|
+
|
|
4
|
+
def main():
    """Install the scribe icon and desktop launcher for the current user.

    Copies ``share/icon.jpg`` into ``$XDG_DATA_HOME/scribe`` and renders
    ``templates/scribe.desktop`` into ``$XDG_DATA_HOME/applications``. Any
    command-line arguments are forwarded into the template's ``options``
    field. On non-Linux platforms a message is printed and the process exits.
    """
    # Check if the current platform is Linux
    if platform.system() != "Linux":
        print("This package is only supported on Linux systems.", file=sys.stderr)
        sys.exit(0)

    # The text must be passed as `description`: the first positional
    # parameter of ArgumentParser is `prog`, the program name shown in usage.
    parser = argparse.ArgumentParser(
        description="Install the desktop file for the scribe package. Any arguments to this script will be passed on to `scribe`.")
    o, rest = parser.parse_known_args()
    o.arguments = rest

    PACKAGE_NAME = 'scribe'

    HOME = os.environ.get('HOME', os.path.expanduser('~'))
    XDG_SHARE = os.environ.get('XDG_DATA_HOME', os.path.join(HOME, '.local', 'share'))
    XDG_APP_DATA = os.path.join(XDG_SHARE, 'applications')
    XDG_SCRIBE_DATA = os.path.join(XDG_SHARE, PACKAGE_NAME)

    # Create the directories if they don't exist
    os.makedirs(XDG_SCRIBE_DATA, exist_ok=True)
    os.makedirs(XDG_APP_DATA, exist_ok=True)

    # NOTE(review): both source paths below are relative to the current
    # working directory, so this presumably has to be run from a source
    # checkout -- confirm where these files ship.
    print("Copying files to", XDG_SCRIBE_DATA)
    shutil.copy('share/icon.jpg', XDG_SCRIBE_DATA)

    with open('templates/scribe.desktop', encoding="utf-8") as f:
        template = f.read()

    bin_folder = sysconfig.get_path("scripts")
    desktop_file = template.format(XDG_SCRIBE_DATA=XDG_SCRIBE_DATA,
                                   bin_folder=bin_folder,
                                   options=' '.join(o.arguments))

    print("Writing desktop file to", XDG_APP_DATA)
    with open(os.path.join(XDG_APP_DATA, 'scribe.desktop'), "w", encoding="utf-8") as f:
        f.write(desktop_file)


if __name__ == "__main__":
    main()
|
scribe/keyboard.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""This module handles typing characters as if they were typed on a keyboard.
|
|
2
|
+
"""
|
|
3
|
+
try:
    from pynput.keyboard import Controller
except ImportError:
    # NOTE(review): the hint below names "voskrealtime", presumably a former
    # name of this project -- confirm and update the wording.
    print("Please install pynput to use the keyboard feature.")
    print("Alternatively specify [keyboard] optional dependency to voskrealtime, e.g. `pip install -e .[keyboard]`")
    raise
else:
    # Single controller shared by every type_text() call
    keyboard = Controller()
|
|
14
|
+
|
|
15
|
+
def type_text(text, interval=0):
    """Type ``text`` as simulated key presses.

    Args:
        text: the characters to type.
        interval: seconds to pause after each character. With the default of
            0 the whole string is sent in a single call.
    """
    if not interval or interval <= 0:
        keyboard.type(text)
        return

    import time

    # Send one character at a time so the requested pause is honoured.
    for char in text:
        keyboard.type(char)
        time.sleep(interval)
|
scribe/models.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import numpy as np
|
|
4
|
+
from scribe.util import download_model
|
|
5
|
+
|
|
6
|
+
# Default folder for downloaded Vosk models. expanduser() is used rather than
# os.environ.get("HOME") because the latter returns None when HOME is unset,
# which makes os.path.join() raise at import time.
VOSK_MODELS_FOLDER = os.path.join(os.path.expanduser("~"),
                                  ".local/share/vosk/language-models")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AbstractTranscriber:
    """Base class for speech-to-text backends.

    Subclasses implement ``transcribe_audio``, ``transcribe_realtime_audio``
    and ``finalize``; ``start_recording`` drives them from a microphone queue.
    """
    backend = None

    # Seconds to block on the microphone queue before checking again.
    poll_interval = 0.1

    def __init__(self, model, model_name=None, language=None, samplerate=16000, model_kwargs=None):
        self.model_name = model_name
        self.language = language
        self.model = model
        # A fresh dict per instance: a `{}` default would be shared by all.
        self.model_kwargs = {} if model_kwargs is None else model_kwargs
        self.samplerate = samplerate

    def transcribe_audio(self, audio_data):
        """Transcribe a complete audio buffer. Must be overridden."""
        raise NotImplementedError()

    def transcribe_realtime_audio(self, audio_data):
        """Consume one chunk of streamed audio. Must be overridden."""
        raise NotImplementedError()

    def finalize(self):
        """Return the result for the audio received so far. Must be overridden."""
        raise NotImplementedError()

    def start_recording(self, microphone,
                        start_message="Recording... Press Ctrl+C to stop.",
                        stop_message="Stopped recording."):
        """Yield a transcription result for every chunk read from ``microphone``.

        Chunks are taken from ``microphone.q`` while its stream is open and
        passed to ``transcribe_realtime_audio``. On Ctrl+C the loop stops,
        the result of ``finalize()`` is yielded last and the queue is emptied.
        If the consumer closes the generator early, it ends without yielding
        again.
        """
        import queue

        with microphone.open_stream():
            print(start_message)

            try:
                while True:
                    # Block with a timeout instead of spinning on q.empty(),
                    # which would keep a CPU core busy while nobody speaks.
                    try:
                        data = microphone.q.get(timeout=self.poll_interval)
                    except queue.Empty:
                        continue
                    yield self.transcribe_realtime_audio(data)

            except KeyboardInterrupt:
                pass

            # Deliberately not in a `finally`: yielding while the generator is
            # being closed raises RuntimeError.
            result = self.finalize()
            with microphone.q.mutex:
                microphone.q.queue.clear()
            yield result

        print(stop_message)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_vosk_model(model, data_folder=None, url=None):
    """Load a Vosk model by name, downloading it first if it is not on disk.

    Args:
        model: model name, which is also its directory name inside ``data_folder``.
        data_folder: folder holding the models; defaults to ``VOSK_MODELS_FOLDER``.
        url: archive to download when the model is missing; defaults to the
            model's zip file on alphacephei.com.

    Raises:
        FileNotFoundError: if the model directory is still missing after the
            download.
    """
    import vosk
    if data_folder is None:
        data_folder = VOSK_MODELS_FOLDER
    model_path = os.path.join(data_folder, model)
    if not os.path.exists(model_path):
        if url is None:
            url = f"https://alphacephei.com/vosk/models/{model}.zip"
        download_model(url, data_folder)
        # Explicit check rather than `assert`, which is removed under -O.
        if not os.path.exists(model_path):
            raise FileNotFoundError(
                f"Model folder {model_path} not found after downloading {url}")

    return vosk.Model(model_path)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def get_vosk_recognizer(model, samplerate=16000):
    """Create a Kaldi recognizer for ``model`` and audio sampled at ``samplerate`` Hz."""
    from vosk import KaldiRecognizer

    recognizer = KaldiRecognizer(model, samplerate)
    return recognizer
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class VoskTranscriber(AbstractTranscriber):
    """Transcriber backed by a Vosk recognizer.

    Result dictionaries carry a ``"text"`` key for finished text and a
    ``"partial"`` key for text that is still being recognised.
    """
    backend = "vosk"

    def __init__(self, model_name, model=None, model_kwargs=None, **kwargs):
        # A fresh dict per instance: a `{}` default would be shared by all.
        if model_kwargs is None:
            model_kwargs = {}
        if model is None:
            model = get_vosk_model(model_name, **model_kwargs)
        super().__init__(model, model_name, model_kwargs=model_kwargs, **kwargs)
        self.recognizer = get_vosk_recognizer(model, self.samplerate)

    def transcribe_realtime_audio(self, audio_bytes=b"", finalize=False):
        """Feed one chunk to the recognizer and return its result dictionary.

        When the recognizer reports a completed result, that result is
        returned unchanged. Otherwise the partial result is returned; with
        ``finalize=True`` its ``"partial"`` entry is renamed to ``"text"``.
        """
        final = self.recognizer.AcceptWaveform(audio_bytes)
        raw = self.recognizer.Result() if final else self.recognizer.PartialResult()
        result_dict = json.loads(raw)

        if final:
            return result_dict

        if finalize:
            # Promote what has been recognised so far to the final text.
            result_dict["text"] = result_dict.pop("partial", "")
        else:
            # Partial results must not carry a "text" entry.
            result_dict.pop("text", None)
        return result_dict

    def transcribe_audio(self, audio_data=None):
        """Feed ``audio_data`` (nothing when omitted) and return a result with ``"text"``."""
        if audio_data is None:
            audio_data = b""
        return self.transcribe_realtime_audio(audio_data, finalize=True)

    def finalize(self):
        """Return the text recognised so far without feeding more audio."""
        # NOTE(review): the recognizer is not reset here -- confirm whether
        # leftover partial state can leak into the next recording.
        return self.transcribe_audio(b"")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class WhisperTranscriber(AbstractTranscriber):
    """Transcriber backed by a Whisper model.

    Streamed chunks are accumulated in ``audio_buffer`` and transcribed in one
    go, either when ``max_duration`` seconds have been collected or when
    ``finalize()`` is called.
    """
    backend = "whisper"

    def __init__(self, model_name, language=None, model=None, model_kwargs=None, **kwargs):
        import whisper
        # A fresh dict per instance: a `{}` default would be shared by all.
        if model_kwargs is None:
            model_kwargs = {}
        if model is None:
            model = whisper.load_model(model_name)
        super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
        # bytearray grows in place; `bytes +=` would copy the whole buffer on
        # every incoming chunk.
        self.audio_buffer = bytearray()

    def transcribe_realtime_audio(self, audio_bytes=b"", max_duration=60):
        """Buffer one chunk; transcribe once ``max_duration`` seconds are stored.

        Returns a ``{"partial": ...}`` progress message while buffering and
        the result of ``finalize()`` once the buffer is long enough.
        """
        self.audio_buffer += audio_bytes

        one_second = self.samplerate * 2  # 16-bit audio, 1 channel ~ 32000 bytes
        if len(self.audio_buffer) < max_duration * one_second:
            return {"partial": f"{len(self.audio_buffer)} bytes received (duration: {len(self.audio_buffer) / one_second:.2f} seconds)"}

        return self.finalize()

    def transcribe_audio(self, audio_bytes):
        """Transcribe a buffer of 16-bit PCM audio with the Whisper model."""
        print("\nTranscribing...")
        # Scale int16 samples to float32 in [-1, 1).
        audio_array = np.frombuffer(audio_bytes, dtype=np.int16).flatten().astype(np.float32) / 32768.0
        return self.model.transcribe(audio_array, fp16=False, language=self.language)

    def finalize(self):
        """Transcribe and reset the buffer; an empty buffer yields empty text."""
        if not self.audio_buffer:
            return {"text": ""}
        # Pass an immutable snapshot so the buffer can be replaced afterwards.
        result = self.transcribe_audio(bytes(self.audio_buffer))
        self.audio_buffer = bytearray()
        return result
|
scribe/models.toml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[vosk.en]
|
|
2
|
+
model = "vosk-model-en-us-0.42-gigaspeech"
|
|
3
|
+
|
|
4
|
+
[vosk.fr]
|
|
5
|
+
model = "vosk-model-fr-0.22"
|
|
6
|
+
|
|
7
|
+
[vosk.de]
|
|
8
|
+
model = "vosk-model-de-tuda-0.6-900k"
|
|
9
|
+
|
|
10
|
+
[vosk.it]
|
|
11
|
+
model = "vosk-model-it-0.22"
|
|
12
|
+
|
|
13
|
+
[_meta.en]
|
|
14
|
+
language = "English (US)"
|
|
15
|
+
start_message = "Listening... Press Ctrl+C to stop."
|
|
16
|
+
stop_message = "Recording stopped."
|
|
17
|
+
|
|
18
|
+
[_meta.fr]
|
|
19
|
+
language = "French"
|
|
20
|
+
start_message = "En écoute... Appuyez sur Ctrl+C pour arrêter."
|
|
21
|
+
stop_message = "Écoute arrêtée."
|
|
22
|
+
|
|
23
|
+
[_meta.de]
|
|
24
|
+
language = "German"
|
|
25
|
+
start_message = "Hören... Drücken Sie Strg+C, um zu stoppen."
|
|
26
|
+
stop_message = "Aufnahme gestoppt."
|
|
27
|
+
|
|
28
|
+
[_meta.it]
|
|
29
|
+
language = "Italian"
|
|
30
|
+
start_message = "In ascolto... Premere Ctrl+C per interrompere."
|
|
31
|
+
stop_message = "Registrazione interrotta."
|
scribe/saverecording.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import sounddevice as sd
|
|
2
|
+
import time
|
|
3
|
+
import numpy as np
|
|
4
|
+
from pydub import AudioSegment
|
|
5
|
+
import io
|
|
6
|
+
|
|
7
|
+
# Recording parameters
samplerate = 16000  # Hz; Vosk models typically use 16 kHz
channels = 1  # mono
duration = 5  # seconds
chunk_size = 1024  # samples per chunk (adjust this based on your needs)

# Buffer that receives every recorded sample; float32 for better precision
full_audio = np.empty(0, dtype=np.float32)
|
|
15
|
+
|
|
16
|
+
# Callback function to collect audio data into the buffer
|
|
17
|
+
def callback(indata, frames, time, status):
    """Stream callback: append the first channel of ``indata`` to ``full_audio``."""
    global full_audio
    if status:
        print(status)
    # Keep only the first channel of the incoming block.
    first_channel = indata[:, 0]
    full_audio = np.concatenate((full_audio, first_channel))
|
|
23
|
+
|
|
24
|
+
# Function to record audio for a fixed duration
|
|
25
|
+
def record_for_duration(duration):
    """Record from the default input for ``duration`` seconds, then save an MP3.

    Ctrl-C ends the recording early; whatever was captured is still saved.
    """
    milliseconds = duration * 1000
    try:
        with sd.InputStream(samplerate=samplerate, channels=channels, callback=callback):
            print(f"Recording for {duration} seconds or interrupt with Ctrl-C")
            # The callback fills the buffer while this thread sleeps.
            sd.sleep(milliseconds)
    except KeyboardInterrupt:
        pass
    print("Recording finished.")
    save_audio_as_mp3()
|
|
34
|
+
|
|
35
|
+
# Function to save the recorded audio as an MP3 file
|
|
36
|
+
def save_audio_as_mp3():
    """Encode the samples in ``full_audio`` as MP3 and write ``recording.mp3``."""
    # Clamp to [-1, 1] before scaling: values outside that range would wrap
    # around when converted to int16 and produce loud clicks.
    clipped = np.clip(full_audio, -1.0, 1.0)
    audio_16bit = (clipped * 32767).astype(np.int16)  # Scale to the 16-bit range

    # In-memory target for the encoded audio
    audio_data = io.BytesIO()

    # Wrap the raw 16-bit mono samples in a pydub AudioSegment
    audio_segment = AudioSegment(
        audio_16bit.tobytes(),
        frame_rate=samplerate,
        sample_width=2,  # 16-bit audio
        channels=1
    )

    # Export as MP3
    audio_segment.export(audio_data, format="mp3")

    # Save to a file
    with open("recording.mp3", "wb") as f:
        f.write(audio_data.getvalue())
    print("Recording saved as recording.mp3")
|
|
59
|
+
|
|
60
|
+
# Example: record for 5 minutes (5 * 60 seconds). Guarded so that importing
# this module does not start a recording.
if __name__ == "__main__":
    record_for_duration(5 * 60)
|
|
62
|
+
|
scribe/streamer.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import tomllib
|
|
3
|
+
import argparse
|
|
4
|
+
from scribe.audio import Microphone
|
|
5
|
+
from scribe.util import print_partial, clear_line, prompt_choices, check_dependencies, ansi_link, colored
|
|
6
|
+
from scribe.models import VoskTranscriber, WhisperTranscriber
|
|
7
|
+
|
|
8
|
+
# Bundled defaults, read from the models.toml file next to this module
with Path(__file__).with_name("models.toml").open("rb") as f:
    language_config_default = tomllib.load(f)

# Working copy (shallow) of the defaults
language_config = dict(language_config_default)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Start recording
def start_recording(micro, transcriber, keyboard=False, latency=0):
    """Run one recording session and print (optionally type) the transcription.

    Args:
        micro: microphone handed to ``transcriber.start_recording``.
        transcriber: transcriber producing result dictionaries with a
            ``"text"`` or a ``"partial"`` entry.
        keyboard: when true, finished text is also typed through
            ``scribe.keyboard.type_text``; the process exits with status 1 if
            that module cannot be imported.
        latency: passed to ``type_text`` as ``interval``.
    """
    if keyboard:
        try:
            from scribe.keyboard import type_text
        except ImportError:
            # SystemExit is raised directly: the bare `exit` helper is
            # installed by the `site` module and is not always available.
            raise SystemExit(1)

    # Localised start/stop messages for the transcriber's language, if any
    greetings = {k: v for k, v in language_config["_meta"].get(transcriber.language, {}).items()
                 if v is not None and k.startswith(("start", "stop"))
                 }

    for result in transcriber.start_recording(micro, **greetings):
        text = result.get('text')
        if text:
            clear_line()
            print(text)
            if keyboard:
                type_text(text + " ", interval=latency)  # Simulate typing
        else:
            print_partial(result.get('partial', ''))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_default_backend():
    """Return "vosk" if it can be imported, otherwise "whisper".

    Raises:
        ImportError: if neither package can be imported.
    """
    try:
        import vosk
    except ImportError:
        pass
    else:
        return "vosk"

    try:
        import whisper
    except ImportError:
        raise ImportError("Please install either vosk or whisper to use this script.")
    return "whisper"
|
|
49
|
+
|
|
50
|
+
# Backends offered to the user, in menu order
BACKENDS: list[str] = ["whisper", "vosk"]
# Backends whose dependency check failed; get_transcriber() appends to it
UNAVAILABLE_BACKENDS: list[str] = []
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def pick_specialist_model(model, language, backend):
    """Return the English-only variant of a whisper model when one applies.

    The ".en" variant is chosen only for the whisper backend, when the
    language is "en" or "english" (case-insensitive) and the variant is
    listed; otherwise ``model`` is returned unchanged.
    """
    if backend != "whisper" or not language:
        return model
    if language.lower() not in ("en", "english"):
        return model

    english_variants = ["tiny.en", "base.en", "small.en", "medium.en", "large", "turbo"]
    candidate = model + ".en"
    return candidate if candidate in english_variants else model
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_transcriber(o, prompt=True):
    """Select a backend and model, then build the matching transcriber.

    Args:
        o: parsed command-line options; ``backend``, ``model``, ``language``,
            ``samplerate`` and ``data_folder`` are read.
        prompt: when true, anything not given on the command line is asked
            interactively; when false, defaults are used.

    Returns:
        A ``VoskTranscriber`` or ``WhisperTranscriber``.

    Raises:
        SystemExit: with status 1 if no usable backend or model is found.
        ValueError: if the selected backend name is unknown.
    """
    if o.backend:
        checked_backend = check_dependencies(o.backend)
        if not checked_backend:
            print(f"Backend {o.backend} is not available.")
            raise SystemExit(1)
        backend = o.backend

    elif not prompt:
        # Nothing requested and no prompting: use the first listed backend
        # whose dependencies are installed.
        backend = next((name for name in BACKENDS
                        if check_dependencies(name, raise_error=False)), None)
        if backend is None:
            print("No backend is available.")
            raise SystemExit(1)

    else:
        checked_backend = False
        while not checked_backend:
            backend = prompt_choices(BACKENDS, o.backend, "backend", UNAVAILABLE_BACKENDS)
            # raise an error if the user has explicitly selected a backend that is not available
            checked_backend = check_dependencies(backend, raise_error=backend == o.backend)
            if not checked_backend:
                print(f"Backend {backend} is not available.")
                UNAVAILABLE_BACKENDS.append(backend)

    print(f"Selected backend: {backend}")

    if o.model:
        model = pick_specialist_model(o.model, o.language, backend)

    else:

        if backend == "vosk":
            available_languages = list(language_config[backend])
            if o.language:
                if o.language not in available_languages:
                    print(f"Language '{o.language}' is not pre-defined (yet) for backend '{backend}'.")
                    print("Yet it may actually exist.")
                    print(f"Please choose the model explictly from {ansi_link('https://alphacephei.com/vosk/models')}.")
                    print("Or pick one of the pre-defined languages: ", " ".join(available_languages))
                    raise SystemExit(1)
                choices = [language_config[backend][o.language]["model"]]
                default_model = choices[0]

            else:
                available_models = [language_config[backend][lang]["model"] for lang in available_languages]
                choices = list(zip(available_models, available_languages)) + [f" * [Any model from {ansi_link('https://alphacephei.com/vosk/models')}]"]
                default_model = choices[0]

            print(f"For information about vosk models see: {ansi_link('https://alphacephei.com/vosk/models')}")
            if prompt:
                model = prompt_choices(choices, default=default_model, label="model")
            else:
                model = default_model

            # Menu entries may be (model, language) pairs; only the model
            # name is wanted from here on.
            if isinstance(model, (tuple, list)):
                model = model[0]

        elif backend == "whisper":

            models = ["tiny", "base", "small", "medium", "large", "turbo"]
            english_models = ["tiny.en", "base.en", "small.en", "medium.en"]
            default_model = "small"

            print("Some models have a specialized English version (.en) which will be selected as default is `-l en` was requested, but can also be requested explicitly below (option not listed). See [documentation](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages).")
            if prompt:
                model = prompt_choices(models, default=default_model, label="model",
                                       hidden_models=english_models)
            else:
                model = default_model

        model = pick_specialist_model(model, o.language, backend)

    print(f"Selected model: {model}")

    if backend == "vosk":
        try:
            transcriber = VoskTranscriber(model_name=model,
                                          language=o.language,
                                          samplerate=o.samplerate,
                                          model_kwargs={"data_folder": o.data_folder})
        except Exception as error:
            print(error)
            print(f"Failed to (down)load model {model}.")
            raise SystemExit(1)

    elif backend == "whisper":
        transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate)

    else:
        raise ValueError(f"Unknown backend: {backend}")

    return transcriber
|
|
152
|
+
|
|
153
|
+
def get_parser():
    """Build the command-line parser for the scribe streamer."""
    cli = argparse.ArgumentParser()
    add = cli.add_argument

    # Backend and model selection
    add("--backend", choices=BACKENDS,
        help="Choose the backend to use for speech recognition (will be prompted otherwise).")
    add("--model",
        help="""For vosk, any model from https://alphacephei.com/vosk/models,
        e.g. 'vosk-model-small-en-us-0.15'.
        For whisper, see https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages""")
    add("-l", "--language", choices=list(language_config["vosk"]),
        help="An alias for preselected models when using the vosk backend, or 'en' for the English version of whisper models.")

    # Interaction
    add("--no-prompt", action="store_false", dest="prompt",
        help="Disable prompts for backend and model selection and jump to recording")

    # Audio capture and keyboard output
    add("--samplerate", default=16000, type=int, help=argparse.SUPPRESS)
    add("--keyboard", action="store_true")
    add("--latency", default=0, type=float, help="keyboard latency")

    # Storage
    add("--data-folder", help="Folder to store Vosk models.")

    return cli
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def main(args=None):
    """Command-line entry point: choose a model, then record in a loop.

    Args:
        args: argument list for the parser; ``None`` uses ``sys.argv``.

    After each recording the user may quit, change the model, or record
    again. The loop ends through ``SystemExit``.
    """
    parser = get_parser()
    o = parser.parse_args(args)

    # Set up the microphone for recording
    micro = Microphone(samplerate=o.samplerate)

    transcriber = None

    while True:
        if transcriber is None:
            transcriber = get_transcriber(o, prompt=o.prompt)
        print(f"[ Model {transcriber.model_name} from {transcriber.backend} selected. ]")
        if o.prompt:
            print("Choose any of the following actions:")
            print("[q] quit")
            print("[e] change model")
            print(colored("Press [Enter] or any other key to start recording.", "BOLD"))

            try:
                key = input()
            except EOFError:
                # stdin is closed, so no choice can be made: quit cleanly.
                raise SystemExit(0)
            # SystemExit is raised directly: the bare `exit` helper is
            # installed by the `site` module and is not always available.
            if key == "q":
                raise SystemExit(0)
            if key == "e":
                transcriber = None
                continue
        start_recording(micro, transcriber, keyboard=o.keyboard, latency=o.latency)

        # if we arrived so far, that means we pressed Ctrl + C anyway, and need Enter to move on.
        # So we leave the wider range of options to change the model.
        o.prompt = True
        o.backend = None
        o.model = None
        o.language = None


if __name__ == "__main__":
    main()
|
scribe/testpynput.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
|
|
3
|
+
def type_text(text):
    """Type ``text`` by invoking the ``ydotool type`` command."""
    command = ["ydotool", "type", text]
    subprocess.run(command)


# Example usage
type_text("Hello, World!")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
import pyautogui
|
|
11
|
+
import time
|
|
12
|
+
|
|
13
|
+
# Type a sample string with pyautogui, pausing 0.1 s between characters
pyautogui.write(
    'Hello, pyautogui!',
    interval=0.1,
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# # Simulate pressing and releasing the Enter key
|
|
19
|
+
# pyautogui.press('enter')
|
|
20
|
+
|
|
21
|
+
# # Simulate pressing and releasing the 'A' key
|
|
22
|
+
# pyautogui.press('a')
|
|
23
|
+
|
|
24
|
+
# # Simulate a hotkey combination (e.g., Ctrl+C)
|
|
25
|
+
# pyautogui.hotkey('ctrl', 'c')
|
|
26
|
+
|
|
27
|
+
# Hello, World!
|
|
28
|
+
|
|
29
|
+
# # import os
|
|
30
|
+
# # import subprocess
|
|
31
|
+
# # from evdev import UInput, ecodes as e
|
|
32
|
+
# # # Function to create and initialize a virtual keyboard
|
|
33
|
+
# # def create_virtual_keyboard():
|
|
34
|
+
# # capabilities = {
|
|
35
|
+
# # e.EV_KEY: [e.KEY_A, e.KEY_B, e.KEY_C, e.KEY_1, e.KEY_2, e.KEY_3]
|
|
36
|
+
# # }
|
|
37
|
+
# # return UInput(capabilities)
|
|
38
|
+
# # # Function to send a keystroke
|
|
39
|
+
# # def send_keystroke(virtual_keyboard, key):
|
|
40
|
+
# # virtual_keyboard.write(e.EV_KEY, key, 1) # Key down
|
|
41
|
+
# # virtual_keyboard.write(e.EV_KEY, key, 0) # Key up
|
|
42
|
+
# # virtual_keyboard.syn()
|
|
43
|
+
|
|
44
|
+
# # # Main script execution
|
|
45
|
+
# # if __name__ == "__main__":
|
|
46
|
+
# # try:
|
|
47
|
+
# # vk = create_virtual_keyboard()
|
|
48
|
+
# # print("Virtual keyboard initialized.")
|
|
49
|
+
# # send_keystroke(vk, e.KEY_A) # Sends the 'A' key
|
|
50
|
+
# # send_keystroke(vk, e.KEY_B) # Sends the 'B' key
|
|
51
|
+
# # finally:
|
|
52
|
+
# # vk.close()
|
|
53
|
+
# # print("Virtual keyboard closed.")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# import os
|
|
57
|
+
# import subprocess
|
|
58
|
+
# from evdev import UInput, ecodes as e
|
|
59
|
+
|
|
60
|
+
# # Function to create and initialize a virtual keyboard
|
|
61
|
+
# def create_virtual_keyboard():
|
|
62
|
+
# capabilities = {
|
|
63
|
+
# e.EV_KEY: [e.KEY_A, e.KEY_B, e.KEY_C, e.KEY_1, e.KEY_2, e.KEY_3]
|
|
64
|
+
# }
|
|
65
|
+
# try:
|
|
66
|
+
# ui = UInput(capabilities, name="virtual-keyboard")
|
|
67
|
+
# print("Virtual keyboard created successfully.")
|
|
68
|
+
# return ui
|
|
69
|
+
# except Exception as ex:
|
|
70
|
+
# print(f"Failed to create virtual keyboard: {ex}")
|
|
71
|
+
# return None
|
|
72
|
+
|
|
73
|
+
# # Function to send a keystroke
|
|
74
|
+
# def send_keystroke(virtual_keyboard, key):
|
|
75
|
+
# try:
|
|
76
|
+
# virtual_keyboard.write(e.EV_KEY, key, 1) # Key down
|
|
77
|
+
# virtual_keyboard.write(e.EV_KEY, key, 0) # Key up
|
|
78
|
+
# virtual_keyboard.syn()
|
|
79
|
+
# print(f"Sent keystroke: {key}")
|
|
80
|
+
# except Exception as ex:
|
|
81
|
+
# print(f"Failed to send keystroke: {ex}")
|
|
82
|
+
|
|
83
|
+
# # Main script execution
|
|
84
|
+
# if __name__ == "__main__":
|
|
85
|
+
# vk = create_virtual_keyboard()
|
|
86
|
+
# if vk:
|
|
87
|
+
# try:
|
|
88
|
+
# send_keystroke(vk, e.KEY_A) # Sends the 'A' key
|
|
89
|
+
# send_keystroke(vk, e.KEY_B) # Sends the 'B' key
|
|
90
|
+
# finally:
|
|
91
|
+
# vk.close()
|
|
92
|
+
# print("Virtual keyboard closed.")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
from pynput.keyboard import Controller, Key
|
|
96
|
+
import time
|
|
97
|
+
|
|
98
|
+
# Controller used by all the pynput examples below
keyboard = Controller()

# Type a whole string
keyboard.type("Hello, World!")

# Press and release the Enter key
keyboard.press(Key.enter)
keyboard.release(Key.enter)

# Press and release the 'a' key
keyboard.press('a')
keyboard.release('a')

# Press and release 'c' while Ctrl is held down
with keyboard.pressed(Key.ctrl):
    keyboard.press('c')
    keyboard.release('c')
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# import time
|
|
120
|
+
# import subprocess
|
|
121
|
+
|
|
122
|
+
# def send_key_event(event_device, key_code):
|
|
123
|
+
# # Press the key
|
|
124
|
+
# subprocess.run(["sudo", "evemu-event", event_device, "--type", "EV_KEY", "--code", key_code, "--value", "1", "--sync"])
|
|
125
|
+
# # Release the key
|
|
126
|
+
# subprocess.run(["sudo", "evemu-event", event_device, "--type", "EV_KEY", "--code", key_code, "--value", "0", "--sync"])
|
|
127
|
+
|
|
128
|
+
# # Example usage
|
|
129
|
+
# event_device = "/dev/input/event3" # Your keyboard device
|
|
130
|
+
# send_key_event(event_device, "KEY_A")
|
|
131
|
+
|
|
132
|
+
# for letter in "Hello, World!":
|
|
133
|
+
# if letter:
|
|
134
|
+
# if letter == " ":
|
|
135
|
+
# send_key_event(event_device, "KEY_SPACE")
|
|
136
|
+
# elif letter == ",":
|
|
137
|
+
# send_key_event(event_device, "KEY_COMMA")
|
|
138
|
+
# elif letter == "!":
|
|
139
|
+
# send_key_event(event_device, "KEY_1")
|
|
140
|
+
# else:
|
|
141
|
+
# send_key_event(event_device, f"KEY_{letter.upper()}")
|
|
142
|
+
# time.sleep(0.1)
|
scribe/util.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import tqdm
|
|
4
|
+
import shutil
|
|
5
|
+
from functools import partial
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class bcolors:
    # ANSI escape codes for terminal styling
    # https://stackoverflow.com/a/287944/2192272

    # Foreground colours
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'

    # Reset all attributes
    ENDC = '\033[0m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
|
|
18
|
+
|
|
19
|
+
def strip_colors(s):
    """Return ``s`` with every escape code defined on ``bcolors`` removed."""
    codes = (code for attr, code in vars(bcolors).items()
             if not attr.startswith("_"))
    for code in codes:
        s = s.replace(code, '')
    return s
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def ansi_link(uri, label=None):
    """Wrap ``label`` (``uri`` itself when omitted) in an OSC 8 hyperlink.

    https://stackoverflow.com/a/71309268/2192272
    """
    text = uri if label is None else label

    # OSC 8 ; params ; URI ST <name> OSC 8 ;; ST  (params left empty)
    return f'\033]8;;{uri}\033\\{text}\033]8;;\033\\'
|
|
38
|
+
|
|
39
|
+
def colored(text, color):
    """Wrap ``text`` in a colour code followed by the reset code.

    ``color`` is looked up as an attribute name on ``bcolors``; if no such
    attribute exists the value is used as the prefix as-is.
    """
    prefix = getattr(bcolors, color, color)
    return f"{prefix}{text}{bcolors.ENDC}"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Matches one complete OSC 8 hyperlink as produced by ansi_link()
ANSI_LINK_RE = re.compile(r'(?P<ansi_sequence>\033]8;(?P<parameter>.*?);(?P<uri>.*?)\033\\(?P<label>.*?)\033]8;;\033\\)')


def strip_ansi_link(s):
    """Return ``s`` with every OSC 8 hyperlink replaced by its label."""
    for link in ANSI_LINK_RE.finditer(s):
        s = s.replace(link.group('ansi_sequence'), link.group('label'))
    return s
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def strip_all(s):
    """Return ``s`` without colour codes and with hyperlinks reduced to their labels."""
    return strip_ansi_link(strip_colors(s))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Function to clear the terminal line
|
|
60
|
+
def clear_line():
    """Blank the current terminal line and leave the cursor at its start."""
    width = shutil.get_terminal_size().columns
    blank = " " * width
    # Overwrite the line with spaces, then return to column 0.
    print("\r" + blank + "\r", end="")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def print_partial(msg):
    """Print ``msg`` on the current line after a "[...] " prefix.

    If the message is too long for the terminal width, only its tail is shown.
    """
    columns = shutil.get_terminal_size().columns
    # Characters that do not fit once the prefix is accounted for
    overflow = len(msg) + 7 - columns
    tail = msg[max(0, overflow):]
    print(f"\r[...] {tail}", end="")
|
|
72
|
+
|
|
73
|
+
def check_status_code(status_code):
    """Return True for a 2xx HTTP status code, False otherwise."""
    return 200 <= status_code < 300
|
|
78
|
+
|
|
79
|
+
def download_model(url, data_folder, timeout=30):
    """Download a zip archive from ``url`` and unpack it into ``data_folder``.

    The archive is buffered entirely in memory while a progress bar is
    shown, then extracted with ``zipfile``.

    Parameters:
        url: address of the zip archive to fetch.
        data_folder: destination directory; created if it does not exist.
        timeout: seconds passed to ``requests.get`` so that a stalled server
            raises instead of blocking forever.

    Raises:
        RuntimeError: if the HTTP status is not 2xx, or if the number of
            bytes received differs from a non-zero ``content-length`` header.
    """
    import requests
    import zipfile
    import io

    os.makedirs(data_folder, exist_ok=True)

    print(f"Downloading model from {url}...")

    with io.BytesIO() as temp_file:
        # The context manager releases the streamed connection even when
        # the download is interrupted by an exception.
        with requests.get(url, stream=True, timeout=timeout) as response:

            # check the URL was correct:
            if not check_status_code(response.status_code):
                raise RuntimeError(f"Model download failed with error {response.status_code}")

            total_size = int(response.headers.get('content-length', 0))
            block_size = 1024  # 1 Kibibyte

            # Using the bar as a context manager closes it on error too.
            with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True) as t:
                for data in response.iter_content(block_size):
                    t.update(len(data))
                    temp_file.write(data)
                downloaded = t.n

        temp_file.seek(0)

        # check the file was downloaded correctly (only possible when the
        # server announced a size)
        if total_size != 0 and downloaded != total_size:
            raise RuntimeError("Model download size is 0 or less than stated size")

        with zipfile.ZipFile(temp_file) as z:
            z.extractall(data_folder)

    print(f"Model downloaded and unpacked to {data_folder}")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def format_choice(enum, default=None, unavailable=None):
    """Format one menu entry from an ``(index, value)`` pair.

    Parameters:
        enum: ``(i, value)`` pair as produced by ``enumerate``. ``value`` is
            either a plain entry or a tuple/list whose first item is the
            name and whose remaining items are joined with `` | `` inside
            parentheses.
        default: the entry rendered in bold with a ``[Press Enter]`` hint.
            When None, the entry at index 0 gets that rendering.
        unavailable: optional collection of entries rendered as unavailable
            (no number, ``FAIL`` color).

    Returns:
        The formatted line. Numbers shown to the user are ``i + 1``.
    """
    i, value = enum
    # isinstance (rather than an exact type match) so that tuple/list
    # subclasses such as namedtuples are formatted as name + aliases too.
    if isinstance(value, (tuple, list)):
        value_str = f" {value[0]} ({' | '.join(value[1:])})"
    else:
        value_str = value

    if (default is not None and value == default) or (default is None and i == 0):
        return ' ' + colored(f'({i+1}) {value_str} [Press Enter]', 'BOLD')
    elif unavailable and value in unavailable:
        return ' ' + colored(f'{" "} {value_str} -> unavailable !!', 'FAIL')
    else:
        return f' ({i+1}) {value_str}'
|
|
127
|
+
|
|
128
|
+
def is_integer(value):
    """Tell whether ``int(value)`` succeeds.

    A ``ValueError`` or ``TypeError`` from the conversion yields False.
    """
    try:
        int(value)
    except (ValueError, TypeError):
        return False
    else:
        return True
|
|
134
|
+
|
|
135
|
+
def prompt_choices(choices, default=None, label="value", unavailable_choices=None, hidden_models=None):
    """Interactively ask the user to pick one entry and return it.

    The menu lists the available choices followed by the unavailable ones
    and is shown again until the answer is accepted.

    Parameters:
        choices: list of entries; an entry may be a tuple/list, in which
            case its first item is what gets returned.
        default: entry selected when the user just presses Enter; falls back
            to the first available choice.
        label: word used in the prompt and in the "Invalid ..." message.
        unavailable_choices: entries displayed but excluded from selection.
        hidden_models: extra accepted answers that are not displayed.

    Returns:
        The selected entry (its first item when it is a list or tuple).

    If any available choice contains ``"*"``, any typed answer that does not
    itself contain ``"*"`` is accepted as-is.
    """
    value = None
    if unavailable_choices is None:
        unavailable_choices = []
        available_choices = choices
    else:
        available_choices = [c for c in choices if c not in unavailable_choices]

    wildcard = any("*" in choice for choice in available_choices)

    while (value not in (available_choices + (hidden_models or []))) or ("*" in value):
        if value:
            print(f"Invalid {label}: {value}")

        # Joined outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        menu = '\n'.join(
            format_choice(item, default=default, unavailable=unavailable_choices)
            for item in enumerate(available_choices + unavailable_choices))
        value = input(f"""Please choose a {label}:
{menu}
(type number or any name or alias or press [Enter])...
""")
        if value == "":
            value = default or available_choices[0]

        if is_integer(value):
            index = int(value) - 1
            # Explicit range check: "0" or a negative number would otherwise
            # wrap around and select an entry counted from the end.
            if not 0 <= index < len(available_choices):
                continue
            value = available_choices[index]

        if "*" in value:
            continue

        # can match any other choice so we break
        if wildcard:
            break

    assert "*" not in value
    return value[0] if isinstance(value, (list, tuple)) else value
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def check_dependencies(backend, dependencies=None, raise_error=False):
    """Check that the modules required by a backend can be imported.

    Parameters:
        backend: module name imported when ``dependencies`` is empty or None.
        dependencies: optional list of module names to import instead.
        raise_error: when True, the ``ImportError`` is re-raised instead of
            being reported through the return value.

    Returns:
        True if every module imported, False on the first ``ImportError``.

    Raises:
        ImportError: only when ``raise_error`` is True.
    """
    from importlib import import_module
    modules = dependencies or [backend]
    try:
        for module in modules:
            import_module(module)
    except ImportError:
        # if requested by the user, raise an Exception
        if raise_error:
            raise
        return False
    return True
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Mahé Perrette
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
Note: This project relies on external packages that may have more restrictive
|
|
26
|
+
licenses. For example, the `pynput` package is licensed under LGPLv3, which
|
|
27
|
+
has different requirements compared to the MIT License. Please review the
|
|
28
|
+
licenses of all dependencies before using or distributing this software to
|
|
29
|
+
ensure compliance with their respective terms.
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: scribe-cli
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
|
|
5
|
+
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2024 Mahé Perrette
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
Note: This project relies on external packages that may have more restrictive
|
|
31
|
+
licenses. For example, the `pynput` package is licensed under LGPLv3, which
|
|
32
|
+
has different requirements compared to the MIT License. Please review the
|
|
33
|
+
licenses of all dependencies before using or distributing this software to
|
|
34
|
+
ensure compliance with their respective terms.
|
|
35
|
+
Project-URL: Homepage, https://github.com/perrette/scribe
|
|
36
|
+
Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
|
|
37
|
+
Classifier: Programming Language :: Python :: 3
|
|
38
|
+
Classifier: Operating System :: OS Independent
|
|
39
|
+
Requires-Python: >=3.9
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
License-File: LICENSE
|
|
42
|
+
Requires-Dist: numpy
|
|
43
|
+
Requires-Dist: sounddevice
|
|
44
|
+
Requires-Dist: tqdm
|
|
45
|
+
Requires-Dist: requests
|
|
46
|
+
Provides-Extra: keyboard
|
|
47
|
+
Requires-Dist: pynput; extra == "keyboard"
|
|
48
|
+
Provides-Extra: whisper
|
|
49
|
+
Requires-Dist: openai-whisper; extra == "whisper"
|
|
50
|
+
Provides-Extra: vosk
|
|
51
|
+
Requires-Dist: vosk; extra == "vosk"
|
|
52
|
+
Provides-Extra: all
|
|
53
|
+
Requires-Dist: pynput; extra == "all"
|
|
54
|
+
Requires-Dist: openai-whisper; extra == "all"
|
|
55
|
+
Requires-Dist: vosk; extra == "all"
|
|
56
|
+
|
|
57
|
+
# Scribe
|
|
58
|
+
|
|
59
|
+
`scribe` is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
Install PortAudio library. E.g. on Ubuntu:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
sudo apt-get install portaudio19-dev
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
The python dependencies should be dealt with automatically:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install "scribe-cli[all]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
(note the `-cli` suffix for client)
|
|
76
|
+
|
|
77
|
+
or for local development:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
git clone https://github.com/perrette/scribe.git
|
|
81
|
+
cd scribe
|
|
82
|
+
pip install -e .[all]
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
You can leave the optional dependencies (leave out `[all]`) but must install at least one of `vosk` or `openai-whisper` packages (see Usage below).
|
|
86
|
+
|
|
87
|
+
The `vosk` language models will download on-the-fly.
|
|
88
|
+
The default data folder is `$HOME/.local/share/vosk/language-models`.
|
|
89
|
+
This can be modified.
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
## Usage
|
|
93
|
+
|
|
94
|
+
Just type in the terminal:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
scribe
|
|
98
|
+
```
|
|
99
|
+
and the script will guide you through the choice of backend (`whisper` or `vosk`) and the specific language model.
|
|
100
|
+
After this, you will be prompted to start recording your microphone and print the transcribed text in real-time (`vosk`)
|
|
101
|
+
or until after recording is complete (`whisper`).
|
|
102
|
+
You can interrupt the recording via Ctrl + C and start again or change model.
|
|
103
|
+
|
|
104
|
+
The default (`whisper`) is excellent at transcribing full-length audio sequences in [many languages](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages). It is really impressive,
|
|
105
|
+
but it cannot do real-time out of the box, and depending on the model can have relatively long execution time, especially with the `turbo` model (at least on my laptop with CPU only). The `small` model is also excellent and runs much faster. It is selected as default in `scribe` for that reason.
|
|
106
|
+
With the `whisper` model you need to stop the recording manually before the transcription occurs (Ctrl + C), though after
|
|
107
|
+
60 seconds it will stop automatically (and try to continue afterward).
|
|
108
|
+
|
|
109
|
+
The `vosk` backend is good at
|
|
110
|
+
doing real-time transcription for one language, but tended to make more mistakes in my tests and it does not do punctuation.
|
|
111
|
+
There are many [vosk models](https://alphacephei.com/vosk/models) available, and here a few are associated to [a handful of languages](scribe/models.toml) `en`, `fr`, `it`, `de` (so far).
|
|
112
|
+
|
|
113
|
+
To skip the initial selection menu you can do:
|
|
114
|
+
```bash
|
|
115
|
+
scribe --backend whisper --model small --no-prompt
|
|
116
|
+
```
|
|
117
|
+
where `--no-prompt` jumps right to the recording (after the first interruption, you can still choose to change the backend and model).
|
|
118
|
+
|
|
119
|
+
### Advanced usage as keyboard replacement
|
|
120
|
+
|
|
121
|
+
With the `--keyboard` option `scribe` will attempt to simulate a keyboard and send transcribed characters to the application under focus:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
scribe --keyboard
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
It relies on the optional `pynput` dependency (installed together with `scribe` if you used the `[all]` or `[keyboard]` option).
|
|
128
|
+
|
|
129
|
+
`pynput` may require [some configuration](https://pynput.readthedocs.io/en/latest/limitations.html) (I *think* I got it to work with `xhost +SI:localuser:$(whoami)` as far as the display is concerned). It has [limitations](https://pynput.readthedocs.io/en/latest/limitations.html). On my Ubuntu + Wayland system it works in Chromium-based applications (including VS Code), but it does not work in Firefox, Sublime Text, or anything else I tried (not even in a terminal!).
|
|
130
|
+
Workarounds include using the Xorg version of GNOME... Suggestions welcome.
|
|
131
|
+
|
|
132
|
+
### Start as an application in Ubuntu
|
|
133
|
+
|
|
134
|
+
If you run Ubuntu (or else?) with GNOME, the script `scribe-install [...]` will create a `scribe.desktop` file and place it under `$HOME/.local/share/applications`
|
|
135
|
+
to make it available from the quick launch menu. Any option will be passed on to `scribe`.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
scribe/__init__.py,sha256=WjDmvecyDIyJLYp4rCV9vsSYbQDc4L1EpYqORvEXliI,33
|
|
2
|
+
scribe/_version.py,sha256=Jk2iAU7m-7Vx9XV1TtdD9ZoJraIncDq_4_Wd-qtUotg,411
|
|
3
|
+
scribe/audio.py,sha256=PXtkRwVlyEf-m3eVIH61nlCpfiiZvFVJQWdW4mhL6bY,1716
|
|
4
|
+
scribe/install_desktop.py,sha256=iKZjIddi_Y-xmg-mrrdZvwrec4gwS2TiNvdg1Uu2YqQ,1491
|
|
5
|
+
scribe/keyboard.py,sha256=jhiPv927Pydnta1Q8QaoPWNCQpojDZmvuAZ8UOo1TyM,581
|
|
6
|
+
scribe/models.py,sha256=bJVp_2t_UG996NBWh2bQf5_44ny4Mqr-6kCyS0w7qNA,4466
|
|
7
|
+
scribe/models.toml,sha256=H5IvYx03QbaZpgPuyl08RC_t24FqN-ZKfS5bnCMVLSc,715
|
|
8
|
+
scribe/saverecording.py,sha256=20QyJNMb1kzGkLXajBnBxIXMMji0Bp3Do69bHZD8S7g,2137
|
|
9
|
+
scribe/streamer.py,sha256=-4hWI6lX3Kz1qvc8NSqtofIlOkOVN2mS-JqCC4eHCzg,8223
|
|
10
|
+
scribe/testpynput.py,sha256=J1GlX7ns2yMeEfhoq_BVKHQ6REcTKhS8OMt2aak1RfY,4062
|
|
11
|
+
scribe/util.py,sha256=ep-5W99tMltG4SAbdCMIB3qI98zVyb3M170OOlSXR_c,5322
|
|
12
|
+
scribe_cli-0.3.0.dist-info/LICENSE,sha256=76NFkiJg6-f0qj4s4YSIkHcWho3eckzmOKIbEFMOM-U,1426
|
|
13
|
+
scribe_cli-0.3.0.dist-info/METADATA,sha256=y3PMQDX44HnAKEAcFjpOTsr8zUbgkXfsegMuvmAjk1Y,6316
|
|
14
|
+
scribe_cli-0.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
15
|
+
scribe_cli-0.3.0.dist-info/entry_points.txt,sha256=I76vox2zTEO8vr5b1UU6bYfKCjkviEyXMq4cndW8yQc,93
|
|
16
|
+
scribe_cli-0.3.0.dist-info/top_level.txt,sha256=N57j6gzwa6unDew22CFvM3KKQQL0H2itDsc166HHINg,7
|
|
17
|
+
scribe_cli-0.3.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
scribe
|