scribe-cli 0.5.1__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scribe_cli-0.5.1/scribe_cli.egg-info → scribe_cli-0.6.0}/PKG-INFO +2 -2
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/pyproject.toml +2 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/_version.py +2 -2
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/audio.py +27 -2
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/models.py +35 -7
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/streamer.py +74 -48
- {scribe_cli-0.5.1 → scribe_cli-0.6.0/scribe_cli.egg-info}/PKG-INFO +2 -2
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/.github/workflows/pypi.yml +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/.gitignore +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/LICENSE +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/README.md +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/__init__.py +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/install_desktop.py +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/keyboard.py +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/models.toml +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/saverecording.py +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/testpynput.py +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/util.py +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/SOURCES.txt +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/entry_points.txt +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/requires.txt +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/top_level.txt +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_data/__init__.py +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_data/share/icon.jpg +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_data/templates/scribe.desktop +0 -0
- {scribe_cli-0.5.1 → scribe_cli-0.6.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: scribe-cli
|
|
3
|
-
Version: 0.5.1
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
|
|
5
5
|
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -33,7 +33,7 @@ License: MIT License
|
|
|
33
33
|
licenses of all dependencies before using or distributing this software to
|
|
34
34
|
ensure compliance with their respective terms.
|
|
35
35
|
Project-URL: Homepage, https://github.com/perrette/scribe
|
|
36
|
-
Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
|
|
36
|
+
Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai,keyboard,clipboard
|
|
37
37
|
Classifier: Programming Language :: Python :: 3
|
|
38
38
|
Classifier: Operating System :: OS Independent
|
|
39
39
|
Requires-Python: >=3.9
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import sounddevice as sd
|
|
2
1
|
import queue
|
|
2
|
+
import numpy as np
|
|
3
|
+
import sounddevice as sd
|
|
3
4
|
|
|
4
5
|
|
|
5
6
|
def get_duration(audio_length_bytes, # bytes
|
|
@@ -48,4 +49,28 @@ class Microphone:
|
|
|
48
49
|
return sd.query_devices(self.device, 'input')
|
|
49
50
|
|
|
50
51
|
def get_duraction(self, audio_length_bytes):
|
|
51
|
-
return get_duration(audio_length_bytes, self.samplerate, self.channels, {'int16':2}[self.dtype])
|
|
52
|
+
return get_duration(audio_length_bytes, self.samplerate, self.channels, {'int16':2}[self.dtype])
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def calculate_decibels(data_bytes):
|
|
57
|
+
"""
|
|
58
|
+
Calculate the decibel level of integer-valued audio data.
|
|
59
|
+
|
|
60
|
+
:param data_bytes: Audio data as a bytes object.
|
|
61
|
+
:return: Decibel level of the audio data.
|
|
62
|
+
"""
|
|
63
|
+
# Normalize the integer samples to the range [-1.0, 1.0]
|
|
64
|
+
data = np.frombuffer(data_bytes, dtype=np.int16)
|
|
65
|
+
normalized_data = data / 32768.0
|
|
66
|
+
|
|
67
|
+
# Calculate the RMS value
|
|
68
|
+
rms = np.sqrt(np.mean(np.square(normalized_data)))
|
|
69
|
+
|
|
70
|
+
if rms == 0:
|
|
71
|
+
return -np.inf
|
|
72
|
+
|
|
73
|
+
# Convert RMS to decibels
|
|
74
|
+
db = 20 * np.log10(rms)
|
|
75
|
+
|
|
76
|
+
return db
|
|
@@ -1,8 +1,16 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import json
|
|
3
|
-
import numpy as np
|
|
4
3
|
import time
|
|
4
|
+
from collections import deque
|
|
5
|
+
import numpy as np
|
|
5
6
|
from scribe.util import download_model
|
|
7
|
+
from scribe.audio import calculate_decibels
|
|
8
|
+
|
|
9
|
+
def is_silent(data, silence_thresh=-40):
|
|
10
|
+
"""
|
|
11
|
+
Détermine si un segment audio est un silence en fonction du niveau de volume.
|
|
12
|
+
"""
|
|
13
|
+
return calculate_decibels(data) < silence_thresh
|
|
6
14
|
|
|
7
15
|
VOSK_MODELS_FOLDER = os.path.join(os.environ.get("HOME"),
|
|
8
16
|
".local/share/vosk/language-models")
|
|
@@ -10,22 +18,24 @@ VOSK_MODELS_FOLDER = os.path.join(os.environ.get("HOME"),
|
|
|
10
18
|
|
|
11
19
|
class AbstractTranscriber:
|
|
12
20
|
backend = None
|
|
13
|
-
def __init__(self, model, model_name=None, language=None, samplerate=16000, timeout=None, model_kwargs={}
|
|
21
|
+
def __init__(self, model, model_name=None, language=None, samplerate=16000, timeout=None, model_kwargs={},
|
|
22
|
+
silence_thresh=-40, silence_duration=2, restart_after_silence=False):
|
|
14
23
|
self.model_name = model_name
|
|
15
24
|
self.language = language
|
|
16
25
|
self.model = model
|
|
17
26
|
self.model_kwargs = model_kwargs
|
|
18
27
|
self.samplerate = samplerate
|
|
19
28
|
self.timeout = timeout
|
|
20
|
-
self.
|
|
21
|
-
self.
|
|
22
|
-
self.
|
|
29
|
+
self.silence_thresh = silence_thresh
|
|
30
|
+
self.silence_duration = silence_duration
|
|
31
|
+
self.restart_after_silence = restart_after_silence
|
|
32
|
+
self.reset()
|
|
23
33
|
|
|
24
34
|
def get_elapsed(self):
|
|
25
35
|
return time.time() - self.start_time
|
|
26
36
|
|
|
27
37
|
def is_overtime(self):
|
|
28
|
-
return time.time() - self.start_time > self.timeout
|
|
38
|
+
return self.timeout is not None and time.time() - self.start_time > self.timeout
|
|
29
39
|
|
|
30
40
|
def transcribe_realtime_audio(self, audio_bytes=b""):
|
|
31
41
|
self.audio_buffer += audio_bytes
|
|
@@ -36,6 +46,8 @@ class AbstractTranscriber:
|
|
|
36
46
|
|
|
37
47
|
def reset(self):
|
|
38
48
|
self.audio_buffer = b''
|
|
49
|
+
self.start_time = time.time()
|
|
50
|
+
self.last_sound_time = time.time()
|
|
39
51
|
|
|
40
52
|
def start_recording(self, microphone,
|
|
41
53
|
start_message="Recording... Press Ctrl+C to stop.",
|
|
@@ -50,6 +62,23 @@ class AbstractTranscriber:
|
|
|
50
62
|
while True:
|
|
51
63
|
while not microphone.q.empty():
|
|
52
64
|
data = microphone.q.get()
|
|
65
|
+
|
|
66
|
+
# Vérifier si le segment est un silence
|
|
67
|
+
if is_silent(data, self.silence_thresh):
|
|
68
|
+
silence_duration = time.time() - self.last_sound_time
|
|
69
|
+
|
|
70
|
+
if self.silence_duration is not None and silence_duration >= self.silence_duration and len(self.audio_buffer) > 0:
|
|
71
|
+
if self.restart_after_silence:
|
|
72
|
+
result = self.finalize()
|
|
73
|
+
microphone.q.queue.clear()
|
|
74
|
+
self.reset()
|
|
75
|
+
yield result
|
|
76
|
+
else:
|
|
77
|
+
raise KeyboardInterrupt("Silence detected: {:.2f} seconds".format(silence_duration))
|
|
78
|
+
|
|
79
|
+
else:
|
|
80
|
+
self.last_sound_time = time.time()
|
|
81
|
+
|
|
53
82
|
yield self.transcribe_realtime_audio(data)
|
|
54
83
|
|
|
55
84
|
if self.is_overtime():
|
|
@@ -137,7 +166,6 @@ class WhisperTranscriber(AbstractTranscriber):
|
|
|
137
166
|
super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
|
|
138
167
|
|
|
139
168
|
def transcribe_audio(self, audio_bytes):
|
|
140
|
-
print("\nIf --keyboard is set, change focus to target app NOW !")
|
|
141
169
|
print("Transcribing...")
|
|
142
170
|
audio_array = np.frombuffer(audio_bytes, dtype=np.int16).flatten().astype(np.float32) / 32768.0
|
|
143
171
|
return self.model.transcribe(audio_array, fp16=False, language=self.language)
|
|
@@ -11,47 +11,6 @@ with open(Path(__file__).parent / "models.toml", "rb") as f:
|
|
|
11
11
|
language_config = language_config_default.copy()
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
# Commencer l'enregistrement
|
|
15
|
-
def start_recording(micro, transcriber, clipboard=True, keyboard=False, latency=0):
|
|
16
|
-
|
|
17
|
-
if keyboard:
|
|
18
|
-
try:
|
|
19
|
-
from scribe.keyboard import type_text
|
|
20
|
-
except ImportError:
|
|
21
|
-
keyboard = False
|
|
22
|
-
print("Keyboard simulation is not available.")
|
|
23
|
-
return
|
|
24
|
-
|
|
25
|
-
if clipboard:
|
|
26
|
-
try:
|
|
27
|
-
import pyperclip
|
|
28
|
-
except ImportError:
|
|
29
|
-
clipboard = False
|
|
30
|
-
print("Clipboard simulation is not available.")
|
|
31
|
-
return
|
|
32
|
-
|
|
33
|
-
fulltext = ""
|
|
34
|
-
|
|
35
|
-
greetings = { k: v for k, v in language_config["_meta"].get(transcriber.language, {}).items()
|
|
36
|
-
if v is not None and k.startswith(("start", "stop"))
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
for result in transcriber.start_recording(micro, **greetings):
|
|
40
|
-
|
|
41
|
-
if result.get('text'):
|
|
42
|
-
clear_line()
|
|
43
|
-
print(result.get('text'))
|
|
44
|
-
if keyboard:
|
|
45
|
-
type_text(result['text'] + " ", interval=latency) # Simulate typing
|
|
46
|
-
|
|
47
|
-
if clipboard:
|
|
48
|
-
fulltext += result['text'] + " "
|
|
49
|
-
pyperclip.copy(fulltext)
|
|
50
|
-
|
|
51
|
-
else:
|
|
52
|
-
print_partial(result.get('partial', ''))
|
|
53
|
-
|
|
54
|
-
|
|
55
14
|
def get_default_backend():
|
|
56
15
|
try:
|
|
57
16
|
import vosk
|
|
@@ -153,6 +112,7 @@ def get_transcriber(o, prompt=True):
|
|
|
153
112
|
language=o.language,
|
|
154
113
|
samplerate=o.samplerate,
|
|
155
114
|
timeout=None, # vosk keeps going (no timeout)
|
|
115
|
+
silence_duration=None, # vosk handles silences internally
|
|
156
116
|
model_kwargs={"data_folder": o.data_folder})
|
|
157
117
|
except Exception as error:
|
|
158
118
|
print(error)
|
|
@@ -160,7 +120,7 @@ def get_transcriber(o, prompt=True):
|
|
|
160
120
|
exit(1)
|
|
161
121
|
|
|
162
122
|
elif backend == "whisper":
|
|
163
|
-
transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate, timeout=o.duration)
|
|
123
|
+
transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate, timeout=o.duration, silence_duration=o.silence)
|
|
164
124
|
|
|
165
125
|
else:
|
|
166
126
|
raise ValueError(f"Unknown backend: {backend}")
|
|
@@ -184,16 +144,71 @@ def get_parser():
|
|
|
184
144
|
parser.add_argument("--no-prompt", action="store_false", dest="prompt", help="Disable prompts for backend and model selection and jump to recording")
|
|
185
145
|
|
|
186
146
|
parser.add_argument("--samplerate", default=16000, type=int, help=argparse.SUPPRESS)
|
|
187
|
-
parser.add_argument("--duration", default=60, type=int, help="duration in seconds before whisper models start transcribing (default %(default)ss)")
|
|
188
147
|
parser.add_argument("--keyboard", action="store_true")
|
|
189
148
|
parser.add_argument("--no-clipboard", dest="clipboard", action="store_false")
|
|
190
149
|
parser.add_argument("--latency", default=0, type=float, help="keyboard latency")
|
|
191
150
|
|
|
151
|
+
group = parser.add_argument_group("whisper options")
|
|
152
|
+
group.add_argument("--duration", default=120, type=int, help="Max duration of the whisper recording (default %(default)ss)")
|
|
153
|
+
group.add_argument("--silence", default=2, type=float, help="silence duration that prompt transcription (whisper) (default %(default)ss)")
|
|
154
|
+
group.add_argument("--restart-after-silence", action="store_true", help="Restart the recording after a transcription triggered by a silence")
|
|
155
|
+
|
|
192
156
|
parser.add_argument("--data-folder", help="Folder to store Vosk models.")
|
|
193
157
|
|
|
194
158
|
return parser
|
|
195
159
|
|
|
196
160
|
|
|
161
|
+
# Commencer l'enregistrement
|
|
162
|
+
def start_recording(micro, transcriber, clipboard=True, keyboard=False, latency=0):
|
|
163
|
+
|
|
164
|
+
if keyboard:
|
|
165
|
+
try:
|
|
166
|
+
from scribe.keyboard import type_text
|
|
167
|
+
except ImportError:
|
|
168
|
+
keyboard = False
|
|
169
|
+
print("Keyboard simulation is not available.")
|
|
170
|
+
return
|
|
171
|
+
|
|
172
|
+
print("\nChange focus to target app during transcription.")
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
if clipboard:
|
|
176
|
+
try:
|
|
177
|
+
import pyperclip
|
|
178
|
+
except ImportError:
|
|
179
|
+
clipboard = False
|
|
180
|
+
print("Clipboard simulation is not available.")
|
|
181
|
+
return
|
|
182
|
+
|
|
183
|
+
print("\nThe full transcription will be copied to clipboard as it becomes available.")
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
fulltext = ""
|
|
187
|
+
|
|
188
|
+
greetings = { k: v for k, v in language_config["_meta"].get(transcriber.language, {}).items()
|
|
189
|
+
if v is not None and k.startswith(("start", "stop"))
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
for result in transcriber.start_recording(micro, **greetings):
|
|
193
|
+
|
|
194
|
+
if result.get('text'):
|
|
195
|
+
clear_line()
|
|
196
|
+
print(result.get('text'))
|
|
197
|
+
if keyboard:
|
|
198
|
+
type_text(result['text'] + " ", interval=latency) # Simulate typing
|
|
199
|
+
|
|
200
|
+
if clipboard:
|
|
201
|
+
fulltext += result['text'] + " "
|
|
202
|
+
pyperclip.copy(fulltext)
|
|
203
|
+
|
|
204
|
+
else:
|
|
205
|
+
print_partial(result.get('partial', ''))
|
|
206
|
+
|
|
207
|
+
if clipboard:
|
|
208
|
+
print("Copied to clipboard.")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
|
|
197
212
|
def main(args=None):
|
|
198
213
|
|
|
199
214
|
parser = get_parser()
|
|
@@ -205,18 +220,22 @@ def main(args=None):
|
|
|
205
220
|
|
|
206
221
|
transcriber = None
|
|
207
222
|
|
|
223
|
+
toggle = {True: "On", False: "Off"}
|
|
224
|
+
|
|
208
225
|
while True:
|
|
209
226
|
if transcriber is None:
|
|
210
227
|
transcriber = get_transcriber(o, prompt=o.prompt)
|
|
211
|
-
print(f"
|
|
228
|
+
print(f">>> Model {transcriber.model_name} from {transcriber.backend} selected. Keyboard [{'on' if o.keyboard else 'off'}]. Clipboard [{'on' if o.clipboard else 'off'}] <<<")
|
|
212
229
|
if o.prompt:
|
|
213
230
|
print(f"Choose any of the following actions:")
|
|
214
231
|
print(f"[q] quit")
|
|
215
232
|
print(f"[e] change model")
|
|
216
|
-
print(f"[k] toggle keyboard [{
|
|
217
|
-
print(f"[c] toggle clipboard [{
|
|
233
|
+
print(f"[k] toggle keyboard [{toggle[o.keyboard]}] -> [{toggle[not o.keyboard]}]")
|
|
234
|
+
print(f"[c] toggle clipboard [{toggle[o.clipboard]}] -> [{toggle[not o.clipboard]}]")
|
|
218
235
|
if transcriber.backend == "whisper":
|
|
219
236
|
print(f"[t] change duration (currently {transcriber.timeout}s)")
|
|
237
|
+
print(f"[b] change silence duration (currently {transcriber.silence_duration}s)")
|
|
238
|
+
print(f"[a] toggle auto-restart after silence [{toggle[o.restart_after_silence]}] -> [{toggle[not o.restart_after_silence]}]")
|
|
220
239
|
print(colored(f"Press [Enter] or any other key to start recording.", "BOLD"))
|
|
221
240
|
|
|
222
241
|
key = input()
|
|
@@ -232,9 +251,16 @@ def main(args=None):
|
|
|
232
251
|
o.clipboard = not o.clipboard
|
|
233
252
|
continue
|
|
234
253
|
if key == "t":
|
|
235
|
-
|
|
254
|
+
ans = input(f"Enter new duration in seconds (current: {transcriber.timeout}): ")
|
|
255
|
+
try:
|
|
256
|
+
o.duration = transcriber.timeout = int(ans)
|
|
257
|
+
except:
|
|
258
|
+
print("Invalid duration. Must be an integer.")
|
|
259
|
+
continue
|
|
260
|
+
if key == "b":
|
|
261
|
+
ans = input(f"Enter new silence break duration in seconds (current: {transcriber.silence_duration}): ")
|
|
236
262
|
try:
|
|
237
|
-
o.
|
|
263
|
+
o.silence = transcriber.silence_duration = int(ans)
|
|
238
264
|
except:
|
|
239
265
|
print("Invalid duration. Must be an integer.")
|
|
240
266
|
continue
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: scribe-cli
|
|
3
|
-
Version: 0.5.1
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
|
|
5
5
|
Author-email: Mahé Perrette <mahe.perrette@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -33,7 +33,7 @@ License: MIT License
|
|
|
33
33
|
licenses of all dependencies before using or distributing this software to
|
|
34
34
|
ensure compliance with their respective terms.
|
|
35
35
|
Project-URL: Homepage, https://github.com/perrette/scribe
|
|
36
|
-
Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
|
|
36
|
+
Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai,keyboard,clipboard
|
|
37
37
|
Classifier: Programming Language :: Python :: 3
|
|
38
38
|
Classifier: Operating System :: OS Independent
|
|
39
39
|
Requires-Python: >=3.9
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|