scribe-cli 0.5.0__tar.gz → 0.6.0__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (27)
  1. {scribe_cli-0.5.0/scribe_cli.egg-info → scribe_cli-0.6.0}/PKG-INFO +2 -2
  2. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/pyproject.toml +2 -0
  3. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/_version.py +2 -2
  4. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/audio.py +27 -2
  5. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/models.py +39 -9
  6. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/streamer.py +76 -50
  7. {scribe_cli-0.5.0 → scribe_cli-0.6.0/scribe_cli.egg-info}/PKG-INFO +2 -2
  8. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/.github/workflows/pypi.yml +0 -0
  9. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/.gitignore +0 -0
  10. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/LICENSE +0 -0
  11. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/README.md +0 -0
  12. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/__init__.py +0 -0
  13. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/install_desktop.py +0 -0
  14. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/keyboard.py +0 -0
  15. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/models.toml +0 -0
  16. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/saverecording.py +0 -0
  17. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/testpynput.py +0 -0
  18. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/util.py +0 -0
  19. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe_cli.egg-info/SOURCES.txt +0 -0
  20. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
  21. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe_cli.egg-info/entry_points.txt +0 -0
  22. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe_cli.egg-info/requires.txt +0 -0
  23. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe_cli.egg-info/top_level.txt +0 -0
  24. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe_data/__init__.py +0 -0
  25. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe_data/share/icon.jpg +0 -0
  26. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe_data/templates/scribe.desktop +0 -0
  27. {scribe_cli-0.5.0 → scribe_cli-0.6.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: scribe-cli
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -33,7 +33,7 @@ License: MIT License
33
33
  licenses of all dependencies before using or distributing this software to
34
34
  ensure compliance with their respective terms.
35
35
  Project-URL: Homepage, https://github.com/perrette/scribe
36
- Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
36
+ Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai,keyboard,clipboard
37
37
  Classifier: Programming Language :: Python :: 3
38
38
  Classifier: Operating System :: OS Independent
39
39
  Requires-Python: >=3.9
@@ -34,6 +34,8 @@ keywords = [
34
34
  "vosk",
35
35
  "whisper",
36
36
  "openai",
37
+ "keyboard",
38
+ "clipboard",
37
39
  ]
38
40
 
39
41
  [tool.setuptools]
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.5.0'
16
- __version_tuple__ = version_tuple = (0, 5, 0)
15
+ __version__ = version = '0.6.0'
16
+ __version_tuple__ = version_tuple = (0, 6, 0)
@@ -1,5 +1,6 @@
1
- import sounddevice as sd
2
1
  import queue
2
+ import numpy as np
3
+ import sounddevice as sd
3
4
 
4
5
 
5
6
  def get_duration(audio_length_bytes, # bytes
@@ -48,4 +49,28 @@ class Microphone:
48
49
  return sd.query_devices(self.device, 'input')
49
50
 
50
51
  def get_duraction(self, audio_length_bytes):
51
- return get_duration(audio_length_bytes, self.samplerate, self.channels, {'int16':2}[self.dtype])
52
+ return get_duration(audio_length_bytes, self.samplerate, self.channels, {'int16':2}[self.dtype])
53
+
54
+
55
+
56
+ def calculate_decibels(data_bytes):
57
+ """
58
+ Calculate the decibel level of integer-valued audio data.
59
+
60
+ :param data_bytes: Audio data as a bytes object.
61
+ :return: Decibel level of the audio data.
62
+ """
63
+ # Normalize the integer samples to the range [-1.0, 1.0]
64
+ data = np.frombuffer(data_bytes, dtype=np.int16)
65
+ normalized_data = data / 32768.0
66
+
67
+ # Calculate the RMS value
68
+ rms = np.sqrt(np.mean(np.square(normalized_data)))
69
+
70
+ if rms == 0:
71
+ return -np.inf
72
+
73
+ # Convert RMS to decibels
74
+ db = 20 * np.log10(rms)
75
+
76
+ return db
@@ -1,7 +1,16 @@
1
1
  import os
2
2
  import json
3
+ import time
4
+ from collections import deque
3
5
  import numpy as np
4
6
  from scribe.util import download_model
7
+ from scribe.audio import calculate_decibels
8
+
9
+ def is_silent(data, silence_thresh=-40):
10
+ """
11
+ Détermine si un segment audio est un silence en fonction du niveau de volume.
12
+ """
13
+ return calculate_decibels(data) < silence_thresh
5
14
 
6
15
  VOSK_MODELS_FOLDER = os.path.join(os.environ.get("HOME"),
7
16
  ".local/share/vosk/language-models")
@@ -9,21 +18,24 @@ VOSK_MODELS_FOLDER = os.path.join(os.environ.get("HOME"),
9
18
 
10
19
  class AbstractTranscriber:
11
20
  backend = None
12
- def __init__(self, model, model_name=None, language=None, samplerate=16000, max_duration=None, model_kwargs={}):
21
+ def __init__(self, model, model_name=None, language=None, samplerate=16000, timeout=None, model_kwargs={},
22
+ silence_thresh=-40, silence_duration=2, restart_after_silence=False):
13
23
  self.model_name = model_name
14
24
  self.language = language
15
25
  self.model = model
16
26
  self.model_kwargs = model_kwargs
17
27
  self.samplerate = samplerate
18
- self.max_duration = max_duration
19
- self.one_second_bytes = self.samplerate * 2 # 16-bit audio, 1 channel ~ 32000 bytes
20
- self.audio_buffer = b''
28
+ self.timeout = timeout
29
+ self.silence_thresh = silence_thresh
30
+ self.silence_duration = silence_duration
31
+ self.restart_after_silence = restart_after_silence
32
+ self.reset()
21
33
 
22
- def get_elapsed(self, size=None):
23
- return len(size or self.audio_buffer) / self.one_second_bytes
34
+ def get_elapsed(self):
35
+ return time.time() - self.start_time
24
36
 
25
- def is_overtime(self, elapsed=None, size=None):
26
- return self.max_duration and (elapsed or self.get_elapsed(size)) > self.max_duration
37
+ def is_overtime(self):
38
+ return self.timeout is not None and time.time() - self.start_time > self.timeout
27
39
 
28
40
  def transcribe_realtime_audio(self, audio_bytes=b""):
29
41
  self.audio_buffer += audio_bytes
@@ -34,6 +46,8 @@ class AbstractTranscriber:
34
46
 
35
47
  def reset(self):
36
48
  self.audio_buffer = b''
49
+ self.start_time = time.time()
50
+ self.last_sound_time = time.time()
37
51
 
38
52
  def start_recording(self, microphone,
39
53
  start_message="Recording... Press Ctrl+C to stop.",
@@ -48,6 +62,23 @@ class AbstractTranscriber:
48
62
  while True:
49
63
  while not microphone.q.empty():
50
64
  data = microphone.q.get()
65
+
66
+ # Vérifier si le segment est un silence
67
+ if is_silent(data, self.silence_thresh):
68
+ silence_duration = time.time() - self.last_sound_time
69
+
70
+ if self.silence_duration is not None and silence_duration >= self.silence_duration and len(self.audio_buffer) > 0:
71
+ if self.restart_after_silence:
72
+ result = self.finalize()
73
+ microphone.q.queue.clear()
74
+ self.reset()
75
+ yield result
76
+ else:
77
+ raise KeyboardInterrupt("Silence detected: {:.2f} seconds".format(silence_duration))
78
+
79
+ else:
80
+ self.last_sound_time = time.time()
81
+
51
82
  yield self.transcribe_realtime_audio(data)
52
83
 
53
84
  if self.is_overtime():
@@ -135,7 +166,6 @@ class WhisperTranscriber(AbstractTranscriber):
135
166
  super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
136
167
 
137
168
  def transcribe_audio(self, audio_bytes):
138
- print("\nIf --keyboard is set, change focus to target app NOW !")
139
169
  print("Transcribing...")
140
170
  audio_array = np.frombuffer(audio_bytes, dtype=np.int16).flatten().astype(np.float32) / 32768.0
141
171
  return self.model.transcribe(audio_array, fp16=False, language=self.language)
@@ -11,47 +11,6 @@ with open(Path(__file__).parent / "models.toml", "rb") as f:
11
11
  language_config = language_config_default.copy()
12
12
 
13
13
 
14
- # Commencer l'enregistrement
15
- def start_recording(micro, transcriber, clipboard=True, keyboard=False, latency=0):
16
-
17
- if keyboard:
18
- try:
19
- from scribe.keyboard import type_text
20
- except ImportError:
21
- keyboard = False
22
- print("Keyboard simulation is not available.")
23
- return
24
-
25
- if clipboard:
26
- try:
27
- import pyperclip
28
- except ImportError:
29
- clipboard = False
30
- print("Clipboard simulation is not available.")
31
- return
32
-
33
- fulltext = ""
34
-
35
- greetings = { k: v for k, v in language_config["_meta"].get(transcriber.language, {}).items()
36
- if v is not None and k.startswith(("start", "stop"))
37
- }
38
-
39
- for result in transcriber.start_recording(micro, **greetings):
40
-
41
- if result.get('text'):
42
- clear_line()
43
- print(result.get('text'))
44
- if keyboard:
45
- type_text(result['text'] + " ", interval=latency) # Simulate typing
46
-
47
- if clipboard:
48
- fulltext += result['text'] + " "
49
- pyperclip.copy(fulltext)
50
-
51
- else:
52
- print_partial(result.get('partial', ''))
53
-
54
-
55
14
  def get_default_backend():
56
15
  try:
57
16
  import vosk
@@ -152,7 +111,8 @@ def get_transcriber(o, prompt=True):
152
111
  transcriber = VoskTranscriber(model_name=model,
153
112
  language=o.language,
154
113
  samplerate=o.samplerate,
155
- max_duration=None, # vosk keeps going (no timeout)
114
+ timeout=None, # vosk keeps going (no timeout)
115
+ silence_duration=None, # vosk handles silences internally
156
116
  model_kwargs={"data_folder": o.data_folder})
157
117
  except Exception as error:
158
118
  print(error)
@@ -160,7 +120,7 @@ def get_transcriber(o, prompt=True):
160
120
  exit(1)
161
121
 
162
122
  elif backend == "whisper":
163
- transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate, max_duration=o.duration)
123
+ transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate, timeout=o.duration, silence_duration=o.silence)
164
124
 
165
125
  else:
166
126
  raise ValueError(f"Unknown backend: {backend}")
@@ -184,16 +144,71 @@ def get_parser():
184
144
  parser.add_argument("--no-prompt", action="store_false", dest="prompt", help="Disable prompts for backend and model selection and jump to recording")
185
145
 
186
146
  parser.add_argument("--samplerate", default=16000, type=int, help=argparse.SUPPRESS)
187
- parser.add_argument("--duration", default=60, type=int, help="duration in seconds before whisper models start transcribing (default %(default)ss)")
188
147
  parser.add_argument("--keyboard", action="store_true")
189
148
  parser.add_argument("--no-clipboard", dest="clipboard", action="store_false")
190
149
  parser.add_argument("--latency", default=0, type=float, help="keyboard latency")
191
150
 
151
+ group = parser.add_argument_group("whisper options")
152
+ group.add_argument("--duration", default=120, type=int, help="Max duration of the whisper recording (default %(default)ss)")
153
+ group.add_argument("--silence", default=2, type=float, help="silence duration that prompt transcription (whisper) (default %(default)ss)")
154
+ group.add_argument("--restart-after-silence", action="store_true", help="Restart the recording after a transcription triggered by a silence")
155
+
192
156
  parser.add_argument("--data-folder", help="Folder to store Vosk models.")
193
157
 
194
158
  return parser
195
159
 
196
160
 
161
+ # Commencer l'enregistrement
162
+ def start_recording(micro, transcriber, clipboard=True, keyboard=False, latency=0):
163
+
164
+ if keyboard:
165
+ try:
166
+ from scribe.keyboard import type_text
167
+ except ImportError:
168
+ keyboard = False
169
+ print("Keyboard simulation is not available.")
170
+ return
171
+
172
+ print("\nChange focus to target app during transcription.")
173
+
174
+
175
+ if clipboard:
176
+ try:
177
+ import pyperclip
178
+ except ImportError:
179
+ clipboard = False
180
+ print("Clipboard simulation is not available.")
181
+ return
182
+
183
+ print("\nThe full transcription will be copied to clipboard as it becomes available.")
184
+
185
+
186
+ fulltext = ""
187
+
188
+ greetings = { k: v for k, v in language_config["_meta"].get(transcriber.language, {}).items()
189
+ if v is not None and k.startswith(("start", "stop"))
190
+ }
191
+
192
+ for result in transcriber.start_recording(micro, **greetings):
193
+
194
+ if result.get('text'):
195
+ clear_line()
196
+ print(result.get('text'))
197
+ if keyboard:
198
+ type_text(result['text'] + " ", interval=latency) # Simulate typing
199
+
200
+ if clipboard:
201
+ fulltext += result['text'] + " "
202
+ pyperclip.copy(fulltext)
203
+
204
+ else:
205
+ print_partial(result.get('partial', ''))
206
+
207
+ if clipboard:
208
+ print("Copied to clipboard.")
209
+
210
+
211
+
197
212
  def main(args=None):
198
213
 
199
214
  parser = get_parser()
@@ -205,18 +220,22 @@ def main(args=None):
205
220
 
206
221
  transcriber = None
207
222
 
223
+ toggle = {True: "On", False: "Off"}
224
+
208
225
  while True:
209
226
  if transcriber is None:
210
227
  transcriber = get_transcriber(o, prompt=o.prompt)
211
- print(f"[ Model {transcriber.model_name} from {transcriber.backend} selected. Keyboard [{'on' if o.keyboard else 'off'}]. Clipboard [{'on' if o.clipboard else 'off'}]]")
228
+ print(f">>> Model {transcriber.model_name} from {transcriber.backend} selected. Keyboard [{'on' if o.keyboard else 'off'}]. Clipboard [{'on' if o.clipboard else 'off'}] <<<")
212
229
  if o.prompt:
213
230
  print(f"Choose any of the following actions:")
214
231
  print(f"[q] quit")
215
232
  print(f"[e] change model")
216
- print(f"[k] toggle keyboard [{'off' if o.keyboard else 'on'}]")
217
- print(f"[c] toggle clipboard [{'off' if o.clipboard else 'on'}]")
233
+ print(f"[k] toggle keyboard [{toggle[o.keyboard]}] -> [{toggle[not o.keyboard]}]")
234
+ print(f"[c] toggle clipboard [{toggle[o.clipboard]}] -> [{toggle[not o.clipboard]}]")
218
235
  if transcriber.backend == "whisper":
219
- print(f"[t] change duration (currently {transcriber.max_duration}s)")
236
+ print(f"[t] change duration (currently {transcriber.timeout}s)")
237
+ print(f"[b] change silence duration (currently {transcriber.silence_duration}s)")
238
+ print(f"[a] toggle auto-restart after silence [{toggle[o.restart_after_silence]}] -> [{toggle[not o.restart_after_silence]}]")
220
239
  print(colored(f"Press [Enter] or any other key to start recording.", "BOLD"))
221
240
 
222
241
  key = input()
@@ -232,9 +251,16 @@ def main(args=None):
232
251
  o.clipboard = not o.clipboard
233
252
  continue
234
253
  if key == "t":
235
- duration = input(f"Enter new duration in seconds (current: {transcriber.max_duration}): ")
254
+ ans = input(f"Enter new duration in seconds (current: {transcriber.timeout}): ")
255
+ try:
256
+ o.duration = transcriber.timeout = int(ans)
257
+ except:
258
+ print("Invalid duration. Must be an integer.")
259
+ continue
260
+ if key == "b":
261
+ ans = input(f"Enter new silence break duration in seconds (current: {transcriber.silence_duration}): ")
236
262
  try:
237
- o.duration = transcriber.max_duration = int(duration)
263
+ o.silence = transcriber.silence_duration = int(ans)
238
264
  except:
239
265
  print("Invalid duration. Must be an integer.")
240
266
  continue
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: scribe-cli
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -33,7 +33,7 @@ License: MIT License
33
33
  licenses of all dependencies before using or distributing this software to
34
34
  ensure compliance with their respective terms.
35
35
  Project-URL: Homepage, https://github.com/perrette/scribe
36
- Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
36
+ Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai,keyboard,clipboard
37
37
  Classifier: Programming Language :: Python :: 3
38
38
  Classifier: Operating System :: OS Independent
39
39
  Requires-Python: >=3.9
File without changes
File without changes
File without changes
File without changes
File without changes