scribe-cli 0.5.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {scribe_cli-0.5.1/scribe_cli.egg-info → scribe_cli-0.6.0}/PKG-INFO +2 -2
  2. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/pyproject.toml +2 -0
  3. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/_version.py +2 -2
  4. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/audio.py +27 -2
  5. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/models.py +35 -7
  6. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/streamer.py +74 -48
  7. {scribe_cli-0.5.1 → scribe_cli-0.6.0/scribe_cli.egg-info}/PKG-INFO +2 -2
  8. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/.github/workflows/pypi.yml +0 -0
  9. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/.gitignore +0 -0
  10. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/LICENSE +0 -0
  11. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/README.md +0 -0
  12. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/__init__.py +0 -0
  13. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/install_desktop.py +0 -0
  14. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/keyboard.py +0 -0
  15. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/models.toml +0 -0
  16. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/saverecording.py +0 -0
  17. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/testpynput.py +0 -0
  18. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe/util.py +0 -0
  19. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/SOURCES.txt +0 -0
  20. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/dependency_links.txt +0 -0
  21. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/entry_points.txt +0 -0
  22. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/requires.txt +0 -0
  23. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_cli.egg-info/top_level.txt +0 -0
  24. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_data/__init__.py +0 -0
  25. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_data/share/icon.jpg +0 -0
  26. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/scribe_data/templates/scribe.desktop +0 -0
  27. {scribe_cli-0.5.1 → scribe_cli-0.6.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: scribe-cli
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -33,7 +33,7 @@ License: MIT License
33
33
  licenses of all dependencies before using or distributing this software to
34
34
  ensure compliance with their respective terms.
35
35
  Project-URL: Homepage, https://github.com/perrette/scribe
36
- Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
36
+ Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai,keyboard,clipboard
37
37
  Classifier: Programming Language :: Python :: 3
38
38
  Classifier: Operating System :: OS Independent
39
39
  Requires-Python: >=3.9
@@ -34,6 +34,8 @@ keywords = [
34
34
  "vosk",
35
35
  "whisper",
36
36
  "openai",
37
+ "keyboard",
38
+ "clipboard",
37
39
  ]
38
40
 
39
41
  [tool.setuptools]
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.5.1'
16
- __version_tuple__ = version_tuple = (0, 5, 1)
15
+ __version__ = version = '0.6.0'
16
+ __version_tuple__ = version_tuple = (0, 6, 0)
@@ -1,5 +1,6 @@
1
- import sounddevice as sd
2
1
  import queue
2
+ import numpy as np
3
+ import sounddevice as sd
3
4
 
4
5
 
5
6
  def get_duration(audio_length_bytes, # bytes
@@ -48,4 +49,28 @@ class Microphone:
48
49
  return sd.query_devices(self.device, 'input')
49
50
 
50
51
  def get_duraction(self, audio_length_bytes):
51
- return get_duration(audio_length_bytes, self.samplerate, self.channels, {'int16':2}[self.dtype])
52
+ return get_duration(audio_length_bytes, self.samplerate, self.channels, {'int16':2}[self.dtype])
53
+
54
+
55
+
56
+ def calculate_decibels(data_bytes):
57
+ """
58
+ Calculate the decibel level of integer-valued audio data.
59
+
60
+ :param data_bytes: Audio data as a bytes object.
61
+ :return: Decibel level of the audio data.
62
+ """
63
+ # Normalize the integer samples to the range [-1.0, 1.0]
64
+ data = np.frombuffer(data_bytes, dtype=np.int16)
65
+ normalized_data = data / 32768.0
66
+
67
+ # Calculate the RMS value
68
+ rms = np.sqrt(np.mean(np.square(normalized_data)))
69
+
70
+ if rms == 0:
71
+ return -np.inf
72
+
73
+ # Convert RMS to decibels
74
+ db = 20 * np.log10(rms)
75
+
76
+ return db
@@ -1,8 +1,16 @@
1
1
  import os
2
2
  import json
3
- import numpy as np
4
3
  import time
4
+ from collections import deque
5
+ import numpy as np
5
6
  from scribe.util import download_model
7
+ from scribe.audio import calculate_decibels
8
+
9
+ def is_silent(data, silence_thresh=-40):
10
+ """
11
+ Détermine si un segment audio est un silence en fonction du niveau de volume.
12
+ """
13
+ return calculate_decibels(data) < silence_thresh
6
14
 
7
15
  VOSK_MODELS_FOLDER = os.path.join(os.environ.get("HOME"),
8
16
  ".local/share/vosk/language-models")
@@ -10,22 +18,24 @@ VOSK_MODELS_FOLDER = os.path.join(os.environ.get("HOME"),
10
18
 
11
19
  class AbstractTranscriber:
12
20
  backend = None
13
- def __init__(self, model, model_name=None, language=None, samplerate=16000, timeout=None, model_kwargs={}):
21
+ def __init__(self, model, model_name=None, language=None, samplerate=16000, timeout=None, model_kwargs={},
22
+ silence_thresh=-40, silence_duration=2, restart_after_silence=False):
14
23
  self.model_name = model_name
15
24
  self.language = language
16
25
  self.model = model
17
26
  self.model_kwargs = model_kwargs
18
27
  self.samplerate = samplerate
19
28
  self.timeout = timeout
20
- self.one_second_bytes = self.samplerate * 2 # 16-bit audio, 1 channel ~ 32000 bytes
21
- self.audio_buffer = b''
22
- self.start_time = time.time()
29
+ self.silence_thresh = silence_thresh
30
+ self.silence_duration = silence_duration
31
+ self.restart_after_silence = restart_after_silence
32
+ self.reset()
23
33
 
24
34
  def get_elapsed(self):
25
35
  return time.time() - self.start_time
26
36
 
27
37
  def is_overtime(self):
28
- return time.time() - self.start_time > self.timeout
38
+ return self.timeout is not None and time.time() - self.start_time > self.timeout
29
39
 
30
40
  def transcribe_realtime_audio(self, audio_bytes=b""):
31
41
  self.audio_buffer += audio_bytes
@@ -36,6 +46,8 @@ class AbstractTranscriber:
36
46
 
37
47
  def reset(self):
38
48
  self.audio_buffer = b''
49
+ self.start_time = time.time()
50
+ self.last_sound_time = time.time()
39
51
 
40
52
  def start_recording(self, microphone,
41
53
  start_message="Recording... Press Ctrl+C to stop.",
@@ -50,6 +62,23 @@ class AbstractTranscriber:
50
62
  while True:
51
63
  while not microphone.q.empty():
52
64
  data = microphone.q.get()
65
+
66
+ # Vérifier si le segment est un silence
67
+ if is_silent(data, self.silence_thresh):
68
+ silence_duration = time.time() - self.last_sound_time
69
+
70
+ if self.silence_duration is not None and silence_duration >= self.silence_duration and len(self.audio_buffer) > 0:
71
+ if self.restart_after_silence:
72
+ result = self.finalize()
73
+ microphone.q.queue.clear()
74
+ self.reset()
75
+ yield result
76
+ else:
77
+ raise KeyboardInterrupt("Silence detected: {:.2f} seconds".format(silence_duration))
78
+
79
+ else:
80
+ self.last_sound_time = time.time()
81
+
53
82
  yield self.transcribe_realtime_audio(data)
54
83
 
55
84
  if self.is_overtime():
@@ -137,7 +166,6 @@ class WhisperTranscriber(AbstractTranscriber):
137
166
  super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
138
167
 
139
168
  def transcribe_audio(self, audio_bytes):
140
- print("\nIf --keyboard is set, change focus to target app NOW !")
141
169
  print("Transcribing...")
142
170
  audio_array = np.frombuffer(audio_bytes, dtype=np.int16).flatten().astype(np.float32) / 32768.0
143
171
  return self.model.transcribe(audio_array, fp16=False, language=self.language)
@@ -11,47 +11,6 @@ with open(Path(__file__).parent / "models.toml", "rb") as f:
11
11
  language_config = language_config_default.copy()
12
12
 
13
13
 
14
- # Commencer l'enregistrement
15
- def start_recording(micro, transcriber, clipboard=True, keyboard=False, latency=0):
16
-
17
- if keyboard:
18
- try:
19
- from scribe.keyboard import type_text
20
- except ImportError:
21
- keyboard = False
22
- print("Keyboard simulation is not available.")
23
- return
24
-
25
- if clipboard:
26
- try:
27
- import pyperclip
28
- except ImportError:
29
- clipboard = False
30
- print("Clipboard simulation is not available.")
31
- return
32
-
33
- fulltext = ""
34
-
35
- greetings = { k: v for k, v in language_config["_meta"].get(transcriber.language, {}).items()
36
- if v is not None and k.startswith(("start", "stop"))
37
- }
38
-
39
- for result in transcriber.start_recording(micro, **greetings):
40
-
41
- if result.get('text'):
42
- clear_line()
43
- print(result.get('text'))
44
- if keyboard:
45
- type_text(result['text'] + " ", interval=latency) # Simulate typing
46
-
47
- if clipboard:
48
- fulltext += result['text'] + " "
49
- pyperclip.copy(fulltext)
50
-
51
- else:
52
- print_partial(result.get('partial', ''))
53
-
54
-
55
14
  def get_default_backend():
56
15
  try:
57
16
  import vosk
@@ -153,6 +112,7 @@ def get_transcriber(o, prompt=True):
153
112
  language=o.language,
154
113
  samplerate=o.samplerate,
155
114
  timeout=None, # vosk keeps going (no timeout)
115
+ silence_duration=None, # vosk handles silences internally
156
116
  model_kwargs={"data_folder": o.data_folder})
157
117
  except Exception as error:
158
118
  print(error)
@@ -160,7 +120,7 @@ def get_transcriber(o, prompt=True):
160
120
  exit(1)
161
121
 
162
122
  elif backend == "whisper":
163
- transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate, timeout=o.duration)
123
+ transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate, timeout=o.duration, silence_duration=o.silence)
164
124
 
165
125
  else:
166
126
  raise ValueError(f"Unknown backend: {backend}")
@@ -184,16 +144,71 @@ def get_parser():
184
144
  parser.add_argument("--no-prompt", action="store_false", dest="prompt", help="Disable prompts for backend and model selection and jump to recording")
185
145
 
186
146
  parser.add_argument("--samplerate", default=16000, type=int, help=argparse.SUPPRESS)
187
- parser.add_argument("--duration", default=60, type=int, help="duration in seconds before whisper models start transcribing (default %(default)ss)")
188
147
  parser.add_argument("--keyboard", action="store_true")
189
148
  parser.add_argument("--no-clipboard", dest="clipboard", action="store_false")
190
149
  parser.add_argument("--latency", default=0, type=float, help="keyboard latency")
191
150
 
151
+ group = parser.add_argument_group("whisper options")
152
+ group.add_argument("--duration", default=120, type=int, help="Max duration of the whisper recording (default %(default)ss)")
153
+ group.add_argument("--silence", default=2, type=float, help="silence duration that prompt transcription (whisper) (default %(default)ss)")
154
+ group.add_argument("--restart-after-silence", action="store_true", help="Restart the recording after a transcription triggered by a silence")
155
+
192
156
  parser.add_argument("--data-folder", help="Folder to store Vosk models.")
193
157
 
194
158
  return parser
195
159
 
196
160
 
161
+ # Commencer l'enregistrement
162
+ def start_recording(micro, transcriber, clipboard=True, keyboard=False, latency=0):
163
+
164
+ if keyboard:
165
+ try:
166
+ from scribe.keyboard import type_text
167
+ except ImportError:
168
+ keyboard = False
169
+ print("Keyboard simulation is not available.")
170
+ return
171
+
172
+ print("\nChange focus to target app during transcription.")
173
+
174
+
175
+ if clipboard:
176
+ try:
177
+ import pyperclip
178
+ except ImportError:
179
+ clipboard = False
180
+ print("Clipboard simulation is not available.")
181
+ return
182
+
183
+ print("\nThe full transcription will be copied to clipboard as it becomes available.")
184
+
185
+
186
+ fulltext = ""
187
+
188
+ greetings = { k: v for k, v in language_config["_meta"].get(transcriber.language, {}).items()
189
+ if v is not None and k.startswith(("start", "stop"))
190
+ }
191
+
192
+ for result in transcriber.start_recording(micro, **greetings):
193
+
194
+ if result.get('text'):
195
+ clear_line()
196
+ print(result.get('text'))
197
+ if keyboard:
198
+ type_text(result['text'] + " ", interval=latency) # Simulate typing
199
+
200
+ if clipboard:
201
+ fulltext += result['text'] + " "
202
+ pyperclip.copy(fulltext)
203
+
204
+ else:
205
+ print_partial(result.get('partial', ''))
206
+
207
+ if clipboard:
208
+ print("Copied to clipboard.")
209
+
210
+
211
+
197
212
  def main(args=None):
198
213
 
199
214
  parser = get_parser()
@@ -205,18 +220,22 @@ def main(args=None):
205
220
 
206
221
  transcriber = None
207
222
 
223
+ toggle = {True: "On", False: "Off"}
224
+
208
225
  while True:
209
226
  if transcriber is None:
210
227
  transcriber = get_transcriber(o, prompt=o.prompt)
211
- print(f"[ Model {transcriber.model_name} from {transcriber.backend} selected. Keyboard [{'on' if o.keyboard else 'off'}]. Clipboard [{'on' if o.clipboard else 'off'}]]")
228
+ print(f">>> Model {transcriber.model_name} from {transcriber.backend} selected. Keyboard [{'on' if o.keyboard else 'off'}]. Clipboard [{'on' if o.clipboard else 'off'}] <<<")
212
229
  if o.prompt:
213
230
  print(f"Choose any of the following actions:")
214
231
  print(f"[q] quit")
215
232
  print(f"[e] change model")
216
- print(f"[k] toggle keyboard [{'off' if o.keyboard else 'on'}]")
217
- print(f"[c] toggle clipboard [{'off' if o.clipboard else 'on'}]")
233
+ print(f"[k] toggle keyboard [{toggle[o.keyboard]}] -> [{toggle[not o.keyboard]}]")
234
+ print(f"[c] toggle clipboard [{toggle[o.clipboard]}] -> [{toggle[not o.clipboard]}]")
218
235
  if transcriber.backend == "whisper":
219
236
  print(f"[t] change duration (currently {transcriber.timeout}s)")
237
+ print(f"[b] change silence duration (currently {transcriber.silence_duration}s)")
238
+ print(f"[a] toggle auto-restart after silence [{toggle[o.restart_after_silence]}] -> [{toggle[not o.restart_after_silence]}]")
220
239
  print(colored(f"Press [Enter] or any other key to start recording.", "BOLD"))
221
240
 
222
241
  key = input()
@@ -232,9 +251,16 @@ def main(args=None):
232
251
  o.clipboard = not o.clipboard
233
252
  continue
234
253
  if key == "t":
235
- duration = input(f"Enter new duration in seconds (current: {transcriber.timeout}): ")
254
+ ans = input(f"Enter new duration in seconds (current: {transcriber.timeout}): ")
255
+ try:
256
+ o.duration = transcriber.timeout = int(ans)
257
+ except:
258
+ print("Invalid duration. Must be an integer.")
259
+ continue
260
+ if key == "b":
261
+ ans = input(f"Enter new silence break duration in seconds (current: {transcriber.silence_duration}): ")
236
262
  try:
237
- o.duration = transcriber.timeout = int(duration)
263
+ o.silence = transcriber.silence_duration = int(ans)
238
264
  except:
239
265
  print("Invalid duration. Must be an integer.")
240
266
  continue
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: scribe-cli
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
5
5
  Author-email: Mahé Perrette <mahe.perrette@gmail.com>
6
6
  License: MIT License
@@ -33,7 +33,7 @@ License: MIT License
33
33
  licenses of all dependencies before using or distributing this software to
34
34
  ensure compliance with their respective terms.
35
35
  Project-URL: Homepage, https://github.com/perrette/scribe
36
- Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
36
+ Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai,keyboard,clipboard
37
37
  Classifier: Programming Language :: Python :: 3
38
38
  Classifier: Operating System :: OS Independent
39
39
  Requires-Python: >=3.9
File without changes
File without changes
File without changes
File without changes
File without changes