PyPI - scribe-cli - Version diff - 0.5.0.tar.gz → 0.6.0.tar.gz - Mend

scribe-cli 0.5.0.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

{scribe_cli-0.5.0/scribe_cli.egg-info → scribe_cli-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scribe-cli
-Version: 0.5.0
+Version: 0.6.0
 Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
 Author-email: Mahé Perrette <mahe.perrette@gmail.com>
 License: MIT License
@@ -33,7 +33,7 @@ License: MIT License
         licenses of all dependencies before using or distributing this software to
         ensure compliance with their respective terms.
 Project-URL: Homepage, https://github.com/perrette/scribe
-Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
+Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai,keyboard,clipboard
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9

{scribe_cli-0.5.0 → scribe_cli-0.6.0}/pyproject.toml RENAMED Viewed

@@ -34,6 +34,8 @@ keywords = [
     "vosk",
     "whisper",
     "openai",
+    "keyboard",
+    "clipboard",
 ]
 [tool.setuptools]

{scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/_version.py RENAMED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.5.0'
-__version_tuple__ = version_tuple = (0, 5, 0)
+__version__ = version = '0.6.0'
+__version_tuple__ = version_tuple = (0, 6, 0)

{scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/audio.py RENAMED Viewed

@@ -1,5 +1,6 @@
-import sounddevice as sd
 import queue
+import numpy as np
+import sounddevice as sd
 def get_duration(audio_length_bytes, # bytes
@@ -48,4 +49,28 @@ class Microphone:
         return sd.query_devices(self.device, 'input')
     def get_duraction(self, audio_length_bytes):
-        return get_duration(audio_length_bytes, self.samplerate, self.channels, {'int16':2}[self.dtype])
+        return get_duration(audio_length_bytes, self.samplerate, self.channels, {'int16':2}[self.dtype])
+def calculate_decibels(data_bytes):
+    """
+    Calculate the decibel level of integer-valued audio data.
+    :param data_bytes: Audio data as a bytes object.
+    :return: Decibel level of the audio data.
+    """
+    # Normalize the integer samples to the range [-1.0, 1.0]
+    data = np.frombuffer(data_bytes, dtype=np.int16)
+    normalized_data = data / 32768.0
+    # Calculate the RMS value
+    rms = np.sqrt(np.mean(np.square(normalized_data)))
+    if rms == 0:
+        return -np.inf
+    # Convert RMS to decibels
+    db = 20 * np.log10(rms)
+    return db

{scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/models.py RENAMED Viewed

@@ -1,7 +1,16 @@
 import os
 import json
+import time
+from collections import deque
 import numpy as np
 from scribe.util import download_model
+from scribe.audio import calculate_decibels
+def is_silent(data, silence_thresh=-40):
+    """
+    Détermine si un segment audio est un silence en fonction du niveau de volume.
+    """
+    return calculate_decibels(data) < silence_thresh
 VOSK_MODELS_FOLDER = os.path.join(os.environ.get("HOME"),
                                       ".local/share/vosk/language-models")
@@ -9,21 +18,24 @@ VOSK_MODELS_FOLDER = os.path.join(os.environ.get("HOME"),
 class AbstractTranscriber:
     backend = None
-    def __init__(self, model, model_name=None, language=None, samplerate=16000, max_duration=None, model_kwargs={}):
+    def __init__(self, model, model_name=None, language=None, samplerate=16000, timeout=None, model_kwargs={},
+                 silence_thresh=-40, silence_duration=2, restart_after_silence=False):
         self.model_name = model_name
         self.language = language
         self.model = model
         self.model_kwargs = model_kwargs
         self.samplerate = samplerate
-        self.max_duration = max_duration
-        self.one_second_bytes = self.samplerate * 2 # 16-bit audio, 1 channel  ~ 32000 bytes
-        self.audio_buffer = b''
+        self.timeout = timeout
+        self.silence_thresh = silence_thresh
+        self.silence_duration = silence_duration
+        self.restart_after_silence = restart_after_silence
+        self.reset()
-    def get_elapsed(self, size=None):
-        return len(size or self.audio_buffer) / self.one_second_bytes
+    def get_elapsed(self):
+        return time.time() - self.start_time
-    def is_overtime(self, elapsed=None, size=None):
-        return self.max_duration and (elapsed or self.get_elapsed(size)) > self.max_duration
+    def is_overtime(self):
+        return self.timeout is not None and time.time() - self.start_time > self.timeout
     def transcribe_realtime_audio(self, audio_bytes=b""):
         self.audio_buffer += audio_bytes
@@ -34,6 +46,8 @@ class AbstractTranscriber:
     def reset(self):
         self.audio_buffer = b''
+        self.start_time = time.time()
+        self.last_sound_time = time.time()
     def start_recording(self, microphone,
                         start_message="Recording... Press Ctrl+C to stop.",
@@ -48,6 +62,23 @@ class AbstractTranscriber:
                 while True:
                     while not microphone.q.empty():
                         data = microphone.q.get()
+                        # Vérifier si le segment est un silence
+                        if is_silent(data, self.silence_thresh):
+                            silence_duration = time.time() - self.last_sound_time
+                            if self.silence_duration is not None and silence_duration >= self.silence_duration and len(self.audio_buffer) > 0:
+                                if self.restart_after_silence:
+                                    result = self.finalize()
+                                    microphone.q.queue.clear()
+                                    self.reset()
+                                    yield result
+                                else:
+                                    raise KeyboardInterrupt("Silence detected: {:.2f} seconds".format(silence_duration))
+                        else:
+                            self.last_sound_time = time.time()
                         yield self.transcribe_realtime_audio(data)
                         if self.is_overtime():
@@ -135,7 +166,6 @@ class WhisperTranscriber(AbstractTranscriber):
         super().__init__(model, model_name, language, model_kwargs=model_kwargs, **kwargs)
     def transcribe_audio(self, audio_bytes):
-        print("\nIf --keyboard is set, change focus to target app NOW !")
         print("Transcribing...")
         audio_array = np.frombuffer(audio_bytes, dtype=np.int16).flatten().astype(np.float32) / 32768.0
         return self.model.transcribe(audio_array, fp16=False, language=self.language)

{scribe_cli-0.5.0 → scribe_cli-0.6.0}/scribe/streamer.py RENAMED Viewed

@@ -11,47 +11,6 @@ with open(Path(__file__).parent / "models.toml", "rb") as f:
 language_config = language_config_default.copy()
-# Commencer l'enregistrement
-def start_recording(micro, transcriber, clipboard=True, keyboard=False, latency=0):
-    if keyboard:
-        try:
-            from scribe.keyboard import type_text
-        except ImportError:
-            keyboard = False
-            print("Keyboard simulation is not available.")
-            return
-    if clipboard:
-        try:
-            import pyperclip
-        except ImportError:
-            clipboard = False
-            print("Clipboard simulation is not available.")
-            return
-    fulltext = ""
-    greetings = { k: v for k, v in language_config["_meta"].get(transcriber.language, {}).items()
-                if v is not None and k.startswith(("start", "stop"))
-    }
-    for result in transcriber.start_recording(micro, **greetings):
-        if result.get('text'):
-            clear_line()
-            print(result.get('text'))
-            if keyboard:
-                type_text(result['text'] + " ", interval=latency) # Simulate typing
-            if clipboard:
-                fulltext += result['text'] + " "
-                pyperclip.copy(fulltext)
-        else:
-            print_partial(result.get('partial', ''))
 def get_default_backend():
     try:
         import vosk
@@ -152,7 +111,8 @@ def get_transcriber(o, prompt=True):
             transcriber = VoskTranscriber(model_name=model,
                                         language=o.language,
                                         samplerate=o.samplerate,
-                                        max_duration=None, # vosk keeps going (no timeout)
+                                        timeout=None, # vosk keeps going (no timeout)
+                                        silence_duration=None, # vosk handles silences internally
                                         model_kwargs={"data_folder": o.data_folder})
         except Exception as error:
             print(error)
@@ -160,7 +120,7 @@ def get_transcriber(o, prompt=True):
             exit(1)
     elif backend == "whisper":
-        transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate, max_duration=o.duration)
+        transcriber = WhisperTranscriber(model_name=model, language=o.language, samplerate=o.samplerate, timeout=o.duration, silence_duration=o.silence)
     else:
         raise ValueError(f"Unknown backend: {backend}")
@@ -184,16 +144,71 @@ def get_parser():
     parser.add_argument("--no-prompt", action="store_false", dest="prompt", help="Disable prompts for backend and model selection and jump to recording")
     parser.add_argument("--samplerate", default=16000, type=int, help=argparse.SUPPRESS)
-    parser.add_argument("--duration", default=60, type=int, help="duration in seconds before whisper models start transcribing (default %(default)ss)")
     parser.add_argument("--keyboard", action="store_true")
     parser.add_argument("--no-clipboard", dest="clipboard", action="store_false")
     parser.add_argument("--latency", default=0, type=float, help="keyboard latency")
+    group = parser.add_argument_group("whisper options")
+    group.add_argument("--duration", default=120, type=int, help="Max duration of the whisper recording (default %(default)ss)")
+    group.add_argument("--silence", default=2, type=float, help="silence duration that prompt transcription (whisper) (default %(default)ss)")
+    group.add_argument("--restart-after-silence", action="store_true", help="Restart the recording after a transcription triggered by a silence")
     parser.add_argument("--data-folder", help="Folder to store Vosk models.")
     return parser
+# Commencer l'enregistrement
+def start_recording(micro, transcriber, clipboard=True, keyboard=False, latency=0):
+    if keyboard:
+        try:
+            from scribe.keyboard import type_text
+        except ImportError:
+            keyboard = False
+            print("Keyboard simulation is not available.")
+            return
+        print("\nChange focus to target app during transcription.")
+    if clipboard:
+        try:
+            import pyperclip
+        except ImportError:
+            clipboard = False
+            print("Clipboard simulation is not available.")
+            return
+        print("\nThe full transcription will be copied to clipboard as it becomes available.")
+    fulltext = ""
+    greetings = { k: v for k, v in language_config["_meta"].get(transcriber.language, {}).items()
+                if v is not None and k.startswith(("start", "stop"))
+    }
+    for result in transcriber.start_recording(micro, **greetings):
+        if result.get('text'):
+            clear_line()
+            print(result.get('text'))
+            if keyboard:
+                type_text(result['text'] + " ", interval=latency) # Simulate typing
+            if clipboard:
+                fulltext += result['text'] + " "
+                pyperclip.copy(fulltext)
+        else:
+            print_partial(result.get('partial', ''))
+    if clipboard:
+        print("Copied to clipboard.")
 def main(args=None):
     parser = get_parser()
@@ -205,18 +220,22 @@ def main(args=None):
     transcriber = None
+    toggle = {True: "On", False: "Off"}
     while True:
         if transcriber is None:
             transcriber = get_transcriber(o, prompt=o.prompt)
-        print(f"[ Model {transcriber.model_name} from {transcriber.backend} selected. Keyboard [{'on' if o.keyboard else 'off'}]. Clipboard [{'on' if o.clipboard else 'off'}]]")
+        print(f">>> Model {transcriber.model_name} from {transcriber.backend} selected. Keyboard [{'on' if o.keyboard else 'off'}]. Clipboard [{'on' if o.clipboard else 'off'}] <<<")
         if o.prompt:
             print(f"Choose any of the following actions:")
             print(f"[q] quit")
             print(f"[e] change model")
-            print(f"[k] toggle keyboard [{'off' if o.keyboard else 'on'}]")
-            print(f"[c] toggle clipboard [{'off' if o.clipboard else 'on'}]")
+            print(f"[k] toggle keyboard [{toggle[o.keyboard]}] -> [{toggle[not o.keyboard]}]")
+            print(f"[c] toggle clipboard [{toggle[o.clipboard]}] -> [{toggle[not o.clipboard]}]")
             if transcriber.backend == "whisper":
-                print(f"[t] change duration (currently {transcriber.max_duration}s)")
+                print(f"[t] change duration (currently {transcriber.timeout}s)")
+                print(f"[b] change silence duration (currently {transcriber.silence_duration}s)")
+                print(f"[a] toggle auto-restart after silence [{toggle[o.restart_after_silence]}] -> [{toggle[not o.restart_after_silence]}]")
             print(colored(f"Press [Enter] or any other key to start recording.", "BOLD"))
             key = input()
@@ -232,9 +251,16 @@ def main(args=None):
                 o.clipboard = not o.clipboard
                 continue
             if key == "t":
-                duration = input(f"Enter new duration in seconds (current: {transcriber.max_duration}): ")
+                ans = input(f"Enter new duration in seconds (current: {transcriber.timeout}): ")
+                try:
+                    o.duration = transcriber.timeout = int(ans)
+                except:
+                    print("Invalid duration. Must be an integer.")
+                continue
+            if key == "b":
+                ans = input(f"Enter new silence break duration in seconds (current: {transcriber.silence_duration}): ")
                 try:
-                    o.duration = transcriber.max_duration = int(duration)
+                    o.silence = transcriber.silence_duration = int(ans)
                 except:
                     print("Invalid duration. Must be an integer.")
                 continue

{scribe_cli-0.5.0 → scribe_cli-0.6.0/scribe_cli.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scribe-cli
-Version: 0.5.0
+Version: 0.6.0
 Summary: scribe is a local speech recognition tool that provides real-time transcription using vosk and whisper AI.
 Author-email: Mahé Perrette <mahe.perrette@gmail.com>
 License: MIT License
@@ -33,7 +33,7 @@ License: MIT License
         licenses of all dependencies before using or distributing this software to
         ensure compliance with their respective terms.
 Project-URL: Homepage, https://github.com/perrette/scribe
-Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai
+Keywords: speech recognition,transcription,AI,language,vosk,whisper,openai,keyboard,clipboard
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9