s2t 0.1.9__tar.gz → 0.1.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {s2t-0.1.9/src/s2t.egg-info → s2t-0.1.11}/PKG-INFO +1 -1
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/cli.py +89 -21
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/recorder.py +91 -48
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/translator/argos_backend.py +2 -2
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/utils.py +20 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/whisper_engine.py +92 -11
- {s2t-0.1.9 → s2t-0.1.11/src/s2t.egg-info}/PKG-INFO +1 -1
- {s2t-0.1.9 → s2t-0.1.11}/.gitignore +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/.pre-commit-config.yaml +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/AGENTS.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/CONTRIBUTING.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/MANIFEST.in +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/Makefile +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/README.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/docs/RELEASING.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/docs/SESSION_STATE.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/pyproject.toml +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/scripts/bench_transcribe.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/scripts/tag.sh +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/setup.cfg +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/__init__.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/config.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/outputs.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/py.typed +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/translator/__init__.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/types.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/SOURCES.txt +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/dependency_links.txt +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/entry_points.txt +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/requires.txt +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/top_level.txt +0 -0
Hunks below are shown in unified form; removed lines whose content the registry viewer truncated appear as bare `-` markers.

src/s2t/cli.py:

```diff
@@ -49,6 +49,7 @@ from .types import TranscriptionResult
 from .utils import (
     convert_wav_to_mp3,
     copy_to_clipboard,
+    debug_log,
     make_session_dir,
     open_in_shell_editor,
 )
@@ -57,6 +58,7 @@ from .whisper_engine import WhisperEngine
 
 def run_session(opts: SessionOptions) -> int:
     session_dir = make_session_dir(opts.outdir)
+    debug_log(opts.verbose, "cli", f"Session started; directory: {session_dir}")
     profile_data: dict = {}
     requested = opts.recording_format.lower()
     effective = requested
@@ -64,6 +66,12 @@ def run_session(opts: SessionOptions) -> int:
         logging.warning("ffmpeg not found; falling back to FLAC recording instead of MP3.")
         effective = "flac"
     ext = ".flac" if effective == "flac" else ".wav"
+    if requested != effective:
+        debug_log(
+            opts.verbose,
+            "cli",
+            f"Recording format adjusted: requested={requested}, effective={effective}",
+        )
 
     engine = WhisperEngine(
         model_name=opts.model,
@@ -77,6 +85,8 @@ def run_session(opts: SessionOptions) -> int:
         profile=profile_data if opts.profile else {},
     )
     ex, fut = engine.preload()
+    if ex is not None:
+        debug_log(opts.verbose, "cli", f"Model preload submitted for '{opts.model}'")
 
     # Determine translation target languages from options
     target_langs: list[str] = []
@@ -98,8 +108,14 @@ def run_session(opts: SessionOptions) -> int:
         detected_lang_event=detected_lang_event,
         detected_lang_holder=detected_lang,
     )
+    debug_log(
+        opts.verbose,
+        "cli",
+        f"Translation targets requested: {', '.join(target_langs)}",
+    )
 
-
+    # Include split cause per chunk: "space" (manual), "pause" (auto), "finish" (final)
+    tx_q: queue.Queue[tuple[int, Path, int, float, str]] = queue.Queue()
     cumulative_text = ""
     next_to_emit = 1
     pending: dict[int, str] = {}
```
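The widened `tx_q` type above is the backbone of this release: every queued chunk now carries the cause of its split. A minimal sketch of the new contract, assuming only the tuple shape and sentinel shown in the diff (the chunk path and frame count below are hypothetical):

```python
import queue
from pathlib import Path

# Payload per the diff: (index, chunk_path, frames, offset_seconds, cause).
# cause is "space" (manual split), "pause" (auto split) or "finish" (final chunk);
# the shutdown sentinel uses index -1 and an empty cause.
tx_q: queue.Queue[tuple[int, Path, int, float, str]] = queue.Queue()

tx_q.put((1, Path("chunk_0001.wav"), 48000, 0.0, "space"))  # hypothetical chunk
tx_q.put((-1, Path(), 0, 0.0, ""))                          # sentinel, as in the diff

while True:
    idx, path, frames, offset, cause = tx_q.get()
    if idx == -1:
        break  # worker shuts down on the sentinel
    print(f"chunk {idx}: {path.name} (offset={offset:.3f}s, cause={cause})")
```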
src/s2t/cli.py (continued):

```diff
@@ -148,17 +164,29 @@ def run_session(opts: SessionOptions) -> int:
 
     def tx_worker():
         model = engine.resolve_model(fut)
+        debug_log(opts.verbose, "cli", "Transcription worker started")
         nonlocal cumulative_text, next_to_emit
         finished_texts: dict[int, str] = {}
+        causes: dict[int, str] = {}
         while True:
-            idx, path, frames, offset = tx_q.get()
+            idx, path, frames, offset, cause = tx_q.get()
             if idx == -1:
                 break
+            debug_log(
+                opts.verbose,
+                "cli",
+                f"Dequeued chunk {idx}: {path.name if path else '(final)'} (frames={frames}, offset={offset:.3f}, cause={cause or '-'})",
+            )
             # If in spoken-prompt mode, ensure we don't process payload chunks before prompt is done
             if opts.prompt and idx > 1 and not prompt_done.is_set():
+                debug_log(opts.verbose, "cli", f"Waiting for prompt before processing chunk {idx}")
                 prompt_done.wait()
             # Build latest-ready prompt based on already finished chunks
             prompt = _build_latest_ready_prompt(idx, finished_texts)
+            if prompt:
+                debug_log(
+                    opts.verbose, "cli", f"Built initial prompt for chunk {idx} (len={len(prompt)})"
+                )
             res = engine.transcribe_chunk(model, path, frames, initial_prompt=prompt)
             # Record detected language once (for translator preload if needed)
             if target_langs and detected_lang["code"] is None:
@@ -166,6 +194,7 @@ def run_session(opts: SessionOptions) -> int:
                 if lang_code:
                     detected_lang["code"] = lang_code
                     detected_lang_event.set()
+                    debug_log(opts.verbose, "cli", f"Detected source language: {lang_code}")
             engine.write_chunk_outputs(res, path)
             text_i = (res.get("text", "") or "").strip()
             with agg_lock:
@@ -174,20 +203,58 @@ def run_session(opts: SessionOptions) -> int:
                 results.append(res)
                 offsets.append(offset)
                 pending[idx] = text_i
+                # Track cause for formatting when emitting in-order
+                # cause is one of: "space", "pause", "finish" (or empty for sentinel)
+                # Default to "pause" if unknown to avoid extra blank lines.
+                causes[idx] = cause or "pause"
                 while next_to_emit in pending:
                     out = pending.pop(next_to_emit)
+                    cause_i = causes.get(next_to_emit) or "pause"
                     if out:
+                        # Live stdout behavior
                         print(out)
-
-
-
-
-
-
+                        if cause_i == "space":
+                            print("")  # blank line after SPACE
+                        # Build cumulative text with post-separator semantics
+                        if not cumulative_text:
+                            cumulative_text = out
+                        else:
+                            cumulative_text += out
+                        # Append separator AFTER the chunk, matching stdout
+                        if cause_i == "space":
+                            if not cumulative_text.endswith("\n\n"):
+                                # ensure exactly one paragraph break
+                                if cumulative_text.endswith("\n"):
+                                    cumulative_text += "\n"
+                                else:
+                                    cumulative_text += "\n\n"
+                        else:
+                            # single line break after non-space chunks
+                            if not (
+                                cumulative_text.endswith("\n") or cumulative_text.endswith("\n\n")
+                            ):
+                                cumulative_text += "\n"
+                    else:
+                        # Even if chunk text is empty, respect SPACE as a paragraph break
+                        if cause_i == "space":
+                            print("")  # blank line on stdout
+                            if cumulative_text:
+                                if cumulative_text.endswith("\n\n"):
+                                    pass
+                                elif cumulative_text.endswith("\n"):
+                                    cumulative_text += "\n"
+                                else:
+                                    cumulative_text += "\n\n"
+                        # For empty non-space chunks, do not alter cumulative_text
+                    try:
+                        copy_to_clipboard(cumulative_text)
+                    except Exception:
+                        pass
                     next_to_emit += 1
                 # If this was the prompt chunk, signal readiness and instruct user
                 if opts.prompt and idx == 1 and not prompt_done.is_set():
                     prompt_done.set()
+                    debug_log(opts.verbose, "cli", "Prompt transcribed; resuming payload")
                     print("=" * 60)
                     print("Prompt transcribed. Start speaking your main content now.")
                     print("=" * 60)
```
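Note the separator semantics in the emission loop above: each chunk's text is appended first, and the separator implied by its split cause follows it, so SPACE yields a paragraph break while pause/finish yield a single newline. A condensed sketch of that rule for non-empty chunks (the empty-chunk and clipboard branches are omitted):

```python
def append_chunk(cumulative: str, text: str, cause: str) -> str:
    """Post-separator rule from the hunk above, for non-empty text only:
    append the chunk, then the separator chosen by its split cause."""
    cumulative += text
    if cause == "space":
        if not cumulative.endswith("\n\n"):
            # exactly one paragraph break after a manual (SPACE) split
            cumulative += "\n" if cumulative.endswith("\n") else "\n\n"
    elif not cumulative.endswith("\n"):
        cumulative += "\n"  # single line break after "pause"/"finish" chunks
    return cumulative

assert append_chunk("", "hello", "pause") == "hello\n"
assert append_chunk("hello\n", "world", "space") == "hello\nworld\n\n"
```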
src/s2t/cli.py (continued):

```diff
@@ -195,6 +262,7 @@ def run_session(opts: SessionOptions) -> int:
         if prompt_resume_event is not None:
             prompt_resume_event.set()
         tx_done.set()
+        debug_log(opts.verbose, "cli", "Transcription worker finished")
 
     tx_t = threading.Thread(target=tx_worker, daemon=True)
     tx_t.start()
@@ -202,6 +270,7 @@ def run_session(opts: SessionOptions) -> int:
     if opts.prompt:
         print("Prompt mode enabled: Speak your prompt first, then press SPACE.")
         print("Recording will wait for the prompt transcription before starting payload.")
+        debug_log(opts.verbose, "cli", "Prompt mode enabled")
     # Prepare resume event to pause recording between prompt and payload
     prompt_resume_event = threading.Event() if opts.prompt else None
     rec = Recorder(
@@ -221,6 +290,9 @@ def run_session(opts: SessionOptions) -> int:
     t1 = time.perf_counter()
     if opts.profile:
         profile_data["recording_sec"] = t1 - t0
+    debug_log(
+        opts.verbose, "cli", f"Recording finished in {(t1 - t0):.3f}s (chunks={len(chunk_paths)})"
+    )
     tx_t.join()
 
     merged: TranscriptionResult = engine.merge_results(results, chunk_offsets, cumulative_text)
@@ -230,8 +302,7 @@ def run_session(opts: SessionOptions) -> int:
     try:
         if chunk_paths:
             concat_audio(chunk_paths, base_audio_path, opts.rate, opts.channels)
-
-            print(f"Merged audio written: {base_audio_path.name}", file=sys.stderr)
+            debug_log(opts.verbose, "cli", f"Merged audio written: {base_audio_path.name}")
         if requested == "mp3" and shutil.which("ffmpeg") is not None:
             mp3_out = session_dir / "recording.mp3"
             convert_wav_to_mp3(
@@ -242,11 +313,9 @@ def run_session(opts: SessionOptions) -> int:
                 ),
                 mp3_out,
             )
-
-            print(f"Converted merged audio to MP3: {mp3_out.name}", file=sys.stderr)
+            debug_log(opts.verbose, "cli", f"Converted merged audio to MP3: {mp3_out.name}")
     except Exception as e:
-
-        print(f"Warning: failed to merge chunk audio: {e}", file=sys.stderr)
+        debug_log(opts.verbose, "cli", f"Warning: failed to merge chunk audio: {e}")
 
     # Optionally delete chunk files (audio + per-chunk outputs)
     if chunk_paths and not opts.keep_chunks:
@@ -299,11 +368,11 @@ def run_session(opts: SessionOptions) -> int:
     # Decide source language: CLI hint takes precedence; else detected; else skip with warning
     src_lang = (opts.lang.lower() if opts.lang else (detected_lang["code"] or "")).strip()
     if not src_lang:
-
-
-
-
-
+        debug_log(
+            opts.verbose,
+            "cli",
+            "Warning: Could not determine source language for translation; skipping post-translation.",
+        )
     else:
         # Skip identical language targets
         effective_targets = [t for t in target_langs if t.lower() != src_lang.lower()]
@@ -335,8 +404,7 @@ def run_session(opts: SessionOptions) -> int:
                         f"{base_audio_path.stem}.{tgt}{base_audio_path.suffix}"
                     )
                     write_final_outputs(translated, session_dir, suffixed)
-
-                    print(f"Created translated outputs for '{tgt}'.", file=sys.stderr)
+                    debug_log(opts.verbose, "cli", f"Created translated outputs for '{tgt}'.")
                 except Exception as e:
                     print(
                         f"Warning: failed to translate to '{tgt}': {e}",
```
src/s2t/recorder.py:

```diff
@@ -11,6 +11,8 @@ from typing import Any, Protocol, cast, runtime_checkable
 
 import numpy as np
 
+from .utils import debug_log
+
 
 class Recorder:
     def __init__(
@@ -41,7 +43,7 @@ class Recorder:
 
     def run(
         self,
-        tx_queue: queue.Queue[tuple[int, Path, int, float]],
+        tx_queue: queue.Queue[tuple[int, Path, int, float, str]],
     ) -> tuple[list[Path], list[int], list[float]]:
         import platform
         import termios
@@ -71,14 +73,12 @@ class Recorder:
             ms = cast(_MSVCRT, msvcrt)
 
             last_space = 0.0
-
-            print("[key] using msvcrt (Windows)", file=sys.stderr)
+            debug_log(self.verbose, "recorder", "Key input: using msvcrt (Windows)")
             while not stop_evt.is_set():
                 if ms.kbhit():
                     ch = ms.getwch()
                     if ch in ("\r", "\n"):
-
-                        print("[key] ENTER", file=sys.stderr)
+                        debug_log(self.verbose, "recorder", "Key input: ENTER")
                         evt_q.put("ENTER")
                         break
                     if ch == " ":
@@ -88,8 +88,7 @@ class Recorder:
                     ):
                         continue
                     last_space = now
-
-                    print("[key] SPACE", file=sys.stderr)
+                    debug_log(self.verbose, "recorder", "Key input: SPACE")
                     evt_q.put("SPACE")
                 time.sleep(0.01)
         else:
@@ -97,8 +96,9 @@ class Recorder:
             try:
                 if sys.stdin.isatty():
                     fd = sys.stdin.fileno()
-
-
+                    debug_log(
+                        self.verbose, "recorder", "Key input: using sys.stdin (TTY fd read)"
+                    )
                     old = termios.tcgetattr(fd)
                     tty.setcbreak(fd)
                     last_space = 0.0
@@ -114,8 +114,7 @@ class Recorder:
                             continue
                         ch = ch_b.decode(errors="ignore")
                         if ch in ("\n", "\r"):
-
-                            print("[key] ENTER", file=sys.stderr)
+                            debug_log(self.verbose, "recorder", "Key input: ENTER")
                             evt_q.put("ENTER")
                             break
                         if ch == " ":
@@ -125,8 +124,7 @@ class Recorder:
                         ):
                             continue
                         last_space = now
-
-                        print("[key] SPACE", file=sys.stderr)
+                        debug_log(self.verbose, "recorder", "Key input: SPACE")
                         evt_q.put("SPACE")
                     finally:
                         termios.tcsetattr(fd, termios.TCSADRAIN, old)
@@ -137,8 +135,11 @@ class Recorder:
             try:
                 fd = os.open("/dev/tty", os.O_RDONLY)
                 using_devtty = True
-
-
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    "Key input: using /dev/tty (stdin not TTY)",
+                )
                 old = termios.tcgetattr(fd)
                 tty.setcbreak(fd)
                 last_space = 0.0
@@ -151,8 +152,9 @@ class Recorder:
                         continue
                    ch = ch_b.decode(errors="ignore")
                    if ch in ("\n", "\r"):
-
-
+                        debug_log(
+                            self.verbose, "recorder", "Key input: ENTER"
+                        )
                        evt_q.put("ENTER")
                        break
                    if ch == " ":
@@ -162,8 +164,9 @@ class Recorder:
                    ):
                        continue
                    last_space = now
-
-
+                    debug_log(
+                        self.verbose, "recorder", "Key input: SPACE"
+                    )
                    evt_q.put("SPACE")
            finally:
                termios.tcsetattr(fd, termios.TCSADRAIN, old)
@@ -185,14 +188,16 @@ class Recorder:
                    continue
                # If user hits Enter on empty line, treat as ENTER
                if line == "\n" or line == "\r\n":
-
-
+                    debug_log(
+                        self.verbose, "recorder", "Key input: ENTER (line mode)"
+                    )
                    evt_q.put("ENTER")
                    break
                # If first non-empty char is space, treat as SPACE
                if line and line[0] == " ":
-
-
+                    debug_log(
+                        self.verbose, "recorder", "Key input: SPACE (line mode)"
+                    )
                    evt_q.put("SPACE")
        except Exception as e:
            print(f"Warning: key reader failed: {e}", file=sys.stderr)
```
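All four key readers (msvcrt, stdin TTY, /dev/tty, line mode) share the same debounce pattern around `last_space`: a SPACE arriving too soon after the previous one is swallowed with `continue`. The threshold itself sits outside the hunks shown here, so the value below is an assumption:

```python
import time

SPACE_DEBOUNCE_SEC = 0.25  # assumed value; the real threshold is not visible in this diff

last_space = 0.0

def accept_space() -> bool:
    """Sketch of the guard the readers apply before enqueueing "SPACE"."""
    global last_space
    now = time.monotonic()
    if (now - last_space) < SPACE_DEBOUNCE_SEC:
        return False  # mirrors the `continue` in the reader loops
    last_space = now
    return True  # caller would then do evt_q.put("SPACE")
```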
src/s2t/recorder.py (continued):

```diff
@@ -224,7 +229,7 @@ class Recorder:
         threshold_rms = 0.015  # conservative RMS threshold for float32 [-1,1]
         split_cooldown_sec = 0.2
 
-        def _do_split() -> None:
+        def _do_split(cause: str) -> None:
             nonlocal fh, frames_written, cur_path, chunk_index, offset_seconds_total
             fh.flush()
             fh.close()
@@ -234,12 +239,19 @@ class Recorder:
                 chunk_frames.append(frames_written)
                 chunk_offsets.append(offset_seconds_total)
                 offset_seconds_total += dur
-
-
-
-
-
-
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
+                )
+                # Include split cause so downstream can format output accordingly
+                # cause: "space" (manual split) or "pause" (auto-split)
+                tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1], cause))
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    f"Enqueued chunk {chunk_index} for transcription (cause={cause})",
+                )
             else:
                 try:
                     cur_path.unlink(missing_ok=True)
@@ -253,8 +265,14 @@ class Recorder:
                 and self.resume_event is not None
             ):
                 self._paused = True
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    "Paused after first chunk; waiting for resume (prompt mode)",
+                )
                 self.resume_event.wait()
                 self._paused = False
+                debug_log(self.verbose, "recorder", "Resumed after prompt")
             cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
             fh = sf.SoundFile(
                 str(cur_path),
@@ -270,8 +288,13 @@ class Recorder:
             try:
                 while True:
                     cmd = ctrl_q.get_nowait()
-                    if cmd == "
-                        _do_split()
+                    if cmd == "split_manual":
+                        _do_split("space")
+                        # Reset silence tracking on manual split
+                        silent_frames_run = 0
+                        seen_non_silent = False
+                    elif cmd == "split_auto":
+                        _do_split("pause")
                         # Reset silence tracking on manual split
                         silent_frames_run = 0
                         seen_non_silent = False
@@ -284,20 +307,33 @@ class Recorder:
                         chunk_frames.append(frames_written)
                         chunk_offsets.append(offset_seconds_total)
                         offset_seconds_total += dur
-
-
-
-
-
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
+                        )
+                        # Final chunk – mark cause as "finish" so downstream can avoid extra blank spacing
                         tx_queue.put(
-                            (
+                            (
+                                chunk_index,
+                                cur_path,
+                                frames_written,
+                                chunk_offsets[-1],
+                                "finish",
+                            )
+                        )
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Enqueued final chunk {chunk_index} for transcription",
                         )
                     else:
                         try:
                             cur_path.unlink(missing_ok=True)
                         except Exception:
                             pass
-                        tx_queue.put((-1, Path(), 0, 0.0))
+                    tx_queue.put((-1, Path(), 0, 0.0, ""))
+                    debug_log(self.verbose, "recorder", "Signaled transcription finish")
                     return
             except queue.Empty:
                 pass
```
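With `_do_split(cause)` in place, the control commands and split causes line up one-to-one, while `finish` takes the dedicated final-chunk path instead of `_do_split`. Summarized as a small illustrative helper (the command and cause strings are the ones used in the diff; the helper itself is not part of the package):

```python
def cause_for_command(cmd: str) -> str | None:
    """Map a control-queue command to the split cause it enqueues."""
    if cmd == "split_manual":  # SPACE key pressed
        return "space"
    if cmd == "split_auto":    # silence detector fired
        return "pause"
    return None  # "finish" is handled by the final-chunk branch

assert cause_for_command("split_manual") == "space"
assert cause_for_command("split_auto") == "pause"
```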
src/s2t/recorder.py (continued):

```diff
@@ -342,18 +378,18 @@ class Recorder:
                 enough_length = frames_written >= int(self.samplerate * self.min_chunk_sec)
                 cooldown_ok = (time.perf_counter() - last_split_time) >= split_cooldown_sec
                 if enough_silence and enough_length and seen_non_silent and cooldown_ok:
-
-
-
-
-
+                    debug_log(
+                        self.verbose,
+                        "recorder",
+                        f"Auto-split (≥{self.silence_sec:.2f}s silence)",
+                    )
                     last_split_time = time.perf_counter()
-                    # Queue
-                    ctrl_q.put("
+                    # Queue an auto split for the next control phase
+                    ctrl_q.put("split_auto")
                     # Reset silence tracking now to avoid cascaded triggers
                     silent_frames_run = 0
                     seen_non_silent = False
-            tx_queue.put((-1, Path(), 0, 0.0))
+            tx_queue.put((-1, Path(), 0, 0.0, ""))
 
         def cb(indata: Any, frames: int, time_info: Any, status: Any) -> None:
             if status:
@@ -375,6 +411,12 @@ class Recorder:
         print("—" * 60)
         print("")
 
+        debug_log(
+            self.verbose,
+            "recorder",
+            f"Recording started (rate={self.samplerate}, channels={self.channels}, ext={self.ext})",
+        )
+
         import sounddevice as sd
 
         with sd.InputStream(samplerate=self.samplerate, channels=self.channels, callback=cb):
@@ -384,9 +426,10 @@ class Recorder:
             except queue.Empty:
                 continue
             if evt == "SPACE":
-                ctrl_q.put("
+                ctrl_q.put("split_manual")
             elif evt == "ENTER":
                 ctrl_q.put("finish")
                 break
         writer_t.join()
+        debug_log(self.verbose, "recorder", "Recording finished")
         return chunk_paths, chunk_frames, chunk_offsets
```
src/s2t/translator/argos_backend.py:

```diff
@@ -8,6 +8,7 @@ from collections.abc import Iterable
 from pathlib import Path
 
 from ..types import SegmentDict, TranscriptionResult
+from ..utils import debug_log
 
 # Global install coordination to avoid duplicate downloads in parallel
 _install_lock = threading.Lock()
@@ -25,8 +26,7 @@ class ArgosTranslator:
         self.verbose = verbose
 
     def _debug(self, msg: str) -> None:
-
-        print(msg)
+        debug_log(self.verbose, "argos", msg)
 
     @staticmethod
     def _guess_packages_dir() -> str:
```
src/s2t/utils.py:

```diff
@@ -5,6 +5,7 @@ import platform
 import shutil
 import subprocess
 import sys
+import time
 from datetime import datetime
 from pathlib import Path
 
@@ -36,6 +37,25 @@ def convert_wav_to_mp3(wav_path: Path, mp3_path: Path) -> None:
     subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
 
 
+# Baseline at program start for relative timestamps in verbose logs
+_START_TIME = time.perf_counter()
+
+
+def debug_log(verbose: bool, component: str, msg: str) -> None:
+    """Emit a timestamped debug line to stderr if verbose is enabled.
+
+    Args:
+        verbose: Whether verbose mode is active.
+        component: Short component tag (e.g., 'recorder', 'whisper', 'cli', 'argos').
+        msg: Message to print.
+    """
+    if not verbose:
+        return
+    elapsed = time.perf_counter() - _START_TIME
+    # Elapsed with milliseconds precision
+    print(f"[+{elapsed:.3f}s] [{component}] {msg}", file=sys.stderr, flush=True)
+
+
 def copy_to_clipboard(text: str) -> None:
     system = platform.system()
     try:
```
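Assuming the helper above, every component now logs through one code path, and verbose runs produce stderr lines stamped relative to process start. The timestamps in the comment are illustrative:

```python
from s2t.utils import debug_log

# Expected shape of verbose output (values illustrative):
#   [+0.412s] [cli] Session started; directory: ...
#   [+3.087s] [whisper] Transcribed chunk chunk_0001.wav in 2.511s
debug_log(True, "cli", "hello")    # one timestamped line on stderr
debug_log(False, "cli", "hidden")  # no-op when verbose is off
```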
src/s2t/whisper_engine.py:

```diff
@@ -5,7 +5,20 @@ from concurrent.futures import Future, ThreadPoolExecutor
 from pathlib import Path
 from typing import Any
 
+import numpy as np
+
 from .types import SegmentDict, TranscriptionResult
+from .utils import debug_log
+
+# --- Tuning parameters (easy to adjust later) ---
+# Silence trim parameters operate on 16 kHz mono arrays
+TRIM_RMS_THRESHOLD: float = 0.012  # RMS threshold for speech vs. silence
+TRIM_MIN_VOICED_SEC: float = 0.5  # Require at least this much voiced audio to transcribe
+TRIM_PAD_MS: int = 50  # Keep a short pad around detected speech (ms)
+
+# Whisper inference behavior on low/empty audio
+WHISPER_NO_SPEECH_THRESHOLD: float = 0.7
+WHISPER_CONDITION_ON_PREV: bool = False
 
 
 class WhisperEngine:
@@ -48,6 +61,7 @@ class WhisperEngine:
                 return m, (t1 - t0)
 
             fut = self._executor.submit(_load, self.model_name)
+            debug_log(self.verbose, "whisper", f"Submitted model preload: {self.model_name}")
             return self._executor, fut
         except Exception:
             return None, None
@@ -62,6 +76,9 @@ class WhisperEngine:
                 self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + float(
                     load_dur
                 )
+                debug_log(
+                    self.verbose, "whisper", f"Model resolved via preload in {float(load_dur):.3f}s"
+                )
         except Exception:
             model = None
         if model is None:
@@ -69,6 +86,7 @@ class WhisperEngine:
             model = whisper.load_model(self.model_name)
             t1m = time.perf_counter()
             self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + (t1m - t0m)
+            debug_log(self.verbose, "whisper", f"Loaded model synchronously in {(t1m - t0m):.3f}s")
         return model
 
     def transcribe_chunk(
@@ -79,10 +97,8 @@ class WhisperEngine:
         initial_prompt: str | None = None,
     ) -> TranscriptionResult:
         # Load audio without ffmpeg by reading via soundfile and passing a numpy array
-        # to Whisper.
+        # to Whisper. Convert to mono float32 and resample to 16 kHz as expected by Whisper's API.
         task = "translate" if self.translate else "transcribe"
-        import numpy as np
-
         try:
             import soundfile as sf
         except Exception as e:
@@ -100,16 +116,71 @@ class WhisperEngine:
         # Resample to 16k expected by Whisper when passing arrays
         mono_16k: np.ndarray = resample_linear(mono, int(sr), 16000)
 
+        # Trim leading/trailing silence to avoid hallucinations on near-empty chunks
+        def _moving_rms(x: np.ndarray, win_len: int) -> np.ndarray:
+            if x.size == 0:
+                return np.zeros(0, dtype=np.float32)
+            win = np.ones(win_len, dtype=np.float32) / float(win_len)
+            sq = np.square(x.astype(np.float32, copy=False))
+            # same-length RMS via 'same' convolution
+            ma = np.convolve(sq, win, mode="same")
+            return np.sqrt(ma).astype(np.float32, copy=False)
+
+        def _trim_silence(x: np.ndarray, sr16k: int) -> tuple[np.ndarray, float, float]:
+            # Returns (trimmed, leading_sec, trailing_sec)
+            if x.size == 0:
+                return x, 0.0, 0.0
+            win_len = max(1, int(round(sr16k * 0.03)))  # 30 ms window
+            rms = _moving_rms(x, win_len)
+            thr = float(TRIM_RMS_THRESHOLD)
+            voiced = np.where(rms >= thr)[0]
+            if voiced.size == 0:
+                return np.zeros(0, dtype=np.float32), 0.0, float(x.size) / sr16k
+            start_idx = int(voiced[0])
+            end_idx = int(voiced[-1])
+            pad = int(round((TRIM_PAD_MS / 1000.0) * sr16k))
+            a = max(0, start_idx - pad)
+            b = min(x.size, end_idx + pad + 1)
+            lead_sec = float(a) / sr16k
+            trail_sec = float(x.size - b) / sr16k
+            return x[a:b], lead_sec, trail_sec
+
+        pre_sec = float(mono_16k.size) / 16000.0
+        trimmed, lead_sec, trail_sec = _trim_silence(mono_16k, 16000)
+        post_sec = float(trimmed.size) / 16000.0
+        debug_log(
+            self.verbose,
+            "whisper",
+            f"Chunk {audio_path.name}: trim {pre_sec:.2f}s -> {post_sec:.2f}s (lead {lead_sec:.2f}s, tail {trail_sec:.2f}s)",
+        )
+
+        # If too short after trimming, skip transcription
+        if post_sec < float(TRIM_MIN_VOICED_SEC):
+            debug_log(
+                self.verbose,
+                "whisper",
+                f"Chunk {audio_path.name}: too short after trim ({post_sec:.2f}s) – skipping",
+            )
+            return {"text": "", "segments": []}
+
         t0 = time.perf_counter()
+        debug_log(
+            self.verbose, "whisper", f"Transcribing chunk {audio_path.name} (frames={frames})"
+        )
         res: dict[str, Any] = model.transcribe(
-
+            trimmed,
             task=task,
             language=self.language,
             fp16=False,
-            initial_prompt=initial_prompt,
+            initial_prompt=(initial_prompt if post_sec >= float(TRIM_MIN_VOICED_SEC) else None),
+            condition_on_previous_text=bool(WHISPER_CONDITION_ON_PREV),
+            no_speech_threshold=float(WHISPER_NO_SPEECH_THRESHOLD),
         )
         t1 = time.perf_counter()
         self.profile["transcribe_sec"] = self.profile.get("transcribe_sec", 0.0) + (t1 - t0)
+        debug_log(
+            self.verbose, "whisper", f"Transcribed chunk {audio_path.name} in {(t1 - t0):.3f}s"
+        )
         text_c = str(res.get("text", "") or "").strip()
         lang_code = str(res.get("language", "") or "")
         if self.native_segmentation:
```
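For reference, a standalone sketch of the moving-RMS trim introduced above, using the same constants as the diff (30 ms window, RMS threshold 0.012, 50 ms pad); the synthetic signal at the end is only a demonstration:

```python
import numpy as np

def trim_silence_16k(x: np.ndarray, thr: float = 0.012, pad_ms: int = 50) -> np.ndarray:
    """Trim leading/trailing silence from a 16 kHz mono float32 array."""
    sr = 16000
    win_len = max(1, round(sr * 0.03))  # 30 ms smoothing window
    win = np.ones(win_len, dtype=np.float32) / win_len
    rms = np.sqrt(np.convolve(np.square(x.astype(np.float32)), win, mode="same"))
    voiced = np.where(rms >= thr)[0]
    if voiced.size == 0:
        return np.zeros(0, dtype=np.float32)  # all silence: caller skips the chunk
    pad = round(pad_ms / 1000.0 * sr)
    a = max(0, int(voiced[0]) - pad)
    b = min(x.size, int(voiced[-1]) + pad + 1)
    return x[a:b]

# 1 s of 220 Hz tone padded by 1 s of silence on each side trims to roughly 1.1 s
t = np.linspace(0.0, 1.0, 16000, dtype=np.float32)
sig = np.concatenate(
    [np.zeros(16000, np.float32), 0.1 * np.sin(2 * np.pi * 220 * t), np.zeros(16000, np.float32)]
)
print(trim_silence_16k(sig).size / 16000)  # ≈ 1.10 (1 s tone + 2 × 50 ms pad)
```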
src/s2t/whisper_engine.py (continued):

```diff
@@ -117,8 +188,9 @@ class WhisperEngine:
             segs_typed: list[SegmentDict] = []
             for s in segs_raw:
                 try:
-
-
+                    # Adjust for leading trim so times align with original chunk timeline
+                    start = float(s.get("start", 0.0)) + float(lead_sec)
+                    end = float(s.get("end", 0.0)) + float(lead_sec)
                     text = str(s.get("text", "") or "")
                     segs_typed.append({"start": start, "end": end, "text": text})
                 except Exception:
@@ -129,8 +201,12 @@ class WhisperEngine:
             return out
         # Collapsed single segment per chunk
         segs_raw = res.get("segments", []) or []
-        start = float(segs_raw[0].get("start", 0.0)) if segs_raw else 0.0
-        end =
+        start = (float(segs_raw[0].get("start", 0.0)) + float(lead_sec)) if segs_raw else 0.0
+        end = (
+            (float(segs_raw[-1].get("end", 0.0)) + float(lead_sec))
+            if segs_raw
+            else (frames / float(self.samplerate))
+        )
         out2: TranscriptionResult = {
             "text": text_c,
             "segments": ([{"start": start, "end": end, "text": text_c}] if text_c else []),
@@ -143,12 +219,17 @@ class WhisperEngine:
         try:
             from whisper.utils import get_writer
 
+            debug_log(self.verbose, "whisper", f"Writing outputs for {audio_path.name}")
             for fmt in ("txt", "srt", "vtt", "tsv", "json"):
                 writer = get_writer(fmt, str(self.session_dir))
                 writer(result, str(audio_path))
+            debug_log(self.verbose, "whisper", f"Wrote outputs for {audio_path.name}")
         except Exception as e:
-
-
+            debug_log(
+                self.verbose,
+                "whisper",
+                f"Warning: failed to write chunk outputs for {audio_path.name}: {e}",
+            )
 
     def merge_results(
         self, results: list[TranscriptionResult], offsets: list[float], cumulative_text: str
```