s2t-0.1.10-py3-none-any.whl → s2t-0.1.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
s2t/cli.py CHANGED
@@ -49,6 +49,7 @@ from .types import TranscriptionResult
 from .utils import (
     convert_wav_to_mp3,
     copy_to_clipboard,
+    debug_log,
     make_session_dir,
     open_in_shell_editor,
 )
@@ -57,6 +58,7 @@ from .whisper_engine import WhisperEngine
 
 def run_session(opts: SessionOptions) -> int:
     session_dir = make_session_dir(opts.outdir)
+    debug_log(opts.verbose, "cli", f"Session started; directory: {session_dir}")
     profile_data: dict = {}
     requested = opts.recording_format.lower()
     effective = requested
@@ -64,6 +66,12 @@ def run_session(opts: SessionOptions) -> int:
         logging.warning("ffmpeg not found; falling back to FLAC recording instead of MP3.")
         effective = "flac"
     ext = ".flac" if effective == "flac" else ".wav"
+    if requested != effective:
+        debug_log(
+            opts.verbose,
+            "cli",
+            f"Recording format adjusted: requested={requested}, effective={effective}",
+        )
 
     engine = WhisperEngine(
         model_name=opts.model,
@@ -77,6 +85,8 @@ def run_session(opts: SessionOptions) -> int:
         profile=profile_data if opts.profile else {},
     )
     ex, fut = engine.preload()
+    if ex is not None:
+        debug_log(opts.verbose, "cli", f"Model preload submitted for '{opts.model}'")
 
     # Determine translation target languages from options
     target_langs: list[str] = []
@@ -98,6 +108,11 @@ def run_session(opts: SessionOptions) -> int:
             detected_lang_event=detected_lang_event,
             detected_lang_holder=detected_lang,
         )
+        debug_log(
+            opts.verbose,
+            "cli",
+            f"Translation targets requested: {', '.join(target_langs)}",
+        )
 
     # Include split cause per chunk: "space" (manual), "pause" (auto), "finish" (final)
     tx_q: queue.Queue[tuple[int, Path, int, float, str]] = queue.Queue()
@@ -149,6 +164,7 @@ def run_session(opts: SessionOptions) -> int:
 
     def tx_worker():
        model = engine.resolve_model(fut)
+        debug_log(opts.verbose, "cli", "Transcription worker started")
        nonlocal cumulative_text, next_to_emit
        finished_texts: dict[int, str] = {}
        causes: dict[int, str] = {}
@@ -156,11 +172,21 @@ def run_session(opts: SessionOptions) -> int:
            idx, path, frames, offset, cause = tx_q.get()
            if idx == -1:
                break
+            debug_log(
+                opts.verbose,
+                "cli",
+                f"Dequeued chunk {idx}: {path.name if path else '(final)'} (frames={frames}, offset={offset:.3f}, cause={cause or '-'})",
+            )
            # If in spoken-prompt mode, ensure we don't process payload chunks before prompt is done
            if opts.prompt and idx > 1 and not prompt_done.is_set():
+                debug_log(opts.verbose, "cli", f"Waiting for prompt before processing chunk {idx}")
                prompt_done.wait()
            # Build latest-ready prompt based on already finished chunks
            prompt = _build_latest_ready_prompt(idx, finished_texts)
+            if prompt:
+                debug_log(
+                    opts.verbose, "cli", f"Built initial prompt for chunk {idx} (len={len(prompt)})"
+                )
            res = engine.transcribe_chunk(model, path, frames, initial_prompt=prompt)
            # Record detected language once (for translator preload if needed)
            if target_langs and detected_lang["code"] is None:
@@ -168,6 +194,7 @@ def run_session(opts: SessionOptions) -> int:
                if lang_code:
                    detected_lang["code"] = lang_code
                    detected_lang_event.set()
+                    debug_log(opts.verbose, "cli", f"Detected source language: {lang_code}")
            engine.write_chunk_outputs(res, path)
            text_i = (res.get("text", "") or "").strip()
            with agg_lock:
@@ -182,23 +209,52 @@ def run_session(opts: SessionOptions) -> int:
                causes[idx] = cause or "pause"
                while next_to_emit in pending:
                    out = pending.pop(next_to_emit)
+                    cause_i = causes.get(next_to_emit) or "pause"
                    if out:
-                        # Determine separator based on split cause
-                        sep = "\n\n" if causes.get(next_to_emit) == "space" else "\n"
+                        # Live stdout behavior
                        print(out)
-                        if causes.get(next_to_emit) == "space":
-                            # Space -> insert a blank line after the chunk
-                            print("")
-                        # Build cumulative text with matching separators
-                        cumulative_text += out if not cumulative_text else (sep + out)
-                        try:
-                            copy_to_clipboard(cumulative_text)
-                        except Exception:
-                            pass
+                        if cause_i == "space":
+                            print("")  # blank line after SPACE
+                        # Build cumulative text with post-separator semantics
+                        if not cumulative_text:
+                            cumulative_text = out
+                        else:
+                            cumulative_text += out
+                        # Append separator AFTER the chunk, matching stdout
+                        if cause_i == "space":
+                            if not cumulative_text.endswith("\n\n"):
+                                # ensure exactly one paragraph break
+                                if cumulative_text.endswith("\n"):
+                                    cumulative_text += "\n"
+                                else:
+                                    cumulative_text += "\n\n"
+                        else:
+                            # single line break after non-space chunks
+                            if not (
+                                cumulative_text.endswith("\n") or cumulative_text.endswith("\n\n")
+                            ):
+                                cumulative_text += "\n"
+                    else:
+                        # Even if chunk text is empty, respect SPACE as a paragraph break
+                        if cause_i == "space":
+                            print("")  # blank line on stdout
+                            if cumulative_text:
+                                if cumulative_text.endswith("\n\n"):
+                                    pass
+                                elif cumulative_text.endswith("\n"):
+                                    cumulative_text += "\n"
+                                else:
+                                    cumulative_text += "\n\n"
+                        # For empty non-space chunks, do not alter cumulative_text
+                    try:
+                        copy_to_clipboard(cumulative_text)
+                    except Exception:
+                        pass
                    next_to_emit += 1
            # If this was the prompt chunk, signal readiness and instruct user
            if opts.prompt and idx == 1 and not prompt_done.is_set():
                prompt_done.set()
+                debug_log(opts.verbose, "cli", "Prompt transcribed; resuming payload")
                print("=" * 60)
                print("Prompt transcribed. Start speaking your main content now.")
                print("=" * 60)
@@ -206,6 +262,7 @@ def run_session(opts: SessionOptions) -> int:
        if prompt_resume_event is not None:
            prompt_resume_event.set()
        tx_done.set()
+        debug_log(opts.verbose, "cli", "Transcription worker finished")
 
    tx_t = threading.Thread(target=tx_worker, daemon=True)
    tx_t.start()
@@ -213,6 +270,7 @@ def run_session(opts: SessionOptions) -> int:
    if opts.prompt:
        print("Prompt mode enabled: Speak your prompt first, then press SPACE.")
        print("Recording will wait for the prompt transcription before starting payload.")
+        debug_log(opts.verbose, "cli", "Prompt mode enabled")
    # Prepare resume event to pause recording between prompt and payload
    prompt_resume_event = threading.Event() if opts.prompt else None
    rec = Recorder(
@@ -232,17 +290,24 @@ def run_session(opts: SessionOptions) -> int:
    t1 = time.perf_counter()
    if opts.profile:
        profile_data["recording_sec"] = t1 - t0
+    debug_log(
+        opts.verbose, "cli", f"Recording finished in {(t1 - t0):.3f}s (chunks={len(chunk_paths)})"
+    )
    tx_t.join()
 
    merged: TranscriptionResult = engine.merge_results(results, chunk_offsets, cumulative_text)
    base_audio_path = session_dir / f"recording{ext}"
    txt_path = write_final_outputs(merged, session_dir, base_audio_path)
+    # Ensure Recording.txt exactly mirrors the clipboard text (including blank lines)
+    try:
+        txt_path.write_text(cumulative_text, encoding="utf-8")
+    except Exception:
+        pass
 
    try:
        if chunk_paths:
            concat_audio(chunk_paths, base_audio_path, opts.rate, opts.channels)
-            if opts.verbose:
-                print(f"Merged audio written: {base_audio_path.name}", file=sys.stderr)
+            debug_log(opts.verbose, "cli", f"Merged audio written: {base_audio_path.name}")
        if requested == "mp3" and shutil.which("ffmpeg") is not None:
            mp3_out = session_dir / "recording.mp3"
            convert_wav_to_mp3(
@@ -253,11 +318,9 @@ def run_session(opts: SessionOptions) -> int:
                ),
                mp3_out,
            )
-            if opts.verbose:
-                print(f"Converted merged audio to MP3: {mp3_out.name}", file=sys.stderr)
+            debug_log(opts.verbose, "cli", f"Converted merged audio to MP3: {mp3_out.name}")
    except Exception as e:
-        if opts.verbose:
-            print(f"Warning: failed to merge chunk audio: {e}", file=sys.stderr)
+        debug_log(opts.verbose, "cli", f"Warning: failed to merge chunk audio: {e}")
 
    # Optionally delete chunk files (audio + per-chunk outputs)
    if chunk_paths and not opts.keep_chunks:
@@ -310,11 +373,11 @@ def run_session(opts: SessionOptions) -> int:
    # Decide source language: CLI hint takes precedence; else detected; else skip with warning
    src_lang = (opts.lang.lower() if opts.lang else (detected_lang["code"] or "")).strip()
    if not src_lang:
-        if opts.verbose:
-            print(
-                "Warning: Could not determine source language for translation; skipping post-translation.",
-                file=sys.stderr,
-            )
+        debug_log(
+            opts.verbose,
+            "cli",
+            "Warning: Could not determine source language for translation; skipping post-translation.",
+        )
    else:
        # Skip identical language targets
        effective_targets = [t for t in target_langs if t.lower() != src_lang.lower()]
@@ -346,8 +409,7 @@ def run_session(opts: SessionOptions) -> int:
                    f"{base_audio_path.stem}.{tgt}{base_audio_path.suffix}"
                )
                write_final_outputs(translated, session_dir, suffixed)
-                if opts.verbose:
-                    print(f"Created translated outputs for '{tgt}'.", file=sys.stderr)
+                debug_log(opts.verbose, "cli", f"Created translated outputs for '{tgt}'.")
            except Exception as e:
                print(
                    f"Warning: failed to translate to '{tgt}': {e}",
s2t/recorder.py CHANGED
@@ -11,6 +11,8 @@ from typing import Any, Protocol, cast, runtime_checkable
 
 import numpy as np
 
+from .utils import debug_log
+
 
 class Recorder:
     def __init__(
@@ -71,14 +73,12 @@
            ms = cast(_MSVCRT, msvcrt)
 
            last_space = 0.0
-            if self.verbose:
-                print("[key] using msvcrt (Windows)", file=sys.stderr)
+            debug_log(self.verbose, "recorder", "Key input: using msvcrt (Windows)")
            while not stop_evt.is_set():
                if ms.kbhit():
                    ch = ms.getwch()
                    if ch in ("\r", "\n"):
-                        if self.verbose:
-                            print("[key] ENTER", file=sys.stderr)
+                        debug_log(self.verbose, "recorder", "Key input: ENTER")
                        evt_q.put("ENTER")
                        break
                    if ch == " ":
@@ -88,8 +88,7 @@
                    ):
                        continue
                    last_space = now
-                    if self.verbose:
-                        print("[key] SPACE", file=sys.stderr)
+                    debug_log(self.verbose, "recorder", "Key input: SPACE")
                    evt_q.put("SPACE")
                time.sleep(0.01)
        else:
@@ -97,8 +96,9 @@
            try:
                if sys.stdin.isatty():
                    fd = sys.stdin.fileno()
-                    if self.verbose:
-                        print("[key] using sys.stdin (isatty, fd read)", file=sys.stderr)
+                    debug_log(
+                        self.verbose, "recorder", "Key input: using sys.stdin (TTY fd read)"
+                    )
                    old = termios.tcgetattr(fd)
                    tty.setcbreak(fd)
                    last_space = 0.0
@@ -114,8 +114,7 @@
                        continue
                    ch = ch_b.decode(errors="ignore")
                    if ch in ("\n", "\r"):
-                        if self.verbose:
-                            print("[key] ENTER", file=sys.stderr)
+                        debug_log(self.verbose, "recorder", "Key input: ENTER")
                        evt_q.put("ENTER")
                        break
                    if ch == " ":
@@ -125,8 +124,7 @@
                        ):
                            continue
                        last_space = now
-                        if self.verbose:
-                            print("[key] SPACE", file=sys.stderr)
+                        debug_log(self.verbose, "recorder", "Key input: SPACE")
                        evt_q.put("SPACE")
                finally:
                    termios.tcsetattr(fd, termios.TCSADRAIN, old)
@@ -137,8 +135,11 @@
                try:
                    fd = os.open("/dev/tty", os.O_RDONLY)
                    using_devtty = True
-                    if self.verbose:
-                        print("[key] using /dev/tty (stdin not TTY)", file=sys.stderr)
+                    debug_log(
+                        self.verbose,
+                        "recorder",
+                        "Key input: using /dev/tty (stdin not TTY)",
+                    )
                    old = termios.tcgetattr(fd)
                    tty.setcbreak(fd)
                    last_space = 0.0
@@ -151,8 +152,9 @@
                        continue
                    ch = ch_b.decode(errors="ignore")
                    if ch in ("\n", "\r"):
-                        if self.verbose:
-                            print("[key] ENTER", file=sys.stderr)
+                        debug_log(
+                            self.verbose, "recorder", "Key input: ENTER"
+                        )
                        evt_q.put("ENTER")
                        break
                    if ch == " ":
@@ -162,8 +164,9 @@
                        ):
                            continue
                        last_space = now
-                        if self.verbose:
-                            print("[key] SPACE", file=sys.stderr)
+                        debug_log(
+                            self.verbose, "recorder", "Key input: SPACE"
+                        )
                        evt_q.put("SPACE")
                finally:
                    termios.tcsetattr(fd, termios.TCSADRAIN, old)
@@ -185,14 +188,16 @@
                    continue
                # If user hits Enter on empty line, treat as ENTER
                if line == "\n" or line == "\r\n":
-                    if self.verbose:
-                        print("[key] ENTER (line mode)", file=sys.stderr)
+                    debug_log(
+                        self.verbose, "recorder", "Key input: ENTER (line mode)"
+                    )
                    evt_q.put("ENTER")
                    break
                # If first non-empty char is space, treat as SPACE
                if line and line[0] == " ":
-                    if self.verbose:
-                        print("[key] SPACE (line mode)", file=sys.stderr)
+                    debug_log(
+                        self.verbose, "recorder", "Key input: SPACE (line mode)"
+                    )
                    evt_q.put("SPACE")
        except Exception as e:
            print(f"Warning: key reader failed: {e}", file=sys.stderr)
@@ -234,14 +239,19 @@
                        chunk_frames.append(frames_written)
                        chunk_offsets.append(offset_seconds_total)
                        offset_seconds_total += dur
-                        if self.verbose:
-                            print(
-                                f"Saved chunk: {cur_path.name} ({dur:.2f}s)",
-                                file=sys.stderr,
-                            )
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
+                        )
                        # Include split cause so downstream can format output accordingly
                        # cause: "space" (manual split) or "pause" (auto-split)
                        tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1], cause))
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Enqueued chunk {chunk_index} for transcription (cause={cause})",
+                        )
                    else:
                        try:
                            cur_path.unlink(missing_ok=True)
@@ -255,8 +265,14 @@
                        and self.resume_event is not None
                    ):
                        self._paused = True
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            "Paused after first chunk; waiting for resume (prompt mode)",
+                        )
                        self.resume_event.wait()
                        self._paused = False
+                        debug_log(self.verbose, "recorder", "Resumed after prompt")
                    cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
                    fh = sf.SoundFile(
                        str(cur_path),
@@ -291,11 +307,11 @@
                        chunk_frames.append(frames_written)
                        chunk_offsets.append(offset_seconds_total)
                        offset_seconds_total += dur
-                        if self.verbose:
-                            print(
-                                f"Saved chunk: {cur_path.name} ({dur:.2f}s)",
-                                file=sys.stderr,
-                            )
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
+                        )
                        # Final chunk – mark cause as "finish" so downstream can avoid extra blank spacing
                        tx_queue.put(
                            (
@@ -306,12 +322,18 @@
                                "finish",
                            )
                        )
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Enqueued final chunk {chunk_index} for transcription",
+                        )
                    else:
                        try:
                            cur_path.unlink(missing_ok=True)
                        except Exception:
                            pass
                    tx_queue.put((-1, Path(), 0, 0.0, ""))
+                    debug_log(self.verbose, "recorder", "Signaled transcription finish")
                    return
            except queue.Empty:
                pass
@@ -356,11 +378,11 @@
                enough_length = frames_written >= int(self.samplerate * self.min_chunk_sec)
                cooldown_ok = (time.perf_counter() - last_split_time) >= split_cooldown_sec
                if enough_silence and enough_length and seen_non_silent and cooldown_ok:
-                    if self.verbose:
-                        print(
-                            f"[auto] split (≥{self.silence_sec:.2f}s silence)",
-                            file=sys.stderr,
-                        )
+                    debug_log(
+                        self.verbose,
+                        "recorder",
+                        f"Auto-split (≥{self.silence_sec:.2f}s silence)",
+                    )
                    last_split_time = time.perf_counter()
                    # Queue an auto split for the next control phase
                    ctrl_q.put("split_auto")
@@ -389,6 +411,12 @@
        print("—" * 60)
        print("")
 
+        debug_log(
+            self.verbose,
+            "recorder",
+            f"Recording started (rate={self.samplerate}, channels={self.channels}, ext={self.ext})",
+        )
+
        import sounddevice as sd
 
        with sd.InputStream(samplerate=self.samplerate, channels=self.channels, callback=cb):
@@ -403,4 +431,5 @@
                ctrl_q.put("finish")
                break
        writer_t.join()
+        debug_log(self.verbose, "recorder", "Recording finished")
        return chunk_paths, chunk_frames, chunk_offsets
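The recorder/transcriber handoff above uses a queue of 5-tuples, with (-1, Path(), 0, 0.0, "") enqueued as the end-of-stream sentinel after the final "finish" chunk. A minimal consumer sketch under that assumption (the drain helper is hypothetical, not package code):

    import queue
    from pathlib import Path

    def drain(tx_q: "queue.Queue[tuple[int, Path, int, float, str]]") -> list[int]:
        # Collect chunk indices until the recorder signals completion.
        seen: list[int] = []
        while True:
            idx, path, frames, offset, cause = tx_q.get()
            if idx == -1:  # sentinel enqueued after the final "finish" chunk
                return seen
            seen.append(idx)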
s2t/translator/argos_backend.py CHANGED
@@ -8,6 +8,7 @@ from collections.abc import Iterable
 from pathlib import Path
 
 from ..types import SegmentDict, TranscriptionResult
+from ..utils import debug_log
 
 # Global install coordination to avoid duplicate downloads in parallel
 _install_lock = threading.Lock()
@@ -25,8 +26,7 @@ class ArgosTranslator:
         self.verbose = verbose
 
     def _debug(self, msg: str) -> None:
-        if self.verbose:
-            print(msg)
+        debug_log(self.verbose, "argos", msg)
 
     @staticmethod
     def _guess_packages_dir() -> str:
s2t/utils.py CHANGED
@@ -5,6 +5,7 @@ import platform
 import shutil
 import subprocess
 import sys
+import time
 from datetime import datetime
 from pathlib import Path
 
@@ -36,6 +37,25 @@ def convert_wav_to_mp3(wav_path: Path, mp3_path: Path) -> None:
     subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
 
 
+# Baseline at program start for relative timestamps in verbose logs
+_START_TIME = time.perf_counter()
+
+
+def debug_log(verbose: bool, component: str, msg: str) -> None:
+    """Emit a timestamped debug line to stderr if verbose is enabled.
+
+    Args:
+        verbose: Whether verbose mode is active.
+        component: Short component tag (e.g., 'recorder', 'whisper', 'cli', 'argos').
+        msg: Message to print.
+    """
+    if not verbose:
+        return
+    elapsed = time.perf_counter() - _START_TIME
+    # Elapsed time with millisecond precision
+    print(f"[+{elapsed:.3f}s] [{component}] {msg}", file=sys.stderr, flush=True)
+
+
 def copy_to_clipboard(text: str) -> None:
     system = platform.system()
     try:
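For reference, a minimal usage sketch of the new helper (assuming s2t.utils is importable); each line goes to stderr in the [+<elapsed>s] [<component>] <msg> shape produced by the f-string above:

    from s2t.utils import debug_log

    debug_log(True, "cli", "Session started")  # stderr: e.g. "[+0.001s] [cli] Session started"
    debug_log(False, "cli", "suppressed")      # no output when verbose is False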
s2t/whisper_engine.py CHANGED
@@ -5,7 +5,20 @@ from concurrent.futures import Future, ThreadPoolExecutor
 from pathlib import Path
 from typing import Any
 
+import numpy as np
+
 from .types import SegmentDict, TranscriptionResult
+from .utils import debug_log
+
+# --- Tuning parameters (easy to adjust later) ---
+# Silence trim parameters operate on 16 kHz mono arrays
+TRIM_RMS_THRESHOLD: float = 0.012  # RMS threshold for speech vs. silence
+TRIM_MIN_VOICED_SEC: float = 0.5  # Require at least this much voiced audio to transcribe
+TRIM_PAD_MS: int = 50  # Keep a short pad around detected speech (ms)
+
+# Whisper inference behavior on low/empty audio
+WHISPER_NO_SPEECH_THRESHOLD: float = 0.7
+WHISPER_CONDITION_ON_PREV: bool = False
 
 
 class WhisperEngine:
@@ -48,6 +61,7 @@
                return m, (t1 - t0)
 
            fut = self._executor.submit(_load, self.model_name)
+            debug_log(self.verbose, "whisper", f"Submitted model preload: {self.model_name}")
            return self._executor, fut
        except Exception:
            return None, None
@@ -62,6 +76,9 @@
                self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + float(
                    load_dur
                )
+                debug_log(
+                    self.verbose, "whisper", f"Model resolved via preload in {float(load_dur):.3f}s"
+                )
            except Exception:
                model = None
        if model is None:
@@ -69,6 +86,7 @@
            model = whisper.load_model(self.model_name)
            t1m = time.perf_counter()
            self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + (t1m - t0m)
+            debug_log(self.verbose, "whisper", f"Loaded model synchronously in {(t1m - t0m):.3f}s")
        return model
 
    def transcribe_chunk(
@@ -79,10 +97,8 @@
        initial_prompt: str | None = None,
    ) -> TranscriptionResult:
        # Load audio without ffmpeg by reading via soundfile and passing a numpy array
-        # to Whisper. We ensure mono float32 at 16 kHz as expected by Whisper's API.
+        # to Whisper. Convert to mono float32 and resample to 16 kHz as expected by Whisper's API.
        task = "translate" if self.translate else "transcribe"
-        import numpy as np
-
        try:
            import soundfile as sf
        except Exception as e:
@@ -100,16 +116,71 @@
        # Resample to 16k expected by Whisper when passing arrays
        mono_16k: np.ndarray = resample_linear(mono, int(sr), 16000)
 
+        # Trim leading/trailing silence to avoid hallucinations on near-empty chunks
+        def _moving_rms(x: np.ndarray, win_len: int) -> np.ndarray:
+            if x.size == 0:
+                return np.zeros(0, dtype=np.float32)
+            win = np.ones(win_len, dtype=np.float32) / float(win_len)
+            sq = np.square(x.astype(np.float32, copy=False))
+            # same-length RMS via 'same' convolution
+            ma = np.convolve(sq, win, mode="same")
+            return np.sqrt(ma).astype(np.float32, copy=False)
+
+        def _trim_silence(x: np.ndarray, sr16k: int) -> tuple[np.ndarray, float, float]:
+            # Returns (trimmed, leading_sec, trailing_sec)
+            if x.size == 0:
+                return x, 0.0, 0.0
+            win_len = max(1, int(round(sr16k * 0.03)))  # 30 ms window
+            rms = _moving_rms(x, win_len)
+            thr = float(TRIM_RMS_THRESHOLD)
+            voiced = np.where(rms >= thr)[0]
+            if voiced.size == 0:
+                return np.zeros(0, dtype=np.float32), 0.0, float(x.size) / sr16k
+            start_idx = int(voiced[0])
+            end_idx = int(voiced[-1])
+            pad = int(round((TRIM_PAD_MS / 1000.0) * sr16k))
+            a = max(0, start_idx - pad)
+            b = min(x.size, end_idx + pad + 1)
+            lead_sec = float(a) / sr16k
+            trail_sec = float(x.size - b) / sr16k
+            return x[a:b], lead_sec, trail_sec
+
+        pre_sec = float(mono_16k.size) / 16000.0
+        trimmed, lead_sec, trail_sec = _trim_silence(mono_16k, 16000)
+        post_sec = float(trimmed.size) / 16000.0
+        debug_log(
+            self.verbose,
+            "whisper",
+            f"Chunk {audio_path.name}: trim {pre_sec:.2f}s -> {post_sec:.2f}s (lead {lead_sec:.2f}s, tail {trail_sec:.2f}s)",
+        )
+
+        # If too short after trimming, skip transcription
+        if post_sec < float(TRIM_MIN_VOICED_SEC):
+            debug_log(
+                self.verbose,
+                "whisper",
+                f"Chunk {audio_path.name}: too short after trim ({post_sec:.2f}s) – skipping",
+            )
+            return {"text": "", "segments": []}
+
        t0 = time.perf_counter()
+        debug_log(
+            self.verbose, "whisper", f"Transcribing chunk {audio_path.name} (frames={frames})"
+        )
        res: dict[str, Any] = model.transcribe(
-            mono_16k,
+            trimmed,
            task=task,
            language=self.language,
            fp16=False,
-            initial_prompt=initial_prompt,
+            initial_prompt=(initial_prompt if post_sec >= float(TRIM_MIN_VOICED_SEC) else None),
+            condition_on_previous_text=bool(WHISPER_CONDITION_ON_PREV),
+            no_speech_threshold=float(WHISPER_NO_SPEECH_THRESHOLD),
        )
        t1 = time.perf_counter()
        self.profile["transcribe_sec"] = self.profile.get("transcribe_sec", 0.0) + (t1 - t0)
+        debug_log(
+            self.verbose, "whisper", f"Transcribed chunk {audio_path.name} in {(t1 - t0):.3f}s"
+        )
        text_c = str(res.get("text", "") or "").strip()
        lang_code = str(res.get("language", "") or "")
        if self.native_segmentation:
@@ -117,8 +188,9 @@
            segs_typed: list[SegmentDict] = []
            for s in segs_raw:
                try:
-                    start = float(s.get("start", 0.0))
-                    end = float(s.get("end", 0.0))
+                    # Adjust for leading trim so times align with original chunk timeline
+                    start = float(s.get("start", 0.0)) + float(lead_sec)
+                    end = float(s.get("end", 0.0)) + float(lead_sec)
                    text = str(s.get("text", "") or "")
                    segs_typed.append({"start": start, "end": end, "text": text})
                except Exception:
@@ -129,8 +201,12 @@
            return out
        # Collapsed single segment per chunk
        segs_raw = res.get("segments", []) or []
-        start = float(segs_raw[0].get("start", 0.0)) if segs_raw else 0.0
-        end = float(segs_raw[-1].get("end", 0.0)) if segs_raw else (frames / float(self.samplerate))
+        start = (float(segs_raw[0].get("start", 0.0)) + float(lead_sec)) if segs_raw else 0.0
+        end = (
+            (float(segs_raw[-1].get("end", 0.0)) + float(lead_sec))
+            if segs_raw
+            else (frames / float(self.samplerate))
+        )
        out2: TranscriptionResult = {
            "text": text_c,
            "segments": ([{"start": start, "end": end, "text": text_c}] if text_c else []),
@@ -143,12 +219,17 @@
        try:
            from whisper.utils import get_writer
 
+            debug_log(self.verbose, "whisper", f"Writing outputs for {audio_path.name}")
            for fmt in ("txt", "srt", "vtt", "tsv", "json"):
                writer = get_writer(fmt, str(self.session_dir))
                writer(result, str(audio_path))
+            debug_log(self.verbose, "whisper", f"Wrote outputs for {audio_path.name}")
        except Exception as e:
-            if self.verbose:
-                print(f"Warning: failed to write chunk outputs for {audio_path.name}: {e}")
+            debug_log(
+                self.verbose,
+                "whisper",
+                f"Warning: failed to write chunk outputs for {audio_path.name}: {e}",
+            )
 
    def merge_results(
        self, results: list[TranscriptionResult], offsets: list[float], cumulative_text: str
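To see what the moving-RMS trim does in isolation, here is a self-contained sketch on synthetic audio (illustrative only; the 0.012 threshold, 30 ms window, and 50 ms pad mirror TRIM_RMS_THRESHOLD, the window inside _trim_silence, and TRIM_PAD_MS above):

    import numpy as np

    sr = 16000
    silence = np.zeros(sr // 2, dtype=np.float32)  # 0.5 s of silence on each side
    tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
    x = np.concatenate([silence, tone, silence])   # 2.0 s total, 1.0 s voiced

    win = np.ones(480, dtype=np.float32) / 480.0   # 30 ms RMS window at 16 kHz
    rms = np.sqrt(np.convolve(np.square(x), win, mode="same"))
    voiced = np.where(rms >= 0.012)[0]
    pad = int(0.05 * sr)                           # 50 ms pad around detected speech
    a = max(0, int(voiced[0]) - pad)
    b = min(x.size, int(voiced[-1]) + pad + 1)
    print(f"trim {x.size / sr:.2f}s -> {(b - a) / sr:.2f}s")  # roughly 2.00s -> 1.1s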
s2t-0.1.10.dist-info/METADATA → s2t-0.1.12.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: s2t
-Version: 0.1.10
+Version: 0.1.12
 Summary: Speech to Text (s2t): Record audio, run Whisper, export formats, and copy transcript to clipboard.
 Author: Maintainers
 License-Expression: LicenseRef-Proprietary
s2t-0.1.12.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+s2t/__init__.py,sha256=wV4E9i-7KrUn1dOtLUQB3ZGEKx9gRWH3hPHlpw-ZdWc,332
+s2t/cli.py,sha256=rcrJ1KWwzrpob0dBWWOJCYH2KBfCUpKD0Is_3f-LzqU,24452
+s2t/config.py,sha256=uw4CZSSXmUvnlOrqBGR1Rcq-WdXucHj3KICRcCb_pkU,485
+s2t/outputs.py,sha256=Lo8VcARZ7QPuuQQNu8myD5J4c4NO1Rs0L1DLnzLe9tM,1546
+s2t/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+s2t/recorder.py,sha256=VYFqZ6LlP2zbwjWclZqM8ve5HEnZ3oyM9hLV1V3IkPI,20495
+s2t/types.py,sha256=jBiRN-tr0qVw-lhaXvnsyKrVGDyLkqEbxs9qkQ6qGqI,339
+s2t/utils.py,sha256=p7klapPW3Multxk261NlPtEpnEi3kpiTSHBPBTv4XC0,4059
+s2t/whisper_engine.py,sha256=T4HYmr1czwj78LsUdgRGWEBCfaghvHVqplQDaQDaR4o,10373
+s2t/translator/__init__.py,sha256=K-MKves7kZ4-62POfrmWeOcBaTjsTzeFSu8QNHqYuus,239
+s2t/translator/argos_backend.py,sha256=hXzQ8ZgJJOcUhcTJdTdVw1lSzptl8FXtfYr5PyOxKkg,19096
+s2t-0.1.12.dist-info/METADATA,sha256=SQ3kqXvmdVmx5SQ1Owmo_moYV4dCs9J8PKBDureJ5Sw,5475
+s2t-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+s2t-0.1.12.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
+s2t-0.1.12.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
+s2t-0.1.12.dist-info/RECORD,,
s2t-0.1.10.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
-s2t/__init__.py,sha256=wV4E9i-7KrUn1dOtLUQB3ZGEKx9gRWH3hPHlpw-ZdWc,332
-s2t/cli.py,sha256=1t3fchtywFaeuPONp-B3FmSzBnSxJRvP6jx9AS-b1Ok,21351
-s2t/config.py,sha256=uw4CZSSXmUvnlOrqBGR1Rcq-WdXucHj3KICRcCb_pkU,485
-s2t/outputs.py,sha256=Lo8VcARZ7QPuuQQNu8myD5J4c4NO1Rs0L1DLnzLe9tM,1546
-s2t/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-s2t/recorder.py,sha256=Z3Hn8l1xLY7XzLR6zqMYulTBfTRNWD-zqwk_V5x18Sc,19228
-s2t/types.py,sha256=jBiRN-tr0qVw-lhaXvnsyKrVGDyLkqEbxs9qkQ6qGqI,339
-s2t/utils.py,sha256=YU6YhiuONmqhrKte4DY5tiC5PP-yFExJMMBzFUiA8qA,3416
-s2t/whisper_engine.py,sha256=x-V7ST9e3JnwMWdbMh4C7dHjA420jaOtXH2-igeh7vc,6492
-s2t/translator/__init__.py,sha256=K-MKves7kZ4-62POfrmWeOcBaTjsTzeFSu8QNHqYuus,239
-s2t/translator/argos_backend.py,sha256=VW_OYFFBuNZgcWM-fbvR6XGokuxS2fptkCMFIO9MD1I,19068
-s2t-0.1.10.dist-info/METADATA,sha256=ViJoiYC5WG_aLaRTLXedQ-o8TwyBQ_sCN126u9o96lY,5475
-s2t-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-s2t-0.1.10.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
-s2t-0.1.10.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
-s2t-0.1.10.dist-info/RECORD,,