s2t 0.1.9-py3-none-any.whl → 0.1.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
s2t/cli.py CHANGED
@@ -49,6 +49,7 @@ from .types import TranscriptionResult
 from .utils import (
     convert_wav_to_mp3,
     copy_to_clipboard,
+    debug_log,
     make_session_dir,
     open_in_shell_editor,
 )
@@ -57,6 +58,7 @@ from .whisper_engine import WhisperEngine
 
 def run_session(opts: SessionOptions) -> int:
     session_dir = make_session_dir(opts.outdir)
+    debug_log(opts.verbose, "cli", f"Session started; directory: {session_dir}")
     profile_data: dict = {}
     requested = opts.recording_format.lower()
     effective = requested
@@ -64,6 +66,12 @@ def run_session(opts: SessionOptions) -> int:
         logging.warning("ffmpeg not found; falling back to FLAC recording instead of MP3.")
         effective = "flac"
     ext = ".flac" if effective == "flac" else ".wav"
+    if requested != effective:
+        debug_log(
+            opts.verbose,
+            "cli",
+            f"Recording format adjusted: requested={requested}, effective={effective}",
+        )
 
     engine = WhisperEngine(
         model_name=opts.model,
@@ -77,6 +85,8 @@ def run_session(opts: SessionOptions) -> int:
         profile=profile_data if opts.profile else {},
     )
     ex, fut = engine.preload()
+    if ex is not None:
+        debug_log(opts.verbose, "cli", f"Model preload submitted for '{opts.model}'")
 
     # Determine translation target languages from options
     target_langs: list[str] = []
@@ -98,8 +108,14 @@ def run_session(opts: SessionOptions) -> int:
             detected_lang_event=detected_lang_event,
             detected_lang_holder=detected_lang,
         )
+        debug_log(
+            opts.verbose,
+            "cli",
+            f"Translation targets requested: {', '.join(target_langs)}",
+        )
 
-    tx_q: queue.Queue[tuple[int, Path, int, float]] = queue.Queue()
+    # Include split cause per chunk: "space" (manual), "pause" (auto), "finish" (final)
+    tx_q: queue.Queue[tuple[int, Path, int, float, str]] = queue.Queue()
     cumulative_text = ""
     next_to_emit = 1
     pending: dict[int, str] = {}
@@ -148,17 +164,29 @@ def run_session(opts: SessionOptions) -> int:
 
     def tx_worker():
         model = engine.resolve_model(fut)
+        debug_log(opts.verbose, "cli", "Transcription worker started")
         nonlocal cumulative_text, next_to_emit
         finished_texts: dict[int, str] = {}
+        causes: dict[int, str] = {}
         while True:
-            idx, path, frames, offset = tx_q.get()
+            idx, path, frames, offset, cause = tx_q.get()
             if idx == -1:
                 break
+            debug_log(
+                opts.verbose,
+                "cli",
+                f"Dequeued chunk {idx}: {path.name if path else '(final)'} (frames={frames}, offset={offset:.3f}, cause={cause or '-'})",
+            )
             # If in spoken-prompt mode, ensure we don't process payload chunks before prompt is done
             if opts.prompt and idx > 1 and not prompt_done.is_set():
+                debug_log(opts.verbose, "cli", f"Waiting for prompt before processing chunk {idx}")
                 prompt_done.wait()
             # Build latest-ready prompt based on already finished chunks
             prompt = _build_latest_ready_prompt(idx, finished_texts)
+            if prompt:
+                debug_log(
+                    opts.verbose, "cli", f"Built initial prompt for chunk {idx} (len={len(prompt)})"
+                )
             res = engine.transcribe_chunk(model, path, frames, initial_prompt=prompt)
             # Record detected language once (for translator preload if needed)
             if target_langs and detected_lang["code"] is None:
@@ -166,6 +194,7 @@ def run_session(opts: SessionOptions) -> int:
                 if lang_code:
                     detected_lang["code"] = lang_code
                     detected_lang_event.set()
+                    debug_log(opts.verbose, "cli", f"Detected source language: {lang_code}")
             engine.write_chunk_outputs(res, path)
             text_i = (res.get("text", "") or "").strip()
             with agg_lock:
@@ -174,20 +203,58 @@ def run_session(opts: SessionOptions) -> int:
                 results.append(res)
                 offsets.append(offset)
                 pending[idx] = text_i
+                # Track cause for formatting when emitting in-order
+                # cause is one of: "space", "pause", "finish" (or empty for sentinel)
+                # Default to "pause" if unknown to avoid extra blank lines.
+                causes[idx] = cause or "pause"
                 while next_to_emit in pending:
                     out = pending.pop(next_to_emit)
+                    cause_i = causes.get(next_to_emit) or "pause"
                     if out:
+                        # Live stdout behavior
                         print(out)
-                        print("")
-                        cumulative_text += out if not cumulative_text else ("\n\n" + out)
-                    try:
-                        copy_to_clipboard(cumulative_text)
-                    except Exception:
-                        pass
+                        if cause_i == "space":
+                            print("")  # blank line after SPACE
+                        # Build cumulative text with post-separator semantics
+                        if not cumulative_text:
+                            cumulative_text = out
+                        else:
+                            cumulative_text += out
+                        # Append separator AFTER the chunk, matching stdout
+                        if cause_i == "space":
+                            if not cumulative_text.endswith("\n\n"):
+                                # ensure exactly one paragraph break
+                                if cumulative_text.endswith("\n"):
+                                    cumulative_text += "\n"
+                                else:
+                                    cumulative_text += "\n\n"
+                        else:
+                            # single line break after non-space chunks
+                            if not (
+                                cumulative_text.endswith("\n") or cumulative_text.endswith("\n\n")
+                            ):
+                                cumulative_text += "\n"
+                    else:
+                        # Even if chunk text is empty, respect SPACE as a paragraph break
+                        if cause_i == "space":
+                            print("")  # blank line on stdout
+                            if cumulative_text:
+                                if cumulative_text.endswith("\n\n"):
+                                    pass
+                                elif cumulative_text.endswith("\n"):
+                                    cumulative_text += "\n"
+                                else:
+                                    cumulative_text += "\n\n"
+                        # For empty non-space chunks, do not alter cumulative_text
+                    try:
+                        copy_to_clipboard(cumulative_text)
+                    except Exception:
+                        pass
                     next_to_emit += 1
                     # If this was the prompt chunk, signal readiness and instruct user
                     if opts.prompt and idx == 1 and not prompt_done.is_set():
                         prompt_done.set()
+                        debug_log(opts.verbose, "cli", "Prompt transcribed; resuming payload")
                         print("=" * 60)
                         print("Prompt transcribed. Start speaking your main content now.")
                         print("=" * 60)
@@ -195,6 +262,7 @@ def run_session(opts: SessionOptions) -> int:
                         if prompt_resume_event is not None:
                             prompt_resume_event.set()
         tx_done.set()
+        debug_log(opts.verbose, "cli", "Transcription worker finished")
 
     tx_t = threading.Thread(target=tx_worker, daemon=True)
     tx_t.start()
@@ -202,6 +270,7 @@
     if opts.prompt:
         print("Prompt mode enabled: Speak your prompt first, then press SPACE.")
         print("Recording will wait for the prompt transcription before starting payload.")
+        debug_log(opts.verbose, "cli", "Prompt mode enabled")
     # Prepare resume event to pause recording between prompt and payload
     prompt_resume_event = threading.Event() if opts.prompt else None
     rec = Recorder(
@@ -221,6 +290,9 @@
     t1 = time.perf_counter()
     if opts.profile:
         profile_data["recording_sec"] = t1 - t0
+    debug_log(
+        opts.verbose, "cli", f"Recording finished in {(t1 - t0):.3f}s (chunks={len(chunk_paths)})"
+    )
     tx_t.join()
 
     merged: TranscriptionResult = engine.merge_results(results, chunk_offsets, cumulative_text)
@@ -230,8 +302,7 @@
     try:
         if chunk_paths:
             concat_audio(chunk_paths, base_audio_path, opts.rate, opts.channels)
-            if opts.verbose:
-                print(f"Merged audio written: {base_audio_path.name}", file=sys.stderr)
+            debug_log(opts.verbose, "cli", f"Merged audio written: {base_audio_path.name}")
             if requested == "mp3" and shutil.which("ffmpeg") is not None:
                 mp3_out = session_dir / "recording.mp3"
                 convert_wav_to_mp3(
@@ -242,11 +313,9 @@
                     ),
                     mp3_out,
                 )
-                if opts.verbose:
-                    print(f"Converted merged audio to MP3: {mp3_out.name}", file=sys.stderr)
+                debug_log(opts.verbose, "cli", f"Converted merged audio to MP3: {mp3_out.name}")
     except Exception as e:
-        if opts.verbose:
-            print(f"Warning: failed to merge chunk audio: {e}", file=sys.stderr)
+        debug_log(opts.verbose, "cli", f"Warning: failed to merge chunk audio: {e}")
 
     # Optionally delete chunk files (audio + per-chunk outputs)
     if chunk_paths and not opts.keep_chunks:
@@ -299,11 +368,11 @@
         # Decide source language: CLI hint takes precedence; else detected; else skip with warning
         src_lang = (opts.lang.lower() if opts.lang else (detected_lang["code"] or "")).strip()
         if not src_lang:
-            if opts.verbose:
-                print(
-                    "Warning: Could not determine source language for translation; skipping post-translation.",
-                    file=sys.stderr,
-                )
+            debug_log(
+                opts.verbose,
+                "cli",
+                "Warning: Could not determine source language for translation; skipping post-translation.",
+            )
         else:
             # Skip identical language targets
            effective_targets = [t for t in target_langs if t.lower() != src_lang.lower()]
@@ -335,8 +404,7 @@
                         f"{base_audio_path.stem}.{tgt}{base_audio_path.suffix}"
                     )
                     write_final_outputs(translated, session_dir, suffixed)
-                    if opts.verbose:
-                        print(f"Created translated outputs for '{tgt}'.", file=sys.stderr)
+                    debug_log(opts.verbose, "cli", f"Created translated outputs for '{tgt}'.")
                 except Exception as e:
                     print(
                         f"Warning: failed to translate to '{tgt}': {e}",
s2t/recorder.py CHANGED
@@ -11,6 +11,8 @@ from typing import Any, Protocol, cast, runtime_checkable
 
 import numpy as np
 
+from .utils import debug_log
+
 
 class Recorder:
     def __init__(
@@ -41,7 +43,7 @@
 
     def run(
         self,
-        tx_queue: queue.Queue[tuple[int, Path, int, float]],
+        tx_queue: queue.Queue[tuple[int, Path, int, float, str]],
    ) -> tuple[list[Path], list[int], list[float]]:
        import platform
        import termios
@@ -71,14 +73,12 @@
                 ms = cast(_MSVCRT, msvcrt)
 
                 last_space = 0.0
-                if self.verbose:
-                    print("[key] using msvcrt (Windows)", file=sys.stderr)
+                debug_log(self.verbose, "recorder", "Key input: using msvcrt (Windows)")
                 while not stop_evt.is_set():
                     if ms.kbhit():
                         ch = ms.getwch()
                         if ch in ("\r", "\n"):
-                            if self.verbose:
-                                print("[key] ENTER", file=sys.stderr)
+                            debug_log(self.verbose, "recorder", "Key input: ENTER")
                             evt_q.put("ENTER")
                             break
                         if ch == " ":
@@ -88,8 +88,7 @@
                         ):
                             continue
                         last_space = now
-                        if self.verbose:
-                            print("[key] SPACE", file=sys.stderr)
+                        debug_log(self.verbose, "recorder", "Key input: SPACE")
                         evt_q.put("SPACE")
                     time.sleep(0.01)
             else:
@@ -97,8 +96,9 @@
                 try:
                     if sys.stdin.isatty():
                         fd = sys.stdin.fileno()
-                        if self.verbose:
-                            print("[key] using sys.stdin (isatty, fd read)", file=sys.stderr)
+                        debug_log(
+                            self.verbose, "recorder", "Key input: using sys.stdin (TTY fd read)"
+                        )
                         old = termios.tcgetattr(fd)
                         tty.setcbreak(fd)
                         last_space = 0.0
@@ -114,8 +114,7 @@
                                 continue
                             ch = ch_b.decode(errors="ignore")
                             if ch in ("\n", "\r"):
-                                if self.verbose:
-                                    print("[key] ENTER", file=sys.stderr)
+                                debug_log(self.verbose, "recorder", "Key input: ENTER")
                                 evt_q.put("ENTER")
                                 break
                             if ch == " ":
@@ -125,8 +124,7 @@
                                 ):
                                     continue
                                 last_space = now
-                                if self.verbose:
-                                    print("[key] SPACE", file=sys.stderr)
+                                debug_log(self.verbose, "recorder", "Key input: SPACE")
                                 evt_q.put("SPACE")
                     finally:
                         termios.tcsetattr(fd, termios.TCSADRAIN, old)
@@ -137,8 +135,11 @@
                     try:
                         fd = os.open("/dev/tty", os.O_RDONLY)
                         using_devtty = True
-                        if self.verbose:
-                            print("[key] using /dev/tty (stdin not TTY)", file=sys.stderr)
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            "Key input: using /dev/tty (stdin not TTY)",
+                        )
                         old = termios.tcgetattr(fd)
                         tty.setcbreak(fd)
                         last_space = 0.0
@@ -151,8 +152,9 @@
                                 continue
                             ch = ch_b.decode(errors="ignore")
                             if ch in ("\n", "\r"):
-                                if self.verbose:
-                                    print("[key] ENTER", file=sys.stderr)
+                                debug_log(
+                                    self.verbose, "recorder", "Key input: ENTER"
+                                )
                                 evt_q.put("ENTER")
                                 break
                             if ch == " ":
@@ -162,8 +164,9 @@
                                 ):
                                     continue
                                 last_space = now
-                                if self.verbose:
-                                    print("[key] SPACE", file=sys.stderr)
+                                debug_log(
+                                    self.verbose, "recorder", "Key input: SPACE"
+                                )
                                 evt_q.put("SPACE")
                     finally:
                         termios.tcsetattr(fd, termios.TCSADRAIN, old)
@@ -185,14 +188,16 @@
                             continue
                         # If user hits Enter on empty line, treat as ENTER
                         if line == "\n" or line == "\r\n":
-                            if self.verbose:
-                                print("[key] ENTER (line mode)", file=sys.stderr)
+                            debug_log(
+                                self.verbose, "recorder", "Key input: ENTER (line mode)"
+                            )
                             evt_q.put("ENTER")
                             break
                         # If first non-empty char is space, treat as SPACE
                         if line and line[0] == " ":
-                            if self.verbose:
-                                print("[key] SPACE (line mode)", file=sys.stderr)
+                            debug_log(
+                                self.verbose, "recorder", "Key input: SPACE (line mode)"
+                            )
                             evt_q.put("SPACE")
             except Exception as e:
                 print(f"Warning: key reader failed: {e}", file=sys.stderr)
@@ -224,7 +229,7 @@
         threshold_rms = 0.015  # conservative RMS threshold for float32 [-1,1]
         split_cooldown_sec = 0.2
 
-        def _do_split() -> None:
+        def _do_split(cause: str) -> None:
             nonlocal fh, frames_written, cur_path, chunk_index, offset_seconds_total
             fh.flush()
             fh.close()
@@ -234,12 +239,19 @@
                 chunk_frames.append(frames_written)
                 chunk_offsets.append(offset_seconds_total)
                 offset_seconds_total += dur
-                if self.verbose:
-                    print(
-                        f"Saved chunk: {cur_path.name} ({dur:.2f}s)",
-                        file=sys.stderr,
-                    )
-                tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1]))
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
+                )
+                # Include split cause so downstream can format output accordingly
+                # cause: "space" (manual split) or "pause" (auto-split)
+                tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1], cause))
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    f"Enqueued chunk {chunk_index} for transcription (cause={cause})",
+                )
             else:
                 try:
                     cur_path.unlink(missing_ok=True)
@@ -253,8 +265,14 @@
                 and self.resume_event is not None
             ):
                 self._paused = True
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    "Paused after first chunk; waiting for resume (prompt mode)",
+                )
                 self.resume_event.wait()
                 self._paused = False
+                debug_log(self.verbose, "recorder", "Resumed after prompt")
             cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
             fh = sf.SoundFile(
                 str(cur_path),
@@ -270,8 +288,13 @@
             try:
                 while True:
                     cmd = ctrl_q.get_nowait()
-                    if cmd == "split":
-                        _do_split()
+                    if cmd == "split_manual":
+                        _do_split("space")
+                        # Reset silence tracking on manual split
+                        silent_frames_run = 0
+                        seen_non_silent = False
+                    elif cmd == "split_auto":
+                        _do_split("pause")
                         # Reset silence tracking on manual split
                         silent_frames_run = 0
                         seen_non_silent = False
@@ -284,20 +307,33 @@
                         chunk_frames.append(frames_written)
                         chunk_offsets.append(offset_seconds_total)
                         offset_seconds_total += dur
-                        if self.verbose:
-                            print(
-                                f"Saved chunk: {cur_path.name} ({dur:.2f}s)",
-                                file=sys.stderr,
-                            )
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
+                        )
+                        # Final chunk – mark cause as "finish" so downstream can avoid extra blank spacing
                         tx_queue.put(
-                            (chunk_index, cur_path, frames_written, chunk_offsets[-1])
+                            (
+                                chunk_index,
+                                cur_path,
+                                frames_written,
+                                chunk_offsets[-1],
+                                "finish",
+                            )
+                        )
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Enqueued final chunk {chunk_index} for transcription",
                         )
                     else:
                         try:
                             cur_path.unlink(missing_ok=True)
                         except Exception:
                             pass
-                    tx_queue.put((-1, Path(), 0, 0.0))
+                    tx_queue.put((-1, Path(), 0, 0.0, ""))
+                    debug_log(self.verbose, "recorder", "Signaled transcription finish")
                     return
             except queue.Empty:
                 pass
@@ -342,18 +378,18 @@
                 enough_length = frames_written >= int(self.samplerate * self.min_chunk_sec)
                 cooldown_ok = (time.perf_counter() - last_split_time) >= split_cooldown_sec
                 if enough_silence and enough_length and seen_non_silent and cooldown_ok:
-                    if self.verbose:
-                        print(
-                            f"[auto] split (≥{self.silence_sec:.2f}s silence)",
-                            file=sys.stderr,
-                        )
+                    debug_log(
+                        self.verbose,
+                        "recorder",
+                        f"Auto-split (≥{self.silence_sec:.2f}s silence)",
+                    )
                     last_split_time = time.perf_counter()
-                    # Queue a split for the next control phase
-                    ctrl_q.put("split")
+                    # Queue an auto split for the next control phase
+                    ctrl_q.put("split_auto")
                     # Reset silence tracking now to avoid cascaded triggers
                     silent_frames_run = 0
                     seen_non_silent = False
-            tx_queue.put((-1, Path(), 0, 0.0))
+            tx_queue.put((-1, Path(), 0, 0.0, ""))
 
         def cb(indata: Any, frames: int, time_info: Any, status: Any) -> None:
             if status:
@@ -375,6 +411,12 @@
         print("—" * 60)
         print("")
 
+        debug_log(
+            self.verbose,
+            "recorder",
+            f"Recording started (rate={self.samplerate}, channels={self.channels}, ext={self.ext})",
+        )
+
         import sounddevice as sd
 
         with sd.InputStream(samplerate=self.samplerate, channels=self.channels, callback=cb):
@@ -384,9 +426,10 @@
                 except queue.Empty:
                     continue
                 if evt == "SPACE":
-                    ctrl_q.put("split")
+                    ctrl_q.put("split_manual")
                 elif evt == "ENTER":
                     ctrl_q.put("finish")
                     break
         writer_t.join()
+        debug_log(self.verbose, "recorder", "Recording finished")
         return chunk_paths, chunk_frames, chunk_offsets
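
With this change the Recorder-to-worker queue contract is a 5-tuple whose last element is the split cause ("space", "pause", or "finish"), terminated by an idx == -1 sentinel. A minimal consumer sketch (illustrative only; names mirror the diff):

import queue
from pathlib import Path

tx_q: queue.Queue[tuple[int, Path, int, float, str]] = queue.Queue()
tx_q.put((1, Path("chunk_0001.flac"), 16000, 0.0, "pause"))  # auto-split chunk
tx_q.put((2, Path("chunk_0002.flac"), 8000, 1.0, "space"))   # manual SPACE split
tx_q.put((-1, Path(), 0, 0.0, ""))                           # finish sentinel

while True:
    idx, path, frames, offset, cause = tx_q.get()
    if idx == -1:
        break
    print(f"chunk {idx}: {path.name} frames={frames} offset={offset:.3f} cause={cause}")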
s2t/translator/argos_backend.py CHANGED
@@ -8,6 +8,7 @@ from collections.abc import Iterable
 from pathlib import Path
 
 from ..types import SegmentDict, TranscriptionResult
+from ..utils import debug_log
 
 # Global install coordination to avoid duplicate downloads in parallel
 _install_lock = threading.Lock()
@@ -25,8 +26,7 @@ class ArgosTranslator:
         self.verbose = verbose
 
     def _debug(self, msg: str) -> None:
-        if self.verbose:
-            print(msg)
+        debug_log(self.verbose, "argos", msg)
 
     @staticmethod
     def _guess_packages_dir() -> str:
s2t/utils.py CHANGED
@@ -5,6 +5,7 @@ import platform
 import shutil
 import subprocess
 import sys
+import time
 from datetime import datetime
 from pathlib import Path
 
@@ -36,6 +37,25 @@ def convert_wav_to_mp3(wav_path: Path, mp3_path: Path) -> None:
     subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
 
 
+# Baseline at program start for relative timestamps in verbose logs
+_START_TIME = time.perf_counter()
+
+
+def debug_log(verbose: bool, component: str, msg: str) -> None:
+    """Emit a timestamped debug line to stderr if verbose is enabled.
+
+    Args:
+        verbose: Whether verbose mode is active.
+        component: Short component tag (e.g., 'recorder', 'whisper', 'cli', 'argos').
+        msg: Message to print.
+    """
+    if not verbose:
+        return
+    elapsed = time.perf_counter() - _START_TIME
+    # Elapsed time with millisecond precision
+    print(f"[+{elapsed:.3f}s] [{component}] {msg}", file=sys.stderr, flush=True)
+
+
 def copy_to_clipboard(text: str) -> None:
     system = platform.system()
     try:
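
Usage of the new helper is uniform across components (illustrative sketch, assuming the wheel is installed; the output format follows the implementation above):

from s2t.utils import debug_log

# Prints e.g. "[+0.012s] [cli] Session started" to stderr (elapsed time varies)
debug_log(True, "cli", "Session started")
# No-op: messages are suppressed entirely when verbose is off
debug_log(False, "cli", "never shown")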
s2t/whisper_engine.py CHANGED
@@ -5,7 +5,20 @@ from concurrent.futures import Future, ThreadPoolExecutor
 from pathlib import Path
 from typing import Any
 
+import numpy as np
+
 from .types import SegmentDict, TranscriptionResult
+from .utils import debug_log
+
+# --- Tuning parameters (easy to adjust later) ---
+# Silence trim parameters operate on 16 kHz mono arrays
+TRIM_RMS_THRESHOLD: float = 0.012  # RMS threshold for speech vs. silence
+TRIM_MIN_VOICED_SEC: float = 0.5  # Require at least this much voiced audio to transcribe
+TRIM_PAD_MS: int = 50  # Keep a short pad around detected speech (ms)
+
+# Whisper inference behavior on low/empty audio
+WHISPER_NO_SPEECH_THRESHOLD: float = 0.7
+WHISPER_CONDITION_ON_PREV: bool = False
 
 
 class WhisperEngine:
@@ -48,6 +61,7 @@ class WhisperEngine:
                 return m, (t1 - t0)
 
             fut = self._executor.submit(_load, self.model_name)
+            debug_log(self.verbose, "whisper", f"Submitted model preload: {self.model_name}")
             return self._executor, fut
         except Exception:
             return None, None
@@ -62,6 +76,9 @@
                 self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + float(
                     load_dur
                 )
+                debug_log(
+                    self.verbose, "whisper", f"Model resolved via preload in {float(load_dur):.3f}s"
+                )
             except Exception:
                 model = None
         if model is None:
@@ -69,6 +86,7 @@
             model = whisper.load_model(self.model_name)
             t1m = time.perf_counter()
             self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + (t1m - t0m)
+            debug_log(self.verbose, "whisper", f"Loaded model synchronously in {(t1m - t0m):.3f}s")
         return model
 
     def transcribe_chunk(
@@ -79,10 +97,8 @@
         initial_prompt: str | None = None,
     ) -> TranscriptionResult:
         # Load audio without ffmpeg by reading via soundfile and passing a numpy array
-        # to Whisper. We ensure mono float32 at 16 kHz as expected by Whisper's API.
+        # to Whisper. Convert to mono float32 and resample to 16 kHz as expected by Whisper's API.
         task = "translate" if self.translate else "transcribe"
-        import numpy as np
-
         try:
             import soundfile as sf
         except Exception as e:
@@ -100,16 +116,71 @@
         # Resample to 16k expected by Whisper when passing arrays
         mono_16k: np.ndarray = resample_linear(mono, int(sr), 16000)
 
+        # Trim leading/trailing silence to avoid hallucinations on near-empty chunks
+        def _moving_rms(x: np.ndarray, win_len: int) -> np.ndarray:
+            if x.size == 0:
+                return np.zeros(0, dtype=np.float32)
+            win = np.ones(win_len, dtype=np.float32) / float(win_len)
+            sq = np.square(x.astype(np.float32, copy=False))
+            # same-length RMS via 'same' convolution
+            ma = np.convolve(sq, win, mode="same")
+            return np.sqrt(ma).astype(np.float32, copy=False)
+
+        def _trim_silence(x: np.ndarray, sr16k: int) -> tuple[np.ndarray, float, float]:
+            # Returns (trimmed, leading_sec, trailing_sec)
+            if x.size == 0:
+                return x, 0.0, 0.0
+            win_len = max(1, int(round(sr16k * 0.03)))  # 30 ms window
+            rms = _moving_rms(x, win_len)
+            thr = float(TRIM_RMS_THRESHOLD)
+            voiced = np.where(rms >= thr)[0]
+            if voiced.size == 0:
+                return np.zeros(0, dtype=np.float32), 0.0, float(x.size) / sr16k
+            start_idx = int(voiced[0])
+            end_idx = int(voiced[-1])
+            pad = int(round((TRIM_PAD_MS / 1000.0) * sr16k))
+            a = max(0, start_idx - pad)
+            b = min(x.size, end_idx + pad + 1)
+            lead_sec = float(a) / sr16k
+            trail_sec = float(x.size - b) / sr16k
+            return x[a:b], lead_sec, trail_sec
+
+        pre_sec = float(mono_16k.size) / 16000.0
+        trimmed, lead_sec, trail_sec = _trim_silence(mono_16k, 16000)
+        post_sec = float(trimmed.size) / 16000.0
+        debug_log(
+            self.verbose,
+            "whisper",
+            f"Chunk {audio_path.name}: trim {pre_sec:.2f}s -> {post_sec:.2f}s (lead {lead_sec:.2f}s, tail {trail_sec:.2f}s)",
+        )
+
+        # If too short after trimming, skip transcription
+        if post_sec < float(TRIM_MIN_VOICED_SEC):
+            debug_log(
+                self.verbose,
+                "whisper",
+                f"Chunk {audio_path.name}: too short after trim ({post_sec:.2f}s) – skipping",
+            )
+            return {"text": "", "segments": []}
+
         t0 = time.perf_counter()
+        debug_log(
+            self.verbose, "whisper", f"Transcribing chunk {audio_path.name} (frames={frames})"
+        )
         res: dict[str, Any] = model.transcribe(
-            mono_16k,
+            trimmed,
             task=task,
             language=self.language,
             fp16=False,
-            initial_prompt=initial_prompt,
+            initial_prompt=(initial_prompt if post_sec >= float(TRIM_MIN_VOICED_SEC) else None),
+            condition_on_previous_text=bool(WHISPER_CONDITION_ON_PREV),
+            no_speech_threshold=float(WHISPER_NO_SPEECH_THRESHOLD),
         )
         t1 = time.perf_counter()
         self.profile["transcribe_sec"] = self.profile.get("transcribe_sec", 0.0) + (t1 - t0)
+        debug_log(
+            self.verbose, "whisper", f"Transcribed chunk {audio_path.name} in {(t1 - t0):.3f}s"
+        )
         text_c = str(res.get("text", "") or "").strip()
         lang_code = str(res.get("language", "") or "")
         if self.native_segmentation:
@@ -117,8 +188,9 @@
             segs_typed: list[SegmentDict] = []
             for s in segs_raw:
                 try:
-                    start = float(s.get("start", 0.0))
-                    end = float(s.get("end", 0.0))
+                    # Adjust for leading trim so times align with original chunk timeline
+                    start = float(s.get("start", 0.0)) + float(lead_sec)
+                    end = float(s.get("end", 0.0)) + float(lead_sec)
                     text = str(s.get("text", "") or "")
                     segs_typed.append({"start": start, "end": end, "text": text})
                 except Exception:
@@ -129,8 +201,12 @@
             return out
         # Collapsed single segment per chunk
         segs_raw = res.get("segments", []) or []
-        start = float(segs_raw[0].get("start", 0.0)) if segs_raw else 0.0
-        end = float(segs_raw[-1].get("end", 0.0)) if segs_raw else (frames / float(self.samplerate))
+        start = (float(segs_raw[0].get("start", 0.0)) + float(lead_sec)) if segs_raw else 0.0
+        end = (
+            (float(segs_raw[-1].get("end", 0.0)) + float(lead_sec))
+            if segs_raw
+            else (frames / float(self.samplerate))
+        )
         out2: TranscriptionResult = {
             "text": text_c,
             "segments": ([{"start": start, "end": end, "text": text_c}] if text_c else []),
@@ -143,12 +219,17 @@
         try:
             from whisper.utils import get_writer
 
+            debug_log(self.verbose, "whisper", f"Writing outputs for {audio_path.name}")
             for fmt in ("txt", "srt", "vtt", "tsv", "json"):
                 writer = get_writer(fmt, str(self.session_dir))
                 writer(result, str(audio_path))
+            debug_log(self.verbose, "whisper", f"Wrote outputs for {audio_path.name}")
         except Exception as e:
-            if self.verbose:
-                print(f"Warning: failed to write chunk outputs for {audio_path.name}: {e}")
+            debug_log(
+                self.verbose,
+                "whisper",
+                f"Warning: failed to write chunk outputs for {audio_path.name}: {e}",
+            )
 
     def merge_results(
         self, results: list[TranscriptionResult], offsets: list[float], cumulative_text: str
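
The trim pass added above is a 30 ms moving-RMS gate: samples whose windowed RMS clears TRIM_RMS_THRESHOLD are kept, padded by TRIM_PAD_MS on each side. A self-contained sketch on synthetic 16 kHz audio (illustrative only; constants copied from the diff):

import numpy as np

SR = 16000
# 0.5 s silence, 1 s of a 220 Hz tone at 0.1 amplitude, 0.5 s silence
sig = np.concatenate([
    np.zeros(SR // 2, dtype=np.float32),
    (0.1 * np.sin(2 * np.pi * 220 * np.arange(SR) / SR)).astype(np.float32),
    np.zeros(SR // 2, dtype=np.float32),
])

win_len = int(SR * 0.03)  # 30 ms window, as in _trim_silence
rms = np.sqrt(np.convolve(np.square(sig), np.ones(win_len) / win_len, mode="same"))
voiced = np.where(rms >= 0.012)[0]  # TRIM_RMS_THRESHOLD
pad = int(0.05 * SR)                # TRIM_PAD_MS = 50 ms
a = max(0, voiced[0] - pad)
b = min(sig.size, voiced[-1] + pad + 1)
print(f"trim {sig.size / SR:.2f}s -> {(b - a) / SR:.2f}s (lead {a / SR:.2f}s)")
# Prints roughly: trim 2.00s -> 1.13s (lead 0.44s)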
s2t-0.1.9.dist-info/METADATA → s2t-0.1.11.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: s2t
-Version: 0.1.9
+Version: 0.1.11
 Summary: Speech to Text (s2t): Record audio, run Whisper, export formats, and copy transcript to clipboard.
 Author: Maintainers
 License-Expression: LicenseRef-Proprietary
s2t-0.1.11.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+s2t/__init__.py,sha256=wV4E9i-7KrUn1dOtLUQB3ZGEKx9gRWH3hPHlpw-ZdWc,332
+s2t/cli.py,sha256=HgN-AXrBKn1MLjQd8U5hXBETlOKhOa8_3BH96GaVRss,24259
+s2t/config.py,sha256=uw4CZSSXmUvnlOrqBGR1Rcq-WdXucHj3KICRcCb_pkU,485
+s2t/outputs.py,sha256=Lo8VcARZ7QPuuQQNu8myD5J4c4NO1Rs0L1DLnzLe9tM,1546
+s2t/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+s2t/recorder.py,sha256=VYFqZ6LlP2zbwjWclZqM8ve5HEnZ3oyM9hLV1V3IkPI,20495
+s2t/types.py,sha256=jBiRN-tr0qVw-lhaXvnsyKrVGDyLkqEbxs9qkQ6qGqI,339
+s2t/utils.py,sha256=p7klapPW3Multxk261NlPtEpnEi3kpiTSHBPBTv4XC0,4059
+s2t/whisper_engine.py,sha256=T4HYmr1czwj78LsUdgRGWEBCfaghvHVqplQDaQDaR4o,10373
+s2t/translator/__init__.py,sha256=K-MKves7kZ4-62POfrmWeOcBaTjsTzeFSu8QNHqYuus,239
+s2t/translator/argos_backend.py,sha256=hXzQ8ZgJJOcUhcTJdTdVw1lSzptl8FXtfYr5PyOxKkg,19096
+s2t-0.1.11.dist-info/METADATA,sha256=kKEruaZB9lMsAeeejO4wwsj52jffj8tBCnp5WpYAlms,5475
+s2t-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+s2t-0.1.11.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
+s2t-0.1.11.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
+s2t-0.1.11.dist-info/RECORD,,
s2t-0.1.9.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
-s2t/__init__.py,sha256=wV4E9i-7KrUn1dOtLUQB3ZGEKx9gRWH3hPHlpw-ZdWc,332
-s2t/cli.py,sha256=W-QyO8NhzslwK8cyodKyg9crXeffD1IlblcIGGqZt7Q,20572
-s2t/config.py,sha256=uw4CZSSXmUvnlOrqBGR1Rcq-WdXucHj3KICRcCb_pkU,485
-s2t/outputs.py,sha256=Lo8VcARZ7QPuuQQNu8myD5J4c4NO1Rs0L1DLnzLe9tM,1546
-s2t/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-s2t/recorder.py,sha256=tVUGwnHnlwQM9UtdHUlQIZ3GubCrop9suZYxc9yHHmE,18370
-s2t/types.py,sha256=jBiRN-tr0qVw-lhaXvnsyKrVGDyLkqEbxs9qkQ6qGqI,339
-s2t/utils.py,sha256=YU6YhiuONmqhrKte4DY5tiC5PP-yFExJMMBzFUiA8qA,3416
-s2t/whisper_engine.py,sha256=x-V7ST9e3JnwMWdbMh4C7dHjA420jaOtXH2-igeh7vc,6492
-s2t/translator/__init__.py,sha256=K-MKves7kZ4-62POfrmWeOcBaTjsTzeFSu8QNHqYuus,239
-s2t/translator/argos_backend.py,sha256=VW_OYFFBuNZgcWM-fbvR6XGokuxS2fptkCMFIO9MD1I,19068
-s2t-0.1.9.dist-info/METADATA,sha256=eY-j5C7WKtOhnxdKsFNEFU6MiT5G1cXJQpMroKAsIi8,5474
-s2t-0.1.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-s2t-0.1.9.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
-s2t-0.1.9.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
-s2t-0.1.9.dist-info/RECORD,,
Files without changes (identical hashes in both RECORDs): s2t/__init__.py, s2t/config.py, s2t/outputs.py, s2t/py.typed, s2t/types.py, s2t/translator/__init__.py, WHEEL, entry_points.txt, top_level.txt