s2t 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- s2t/cli.py +86 -24
- s2t/recorder.py +66 -37
- s2t/translator/argos_backend.py +2 -2
- s2t/utils.py +20 -0
- s2t/whisper_engine.py +92 -11
- {s2t-0.1.10.dist-info → s2t-0.1.12.dist-info}/METADATA +1 -1
- s2t-0.1.12.dist-info/RECORD +16 -0
- s2t-0.1.10.dist-info/RECORD +0 -16
- {s2t-0.1.10.dist-info → s2t-0.1.12.dist-info}/WHEEL +0 -0
- {s2t-0.1.10.dist-info → s2t-0.1.12.dist-info}/entry_points.txt +0 -0
- {s2t-0.1.10.dist-info → s2t-0.1.12.dist-info}/top_level.txt +0 -0
s2t/cli.py
CHANGED
@@ -49,6 +49,7 @@ from .types import TranscriptionResult
|
|
49
49
|
from .utils import (
|
50
50
|
convert_wav_to_mp3,
|
51
51
|
copy_to_clipboard,
|
52
|
+
debug_log,
|
52
53
|
make_session_dir,
|
53
54
|
open_in_shell_editor,
|
54
55
|
)
|
@@ -57,6 +58,7 @@ from .whisper_engine import WhisperEngine
|
|
57
58
|
|
58
59
|
def run_session(opts: SessionOptions) -> int:
|
59
60
|
session_dir = make_session_dir(opts.outdir)
|
61
|
+
debug_log(opts.verbose, "cli", f"Session started; directory: {session_dir}")
|
60
62
|
profile_data: dict = {}
|
61
63
|
requested = opts.recording_format.lower()
|
62
64
|
effective = requested
|
@@ -64,6 +66,12 @@ def run_session(opts: SessionOptions) -> int:
|
|
64
66
|
logging.warning("ffmpeg not found; falling back to FLAC recording instead of MP3.")
|
65
67
|
effective = "flac"
|
66
68
|
ext = ".flac" if effective == "flac" else ".wav"
|
69
|
+
if requested != effective:
|
70
|
+
debug_log(
|
71
|
+
opts.verbose,
|
72
|
+
"cli",
|
73
|
+
f"Recording format adjusted: requested={requested}, effective={effective}",
|
74
|
+
)
|
67
75
|
|
68
76
|
engine = WhisperEngine(
|
69
77
|
model_name=opts.model,
|
@@ -77,6 +85,8 @@ def run_session(opts: SessionOptions) -> int:
|
|
77
85
|
profile=profile_data if opts.profile else {},
|
78
86
|
)
|
79
87
|
ex, fut = engine.preload()
|
88
|
+
if ex is not None:
|
89
|
+
debug_log(opts.verbose, "cli", f"Model preload submitted for '{opts.model}'")
|
80
90
|
|
81
91
|
# Determine translation target languages from options
|
82
92
|
target_langs: list[str] = []
|
@@ -98,6 +108,11 @@ def run_session(opts: SessionOptions) -> int:
|
|
98
108
|
detected_lang_event=detected_lang_event,
|
99
109
|
detected_lang_holder=detected_lang,
|
100
110
|
)
|
111
|
+
debug_log(
|
112
|
+
opts.verbose,
|
113
|
+
"cli",
|
114
|
+
f"Translation targets requested: {', '.join(target_langs)}",
|
115
|
+
)
|
101
116
|
|
102
117
|
# Include split cause per chunk: "space" (manual), "pause" (auto), "finish" (final)
|
103
118
|
tx_q: queue.Queue[tuple[int, Path, int, float, str]] = queue.Queue()
|
@@ -149,6 +164,7 @@ def run_session(opts: SessionOptions) -> int:
|
|
149
164
|
|
150
165
|
def tx_worker():
|
151
166
|
model = engine.resolve_model(fut)
|
167
|
+
debug_log(opts.verbose, "cli", "Transcription worker started")
|
152
168
|
nonlocal cumulative_text, next_to_emit
|
153
169
|
finished_texts: dict[int, str] = {}
|
154
170
|
causes: dict[int, str] = {}
|
@@ -156,11 +172,21 @@ def run_session(opts: SessionOptions) -> int:
|
|
156
172
|
idx, path, frames, offset, cause = tx_q.get()
|
157
173
|
if idx == -1:
|
158
174
|
break
|
175
|
+
debug_log(
|
176
|
+
opts.verbose,
|
177
|
+
"cli",
|
178
|
+
f"Dequeued chunk {idx}: {path.name if path else '(final)'} (frames={frames}, offset={offset:.3f}, cause={cause or '-'})",
|
179
|
+
)
|
159
180
|
# If in spoken-prompt mode, ensure we don't process payload chunks before prompt is done
|
160
181
|
if opts.prompt and idx > 1 and not prompt_done.is_set():
|
182
|
+
debug_log(opts.verbose, "cli", f"Waiting for prompt before processing chunk {idx}")
|
161
183
|
prompt_done.wait()
|
162
184
|
# Build latest-ready prompt based on already finished chunks
|
163
185
|
prompt = _build_latest_ready_prompt(idx, finished_texts)
|
186
|
+
if prompt:
|
187
|
+
debug_log(
|
188
|
+
opts.verbose, "cli", f"Built initial prompt for chunk {idx} (len={len(prompt)})"
|
189
|
+
)
|
164
190
|
res = engine.transcribe_chunk(model, path, frames, initial_prompt=prompt)
|
165
191
|
# Record detected language once (for translator preload if needed)
|
166
192
|
if target_langs and detected_lang["code"] is None:
|
@@ -168,6 +194,7 @@ def run_session(opts: SessionOptions) -> int:
|
|
168
194
|
if lang_code:
|
169
195
|
detected_lang["code"] = lang_code
|
170
196
|
detected_lang_event.set()
|
197
|
+
debug_log(opts.verbose, "cli", f"Detected source language: {lang_code}")
|
171
198
|
engine.write_chunk_outputs(res, path)
|
172
199
|
text_i = (res.get("text", "") or "").strip()
|
173
200
|
with agg_lock:
|
@@ -182,23 +209,52 @@ def run_session(opts: SessionOptions) -> int:
|
|
182
209
|
causes[idx] = cause or "pause"
|
183
210
|
while next_to_emit in pending:
|
184
211
|
out = pending.pop(next_to_emit)
|
212
|
+
cause_i = causes.get(next_to_emit) or "pause"
|
185
213
|
if out:
|
186
|
-
#
|
187
|
-
sep = "\n\n" if causes.get(next_to_emit) == "space" else "\n"
|
214
|
+
# Live stdout behavior
|
188
215
|
print(out)
|
189
|
-
if
|
190
|
-
#
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
216
|
+
if cause_i == "space":
|
217
|
+
print("") # blank line after SPACE
|
218
|
+
# Build cumulative text with post-separator semantics
|
219
|
+
if not cumulative_text:
|
220
|
+
cumulative_text = out
|
221
|
+
else:
|
222
|
+
cumulative_text += out
|
223
|
+
# Append separator AFTER the chunk, matching stdout
|
224
|
+
if cause_i == "space":
|
225
|
+
if not cumulative_text.endswith("\n\n"):
|
226
|
+
# ensure exactly one paragraph break
|
227
|
+
if cumulative_text.endswith("\n"):
|
228
|
+
cumulative_text += "\n"
|
229
|
+
else:
|
230
|
+
cumulative_text += "\n\n"
|
231
|
+
else:
|
232
|
+
# single line break after non-space chunks
|
233
|
+
if not (
|
234
|
+
cumulative_text.endswith("\n") or cumulative_text.endswith("\n\n")
|
235
|
+
):
|
236
|
+
cumulative_text += "\n"
|
237
|
+
else:
|
238
|
+
# Even if chunk text is empty, respect SPACE as a paragraph break
|
239
|
+
if cause_i == "space":
|
240
|
+
print("") # blank line on stdout
|
241
|
+
if cumulative_text:
|
242
|
+
if cumulative_text.endswith("\n\n"):
|
243
|
+
pass
|
244
|
+
elif cumulative_text.endswith("\n"):
|
245
|
+
cumulative_text += "\n"
|
246
|
+
else:
|
247
|
+
cumulative_text += "\n\n"
|
248
|
+
# For empty non-space chunks, do not alter cumulative_text
|
249
|
+
try:
|
250
|
+
copy_to_clipboard(cumulative_text)
|
251
|
+
except Exception:
|
252
|
+
pass
|
198
253
|
next_to_emit += 1
|
199
254
|
# If this was the prompt chunk, signal readiness and instruct user
|
200
255
|
if opts.prompt and idx == 1 and not prompt_done.is_set():
|
201
256
|
prompt_done.set()
|
257
|
+
debug_log(opts.verbose, "cli", "Prompt transcribed; resuming payload")
|
202
258
|
print("=" * 60)
|
203
259
|
print("Prompt transcribed. Start speaking your main content now.")
|
204
260
|
print("=" * 60)
|
@@ -206,6 +262,7 @@ def run_session(opts: SessionOptions) -> int:
|
|
206
262
|
if prompt_resume_event is not None:
|
207
263
|
prompt_resume_event.set()
|
208
264
|
tx_done.set()
|
265
|
+
debug_log(opts.verbose, "cli", "Transcription worker finished")
|
209
266
|
|
210
267
|
tx_t = threading.Thread(target=tx_worker, daemon=True)
|
211
268
|
tx_t.start()
|
@@ -213,6 +270,7 @@ def run_session(opts: SessionOptions) -> int:
|
|
213
270
|
if opts.prompt:
|
214
271
|
print("Prompt mode enabled: Speak your prompt first, then press SPACE.")
|
215
272
|
print("Recording will wait for the prompt transcription before starting payload.")
|
273
|
+
debug_log(opts.verbose, "cli", "Prompt mode enabled")
|
216
274
|
# Prepare resume event to pause recording between prompt and payload
|
217
275
|
prompt_resume_event = threading.Event() if opts.prompt else None
|
218
276
|
rec = Recorder(
|
@@ -232,17 +290,24 @@ def run_session(opts: SessionOptions) -> int:
|
|
232
290
|
t1 = time.perf_counter()
|
233
291
|
if opts.profile:
|
234
292
|
profile_data["recording_sec"] = t1 - t0
|
293
|
+
debug_log(
|
294
|
+
opts.verbose, "cli", f"Recording finished in {(t1 - t0):.3f}s (chunks={len(chunk_paths)})"
|
295
|
+
)
|
235
296
|
tx_t.join()
|
236
297
|
|
237
298
|
merged: TranscriptionResult = engine.merge_results(results, chunk_offsets, cumulative_text)
|
238
299
|
base_audio_path = session_dir / f"recording{ext}"
|
239
300
|
txt_path = write_final_outputs(merged, session_dir, base_audio_path)
|
301
|
+
# Ensure Recording.txt exactly mirrors the clipboard text (including blank lines)
|
302
|
+
try:
|
303
|
+
txt_path.write_text(cumulative_text, encoding="utf-8")
|
304
|
+
except Exception:
|
305
|
+
pass
|
240
306
|
|
241
307
|
try:
|
242
308
|
if chunk_paths:
|
243
309
|
concat_audio(chunk_paths, base_audio_path, opts.rate, opts.channels)
|
244
|
-
|
245
|
-
print(f"Merged audio written: {base_audio_path.name}", file=sys.stderr)
|
310
|
+
debug_log(opts.verbose, "cli", f"Merged audio written: {base_audio_path.name}")
|
246
311
|
if requested == "mp3" and shutil.which("ffmpeg") is not None:
|
247
312
|
mp3_out = session_dir / "recording.mp3"
|
248
313
|
convert_wav_to_mp3(
|
@@ -253,11 +318,9 @@ def run_session(opts: SessionOptions) -> int:
|
|
253
318
|
),
|
254
319
|
mp3_out,
|
255
320
|
)
|
256
|
-
|
257
|
-
print(f"Converted merged audio to MP3: {mp3_out.name}", file=sys.stderr)
|
321
|
+
debug_log(opts.verbose, "cli", f"Converted merged audio to MP3: {mp3_out.name}")
|
258
322
|
except Exception as e:
|
259
|
-
|
260
|
-
print(f"Warning: failed to merge chunk audio: {e}", file=sys.stderr)
|
323
|
+
debug_log(opts.verbose, "cli", f"Warning: failed to merge chunk audio: {e}")
|
261
324
|
|
262
325
|
# Optionally delete chunk files (audio + per-chunk outputs)
|
263
326
|
if chunk_paths and not opts.keep_chunks:
|
@@ -310,11 +373,11 @@ def run_session(opts: SessionOptions) -> int:
|
|
310
373
|
# Decide source language: CLI hint takes precedence; else detected; else skip with warning
|
311
374
|
src_lang = (opts.lang.lower() if opts.lang else (detected_lang["code"] or "")).strip()
|
312
375
|
if not src_lang:
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
376
|
+
debug_log(
|
377
|
+
opts.verbose,
|
378
|
+
"cli",
|
379
|
+
"Warning: Could not determine source language for translation; skipping post-translation.",
|
380
|
+
)
|
318
381
|
else:
|
319
382
|
# Skip identical language targets
|
320
383
|
effective_targets = [t for t in target_langs if t.lower() != src_lang.lower()]
|
@@ -346,8 +409,7 @@ def run_session(opts: SessionOptions) -> int:
|
|
346
409
|
f"{base_audio_path.stem}.{tgt}{base_audio_path.suffix}"
|
347
410
|
)
|
348
411
|
write_final_outputs(translated, session_dir, suffixed)
|
349
|
-
|
350
|
-
print(f"Created translated outputs for '{tgt}'.", file=sys.stderr)
|
412
|
+
debug_log(opts.verbose, "cli", f"Created translated outputs for '{tgt}'.")
|
351
413
|
except Exception as e:
|
352
414
|
print(
|
353
415
|
f"Warning: failed to translate to '{tgt}': {e}",
|
s2t/recorder.py
CHANGED
@@ -11,6 +11,8 @@ from typing import Any, Protocol, cast, runtime_checkable
|
|
11
11
|
|
12
12
|
import numpy as np
|
13
13
|
|
14
|
+
from .utils import debug_log
|
15
|
+
|
14
16
|
|
15
17
|
class Recorder:
|
16
18
|
def __init__(
|
@@ -71,14 +73,12 @@ class Recorder:
|
|
71
73
|
ms = cast(_MSVCRT, msvcrt)
|
72
74
|
|
73
75
|
last_space = 0.0
|
74
|
-
|
75
|
-
print("[key] using msvcrt (Windows)", file=sys.stderr)
|
76
|
+
debug_log(self.verbose, "recorder", "Key input: using msvcrt (Windows)")
|
76
77
|
while not stop_evt.is_set():
|
77
78
|
if ms.kbhit():
|
78
79
|
ch = ms.getwch()
|
79
80
|
if ch in ("\r", "\n"):
|
80
|
-
|
81
|
-
print("[key] ENTER", file=sys.stderr)
|
81
|
+
debug_log(self.verbose, "recorder", "Key input: ENTER")
|
82
82
|
evt_q.put("ENTER")
|
83
83
|
break
|
84
84
|
if ch == " ":
|
@@ -88,8 +88,7 @@ class Recorder:
|
|
88
88
|
):
|
89
89
|
continue
|
90
90
|
last_space = now
|
91
|
-
|
92
|
-
print("[key] SPACE", file=sys.stderr)
|
91
|
+
debug_log(self.verbose, "recorder", "Key input: SPACE")
|
93
92
|
evt_q.put("SPACE")
|
94
93
|
time.sleep(0.01)
|
95
94
|
else:
|
@@ -97,8 +96,9 @@ class Recorder:
|
|
97
96
|
try:
|
98
97
|
if sys.stdin.isatty():
|
99
98
|
fd = sys.stdin.fileno()
|
100
|
-
|
101
|
-
|
99
|
+
debug_log(
|
100
|
+
self.verbose, "recorder", "Key input: using sys.stdin (TTY fd read)"
|
101
|
+
)
|
102
102
|
old = termios.tcgetattr(fd)
|
103
103
|
tty.setcbreak(fd)
|
104
104
|
last_space = 0.0
|
@@ -114,8 +114,7 @@ class Recorder:
|
|
114
114
|
continue
|
115
115
|
ch = ch_b.decode(errors="ignore")
|
116
116
|
if ch in ("\n", "\r"):
|
117
|
-
|
118
|
-
print("[key] ENTER", file=sys.stderr)
|
117
|
+
debug_log(self.verbose, "recorder", "Key input: ENTER")
|
119
118
|
evt_q.put("ENTER")
|
120
119
|
break
|
121
120
|
if ch == " ":
|
@@ -125,8 +124,7 @@ class Recorder:
|
|
125
124
|
):
|
126
125
|
continue
|
127
126
|
last_space = now
|
128
|
-
|
129
|
-
print("[key] SPACE", file=sys.stderr)
|
127
|
+
debug_log(self.verbose, "recorder", "Key input: SPACE")
|
130
128
|
evt_q.put("SPACE")
|
131
129
|
finally:
|
132
130
|
termios.tcsetattr(fd, termios.TCSADRAIN, old)
|
@@ -137,8 +135,11 @@ class Recorder:
|
|
137
135
|
try:
|
138
136
|
fd = os.open("/dev/tty", os.O_RDONLY)
|
139
137
|
using_devtty = True
|
140
|
-
|
141
|
-
|
138
|
+
debug_log(
|
139
|
+
self.verbose,
|
140
|
+
"recorder",
|
141
|
+
"Key input: using /dev/tty (stdin not TTY)",
|
142
|
+
)
|
142
143
|
old = termios.tcgetattr(fd)
|
143
144
|
tty.setcbreak(fd)
|
144
145
|
last_space = 0.0
|
@@ -151,8 +152,9 @@ class Recorder:
|
|
151
152
|
continue
|
152
153
|
ch = ch_b.decode(errors="ignore")
|
153
154
|
if ch in ("\n", "\r"):
|
154
|
-
|
155
|
-
|
155
|
+
debug_log(
|
156
|
+
self.verbose, "recorder", "Key input: ENTER"
|
157
|
+
)
|
156
158
|
evt_q.put("ENTER")
|
157
159
|
break
|
158
160
|
if ch == " ":
|
@@ -162,8 +164,9 @@ class Recorder:
|
|
162
164
|
):
|
163
165
|
continue
|
164
166
|
last_space = now
|
165
|
-
|
166
|
-
|
167
|
+
debug_log(
|
168
|
+
self.verbose, "recorder", "Key input: SPACE"
|
169
|
+
)
|
167
170
|
evt_q.put("SPACE")
|
168
171
|
finally:
|
169
172
|
termios.tcsetattr(fd, termios.TCSADRAIN, old)
|
@@ -185,14 +188,16 @@ class Recorder:
|
|
185
188
|
continue
|
186
189
|
# If user hits Enter on empty line, treat as ENTER
|
187
190
|
if line == "\n" or line == "\r\n":
|
188
|
-
|
189
|
-
|
191
|
+
debug_log(
|
192
|
+
self.verbose, "recorder", "Key input: ENTER (line mode)"
|
193
|
+
)
|
190
194
|
evt_q.put("ENTER")
|
191
195
|
break
|
192
196
|
# If first non-empty char is space, treat as SPACE
|
193
197
|
if line and line[0] == " ":
|
194
|
-
|
195
|
-
|
198
|
+
debug_log(
|
199
|
+
self.verbose, "recorder", "Key input: SPACE (line mode)"
|
200
|
+
)
|
196
201
|
evt_q.put("SPACE")
|
197
202
|
except Exception as e:
|
198
203
|
print(f"Warning: key reader failed: {e}", file=sys.stderr)
|
@@ -234,14 +239,19 @@ class Recorder:
|
|
234
239
|
chunk_frames.append(frames_written)
|
235
240
|
chunk_offsets.append(offset_seconds_total)
|
236
241
|
offset_seconds_total += dur
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
+
debug_log(
|
243
|
+
self.verbose,
|
244
|
+
"recorder",
|
245
|
+
f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
|
246
|
+
)
|
242
247
|
# Include split cause so downstream can format output accordingly
|
243
248
|
# cause: "space" (manual split) or "pause" (auto-split)
|
244
249
|
tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1], cause))
|
250
|
+
debug_log(
|
251
|
+
self.verbose,
|
252
|
+
"recorder",
|
253
|
+
f"Enqueued chunk {chunk_index} for transcription (cause={cause})",
|
254
|
+
)
|
245
255
|
else:
|
246
256
|
try:
|
247
257
|
cur_path.unlink(missing_ok=True)
|
@@ -255,8 +265,14 @@ class Recorder:
|
|
255
265
|
and self.resume_event is not None
|
256
266
|
):
|
257
267
|
self._paused = True
|
268
|
+
debug_log(
|
269
|
+
self.verbose,
|
270
|
+
"recorder",
|
271
|
+
"Paused after first chunk; waiting for resume (prompt mode)",
|
272
|
+
)
|
258
273
|
self.resume_event.wait()
|
259
274
|
self._paused = False
|
275
|
+
debug_log(self.verbose, "recorder", "Resumed after prompt")
|
260
276
|
cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
|
261
277
|
fh = sf.SoundFile(
|
262
278
|
str(cur_path),
|
@@ -291,11 +307,11 @@ class Recorder:
|
|
291
307
|
chunk_frames.append(frames_written)
|
292
308
|
chunk_offsets.append(offset_seconds_total)
|
293
309
|
offset_seconds_total += dur
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
310
|
+
debug_log(
|
311
|
+
self.verbose,
|
312
|
+
"recorder",
|
313
|
+
f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
|
314
|
+
)
|
299
315
|
# Final chunk – mark cause as "finish" so downstream can avoid extra blank spacing
|
300
316
|
tx_queue.put(
|
301
317
|
(
|
@@ -306,12 +322,18 @@ class Recorder:
|
|
306
322
|
"finish",
|
307
323
|
)
|
308
324
|
)
|
325
|
+
debug_log(
|
326
|
+
self.verbose,
|
327
|
+
"recorder",
|
328
|
+
f"Enqueued final chunk {chunk_index} for transcription",
|
329
|
+
)
|
309
330
|
else:
|
310
331
|
try:
|
311
332
|
cur_path.unlink(missing_ok=True)
|
312
333
|
except Exception:
|
313
334
|
pass
|
314
335
|
tx_queue.put((-1, Path(), 0, 0.0, ""))
|
336
|
+
debug_log(self.verbose, "recorder", "Signaled transcription finish")
|
315
337
|
return
|
316
338
|
except queue.Empty:
|
317
339
|
pass
|
@@ -356,11 +378,11 @@ class Recorder:
|
|
356
378
|
enough_length = frames_written >= int(self.samplerate * self.min_chunk_sec)
|
357
379
|
cooldown_ok = (time.perf_counter() - last_split_time) >= split_cooldown_sec
|
358
380
|
if enough_silence and enough_length and seen_non_silent and cooldown_ok:
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
381
|
+
debug_log(
|
382
|
+
self.verbose,
|
383
|
+
"recorder",
|
384
|
+
f"Auto-split (≥{self.silence_sec:.2f}s silence)",
|
385
|
+
)
|
364
386
|
last_split_time = time.perf_counter()
|
365
387
|
# Queue an auto split for the next control phase
|
366
388
|
ctrl_q.put("split_auto")
|
@@ -389,6 +411,12 @@ class Recorder:
|
|
389
411
|
print("—" * 60)
|
390
412
|
print("")
|
391
413
|
|
414
|
+
debug_log(
|
415
|
+
self.verbose,
|
416
|
+
"recorder",
|
417
|
+
f"Recording started (rate={self.samplerate}, channels={self.channels}, ext={self.ext})",
|
418
|
+
)
|
419
|
+
|
392
420
|
import sounddevice as sd
|
393
421
|
|
394
422
|
with sd.InputStream(samplerate=self.samplerate, channels=self.channels, callback=cb):
|
@@ -403,4 +431,5 @@ class Recorder:
|
|
403
431
|
ctrl_q.put("finish")
|
404
432
|
break
|
405
433
|
writer_t.join()
|
434
|
+
debug_log(self.verbose, "recorder", "Recording finished")
|
406
435
|
return chunk_paths, chunk_frames, chunk_offsets
|
s2t/translator/argos_backend.py
CHANGED
@@ -8,6 +8,7 @@ from collections.abc import Iterable
|
|
8
8
|
from pathlib import Path
|
9
9
|
|
10
10
|
from ..types import SegmentDict, TranscriptionResult
|
11
|
+
from ..utils import debug_log
|
11
12
|
|
12
13
|
# Global install coordination to avoid duplicate downloads in parallel
|
13
14
|
_install_lock = threading.Lock()
|
@@ -25,8 +26,7 @@ class ArgosTranslator:
|
|
25
26
|
self.verbose = verbose
|
26
27
|
|
27
28
|
def _debug(self, msg: str) -> None:
|
28
|
-
|
29
|
-
print(msg)
|
29
|
+
debug_log(self.verbose, "argos", msg)
|
30
30
|
|
31
31
|
@staticmethod
|
32
32
|
def _guess_packages_dir() -> str:
|
s2t/utils.py
CHANGED
@@ -5,6 +5,7 @@ import platform
|
|
5
5
|
import shutil
|
6
6
|
import subprocess
|
7
7
|
import sys
|
8
|
+
import time
|
8
9
|
from datetime import datetime
|
9
10
|
from pathlib import Path
|
10
11
|
|
@@ -36,6 +37,25 @@ def convert_wav_to_mp3(wav_path: Path, mp3_path: Path) -> None:
|
|
36
37
|
subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
|
37
38
|
|
38
39
|
|
40
|
+
# Baseline at program start for relative timestamps in verbose logs
|
41
|
+
_START_TIME = time.perf_counter()
|
42
|
+
|
43
|
+
|
44
|
+
def debug_log(verbose: bool, component: str, msg: str) -> None:
|
45
|
+
"""Emit a timestamped debug line to stderr if verbose is enabled.
|
46
|
+
|
47
|
+
Args:
|
48
|
+
verbose: Whether verbose mode is active.
|
49
|
+
component: Short component tag (e.g., 'recorder', 'whisper', 'cli', 'argos').
|
50
|
+
msg: Message to print.
|
51
|
+
"""
|
52
|
+
if not verbose:
|
53
|
+
return
|
54
|
+
elapsed = time.perf_counter() - _START_TIME
|
55
|
+
# Elapsed with milliseconds precision
|
56
|
+
print(f"[+{elapsed:.3f}s] [{component}] {msg}", file=sys.stderr, flush=True)
|
57
|
+
|
58
|
+
|
39
59
|
def copy_to_clipboard(text: str) -> None:
|
40
60
|
system = platform.system()
|
41
61
|
try:
|
s2t/whisper_engine.py
CHANGED
@@ -5,7 +5,20 @@ from concurrent.futures import Future, ThreadPoolExecutor
|
|
5
5
|
from pathlib import Path
|
6
6
|
from typing import Any
|
7
7
|
|
8
|
+
import numpy as np
|
9
|
+
|
8
10
|
from .types import SegmentDict, TranscriptionResult
|
11
|
+
from .utils import debug_log
|
12
|
+
|
13
|
+
# --- Tuning parameters (easy to adjust later) ---
|
14
|
+
# Silence trim parameters operate on 16 kHz mono arrays
|
15
|
+
TRIM_RMS_THRESHOLD: float = 0.012 # RMS threshold for speech vs. silence
|
16
|
+
TRIM_MIN_VOICED_SEC: float = 0.5 # Require at least this much voiced audio to transcribe
|
17
|
+
TRIM_PAD_MS: int = 50 # Keep a short pad around detected speech (ms)
|
18
|
+
|
19
|
+
# Whisper inference behavior on low/empty audio
|
20
|
+
WHISPER_NO_SPEECH_THRESHOLD: float = 0.7
|
21
|
+
WHISPER_CONDITION_ON_PREV: bool = False
|
9
22
|
|
10
23
|
|
11
24
|
class WhisperEngine:
|
@@ -48,6 +61,7 @@ class WhisperEngine:
|
|
48
61
|
return m, (t1 - t0)
|
49
62
|
|
50
63
|
fut = self._executor.submit(_load, self.model_name)
|
64
|
+
debug_log(self.verbose, "whisper", f"Submitted model preload: {self.model_name}")
|
51
65
|
return self._executor, fut
|
52
66
|
except Exception:
|
53
67
|
return None, None
|
@@ -62,6 +76,9 @@ class WhisperEngine:
|
|
62
76
|
self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + float(
|
63
77
|
load_dur
|
64
78
|
)
|
79
|
+
debug_log(
|
80
|
+
self.verbose, "whisper", f"Model resolved via preload in {float(load_dur):.3f}s"
|
81
|
+
)
|
65
82
|
except Exception:
|
66
83
|
model = None
|
67
84
|
if model is None:
|
@@ -69,6 +86,7 @@ class WhisperEngine:
|
|
69
86
|
model = whisper.load_model(self.model_name)
|
70
87
|
t1m = time.perf_counter()
|
71
88
|
self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + (t1m - t0m)
|
89
|
+
debug_log(self.verbose, "whisper", f"Loaded model synchronously in {(t1m - t0m):.3f}s")
|
72
90
|
return model
|
73
91
|
|
74
92
|
def transcribe_chunk(
|
@@ -79,10 +97,8 @@ class WhisperEngine:
|
|
79
97
|
initial_prompt: str | None = None,
|
80
98
|
) -> TranscriptionResult:
|
81
99
|
# Load audio without ffmpeg by reading via soundfile and passing a numpy array
|
82
|
-
# to Whisper.
|
100
|
+
# to Whisper. Convert to mono float32 and resample to 16 kHz as expected by Whisper's API.
|
83
101
|
task = "translate" if self.translate else "transcribe"
|
84
|
-
import numpy as np
|
85
|
-
|
86
102
|
try:
|
87
103
|
import soundfile as sf
|
88
104
|
except Exception as e:
|
@@ -100,16 +116,71 @@ class WhisperEngine:
|
|
100
116
|
# Resample to 16k expected by Whisper when passing arrays
|
101
117
|
mono_16k: np.ndarray = resample_linear(mono, int(sr), 16000)
|
102
118
|
|
119
|
+
# Trim leading/trailing silence to avoid hallucinations on near-empty chunks
|
120
|
+
def _moving_rms(x: np.ndarray, win_len: int) -> np.ndarray:
|
121
|
+
if x.size == 0:
|
122
|
+
return np.zeros(0, dtype=np.float32)
|
123
|
+
win = np.ones(win_len, dtype=np.float32) / float(win_len)
|
124
|
+
sq = np.square(x.astype(np.float32, copy=False))
|
125
|
+
# same-length RMS via 'same' convolution
|
126
|
+
ma = np.convolve(sq, win, mode="same")
|
127
|
+
return np.sqrt(ma).astype(np.float32, copy=False)
|
128
|
+
|
129
|
+
def _trim_silence(x: np.ndarray, sr16k: int) -> tuple[np.ndarray, float, float]:
|
130
|
+
# Returns (trimmed, leading_sec, trailing_sec)
|
131
|
+
if x.size == 0:
|
132
|
+
return x, 0.0, 0.0
|
133
|
+
win_len = max(1, int(round(sr16k * 0.03))) # 30 ms window
|
134
|
+
rms = _moving_rms(x, win_len)
|
135
|
+
thr = float(TRIM_RMS_THRESHOLD)
|
136
|
+
voiced = np.where(rms >= thr)[0]
|
137
|
+
if voiced.size == 0:
|
138
|
+
return np.zeros(0, dtype=np.float32), 0.0, float(x.size) / sr16k
|
139
|
+
start_idx = int(voiced[0])
|
140
|
+
end_idx = int(voiced[-1])
|
141
|
+
pad = int(round((TRIM_PAD_MS / 1000.0) * sr16k))
|
142
|
+
a = max(0, start_idx - pad)
|
143
|
+
b = min(x.size, end_idx + pad + 1)
|
144
|
+
lead_sec = float(a) / sr16k
|
145
|
+
trail_sec = float(x.size - b) / sr16k
|
146
|
+
return x[a:b], lead_sec, trail_sec
|
147
|
+
|
148
|
+
pre_sec = float(mono_16k.size) / 16000.0
|
149
|
+
trimmed, lead_sec, trail_sec = _trim_silence(mono_16k, 16000)
|
150
|
+
post_sec = float(trimmed.size) / 16000.0
|
151
|
+
debug_log(
|
152
|
+
self.verbose,
|
153
|
+
"whisper",
|
154
|
+
f"Chunk {audio_path.name}: trim {pre_sec:.2f}s -> {post_sec:.2f}s (lead {lead_sec:.2f}s, tail {trail_sec:.2f}s)",
|
155
|
+
)
|
156
|
+
|
157
|
+
# If too short after trimming, skip transcription
|
158
|
+
if post_sec < float(TRIM_MIN_VOICED_SEC):
|
159
|
+
debug_log(
|
160
|
+
self.verbose,
|
161
|
+
"whisper",
|
162
|
+
f"Chunk {audio_path.name}: too short after trim ({post_sec:.2f}s) – skipping",
|
163
|
+
)
|
164
|
+
return {"text": "", "segments": []}
|
165
|
+
|
103
166
|
t0 = time.perf_counter()
|
167
|
+
debug_log(
|
168
|
+
self.verbose, "whisper", f"Transcribing chunk {audio_path.name} (frames={frames})"
|
169
|
+
)
|
104
170
|
res: dict[str, Any] = model.transcribe(
|
105
|
-
|
171
|
+
trimmed,
|
106
172
|
task=task,
|
107
173
|
language=self.language,
|
108
174
|
fp16=False,
|
109
|
-
initial_prompt=initial_prompt,
|
175
|
+
initial_prompt=(initial_prompt if post_sec >= float(TRIM_MIN_VOICED_SEC) else None),
|
176
|
+
condition_on_previous_text=bool(WHISPER_CONDITION_ON_PREV),
|
177
|
+
no_speech_threshold=float(WHISPER_NO_SPEECH_THRESHOLD),
|
110
178
|
)
|
111
179
|
t1 = time.perf_counter()
|
112
180
|
self.profile["transcribe_sec"] = self.profile.get("transcribe_sec", 0.0) + (t1 - t0)
|
181
|
+
debug_log(
|
182
|
+
self.verbose, "whisper", f"Transcribed chunk {audio_path.name} in {(t1 - t0):.3f}s"
|
183
|
+
)
|
113
184
|
text_c = str(res.get("text", "") or "").strip()
|
114
185
|
lang_code = str(res.get("language", "") or "")
|
115
186
|
if self.native_segmentation:
|
@@ -117,8 +188,9 @@ class WhisperEngine:
|
|
117
188
|
segs_typed: list[SegmentDict] = []
|
118
189
|
for s in segs_raw:
|
119
190
|
try:
|
120
|
-
|
121
|
-
|
191
|
+
# Adjust for leading trim so times align with original chunk timeline
|
192
|
+
start = float(s.get("start", 0.0)) + float(lead_sec)
|
193
|
+
end = float(s.get("end", 0.0)) + float(lead_sec)
|
122
194
|
text = str(s.get("text", "") or "")
|
123
195
|
segs_typed.append({"start": start, "end": end, "text": text})
|
124
196
|
except Exception:
|
@@ -129,8 +201,12 @@ class WhisperEngine:
|
|
129
201
|
return out
|
130
202
|
# Collapsed single segment per chunk
|
131
203
|
segs_raw = res.get("segments", []) or []
|
132
|
-
start = float(segs_raw[0].get("start", 0.0)) if segs_raw else 0.0
|
133
|
-
end =
|
204
|
+
start = (float(segs_raw[0].get("start", 0.0)) + float(lead_sec)) if segs_raw else 0.0
|
205
|
+
end = (
|
206
|
+
(float(segs_raw[-1].get("end", 0.0)) + float(lead_sec))
|
207
|
+
if segs_raw
|
208
|
+
else (frames / float(self.samplerate))
|
209
|
+
)
|
134
210
|
out2: TranscriptionResult = {
|
135
211
|
"text": text_c,
|
136
212
|
"segments": ([{"start": start, "end": end, "text": text_c}] if text_c else []),
|
@@ -143,12 +219,17 @@ class WhisperEngine:
|
|
143
219
|
try:
|
144
220
|
from whisper.utils import get_writer
|
145
221
|
|
222
|
+
debug_log(self.verbose, "whisper", f"Writing outputs for {audio_path.name}")
|
146
223
|
for fmt in ("txt", "srt", "vtt", "tsv", "json"):
|
147
224
|
writer = get_writer(fmt, str(self.session_dir))
|
148
225
|
writer(result, str(audio_path))
|
226
|
+
debug_log(self.verbose, "whisper", f"Wrote outputs for {audio_path.name}")
|
149
227
|
except Exception as e:
|
150
|
-
|
151
|
-
|
228
|
+
debug_log(
|
229
|
+
self.verbose,
|
230
|
+
"whisper",
|
231
|
+
f"Warning: failed to write chunk outputs for {audio_path.name}: {e}",
|
232
|
+
)
|
152
233
|
|
153
234
|
def merge_results(
|
154
235
|
self, results: list[TranscriptionResult], offsets: list[float], cumulative_text: str
|
@@ -0,0 +1,16 @@
|
|
1
|
+
s2t/__init__.py,sha256=wV4E9i-7KrUn1dOtLUQB3ZGEKx9gRWH3hPHlpw-ZdWc,332
|
2
|
+
s2t/cli.py,sha256=rcrJ1KWwzrpob0dBWWOJCYH2KBfCUpKD0Is_3f-LzqU,24452
|
3
|
+
s2t/config.py,sha256=uw4CZSSXmUvnlOrqBGR1Rcq-WdXucHj3KICRcCb_pkU,485
|
4
|
+
s2t/outputs.py,sha256=Lo8VcARZ7QPuuQQNu8myD5J4c4NO1Rs0L1DLnzLe9tM,1546
|
5
|
+
s2t/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
6
|
+
s2t/recorder.py,sha256=VYFqZ6LlP2zbwjWclZqM8ve5HEnZ3oyM9hLV1V3IkPI,20495
|
7
|
+
s2t/types.py,sha256=jBiRN-tr0qVw-lhaXvnsyKrVGDyLkqEbxs9qkQ6qGqI,339
|
8
|
+
s2t/utils.py,sha256=p7klapPW3Multxk261NlPtEpnEi3kpiTSHBPBTv4XC0,4059
|
9
|
+
s2t/whisper_engine.py,sha256=T4HYmr1czwj78LsUdgRGWEBCfaghvHVqplQDaQDaR4o,10373
|
10
|
+
s2t/translator/__init__.py,sha256=K-MKves7kZ4-62POfrmWeOcBaTjsTzeFSu8QNHqYuus,239
|
11
|
+
s2t/translator/argos_backend.py,sha256=hXzQ8ZgJJOcUhcTJdTdVw1lSzptl8FXtfYr5PyOxKkg,19096
|
12
|
+
s2t-0.1.12.dist-info/METADATA,sha256=SQ3kqXvmdVmx5SQ1Owmo_moYV4dCs9J8PKBDureJ5Sw,5475
|
13
|
+
s2t-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
+
s2t-0.1.12.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
|
15
|
+
s2t-0.1.12.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
|
16
|
+
s2t-0.1.12.dist-info/RECORD,,
|
s2t-0.1.10.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
s2t/__init__.py,sha256=wV4E9i-7KrUn1dOtLUQB3ZGEKx9gRWH3hPHlpw-ZdWc,332
|
2
|
-
s2t/cli.py,sha256=1t3fchtywFaeuPONp-B3FmSzBnSxJRvP6jx9AS-b1Ok,21351
|
3
|
-
s2t/config.py,sha256=uw4CZSSXmUvnlOrqBGR1Rcq-WdXucHj3KICRcCb_pkU,485
|
4
|
-
s2t/outputs.py,sha256=Lo8VcARZ7QPuuQQNu8myD5J4c4NO1Rs0L1DLnzLe9tM,1546
|
5
|
-
s2t/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
6
|
-
s2t/recorder.py,sha256=Z3Hn8l1xLY7XzLR6zqMYulTBfTRNWD-zqwk_V5x18Sc,19228
|
7
|
-
s2t/types.py,sha256=jBiRN-tr0qVw-lhaXvnsyKrVGDyLkqEbxs9qkQ6qGqI,339
|
8
|
-
s2t/utils.py,sha256=YU6YhiuONmqhrKte4DY5tiC5PP-yFExJMMBzFUiA8qA,3416
|
9
|
-
s2t/whisper_engine.py,sha256=x-V7ST9e3JnwMWdbMh4C7dHjA420jaOtXH2-igeh7vc,6492
|
10
|
-
s2t/translator/__init__.py,sha256=K-MKves7kZ4-62POfrmWeOcBaTjsTzeFSu8QNHqYuus,239
|
11
|
-
s2t/translator/argos_backend.py,sha256=VW_OYFFBuNZgcWM-fbvR6XGokuxS2fptkCMFIO9MD1I,19068
|
12
|
-
s2t-0.1.10.dist-info/METADATA,sha256=ViJoiYC5WG_aLaRTLXedQ-o8TwyBQ_sCN126u9o96lY,5475
|
13
|
-
s2t-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
-
s2t-0.1.10.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
|
15
|
-
s2t-0.1.10.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
|
16
|
-
s2t-0.1.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|