s2t 0.1.9__tar.gz → 0.1.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {s2t-0.1.9/src/s2t.egg-info → s2t-0.1.11}/PKG-INFO +1 -1
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/cli.py +89 -21
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/recorder.py +91 -48
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/translator/argos_backend.py +2 -2
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/utils.py +20 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/whisper_engine.py +92 -11
- {s2t-0.1.9 → s2t-0.1.11/src/s2t.egg-info}/PKG-INFO +1 -1
- {s2t-0.1.9 → s2t-0.1.11}/.gitignore +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/.pre-commit-config.yaml +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/AGENTS.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/CONTRIBUTING.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/MANIFEST.in +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/Makefile +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/README.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/docs/RELEASING.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/docs/SESSION_STATE.md +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/pyproject.toml +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/scripts/bench_transcribe.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/scripts/tag.sh +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/setup.cfg +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/__init__.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/config.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/outputs.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/py.typed +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/translator/__init__.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t/types.py +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/SOURCES.txt +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/dependency_links.txt +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/entry_points.txt +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/requires.txt +0 -0
- {s2t-0.1.9 → s2t-0.1.11}/src/s2t.egg-info/top_level.txt +0 -0
Hunks below are shown in unified form; removed lines whose content the registry viewer truncated appear as bare `-` markers.

src/s2t/cli.py:

```diff
@@ -49,6 +49,7 @@ from .types import TranscriptionResult
 from .utils import (
     convert_wav_to_mp3,
     copy_to_clipboard,
+    debug_log,
     make_session_dir,
     open_in_shell_editor,
 )
@@ -57,6 +58,7 @@ from .whisper_engine import WhisperEngine
 
 def run_session(opts: SessionOptions) -> int:
     session_dir = make_session_dir(opts.outdir)
+    debug_log(opts.verbose, "cli", f"Session started; directory: {session_dir}")
     profile_data: dict = {}
     requested = opts.recording_format.lower()
     effective = requested
@@ -64,6 +66,12 @@ def run_session(opts: SessionOptions) -> int:
         logging.warning("ffmpeg not found; falling back to FLAC recording instead of MP3.")
         effective = "flac"
     ext = ".flac" if effective == "flac" else ".wav"
+    if requested != effective:
+        debug_log(
+            opts.verbose,
+            "cli",
+            f"Recording format adjusted: requested={requested}, effective={effective}",
+        )
 
     engine = WhisperEngine(
         model_name=opts.model,
@@ -77,6 +85,8 @@ def run_session(opts: SessionOptions) -> int:
         profile=profile_data if opts.profile else {},
     )
     ex, fut = engine.preload()
+    if ex is not None:
+        debug_log(opts.verbose, "cli", f"Model preload submitted for '{opts.model}'")
 
     # Determine translation target languages from options
     target_langs: list[str] = []
@@ -98,8 +108,14 @@ def run_session(opts: SessionOptions) -> int:
         detected_lang_event=detected_lang_event,
         detected_lang_holder=detected_lang,
     )
+    debug_log(
+        opts.verbose,
+        "cli",
+        f"Translation targets requested: {', '.join(target_langs)}",
+    )
 
-
+    # Include split cause per chunk: "space" (manual), "pause" (auto), "finish" (final)
+    tx_q: queue.Queue[tuple[int, Path, int, float, str]] = queue.Queue()
     cumulative_text = ""
     next_to_emit = 1
     pending: dict[int, str] = {}
```
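The widened `tx_q` type above is the backbone of this release: every queued chunk now carries the cause of its split. A minimal sketch of the new contract, assuming only the tuple shape and sentinel shown in the diff (the chunk path and frame count below are hypothetical):

```python
import queue
from pathlib import Path

# Payload per the diff: (index, chunk_path, frames, offset_seconds, cause).
# cause is "space" (manual split), "pause" (auto split) or "finish" (final chunk);
# the shutdown sentinel uses index -1 and an empty cause.
tx_q: queue.Queue[tuple[int, Path, int, float, str]] = queue.Queue()

tx_q.put((1, Path("chunk_0001.wav"), 48000, 0.0, "space"))  # hypothetical chunk
tx_q.put((-1, Path(), 0, 0.0, ""))                          # sentinel, as in the diff

while True:
    idx, path, frames, offset, cause = tx_q.get()
    if idx == -1:
        break  # worker shuts down on the sentinel
    print(f"chunk {idx}: {path.name} (offset={offset:.3f}s, cause={cause})")
```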
src/s2t/cli.py (continued):

```diff
@@ -148,17 +164,29 @@ def run_session(opts: SessionOptions) -> int:
 
     def tx_worker():
         model = engine.resolve_model(fut)
+        debug_log(opts.verbose, "cli", "Transcription worker started")
         nonlocal cumulative_text, next_to_emit
         finished_texts: dict[int, str] = {}
+        causes: dict[int, str] = {}
         while True:
-            idx, path, frames, offset = tx_q.get()
+            idx, path, frames, offset, cause = tx_q.get()
             if idx == -1:
                 break
+            debug_log(
+                opts.verbose,
+                "cli",
+                f"Dequeued chunk {idx}: {path.name if path else '(final)'} (frames={frames}, offset={offset:.3f}, cause={cause or '-'})",
+            )
             # If in spoken-prompt mode, ensure we don't process payload chunks before prompt is done
             if opts.prompt and idx > 1 and not prompt_done.is_set():
+                debug_log(opts.verbose, "cli", f"Waiting for prompt before processing chunk {idx}")
                 prompt_done.wait()
             # Build latest-ready prompt based on already finished chunks
             prompt = _build_latest_ready_prompt(idx, finished_texts)
+            if prompt:
+                debug_log(
+                    opts.verbose, "cli", f"Built initial prompt for chunk {idx} (len={len(prompt)})"
+                )
             res = engine.transcribe_chunk(model, path, frames, initial_prompt=prompt)
             # Record detected language once (for translator preload if needed)
             if target_langs and detected_lang["code"] is None:
@@ -166,6 +194,7 @@ def run_session(opts: SessionOptions) -> int:
                 if lang_code:
                     detected_lang["code"] = lang_code
                     detected_lang_event.set()
+                    debug_log(opts.verbose, "cli", f"Detected source language: {lang_code}")
             engine.write_chunk_outputs(res, path)
             text_i = (res.get("text", "") or "").strip()
             with agg_lock:
@@ -174,20 +203,58 @@ def run_session(opts: SessionOptions) -> int:
                 results.append(res)
                 offsets.append(offset)
                 pending[idx] = text_i
+                # Track cause for formatting when emitting in-order
+                # cause is one of: "space", "pause", "finish" (or empty for sentinel)
+                # Default to "pause" if unknown to avoid extra blank lines.
+                causes[idx] = cause or "pause"
                 while next_to_emit in pending:
                     out = pending.pop(next_to_emit)
+                    cause_i = causes.get(next_to_emit) or "pause"
                     if out:
+                        # Live stdout behavior
                         print(out)
-
-
-
-
-
-
+                        if cause_i == "space":
+                            print("")  # blank line after SPACE
+                        # Build cumulative text with post-separator semantics
+                        if not cumulative_text:
+                            cumulative_text = out
+                        else:
+                            cumulative_text += out
+                        # Append separator AFTER the chunk, matching stdout
+                        if cause_i == "space":
+                            if not cumulative_text.endswith("\n\n"):
+                                # ensure exactly one paragraph break
+                                if cumulative_text.endswith("\n"):
+                                    cumulative_text += "\n"
+                                else:
+                                    cumulative_text += "\n\n"
+                        else:
+                            # single line break after non-space chunks
+                            if not (
+                                cumulative_text.endswith("\n") or cumulative_text.endswith("\n\n")
+                            ):
+                                cumulative_text += "\n"
+                    else:
+                        # Even if chunk text is empty, respect SPACE as a paragraph break
+                        if cause_i == "space":
+                            print("")  # blank line on stdout
+                            if cumulative_text:
+                                if cumulative_text.endswith("\n\n"):
+                                    pass
+                                elif cumulative_text.endswith("\n"):
+                                    cumulative_text += "\n"
+                                else:
+                                    cumulative_text += "\n\n"
+                        # For empty non-space chunks, do not alter cumulative_text
+                    try:
+                        copy_to_clipboard(cumulative_text)
+                    except Exception:
+                        pass
                     next_to_emit += 1
                 # If this was the prompt chunk, signal readiness and instruct user
                 if opts.prompt and idx == 1 and not prompt_done.is_set():
                     prompt_done.set()
+                    debug_log(opts.verbose, "cli", "Prompt transcribed; resuming payload")
                     print("=" * 60)
                     print("Prompt transcribed. Start speaking your main content now.")
                     print("=" * 60)
```
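Note the separator semantics in the emission loop above: each chunk's text is appended first, and the separator implied by its split cause follows it, so SPACE yields a paragraph break while pause/finish yield a single newline. A condensed sketch of that rule for non-empty chunks (the empty-chunk and clipboard branches are omitted):

```python
def append_chunk(cumulative: str, text: str, cause: str) -> str:
    """Post-separator rule from the hunk above, for non-empty text only:
    append the chunk, then the separator chosen by its split cause."""
    cumulative += text
    if cause == "space":
        if not cumulative.endswith("\n\n"):
            # exactly one paragraph break after a manual (SPACE) split
            cumulative += "\n" if cumulative.endswith("\n") else "\n\n"
    elif not cumulative.endswith("\n"):
        cumulative += "\n"  # single line break after "pause"/"finish" chunks
    return cumulative

assert append_chunk("", "hello", "pause") == "hello\n"
assert append_chunk("hello\n", "world", "space") == "hello\nworld\n\n"
```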
src/s2t/cli.py (continued):

```diff
@@ -195,6 +262,7 @@ def run_session(opts: SessionOptions) -> int:
         if prompt_resume_event is not None:
             prompt_resume_event.set()
         tx_done.set()
+        debug_log(opts.verbose, "cli", "Transcription worker finished")
 
     tx_t = threading.Thread(target=tx_worker, daemon=True)
     tx_t.start()
@@ -202,6 +270,7 @@ def run_session(opts: SessionOptions) -> int:
     if opts.prompt:
         print("Prompt mode enabled: Speak your prompt first, then press SPACE.")
         print("Recording will wait for the prompt transcription before starting payload.")
+        debug_log(opts.verbose, "cli", "Prompt mode enabled")
     # Prepare resume event to pause recording between prompt and payload
     prompt_resume_event = threading.Event() if opts.prompt else None
     rec = Recorder(
@@ -221,6 +290,9 @@ def run_session(opts: SessionOptions) -> int:
     t1 = time.perf_counter()
     if opts.profile:
         profile_data["recording_sec"] = t1 - t0
+    debug_log(
+        opts.verbose, "cli", f"Recording finished in {(t1 - t0):.3f}s (chunks={len(chunk_paths)})"
+    )
     tx_t.join()
 
     merged: TranscriptionResult = engine.merge_results(results, chunk_offsets, cumulative_text)
@@ -230,8 +302,7 @@ def run_session(opts: SessionOptions) -> int:
     try:
         if chunk_paths:
             concat_audio(chunk_paths, base_audio_path, opts.rate, opts.channels)
-
-            print(f"Merged audio written: {base_audio_path.name}", file=sys.stderr)
+            debug_log(opts.verbose, "cli", f"Merged audio written: {base_audio_path.name}")
         if requested == "mp3" and shutil.which("ffmpeg") is not None:
             mp3_out = session_dir / "recording.mp3"
             convert_wav_to_mp3(
@@ -242,11 +313,9 @@ def run_session(opts: SessionOptions) -> int:
                 ),
                 mp3_out,
             )
-
-            print(f"Converted merged audio to MP3: {mp3_out.name}", file=sys.stderr)
+            debug_log(opts.verbose, "cli", f"Converted merged audio to MP3: {mp3_out.name}")
     except Exception as e:
-
-        print(f"Warning: failed to merge chunk audio: {e}", file=sys.stderr)
+        debug_log(opts.verbose, "cli", f"Warning: failed to merge chunk audio: {e}")
 
     # Optionally delete chunk files (audio + per-chunk outputs)
     if chunk_paths and not opts.keep_chunks:
@@ -299,11 +368,11 @@ def run_session(opts: SessionOptions) -> int:
     # Decide source language: CLI hint takes precedence; else detected; else skip with warning
     src_lang = (opts.lang.lower() if opts.lang else (detected_lang["code"] or "")).strip()
     if not src_lang:
-
-
-
-
-
+        debug_log(
+            opts.verbose,
+            "cli",
+            "Warning: Could not determine source language for translation; skipping post-translation.",
+        )
     else:
         # Skip identical language targets
         effective_targets = [t for t in target_langs if t.lower() != src_lang.lower()]
@@ -335,8 +404,7 @@ def run_session(opts: SessionOptions) -> int:
                         f"{base_audio_path.stem}.{tgt}{base_audio_path.suffix}"
                     )
                     write_final_outputs(translated, session_dir, suffixed)
-
-                    print(f"Created translated outputs for '{tgt}'.", file=sys.stderr)
+                    debug_log(opts.verbose, "cli", f"Created translated outputs for '{tgt}'.")
                 except Exception as e:
                     print(
                         f"Warning: failed to translate to '{tgt}': {e}",
```
src/s2t/recorder.py:

```diff
@@ -11,6 +11,8 @@ from typing import Any, Protocol, cast, runtime_checkable
 
 import numpy as np
 
+from .utils import debug_log
+
 
 class Recorder:
     def __init__(
@@ -41,7 +43,7 @@ class Recorder:
 
     def run(
         self,
-        tx_queue: queue.Queue[tuple[int, Path, int, float]],
+        tx_queue: queue.Queue[tuple[int, Path, int, float, str]],
     ) -> tuple[list[Path], list[int], list[float]]:
         import platform
         import termios
@@ -71,14 +73,12 @@ class Recorder:
             ms = cast(_MSVCRT, msvcrt)
 
             last_space = 0.0
-
-            print("[key] using msvcrt (Windows)", file=sys.stderr)
+            debug_log(self.verbose, "recorder", "Key input: using msvcrt (Windows)")
             while not stop_evt.is_set():
                 if ms.kbhit():
                     ch = ms.getwch()
                     if ch in ("\r", "\n"):
-
-                        print("[key] ENTER", file=sys.stderr)
+                        debug_log(self.verbose, "recorder", "Key input: ENTER")
                         evt_q.put("ENTER")
                         break
                     if ch == " ":
@@ -88,8 +88,7 @@ class Recorder:
                     ):
                         continue
                     last_space = now
-
-                    print("[key] SPACE", file=sys.stderr)
+                    debug_log(self.verbose, "recorder", "Key input: SPACE")
                     evt_q.put("SPACE")
                 time.sleep(0.01)
         else:
@@ -97,8 +96,9 @@ class Recorder:
             try:
                 if sys.stdin.isatty():
                     fd = sys.stdin.fileno()
-
-
+                    debug_log(
+                        self.verbose, "recorder", "Key input: using sys.stdin (TTY fd read)"
+                    )
                     old = termios.tcgetattr(fd)
                     tty.setcbreak(fd)
                     last_space = 0.0
@@ -114,8 +114,7 @@ class Recorder:
                             continue
                         ch = ch_b.decode(errors="ignore")
                         if ch in ("\n", "\r"):
-
-                            print("[key] ENTER", file=sys.stderr)
+                            debug_log(self.verbose, "recorder", "Key input: ENTER")
                             evt_q.put("ENTER")
                             break
                         if ch == " ":
@@ -125,8 +124,7 @@ class Recorder:
                         ):
                             continue
                         last_space = now
-
-                        print("[key] SPACE", file=sys.stderr)
+                        debug_log(self.verbose, "recorder", "Key input: SPACE")
                         evt_q.put("SPACE")
                     finally:
                         termios.tcsetattr(fd, termios.TCSADRAIN, old)
@@ -137,8 +135,11 @@ class Recorder:
             try:
                 fd = os.open("/dev/tty", os.O_RDONLY)
                 using_devtty = True
-
-
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    "Key input: using /dev/tty (stdin not TTY)",
+                )
                 old = termios.tcgetattr(fd)
                 tty.setcbreak(fd)
                 last_space = 0.0
@@ -151,8 +152,9 @@ class Recorder:
                         continue
                    ch = ch_b.decode(errors="ignore")
                    if ch in ("\n", "\r"):
-
-
+                        debug_log(
+                            self.verbose, "recorder", "Key input: ENTER"
+                        )
                        evt_q.put("ENTER")
                        break
                    if ch == " ":
@@ -162,8 +164,9 @@ class Recorder:
                    ):
                        continue
                    last_space = now
-
-
+                    debug_log(
+                        self.verbose, "recorder", "Key input: SPACE"
+                    )
                    evt_q.put("SPACE")
            finally:
                termios.tcsetattr(fd, termios.TCSADRAIN, old)
@@ -185,14 +188,16 @@ class Recorder:
                    continue
                # If user hits Enter on empty line, treat as ENTER
                if line == "\n" or line == "\r\n":
-
-
+                    debug_log(
+                        self.verbose, "recorder", "Key input: ENTER (line mode)"
+                    )
                    evt_q.put("ENTER")
                    break
                # If first non-empty char is space, treat as SPACE
                if line and line[0] == " ":
-
-
+                    debug_log(
+                        self.verbose, "recorder", "Key input: SPACE (line mode)"
+                    )
                    evt_q.put("SPACE")
        except Exception as e:
            print(f"Warning: key reader failed: {e}", file=sys.stderr)
```
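All four key readers (msvcrt, stdin TTY, /dev/tty, line mode) share the same debounce pattern around `last_space`: a SPACE arriving too soon after the previous one is swallowed with `continue`. The threshold itself sits outside the hunks shown here, so the value below is an assumption:

```python
import time

SPACE_DEBOUNCE_SEC = 0.25  # assumed value; the real threshold is not visible in this diff

last_space = 0.0

def accept_space() -> bool:
    """Sketch of the guard the readers apply before enqueueing "SPACE"."""
    global last_space
    now = time.monotonic()
    if (now - last_space) < SPACE_DEBOUNCE_SEC:
        return False  # mirrors the `continue` in the reader loops
    last_space = now
    return True  # caller would then do evt_q.put("SPACE")
```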
src/s2t/recorder.py (continued):

```diff
@@ -224,7 +229,7 @@ class Recorder:
         threshold_rms = 0.015  # conservative RMS threshold for float32 [-1,1]
         split_cooldown_sec = 0.2
 
-        def _do_split() -> None:
+        def _do_split(cause: str) -> None:
             nonlocal fh, frames_written, cur_path, chunk_index, offset_seconds_total
             fh.flush()
             fh.close()
@@ -234,12 +239,19 @@ class Recorder:
                 chunk_frames.append(frames_written)
                 chunk_offsets.append(offset_seconds_total)
                 offset_seconds_total += dur
-
-
-
-
-
-
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
+                )
+                # Include split cause so downstream can format output accordingly
+                # cause: "space" (manual split) or "pause" (auto-split)
+                tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1], cause))
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    f"Enqueued chunk {chunk_index} for transcription (cause={cause})",
+                )
             else:
                 try:
                     cur_path.unlink(missing_ok=True)
@@ -253,8 +265,14 @@ class Recorder:
                 and self.resume_event is not None
             ):
                 self._paused = True
+                debug_log(
+                    self.verbose,
+                    "recorder",
+                    "Paused after first chunk; waiting for resume (prompt mode)",
+                )
                 self.resume_event.wait()
                 self._paused = False
+                debug_log(self.verbose, "recorder", "Resumed after prompt")
             cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
             fh = sf.SoundFile(
                 str(cur_path),
@@ -270,8 +288,13 @@ class Recorder:
             try:
                 while True:
                     cmd = ctrl_q.get_nowait()
-                    if cmd == "
-                        _do_split()
+                    if cmd == "split_manual":
+                        _do_split("space")
+                        # Reset silence tracking on manual split
+                        silent_frames_run = 0
+                        seen_non_silent = False
+                    elif cmd == "split_auto":
+                        _do_split("pause")
                         # Reset silence tracking on manual split
                         silent_frames_run = 0
                         seen_non_silent = False
@@ -284,20 +307,33 @@ class Recorder:
                         chunk_frames.append(frames_written)
                         chunk_offsets.append(offset_seconds_total)
                         offset_seconds_total += dur
-
-
-
-
-
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Saved chunk {chunk_index}: {cur_path.name} ({dur:.2f}s)",
+                        )
+                        # Final chunk – mark cause as "finish" so downstream can avoid extra blank spacing
                         tx_queue.put(
-                            (
+                            (
+                                chunk_index,
+                                cur_path,
+                                frames_written,
+                                chunk_offsets[-1],
+                                "finish",
+                            )
+                        )
+                        debug_log(
+                            self.verbose,
+                            "recorder",
+                            f"Enqueued final chunk {chunk_index} for transcription",
                         )
                     else:
                         try:
                             cur_path.unlink(missing_ok=True)
                         except Exception:
                             pass
-                        tx_queue.put((-1, Path(), 0, 0.0))
+                    tx_queue.put((-1, Path(), 0, 0.0, ""))
+                    debug_log(self.verbose, "recorder", "Signaled transcription finish")
                     return
             except queue.Empty:
                 pass
```
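With `_do_split(cause)` in place, the control commands and split causes line up one-to-one, while `finish` takes the dedicated final-chunk path instead of `_do_split`. Summarized as a small illustrative helper (the command and cause strings are the ones used in the diff; the helper itself is not part of the package):

```python
def cause_for_command(cmd: str) -> str | None:
    """Map a control-queue command to the split cause it enqueues."""
    if cmd == "split_manual":  # SPACE key pressed
        return "space"
    if cmd == "split_auto":    # silence detector fired
        return "pause"
    return None  # "finish" is handled by the final-chunk branch

assert cause_for_command("split_manual") == "space"
assert cause_for_command("split_auto") == "pause"
```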
src/s2t/recorder.py (continued):

```diff
@@ -342,18 +378,18 @@ class Recorder:
                 enough_length = frames_written >= int(self.samplerate * self.min_chunk_sec)
                 cooldown_ok = (time.perf_counter() - last_split_time) >= split_cooldown_sec
                 if enough_silence and enough_length and seen_non_silent and cooldown_ok:
-
-
-
-
-
+                    debug_log(
+                        self.verbose,
+                        "recorder",
+                        f"Auto-split (≥{self.silence_sec:.2f}s silence)",
+                    )
                     last_split_time = time.perf_counter()
-                    # Queue
-                    ctrl_q.put("
+                    # Queue an auto split for the next control phase
+                    ctrl_q.put("split_auto")
                     # Reset silence tracking now to avoid cascaded triggers
                     silent_frames_run = 0
                     seen_non_silent = False
-            tx_queue.put((-1, Path(), 0, 0.0))
+            tx_queue.put((-1, Path(), 0, 0.0, ""))
 
         def cb(indata: Any, frames: int, time_info: Any, status: Any) -> None:
             if status:
@@ -375,6 +411,12 @@ class Recorder:
         print("—" * 60)
         print("")
 
+        debug_log(
+            self.verbose,
+            "recorder",
+            f"Recording started (rate={self.samplerate}, channels={self.channels}, ext={self.ext})",
+        )
+
         import sounddevice as sd
 
         with sd.InputStream(samplerate=self.samplerate, channels=self.channels, callback=cb):
@@ -384,9 +426,10 @@ class Recorder:
             except queue.Empty:
                 continue
             if evt == "SPACE":
-                ctrl_q.put("
+                ctrl_q.put("split_manual")
             elif evt == "ENTER":
                 ctrl_q.put("finish")
                 break
         writer_t.join()
+        debug_log(self.verbose, "recorder", "Recording finished")
         return chunk_paths, chunk_frames, chunk_offsets
```
src/s2t/translator/argos_backend.py:

```diff
@@ -8,6 +8,7 @@ from collections.abc import Iterable
 from pathlib import Path
 
 from ..types import SegmentDict, TranscriptionResult
+from ..utils import debug_log
 
 # Global install coordination to avoid duplicate downloads in parallel
 _install_lock = threading.Lock()
@@ -25,8 +26,7 @@ class ArgosTranslator:
         self.verbose = verbose
 
     def _debug(self, msg: str) -> None:
-
-        print(msg)
+        debug_log(self.verbose, "argos", msg)
 
     @staticmethod
     def _guess_packages_dir() -> str:
```
src/s2t/utils.py:

```diff
@@ -5,6 +5,7 @@ import platform
 import shutil
 import subprocess
 import sys
+import time
 from datetime import datetime
 from pathlib import Path
 
@@ -36,6 +37,25 @@ def convert_wav_to_mp3(wav_path: Path, mp3_path: Path) -> None:
     subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
 
 
+# Baseline at program start for relative timestamps in verbose logs
+_START_TIME = time.perf_counter()
+
+
+def debug_log(verbose: bool, component: str, msg: str) -> None:
+    """Emit a timestamped debug line to stderr if verbose is enabled.
+
+    Args:
+        verbose: Whether verbose mode is active.
+        component: Short component tag (e.g., 'recorder', 'whisper', 'cli', 'argos').
+        msg: Message to print.
+    """
+    if not verbose:
+        return
+    elapsed = time.perf_counter() - _START_TIME
+    # Elapsed with milliseconds precision
+    print(f"[+{elapsed:.3f}s] [{component}] {msg}", file=sys.stderr, flush=True)
+
+
 def copy_to_clipboard(text: str) -> None:
     system = platform.system()
     try:
```
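Assuming the helper above, every component now logs through one code path, and verbose runs produce stderr lines stamped relative to process start. The timestamps in the comment are illustrative:

```python
from s2t.utils import debug_log

# Expected shape of verbose output (values illustrative):
#   [+0.412s] [cli] Session started; directory: ...
#   [+3.087s] [whisper] Transcribed chunk chunk_0001.wav in 2.511s
debug_log(True, "cli", "hello")    # one timestamped line on stderr
debug_log(False, "cli", "hidden")  # no-op when verbose is off
```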
src/s2t/whisper_engine.py:

```diff
@@ -5,7 +5,20 @@ from concurrent.futures import Future, ThreadPoolExecutor
 from pathlib import Path
 from typing import Any
 
+import numpy as np
+
 from .types import SegmentDict, TranscriptionResult
+from .utils import debug_log
+
+# --- Tuning parameters (easy to adjust later) ---
+# Silence trim parameters operate on 16 kHz mono arrays
+TRIM_RMS_THRESHOLD: float = 0.012  # RMS threshold for speech vs. silence
+TRIM_MIN_VOICED_SEC: float = 0.5  # Require at least this much voiced audio to transcribe
+TRIM_PAD_MS: int = 50  # Keep a short pad around detected speech (ms)
+
+# Whisper inference behavior on low/empty audio
+WHISPER_NO_SPEECH_THRESHOLD: float = 0.7
+WHISPER_CONDITION_ON_PREV: bool = False
 
 
 class WhisperEngine:
@@ -48,6 +61,7 @@ class WhisperEngine:
                 return m, (t1 - t0)
 
             fut = self._executor.submit(_load, self.model_name)
+            debug_log(self.verbose, "whisper", f"Submitted model preload: {self.model_name}")
             return self._executor, fut
         except Exception:
             return None, None
@@ -62,6 +76,9 @@ class WhisperEngine:
                 self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + float(
                     load_dur
                 )
+                debug_log(
+                    self.verbose, "whisper", f"Model resolved via preload in {float(load_dur):.3f}s"
+                )
         except Exception:
             model = None
         if model is None:
@@ -69,6 +86,7 @@ class WhisperEngine:
             model = whisper.load_model(self.model_name)
             t1m = time.perf_counter()
             self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + (t1m - t0m)
+            debug_log(self.verbose, "whisper", f"Loaded model synchronously in {(t1m - t0m):.3f}s")
         return model
 
     def transcribe_chunk(
@@ -79,10 +97,8 @@ class WhisperEngine:
         initial_prompt: str | None = None,
     ) -> TranscriptionResult:
         # Load audio without ffmpeg by reading via soundfile and passing a numpy array
-        # to Whisper.
+        # to Whisper. Convert to mono float32 and resample to 16 kHz as expected by Whisper's API.
         task = "translate" if self.translate else "transcribe"
-        import numpy as np
-
         try:
             import soundfile as sf
         except Exception as e:
@@ -100,16 +116,71 @@ class WhisperEngine:
         # Resample to 16k expected by Whisper when passing arrays
         mono_16k: np.ndarray = resample_linear(mono, int(sr), 16000)
 
+        # Trim leading/trailing silence to avoid hallucinations on near-empty chunks
+        def _moving_rms(x: np.ndarray, win_len: int) -> np.ndarray:
+            if x.size == 0:
+                return np.zeros(0, dtype=np.float32)
+            win = np.ones(win_len, dtype=np.float32) / float(win_len)
+            sq = np.square(x.astype(np.float32, copy=False))
+            # same-length RMS via 'same' convolution
+            ma = np.convolve(sq, win, mode="same")
+            return np.sqrt(ma).astype(np.float32, copy=False)
+
+        def _trim_silence(x: np.ndarray, sr16k: int) -> tuple[np.ndarray, float, float]:
+            # Returns (trimmed, leading_sec, trailing_sec)
+            if x.size == 0:
+                return x, 0.0, 0.0
+            win_len = max(1, int(round(sr16k * 0.03)))  # 30 ms window
+            rms = _moving_rms(x, win_len)
+            thr = float(TRIM_RMS_THRESHOLD)
+            voiced = np.where(rms >= thr)[0]
+            if voiced.size == 0:
+                return np.zeros(0, dtype=np.float32), 0.0, float(x.size) / sr16k
+            start_idx = int(voiced[0])
+            end_idx = int(voiced[-1])
+            pad = int(round((TRIM_PAD_MS / 1000.0) * sr16k))
+            a = max(0, start_idx - pad)
+            b = min(x.size, end_idx + pad + 1)
+            lead_sec = float(a) / sr16k
+            trail_sec = float(x.size - b) / sr16k
+            return x[a:b], lead_sec, trail_sec
+
+        pre_sec = float(mono_16k.size) / 16000.0
+        trimmed, lead_sec, trail_sec = _trim_silence(mono_16k, 16000)
+        post_sec = float(trimmed.size) / 16000.0
+        debug_log(
+            self.verbose,
+            "whisper",
+            f"Chunk {audio_path.name}: trim {pre_sec:.2f}s -> {post_sec:.2f}s (lead {lead_sec:.2f}s, tail {trail_sec:.2f}s)",
+        )
+
+        # If too short after trimming, skip transcription
+        if post_sec < float(TRIM_MIN_VOICED_SEC):
+            debug_log(
+                self.verbose,
+                "whisper",
+                f"Chunk {audio_path.name}: too short after trim ({post_sec:.2f}s) – skipping",
+            )
+            return {"text": "", "segments": []}
+
         t0 = time.perf_counter()
+        debug_log(
+            self.verbose, "whisper", f"Transcribing chunk {audio_path.name} (frames={frames})"
+        )
         res: dict[str, Any] = model.transcribe(
-
+            trimmed,
             task=task,
             language=self.language,
             fp16=False,
-            initial_prompt=initial_prompt,
+            initial_prompt=(initial_prompt if post_sec >= float(TRIM_MIN_VOICED_SEC) else None),
+            condition_on_previous_text=bool(WHISPER_CONDITION_ON_PREV),
+            no_speech_threshold=float(WHISPER_NO_SPEECH_THRESHOLD),
         )
         t1 = time.perf_counter()
         self.profile["transcribe_sec"] = self.profile.get("transcribe_sec", 0.0) + (t1 - t0)
+        debug_log(
+            self.verbose, "whisper", f"Transcribed chunk {audio_path.name} in {(t1 - t0):.3f}s"
+        )
         text_c = str(res.get("text", "") or "").strip()
         lang_code = str(res.get("language", "") or "")
         if self.native_segmentation:
```
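For reference, a standalone sketch of the moving-RMS trim introduced above, using the same constants as the diff (30 ms window, RMS threshold 0.012, 50 ms pad); the synthetic signal at the end is only a demonstration:

```python
import numpy as np

def trim_silence_16k(x: np.ndarray, thr: float = 0.012, pad_ms: int = 50) -> np.ndarray:
    """Trim leading/trailing silence from a 16 kHz mono float32 array."""
    sr = 16000
    win_len = max(1, round(sr * 0.03))  # 30 ms smoothing window
    win = np.ones(win_len, dtype=np.float32) / win_len
    rms = np.sqrt(np.convolve(np.square(x.astype(np.float32)), win, mode="same"))
    voiced = np.where(rms >= thr)[0]
    if voiced.size == 0:
        return np.zeros(0, dtype=np.float32)  # all silence: caller skips the chunk
    pad = round(pad_ms / 1000.0 * sr)
    a = max(0, int(voiced[0]) - pad)
    b = min(x.size, int(voiced[-1]) + pad + 1)
    return x[a:b]

# 1 s of 220 Hz tone padded by 1 s of silence on each side trims to roughly 1.1 s
t = np.linspace(0.0, 1.0, 16000, dtype=np.float32)
sig = np.concatenate(
    [np.zeros(16000, np.float32), 0.1 * np.sin(2 * np.pi * 220 * t), np.zeros(16000, np.float32)]
)
print(trim_silence_16k(sig).size / 16000)  # ≈ 1.10 (1 s tone + 2 × 50 ms pad)
```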
src/s2t/whisper_engine.py (continued):

```diff
@@ -117,8 +188,9 @@ class WhisperEngine:
             segs_typed: list[SegmentDict] = []
             for s in segs_raw:
                 try:
-
-
+                    # Adjust for leading trim so times align with original chunk timeline
+                    start = float(s.get("start", 0.0)) + float(lead_sec)
+                    end = float(s.get("end", 0.0)) + float(lead_sec)
                     text = str(s.get("text", "") or "")
                     segs_typed.append({"start": start, "end": end, "text": text})
                 except Exception:
@@ -129,8 +201,12 @@ class WhisperEngine:
             return out
         # Collapsed single segment per chunk
         segs_raw = res.get("segments", []) or []
-        start = float(segs_raw[0].get("start", 0.0)) if segs_raw else 0.0
-        end =
+        start = (float(segs_raw[0].get("start", 0.0)) + float(lead_sec)) if segs_raw else 0.0
+        end = (
+            (float(segs_raw[-1].get("end", 0.0)) + float(lead_sec))
+            if segs_raw
+            else (frames / float(self.samplerate))
+        )
         out2: TranscriptionResult = {
             "text": text_c,
             "segments": ([{"start": start, "end": end, "text": text_c}] if text_c else []),
@@ -143,12 +219,17 @@ class WhisperEngine:
         try:
             from whisper.utils import get_writer
 
+            debug_log(self.verbose, "whisper", f"Writing outputs for {audio_path.name}")
             for fmt in ("txt", "srt", "vtt", "tsv", "json"):
                 writer = get_writer(fmt, str(self.session_dir))
                 writer(result, str(audio_path))
+            debug_log(self.verbose, "whisper", f"Wrote outputs for {audio_path.name}")
         except Exception as e:
-
-
+            debug_log(
+                self.verbose,
+                "whisper",
+                f"Warning: failed to write chunk outputs for {audio_path.name}: {e}",
+            )
 
     def merge_results(
         self, results: list[TranscriptionResult], offsets: list[float], cumulative_text: str
```