s2t 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
s2t/cli.py CHANGED
@@ -213,6 +213,8 @@ def run_session(opts: SessionOptions) -> int:
213
213
  verbose=opts.verbose,
214
214
  pause_after_first_chunk=opts.prompt,
215
215
  resume_event=prompt_resume_event,
216
+ silence_sec=opts.silence_sec,
217
+ min_chunk_sec=opts.min_chunk_sec,
216
218
  )
217
219
  t0 = time.perf_counter()
218
220
  chunk_paths, chunk_frames, chunk_offsets = rec.run(tx_q)
@@ -442,9 +444,21 @@ def main(argv: list[str] | None = None) -> int:
442
444
  help="Debounce window for SPACE (ms). If >0, ignores rapid successive space presses",
443
445
  )
444
446
  parser.add_argument(
445
- "--native-segmentation",
447
+ "--silence-sec",
448
+ type=float,
449
+ default=1.0,
450
+ help="Auto-split when continuous silence >= this many seconds (0 disables)",
451
+ )
452
+ parser.add_argument(
453
+ "--min-chunk-sec",
454
+ type=float,
455
+ default=5.0,
456
+ help="Minimum duration a chunk must reach before auto-split can trigger",
457
+ )
458
+ parser.add_argument(
459
+ "--chunk-segmentation",
446
460
  action="store_true",
447
- help="Use Whisper's native segmentation inside chunks (default collapses each chunk to a single phrase)",
461
+ help="Disable Whisper's native segmentation: emit exactly one segment per recorded chunk",
448
462
  )
449
463
  parser.add_argument(
450
464
  "-p",
@@ -492,10 +506,12 @@ def main(argv: list[str] | None = None) -> int:
492
506
  lang=args.lang,
493
507
  translate=args.translate,
494
508
  translate_to=(args.translate_to or []),
495
- native_segmentation=getattr(args, "native_segmentation", False),
509
+ native_segmentation=(not getattr(args, "chunk_segmentation", False)),
496
510
  verbose=args.verbose,
497
511
  edit=args.edit,
498
512
  debounce_ms=getattr(args, "debounce_ms", 0),
513
+ silence_sec=getattr(args, "silence_sec", 1.0),
514
+ min_chunk_sec=getattr(args, "min_chunk_sec", 5.0),
499
515
  profile=args.profile,
500
516
  keep_chunks=getattr(args, "keep_chunks", False),
501
517
  prompt=getattr(args, "prompt", False),
s2t/config.py CHANGED
@@ -18,6 +18,8 @@ class SessionOptions:
18
18
  verbose: bool
19
19
  edit: bool
20
20
  debounce_ms: int
21
+ silence_sec: float
22
+ min_chunk_sec: float
21
23
  profile: bool
22
24
  keep_chunks: bool
23
25
  prompt: bool
s2t/recorder.py CHANGED
@@ -9,6 +9,8 @@ import time
9
9
  from pathlib import Path
10
10
  from typing import Any, Protocol, cast, runtime_checkable
11
11
 
12
+ import numpy as np
13
+
12
14
 
13
15
  class Recorder:
14
16
  def __init__(
@@ -21,6 +23,8 @@ class Recorder:
21
23
  verbose: bool = False,
22
24
  pause_after_first_chunk: bool = False,
23
25
  resume_event: threading.Event | None = None,
26
+ silence_sec: float = 1.0,
27
+ min_chunk_sec: float = 5.0,
24
28
  ) -> None:
25
29
  self.session_dir = session_dir
26
30
  self.samplerate = samplerate
@@ -31,6 +35,9 @@ class Recorder:
31
35
  self.pause_after_first_chunk = pause_after_first_chunk
32
36
  self.resume_event = resume_event
33
37
  self._paused = False
38
+ # Auto-split config
39
+ self.silence_sec = max(0.0, float(silence_sec))
40
+ self.min_chunk_sec = max(0.0, float(min_chunk_sec))
34
41
 
35
42
  def run(
36
43
  self,
@@ -209,50 +216,65 @@ class Recorder:
209
216
  fh = sf.SoundFile(
210
217
  str(cur_path), mode="w", samplerate=self.samplerate, channels=self.channels
211
218
  )
219
+ # State for auto-split based on silence
220
+ silent_frames_run = 0
221
+ seen_non_silent = False
222
+ last_split_time = 0.0
223
+ # Internal thresholds
224
+ threshold_rms = 0.015 # conservative RMS threshold for float32 [-1,1]
225
+ split_cooldown_sec = 0.2
226
+
227
+ def _do_split() -> None:
228
+ nonlocal fh, frames_written, cur_path, chunk_index, offset_seconds_total
229
+ fh.flush()
230
+ fh.close()
231
+ if frames_written > 0:
232
+ dur = frames_written / float(self.samplerate)
233
+ chunk_paths.append(cur_path)
234
+ chunk_frames.append(frames_written)
235
+ chunk_offsets.append(offset_seconds_total)
236
+ offset_seconds_total += dur
237
+ if self.verbose:
238
+ print(
239
+ f"Saved chunk: {cur_path.name} ({dur:.2f}s)",
240
+ file=sys.stderr,
241
+ )
242
+ tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1]))
243
+ else:
244
+ try:
245
+ cur_path.unlink(missing_ok=True)
246
+ except Exception:
247
+ pass
248
+ frames_written = 0
249
+ chunk_index += 1
250
+ if (
251
+ self.pause_after_first_chunk
252
+ and chunk_index == 2
253
+ and self.resume_event is not None
254
+ ):
255
+ self._paused = True
256
+ self.resume_event.wait()
257
+ self._paused = False
258
+ cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
259
+ fh = sf.SoundFile(
260
+ str(cur_path),
261
+ mode="w",
262
+ samplerate=self.samplerate,
263
+ channels=self.channels,
264
+ )
265
+ # Silence-tracking state is reset by the caller after each split
266
+ return
267
+
212
268
  while True:
213
269
  # First, handle any pending control commands so SPACE/ENTER are never blocked by frames backlog.
214
270
  try:
215
271
  while True:
216
272
  cmd = ctrl_q.get_nowait()
217
273
  if cmd == "split":
218
- fh.flush()
219
- fh.close()
220
- if frames_written > 0:
221
- dur = frames_written / float(self.samplerate)
222
- chunk_paths.append(cur_path)
223
- chunk_frames.append(frames_written)
224
- chunk_offsets.append(offset_seconds_total)
225
- offset_seconds_total += dur
226
- if self.verbose:
227
- print(
228
- f"Saved chunk: {cur_path.name} ({dur:.2f}s)",
229
- file=sys.stderr,
230
- )
231
- tx_queue.put(
232
- (chunk_index, cur_path, frames_written, chunk_offsets[-1])
233
- )
234
- else:
235
- try:
236
- cur_path.unlink(missing_ok=True)
237
- except Exception:
238
- pass
239
- frames_written = 0
240
- chunk_index += 1
241
- if (
242
- self.pause_after_first_chunk
243
- and chunk_index == 2
244
- and self.resume_event is not None
245
- ):
246
- self._paused = True
247
- self.resume_event.wait()
248
- self._paused = False
249
- cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
250
- fh = sf.SoundFile(
251
- str(cur_path),
252
- mode="w",
253
- samplerate=self.samplerate,
254
- channels=self.channels,
255
- )
274
+ _do_split()
275
+ # Reset silence tracking on manual split
276
+ silent_frames_run = 0
277
+ seen_non_silent = False
256
278
  elif cmd == "finish":
257
279
  fh.flush()
258
280
  fh.close()
@@ -289,6 +311,48 @@ class Recorder:
289
311
  data = payload
290
312
  fh.write(data)
291
313
  frames_written += len(data)
314
+ # Auto-split based on silence if enabled
315
+ if self.silence_sec > 0.0:
316
+ try:
317
+ arr = np.asarray(data, dtype=np.float32)
318
+ if arr.ndim == 2 and arr.shape[1] > 1:
319
+ # average channels
320
+ arr_mono = arr.mean(axis=1)
321
+ else:
322
+ arr_mono = arr.reshape(-1)
323
+ # compute RMS
324
+ rms = (
325
+ float(np.sqrt(np.mean(np.square(arr_mono))))
326
+ if arr_mono.size
327
+ else 0.0
328
+ )
329
+ except Exception:
330
+ rms = 0.0
331
+
332
+ if rms < threshold_rms:
333
+ silent_frames_run += len(arr_mono)
334
+ else:
335
+ silent_frames_run = 0
336
+ seen_non_silent = True
337
+
338
+ # Conditions to auto-split
339
+ enough_silence = silent_frames_run >= int(
340
+ self.samplerate * self.silence_sec
341
+ )
342
+ enough_length = frames_written >= int(self.samplerate * self.min_chunk_sec)
343
+ cooldown_ok = (time.perf_counter() - last_split_time) >= split_cooldown_sec
344
+ if enough_silence and enough_length and seen_non_silent and cooldown_ok:
345
+ if self.verbose:
346
+ print(
347
+ f"[auto] split (≥{self.silence_sec:.2f}s silence)",
348
+ file=sys.stderr,
349
+ )
350
+ last_split_time = time.perf_counter()
351
+ # Queue a split for the next control phase
352
+ ctrl_q.put("split")
353
+ # Reset silence tracking now to avoid cascaded triggers
354
+ silent_frames_run = 0
355
+ seen_non_silent = False
292
356
  tx_queue.put((-1, Path(), 0, 0.0))
293
357
 
294
358
  def cb(indata: Any, frames: int, time_info: Any, status: Any) -> None:
@@ -302,7 +366,12 @@ class Recorder:
302
366
  key_t.start()
303
367
  writer_t.start()
304
368
 
305
- print("Recording… Press SPACE to split, Enter to finish.")
369
+ msg = "Recording… Press SPACE to split, Enter to finish."
370
+ if self.silence_sec > 0.0:
371
+ msg += (
372
+ f" Auto-split on ≥{self.silence_sec:.2f}s silence (min {self.min_chunk_sec:.2f}s)."
373
+ )
374
+ print(msg)
306
375
  print("—" * 60)
307
376
  print("")
308
377
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: s2t
3
- Version: 0.1.5
3
+ Version: 0.1.7
4
4
  Summary: Speech to Text (s2t): Record audio, run Whisper, export formats, and copy transcript to clipboard.
5
5
  Author: Maintainers
6
6
  License-Expression: LicenseRef-Proprietary
@@ -55,6 +55,8 @@ System requirements (Linux)
55
55
  - Translate to English: `-t` (long: `--translate`). You may still provide `--lang` as an input-language hint if you want.
56
56
  - List available models and exit: `-L` (long: `--list-models`)
57
57
  - Recording format: `-f flac|wav|mp3` (long: `--recording-format`), default `flac`. MP3 requires ffmpeg; if absent, it falls back to FLAC with a warning.
58
+ - Auto-split on silence: `--silence-sec 1.0` (default `1.0`; `0` disables). When continuous silence ≥ this many seconds is detected, the current chunk is ended automatically.
59
+ - Minimum chunk length for auto-split: `--min-chunk-sec 5.0` (default `5.0`). Prevents very short chunks and avoids splitting early in a sentence.
58
60
  - Prompt mode (spoken prompt): `-p` (long: `--prompt`). Speak your prompt first, then press SPACE to use it as prompt and continue with your main content. If you press ENTER instead of SPACE, no prompt is used; the spoken audio is transcribed as normal payload and the session ends.
59
61
  - Keep chunk files: `--keep-chunks` — by default, per‑chunk audio and per‑chunk Whisper outputs are deleted after the final merge.
60
62
  - Open transcript for editing: `-e` (long: `--edit`) — opens the generated `.txt` in your shell editor (`$VISUAL`/`$EDITOR`).
@@ -69,6 +71,11 @@ Outputs are written into a timestamped folder under the chosen output directory
69
71
  - Final outputs: `recording.flac/.wav` (and `recording.mp3` if requested and ffmpeg available), plus `recording.txt/.srt/.vtt/.tsv/.json`
70
72
  - Clipboard mirrors the combined `.txt` with blank lines between chunks.
71
73
 
74
+ Auto-splitting details
75
+ - SPACE always splits immediately; ENTER finishes the recording.
76
+ - With `--silence-sec > 0`, a chunk ends automatically once that many seconds of continuous silence have been detected.
77
+ - Auto-split only triggers once the current chunk is at least `--min-chunk-sec` seconds long and speech has been detected in it (so leading silence is ignored). A short internal cooldown avoids duplicate splits.
78
+
72
79
  ## Makefile (optional)
73
80
  - Setup venv + dev deps: `make setup`
74
81
  - Lint/format/test: `make lint`, `make format`, `make test`; combined gate: `make check`
@@ -1,16 +1,16 @@
1
1
  s2t/__init__.py,sha256=wV4E9i-7KrUn1dOtLUQB3ZGEKx9gRWH3hPHlpw-ZdWc,332
2
- s2t/cli.py,sha256=Qf6Hz0Ew9ncLbQQoCPDG7ZiYWeGbwBcZMZi_WbEu54w,20018
3
- s2t/config.py,sha256=lFc_x5fIx_q0JpTcI4Lm4aubxhIXVH34foBvLMUNFGs,437
2
+ s2t/cli.py,sha256=p6lvizzW1T1Y-_ykJ8FdPaBVvZS_no7OgRv4gWlZ95s,20572
3
+ s2t/config.py,sha256=uw4CZSSXmUvnlOrqBGR1Rcq-WdXucHj3KICRcCb_pkU,485
4
4
  s2t/outputs.py,sha256=Lo8VcARZ7QPuuQQNu8myD5J4c4NO1Rs0L1DLnzLe9tM,1546
5
5
  s2t/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
6
- s2t/recorder.py,sha256=bzf3DxwXSO9VmMWDnAW96Q6t0IG1Xnj-Cn8g-t3s0qo,15509
6
+ s2t/recorder.py,sha256=tVUGwnHnlwQM9UtdHUlQIZ3GubCrop9suZYxc9yHHmE,18370
7
7
  s2t/types.py,sha256=jBiRN-tr0qVw-lhaXvnsyKrVGDyLkqEbxs9qkQ6qGqI,339
8
8
  s2t/utils.py,sha256=YU6YhiuONmqhrKte4DY5tiC5PP-yFExJMMBzFUiA8qA,3416
9
9
  s2t/whisper_engine.py,sha256=x-V7ST9e3JnwMWdbMh4C7dHjA420jaOtXH2-igeh7vc,6492
10
10
  s2t/translator/__init__.py,sha256=K-MKves7kZ4-62POfrmWeOcBaTjsTzeFSu8QNHqYuus,239
11
11
  s2t/translator/argos_backend.py,sha256=VW_OYFFBuNZgcWM-fbvR6XGokuxS2fptkCMFIO9MD1I,19068
12
- s2t-0.1.5.dist-info/METADATA,sha256=2ib6WJF5wef_EdJH7fSuwNzoevoZCMLr9tAGldphqBs,4642
13
- s2t-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
- s2t-0.1.5.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
15
- s2t-0.1.5.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
16
- s2t-0.1.5.dist-info/RECORD,,
12
+ s2t-0.1.7.dist-info/METADATA,sha256=VV7G7rNO-2iDMh3SWSETTY0uG99YyfPlQeaOsDQulAk,5372
13
+ s2t-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ s2t-0.1.7.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
15
+ s2t-0.1.7.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
16
+ s2t-0.1.7.dist-info/RECORD,,
File without changes