s2t 0.1.0.post1.dev2__py3-none-any.whl

s2t/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from importlib.metadata import PackageNotFoundError, version
+
+ __all__ = ["__version__"]
+
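+ # Resolve the installed package version; fall back to setuptools-scm for
+ # editable/dev checkouts, then to a "0.0.0" sentinel so import never fails.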
+ try:
+     __version__ = version("s2t")
+ except PackageNotFoundError:
+     try:
+         from setuptools_scm import get_version
+
+         __version__ = get_version(root="..", relative_to=__file__)
+     except Exception:
+         __version__ = "0.0.0"
s2t/cli.py ADDED
@@ -0,0 +1,420 @@
+ #!/usr/bin/env python3
+ """
+ Interactive microphone recording -> Whisper transcription -> outputs + clipboard
+
+ Features
+ - Records from the default microphone until you press Enter.
+ - Default recording format is FLAC (lossless); WAV and MP3 are supported. MP3 requires ffmpeg; otherwise it falls back to FLAC with a warning.
+ - Uses Whisper's Python API (no subprocess) to transcribe/translate and emits txt, srt, vtt, tsv, json.
+ - Copies the .txt transcript to the system clipboard.
+ - Creates a per-session subdirectory under a base output directory, named with an ISO timestamp (e.g., 2025-01-31T14-22-05+0200).
+
+ Requirements
+ - Python packages: sounddevice, soundfile, openai-whisper (pip install sounddevice soundfile openai-whisper)
+ - Optional: ffmpeg (required for MP3 output; openai-whisper also invokes ffmpeg when it loads audio files by path)
+
+ Usage
+     s2t
+ With options
+     s2t -l de -m turbo -o transcripts -t -f flac
+
+ Notes
+ - Default output directory is `transcripts/` if `-o/--outdir` is omitted.
+ - In prompt mode (`-p/--prompt`), speak your prompt first, then press SPACE. The app waits until the prompt is transcribed, prints a separator, and then you start speaking your main content. You may also press ENTER instead of SPACE to finish after the prompt; in that case the session ends after transcribing the prompt.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import logging
+ import queue
+ import re
+ import shutil
+ import sys
+ import threading
+ import time
+ from pathlib import Path
+
+ from . import __version__
+ from .config import SessionOptions
+ from .outputs import concat_audio, write_final_outputs
+ from .recorder import Recorder
+ from .types import TranscriptionResult
+ from .utils import (
+     convert_wav_to_mp3,
+     copy_to_clipboard,
+     make_session_dir,
+     open_in_shell_editor,
+ )
+ from .whisper_engine import WhisperEngine
+
+
+ def run_session(opts: SessionOptions) -> int:
+     session_dir = make_session_dir(opts.outdir)
+     profile_data: dict = {}
+     requested = opts.recording_format.lower()
+     effective = requested
+     if requested == "mp3" and shutil.which("ffmpeg") is None:
+         logging.warning("ffmpeg not found; falling back to FLAC recording instead of MP3.")
+         effective = "flac"
+     ext = ".flac" if effective == "flac" else ".wav"
+
+     engine = WhisperEngine(
+         model_name=opts.model,
+         translate=opts.translate,
+         language=opts.lang,
+         native_segmentation=opts.native_segmentation,
+         session_dir=session_dir,
+         samplerate=opts.rate,
+         channels=opts.channels,
+         verbose=opts.verbose,
+         profile=profile_data if opts.profile else {},
+     )
+     ex, fut = engine.preload()
+
+     tx_q: queue.Queue[tuple[int, Path, int, float]] = queue.Queue()
+     cumulative_text = ""
+     next_to_emit = 1
+     pending: dict[int, str] = {}
+     results: list[TranscriptionResult] = []
+     offsets: list[float] = []
+     agg_lock = threading.Lock()
+     tx_done = threading.Event()
+
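+     # Builds the `initial_prompt` for a chunk from chunks transcribed so far:
+     # walk earlier chunks newest-first and keep only sentences that look complete,
+     # within a character/chunk budget, so Whisper is conditioned on stable context.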
+     def _build_latest_ready_prompt(
+         current_index: int, finished: dict[int, str], max_chars: int = 800, max_chunks: int = 3
+     ) -> str | None:
+         parts: list[str] = []
+         total = 0
+         taken_chunks = 0
+         # Walk backward from previous indices
+         for idx in range(current_index - 1, 0, -1):
+             if idx not in finished:
+                 continue
+             text = finished[idx].strip()
+             if not text:
+                 continue
+             # Split into sentences (simple heuristic: ., !, ? followed by whitespace or end)
+             sentences = re.split(r"(?<=[.!?])[\s\n]+", text)
+             # Take completed sentences from the end
+             for s in reversed(sentences):
+                 s = s.strip()
+                 if not s:
+                     continue
+                 # Ensure it looks like a completed sentence
+                 # Use triple-quoted raw string to safely include quotes in the class
+                 if not re.search(r"""[.!?][\)\]\}"']*$|[.!?]$""", s):
+                     # skip likely incomplete trailing fragment
+                     continue
+                 if total + len(s) + (1 if parts else 0) > max_chars:
+                     return (" ".join(reversed(parts))) or None
+                 # Count the joining space before appending so the budget stays accurate
+                 total += len(s) + (1 if parts else 0)
+                 parts.append(s)
+             # Sentences per chunk are not capped; stop once enough chunks contributed
+             taken_chunks += 1
+             if taken_chunks >= max_chunks or total >= max_chars:
+                 break
+         return (" ".join(reversed(parts))) or None
+
+     # Event signaling that prompt (chunk #1) is fully transcribed
+     prompt_done = threading.Event()
+
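+     # Transcription worker: consumes chunks as they finish recording but emits
+     # text strictly in chunk order via the `pending`/`next_to_emit` reorder
+     # buffer, refreshing the clipboard with the cumulative transcript each time.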
+     def tx_worker():
+         model = engine.resolve_model(fut)
+         nonlocal cumulative_text, next_to_emit
+         finished_texts: dict[int, str] = {}
+         while True:
+             idx, path, frames, offset = tx_q.get()
+             if idx == -1:
+                 break
+             # If in spoken-prompt mode, ensure we don't process payload chunks before prompt is done
+             if opts.prompt and idx > 1 and not prompt_done.is_set():
+                 prompt_done.wait()
+             # Build latest-ready prompt based on already finished chunks
+             prompt = _build_latest_ready_prompt(idx, finished_texts)
+             res = engine.transcribe_chunk(model, path, frames, initial_prompt=prompt)
+             engine.write_chunk_outputs(res, path)
+             text_i = (res.get("text", "") or "").strip()
+             with agg_lock:
+                 if text_i:
+                     finished_texts[idx] = text_i
+                 results.append(res)
+                 offsets.append(offset)
+                 pending[idx] = text_i
+                 while next_to_emit in pending:
+                     out = pending.pop(next_to_emit)
+                     if out:
+                         print(out)
+                         print("")
+                         cumulative_text += out if not cumulative_text else ("\n\n" + out)
+                         try:
+                             copy_to_clipboard(cumulative_text)
+                         except Exception:
+                             pass
+                     next_to_emit += 1
+             # If this was the prompt chunk, signal readiness and instruct user
+             if opts.prompt and idx == 1 and not prompt_done.is_set():
+                 prompt_done.set()
+                 print("=" * 60)
+                 print("Prompt transcribed. Start speaking your main content now.")
+                 print("=" * 60)
+                 # Allow recorder to resume writing the next chunk
+                 if prompt_resume_event is not None:
+                     prompt_resume_event.set()
+         tx_done.set()
+
+     # Prepare resume event to pause recording between prompt and payload; create
+     # it before the worker thread starts so tx_worker never reads an unbound name.
+     prompt_resume_event = threading.Event() if opts.prompt else None
+     tx_t = threading.Thread(target=tx_worker, daemon=True)
+     tx_t.start()
+
+     if opts.prompt:
+         print("Prompt mode enabled: Speak your prompt first, then press SPACE.")
+         print("Recording will wait for the prompt transcription before starting payload.")
+     rec = Recorder(
+         session_dir,
+         opts.rate,
+         opts.channels,
+         ext,
+         debounce_ms=opts.debounce_ms,
+         verbose=opts.verbose,
+         pause_after_first_chunk=opts.prompt,
+         resume_event=prompt_resume_event,
+     )
+     t0 = time.perf_counter()
+     chunk_paths, chunk_frames, chunk_offsets = rec.run(tx_q)
+     t1 = time.perf_counter()
+     if opts.profile:
+         profile_data["recording_sec"] = t1 - t0
+     tx_t.join()
+
+     merged: TranscriptionResult = engine.merge_results(results, chunk_offsets, cumulative_text)
+     base_audio_path = session_dir / f"recording{ext}"
+     txt_path = write_final_outputs(merged, session_dir, base_audio_path)
+
+     try:
+         if chunk_paths:
+             concat_audio(chunk_paths, base_audio_path, opts.rate, opts.channels)
+             if opts.verbose:
+                 print(f"Merged audio written: {base_audio_path.name}", file=sys.stderr)
+         if requested == "mp3" and shutil.which("ffmpeg") is not None:
+             mp3_out = session_dir / "recording.mp3"
+             # ffmpeg decodes FLAC and WAV alike, so pass the merged file directly
+             convert_wav_to_mp3(base_audio_path, mp3_out)
+             if opts.verbose:
+                 print(f"Converted merged audio to MP3: {mp3_out.name}", file=sys.stderr)
+     except Exception as e:
+         if opts.verbose:
+             print(f"Warning: failed to merge chunk audio: {e}", file=sys.stderr)
+
+
217
+ # Optionally delete chunk files (audio + per-chunk outputs)
218
+ if chunk_paths and not opts.keep_chunks:
219
+ for p in chunk_paths:
220
+ try:
221
+ p.unlink(missing_ok=True)
222
+ except Exception:
223
+ pass
224
+ stem = p.with_suffix("")
225
+ for suf in (".txt", ".srt", ".vtt", ".tsv", ".json"):
226
+ try:
227
+ (stem.with_suffix(suf)).unlink(missing_ok=True)
228
+ except Exception:
229
+ pass
230
+
231
+ text_final: str = merged.get("text") or cumulative_text
232
+ t_cb0 = time.perf_counter()
233
+ copy_to_clipboard(text_final)
234
+ t_cb1 = time.perf_counter()
235
+ profile_data["clipboard_sec"] = t_cb1 - t_cb0
236
+
237
+ print("—" * 60)
238
+ print(f"Done. Files in folder: {session_dir}")
239
+ print("Created:")
240
+ if chunk_paths:
241
+ print(f" - chunks: {chunk_paths[0].name} … {chunk_paths[-1].name} (x{len(chunk_paths)})")
242
+ print(" - Whisper outputs: .txt, .srt, .vtt, .tsv, .json")
243
+ print(f"Copied TXT to clipboard: {txt_path.name}")
244
+
245
+ if opts.edit:
246
+ opened, used = open_in_shell_editor(txt_path)
247
+ if opened:
248
+ print("—" * 60)
249
+ print(f"Opened transcript in editor: {used or '$VISUAL/$EDITOR'}")
250
+ else:
251
+ print("—" * 60)
252
+ print(
253
+ "Could not open an editor from $VISUAL/$EDITOR or fallbacks; printing transcript instead:"
254
+ )
255
+ print(text_final.rstrip("\n"))
256
+ else:
257
+ print("—" * 60)
258
+ print("Transcript (clipboard text):")
259
+ print(text_final.rstrip("\n"))
260
+
261
+ if opts.profile:
262
+ try:
263
+ prof_path = session_dir / "profile.json"
264
+ prof_json = {**profile_data}
265
+ prof_json["total_sec"] = prof_json.get("total_sec", (time.perf_counter() - t0))
266
+ prof_path.write_text(json.dumps(prof_json, indent=2), encoding="utf-8")
267
+ print("—" * 60)
268
+ print("Profiling summary (seconds):")
269
+ for key in (
270
+ "recording_sec",
271
+ "model_load_sec",
272
+ "transcribe_sec",
273
+ "clipboard_sec",
274
+ "total_sec",
275
+ ):
276
+ if key in prof_json:
277
+ print(f" {key}: {prof_json[key]:.3f}")
278
+ print(f"Saved profiling JSON: {prof_path}")
279
+ except Exception as e:
280
+ print(f"Warning: failed to write profiling JSON: {e}")
281
+ return 0
282
+
+
+ def main(argv: list[str] | None = None) -> int:
+     parser = argparse.ArgumentParser(
+         description="Record speech, transcribe with Whisper, emit outputs, and copy .txt to clipboard."
+     )
+     parser.add_argument(
+         "-V",
+         "--version",
+         action="version",
+         version=f"%(prog)s {__version__}",
+         help="Show program's version number and exit",
+     )
+     parser.add_argument(
+         "-l",
+         "--lang",
+         help="Whisper language (e.g., 'de' or 'en'); auto-detect if omitted",
+         default=None,
+     )
+     parser.add_argument(
+         "-r", "--rate", type=int, default=44100, help="Sample rate (default: 44100)"
+     )
+     parser.add_argument(
+         "-c", "--channels", type=int, default=1, help="Channels (1=mono, 2=stereo; default: 1)"
+     )
+     parser.add_argument(
+         "-m",
+         "--model",
+         default="turbo",
+         help="Whisper model (e.g., turbo, base, small, medium, large-v2)",
+     )
+     parser.add_argument(
+         "-f",
+         "--recording-format",
+         choices=["flac", "wav", "mp3"],
+         default="flac",
+         help="Audio container for the recording (default: flac)",
+     )
+     parser.add_argument(
+         "-o",
+         "--outdir",
+         default=None,
+         help="Base output directory for timestamped sessions (default: transcripts/)",
+     )
+     parser.add_argument(
+         "-t",
+         "--translate",
+         action="store_true",
+         help="Translate to English instead of transcribing in source language",
+     )
+     parser.add_argument(
+         "-v",
+         "--verbose",
+         action="store_true",
+         help="Print details about the Whisper invocation",
+     )
+     parser.add_argument(
+         "-L",
+         "--list-models",
+         action="store_true",
+         help="List available Whisper model names and exit",
+     )
+     parser.add_argument(
+         "--profile",
+         action="store_true",
+         help="Collect and print timing information; also writes profile.json to the session folder",
+     )
+     parser.add_argument(
+         "--debounce-ms",
+         type=int,
+         default=0,
+         help="Debounce window for SPACE (ms). If >0, ignores rapid successive space presses",
+     )
+     parser.add_argument(
+         "--native-segmentation",
+         action="store_true",
+         help="Use Whisper's native segmentation inside chunks (default collapses each chunk to a single phrase)",
+     )
+     parser.add_argument(
+         "-p",
+         "--prompt",
+         action="store_true",
+         help="Spoken prompt mode: speak your prompt, then press SPACE to use it as prompt and continue with payload; if you press ENTER instead, no prompt is used and the spoken audio is transcribed as normal payload before ending",
+     )
+     parser.add_argument(
+         "--keep-chunks",
+         action="store_true",
+         help="Keep per-chunk audio and outputs (default: delete after final merge)",
+     )
+     parser.add_argument(
+         "-e",
+         "--edit",
+         action="store_true",
+         help="Open the transcript (.txt) in the system's default editor instead of printing to stdout",
+     )
+     args = parser.parse_args(argv)
+
+     try:
+         if args.list_models:
+             try:
+                 import whisper
+
+                 models = sorted(whisper.available_models())
+                 print("Available models:")
+                 for m in models:
+                     print(f" - {m}")
+                 return 0
+             except Exception as e:
+                 print(f"Error listing models: {e}", file=sys.stderr)
+                 return 1
+         logging.basicConfig(
+             level=(logging.INFO if args.verbose else logging.WARNING),
+             format="%(levelname)s: %(message)s",
+         )
+         # Default outdir to 'transcripts' if not provided
+         opts = SessionOptions(
+             outdir=Path(args.outdir) if args.outdir else Path("transcripts"),
+             rate=args.rate,
+             channels=args.channels,
+             recording_format=args.recording_format,
+             model=args.model,
+             lang=args.lang,
+             translate=args.translate,
+             native_segmentation=args.native_segmentation,
+             verbose=args.verbose,
+             edit=args.edit,
+             debounce_ms=args.debounce_ms,
+             profile=args.profile,
+             keep_chunks=args.keep_chunks,
+             prompt=args.prompt,
+         )
+         return run_session(opts)
+     except Exception as e:
+         print(f"Error: {e}", file=sys.stderr)
+         return 1
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
s2t/config.py ADDED
@@ -0,0 +1,22 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from pathlib import Path
+
+
+ @dataclass
+ class SessionOptions:
+     outdir: Path | None
+     rate: int
+     channels: int
+     recording_format: str
+     model: str
+     lang: str | None
+     translate: bool
+     native_segmentation: bool
+     verbose: bool
+     edit: bool
+     debounce_ms: int
+     profile: bool
+     keep_chunks: bool
+     prompt: bool
s2t/outputs.py ADDED
@@ -0,0 +1,49 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from .types import TranscriptionResult
+
+
+ def write_final_outputs(
+     merged_result: TranscriptionResult, session_dir: Path, base_audio_path: Path
+ ) -> Path:
+     try:
+         from whisper.utils import get_writer
+
+         for fmt in ("txt", "srt", "vtt", "tsv", "json"):
+             writer = get_writer(fmt, str(session_dir))
+             writer(merged_result, str(base_audio_path))
+         return session_dir / "recording.txt"
+     except Exception as e:
+         print(f"Error writing merged outputs: {e}")
+         txt_path = session_dir / "recording.txt"
+         try:
+             txt_path.write_text(merged_result.get("text", ""), encoding="utf-8")
+         except Exception:
+             pass
+         return txt_path
+
+
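+ # Stream-concatenate the chunk files into one audio file without loading them
+ # all into memory: read each chunk in fixed-size frame blocks and append.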
+ def concat_audio(
+     chunk_paths: list[Path],
+     out_path: Path,
+     samplerate: int,
+     channels: int,
+ ) -> None:
+     try:
+         import soundfile as sf
+
+         fmt = "FLAC" if out_path.suffix.lower() == ".flac" else "WAV"
+         with sf.SoundFile(
+             str(out_path), mode="w", samplerate=samplerate, channels=channels, format=fmt
+         ) as outf:
+             for p in chunk_paths:
+                 with sf.SoundFile(str(p), mode="r") as inf:
+                     while True:
+                         data = inf.read(frames=16384, dtype="float32")
+                         if data.size == 0:
+                             break
+                         outf.write(data)
+     except Exception as e:
+         print(f"Warning: failed to merge chunk audio: {e}")
s2t/py.typed ADDED
@@ -0,0 +1 @@
+
s2t/recorder.py ADDED
@@ -0,0 +1,205 @@
+ from __future__ import annotations
+
+ import queue
+ import select
+ import sys
+ import threading
+ import time
+ from pathlib import Path
+ from typing import Any, Protocol, cast, runtime_checkable
+
+
+ class Recorder:
+     def __init__(
+         self,
+         session_dir: Path,
+         samplerate: int,
+         channels: int,
+         ext: str,
+         debounce_ms: int = 0,
+         verbose: bool = False,
+         pause_after_first_chunk: bool = False,
+         resume_event: threading.Event | None = None,
+     ) -> None:
+         self.session_dir = session_dir
+         self.samplerate = samplerate
+         self.channels = channels
+         self.ext = ext
+         self.debounce_ms = max(0, int(debounce_ms))
+         self.verbose = verbose
+         self.pause_after_first_chunk = pause_after_first_chunk
+         self.resume_event = resume_event
+         self._paused = False
+
+     def run(
+         self,
+         tx_queue: queue.Queue[tuple[int, Path, int, float]],
+     ) -> tuple[list[Path], list[int], list[float]]:
+         import platform
+
+         try:
+             import sounddevice as sd
+             import soundfile as sf
+         except Exception as e:
+             raise RuntimeError("sounddevice/soundfile required for recording.") from e
+
+         evt_q: queue.Queue[str] = queue.Queue()
+         stop_evt = threading.Event()
+
+         def key_reader() -> None:
+             try:
+                 if platform.system() == "Windows":
+                     import msvcrt
+
+                     @runtime_checkable
+                     class _MSVCRT(Protocol):
+                         def kbhit(self) -> int: ...
+                         def getwch(self) -> str: ...
+
+                     ms = cast(_MSVCRT, msvcrt)
+
+                     last_space = 0.0
+                     while not stop_evt.is_set():
+                         if ms.kbhit():
+                             ch = ms.getwch()
+                             if ch in ("\r", "\n"):
+                                 evt_q.put("ENTER")
+                                 break
+                             if ch == " ":
+                                 now = time.perf_counter()
+                                 if self.debounce_ms and (now - last_space) < (
+                                     self.debounce_ms / 1000.0
+                                 ):
+                                     continue
+                                 last_space = now
+                                 evt_q.put("SPACE")
+                         time.sleep(0.01)
+                 else:
+                     # termios/tty are POSIX-only; import them here so the module
+                     # also loads on Windows
+                     import termios
+                     import tty
+
+                     fd = sys.stdin.fileno()
+                     old = termios.tcgetattr(fd)
+                     tty.setcbreak(fd)
+                     last_space = 0.0
+                     try:
+                         while not stop_evt.is_set():
+                             r, _, _ = select.select([sys.stdin], [], [], 0.05)
+                             if r:
+                                 ch = sys.stdin.read(1)
+                                 if ch in ("\n", "\r"):
+                                     evt_q.put("ENTER")
+                                     break
+                                 if ch == " ":
+                                     now = time.perf_counter()
+                                     if self.debounce_ms and (now - last_space) < (
+                                         self.debounce_ms / 1000.0
+                                     ):
+                                         continue
+                                     last_space = now
+                                     evt_q.put("SPACE")
+                     finally:
+                         termios.tcsetattr(fd, termios.TCSADRAIN, old)
+             except Exception:
+                 pass
+
+         audio_q: queue.Queue[tuple[str, Any]] = queue.Queue(maxsize=128)
+         chunk_index = 1
+         chunk_paths: list[Path] = []
+         chunk_frames: list[int] = []
+         chunk_offsets: list[float] = []
+         offset_seconds_total = 0.0
+
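+         # Disk writer: drains audio_q and appends frames to the current chunk
+         # file. "split" closes the chunk and enqueues it for transcription (or
+         # deletes it if empty); "finish" does the same and then emits the -1
+         # sentinel that stops the transcription worker.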
+         def writer_fn() -> None:
+             nonlocal chunk_index, offset_seconds_total
+             frames_written = 0
+             cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
+             fh = sf.SoundFile(
+                 str(cur_path), mode="w", samplerate=self.samplerate, channels=self.channels
+             )
+             while True:
+                 kind, payload = audio_q.get()
+                 if kind == "frames":
+                     data = payload
+                     fh.write(data)
+                     frames_written += len(data)
+                 elif kind == "split":
+                     fh.flush()
+                     fh.close()
+                     if frames_written > 0:
+                         dur = frames_written / float(self.samplerate)
+                         chunk_paths.append(cur_path)
+                         chunk_frames.append(frames_written)
+                         chunk_offsets.append(offset_seconds_total)
+                         offset_seconds_total += dur
+                         if self.verbose:
+                             print(f"Saved chunk: {cur_path.name} ({dur:.2f}s)", file=sys.stderr)
+                         tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1]))
+                     else:
+                         try:
+                             cur_path.unlink(missing_ok=True)
+                         except Exception:
+                             pass
+                     frames_written = 0
+                     chunk_index += 1
+                     if (
+                         self.pause_after_first_chunk
+                         and chunk_index == 2
+                         and self.resume_event is not None
+                     ):
+                         self._paused = True
+                         self.resume_event.wait()
+                         self._paused = False
+                     cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
+                     fh = sf.SoundFile(
+                         str(cur_path), mode="w", samplerate=self.samplerate, channels=self.channels
+                     )
+                 elif kind == "finish":
+                     fh.flush()
+                     fh.close()
+                     if frames_written > 0:
+                         dur = frames_written / float(self.samplerate)
+                         chunk_paths.append(cur_path)
+                         chunk_frames.append(frames_written)
+                         chunk_offsets.append(offset_seconds_total)
+                         offset_seconds_total += dur
+                         if self.verbose:
+                             print(f"Saved chunk: {cur_path.name} ({dur:.2f}s)", file=sys.stderr)
+                         tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1]))
+                     else:
+                         try:
+                             cur_path.unlink(missing_ok=True)
+                         except Exception:
+                             pass
+                     break
+             tx_queue.put((-1, Path(), 0, 0.0))
+
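+         # sounddevice callback (runs on the audio I/O thread): copy the incoming
+         # buffer and hand it to the writer thread; while paused (prompt mode),
+         # incoming frames are discarded.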
+         def cb(indata: Any, frames: int, time_info: Any, status: Any) -> None:
+             if status:
+                 print(status, file=sys.stderr)
+             if not self._paused:
+                 audio_q.put(("frames", indata.copy()))
+
+         key_t = threading.Thread(target=key_reader, daemon=True)
+         writer_t = threading.Thread(target=writer_fn, daemon=True)
+         key_t.start()
+         writer_t.start()
+
+         print("Recording… Press SPACE to split, Enter to finish.")
+         print("—" * 60)
+         print("")
+
+         with sd.InputStream(samplerate=self.samplerate, channels=self.channels, callback=cb):
+             while True:
+                 try:
+                     evt = evt_q.get(timeout=0.05)
+                 except queue.Empty:
+                     continue
+                 if evt == "SPACE":
+                     audio_q.put(("split", None))
+                 elif evt == "ENTER":
+                     audio_q.put(("finish", None))
+                     break
+         writer_t.join()
+         return chunk_paths, chunk_frames, chunk_offsets
s2t/types.py ADDED
@@ -0,0 +1,14 @@
+ from __future__ import annotations
+
+ from typing import TypedDict
+
+
+ class SegmentDict(TypedDict, total=False):
+     start: float
+     end: float
+     text: str
+
+
+ class TranscriptionResult(TypedDict):
+     text: str
+     segments: list[SegmentDict]
s2t/utils.py ADDED
@@ -0,0 +1,109 @@
+ from __future__ import annotations
+
+ import os
+ import platform
+ import shutil
+ import subprocess
+ import sys
+ from datetime import datetime
+ from pathlib import Path
+
+ import numpy as np
+
+
+ def check_dependency(cmd: str, install_hint: str) -> None:
+     if shutil.which(cmd) is None:
+         raise RuntimeError(f"Dependency '{cmd}' not found. Hint: {install_hint}")
+
+
+ def convert_wav_to_mp3(wav_path: Path, mp3_path: Path) -> None:
+     check_dependency(
+         "ffmpeg",
+         "macOS: brew install ffmpeg; Linux: apt/yum; Windows: install ffmpeg and add to PATH",
+     )
+     cmd = [
+         "ffmpeg",
+         "-y",
+         "-i",
+         str(wav_path),
+         "-vn",
+         "-acodec",
+         "libmp3lame",
+         "-q:a",
+         "2",
+         str(mp3_path),
+     ]
+     subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+
+
+ def copy_to_clipboard(text: str) -> None:
+     system = platform.system()
+     try:
+         if system == "Darwin":
+             subprocess.run(["pbcopy"], input=text, text=True, check=True)
+             return
+         if system == "Windows":
+             subprocess.run(["clip"], input=text, text=True, check=True)
+             return
+         if shutil.which("xclip"):
+             subprocess.run(["xclip", "-selection", "clipboard"], input=text, text=True, check=True)
+             return
+         if shutil.which("xsel"):
+             subprocess.run(["xsel", "--clipboard", "--input"], input=text, text=True, check=True)
+             return
+         try:
+             import pyperclip
+
+             pyperclip.copy(text)
+             return
+         except Exception:
+             pass
+     except Exception as e:
+         print(f"Copy to clipboard failed: {e}", file=sys.stderr)
+         return
+     print("No clipboard tool found (pbcopy/clip/xclip/xsel). Optional: pip install pyperclip.")
+
+
+ def open_in_shell_editor(file_path: Path) -> tuple[bool, str]:
+     env_editor = os.environ.get("VISUAL") or os.environ.get("EDITOR")
+     candidates: list[list[str]] = []
+     if env_editor:
+         import shlex as _shlex
+
+         try:
+             candidates.append(_shlex.split(env_editor))
+         except Exception:
+             candidates.append([env_editor])
+     candidates += [["vim"], ["nvim"], ["nano"], ["micro"], ["notepad"]]
+     for argv in candidates:
+         exe = argv[0]
+         if shutil.which(exe) is None:
+             continue
+         try:
+             subprocess.run(argv + [str(file_path)], check=True)
+             return True, " ".join(argv)
+         except Exception:
+             continue
+     return False, ""
+
+
+ def make_session_dir(base_dir: Path | None = None) -> Path:
+     ts = datetime.now().astimezone().strftime("%Y-%m-%dT%H-%M-%S%z")
+     base = Path(base_dir) if base_dir is not None else Path.cwd()
+     base.mkdir(parents=True, exist_ok=True)
+     session = base / ts
+     session.mkdir(parents=True, exist_ok=False)
+     return session
+
+
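+ # Minimal linear-interpolation resampler (numpy only). Adequate for speech fed
+ # to Whisper; a polyphase filter such as scipy.signal.resample_poly would give
+ # higher fidelity at the cost of a scipy dependency.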
+ def resample_linear(x: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
+     if src_sr == dst_sr:
+         return x.astype(np.float32, copy=False)
+     x = x.astype(np.float32, copy=False)
+     n_src = x.shape[0]
+     n_dst = int(round(n_src * (dst_sr / float(src_sr))))
+     if n_src == 0 or n_dst == 0:
+         return np.zeros(n_dst, dtype=np.float32)
+     src_t = np.linspace(0.0, 1.0, num=n_src, endpoint=False)
+     dst_t = np.linspace(0.0, 1.0, num=n_dst, endpoint=False)
+     return np.interp(dst_t, src_t, x).astype(np.float32)
s2t/whisper_engine.py ADDED
@@ -0,0 +1,139 @@
+ from __future__ import annotations
+
+ import time
+ from concurrent.futures import Future, ThreadPoolExecutor
+ from pathlib import Path
+ from typing import Any
+
+ from .types import SegmentDict, TranscriptionResult
+
+
+ class WhisperEngine:
+     def __init__(
+         self,
+         model_name: str,
+         translate: bool,
+         language: str | None,
+         native_segmentation: bool,
+         session_dir: Path,
+         samplerate: int,
+         channels: int,
+         verbose: bool = False,
+         profile: dict | None = None,
+     ) -> None:
+         self.model_name = model_name
+         self.translate = translate
+         self.language = language
+         self.native_segmentation = native_segmentation
+         self.session_dir = session_dir
+         self.samplerate = samplerate
+         self.channels = channels
+         self.verbose = verbose
+         self.profile = profile or {}
+         self._executor: ThreadPoolExecutor | None = None
+
+     def preload(self) -> tuple[ThreadPoolExecutor | None, Future | None]:
+         try:
+             self._executor = ThreadPoolExecutor(max_workers=1)
+
+             def _load(name: str):
+                 import whisper
+
+                 t0 = time.perf_counter()
+                 m = whisper.load_model(name)
+                 t1 = time.perf_counter()
+                 return m, (t1 - t0)
+
+             fut = self._executor.submit(_load, self.model_name)
+             return self._executor, fut
+         except Exception:
+             return None, None
+
+     def resolve_model(self, fut: Future | None):
+         import whisper
+
+         model = None
+         if fut is not None:
+             try:
+                 model, load_dur = fut.result()
+                 self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + float(
+                     load_dur
+                 )
+             except Exception:
+                 model = None
+         if model is None:
+             t0m = time.perf_counter()
+             model = whisper.load_model(self.model_name)
+             t1m = time.perf_counter()
+             self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + (t1m - t0m)
+         return model
+
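+     # Transcribe one chunk. fp16=False forces float32 inference (avoids the
+     # fp16-on-CPU warning); initial_prompt conditions the decoder on prior
+     # context for better cross-chunk consistency.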
+     def transcribe_chunk(
+         self,
+         model,
+         audio_path: Path,
+         frames: int,
+         initial_prompt: str | None = None,
+     ) -> TranscriptionResult:
+         task = "translate" if self.translate else "transcribe"
+         t0 = time.perf_counter()
+         res: dict[str, Any] = model.transcribe(
+             str(audio_path),
+             task=task,
+             language=self.language,
+             fp16=False,
+             initial_prompt=initial_prompt,
+         )
+         t1 = time.perf_counter()
+         self.profile["transcribe_sec"] = self.profile.get("transcribe_sec", 0.0) + (t1 - t0)
+         text_c = str(res.get("text", "") or "").strip()
+         if self.native_segmentation:
+             segs_raw = res.get("segments", []) or []
+             segs_typed: list[SegmentDict] = []
+             for s in segs_raw:
+                 try:
+                     start = float(s.get("start", 0.0))
+                     end = float(s.get("end", 0.0))
+                     text = str(s.get("text", "") or "")
+                     segs_typed.append({"start": start, "end": end, "text": text})
+                 except Exception:
+                     continue
+             return {"text": text_c, "segments": segs_typed}
+         # Collapsed single segment per chunk
+         segs_raw = res.get("segments", []) or []
+         start = float(segs_raw[0].get("start", 0.0)) if segs_raw else 0.0
+         end = float(segs_raw[-1].get("end", 0.0)) if segs_raw else (frames / float(self.samplerate))
+         return {
+             "text": text_c,
+             "segments": ([{"start": start, "end": end, "text": text_c}] if text_c else []),
+         }
+
+     def write_chunk_outputs(self, result: TranscriptionResult, audio_path: Path) -> None:
+         try:
+             from whisper.utils import get_writer
+
+             for fmt in ("txt", "srt", "vtt", "tsv", "json"):
+                 writer = get_writer(fmt, str(self.session_dir))
+                 writer(result, str(audio_path))
+         except Exception as e:
+             if self.verbose:
+                 print(f"Warning: failed to write chunk outputs for {audio_path.name}: {e}")
+
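+     # Merge per-chunk results into one session-level result: segment timestamps
+     # are shifted by each chunk's start offset to make them absolute within the
+     # session; the ordered cumulative text, when present, replaces the naive
+     # concatenation (which lacks separators between chunks).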
+     def merge_results(
+         self, results: list[TranscriptionResult], offsets: list[float], cumulative_text: str
+     ) -> TranscriptionResult:
+         merged: TranscriptionResult = {"text": "", "segments": []}
+         for res, off in zip(results, offsets, strict=False):
+             merged["text"] += res.get("text") or ""
+             for s in res.get("segments", []):
+                 s2: SegmentDict = {}
+                 if "start" in s:
+                     s2["start"] = float(s["start"]) + off
+                 if "end" in s:
+                     s2["end"] = float(s["end"]) + off
+                 if "text" in s:
+                     s2["text"] = s["text"]
+                 merged["segments"].append(s2)
+         if (cumulative_text or "").strip():
+             merged["text"] = cumulative_text
+         return merged
s2t-0.1.0.post1.dev2.dist-info/METADATA ADDED
@@ -0,0 +1,85 @@
+ Metadata-Version: 2.4
+ Name: s2t
+ Version: 0.1.0.post1.dev2
+ Summary: Speech to Text (s2t): Record audio, run Whisper, export formats, and copy transcript to clipboard.
+ Author: Maintainers
+ License-Expression: LicenseRef-Proprietary
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Environment :: Console
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: sounddevice>=0.4.6
+ Requires-Dist: soundfile>=0.12.1
+ Requires-Dist: numpy>=1.23
+ Requires-Dist: openai-whisper>=20231117
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7; extra == "dev"
+ Requires-Dist: pytest-cov>=4; extra == "dev"
+ Requires-Dist: ruff>=0.4; extra == "dev"
+ Requires-Dist: mypy>=1.7; extra == "dev"
+ Requires-Dist: build>=1; extra == "dev"
+ Requires-Dist: setuptools-scm>=8; extra == "dev"
+ Requires-Dist: twine>=4; extra == "dev"
+
+ # s2t
+
+ Record audio from your microphone, run Whisper to transcribe it, export common formats, and copy the .txt transcript to your clipboard.
+
+ ## Install
+ - From local checkout:
+   - Editable: `pip install -e .`
+   - Standard: `pip install .`
+
+ Requirements: Python 3.11+. ffmpeg is needed for MP3 encoding; note that openai-whisper itself invokes ffmpeg when loading audio files by path, so installing ffmpeg is recommended for transcription in general.
+
+ System requirements (Linux)
+ - Some environments need system libraries for audio I/O:
+   - Debian/Ubuntu: `sudo apt-get install libportaudio2 libsndfile1`
+   - Fedora/RHEL: `sudo dnf install portaudio libsndfile`
+ - Optional for MP3: ffmpeg (`sudo apt-get install ffmpeg` or `brew install ffmpeg`).
+
+ ## Usage
+ - Start interactive recording and transcribe:
+   - `s2t`
+ - Short options:
+   - Language: `-l de` (long: `--lang de`)
+   - Model: `-m large-v3` (long: `--model large-v3`)
+   - Sample rate: `-r 48000` (long: `--rate 48000`)
+   - Channels: `-c 2` (long: `--channels 2`)
+   - Output dir: `-o transcripts` (long: `--outdir transcripts`) — default is `transcripts/` if omitted
+   - Translate to English: `-t` (long: `--translate`). You may still provide `--lang` as an input-language hint.
+   - List available models and exit: `-L` (long: `--list-models`)
+   - Recording format: `-f flac|wav|mp3` (long: `--recording-format`), default `flac`. MP3 requires ffmpeg; if absent, it falls back to FLAC with a warning.
+   - Prompt mode (spoken prompt): `-p` (long: `--prompt`). Speak your prompt first, then press SPACE to use it as prompt and continue with your main content. If you press ENTER instead of SPACE, no prompt is used; the spoken audio is transcribed as normal payload and the session ends.
+   - Keep chunk files: `--keep-chunks` — by default, per‑chunk audio and per‑chunk Whisper outputs are deleted after the final merge.
+   - Open transcript for editing: `-e` (long: `--edit`) — opens the generated `.txt` in your shell editor (`$VISUAL`/`$EDITOR`).
+ - Examples:
+   - Transcribe in German using large-v3: `s2t -l de -m large-v3`
+   - Translate any input to English: `s2t -t`
+   - Write outputs under transcripts/: `s2t -o transcripts`
+   - List local model names: `s2t -L`
+
+ Outputs are written into a timestamped folder under the chosen output directory (default is `transcripts/`), e.g. `transcripts/2025-01-31T14-22-05+0200/`, containing:
+ - Per‑chunk outputs: `chunk_####.flac/.wav` plus `chunk_####.txt/.srt/.vtt/.tsv/.json` (deleted by default unless `--keep-chunks`)
+ - Final outputs: `recording.flac/.wav` (and `recording.mp3` if requested and ffmpeg available), plus `recording.txt/.srt/.vtt/.tsv/.json`
+ - Clipboard mirrors the combined `.txt` with blank lines between chunks.
+
+ ## Makefile (optional)
+ - Setup venv + dev deps: `make setup`
+ - Lint/format/test: `make lint`, `make format`, `make test`; combined gate: `make check`
+ - Build sdist/wheel: `make build` (runs `check` first)
+ - Publish to PyPI/TestPyPI: `make publish`, `make publish-test` (run after `build`)
+ - Run CLI: `make record ARGS='-l de -t -o transcripts'`
+ - List models: `make list-models`
+ - Show package version: `make version`
+
+ Notes on models
+ - The local openai-whisper package provides models such as `tiny`, `base`, `small`, `medium`, `large-v1`, `large-v2`, `large-v3` and their `.en` variants.
+ - Recent openai-whisper releases also provide `turbo` (an alias for `large-v3-turbo`), which is this tool's default model. On older releases `-m turbo` fails; run `s2t -L` to see which names your installation supports.
+
+ ## Development & Release
+ - For developer setup and contribution guidelines, see `CONTRIBUTING.md`.
+ - For the release process, see `docs/RELEASING.md`.
s2t-0.1.0.post1.dev2.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+ s2t/__init__.py,sha256=wV4E9i-7KrUn1dOtLUQB3ZGEKx9gRWH3hPHlpw-ZdWc,332
+ s2t/cli.py,sha256=5Z0YxLPwvfV8wrU-vN1s1HzzOLmA0HYi5uVf6brUtQQ,15786
+ s2t/config.py,sha256=mzz6ljGEupNDAzlUwf5kvl0iKqO8WZ4TWsU4nSVtp0M,409
+ s2t/outputs.py,sha256=Lo8VcARZ7QPuuQQNu8myD5J4c4NO1Rs0L1DLnzLe9tM,1546
+ s2t/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+ s2t/recorder.py,sha256=uBD9mYf-uUCkRJw8fQitVnDrX6PwRNXJycyY4dBfXL0,8076
+ s2t/types.py,sha256=BuMyWuueS7EZbk7I_CkIWSb69Yi6g9-wr7CZLAZKflw,242
+ s2t/utils.py,sha256=YU6YhiuONmqhrKte4DY5tiC5PP-yFExJMMBzFUiA8qA,3416
+ s2t/whisper_engine.py,sha256=s9NBPtyptdhKauKQB4moq2SeGDQp2z7qc13e8C00SxY,5075
+ s2t-0.1.0.post1.dev2.dist-info/METADATA,sha256=c-7jrltbRiLjW0ixPZwgf49L8Ar7p7N5Dc7b0QO_pUo,4568
+ s2t-0.1.0.post1.dev2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ s2t-0.1.0.post1.dev2.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
+ s2t-0.1.0.post1.dev2.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
+ s2t-0.1.0.post1.dev2.dist-info/RECORD,,
s2t-0.1.0.post1.dev2.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
s2t-0.1.0.post1.dev2.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ s2t = s2t.cli:main
s2t-0.1.0.post1.dev2.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ s2t