s2t 0.1.0.post1.dev2__py3-none-any.whl
- s2t/__init__.py +13 -0
- s2t/cli.py +420 -0
- s2t/config.py +22 -0
- s2t/outputs.py +49 -0
- s2t/py.typed +1 -0
- s2t/recorder.py +205 -0
- s2t/types.py +14 -0
- s2t/utils.py +109 -0
- s2t/whisper_engine.py +139 -0
- s2t-0.1.0.post1.dev2.dist-info/METADATA +85 -0
- s2t-0.1.0.post1.dev2.dist-info/RECORD +14 -0
- s2t-0.1.0.post1.dev2.dist-info/WHEEL +5 -0
- s2t-0.1.0.post1.dev2.dist-info/entry_points.txt +2 -0
- s2t-0.1.0.post1.dev2.dist-info/top_level.txt +1 -0
s2t/__init__.py
ADDED
@@ -0,0 +1,13 @@
from importlib.metadata import PackageNotFoundError, version

__all__ = ["__version__"]

try:
    __version__ = version("s2t")
except PackageNotFoundError:
    try:
        from setuptools_scm import get_version

        __version__ = get_version(root="..", relative_to=__file__)
    except Exception:
        __version__ = "0.0.0"
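For reference, the resolution order above is: installed distribution metadata, then a setuptools_scm checkout, then the "0.0.0" sentinel. A minimal check of which branch applied (hedged sketch; assumes `s2t` is importable):

    import s2t
    # "0.1.0.post1.dev2" when installed from this wheel; "0.0.0" only if neither
    # importlib.metadata nor setuptools_scm could resolve a version
    print(s2t.__version__)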
s2t/cli.py
ADDED
@@ -0,0 +1,420 @@
#!/usr/bin/env python3
"""
Interactive microphone recording -> Whisper transcription -> outputs + clipboard

Features
- Records from the default microphone until you press Enter.
- Default recording format is FLAC (lossless); WAV and MP3 are supported. MP3 requires ffmpeg; otherwise it falls back to FLAC with a warning.
- Uses Whisper's Python API (no subprocess) to transcribe/translate and emits txt, srt, vtt, tsv, json.
- Copies the .txt transcript to the system clipboard.
- Creates a per-session subdirectory under a base output directory, named with an ISO timestamp (e.g., 2025-01-31T14-22-05+0200).

Requirements
- Python packages: sounddevice, soundfile, openai-whisper (pip install sounddevice soundfile openai-whisper)
- Optional: ffmpeg (only needed for MP3 or if Whisper loads audio by path for MP3)

Usage
    s2t
Optional
    s2t -l de -m turbo -o transcripts -t -f flac

Notes
- Default output directory is `transcripts/` if `-o/--outdir` is omitted.
- In prompt mode (`-p/--prompt`), speak your prompt first, then press SPACE. The app waits until the prompt is transcribed, prints a separator, and then you start speaking your main content. You may also press ENTER instead of SPACE to finish after the prompt; in that case the session ends after transcribing the prompt.
"""

from __future__ import annotations

import argparse
import json
import logging
import queue
import re
import shutil
import sys
import threading
import time
from pathlib import Path

from . import __version__
from .config import SessionOptions
from .outputs import concat_audio, write_final_outputs
from .recorder import Recorder
from .types import TranscriptionResult
from .utils import (
    convert_wav_to_mp3,
    copy_to_clipboard,
    make_session_dir,
    open_in_shell_editor,
)
from .whisper_engine import WhisperEngine


def run_session(opts: SessionOptions) -> int:
    session_dir = make_session_dir(opts.outdir)
    profile_data: dict = {}
    requested = opts.recording_format.lower()
    effective = requested
    if requested == "mp3" and shutil.which("ffmpeg") is None:
        logging.warning("ffmpeg not found; falling back to FLAC recording instead of MP3.")
        effective = "flac"
    ext = ".flac" if effective == "flac" else ".wav"

    engine = WhisperEngine(
        model_name=opts.model,
        translate=opts.translate,
        language=opts.lang,
        native_segmentation=opts.native_segmentation,
        session_dir=session_dir,
        samplerate=opts.rate,
        channels=opts.channels,
        verbose=opts.verbose,
        profile=profile_data if opts.profile else {},
    )
    ex, fut = engine.preload()

    tx_q: queue.Queue[tuple[int, Path, int, float]] = queue.Queue()
    cumulative_text = ""
    next_to_emit = 1
    pending: dict[int, str] = {}
    results: list[TranscriptionResult] = []
    offsets: list[float] = []
    agg_lock = threading.Lock()
    tx_done = threading.Event()

    def _build_latest_ready_prompt(
        current_index: int, finished: dict[int, str], max_chars: int = 800, max_chunks: int = 3
    ) -> str | None:
        parts: list[str] = []
        total = 0
        taken_chunks = 0
        # Walk backward from previous indices
        for idx in range(current_index - 1, 0, -1):
            if idx not in finished:
                continue
            text = finished[idx].strip()
            if not text:
                continue
            # Split into sentences (simple heuristic: ., !, ? followed by whitespace or end)
            sentences = re.split(r"(?<=[.!?])[\s\n]+", text)
            # Take completed sentences from the end
            for s in reversed(sentences):
                s = s.strip()
                if not s:
                    continue
                # Ensure it looks like a completed sentence
                # Use triple-quoted raw string to safely include quotes in the class
                if not re.search(r"""[.!?][\)\]\}"']*$|[.!?]$""", s):
                    # skip likely incomplete trailing fragment
                    continue
                sep = 1 if parts else 0
                if total + len(s) + sep > max_chars:
                    return (" ".join(reversed(parts))) or None
                parts.append(s)
                total += len(s) + sep
            # Chunks are counted, not sentences; stop once enough chunks contributed
            taken_chunks += 1
            if taken_chunks >= max_chunks or total >= max_chars:
                break
        return (" ".join(reversed(parts))) or None

    # Event signaling that prompt (chunk #1) is fully transcribed
    prompt_done = threading.Event()
    # Resume event used to pause recording between prompt and payload
    # (bound before tx_worker starts so the closure never sees an unbound name)
    prompt_resume_event = threading.Event() if opts.prompt else None

    def tx_worker():
        model = engine.resolve_model(fut)
        nonlocal cumulative_text, next_to_emit
        finished_texts: dict[int, str] = {}
        while True:
            idx, path, frames, offset = tx_q.get()
            if idx == -1:
                break
            # If in spoken-prompt mode, ensure we don't process payload chunks before prompt is done
            if opts.prompt and idx > 1 and not prompt_done.is_set():
                prompt_done.wait()
            # Build latest-ready prompt based on already finished chunks
            prompt = _build_latest_ready_prompt(idx, finished_texts)
            res = engine.transcribe_chunk(model, path, frames, initial_prompt=prompt)
            engine.write_chunk_outputs(res, path)
            text_i = (res.get("text", "") or "").strip()
            with agg_lock:
                if text_i:
                    finished_texts[idx] = text_i
                results.append(res)
                offsets.append(offset)
                pending[idx] = text_i
                while next_to_emit in pending:
                    out = pending.pop(next_to_emit)
                    if out:
                        print(out)
                        print("")
                        cumulative_text += out if not cumulative_text else ("\n\n" + out)
                        try:
                            copy_to_clipboard(cumulative_text)
                        except Exception:
                            pass
                    next_to_emit += 1
            # If this was the prompt chunk, signal readiness and instruct user
            if opts.prompt and idx == 1 and not prompt_done.is_set():
                prompt_done.set()
                print("=" * 60)
                print("Prompt transcribed. Start speaking your main content now.")
                print("=" * 60)
                # Allow recorder to resume writing the next chunk
                if prompt_resume_event is not None:
                    prompt_resume_event.set()
        tx_done.set()

    tx_t = threading.Thread(target=tx_worker, daemon=True)
    tx_t.start()

    if opts.prompt:
        print("Prompt mode enabled: Speak your prompt first, then press SPACE.")
        print("Recording will wait for the prompt transcription before starting payload.")
    rec = Recorder(
        session_dir,
        opts.rate,
        opts.channels,
        ext,
        debounce_ms=opts.debounce_ms,
        verbose=opts.verbose,
        pause_after_first_chunk=opts.prompt,
        resume_event=prompt_resume_event,
    )
    t0 = time.perf_counter()
    chunk_paths, chunk_frames, chunk_offsets = rec.run(tx_q)
    t1 = time.perf_counter()
    if opts.profile:
        profile_data["recording_sec"] = t1 - t0
    tx_t.join()

    merged: TranscriptionResult = engine.merge_results(results, chunk_offsets, cumulative_text)
    base_audio_path = session_dir / f"recording{ext}"
    txt_path = write_final_outputs(merged, session_dir, base_audio_path)

    try:
        if chunk_paths:
            concat_audio(chunk_paths, base_audio_path, opts.rate, opts.channels)
            if opts.verbose:
                print(f"Merged audio written: {base_audio_path.name}", file=sys.stderr)
            if requested == "mp3" and shutil.which("ffmpeg") is not None:
                mp3_out = session_dir / "recording.mp3"
                convert_wav_to_mp3(base_audio_path, mp3_out)
                if opts.verbose:
                    print(f"Converted merged audio to MP3: {mp3_out.name}", file=sys.stderr)
    except Exception as e:
        if opts.verbose:
            print(f"Warning: failed to merge chunk audio: {e}", file=sys.stderr)

    # Optionally delete chunk files (audio + per-chunk outputs)
    if chunk_paths and not opts.keep_chunks:
        for p in chunk_paths:
            try:
                p.unlink(missing_ok=True)
            except Exception:
                pass
            stem = p.with_suffix("")
            for suf in (".txt", ".srt", ".vtt", ".tsv", ".json"):
                try:
                    (stem.with_suffix(suf)).unlink(missing_ok=True)
                except Exception:
                    pass

    text_final: str = merged.get("text") or cumulative_text
    t_cb0 = time.perf_counter()
    copy_to_clipboard(text_final)
    t_cb1 = time.perf_counter()
    if opts.profile:
        profile_data["clipboard_sec"] = t_cb1 - t_cb0

    print("—" * 60)
    print(f"Done. Files in folder: {session_dir}")
    print("Created:")
    if chunk_paths:
        print(f" - chunks: {chunk_paths[0].name} … {chunk_paths[-1].name} (x{len(chunk_paths)})")
    print(" - Whisper outputs: .txt, .srt, .vtt, .tsv, .json")
    print(f"Copied TXT to clipboard: {txt_path.name}")

    if opts.edit:
        opened, used = open_in_shell_editor(txt_path)
        if opened:
            print("—" * 60)
            print(f"Opened transcript in editor: {used or '$VISUAL/$EDITOR'}")
        else:
            print("—" * 60)
            print(
                "Could not open an editor from $VISUAL/$EDITOR or fallbacks; printing transcript instead:"
            )
            print(text_final.rstrip("\n"))
    else:
        print("—" * 60)
        print("Transcript (clipboard text):")
        print(text_final.rstrip("\n"))

    if opts.profile:
        try:
            prof_path = session_dir / "profile.json"
            prof_json = {**profile_data}
            prof_json["total_sec"] = prof_json.get("total_sec", (time.perf_counter() - t0))
            prof_path.write_text(json.dumps(prof_json, indent=2), encoding="utf-8")
            print("—" * 60)
            print("Profiling summary (seconds):")
            for key in (
                "recording_sec",
                "model_load_sec",
                "transcribe_sec",
                "clipboard_sec",
                "total_sec",
            ):
                if key in prof_json:
                    print(f" {key}: {prof_json[key]:.3f}")
            print(f"Saved profiling JSON: {prof_path}")
        except Exception as e:
            print(f"Warning: failed to write profiling JSON: {e}")
    return 0


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(
        description="Record speech, transcribe with Whisper, emit outputs, and copy .txt to clipboard."
    )
    parser.add_argument(
        "-V",
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
        help="Show program's version number and exit",
    )
    parser.add_argument(
        "-l",
        "--lang",
        help="Whisper language (e.g., 'de' or 'en'); auto-detect if omitted",
        default=None,
    )
    parser.add_argument(
        "-r", "--rate", type=int, default=44100, help="Sample rate (default: 44100)"
    )
    parser.add_argument(
        "-c", "--channels", type=int, default=1, help="Channels (1=mono, 2=stereo; default: 1)"
    )
    parser.add_argument(
        "-m",
        "--model",
        default="turbo",
        help="Whisper model (e.g., turbo, base, small, medium, large-v2)",
    )
    parser.add_argument(
        "-f",
        "--recording-format",
        choices=["flac", "wav", "mp3"],
        default="flac",
        help="Audio container for the recording (default: flac)",
    )
    parser.add_argument(
        "-o",
        "--outdir",
        default=None,
        help="Base output directory for timestamped sessions (default: transcripts)",
    )
    parser.add_argument(
        "-t",
        "--translate",
        action="store_true",
        help="Translate to English instead of transcribing in source language",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Print details about the Whisper invocation",
    )
    parser.add_argument(
        "-L",
        "--list-models",
        action="store_true",
        help="List available Whisper model names and exit",
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Collect and print timing information; also writes profile.json to the session folder",
    )
    parser.add_argument(
        "--debounce-ms",
        type=int,
        default=0,
        help="Debounce window for SPACE (ms). If >0, ignores rapid successive space presses",
    )
    parser.add_argument(
        "--native-segmentation",
        action="store_true",
        help="Use Whisper's native segmentation inside chunks (default collapses each chunk to a single phrase)",
    )
    parser.add_argument(
        "-p",
        "--prompt",
        action="store_true",
        help="Spoken prompt mode: speak your prompt, then press SPACE to use it as prompt and continue with payload; if you press ENTER instead, no prompt is used and the spoken audio is transcribed as normal payload before ending",
    )
    parser.add_argument(
        "--keep-chunks",
        action="store_true",
        help="Keep per-chunk audio and outputs (default: delete after final merge)",
    )
    parser.add_argument(
        "-e",
        "--edit",
        action="store_true",
        help="Open the transcript (.txt) in the shell editor ($VISUAL/$EDITOR) instead of printing to stdout",
    )
    args = parser.parse_args(argv)

    try:
        if args.list_models:
            try:
                import whisper

                models = sorted(whisper.available_models())
                print("Available models:")
                for m in models:
                    print(f" - {m}")
                return 0
            except Exception as e:
                print(f"Error listing models: {e}", file=sys.stderr)
                return 1
        logging.basicConfig(
            level=(logging.INFO if args.verbose else logging.WARNING),
            format="%(levelname)s: %(message)s",
        )
        # Default outdir to 'transcripts' if not provided
        opts = SessionOptions(
            outdir=Path(args.outdir) if args.outdir else Path("transcripts"),
            rate=args.rate,
            channels=args.channels,
            recording_format=args.recording_format,
            model=args.model,
            lang=args.lang,
            translate=args.translate,
            native_segmentation=getattr(args, "native_segmentation", False),
            verbose=args.verbose,
            edit=args.edit,
            debounce_ms=getattr(args, "debounce_ms", 0),
            profile=args.profile,
            keep_chunks=getattr(args, "keep_chunks", False),
            prompt=getattr(args, "prompt", False),
        )
        return run_session(opts)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    raise SystemExit(main())
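The `pending`/`next_to_emit` bookkeeping in `run_session` is a small reorder buffer: chunks may finish transcription out of order, but text is emitted strictly by chunk index. The same pattern in isolation (illustrative sketch, not package code):

    import queue

    def emit_in_order(results_q: queue.Queue[tuple[int, str]], total: int) -> None:
        # Results arrive in arbitrary completion order; print them by index.
        pending: dict[int, str] = {}
        next_to_emit = 1
        for _ in range(total):
            idx, text = results_q.get()
            pending[idx] = text
            # Flush every run of consecutive indices that is now complete
            while next_to_emit in pending:
                print(pending.pop(next_to_emit))
                next_to_emit += 1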
s2t/config.py
ADDED
@@ -0,0 +1,22 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path


@dataclass
class SessionOptions:
    outdir: Path | None
    rate: int
    channels: int
    recording_format: str
    model: str
    lang: str | None
    translate: bool
    native_segmentation: bool
    verbose: bool
    edit: bool
    debounce_ms: int
    profile: bool
    keep_chunks: bool
    prompt: bool
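`SessionOptions` is the single argument to `cli.run_session`, so the pipeline can also be driven without argparse. A hedged sketch (field values are illustrative; the call still records interactively):

    from pathlib import Path
    from s2t.cli import run_session
    from s2t.config import SessionOptions

    opts = SessionOptions(
        outdir=Path("transcripts"), rate=44100, channels=1,
        recording_format="flac", model="base", lang="en",
        translate=False, native_segmentation=False, verbose=True,
        edit=False, debounce_ms=0, profile=False, keep_chunks=False,
        prompt=False,
    )
    raise SystemExit(run_session(opts))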
s2t/outputs.py
ADDED
@@ -0,0 +1,49 @@
from __future__ import annotations

from pathlib import Path

from .types import TranscriptionResult


def write_final_outputs(
    merged_result: TranscriptionResult, session_dir: Path, base_audio_path: Path
) -> Path:
    try:
        from whisper.utils import get_writer

        for fmt in ("txt", "srt", "vtt", "tsv", "json"):
            writer = get_writer(fmt, str(session_dir))
            writer(merged_result, str(base_audio_path))
        return session_dir / "recording.txt"
    except Exception as e:
        print(f"Error writing merged outputs: {e}")
        txt_path = session_dir / "recording.txt"
        try:
            txt_path.write_text(merged_result.get("text", ""), encoding="utf-8")
        except Exception:
            pass
        return txt_path


def concat_audio(
    chunk_paths: list[Path],
    out_path: Path,
    samplerate: int,
    channels: int,
) -> None:
    try:
        import soundfile as sf

        fmt = "FLAC" if out_path.suffix.lower() == ".flac" else "WAV"
        with sf.SoundFile(
            str(out_path), mode="w", samplerate=samplerate, channels=channels, format=fmt
        ) as outf:
            for p in chunk_paths:
                with sf.SoundFile(str(p), mode="r") as inf:
                    while True:
                        data = inf.read(frames=16384, dtype="float32")
                        if data.size == 0:
                            break
                        outf.write(data)
    except Exception as e:
        print(f"Warning: failed to merge chunk audio: {e}")
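`concat_audio` streams each chunk through a 16384-frame buffer rather than loading whole files, so memory use stays flat regardless of session length. Usage is straightforward (hedged example; the paths are illustrative):

    from pathlib import Path
    from s2t.outputs import concat_audio

    chunks = [Path("chunk_0001.flac"), Path("chunk_0002.flac")]
    concat_audio(chunks, Path("recording.flac"), samplerate=44100, channels=1)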
s2t/py.typed
ADDED
@@ -0,0 +1 @@
(a single blank line: the PEP 561 marker that the package ships type information)
s2t/recorder.py
ADDED
@@ -0,0 +1,205 @@
from __future__ import annotations

import queue
import select
import sys
import threading
import time
from pathlib import Path
from typing import Any, Protocol, cast, runtime_checkable


class Recorder:
    def __init__(
        self,
        session_dir: Path,
        samplerate: int,
        channels: int,
        ext: str,
        debounce_ms: int = 0,
        verbose: bool = False,
        pause_after_first_chunk: bool = False,
        resume_event: threading.Event | None = None,
    ) -> None:
        self.session_dir = session_dir
        self.samplerate = samplerate
        self.channels = channels
        self.ext = ext
        self.debounce_ms = max(0, int(debounce_ms))
        self.verbose = verbose
        self.pause_after_first_chunk = pause_after_first_chunk
        self.resume_event = resume_event
        self._paused = False

    def run(
        self,
        tx_queue: queue.Queue[tuple[int, Path, int, float]],
    ) -> tuple[list[Path], list[int], list[float]]:
        import platform
        import termios
        import tty

        try:
            import sounddevice as sd
            import soundfile as sf
        except Exception as e:
            raise RuntimeError("sounddevice/soundfile required for recording.") from e

        evt_q: queue.Queue[str] = queue.Queue()
        stop_evt = threading.Event()

        def key_reader() -> None:
            try:
                if platform.system() == "Windows":
                    import msvcrt

                    @runtime_checkable
                    class _MSVCRT(Protocol):
                        def kbhit(self) -> int: ...
                        def getwch(self) -> str: ...

                    ms = cast(_MSVCRT, msvcrt)

                    last_space = 0.0
                    while not stop_evt.is_set():
                        if ms.kbhit():
                            ch = ms.getwch()
                            if ch in ("\r", "\n"):
                                evt_q.put("ENTER")
                                break
                            if ch == " ":
                                now = time.perf_counter()
                                if self.debounce_ms and (now - last_space) < (
                                    self.debounce_ms / 1000.0
                                ):
                                    continue
                                last_space = now
                                evt_q.put("SPACE")
                        time.sleep(0.01)
                else:
                    fd = sys.stdin.fileno()
                    old = termios.tcgetattr(fd)
                    tty.setcbreak(fd)
                    last_space = 0.0
                    try:
                        while not stop_evt.is_set():
                            r, _, _ = select.select([sys.stdin], [], [], 0.05)
                            if r:
                                ch = sys.stdin.read(1)
                                if ch in ("\n", "\r"):
                                    evt_q.put("ENTER")
                                    break
                                if ch == " ":
                                    now = time.perf_counter()
                                    if self.debounce_ms and (now - last_space) < (
                                        self.debounce_ms / 1000.0
                                    ):
                                        continue
                                    last_space = now
                                    evt_q.put("SPACE")
                    finally:
                        termios.tcsetattr(fd, termios.TCSADRAIN, old)
            except Exception:
                pass

        audio_q: queue.Queue[tuple[str, Any]] = queue.Queue(maxsize=128)
        chunk_index = 1
        chunk_paths: list[Path] = []
        chunk_frames: list[int] = []
        chunk_offsets: list[float] = []
        offset_seconds_total = 0.0

        def writer_fn() -> None:
            nonlocal chunk_index, offset_seconds_total
            frames_written = 0
            cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
            fh = sf.SoundFile(
                str(cur_path), mode="w", samplerate=self.samplerate, channels=self.channels
            )
            while True:
                kind, payload = audio_q.get()
                if kind == "frames":
                    data = payload
                    fh.write(data)
                    frames_written += len(data)
                elif kind == "split":
                    fh.flush()
                    fh.close()
                    if frames_written > 0:
                        dur = frames_written / float(self.samplerate)
                        chunk_paths.append(cur_path)
                        chunk_frames.append(frames_written)
                        chunk_offsets.append(offset_seconds_total)
                        offset_seconds_total += dur
                        if self.verbose:
                            print(f"Saved chunk: {cur_path.name} ({dur:.2f}s)", file=sys.stderr)
                        tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1]))
                    else:
                        # Nothing was recorded into this chunk; drop the empty file
                        try:
                            cur_path.unlink(missing_ok=True)
                        except Exception:
                            pass
                    frames_written = 0
                    chunk_index += 1
                    if (
                        self.pause_after_first_chunk
                        and chunk_index == 2
                        and self.resume_event is not None
                    ):
                        # Spoken-prompt mode: hold the payload chunk until the prompt
                        # transcription has finished and the CLI signals resume
                        self._paused = True
                        self.resume_event.wait()
                        self._paused = False
                    cur_path = self.session_dir / f"chunk_{chunk_index:04d}{self.ext}"
                    fh = sf.SoundFile(
                        str(cur_path), mode="w", samplerate=self.samplerate, channels=self.channels
                    )
                elif kind == "finish":
                    fh.flush()
                    fh.close()
                    if frames_written > 0:
                        dur = frames_written / float(self.samplerate)
                        chunk_paths.append(cur_path)
                        chunk_frames.append(frames_written)
                        chunk_offsets.append(offset_seconds_total)
                        offset_seconds_total += dur
                        if self.verbose:
                            print(f"Saved chunk: {cur_path.name} ({dur:.2f}s)", file=sys.stderr)
                        tx_queue.put((chunk_index, cur_path, frames_written, chunk_offsets[-1]))
                    else:
                        try:
                            cur_path.unlink(missing_ok=True)
                        except Exception:
                            pass
                    break
            # Sentinel: tells the transcription worker there are no more chunks
            tx_queue.put((-1, Path(), 0, 0.0))

        def cb(indata: Any, frames: int, time_info: Any, status: Any) -> None:
            if status:
                print(status, file=sys.stderr)
            if not self._paused:
                audio_q.put(("frames", indata.copy()))

        key_t = threading.Thread(target=key_reader, daemon=True)
        writer_t = threading.Thread(target=writer_fn, daemon=True)
        key_t.start()
        writer_t.start()

        print("Recording… Press SPACE to split, Enter to finish.")
        print("—" * 60)
        print("")

        with sd.InputStream(samplerate=self.samplerate, channels=self.channels, callback=cb):
            while True:
                try:
                    evt = evt_q.get(timeout=0.05)
                except queue.Empty:
                    continue
                if evt == "SPACE":
                    audio_q.put(("split", None))
                elif evt == "ENTER":
                    audio_q.put(("finish", None))
                    break
        writer_t.join()
        return chunk_paths, chunk_frames, chunk_offsets
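`Recorder.run` and its consumer communicate only through the queue: each saved chunk is published as `(index, path, frames, offset_seconds)`, and `(-1, ...)` is the end-of-stream sentinel. A minimal consumer mirroring that contract (sketch, not package code):

    import queue
    import threading
    from pathlib import Path

    tx_q: queue.Queue[tuple[int, Path, int, float]] = queue.Queue()

    def consume() -> None:
        while True:
            idx, path, frames, offset = tx_q.get()
            if idx == -1:  # sentinel sent after the "finish" event
                break
            print(f"chunk {idx}: {path.name}, {frames} frames, starts at {offset:.2f}s")

    threading.Thread(target=consume, daemon=True).start()
    # rec = Recorder(session_dir, 44100, 1, ".flac"); rec.run(tx_q)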
s2t/types.py
ADDED
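(The hunk body for `s2t/types.py` (+14 lines) is not rendered in this view. From the `SegmentDict` / `TranscriptionResult` usage in `whisper_engine.py` and `outputs.py`, the file is presumably equivalent to this reconstruction, not the verbatim source:)

    from __future__ import annotations

    from typing import TypedDict


    class SegmentDict(TypedDict, total=False):
        start: float
        end: float
        text: str


    class TranscriptionResult(TypedDict):
        text: str
        segments: list[SegmentDict]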
s2t/utils.py
ADDED
@@ -0,0 +1,109 @@
from __future__ import annotations

import os
import platform
import shutil
import subprocess
import sys
from datetime import datetime
from pathlib import Path

import numpy as np


def check_dependency(cmd: str, install_hint: str) -> None:
    if shutil.which(cmd) is None:
        raise RuntimeError(f"Dependency '{cmd}' not found. Hint: {install_hint}")


def convert_wav_to_mp3(wav_path: Path, mp3_path: Path) -> None:
    check_dependency(
        "ffmpeg",
        "macOS: brew install ffmpeg; Linux: apt/yum; Windows: install ffmpeg and add to PATH",
    )
    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(wav_path),
        "-vn",
        "-acodec",
        "libmp3lame",
        "-q:a",
        "2",
        str(mp3_path),
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)


def copy_to_clipboard(text: str) -> None:
    system = platform.system()
    try:
        if system == "Darwin":
            subprocess.run(["pbcopy"], input=text, text=True, check=True)
            return
        if system == "Windows":
            subprocess.run(["clip"], input=text, text=True, check=True)
            return
        if shutil.which("xclip"):
            subprocess.run(["xclip", "-selection", "clipboard"], input=text, text=True, check=True)
            return
        if shutil.which("xsel"):
            subprocess.run(["xsel", "--clipboard", "--input"], input=text, text=True, check=True)
            return
        try:
            import pyperclip

            pyperclip.copy(text)
            return
        except Exception:
            pass
    except Exception as e:
        print(f"Copy to clipboard failed: {e}", file=sys.stderr)
        return
    print("No clipboard tool found (pbcopy/clip/xclip/xsel). Optional: pip install pyperclip.")


def open_in_shell_editor(file_path: Path) -> tuple[bool, str]:
    env_editor = os.environ.get("VISUAL") or os.environ.get("EDITOR")
    candidates: list[list[str]] = []
    if env_editor:
        import shlex as _shlex

        try:
            candidates.append(_shlex.split(env_editor))
        except Exception:
            candidates.append([env_editor])
    candidates += [["vim"], ["nvim"], ["nano"], ["micro"], ["notepad"]]
    for argv in candidates:
        exe = argv[0]
        if shutil.which(exe) is None:
            continue
        try:
            subprocess.run(argv + [str(file_path)], check=True)
            return True, " ".join(argv)
        except Exception:
            continue
    return False, ""


def make_session_dir(base_dir: Path | None = None) -> Path:
    ts = datetime.now().astimezone().strftime("%Y-%m-%dT%H-%M-%S%z")
    base = Path(base_dir) if base_dir is not None else Path.cwd()
    base.mkdir(parents=True, exist_ok=True)
    session = base / ts
    session.mkdir(parents=True, exist_ok=False)
    return session


def resample_linear(x: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
    if src_sr == dst_sr:
        return x.astype(np.float32, copy=False)
    x = x.astype(np.float32, copy=False)
    n_src = x.shape[0]
    n_dst = int(round(n_src * (dst_sr / float(src_sr))))
    if n_src == 0 or n_dst == 0:
        return np.zeros(n_dst, dtype=np.float32)
    src_t = np.linspace(0.0, 1.0, num=n_src, endpoint=False)
    dst_t = np.linspace(0.0, 1.0, num=n_dst, endpoint=False)
    return np.interp(dst_t, src_t, x).astype(np.float32)
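`resample_linear` maps both signals onto a common [0, 1) time axis and linearly interpolates, which is adequate for speech but applies no anti-aliasing when downsampling. A quick worked check (hedged example):

    import numpy as np
    from s2t.utils import resample_linear

    x = np.array([0.0, 1.0, 0.0, -1.0], dtype=np.float32)  # 4 samples at 8 kHz
    y = resample_linear(x, src_sr=8000, dst_sr=16000)      # -> 8 samples at 16 kHz
    assert y.shape == (8,) and y.dtype == np.float32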
s2t/whisper_engine.py
ADDED
@@ -0,0 +1,139 @@
from __future__ import annotations

import time
from concurrent.futures import Future, ThreadPoolExecutor
from pathlib import Path
from typing import Any

from .types import SegmentDict, TranscriptionResult


class WhisperEngine:
    def __init__(
        self,
        model_name: str,
        translate: bool,
        language: str | None,
        native_segmentation: bool,
        session_dir: Path,
        samplerate: int,
        channels: int,
        verbose: bool = False,
        profile: dict | None = None,
    ) -> None:
        self.model_name = model_name
        self.translate = translate
        self.language = language
        self.native_segmentation = native_segmentation
        self.session_dir = session_dir
        self.samplerate = samplerate
        self.channels = channels
        self.verbose = verbose
        self.profile = profile or {}
        self._executor: ThreadPoolExecutor | None = None

    def preload(self) -> tuple[ThreadPoolExecutor | None, Future | None]:
        try:
            self._executor = ThreadPoolExecutor(max_workers=1)

            def _load(name: str):
                import whisper

                t0 = time.perf_counter()
                m = whisper.load_model(name)
                t1 = time.perf_counter()
                return m, (t1 - t0)

            fut = self._executor.submit(_load, self.model_name)
            return self._executor, fut
        except Exception:
            return None, None

    def resolve_model(self, fut: Future | None):
        import whisper

        model = None
        if fut is not None:
            try:
                model, load_dur = fut.result()
                self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + float(
                    load_dur
                )
            except Exception:
                model = None
        if model is None:
            t0m = time.perf_counter()
            model = whisper.load_model(self.model_name)
            t1m = time.perf_counter()
            self.profile["model_load_sec"] = self.profile.get("model_load_sec", 0.0) + (t1m - t0m)
        return model

    def transcribe_chunk(
        self,
        model,
        audio_path: Path,
        frames: int,
        initial_prompt: str | None = None,
    ) -> TranscriptionResult:
        task = "translate" if self.translate else "transcribe"
        t0 = time.perf_counter()
        res: dict[str, Any] = model.transcribe(
            str(audio_path),
            task=task,
            language=self.language,
            fp16=False,
            initial_prompt=initial_prompt,
        )
        t1 = time.perf_counter()
        self.profile["transcribe_sec"] = self.profile.get("transcribe_sec", 0.0) + (t1 - t0)
        text_c = str(res.get("text", "") or "").strip()
        if self.native_segmentation:
            segs_raw = res.get("segments", []) or []
            segs_typed: list[SegmentDict] = []
            for s in segs_raw:
                try:
                    start = float(s.get("start", 0.0))
                    end = float(s.get("end", 0.0))
                    text = str(s.get("text", "") or "")
                    segs_typed.append({"start": start, "end": end, "text": text})
                except Exception:
                    continue
            return {"text": text_c, "segments": segs_typed}
        # Collapsed single segment per chunk
        segs_raw = res.get("segments", []) or []
        start = float(segs_raw[0].get("start", 0.0)) if segs_raw else 0.0
        end = float(segs_raw[-1].get("end", 0.0)) if segs_raw else (frames / float(self.samplerate))
        return {
            "text": text_c,
            "segments": ([{"start": start, "end": end, "text": text_c}] if text_c else []),
        }

    def write_chunk_outputs(self, result: TranscriptionResult, audio_path: Path) -> None:
        try:
            from whisper.utils import get_writer

            for fmt in ("txt", "srt", "vtt", "tsv", "json"):
                writer = get_writer(fmt, str(self.session_dir))
                writer(result, str(audio_path))
        except Exception as e:
            if self.verbose:
                print(f"Warning: failed to write chunk outputs for {audio_path.name}: {e}")

    def merge_results(
        self, results: list[TranscriptionResult], offsets: list[float], cumulative_text: str
    ) -> TranscriptionResult:
        merged: TranscriptionResult = {"text": "", "segments": []}
        for res, off in zip(results, offsets, strict=False):
            merged["text"] += res.get("text") or ""
            for s in res.get("segments", []):
                s2: SegmentDict = {}
                if "start" in s:
                    s2["start"] = float(s["start"]) + off
                if "end" in s:
                    s2["end"] = float(s["end"]) + off
                if "text" in s:
                    s2["text"] = s["text"]
                merged["segments"].append(s2)
        if (cumulative_text or "").strip():
            merged["text"] = cumulative_text
        return merged
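The `preload`/`resolve_model` pair hides model-load latency behind the recording phase: loading starts on a single-worker executor as soon as the session begins, and the transcription worker blocks on the Future only when the first chunk arrives. The pattern in isolation (sketch; `load()` stands in for `whisper.load_model`):

    import time
    from concurrent.futures import ThreadPoolExecutor

    def load() -> str:
        time.sleep(2.0)  # stand-in for the slow whisper.load_model(name) call
        return "model"

    ex = ThreadPoolExecutor(max_workers=1)
    fut = ex.submit(load)    # starts loading immediately, in the background
    # ... recording proceeds here ...
    model = fut.result()     # blocks only if the load has not finished yet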
s2t-0.1.0.post1.dev2.dist-info/METADATA
ADDED
@@ -0,0 +1,85 @@
Metadata-Version: 2.4
Name: s2t
Version: 0.1.0.post1.dev2
Summary: Speech to Text (s2t): Record audio, run Whisper, export formats, and copy transcript to clipboard.
Author: Maintainers
License-Expression: LicenseRef-Proprietary
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.11
Classifier: Environment :: Console
Classifier: Operating System :: OS Independent
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: sounddevice>=0.4.6
Requires-Dist: soundfile>=0.12.1
Requires-Dist: numpy>=1.23
Requires-Dist: openai-whisper>=20231117
Provides-Extra: dev
Requires-Dist: pytest>=7; extra == "dev"
Requires-Dist: pytest-cov>=4; extra == "dev"
Requires-Dist: ruff>=0.4; extra == "dev"
Requires-Dist: mypy>=1.7; extra == "dev"
Requires-Dist: build>=1; extra == "dev"
Requires-Dist: setuptools-scm>=8; extra == "dev"
Requires-Dist: twine>=4; extra == "dev"

# s2t

Record audio from your microphone, run Whisper to transcribe it, export common formats, and copy the .txt transcript to your clipboard.

## Install
- From local checkout:
  - Editable: `pip install -e .`
  - Standard: `pip install .`

Requirements: Python 3.11+. No mandatory external binaries. ffmpeg is optional (only for MP3 encoding/decoding).

System requirements (Linux)
- Some environments need system libraries for audio I/O:
  - Debian/Ubuntu: `sudo apt-get install libportaudio2 libsndfile1`
  - Fedora/RHEL: `sudo dnf install portaudio libsndfile`
- Optional for MP3: ffmpeg (`sudo apt-get install ffmpeg` or `brew install ffmpeg`).

## Usage
- Start interactive recording and transcribe:
  - `s2t`
- Short options:
  - Language: `-l de` (long: `--lang de`)
  - Model: `-m large-v3` (long: `--model large-v3`)
  - Sample rate: `-r 48000` (long: `--rate 48000`)
  - Channels: `-c 2` (long: `--channels 2`)
  - Output dir: `-o transcripts` (long: `--outdir transcripts`) — default is `transcripts/` if omitted
  - Translate to English: `-t` (long: `--translate`). You may still provide `--lang` as an input-language hint.
  - List available models and exit: `-L` (long: `--list-models`)
  - Recording format: `-f flac|wav|mp3` (long: `--recording-format`), default `flac`. MP3 requires ffmpeg; if absent, it falls back to FLAC with a warning.
  - Prompt mode (spoken prompt): `-p` (long: `--prompt`). Speak your prompt first, then press SPACE to use it as prompt and continue with your main content. If you press ENTER instead of SPACE, no prompt is used; the spoken audio is transcribed as normal payload and the session ends.
  - Keep chunk files: `--keep-chunks` — by default, per‑chunk audio and per‑chunk Whisper outputs are deleted after the final merge.
  - Open transcript for editing: `-e` (long: `--edit`) — opens the generated `.txt` in your shell editor (`$VISUAL`/`$EDITOR`).
- Examples:
  - Transcribe in German using large-v3: `s2t -l de -m large-v3`
  - Translate any input to English: `s2t -t`
  - Write outputs under transcripts/: `s2t -o transcripts`
  - List local model names: `s2t -L`

Outputs are written into a timestamped folder under the chosen output directory (default is `transcripts/`), e.g. `transcripts/2025-01-31T14-22-05+0200/`, containing:
- Per‑chunk outputs: `chunk_####.flac/.wav` plus `chunk_####.txt/.srt/.vtt/.tsv/.json` (deleted by default unless `--keep-chunks`)
- Final outputs: `recording.flac/.wav` (and `recording.mp3` if requested and ffmpeg available), plus `recording.txt/.srt/.vtt/.tsv/.json`
- Clipboard mirrors the combined `.txt` with blank lines between chunks.

## Makefile (optional)
- Setup venv + dev deps: `make setup`
- Lint/format/test: `make lint`, `make format`, `make test`; combined gate: `make check`
- Build sdist/wheel: `make build` (runs `check` first)
- Publish to PyPI/TestPyPI: `make publish`, `make publish-test` (run after `build`)
- Run CLI: `make record ARGS='-l de -t -o transcripts'`
- List models: `make list-models`
- Show package version: `make version`

Notes on models
- The local openai-whisper package supports models such as `tiny`, `base`, `small`, `medium`, `large-v1`, `large-v2`, `large-v3`, and (for tiny/base/small/medium) their `.en` variants.
- The name `turbo` (an alias for `large-v3-turbo`) is only available in newer openai-whisper releases; on older installs `-m turbo` fails, so choose one of the models above instead.

## Development & Release
- For developer setup and contribution guidelines, see `CONTRIBUTING.md`.
- For the release process, see `docs/RELEASING.md`.
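To see which model names the installed openai-whisper actually provides (the same list `s2t -L` prints), you can query the library directly; `turbo` appears only if your version includes it:

    import whisper
    print(sorted(whisper.available_models()))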
s2t-0.1.0.post1.dev2.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
s2t/__init__.py,sha256=wV4E9i-7KrUn1dOtLUQB3ZGEKx9gRWH3hPHlpw-ZdWc,332
s2t/cli.py,sha256=5Z0YxLPwvfV8wrU-vN1s1HzzOLmA0HYi5uVf6brUtQQ,15786
s2t/config.py,sha256=mzz6ljGEupNDAzlUwf5kvl0iKqO8WZ4TWsU4nSVtp0M,409
s2t/outputs.py,sha256=Lo8VcARZ7QPuuQQNu8myD5J4c4NO1Rs0L1DLnzLe9tM,1546
s2t/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
s2t/recorder.py,sha256=uBD9mYf-uUCkRJw8fQitVnDrX6PwRNXJycyY4dBfXL0,8076
s2t/types.py,sha256=BuMyWuueS7EZbk7I_CkIWSb69Yi6g9-wr7CZLAZKflw,242
s2t/utils.py,sha256=YU6YhiuONmqhrKte4DY5tiC5PP-yFExJMMBzFUiA8qA,3416
s2t/whisper_engine.py,sha256=s9NBPtyptdhKauKQB4moq2SeGDQp2z7qc13e8C00SxY,5075
s2t-0.1.0.post1.dev2.dist-info/METADATA,sha256=c-7jrltbRiLjW0ixPZwgf49L8Ar7p7N5Dc7b0QO_pUo,4568
s2t-0.1.0.post1.dev2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
s2t-0.1.0.post1.dev2.dist-info/entry_points.txt,sha256=JISIUlZAJ3DX1dB6zT3X_E3vcXI-eWEQKwHiT35fPKs,37
s2t-0.1.0.post1.dev2.dist-info/top_level.txt,sha256=o8N0JcuHdIrfX3iGHvntHiDC2XgN7__joyNu08ZOh0s,4
s2t-0.1.0.post1.dev2.dist-info/RECORD,,
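(The hunks for `entry_points.txt` (+2 lines) and `WHEEL` (+5 lines) are not rendered in this view; the WHEEL body is left as-is since its generator line is not recoverable. Given the `s2t` console command, `cli.main`, and the 37-byte RECORD entry, the entry points file presumably reads:)

    [console_scripts]
    s2t = s2t.cli:main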
s2t-0.1.0.post1.dev2.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
s2t