speaksy 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
speaksy/core.py ADDED
@@ -0,0 +1,540 @@
1
+ """Core voice typing engine for Speaksy."""
2
+
3
+ import io
4
+ import logging
5
+ import os
6
+ import signal
7
+ import subprocess
8
+ import sys
9
+ import tempfile
10
+ import threading
11
+ import time
12
+ import wave
13
+ from collections import deque
14
+
15
+ import httpx
16
+ import numpy as np
17
+ import sounddevice as sd
18
+ from PIL import Image, ImageDraw
19
+ from pynput import keyboard
20
+
21
+ try:
22
+ import pystray
23
+ HAS_TRAY = True
24
+ except ImportError:
25
+ HAS_TRAY = False
26
+
27
+ log = logging.getLogger("speaksy")
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Audio Recorder
32
+ # ---------------------------------------------------------------------------
33
+
34
+
35
class AudioRecorder:
    """Captures microphone audio with a rolling pre-buffer.

    While idle, incoming chunks are kept in a bounded deque so that a new
    recording can be seeded with the audio captured just before it started.
    """

    def __init__(self, sample_rate=16000, channels=1, pre_buffer_sec=0.5):
        """Configure the recorder without opening the audio device.

        Args:
            sample_rate: Sampling rate in Hz passed to the input stream.
            channels: Number of input channels.
            pre_buffer_sec: Approximate amount of audio (seconds) retained
                while idle; at least one chunk is always kept.
        """
        self.sample_rate = sample_rate
        self.channels = channels
        self.chunk_size = 1024
        pre_buffer_chunks = int(sample_rate * pre_buffer_sec / self.chunk_size)
        self.pre_buffer = deque(maxlen=max(pre_buffer_chunks, 1))
        self.recording_chunks = []
        self.is_recording = False
        self.stream = None
        # Protects the buffers and is_recording, which the audio callback
        # and the start/stop methods both touch.
        self._lock = threading.Lock()

    def open(self):
        """Start the always-on audio input stream for pre-buffering.

        Calling this while a stream is already open is a no-op, so a second
        call cannot orphan the first stream.
        """
        if self.stream is not None:
            return

        stream = sd.InputStream(
            samplerate=self.sample_rate,
            channels=self.channels,
            dtype="int16",
            blocksize=self.chunk_size,
            callback=self._audio_callback,
        )
        try:
            stream.start()
        except Exception:
            # Do not keep a half-initialised stream around.
            stream.close()
            raise
        self.stream = stream

    def _audio_callback(self, indata, frames, time_info, status):
        """Store one incoming chunk in the recording or the pre-buffer."""
        if status:
            # Surface stream problems instead of dropping them silently.
            log.warning("Audio input status: %s", status)

        # Copy because the chunk is kept after this callback returns.
        chunk = indata.copy()
        with self._lock:
            if self.is_recording:
                self.recording_chunks.append(chunk)
            else:
                self.pre_buffer.append(chunk)

    def start_recording(self):
        """Begin capturing audio, including the pre-buffer."""
        with self._lock:
            self.recording_chunks = list(self.pre_buffer)
            self.pre_buffer.clear()
            self.is_recording = True
        log.info("Recording started")

    def stop_recording(self):
        """Stop capturing and return audio as an in-memory WAV BytesIO.

        Returns:
            A ``io.BytesIO`` positioned at offset 0 containing 16-bit WAV
            data, or ``None`` when no chunks were captured.
        """
        with self._lock:
            self.is_recording = False
            chunks = self.recording_chunks
            self.recording_chunks = []

        if not chunks:
            log.warning("No audio captured")
            return None

        audio_data = np.concatenate(chunks, axis=0)
        duration = len(audio_data) / self.sample_rate
        log.info(f"Captured {duration:.1f}s of audio")

        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(2)  # matches the int16 stream dtype
            wf.setframerate(self.sample_rate)
            wf.writeframes(audio_data.tobytes())
        buf.seek(0)
        return buf

    def close(self):
        """Stop and close the input stream if one is open."""
        stream, self.stream = self.stream, None
        if stream is None:
            return
        try:
            stream.stop()
        finally:
            # Always release the device, even if stop() raised.
            stream.close()
105
+
106
+
107
+ # ---------------------------------------------------------------------------
108
+ # Transcribers
109
+ # ---------------------------------------------------------------------------
110
+
111
+
112
class GroqTranscriber:
    """Speech-to-text client for the Groq transcription endpoint."""

    API_URL = "https://api.groq.com/openai/v1/audio/transcriptions"

    def __init__(self, api_key, model="whisper-large-v3-turbo", language=None):
        """Remember the credentials and request options; no I/O happens here."""
        self.api_key, self.model, self.language = api_key, model, language

    def transcribe(self, audio_buf):
        """Upload the WAV buffer and return the stripped transcript text.

        Raises:
            ValueError: If no API key is configured.
        """
        if not self.api_key:
            raise ValueError("No Groq API key configured")

        # Rewind so the whole buffer is uploaded regardless of prior reads.
        audio_buf.seek(0)

        form_fields = {"model": self.model}
        if self.language:
            form_fields["language"] = self.language
        upload = {"file": ("audio.wav", audio_buf, "audio/wav")}
        auth_header = {"Authorization": f"Bearer {self.api_key}"}

        response = httpx.post(
            self.API_URL,
            headers=auth_header,
            files=upload,
            data=form_fields,
            timeout=30.0,
        )
        response.raise_for_status()
        payload = response.json()
        return payload["text"].strip()
141
+
142
+
143
class LocalTranscriber:
    """Transcribe audio locally using faster-whisper."""

    def __init__(self, model_size="base", device="cpu", compute_type="int8"):
        """Store the model options; the model itself is loaded lazily."""
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self._model = None
        # Serialises model loading: preload() may run in a background thread
        # while transcribe() triggers a load from another thread.
        self._load_lock = threading.Lock()

    def preload(self):
        """Load the Whisper model.

        Safe to call repeatedly and from several threads; the model is
        constructed at most once.
        """
        if self._model is not None:
            return
        with self._load_lock:
            # Re-check: another thread may have finished loading while we
            # were waiting for the lock.
            if self._model is not None:
                return
            log.info(f"Loading local Whisper model '{self.model_size}'...")
            # Imported here so the dependency is only needed when the local
            # backend is actually used.
            from faster_whisper import WhisperModel

            self._model = WhisperModel(
                self.model_size, device=self.device, compute_type=self.compute_type
            )
            log.info("Local Whisper model loaded")

    def transcribe(self, audio_buf):
        """Return the transcript of the WAV data in ``audio_buf``.

        The buffer is written to a temporary ``.wav`` file that is removed
        afterwards, whether or not transcription succeeds.
        """
        self.preload()

        audio_buf.seek(0)
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        try:
            # Close the file before handing its path to the model.
            with tmp:
                tmp.write(audio_buf.read())
            segments, _ = self._model.transcribe(tmp.name, vad_filter=True)
            return " ".join(seg.text for seg in segments).strip()
        finally:
            os.unlink(tmp.name)
177
+
178
+
179
def route_transcription(audio_buf, groq, local, config):
    """Transcribe with Groq when preferred and available, else locally.

    Groq is attempted only when it is the configured primary backend and
    has an API key. If that attempt raises, the buffer is rewound and the
    local transcriber handles the audio instead.

    Returns:
        A ``(text, source)`` tuple where ``source`` is "groq" or "local".
    """
    stt_settings = config.get("stt", {})
    wants_groq = stt_settings.get("primary", "groq") == "groq"

    if wants_groq and groq.api_key:
        try:
            transcript = groq.transcribe(audio_buf)
            log.info(f'[Groq] "{transcript}"')
            return transcript, "groq"
        except Exception as e:
            log.warning(f"Groq failed ({e}), falling back to local")
            # The failed upload may have consumed part of the buffer.
            audio_buf.seek(0)

    transcript = local.transcribe(audio_buf)
    log.info(f'[Local] "{transcript}"')
    return transcript, "local"
195
+
196
+
197
+ # ---------------------------------------------------------------------------
198
+ # Text Cleanup (LLM post-processing)
199
+ # ---------------------------------------------------------------------------
200
+
201
# System prompt sent with every cleanup request; one instruction per entry.
CLEANUP_PROMPT = " ".join(
    (
        "Clean up this voice transcription.",
        "Fix grammar, punctuation, and capitalization.",
        "Remove filler words (um, uh, like, you know, so, basically, actually).",
        "Do NOT change the meaning, add new content, or remove meaningful words.",
        "If the text is already clean, return it unchanged.",
        "Return ONLY the cleaned text, nothing else.",
    )
)
208
+
209
+
210
class TextCleaner:
    """Runs transcribed text through a chat-completion model for cleanup."""

    CHAT_URL = "https://api.groq.com/openai/v1/chat/completions"

    def __init__(self, api_key, model="llama-3.1-8b-instant"):
        """Remember the API key and the model used for cleanup requests."""
        self.api_key, self.model = api_key, model

    def clean(self, text):
        """Return the cleaned text, or ``text`` unchanged when cleanup is skipped.

        Cleanup is skipped when there is no API key or no text. Any failure
        of the request, or an empty reply, also yields the original text.
        """
        if not (self.api_key and text):
            return text

        try:
            conversation = [
                {"role": "system", "content": CLEANUP_PROMPT},
                {"role": "user", "content": text},
            ]
            request_body = {
                "model": self.model,
                "messages": conversation,
                "temperature": 0,
                "max_tokens": len(text) * 2,
            }
            response = httpx.post(
                self.CHAT_URL,
                headers={"Authorization": f"Bearer {self.api_key}"},
                json=request_body,
                timeout=10.0,
            )
            response.raise_for_status()
            first_choice = response.json()["choices"][0]
            cleaned = first_choice["message"]["content"].strip()
            if cleaned:
                log.info(f'[Cleanup] "{text}" -> "{cleaned}"')
                return cleaned
        except Exception as e:
            log.warning(f"Text cleanup failed ({e}), using raw transcription")

        return text
247
+
248
+
249
+ # ---------------------------------------------------------------------------
250
+ # Text Injection
251
+ # ---------------------------------------------------------------------------
252
+
253
+
254
def _read_clipboard():
    """Return the current clipboard contents as bytes, or None if unavailable."""
    try:
        result = subprocess.run(
            ["xclip", "-sel", "clip", "-o"],
            capture_output=True,
            timeout=2,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # Best effort only: pasting still works, the old contents are just
        # not restored afterwards.
        log.debug("Could not read clipboard (%s); it will not be restored", e)
        return None
    return result.stdout if result.returncode == 0 else None


def _write_clipboard(data):
    """Replace the clipboard contents with ``data`` (bytes) via xclip."""
    subprocess.run(["xclip", "-sel", "clip"], input=data, timeout=2)


def inject_text(text, restore_clipboard=True):
    """Type text at cursor position via clipboard paste.

    The text is placed on the clipboard with ``xclip`` and pasted with a
    simulated Ctrl+V from ``xdotool``.

    Args:
        text: Text to paste; nothing happens when it is empty or None.
        restore_clipboard: When true, the previous clipboard contents are
            saved first and written back afterwards, even if the paste
            step raises.
    """
    if not text:
        return

    old_clipboard = _read_clipboard() if restore_clipboard else None

    _write_clipboard(text.encode("utf-8"))
    try:
        # Short pauses give the clipboard owner and the target window time
        # to react before and after the paste keystroke.
        time.sleep(0.05)
        subprocess.run(["xdotool", "key", "ctrl+v"], timeout=2)
        time.sleep(0.1)
    finally:
        if old_clipboard is not None:
            time.sleep(0.1)
            _write_clipboard(old_clipboard)
283
+
284
+
285
+ # ---------------------------------------------------------------------------
286
+ # Tray Icon
287
+ # ---------------------------------------------------------------------------
288
+
289
# RGBA fill colour of the tray icon for each engine state.
TRAY_COLORS = dict(
    ready=(76, 175, 80, 255),
    recording=(244, 67, 54, 255),
    processing=(255, 193, 7, 255),
    fallback=(255, 152, 0, 255),
)

# Tooltip title of the tray icon for each engine state.
TRAY_TITLES = {
    state: f"Speaksy - {label}"
    for state, label in (
        ("ready", "Ready"),
        ("recording", "Recording..."),
        ("processing", "Transcribing..."),
        ("fallback", "Local Mode"),
    )
}
302
+
303
+
304
def _make_circle_icon(color, size=64):
    """Return a square RGBA image with a filled circle on a transparent background."""
    margin = 4
    transparent = (0, 0, 0, 0)
    canvas = Image.new("RGBA", (size, size), transparent)
    bounds = [margin, margin, size - margin, size - margin]
    ImageDraw.Draw(canvas).ellipse(bounds, fill=color)
    return canvas
309
+
310
+
311
class TrayManager:
    """Owns the system tray icon that reflects the engine state."""

    def __init__(self, on_quit):
        """Store the quit callback; the icon is created later by start()."""
        self.on_quit = on_quit
        self._icon = None

    def start(self):
        """Build the tray menu and icon, then hand control to the icon's run()."""
        entries = (
            pystray.MenuItem("Speaksy", None, enabled=False),
            pystray.Menu.SEPARATOR,
            pystray.MenuItem("Quit", lambda: self._quit()),
        )
        menu = pystray.Menu(*entries)
        initial = "ready"
        self._icon = pystray.Icon(
            "speaksy",
            _make_circle_icon(TRAY_COLORS[initial]),
            TRAY_TITLES[initial],
            menu,
        )
        self._icon.run()

    def set_state(self, state):
        """Update the icon colour and title for ``state``; no-op before start().

        Unknown states fall back to the "ready" colour and a plain title.
        """
        icon = self._icon
        if not icon:
            return
        color = TRAY_COLORS.get(state, TRAY_COLORS["ready"])
        icon.icon = _make_circle_icon(color)
        icon.title = TRAY_TITLES.get(state, "Speaksy")

    def _quit(self):
        """Stop the icon (if any) and then invoke the quit callback."""
        icon = self._icon
        if icon:
            icon.stop()
        self.on_quit()
343
+
344
+
345
+ # ---------------------------------------------------------------------------
346
+ # Hotkey Manager
347
+ # ---------------------------------------------------------------------------
348
+
349
+
350
def _parse_key(key_str):
    """Convert a configured key name into a pynput key object.

    Names starting with "Key." (e.g. 'Key.ctrl_r', 'Key.f8') are looked up
    as attributes of ``keyboard.Key``; anything else is treated as a
    character key.
    """
    prefix = "Key."
    if not key_str.startswith(prefix):
        return keyboard.KeyCode.from_char(key_str)
    special_name = key_str[len(prefix):]
    return getattr(keyboard.Key, special_name)
356
+
357
+
358
class HotkeyManager:
    """Global hotkey listener for push-to-talk and toggle modes.

    Holding the push-to-talk key records for as long as it is held.
    Pressing the toggle key starts recording and pressing it again stops.
    While toggle mode is active the push-to-talk key does not start or stop
    recording.
    """

    def __init__(self, push_to_talk_key, toggle_key, on_start, on_stop):
        """Parse the key names and store the start/stop callbacks.

        Args:
            push_to_talk_key: Key name such as 'Key.ctrl_r'.
            toggle_key: Key name such as 'Key.f8'.
            on_start: Zero-argument callable invoked to begin recording.
            on_stop: Zero-argument callable invoked to end recording.
        """
        self.ptt_key = _parse_key(push_to_talk_key)
        self.toggle_key = _parse_key(toggle_key)
        self.on_start = on_start
        self.on_stop = on_stop
        self._ptt_held = False
        self._toggle_active = False
        # Tracks whether the toggle key is physically down so repeated press
        # events for one key-down do not flip the toggle again.
        self._toggle_held = False
        self._listener = None

    def start(self):
        """Create and start the keyboard listener as a daemon thread."""
        self._listener = keyboard.Listener(
            on_press=self._on_press,
            on_release=self._on_release,
        )
        self._listener.daemon = True
        self._listener.start()
        log.info(
            f"Hotkeys active: hold {self.ptt_key} (push-to-talk), "
            f"press {self.toggle_key} (toggle)"
        )

    def _on_press(self, key):
        """Handle a key-down event from the listener."""
        if key == self.ptt_key and not self._ptt_held:
            self._ptt_held = True
            if not self._toggle_active:
                self.on_start()
        elif key == self.toggle_key and not self._toggle_held:
            self._toggle_held = True
            if self._toggle_active:
                self._toggle_active = False
                self.on_stop()
            else:
                self._toggle_active = True
                self.on_start()

    def _on_release(self, key):
        """Handle a key-up event from the listener."""
        if key == self.toggle_key:
            # Re-arm the toggle for the next distinct key press.
            self._toggle_held = False
        if key == self.ptt_key and self._ptt_held:
            self._ptt_held = False
            if not self._toggle_active:
                self.on_stop()

    def stop(self):
        """Stop the keyboard listener if it was started."""
        if self._listener:
            self._listener.stop()
404
+
405
+
406
+ # ---------------------------------------------------------------------------
407
+ # Main Engine
408
+ # ---------------------------------------------------------------------------
409
+
410
+
411
class SpeaksyEngine:
    """Orchestrates recording, transcription, and text injection."""

    def __init__(self, config, api_key=None):
        """Build all components from ``config``.

        Args:
            config: Nested settings mapping; the "audio", "stt", "cleanup",
                "tray", "hotkeys" and "text_injection" sections are read,
                each falling back to defaults when absent.
            api_key: Groq API key; when falsy, the GROQ_API_KEY environment
                variable is used instead (empty string if unset).
        """
        self.config = config
        self.api_key = api_key or os.getenv("GROQ_API_KEY", "")
        self._recording = False
        self._transcribing = False
        self._using_fallback = False

        audio_cfg = config.get("audio", {})
        self.recorder = AudioRecorder(
            sample_rate=audio_cfg.get("sample_rate", 16000),
            channels=audio_cfg.get("channels", 1),
            pre_buffer_sec=audio_cfg.get("pre_buffer_seconds", 0.5),
        )

        stt_cfg = config.get("stt", {})
        self.groq = GroqTranscriber(
            api_key=self.api_key,
            model=stt_cfg.get("groq_model", "whisper-large-v3-turbo"),
            language=stt_cfg.get("language"),
        )
        self.local = LocalTranscriber(
            model_size=stt_cfg.get("local_model", "base"),
            device=stt_cfg.get("local_device", "cpu"),
            compute_type=stt_cfg.get("local_compute_type", "int8"),
        )

        cleanup_cfg = config.get("cleanup", {})
        self.cleaner = None
        if cleanup_cfg.get("enabled", True):
            self.cleaner = TextCleaner(
                api_key=self.api_key,
                model=cleanup_cfg.get("model", "llama-3.1-8b-instant"),
            )

        tray_cfg = config.get("tray", {})
        self.tray = None
        # The tray is optional: it needs both the config flag and pystray.
        if tray_cfg.get("enabled", True) and HAS_TRAY:
            self.tray = TrayManager(on_quit=self.shutdown)

        hotkey_cfg = config.get("hotkeys", {})
        self.hotkeys = HotkeyManager(
            push_to_talk_key=hotkey_cfg.get("push_to_talk", "Key.ctrl_r"),
            toggle_key=hotkey_cfg.get("toggle", "Key.f8"),
            on_start=self._on_record_start,
            on_stop=self._on_record_stop,
        )

    def _on_record_start(self):
        """Begin a recording unless one is running or being transcribed."""
        if self._recording or self._transcribing:
            return
        self._recording = True
        self.recorder.start_recording()
        if self.tray:
            self.tray.set_state("recording")

    def _on_record_stop(self):
        """Finish the current recording and transcribe it on a worker thread."""
        if not self._recording:
            return
        self._recording = False
        audio_buf = self.recorder.stop_recording()
        if audio_buf is None:
            if self.tray:
                self.tray.set_state("ready")
            return

        self._transcribing = True
        if self.tray:
            self.tray.set_state("processing")
        # Transcription runs off the hotkey callback so it does not block it.
        threading.Thread(
            target=self._transcribe_and_type, args=(audio_buf,), daemon=True
        ).start()

    def _transcribe_and_type(self, audio_buf):
        """Transcribe ``audio_buf``, optionally clean the text, and paste it.

        Errors are logged rather than raised; the transcribing flag and tray
        state are always reset afterwards.
        """
        try:
            text, source = route_transcription(
                audio_buf, self.groq, self.local, self.config
            )
            self._using_fallback = source == "local"

            if text:
                # Cleanup is skipped after a local transcription.
                if self.cleaner and not self._using_fallback:
                    text = self.cleaner.clean(text)

                text_cfg = self.config.get("text_injection", {})
                inject_text(
                    text,
                    restore_clipboard=text_cfg.get("restore_clipboard", True),
                )
        except Exception as e:
            # Keep the traceback so failures in this worker can be diagnosed.
            log.error(f"Transcription failed: {e}", exc_info=True)
        finally:
            self._transcribing = False
            if self.tray:
                state = "fallback" if self._using_fallback else "ready"
                self.tray.set_state(state)

    def run(self):
        """Start the engine (blocking)."""
        self.recorder.open()

        # Preload local model in background
        threading.Thread(target=self.local.preload, daemon=True).start()

        self.hotkeys.start()

        api_status = "configured" if self.api_key else "NOT SET (local only)"
        log.info(f"Speaksy ready. API key: {api_status}")

        hotkey_cfg = self.config.get("hotkeys", {})
        ptt = hotkey_cfg.get("push_to_talk", "Key.ctrl_r")
        toggle = hotkey_cfg.get("toggle", "Key.f8")
        log.info(f"Hold {ptt} (push-to-talk) or press {toggle} (toggle)")

        if self.tray:
            self.tray.start()
        else:
            try:
                signal.pause()
            except KeyboardInterrupt:
                pass
            finally:
                # Without a tray there is no quit callback, so release the
                # listener and audio stream here before returning.
                self._release_resources()

    def _release_resources(self):
        """Stop the hotkey listener and close the audio stream."""
        try:
            self.hotkeys.stop()
        finally:
            self.recorder.close()

    def shutdown(self):
        """Stop the engine."""
        log.info("Shutting down...")
        self._release_resources()
        sys.exit(0)
speaksy/runner.py ADDED
@@ -0,0 +1,31 @@
1
+ """Runner module for systemd service."""
2
+
3
+ import logging
4
+ import os
5
+ import sys
6
+
7
+ from speaksy.config import get_api_key, load_config
8
+ from speaksy.core import SpeaksyEngine
9
+
10
+
11
def main():
    """Configure logging, load settings, and run the engine.

    Exits with status 1 when no API key is configured.
    """
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
        level=logging.INFO,
    )

    settings = load_config()
    key = get_api_key()
    if not key:
        logging.error("No API key configured. Run 'speaksy' to set up.")
        sys.exit(1)

    SpeaksyEngine(settings, api_key=key).run()


if __name__ == "__main__":
    main()