ai-screenshooter 1.5.0__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai-screenshooter
3
- Version: 1.5.0
3
+ Version: 1.7.0
4
4
  Summary: A CLI tool to capture and send AI-powered screenshots
5
5
  Home-page: https://github.com/tech4vision/ai-screenshoter
6
6
  Author: Last Shot AI
@@ -14,6 +14,10 @@ Requires-Dist: requests
14
14
  Requires-Dist: Pillow
15
15
  Requires-Dist: pygetwindow
16
16
  Requires-Dist: pyperclip
17
+ Requires-Dist: sounddevice
18
+ Requires-Dist: soundfile
19
+ Requires-Dist: numpy
20
+ Requires-Dist: faster-whisper
17
21
  Dynamic: author
18
22
  Dynamic: author-email
19
23
  Dynamic: classifier
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai-screenshooter
3
- Version: 1.5.0
3
+ Version: 1.7.0
4
4
  Summary: A CLI tool to capture and send AI-powered screenshots
5
5
  Home-page: https://github.com/tech4vision/ai-screenshoter
6
6
  Author: Last Shot AI
@@ -14,6 +14,10 @@ Requires-Dist: requests
14
14
  Requires-Dist: Pillow
15
15
  Requires-Dist: pygetwindow
16
16
  Requires-Dist: pyperclip
17
+ Requires-Dist: sounddevice
18
+ Requires-Dist: soundfile
19
+ Requires-Dist: numpy
20
+ Requires-Dist: faster-whisper
17
21
  Dynamic: author
18
22
  Dynamic: author-email
19
23
  Dynamic: classifier
@@ -3,3 +3,7 @@ requests
3
3
  Pillow
4
4
  pygetwindow
5
5
  pyperclip
6
+ sounddevice
7
+ soundfile
8
+ numpy
9
+ faster-whisper
@@ -6,6 +6,7 @@ import logging
6
6
  import atexit
7
7
  import time
8
8
  import subprocess
9
+ import threading
9
10
  import requests
10
11
  import pygetwindow as gw
11
12
  import pyperclip
@@ -18,8 +19,15 @@ from pynput import keyboard
18
19
  PID_FILE = Path.home() / ".ai-screenshooter.pid"
19
20
  LOG_FILE = Path.home() / ".ai-screenshooter.log"
20
21
  SCREENSHOT_DIR = Path.home() / ".ai-screenshooter" / "screenshots"
22
+ AUDIO_DIR = Path.home() / ".ai-screenshooter" / "audio"
21
23
  TIMEOUT_SECONDS = 5 * 60 * 60 # 5 hours
22
24
 
25
+ # Audio recording constants
26
+ SAMPLE_RATE = 16000 # Whisper expects 16kHz
27
+ CHANNELS = 1 # Mono audio
28
+ WHISPER_MODEL = "base" # Options: tiny, base, small, medium, large
29
+ DOUBLE_TAP_THRESHOLD = 0.5 # 500ms window for double-tap
30
+
23
31
  # Server URLs
24
32
  PROD_URL = "https://service.tech4vision.net/ai-management-service/api/v1/sessions/code-challenge"
25
33
  LOCAL_URL = "http://localhost:8082/api/v1/sessions/code-challenge"
@@ -31,6 +39,13 @@ API_URL = None
31
39
  current_keys = set()
32
40
  logger = logging.getLogger("ai-screenshooter")
33
41
 
42
+ # Voice recording state
43
+ is_recording = False
44
+ audio_thread = None
45
+ audio_data = []
46
+ whisper_model = None # Lazy-loaded on first use
47
+ last_esc_time = 0 # For double-tap detection
48
+
34
49
  if sys.platform == "win32":
35
50
  import ctypes
36
51
  from ctypes import Structure, c_long
@@ -299,12 +314,190 @@ def send_clipboard_text():
299
314
  logger.error(f"Error sending clipboard text: {e}")
300
315
 
301
316
 
317
+ # ============ Voice Recording Functions ============
318
+
319
def get_whisper_model():
    """Return the shared Whisper model, building it on the first call.

    The instance is cached in the module-level ``whisper_model`` so later
    calls reuse it. When the import or the construction raises, the error is
    logged and ``None`` is returned; nothing is cached in that case, so the
    next call attempts the load again.
    """
    global whisper_model

    # Fast path: an earlier call already built the model.
    if whisper_model is not None:
        return whisper_model

    try:
        # Imported inside the function so faster_whisper is only pulled in
        # when a transcription is actually requested.
        from faster_whisper import WhisperModel

        logger.info(f"Loading Whisper model '{WHISPER_MODEL}' (first time may download ~74MB)...")
        whisper_model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        logger.info("Whisper model loaded successfully.")
    except Exception as e:
        logger.error(f"Failed to load Whisper model: {e}")
        return None

    return whisper_model
332
+
333
+
334
def record_audio():
    """Capture microphone input into ``audio_data`` until recording is switched off.

    Intended as a thread target: it resets the module-level ``audio_data``
    list, opens a sounddevice input stream and polls ``is_recording`` every
    100 ms. Errors raised while opening or running the stream are logged
    rather than propagated.
    """
    global audio_data
    import sounddevice as sd

    audio_data = []

    def _on_chunk(indata, frames, time_info, status):
        # Invoked by sounddevice for each captured buffer.
        if status:
            logger.warning(f"Audio status: {status}")
        if not is_recording:
            return
        # Copy the buffer before storing it; only the copy is kept.
        audio_data.append(indata.copy())

    stream_options = {
        "samplerate": SAMPLE_RATE,
        "channels": CHANNELS,
        "callback": _on_chunk,
        "dtype": 'float32',
    }

    try:
        with sd.InputStream(**stream_options):
            # The callback does the capturing; this loop only keeps the
            # stream open until the flag is cleared.
            while is_recording:
                sd.sleep(100)  # Sleep 100ms, check if still recording
    except Exception as e:
        logger.error(f"Microphone error: {e}")
354
+
355
+
356
def start_voice_recording():
    """Begin a voice recording on a daemon thread; do nothing if one is active.

    Sets ``is_recording``, clears ``audio_data`` and stores the new thread in
    ``audio_thread`` before starting it.
    """
    global is_recording, audio_thread, audio_data

    if not is_recording:
        logger.info("Voice recording started... (release ESC to stop)")
        is_recording = True
        audio_data = []

        # Daemon thread: it must not keep the process alive on exit.
        recorder = threading.Thread(target=record_audio, daemon=True)
        audio_thread = recorder
        recorder.start()
369
+
370
+
371
def stop_voice_recording_and_send():
    """Stop the active recording, transcribe it and send the resulting text.

    Does nothing when no recording is in progress. Otherwise it clears
    ``is_recording``, waits up to one second for the recorder thread, joins
    the captured chunks, writes them to a temporary WAV file under
    ``AUDIO_DIR``, transcribes that file and forwards any non-empty text via
    ``send_transcribed_text``. Recordings shorter than half a second are
    dropped. Processing errors are logged, never raised, and the temporary
    file is removed in every case.
    """
    global is_recording

    if not is_recording:
        return

    logger.info("Voice recording stopped, processing...")
    is_recording = False

    # Wait for recording thread to finish
    if audio_thread:
        audio_thread.join(timeout=1.0)

    # Work on a snapshot: the join above can time out, so the recorder thread
    # may still be touching the shared list while we process it.
    chunks = list(audio_data)
    if not chunks:
        logger.warning("No audio recorded.")
        return

    # Defined before the try block so the cleanup below can tell whether a
    # file path was ever chosen.
    temp_audio_path = None
    try:
        import numpy as np
        import soundfile as sf

        audio_array = np.concatenate(chunks, axis=0)

        # Minimum recording duration check (0.5 seconds)
        if len(audio_array) < SAMPLE_RATE * 0.5:
            logger.warning("Recording too short, ignoring.")
            return

        # Save to temporary file
        AUDIO_DIR.mkdir(parents=True, exist_ok=True)
        temp_audio_path = AUDIO_DIR / f"recording_{int(time.time())}.wav"

        sf.write(str(temp_audio_path), audio_array, SAMPLE_RATE)
        logger.info(f"Audio saved: {temp_audio_path}")

        transcribed_text = transcribe_audio(temp_audio_path)
        if transcribed_text:
            send_transcribed_text(transcribed_text)

    except Exception as e:
        logger.error(f"Error processing audio: {e}")
    finally:
        # Cleanup temp file. Still best-effort, but a failure is now logged
        # so leftover recordings do not accumulate unnoticed.
        if temp_audio_path is not None:
            try:
                temp_audio_path.unlink()
            except FileNotFoundError:
                # The write never happened (or the file is already gone).
                pass
            except OSError as e:
                logger.warning(f"Could not remove temporary audio file {temp_audio_path}: {e}")
425
+
426
+
427
def transcribe_audio(audio_path):
    """Run Whisper on ``audio_path`` and return the recognised text.

    Returns the stripped segment texts joined by single spaces (possibly an
    empty string), or ``None`` when the model is unavailable or transcription
    raises. Errors are logged, not propagated.
    """
    try:
        model = get_whisper_model()
        if model is None:
            return None

        logger.info("Transcribing audio...")
        segments, _info = model.transcribe(str(audio_path), beam_size=5)

        # Combine all segments
        text = " ".join(part.text.strip() for part in segments)

        if not text:
            logger.warning("Transcription returned empty text.")
        else:
            # Log at most the first 100 characters of the transcription.
            preview = text[:100]
            ellipsis = '...' if len(text) > 100 else ''
            logger.info(f"Transcription: {preview}{ellipsis}")

        return text

    except Exception as e:
        logger.error(f"Transcription error: {e}")
        return None
450
+
451
+
452
def send_transcribed_text(text):
    """Send transcribed text to the Code tab API.

    Posts ``{"message": text}`` as JSON to ``{API_URL}/chat`` with a bearer
    token. Nothing is sent when ``API_TOKEN`` is unset or ``text`` is empty or
    whitespace-only. Failures (non-200 status, network errors, timeouts) are
    logged and never raised.
    """
    if not API_TOKEN:
        logger.error("No API token provided!")
        return

    if not text or not text.strip():
        logger.warning("No text to send.")
        return

    try:
        response = requests.post(
            f"{API_URL}/chat",
            headers={
                "Authorization": f"Bearer {API_TOKEN}",
                "Content-Type": "application/json"
            },
            json={"message": text},
            # requests waits forever by default; bound the wait so a stalled
            # server cannot block this worker thread indefinitely.
            timeout=30,
        )

        if response.status_code == 200:
            logger.info("Transcribed text sent successfully.")
        else:
            logger.error(f"Failed to send text: {response.text}")
    except Exception as e:
        # Covers connection errors and the timeout above.
        logger.error(f"Error sending transcribed text: {e}")
478
+
479
+
302
480
  # ============ Keyboard Handlers ============
303
481
 
304
482
  def on_press(key):
483
+ global last_esc_time, is_recording
484
+
305
485
  current_keys.add(key)
486
+
306
487
  try:
307
- if key == keyboard.Key.down and keyboard.Key.esc in current_keys:
488
+ # Double-tap ESC detection for voice recording
489
+ if key == keyboard.Key.esc:
490
+ current_time = time.time()
491
+ time_since_last = current_time - last_esc_time
492
+
493
+ if time_since_last < DOUBLE_TAP_THRESHOLD and not is_recording:
494
+ # Double-tap detected - start recording
495
+ start_voice_recording()
496
+
497
+ last_esc_time = current_time
498
+
499
+ # Other hotkeys (ESC + arrow keys)
500
+ elif key == keyboard.Key.down and keyboard.Key.esc in current_keys:
308
501
  logger.info("Capturing screenshot...")
309
502
  capture_screenshot()
310
503
  elif key == keyboard.Key.up and keyboard.Key.esc in current_keys:
@@ -318,11 +511,18 @@ def on_press(key):
318
511
 
319
512
 
320
513
  def on_release(key):
514
+ global is_recording
515
+
321
516
  try:
322
517
  current_keys.remove(key)
323
518
  except KeyError:
324
519
  pass
325
520
 
521
+ # Stop voice recording when ESC is released
522
+ if is_recording and key == keyboard.Key.esc:
523
+ # Run transcription in background thread to not block keyboard listener
524
+ threading.Thread(target=stop_voice_recording_and_send, daemon=True).start()
525
+
326
526
 
327
527
  # ============ CLI Commands ============
328
528
 
@@ -370,6 +570,7 @@ def cmd_start(args):
370
570
  logger.info("Press ESC + Down to capture a screenshot.")
371
571
  logger.info("Press ESC + Up to send all stored screenshots.")
372
572
  logger.info("Press ESC + Right to send clipboard text to Code tab.")
573
+ logger.info("Double-tap ESC (hold on 2nd) to record voice and send transcription.")
373
574
  if not is_daemon:
374
575
  logger.info("Running... (Press Ctrl + C to exit)")
375
576
 
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="ai-screenshooter",
5
- version="1.5.0",
5
+ version="1.7.0",
6
6
  packages=find_packages(),
7
7
  py_modules=["ai_screenshot"],
8
8
  install_requires=[
@@ -10,7 +10,11 @@ setup(
10
10
  "requests",
11
11
  "Pillow",
12
12
  "pygetwindow",
13
- "pyperclip"
13
+ "pyperclip",
14
+ "sounddevice",
15
+ "soundfile",
16
+ "numpy",
17
+ "faster-whisper"
14
18
  ],
15
19
  entry_points={
16
20
  "console_scripts": [