ai-screenshooter 1.5.0__tar.gz → 1.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai-screenshooter
3
- Version: 1.5.0
3
+ Version: 1.7.1
4
4
  Summary: A CLI tool to capture and send AI-powered screenshots
5
5
  Home-page: https://github.com/tech4vision/ai-screenshoter
6
6
  Author: Last Shot AI
@@ -14,6 +14,10 @@ Requires-Dist: requests
14
14
  Requires-Dist: Pillow
15
15
  Requires-Dist: pygetwindow
16
16
  Requires-Dist: pyperclip
17
+ Requires-Dist: sounddevice
18
+ Requires-Dist: soundfile
19
+ Requires-Dist: numpy
20
+ Requires-Dist: faster-whisper
17
21
  Dynamic: author
18
22
  Dynamic: author-email
19
23
  Dynamic: classifier
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai-screenshooter
3
- Version: 1.5.0
3
+ Version: 1.7.1
4
4
  Summary: A CLI tool to capture and send AI-powered screenshots
5
5
  Home-page: https://github.com/tech4vision/ai-screenshoter
6
6
  Author: Last Shot AI
@@ -14,6 +14,10 @@ Requires-Dist: requests
14
14
  Requires-Dist: Pillow
15
15
  Requires-Dist: pygetwindow
16
16
  Requires-Dist: pyperclip
17
+ Requires-Dist: sounddevice
18
+ Requires-Dist: soundfile
19
+ Requires-Dist: numpy
20
+ Requires-Dist: faster-whisper
17
21
  Dynamic: author
18
22
  Dynamic: author-email
19
23
  Dynamic: classifier
@@ -3,3 +3,7 @@ requests
3
3
  Pillow
4
4
  pygetwindow
5
5
  pyperclip
6
+ sounddevice
7
+ soundfile
8
+ numpy
9
+ faster-whisper
@@ -1,4 +1,5 @@
1
1
  import argparse
2
+ import json
2
3
  import os
3
4
  import sys
4
5
  import signal
@@ -6,6 +7,7 @@ import logging
6
7
  import atexit
7
8
  import time
8
9
  import subprocess
10
+ import threading
9
11
  import requests
10
12
  import pygetwindow as gw
11
13
  import pyperclip
@@ -17,9 +19,17 @@ from pynput import keyboard
17
19
  # Constants
18
20
  PID_FILE = Path.home() / ".ai-screenshooter.pid"
19
21
  LOG_FILE = Path.home() / ".ai-screenshooter.log"
22
+ META_FILE = Path.home() / ".ai-screenshooter.meta.json"
20
23
  SCREENSHOT_DIR = Path.home() / ".ai-screenshooter" / "screenshots"
24
+ AUDIO_DIR = Path.home() / ".ai-screenshooter" / "audio"
21
25
  TIMEOUT_SECONDS = 5 * 60 * 60 # 5 hours
22
26
 
27
+ # Audio recording constants
28
+ SAMPLE_RATE = 16000 # Whisper expects 16kHz
29
+ CHANNELS = 1 # Mono audio
30
+ WHISPER_MODEL = "base" # Options: tiny, base, small, medium, large
31
+ DOUBLE_TAP_THRESHOLD = 0.5 # 500ms window for double-tap
32
+
23
33
  # Server URLs
24
34
  PROD_URL = "https://service.tech4vision.net/ai-management-service/api/v1/sessions/code-challenge"
25
35
  LOCAL_URL = "http://localhost:8082/api/v1/sessions/code-challenge"
@@ -31,6 +41,13 @@ API_URL = None
31
41
  current_keys = set()
32
42
  logger = logging.getLogger("ai-screenshooter")
33
43
 
44
+ # Voice recording state
45
+ is_recording = False
46
+ audio_thread = None
47
+ audio_data = []
48
+ whisper_model = None # Lazy-loaded on first use
49
+ last_esc_time = 0 # For double-tap detection
50
+
34
51
  if sys.platform == "win32":
35
52
  import ctypes
36
53
  from ctypes import Structure, c_long
@@ -82,6 +99,31 @@ def cleanup_pid_file():
82
99
  PID_FILE.unlink()
83
100
  except Exception:
84
101
  pass
102
+ try:
103
+ if META_FILE.exists():
104
+ META_FILE.unlink()
105
+ except Exception:
106
+ pass
107
+
108
+
109
def write_meta_file(server_mode, server_url):
    """Persist metadata about the running process to META_FILE as JSON.

    The payload holds the current time under ``started_at`` plus the given
    server mode and server URL; the status command reads it back.
    """
    payload = json.dumps({
        "started_at": time.time(),
        "server_mode": server_mode,
        "server_url": server_url,
    })
    META_FILE.write_text(payload)
117
+
118
+
119
def read_meta_file():
    """Read the process metadata stored in META_FILE.

    Returns:
        The decoded metadata dict, or None when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    try:
        # Read directly instead of checking exists() first: the file can be
        # removed by the cleanup path between the check and the read.
        meta = json.loads(META_FILE.read_text())
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: malformed JSON or
        # bytes that cannot be decoded as text.
        return None

    # The status command calls meta.get(...), so only a JSON object is valid.
    return meta if isinstance(meta, dict) else None
85
127
 
86
128
 
87
129
  # ============ Process Management ============
@@ -299,11 +341,196 @@ def send_clipboard_text():
299
341
  logger.error(f"Error sending clipboard text: {e}")
300
342
 
301
343
 
344
+ # ============ Voice Recording Functions ============
345
+
346
def get_whisper_model():
    """Return the shared Whisper model, creating it on the first call.

    The instance is cached in the module-level ``whisper_model``. If loading
    fails, the error is logged, nothing is cached, and None is returned.
    """
    global whisper_model

    # Fast path: the model was already loaded by an earlier call.
    if whisper_model is not None:
        return whisper_model

    try:
        # Imported here so the dependency is only needed once voice input is used.
        from faster_whisper import WhisperModel
        logger.info(f"Loading Whisper model '{WHISPER_MODEL}' (first time may download ~74MB)...")
        whisper_model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
        logger.info("Whisper model loaded successfully.")
    except Exception as exc:
        logger.error(f"Failed to load Whisper model: {exc}")
        return None

    return whisper_model
359
+
360
+
361
def record_audio():
    """Capture microphone audio until ``is_recording`` is cleared.

    Meant to run on a worker thread (start_voice_recording launches it).
    Every buffer delivered by the input stream is copied into the
    module-level ``audio_data`` list. Any failure, including sounddevice
    itself failing to import, is logged rather than raised.
    """
    global audio_data, is_recording

    audio_data = []

    def audio_callback(indata, frames, time_info, status):
        if status:
            logger.warning(f"Audio status: {status}")
        if is_recording:
            # Keep a copy rather than the stream's own buffer object.
            audio_data.append(indata.copy())

    try:
        # Imported inside the try so that a missing package or audio backend
        # is reported through the logger instead of escaping the thread.
        import sounddevice as sd

        with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS,
                            callback=audio_callback, dtype='float32'):
            while is_recording:
                sd.sleep(100)  # Sleep 100ms, check if still recording
    except Exception as e:
        logger.error(f"Microphone error: {e}")
381
+
382
+
383
def start_voice_recording():
    """Begin capturing audio on a daemon thread.

    Calling this while a recording is already in progress has no effect.
    """
    global is_recording, audio_thread, audio_data

    if not is_recording:
        logger.info("Voice recording started... (release ESC to stop)")
        is_recording = True
        audio_data = []

        # Daemon thread: it must not keep the process alive on exit.
        recorder = threading.Thread(target=record_audio, daemon=True)
        audio_thread = recorder
        recorder.start()
396
+
397
+
398
def stop_voice_recording_and_send():
    """Stop recording, transcribe the captured audio, and send the text.

    Does nothing unless a recording is in progress. Recordings shorter than
    half a second are discarded. The audio is written to a WAV file under
    AUDIO_DIR for transcription, and that file is removed afterwards whether
    or not processing succeeded.
    """
    global is_recording, audio_thread, audio_data

    if not is_recording:
        return

    logger.info("Voice recording stopped, processing...")
    is_recording = False

    # Wait for recording thread to finish
    if audio_thread:
        audio_thread.join(timeout=1.0)

    # Snapshot the chunks: the join above can time out, so a late callback
    # must not be able to grow the list while it is being concatenated.
    chunks = list(audio_data)
    if not chunks:
        logger.warning("No audio recorded.")
        return

    # Explicit sentinel so cleanup knows whether a file path was ever chosen.
    temp_audio_path = None
    try:
        import numpy as np
        import soundfile as sf

        audio_array = np.concatenate(chunks, axis=0)

        # Minimum recording duration check (0.5 seconds)
        if len(audio_array) < SAMPLE_RATE * 0.5:
            logger.warning("Recording too short, ignoring.")
            return

        # Save to temporary file
        AUDIO_DIR.mkdir(parents=True, exist_ok=True)
        temp_audio_path = AUDIO_DIR / f"recording_{int(time.time())}.wav"

        sf.write(str(temp_audio_path), audio_array, SAMPLE_RATE)
        logger.info(f"Audio saved: {temp_audio_path}")

        transcribed_text = transcribe_audio(temp_audio_path)

        if transcribed_text:
            send_transcribed_text(transcribed_text)

    except Exception as e:
        logger.error(f"Error processing audio: {e}")
    finally:
        # Best-effort removal of the temporary recording.
        if temp_audio_path is not None:
            try:
                if temp_audio_path.exists():
                    temp_audio_path.unlink()
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary audio file: {cleanup_error}")
452
+
453
+
454
def transcribe_audio(audio_path):
    """Run the Whisper model on an audio file and return the recognised text.

    Returns the stripped segment texts joined by single spaces (this may be
    an empty string), or None when the model is unavailable or transcription
    raises.
    """
    try:
        model = get_whisper_model()
        if model is None:
            return None

        logger.info("Transcribing audio...")
        segments, _info = model.transcribe(str(audio_path), beam_size=5)

        # Collect the text of every segment, trimmed of surrounding whitespace.
        pieces = []
        for segment in segments:
            pieces.append(segment.text.strip())
        text = " ".join(pieces)

        if not text:
            logger.warning("Transcription returned empty text.")
        else:
            # Log at most the first 100 characters of the transcription.
            logger.info(f"Transcription: {text[:100]}{'...' if len(text) > 100 else ''}")

        return text

    except Exception as exc:
        logger.error(f"Transcription error: {exc}")
        return None
477
+
478
+
479
def send_transcribed_text(text, timeout=30):
    """Send transcribed text to the Code tab API.

    POSTs ``{"message": text}`` as JSON to ``{API_URL}/chat`` using the
    module-level API token as a bearer token. Failures are logged, never
    raised.

    Args:
        text: The transcription to send; blank text is ignored.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block its worker thread indefinitely.
    """
    if not API_TOKEN:
        logger.error("No API token provided!")
        return

    if not text or not text.strip():
        logger.warning("No text to send.")
        return

    try:
        response = requests.post(
            f"{API_URL}/chat",
            headers={
                "Authorization": f"Bearer {API_TOKEN}",
                "Content-Type": "application/json",
            },
            json={"message": text},
            timeout=timeout,
        )

        if response.status_code == 200:
            logger.info("Transcribed text sent successfully.")
        else:
            logger.error(f"Failed to send text: {response.text}")
    except Exception as e:
        # Includes requests' timeout and connection errors.
        logger.error(f"Error sending transcribed text: {e}")
505
+
506
+
302
507
  # ============ Keyboard Handlers ============
303
508
 
304
509
  def on_press(key):
305
- current_keys.add(key)
510
+ global last_esc_time, is_recording
511
+
306
512
  try:
513
+ # Double-tap ESC detection for voice recording
514
+ if key == keyboard.Key.esc:
515
+ # Ignore repeated key events from holding ESC
516
+ if keyboard.Key.esc in current_keys:
517
+ return
518
+ current_keys.add(key)
519
+
520
+ current_time = time.time()
521
+ time_since_last = current_time - last_esc_time
522
+
523
+ if time_since_last < DOUBLE_TAP_THRESHOLD and not is_recording:
524
+ # Double-tap detected - start recording
525
+ start_voice_recording()
526
+
527
+ last_esc_time = current_time
528
+
529
+ # Track non-ESC keys for combo detection
530
+ else:
531
+ current_keys.add(key)
532
+
533
+ # Other hotkeys (ESC + arrow keys)
307
534
  if key == keyboard.Key.down and keyboard.Key.esc in current_keys:
308
535
  logger.info("Capturing screenshot...")
309
536
  capture_screenshot()
@@ -318,11 +545,18 @@ def on_press(key):
318
545
 
319
546
 
320
547
  def on_release(key):
548
+ global is_recording
549
+
321
550
  try:
322
551
  current_keys.remove(key)
323
552
  except KeyError:
324
553
  pass
325
554
 
555
+ # Stop voice recording when ESC is released
556
+ if is_recording and key == keyboard.Key.esc:
557
+ # Run transcription in background thread to not block keyboard listener
558
+ threading.Thread(target=stop_voice_recording_and_send, daemon=True).start()
559
+
326
560
 
327
561
  # ============ CLI Commands ============
328
562
 
@@ -330,19 +564,20 @@ def cmd_start(args):
330
564
  """Handle the start command."""
331
565
  global API_TOKEN, API_URL
332
566
 
333
- # If --background flag, spawn a new process and exit
334
- if args.background:
335
- print("Starting in background mode...")
567
+ is_daemon = getattr(args, 'daemon', False)
568
+
569
+ # Kill any existing instance (unless this is the daemon subprocess itself)
570
+ if not is_daemon:
336
571
  killed = kill_existing_process()
337
572
  if killed:
338
- print("Killed existing instance.")
573
+ print("Replaced existing instance.")
339
574
 
575
+ # If --background flag, spawn a new process and exit
576
+ if args.background:
577
+ print("Starting in background mode...")
340
578
  start_background_process(args.token, args.local)
341
579
  return
342
580
 
343
- # If --daemon flag (internal), this is the actual daemon process
344
- is_daemon = getattr(args, 'daemon', False)
345
-
346
581
  if is_daemon:
347
582
  # Write PID file
348
583
  write_pid_file()
@@ -365,11 +600,16 @@ def cmd_start(args):
365
600
  API_URL = LOCAL_URL if args.local else PROD_URL
366
601
 
367
602
  server_mode = "LOCAL" if args.local else "PRODUCTION"
603
+
604
+ # Write metadata for status command
605
+ write_meta_file(server_mode, API_URL)
606
+
368
607
  logger.info("AI Screenshot CLI started.")
369
608
  logger.info(f"Server: {server_mode} ({API_URL})")
370
609
  logger.info("Press ESC + Down to capture a screenshot.")
371
610
  logger.info("Press ESC + Up to send all stored screenshots.")
372
611
  logger.info("Press ESC + Right to send clipboard text to Code tab.")
612
+ logger.info("Double-tap ESC (hold on 2nd) to record voice and send transcription.")
373
613
  if not is_daemon:
374
614
  logger.info("Running... (Press Ctrl + C to exit)")
375
615
 
@@ -383,11 +623,38 @@ def cmd_status(args):
383
623
  pid = get_pid_from_file()
384
624
  if pid and is_process_running(pid):
385
625
  print(f"ai-screenshooter is running (PID: {pid})")
626
+
627
+ meta = read_meta_file()
628
+ if meta:
629
+ # Uptime
630
+ elapsed = time.time() - meta.get("started_at", time.time())
631
+ hours, remainder = divmod(int(elapsed), 3600)
632
+ minutes, seconds = divmod(remainder, 60)
633
+ print(f" Uptime: {hours}h {minutes}m {seconds}s")
634
+
635
+ # Time remaining
636
+ remaining = TIMEOUT_SECONDS - elapsed
637
+ if remaining > 0:
638
+ rh, rr = divmod(int(remaining), 3600)
639
+ rm, rs = divmod(rr, 60)
640
+ print(f" Expires: {rh}h {rm}m {rs}s remaining")
641
+
642
+ # Server
643
+ print(f" Server: {meta.get('server_mode', 'UNKNOWN')} ({meta.get('server_url', '')})")
644
+
645
+ print()
646
+ print(" Listening for hotkeys:")
647
+ print(" ESC + Down Capture screenshot")
648
+ print(" ESC + Up Send all screenshots")
649
+ print(" ESC + Right Send clipboard text to Code tab")
650
+ print(" Double-tap ESC Record voice, transcribe and send")
651
+
386
652
  return 0
387
653
  else:
388
654
  print("ai-screenshooter is not running")
389
655
  if PID_FILE.exists():
390
- print(f"(stale PID file exists at {PID_FILE})")
656
+ print(f"(stale PID file found, cleaning up)")
657
+ cleanup_pid_file()
391
658
  return 1
392
659
 
393
660
 
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="ai-screenshooter",
5
- version="1.5.0",
5
+ version="1.7.1",
6
6
  packages=find_packages(),
7
7
  py_modules=["ai_screenshot"],
8
8
  install_requires=[
@@ -10,7 +10,11 @@ setup(
10
10
  "requests",
11
11
  "Pillow",
12
12
  "pygetwindow",
13
- "pyperclip"
13
+ "pyperclip",
14
+ "sounddevice",
15
+ "soundfile",
16
+ "numpy",
17
+ "faster-whisper"
14
18
  ],
15
19
  entry_points={
16
20
  "console_scripts": [