ai-screenshooter 1.3.0__tar.gz → 1.7.0__tar.gz

This diff shows the contents of two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ai-screenshooter
- Version: 1.3.0
+ Version: 1.7.0
  Summary: A CLI tool to capture and send AI-powered screenshots
  Home-page: https://github.com/tech4vision/ai-screenshoter
  Author: Last Shot AI
@@ -13,6 +13,11 @@ Requires-Dist: pynput
  Requires-Dist: requests
  Requires-Dist: Pillow
  Requires-Dist: pygetwindow
+ Requires-Dist: pyperclip
+ Requires-Dist: sounddevice
+ Requires-Dist: soundfile
+ Requires-Dist: numpy
+ Requires-Dist: faster-whisper
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ai-screenshooter
- Version: 1.3.0
+ Version: 1.7.0
  Summary: A CLI tool to capture and send AI-powered screenshots
  Home-page: https://github.com/tech4vision/ai-screenshoter
  Author: Last Shot AI
@@ -13,6 +13,11 @@ Requires-Dist: pynput
  Requires-Dist: requests
  Requires-Dist: Pillow
  Requires-Dist: pygetwindow
+ Requires-Dist: pyperclip
+ Requires-Dist: sounddevice
+ Requires-Dist: soundfile
+ Requires-Dist: numpy
+ Requires-Dist: faster-whisper
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -0,0 +1,9 @@
+ pynput
+ requests
+ Pillow
+ pygetwindow
+ pyperclip
+ sounddevice
+ soundfile
+ numpy
+ faster-whisper
@@ -6,8 +6,10 @@ import logging
  import atexit
  import time
  import subprocess
+ import threading
  import requests
  import pygetwindow as gw
+ import pyperclip
  from pathlib import Path
  from PIL import ImageGrab
  from pynput import keyboard
@@ -17,8 +19,15 @@ from pynput import keyboard
  PID_FILE = Path.home() / ".ai-screenshooter.pid"
  LOG_FILE = Path.home() / ".ai-screenshooter.log"
  SCREENSHOT_DIR = Path.home() / ".ai-screenshooter" / "screenshots"
+ AUDIO_DIR = Path.home() / ".ai-screenshooter" / "audio"
  TIMEOUT_SECONDS = 5 * 60 * 60 # 5 hours

+ # Audio recording constants
+ SAMPLE_RATE = 16000 # Whisper expects 16kHz
+ CHANNELS = 1 # Mono audio
+ WHISPER_MODEL = "base" # Options: tiny, base, small, medium, large
+ DOUBLE_TAP_THRESHOLD = 0.5 # 500ms window for double-tap
+
  # Server URLs
  PROD_URL = "https://service.tech4vision.net/ai-management-service/api/v1/sessions/code-challenge"
  LOCAL_URL = "http://localhost:8082/api/v1/sessions/code-challenge"
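
For reference, the constants introduced above feed directly into the sounddevice/soundfile APIs. Below is a minimal standalone sketch of that recording setup; it is not part of the package diff, and the three-second duration and the sample.wav path are illustrative assumptions:

    import sounddevice as sd
    import soundfile as sf

    SAMPLE_RATE = 16000  # 16 kHz mono is the input format Whisper models expect
    CHANNELS = 1
    DURATION = 3         # seconds to record (illustrative)

    # sd.rec() captures into a float32 NumPy array of shape (frames, channels)
    audio = sd.rec(int(DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE,
                   channels=CHANNELS, dtype='float32')
    sd.wait()  # block until the recording finishes

    # Persist the buffer as a WAV file a speech-to-text model can read
    sf.write("sample.wav", audio, SAMPLE_RATE)
    print(f"Recorded {len(audio) / SAMPLE_RATE:.1f}s to sample.wav")

The package itself uses a callback-based sd.InputStream instead (see the record_audio() function added later in this diff) so that capture can run until a key is released rather than for a fixed duration.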
@@ -30,6 +39,13 @@ API_URL = None
  current_keys = set()
  logger = logging.getLogger("ai-screenshooter")

+ # Voice recording state
+ is_recording = False
+ audio_thread = None
+ audio_data = []
+ whisper_model = None # Lazy-loaded on first use
+ last_esc_time = 0 # For double-tap detection
+
  if sys.platform == "win32":
      import ctypes
      from ctypes import Structure, c_long
@@ -269,27 +285,244 @@ def send_screenshots():
          logger.error(f"Error uploading screenshots: {e}")


+ def send_clipboard_text():
+     """Send clipboard content to Code tab API."""
+     if not API_TOKEN:
+         logger.error("No API token provided!")
+         return
+
+     try:
+         text = pyperclip.paste()
+         if not text or not text.strip():
+             logger.warning("Clipboard is empty.")
+             return
+
+         response = requests.post(
+             f"{API_URL}/chat",
+             headers={
+                 "Authorization": f"Bearer {API_TOKEN}",
+                 "Content-Type": "application/json"
+             },
+             json={"message": text}
+         )
+
+         if response.status_code == 200:
+             logger.info("Text sent to Code tab successfully.")
+         else:
+             logger.error(f"Failed to send text: {response.text}")
+     except Exception as e:
+         logger.error(f"Error sending clipboard text: {e}")
+
+
+ # ============ Voice Recording Functions ============
+
+ def get_whisper_model():
+     """Lazy-load Whisper model on first use."""
+     global whisper_model
+     if whisper_model is None:
+         try:
+             from faster_whisper import WhisperModel
+             logger.info(f"Loading Whisper model '{WHISPER_MODEL}' (first time may download ~74MB)...")
+             whisper_model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
+             logger.info("Whisper model loaded successfully.")
+         except Exception as e:
+             logger.error(f"Failed to load Whisper model: {e}")
+             return None
+     return whisper_model
+
+
+ def record_audio():
+     """Record audio from microphone in a separate thread."""
+     global audio_data, is_recording
+     import sounddevice as sd
+
+     audio_data = []
+
+     def audio_callback(indata, frames, time_info, status):
+         if status:
+             logger.warning(f"Audio status: {status}")
+         if is_recording:
+             audio_data.append(indata.copy())
+
+     try:
+         with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS,
+                             callback=audio_callback, dtype='float32'):
+             while is_recording:
+                 sd.sleep(100) # Sleep 100ms, check if still recording
+     except Exception as e:
+         logger.error(f"Microphone error: {e}")
+
+
+ def start_voice_recording():
+     """Start recording audio in a background thread."""
+     global is_recording, audio_thread, audio_data
+
+     if is_recording:
+         return # Already recording
+
+     logger.info("Voice recording started... (release ESC to stop)")
+     is_recording = True
+     audio_data = []
+
+     audio_thread = threading.Thread(target=record_audio, daemon=True)
+     audio_thread.start()
+
+
+ def stop_voice_recording_and_send():
+     """Stop recording, transcribe audio, and send to API."""
+     global is_recording, audio_thread, audio_data
+
+     if not is_recording:
+         return
+
+     logger.info("Voice recording stopped, processing...")
+     is_recording = False
+
+     # Wait for recording thread to finish
+     if audio_thread:
+         audio_thread.join(timeout=1.0)
+
+     # Check if we have audio data
+     if not audio_data:
+         logger.warning("No audio recorded.")
+         return
+
+     # Combine audio chunks
+     try:
+         import numpy as np
+         import soundfile as sf
+
+         audio_array = np.concatenate(audio_data, axis=0)
+
+         # Minimum recording duration check (0.5 seconds)
+         if len(audio_array) < SAMPLE_RATE * 0.5:
+             logger.warning("Recording too short, ignoring.")
+             return
+
+         # Save to temporary file
+         AUDIO_DIR.mkdir(parents=True, exist_ok=True)
+         temp_audio_path = AUDIO_DIR / f"recording_{int(time.time())}.wav"
+
+         sf.write(str(temp_audio_path), audio_array, SAMPLE_RATE)
+         logger.info(f"Audio saved: {temp_audio_path}")
+
+         # Transcribe
+         transcribed_text = transcribe_audio(temp_audio_path)
+
+         if transcribed_text:
+             # Send to API
+             send_transcribed_text(transcribed_text)
+
+     except Exception as e:
+         logger.error(f"Error processing audio: {e}")
+     finally:
+         # Cleanup temp file
+         try:
+             if 'temp_audio_path' in locals() and temp_audio_path.exists():
+                 temp_audio_path.unlink()
+         except Exception:
+             pass
+
+
+ def transcribe_audio(audio_path):
+     """Transcribe audio file using Whisper."""
+     try:
+         model = get_whisper_model()
+         if model is None:
+             return None
+
+         logger.info("Transcribing audio...")
+         segments, info = model.transcribe(str(audio_path), beam_size=5)
+
+         # Combine all segments
+         text = " ".join([segment.text.strip() for segment in segments])
+
+         if text:
+             logger.info(f"Transcription: {text[:100]}{'...' if len(text) > 100 else ''}")
+         else:
+             logger.warning("Transcription returned empty text.")
+
+         return text
+
+     except Exception as e:
+         logger.error(f"Transcription error: {e}")
+         return None
+
+
+ def send_transcribed_text(text):
+     """Send transcribed text to the Code tab API."""
+     if not API_TOKEN:
+         logger.error("No API token provided!")
+         return
+
+     if not text or not text.strip():
+         logger.warning("No text to send.")
+         return
+
+     try:
+         response = requests.post(
+             f"{API_URL}/chat",
+             headers={
+                 "Authorization": f"Bearer {API_TOKEN}",
+                 "Content-Type": "application/json"
+             },
+             json={"message": text}
+         )
+
+         if response.status_code == 200:
+             logger.info("Transcribed text sent successfully.")
+         else:
+             logger.error(f"Failed to send text: {response.text}")
+     except Exception as e:
+         logger.error(f"Error sending transcribed text: {e}")
+
+
  # ============ Keyboard Handlers ============

  def on_press(key):
+     global last_esc_time, is_recording
+
      current_keys.add(key)
+
      try:
-         if key == keyboard.Key.down and keyboard.Key.esc in current_keys:
+         # Double-tap ESC detection for voice recording
+         if key == keyboard.Key.esc:
+             current_time = time.time()
+             time_since_last = current_time - last_esc_time
+
+             if time_since_last < DOUBLE_TAP_THRESHOLD and not is_recording:
+                 # Double-tap detected - start recording
+                 start_voice_recording()
+
+             last_esc_time = current_time
+
+         # Other hotkeys (ESC + arrow keys)
+         elif key == keyboard.Key.down and keyboard.Key.esc in current_keys:
              logger.info("Capturing screenshot...")
              capture_screenshot()
          elif key == keyboard.Key.up and keyboard.Key.esc in current_keys:
              logger.info("Sending all screenshots...")
              send_screenshots()
+         elif key == keyboard.Key.right and keyboard.Key.esc in current_keys:
+             logger.info("Sending clipboard text to Code tab...")
+             send_clipboard_text()
      except AttributeError:
          pass


  def on_release(key):
+     global is_recording
+
      try:
          current_keys.remove(key)
      except KeyError:
          pass

+     # Stop voice recording when ESC is released
+     if is_recording and key == keyboard.Key.esc:
+         # Run transcription in background thread to not block keyboard listener
+         threading.Thread(target=stop_voice_recording_and_send, daemon=True).start()
+

  # ============ CLI Commands ============
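
The transcribe_audio() function added above is a thin wrapper around the faster-whisper API. For reference, here is a minimal standalone sketch of that API, separate from this package; the "base" model size and the sample.wav file name are illustrative:

    from faster_whisper import WhisperModel

    # "base" is a small model; int8 on CPU keeps memory use and latency low
    model = WhisperModel("base", device="cpu", compute_type="int8")

    # transcribe() returns a lazy generator of segments plus metadata about the audio
    segments, info = model.transcribe("sample.wav", beam_size=5)
    print(f"Detected language: {info.language} (p={info.language_probability:.2f})")

    # Joining the segment texts yields the full transcript
    text = " ".join(segment.text.strip() for segment in segments)
    print(text)

Because the segments are generated lazily, transcription only happens as they are iterated, which is why the package joins them into a single string before handing the text on.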
 
@@ -336,6 +569,8 @@ def cmd_start(args):
      logger.info(f"Server: {server_mode} ({API_URL})")
      logger.info("Press ESC + Down to capture a screenshot.")
      logger.info("Press ESC + Up to send all stored screenshots.")
+     logger.info("Press ESC + Right to send clipboard text to Code tab.")
+     logger.info("Double-tap ESC (hold on 2nd) to record voice and send transcription.")
      if not is_daemon:
          logger.info("Running... (Press Ctrl + C to exit)")

@@ -2,14 +2,19 @@ from setuptools import setup, find_packages

  setup(
      name="ai-screenshooter",
-     version="1.3.0",
+     version="1.7.0",
      packages=find_packages(),
      py_modules=["ai_screenshot"],
      install_requires=[
          "pynput",
          "requests",
          "Pillow",
-         "pygetwindow"
+         "pygetwindow",
+         "pyperclip",
+         "sounddevice",
+         "soundfile",
+         "numpy",
+         "faster-whisper"
      ],
      entry_points={
          "console_scripts": [
@@ -1,4 +0,0 @@
- pynput
- requests
- Pillow
- pygetwindow