ai-screenshooter 1.3.0__tar.gz → 1.7.0__tar.gz

This diff shows the contents of two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ai-screenshooter
- Version: 1.3.0
+ Version: 1.7.0
  Summary: A CLI tool to capture and send AI-powered screenshots
  Home-page: https://github.com/tech4vision/ai-screenshoter
  Author: Last Shot AI
@@ -13,6 +13,11 @@ Requires-Dist: pynput
  Requires-Dist: requests
  Requires-Dist: Pillow
  Requires-Dist: pygetwindow
+ Requires-Dist: pyperclip
+ Requires-Dist: sounddevice
+ Requires-Dist: soundfile
+ Requires-Dist: numpy
+ Requires-Dist: faster-whisper
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ai-screenshooter
- Version: 1.3.0
+ Version: 1.7.0
  Summary: A CLI tool to capture and send AI-powered screenshots
  Home-page: https://github.com/tech4vision/ai-screenshoter
  Author: Last Shot AI
@@ -13,6 +13,11 @@ Requires-Dist: pynput
  Requires-Dist: requests
  Requires-Dist: Pillow
  Requires-Dist: pygetwindow
+ Requires-Dist: pyperclip
+ Requires-Dist: sounddevice
+ Requires-Dist: soundfile
+ Requires-Dist: numpy
+ Requires-Dist: faster-whisper
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -0,0 +1,9 @@
+ pynput
+ requests
+ Pillow
+ pygetwindow
+ pyperclip
+ sounddevice
+ soundfile
+ numpy
+ faster-whisper
@@ -6,8 +6,10 @@ import logging
  import atexit
  import time
  import subprocess
+ import threading
  import requests
  import pygetwindow as gw
+ import pyperclip
  from pathlib import Path
  from PIL import ImageGrab
  from pynput import keyboard
@@ -17,8 +19,15 @@ from pynput import keyboard
  PID_FILE = Path.home() / ".ai-screenshooter.pid"
  LOG_FILE = Path.home() / ".ai-screenshooter.log"
  SCREENSHOT_DIR = Path.home() / ".ai-screenshooter" / "screenshots"
+ AUDIO_DIR = Path.home() / ".ai-screenshooter" / "audio"
  TIMEOUT_SECONDS = 5 * 60 * 60 # 5 hours

+ # Audio recording constants
+ SAMPLE_RATE = 16000 # Whisper expects 16kHz
+ CHANNELS = 1 # Mono audio
+ WHISPER_MODEL = "base" # Options: tiny, base, small, medium, large
+ DOUBLE_TAP_THRESHOLD = 0.5 # 500ms window for double-tap
+
  # Server URLs
  PROD_URL = "https://service.tech4vision.net/ai-management-service/api/v1/sessions/code-challenge"
  LOCAL_URL = "http://localhost:8082/api/v1/sessions/code-challenge"
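
For reference, the constants introduced above feed directly into the sounddevice/soundfile APIs. Below is a minimal standalone sketch of that recording setup; it is not part of the package diff, and the three-second duration and the sample.wav path are illustrative assumptions:

    import sounddevice as sd
    import soundfile as sf

    SAMPLE_RATE = 16000  # 16 kHz mono is the input format Whisper models expect
    CHANNELS = 1
    DURATION = 3         # seconds to record (illustrative)

    # sd.rec() captures into a float32 NumPy array of shape (frames, channels)
    audio = sd.rec(int(DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE,
                   channels=CHANNELS, dtype='float32')
    sd.wait()  # block until the recording finishes

    # Persist the buffer as a WAV file a speech-to-text model can read
    sf.write("sample.wav", audio, SAMPLE_RATE)
    print(f"Recorded {len(audio) / SAMPLE_RATE:.1f}s to sample.wav")

The package itself uses a callback-based sd.InputStream instead (see the record_audio() function added later in this diff) so that capture can run until a key is released rather than for a fixed duration.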
@@ -30,6 +39,13 @@ API_URL = None
  current_keys = set()
  logger = logging.getLogger("ai-screenshooter")

+ # Voice recording state
+ is_recording = False
+ audio_thread = None
+ audio_data = []
+ whisper_model = None # Lazy-loaded on first use
+ last_esc_time = 0 # For double-tap detection
+
  if sys.platform == "win32":
      import ctypes
      from ctypes import Structure, c_long
@@ -269,27 +285,244 @@ def send_screenshots():
          logger.error(f"Error uploading screenshots: {e}")


+ def send_clipboard_text():
+     """Send clipboard content to Code tab API."""
+     if not API_TOKEN:
+         logger.error("No API token provided!")
+         return
+
+     try:
+         text = pyperclip.paste()
+         if not text or not text.strip():
+             logger.warning("Clipboard is empty.")
+             return
+
+         response = requests.post(
+             f"{API_URL}/chat",
+             headers={
+                 "Authorization": f"Bearer {API_TOKEN}",
+                 "Content-Type": "application/json"
+             },
+             json={"message": text}
+         )
+
+         if response.status_code == 200:
+             logger.info("Text sent to Code tab successfully.")
+         else:
+             logger.error(f"Failed to send text: {response.text}")
+     except Exception as e:
+         logger.error(f"Error sending clipboard text: {e}")
+
+
+ # ============ Voice Recording Functions ============
+
+ def get_whisper_model():
+     """Lazy-load Whisper model on first use."""
+     global whisper_model
+     if whisper_model is None:
+         try:
+             from faster_whisper import WhisperModel
+             logger.info(f"Loading Whisper model '{WHISPER_MODEL}' (first time may download ~74MB)...")
+             whisper_model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
+             logger.info("Whisper model loaded successfully.")
+         except Exception as e:
+             logger.error(f"Failed to load Whisper model: {e}")
+             return None
+     return whisper_model
+
+
+ def record_audio():
+     """Record audio from microphone in a separate thread."""
+     global audio_data, is_recording
+     import sounddevice as sd
+
+     audio_data = []
+
+     def audio_callback(indata, frames, time_info, status):
+         if status:
+             logger.warning(f"Audio status: {status}")
+         if is_recording:
+             audio_data.append(indata.copy())
+
+     try:
+         with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS,
+                             callback=audio_callback, dtype='float32'):
+             while is_recording:
+                 sd.sleep(100) # Sleep 100ms, check if still recording
+     except Exception as e:
+         logger.error(f"Microphone error: {e}")
+
+
+ def start_voice_recording():
+     """Start recording audio in a background thread."""
+     global is_recording, audio_thread, audio_data
+
+     if is_recording:
+         return # Already recording
+
+     logger.info("Voice recording started... (release ESC to stop)")
+     is_recording = True
+     audio_data = []
+
+     audio_thread = threading.Thread(target=record_audio, daemon=True)
+     audio_thread.start()
+
+
+ def stop_voice_recording_and_send():
+     """Stop recording, transcribe audio, and send to API."""
+     global is_recording, audio_thread, audio_data
+
+     if not is_recording:
+         return
+
+     logger.info("Voice recording stopped, processing...")
+     is_recording = False
+
+     # Wait for recording thread to finish
+     if audio_thread:
+         audio_thread.join(timeout=1.0)
+
+     # Check if we have audio data
+     if not audio_data:
+         logger.warning("No audio recorded.")
+         return
+
+     # Combine audio chunks
+     try:
+         import numpy as np
+         import soundfile as sf
+
+         audio_array = np.concatenate(audio_data, axis=0)
+
+         # Minimum recording duration check (0.5 seconds)
+         if len(audio_array) < SAMPLE_RATE * 0.5:
+             logger.warning("Recording too short, ignoring.")
+             return
+
+         # Save to temporary file
+         AUDIO_DIR.mkdir(parents=True, exist_ok=True)
+         temp_audio_path = AUDIO_DIR / f"recording_{int(time.time())}.wav"
+
+         sf.write(str(temp_audio_path), audio_array, SAMPLE_RATE)
+         logger.info(f"Audio saved: {temp_audio_path}")
+
+         # Transcribe
+         transcribed_text = transcribe_audio(temp_audio_path)
+
+         if transcribed_text:
+             # Send to API
+             send_transcribed_text(transcribed_text)
+
+     except Exception as e:
+         logger.error(f"Error processing audio: {e}")
+     finally:
+         # Cleanup temp file
+         try:
+             if 'temp_audio_path' in locals() and temp_audio_path.exists():
+                 temp_audio_path.unlink()
+         except Exception:
+             pass
+
+
+ def transcribe_audio(audio_path):
+     """Transcribe audio file using Whisper."""
+     try:
+         model = get_whisper_model()
+         if model is None:
+             return None
+
+         logger.info("Transcribing audio...")
+         segments, info = model.transcribe(str(audio_path), beam_size=5)
+
+         # Combine all segments
+         text = " ".join([segment.text.strip() for segment in segments])
+
+         if text:
+             logger.info(f"Transcription: {text[:100]}{'...' if len(text) > 100 else ''}")
+         else:
+             logger.warning("Transcription returned empty text.")
+
+         return text
+
+     except Exception as e:
+         logger.error(f"Transcription error: {e}")
+         return None
+
+
+ def send_transcribed_text(text):
+     """Send transcribed text to the Code tab API."""
+     if not API_TOKEN:
+         logger.error("No API token provided!")
+         return
+
+     if not text or not text.strip():
+         logger.warning("No text to send.")
+         return
+
+     try:
+         response = requests.post(
+             f"{API_URL}/chat",
+             headers={
+                 "Authorization": f"Bearer {API_TOKEN}",
+                 "Content-Type": "application/json"
+             },
+             json={"message": text}
+         )
+
+         if response.status_code == 200:
+             logger.info("Transcribed text sent successfully.")
+         else:
+             logger.error(f"Failed to send text: {response.text}")
+     except Exception as e:
+         logger.error(f"Error sending transcribed text: {e}")
+
+
  # ============ Keyboard Handlers ============

  def on_press(key):
+     global last_esc_time, is_recording
+
      current_keys.add(key)
+
      try:
-         if key == keyboard.Key.down and keyboard.Key.esc in current_keys:
+         # Double-tap ESC detection for voice recording
+         if key == keyboard.Key.esc:
+             current_time = time.time()
+             time_since_last = current_time - last_esc_time
+
+             if time_since_last < DOUBLE_TAP_THRESHOLD and not is_recording:
+                 # Double-tap detected - start recording
+                 start_voice_recording()
+
+             last_esc_time = current_time
+
+         # Other hotkeys (ESC + arrow keys)
+         elif key == keyboard.Key.down and keyboard.Key.esc in current_keys:
              logger.info("Capturing screenshot...")
              capture_screenshot()
          elif key == keyboard.Key.up and keyboard.Key.esc in current_keys:
              logger.info("Sending all screenshots...")
              send_screenshots()
+         elif key == keyboard.Key.right and keyboard.Key.esc in current_keys:
+             logger.info("Sending clipboard text to Code tab...")
+             send_clipboard_text()
      except AttributeError:
          pass


  def on_release(key):
+     global is_recording
+
      try:
          current_keys.remove(key)
      except KeyError:
          pass

+     # Stop voice recording when ESC is released
+     if is_recording and key == keyboard.Key.esc:
+         # Run transcription in background thread to not block keyboard listener
+         threading.Thread(target=stop_voice_recording_and_send, daemon=True).start()
+

  # ============ CLI Commands ============
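
The transcribe_audio() function added above is a thin wrapper around the faster-whisper API. For reference, here is a minimal standalone sketch of that API, separate from this package; the "base" model size and the sample.wav file name are illustrative:

    from faster_whisper import WhisperModel

    # "base" is a small model; int8 on CPU keeps memory use and latency low
    model = WhisperModel("base", device="cpu", compute_type="int8")

    # transcribe() returns a lazy generator of segments plus metadata about the audio
    segments, info = model.transcribe("sample.wav", beam_size=5)
    print(f"Detected language: {info.language} (p={info.language_probability:.2f})")

    # Joining the segment texts yields the full transcript
    text = " ".join(segment.text.strip() for segment in segments)
    print(text)

Because the segments are generated lazily, transcription only happens as they are iterated, which is why the package joins them into a single string before handing the text on.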
 
@@ -336,6 +569,8 @@ def cmd_start(args):
      logger.info(f"Server: {server_mode} ({API_URL})")
      logger.info("Press ESC + Down to capture a screenshot.")
      logger.info("Press ESC + Up to send all stored screenshots.")
+     logger.info("Press ESC + Right to send clipboard text to Code tab.")
+     logger.info("Double-tap ESC (hold on 2nd) to record voice and send transcription.")
      if not is_daemon:
          logger.info("Running... (Press Ctrl + C to exit)")

@@ -2,14 +2,19 @@ from setuptools import setup, find_packages

  setup(
      name="ai-screenshooter",
-     version="1.3.0",
+     version="1.7.0",
      packages=find_packages(),
      py_modules=["ai_screenshot"],
      install_requires=[
          "pynput",
          "requests",
          "Pillow",
-         "pygetwindow"
+         "pygetwindow",
+         "pyperclip",
+         "sounddevice",
+         "soundfile",
+         "numpy",
+         "faster-whisper"
      ],
      entry_points={
          "console_scripts": [
@@ -1,4 +0,0 @@
- pynput
- requests
- Pillow
- pygetwindow