screen-vision 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. screen_vision-0.2.1/PKG-INFO +22 -0
  2. screen_vision-0.2.1/README.md +121 -0
  3. screen_vision-0.2.1/pyproject.toml +40 -0
  4. screen_vision-0.2.1/setup.cfg +4 -0
  5. screen_vision-0.2.1/src/screen_vision/__init__.py +1 -0
  6. screen_vision-0.2.1/src/screen_vision/__main__.py +4 -0
  7. screen_vision-0.2.1/src/screen_vision/analyze.py +149 -0
  8. screen_vision-0.2.1/src/screen_vision/audio.py +211 -0
  9. screen_vision-0.2.1/src/screen_vision/camera_bridge.py +541 -0
  10. screen_vision-0.2.1/src/screen_vision/capture.py +299 -0
  11. screen_vision-0.2.1/src/screen_vision/config.py +100 -0
  12. screen_vision-0.2.1/src/screen_vision/context.py +263 -0
  13. screen_vision-0.2.1/src/screen_vision/ocr.py +366 -0
  14. screen_vision-0.2.1/src/screen_vision/security.py +260 -0
  15. screen_vision-0.2.1/src/screen_vision/server.py +1061 -0
  16. screen_vision-0.2.1/src/screen_vision/understanding.py +277 -0
  17. screen_vision-0.2.1/src/screen_vision/video.py +349 -0
  18. screen_vision-0.2.1/src/screen_vision/watcher.py +279 -0
  19. screen_vision-0.2.1/src/screen_vision.egg-info/PKG-INFO +22 -0
  20. screen_vision-0.2.1/src/screen_vision.egg-info/SOURCES.txt +36 -0
  21. screen_vision-0.2.1/src/screen_vision.egg-info/dependency_links.txt +1 -0
  22. screen_vision-0.2.1/src/screen_vision.egg-info/entry_points.txt +2 -0
  23. screen_vision-0.2.1/src/screen_vision.egg-info/requires.txt +18 -0
  24. screen_vision-0.2.1/src/screen_vision.egg-info/top_level.txt +1 -0
  25. screen_vision-0.2.1/tests/test_analyze.py +121 -0
  26. screen_vision-0.2.1/tests/test_audio.py +201 -0
  27. screen_vision-0.2.1/tests/test_camera_bridge.py +244 -0
  28. screen_vision-0.2.1/tests/test_capture.py +243 -0
  29. screen_vision-0.2.1/tests/test_config.py +81 -0
  30. screen_vision-0.2.1/tests/test_context.py +227 -0
  31. screen_vision-0.2.1/tests/test_ocr.py +173 -0
  32. screen_vision-0.2.1/tests/test_phone_tools.py +334 -0
  33. screen_vision-0.2.1/tests/test_security.py +438 -0
  34. screen_vision-0.2.1/tests/test_server.py +286 -0
  35. screen_vision-0.2.1/tests/test_understand_tool.py +361 -0
  36. screen_vision-0.2.1/tests/test_understanding.py +207 -0
  37. screen_vision-0.2.1/tests/test_video.py +217 -0
  38. screen_vision-0.2.1/tests/test_watcher.py +248 -0
@@ -0,0 +1,22 @@
1
+ Metadata-Version: 2.4
2
+ Name: screen-vision
3
+ Version: 0.2.1
4
+ Summary: MCP server that gives Claude Code the ability to see your screen
5
+ Project-URL: Repository, https://github.com/avicuna/screen-vision
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: mcp[cli]>=1.0.0
8
+ Requires-Dist: mss>=9.0.0
9
+ Requires-Dist: Pillow>=10.0.0
10
+ Requires-Dist: numpy>=1.24.0
11
+ Requires-Dist: qrcode[pil]>=7.4.0
12
+ Requires-Dist: websockets>=12.0
13
+ Requires-Dist: httpx>=0.27.0
14
+ Provides-Extra: full
15
+ Requires-Dist: paddleocr>=2.7.0; extra == "full"
16
+ Requires-Dist: pytesseract>=0.3.10; extra == "full"
17
+ Requires-Dist: faster-whisper>=1.0.0; extra == "full"
18
+ Requires-Dist: sounddevice>=0.4.6; extra == "full"
19
+ Requires-Dist: opencv-python>=4.8.0; extra == "full"
20
+ Provides-Extra: test
21
+ Requires-Dist: pytest>=8.0.0; extra == "test"
22
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "test"
@@ -0,0 +1,121 @@
1
+ # Screen Vision MCP Server
2
+
3
+ > Give Claude Code the ability to see your screen
4
+
5
+ Screen Vision lets Claude capture screenshots, watch your screen in real-time with audio transcription, analyze video files, and read text via OCR. It runs locally as an MCP server — Claude sees what you see, when you ask.
6
+
7
+ ## Quick Start
8
+
9
+ ```bash
10
+ pip install screen-vision
11
+ ```
12
+
13
+ Then add to your Claude Code MCP config (`.mcp.json`):
14
+ ```json
15
+ {
16
+ "mcpServers": {
17
+ "screen-vision": {
18
+ "command": "python3",
19
+ "args": ["-m", "screen_vision"]
20
+ }
21
+ }
22
+ }
23
+ ```
24
+
25
+ **Other install methods:**
26
+ ```bash
27
+ # With all optional extras (OCR, audio transcription, video)
28
+ pip install "screen-vision[full]"
29
+
30
+ # Local development
31
+ pip install -e ".[full]"
32
+ ```
33
+
34
+ **Optional system deps** (not required — tools gracefully degrade without them):
35
+ ```bash
36
+ brew install tesseract # Enables OCR (read_screen_text)
37
+ brew install ffmpeg # Enables video analysis (analyze_video)
38
+ ```
39
+
40
+ ## What You Can Say
41
+
42
+ ```
43
+ "Take a screenshot of my screen" → capture_screen
44
+ "Capture the Chrome window" → capture_window
45
+ "Watch my screen for 1 minute" → watch_screen (with audio transcription)
46
+ "Analyze the video at ~/Downloads/demo.mp4" → analyze_video
47
+ "Read the text on my screen" → read_screen_text
48
+ "What window am I in?" → get_active_context (no screenshot)
49
+ "What's on my screen right now?" → understand_screen (AI analysis)
50
+ "Analyze this photo I AirDropped" → analyze_image
51
+ ```
52
+
53
+ ## Tools (10)
54
+
55
+ | Tool | What it does | Needs |
56
+ |------|-------------|-------|
57
+ | `capture_screen` | Full screen capture with delay + multi-monitor | — |
58
+ | `capture_region` | Capture a specific rectangular area | — |
59
+ | `capture_window` | Capture a window by title | — |
60
+ | `list_monitors` | List displays with resolutions | — |
61
+ | `get_active_context` | Window/cursor/monitor info (no image) | — |
62
+ | `read_screen_text` | OCR text extraction from screen | tesseract |
63
+ | `understand_screen` | AI-powered screen analysis | GenAI Gateway |
64
+ | `analyze_image` | Analyze a dropped/AirDropped image file | — |
65
+ | `watch_screen` | Watch screen with frame sampling + audio | ffmpeg (audio) |
66
+ | `analyze_video` | Extract keyframes from video files | ffmpeg |
67
+
68
+ ## Security
69
+
70
+ Screen Vision includes security controls for corporate environments:
71
+
72
+ - **PII/PCI scanning** — Detects credit card numbers, SSNs, phone numbers, email addresses in OCR text
73
+ - **App deny-list** — Blocks captures of Slack, Teams, Zoom, banking apps, password managers
74
+ - **Call detection** — Blocks captures during active audio calls
75
+ - **Rate limits** — 200 captures/session, 2s minimum interval, 5min max watch duration
76
+ - **Audit logs** — All captures logged to `~/.screen-vision/audit.log`
77
+
78
+ ## Dependencies
79
+
80
+ **Core** (always installed): `mcp[cli]`, `mss`, `Pillow`, `numpy`, `qrcode[pil]`, `websockets`, `httpx`
81
+
82
+ **Optional** (`pip install screen-vision[full]`):
83
+ - `pytesseract` — OCR (needs `brew install tesseract`)
84
+ - `faster-whisper` — Audio transcription
85
+ - `sounddevice` — Audio recording
86
+ - `opencv-python` — Video processing
87
+ - `paddleocr` — Alternative OCR engine
88
+
89
+ **Python 3.11+ required.**
90
+
91
+ ## Development
92
+
93
+ ```bash
94
+ pip install -e ".[full,test]"
95
+ pytest tests/ -v
96
+ ruff check src/
97
+ ```
98
+
99
+ ### Project Structure
100
+ ```
101
+ src/screen_vision/
102
+ ├── server.py # MCP server — 13 tools
103
+ ├── capture.py # Screen capture (mss)
104
+ ├── context.py # Window/cursor/monitor info
105
+ ├── ocr.py # OCR (pytesseract + paddleocr)
106
+ ├── security.py # PII/PCI scanner + app deny-list
107
+ ├── watcher.py # Screen watching + audio transcription
108
+ ├── video.py # Video keyframe extraction (ffmpeg)
109
+ ├── audio.py # Audio recording + Whisper transcription
110
+ ├── analyze.py # Image analysis
111
+ ├── understanding.py # AI screen analysis (GenAI Gateway)
112
+ └── config.py # Configuration
113
+ ```
114
+
115
+ ## Author
116
+
117
+ **Alex Vicuna** — [github.com/avicuna](https://github.com/avicuna)
118
+
119
+ ## Contributing
120
+
121
+ Issues and PRs welcome: https://github.com/avicuna/screen-vision
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "screen-vision"
7
+ version = "0.2.1"
8
+ description = "MCP server that gives Claude Code the ability to see your screen"
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "mcp[cli]>=1.0.0",
12
+ "mss>=9.0.0",
13
+ "Pillow>=10.0.0",
14
+ "numpy>=1.24.0",
15
+ "qrcode[pil]>=7.4.0",
16
+ "websockets>=12.0",
17
+ "httpx>=0.27.0",
18
+ ]
19
+
20
+ [project.optional-dependencies]
21
+ full = [
22
+ "paddleocr>=2.7.0",
23
+ "pytesseract>=0.3.10",
24
+ "faster-whisper>=1.0.0",
25
+ "sounddevice>=0.4.6",
26
+ "opencv-python>=4.8.0",
27
+ ]
28
+ test = [
29
+ "pytest>=8.0.0",
30
+ "pytest-asyncio>=0.23.0",
31
+ ]
32
+
33
+ [project.scripts]
34
+ screen-vision-mcp = "screen_vision.server:main"
35
+
36
+ [project.urls]
37
+ Repository = "https://github.com/avicuna/screen-vision"
38
+
39
+ [tool.setuptools.packages.find]
40
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
+ """Screen Vision MCP Server — see your screen from Claude Code."""
@@ -0,0 +1,4 @@
1
+ """Allow running as `python -m screen_vision`."""
2
from screen_vision.server import main

# Deliberately not wrapped in ``if __name__ == "__main__"``: this file exists
# only so that ``python -m screen_vision`` runs the same ``main`` that the
# ``screen-vision-mcp`` console script points at.
main()
@@ -0,0 +1,149 @@
1
+ """Image analysis for file-based inputs (AirDrop, file drop)."""
2
+
3
+ import base64
4
+ import io
5
+ import os
6
+ from dataclasses import dataclass
7
+ from datetime import datetime, UTC
8
+ from typing import Optional
9
+
10
+ from PIL import Image
11
+
12
+ from screen_vision.config import get_config
13
+ from screen_vision.ocr import run_ocr
14
+ from screen_vision.security import SecurityScanner
15
+
16
+
17
@dataclass
class AnalyzeResult:
    """Result of image analysis.

    Returned by ``analyze_image`` for both success and failure. On failure
    ``error`` carries a human-readable message, ``base64_image`` and
    ``ocr_text`` are empty strings and ``security_redactions`` is 0.
    """

    base64_image: str  # Base64 text of the JPEG-encoded image; "" on failure
    resolution: str  # "<width>x<height>", e.g., "1920x1080"; "0x0" if the image was never decoded
    source: str  # Origin label; analyze_image always sets "file"
    file_name: str  # Base name of the analysed path (directories stripped)
    ocr_text: str  # Text extracted by OCR; "" on failure
    timestamp: str  # ISO 8601 format, taken in UTC when the result is built
    security_redactions: int  # Count of findings with action "REDACT" (work mode only)
    error: Optional[str] = None  # None on success, otherwise the failure reason
29
+
30
+
31
def _failed_analysis(
    file_path: str, message: str, resolution: str = "0x0"
) -> AnalyzeResult:
    """Build the AnalyzeResult that reports *message* as the failure reason."""
    return AnalyzeResult(
        base64_image="",
        resolution=resolution,
        source="file",
        file_name=os.path.basename(file_path),
        ocr_text="",
        timestamp=datetime.now(UTC).isoformat(),
        security_redactions=0,
        error=message,
    )


def analyze_image(file_path: str, prompt: str = "") -> AnalyzeResult:
    """Analyze an image file with OCR and security scanning.

    The image is converted to RGB, downscaled so its longest side is at most
    2048 pixels, run through OCR and (in work mode only) through the security
    scanner, then re-encoded as a base64 JPEG with EXIF data emptied.

    Args:
        file_path: Path to the image file
        prompt: Optional analysis prompt (unused for now, reserved for future)

    Returns:
        AnalyzeResult with image data, OCR text, and security info. Failures
        are never raised from image processing: a missing file, a file larger
        than 50MB, a blocked security scan or any processing exception is
        reported through the ``error`` field of the returned result.

    NOTE(review): in work mode only the *number* of "REDACT" findings is
    reported; the OCR text and the image are returned as extracted. Confirm
    that redaction is applied by the caller if it is required.
    """
    config = get_config()

    # Validate file exists
    if not os.path.exists(file_path):
        return _failed_analysis(file_path, f"File not found: {file_path}")

    try:
        # Check file size (max 50MB). Done inside the try so that a file which
        # disappears after the existence check is reported, not raised.
        max_size_bytes = 50 * 1024 * 1024
        file_size = os.path.getsize(file_path)
        if file_size > max_size_bytes:
            return _failed_analysis(
                file_path,
                f"File size {file_size} bytes exceeds maximum {max_size_bytes} bytes",
            )

        # convert() fully loads the pixels and returns a new image, so the
        # underlying file handle can be released as soon as the block exits.
        with Image.open(file_path) as source_image:
            image = source_image.convert("RGB")

        # Resize if too large (max 2048px on longest side)
        max_dimension = 2048
        width, height = image.size
        if max(width, height) > max_dimension:
            # The shorter side is clamped to at least 1px: for extreme aspect
            # ratios the scaled value truncates to 0, which resize() rejects.
            if width > height:
                new_width = max_dimension
                new_height = max(1, int(height * (max_dimension / width)))
            else:
                new_height = max_dimension
                new_width = max(1, int(width * (max_dimension / height)))
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        resolution = f"{image.size[0]}x{image.size[1]}"

        # Run OCR
        ocr_text = run_ocr(image).text

        # Security scan (work mode only)
        security_redactions = 0
        if config.is_work_mode:
            scan_result = SecurityScanner(enabled=True).scan_text(ocr_text)

            # Refuse to return anything when the scanner asks for a block
            if scan_result.should_block:
                blocked = sum(1 for f in scan_result.findings if f.action == "BLOCK")
                return _failed_analysis(
                    file_path,
                    f"Security scan blocked: {blocked} sensitive items detected",
                    resolution=resolution,
                )

            # Count PII redactions
            security_redactions = sum(
                1 for f in scan_result.findings if f.action == "REDACT"
            )

        # Encode as base64 JPEG (quality comes from config, EXIF emptied)
        buffer = io.BytesIO()
        image.save(
            buffer, format="JPEG", quality=config.default_jpeg_quality, exif=b""
        )
        base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

        return AnalyzeResult(
            base64_image=base64_image,
            resolution=resolution,
            source="file",
            file_name=os.path.basename(file_path),
            ocr_text=ocr_text,
            timestamp=datetime.now(UTC).isoformat(),
            security_redactions=security_redactions,
            error=None,
        )

    except Exception as e:
        # Tool boundary: any decoding/OCR/encoding failure becomes an error result.
        return _failed_analysis(file_path, f"Error processing image: {str(e)}")
@@ -0,0 +1,211 @@
1
+ """Audio recording and transcription for Screen Vision.
2
+
3
+ Handles microphone recording, call detection, and speech-to-text transcription.
4
+ Gracefully degrades when optional dependencies (sounddevice, faster-whisper) are missing.
5
+ """
6
+
7
+ import subprocess
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ import numpy as np
12
+
13
+ # Try to import optional audio dependencies
14
+ try:
15
+ import sounddevice as sd
16
+ SOUNDDEVICE_AVAILABLE = True
17
+ except ImportError:
18
+ sd = None
19
+ SOUNDDEVICE_AVAILABLE = False
20
+
21
+ try:
22
+ from faster_whisper import WhisperModel
23
+ WHISPER_AVAILABLE = True
24
+ except ImportError:
25
+ WhisperModel = None
26
+ WHISPER_AVAILABLE = False
27
+
28
+
29
# Module-level cache: the Whisper model is built on first use and then shared,
# so repeated transcribe() calls do not reload it.
_whisper_model = None


def _get_whisper_model():
    """Return the shared Whisper model, constructing it on the first call."""
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model
    _whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
    return _whisper_model
38
+
39
+
40
@dataclass
class TranscriptSegment:
    """A segment of transcribed speech with timing information.

    AudioRecorder.transcribe() creates one instance per segment yielded by the
    Whisper model and fills in the text and both timestamps.
    """

    text: str  # Transcribed text of the segment, as returned by the model
    start_time: float  # seconds
    end_time: float  # seconds
    # Left as None by transcribe(); presumably set later by code that aligns
    # speech with captured frames — confirm against the caller.
    nearest_frame_index: int | None = None
48
+
49
+
50
# Call detection constants
# is_call_active() reports a call when a process name returned by
# _get_mic_using_processes() is exactly equal to one of these entries.
# NOTE(review): those names come from the first column of lsof output, and the
# lsof query filters on the prefixes "Teams" and "Chrome"; "Microsoft Teams"
# and "Google Chrome" may therefore never match — confirm on a real machine.
CALL_APPS = ["zoom.us", "Microsoft Teams", "Slack", "FaceTime", "Google Chrome"]
52
+
53
+
54
class AudioRecorder:
    """Records audio from microphone and transcribes with Whisper.

    Typical use is ``start()`` (blocking), then ``transcribe()``, then
    ``clear()`` to drop the captured samples.
    """

    def __init__(self, sample_rate: int = 16000):
        """Initialize audio recorder.

        Args:
            sample_rate: Audio sample rate in Hz (default 16000 for Whisper)
        """
        self.sample_rate = sample_rate
        self.buffer: np.ndarray | None = None
        self._recording = False

    def start(self, duration_seconds: float) -> None:
        """Record microphone audio for specified duration.

        Stores audio in self.buffer as a flattened mono float32 numpy array.
        This is a blocking call - it waits for the full duration.

        Args:
            duration_seconds: How long to record in seconds

        Raises:
            RuntimeError: If sounddevice is not installed
        """
        if not SOUNDDEVICE_AVAILABLE:
            raise RuntimeError(
                "sounddevice not installed. Install with: pip install sounddevice"
            )

        # Calculate number of frames needed
        frames = int(duration_seconds * self.sample_rate)

        self._recording = True
        try:
            recording = sd.rec(
                frames=frames,
                samplerate=self.sample_rate,
                channels=1,  # mono
                dtype="float32",
            )
            sd.wait()  # Block until the recording has finished
        finally:
            # Reset the flag even when the audio backend raises, so a failed
            # recording does not leave the recorder marked as busy.
            self._recording = False

        # Store as flattened mono array
        self.buffer = recording.flatten()

    def stop(self) -> None:
        """Mark the recorder as not recording.

        This is safe to call even if not recording. Because start() blocks
        until the recording is complete, this only resets the internal flag;
        it is provided for API consistency.
        """
        self._recording = False

    def transcribe(self) -> list[TranscriptSegment]:
        """Transcribe audio buffer using Whisper.

        Returns:
            List of TranscriptSegment objects with timestamped text.
            Empty list if Whisper is not installed or buffer is empty.

        Requires faster-whisper to be installed. Falls back gracefully if missing.
        """
        if not WHISPER_AVAILABLE:
            return []

        if self.buffer is None or len(self.buffer) == 0:
            return []

        # Get the singleton Whisper model (avoids reloading on every call)
        model = _get_whisper_model()

        # Segment-level timestamps are sufficient, so word timestamps stay off
        segments_iter, _ = model.transcribe(
            self.buffer,
            language="en",
            word_timestamps=False,
        )

        # Convert to our TranscriptSegment format
        return [
            TranscriptSegment(
                text=segment.text,
                start_time=segment.start,
                end_time=segment.end,
            )
            for segment in segments_iter
        ]

    def clear(self) -> None:
        """Clear the audio buffer (zero-persistence)."""
        self.buffer = None
151
+
152
+
153
def is_call_active() -> bool:
    """Report whether a known call application shows up in the process check.

    Returns:
        True if any process returned by _get_mic_using_processes() has a name
        listed in CALL_APPS, False otherwise.

    Any exception raised while checking is swallowed and treated as
    "no call active".
    """
    try:
        for process in _get_mic_using_processes():
            if process["name"] in CALL_APPS:
                return True
        return False
    except Exception:
        # Fail gracefully - assume no call active
        return False
167
+
168
+
169
def _parse_lsof_output(stdout: str) -> list[dict[str, Any]]:
    """Convert lsof's tabular output into one {"name", "pid"} dict per PID."""
    processes: list[dict[str, Any]] = []
    seen_pids: set[str] = set()

    for line in stdout.strip().split("\n")[1:]:  # Skip header
        parts = line.split()
        if len(parts) < 2:
            continue
        command, pid = parts[0], parts[1]
        # lsof prints one row per open file; keep a single entry per process.
        if pid in seen_pids:
            continue
        seen_pids.add(pid)
        processes.append({"name": command, "pid": int(pid) if pid.isdigit() else 0})

    return processes


def _get_mic_using_processes() -> list[dict[str, Any]]:
    """List running processes whose command name matches a known call app.

    Runs ``lsof`` filtered by command-name prefix (zoom, Teams, Slack,
    FaceTime, Chrome) and returns each matching process once.

    NOTE(review): the query selects processes by name only; nothing here
    inspects audio devices, so a match means "the app is running", not
    necessarily "the app is using the microphone" — confirm this is intended.

    Returns:
        List of dicts with "name" and "pid" keys. Empty if lsof is missing,
        times out, fails, or finds nothing; errors are not propagated.
    """
    try:
        result = subprocess.run(
            ["lsof", "-c", "zoom", "-c", "Teams", "-c", "Slack", "-c", "FaceTime", "-c", "Chrome"],
            capture_output=True,
            text=True,
            timeout=5
        )
    except (subprocess.SubprocessError, OSError, ValueError):
        # lsof not available or failed - return empty list
        return []

    # The exit status is deliberately ignored: lsof exits with 1 whenever any
    # single -c selector matches nothing, even though it still prints the rows
    # for the selectors that did match.
    if not result.stdout:
        return []
    return _parse_lsof_output(result.stdout)