screen-vision 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- screen_vision-0.2.1/PKG-INFO +22 -0
- screen_vision-0.2.1/README.md +121 -0
- screen_vision-0.2.1/pyproject.toml +40 -0
- screen_vision-0.2.1/setup.cfg +4 -0
- screen_vision-0.2.1/src/screen_vision/__init__.py +1 -0
- screen_vision-0.2.1/src/screen_vision/__main__.py +4 -0
- screen_vision-0.2.1/src/screen_vision/analyze.py +149 -0
- screen_vision-0.2.1/src/screen_vision/audio.py +211 -0
- screen_vision-0.2.1/src/screen_vision/camera_bridge.py +541 -0
- screen_vision-0.2.1/src/screen_vision/capture.py +299 -0
- screen_vision-0.2.1/src/screen_vision/config.py +100 -0
- screen_vision-0.2.1/src/screen_vision/context.py +263 -0
- screen_vision-0.2.1/src/screen_vision/ocr.py +366 -0
- screen_vision-0.2.1/src/screen_vision/security.py +260 -0
- screen_vision-0.2.1/src/screen_vision/server.py +1061 -0
- screen_vision-0.2.1/src/screen_vision/understanding.py +277 -0
- screen_vision-0.2.1/src/screen_vision/video.py +349 -0
- screen_vision-0.2.1/src/screen_vision/watcher.py +279 -0
- screen_vision-0.2.1/src/screen_vision.egg-info/PKG-INFO +22 -0
- screen_vision-0.2.1/src/screen_vision.egg-info/SOURCES.txt +36 -0
- screen_vision-0.2.1/src/screen_vision.egg-info/dependency_links.txt +1 -0
- screen_vision-0.2.1/src/screen_vision.egg-info/entry_points.txt +2 -0
- screen_vision-0.2.1/src/screen_vision.egg-info/requires.txt +18 -0
- screen_vision-0.2.1/src/screen_vision.egg-info/top_level.txt +1 -0
- screen_vision-0.2.1/tests/test_analyze.py +121 -0
- screen_vision-0.2.1/tests/test_audio.py +201 -0
- screen_vision-0.2.1/tests/test_camera_bridge.py +244 -0
- screen_vision-0.2.1/tests/test_capture.py +243 -0
- screen_vision-0.2.1/tests/test_config.py +81 -0
- screen_vision-0.2.1/tests/test_context.py +227 -0
- screen_vision-0.2.1/tests/test_ocr.py +173 -0
- screen_vision-0.2.1/tests/test_phone_tools.py +334 -0
- screen_vision-0.2.1/tests/test_security.py +438 -0
- screen_vision-0.2.1/tests/test_server.py +286 -0
- screen_vision-0.2.1/tests/test_understand_tool.py +361 -0
- screen_vision-0.2.1/tests/test_understanding.py +207 -0
- screen_vision-0.2.1/tests/test_video.py +217 -0
- screen_vision-0.2.1/tests/test_watcher.py +248 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: screen-vision
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: MCP server that gives Claude Code the ability to see your screen
|
|
5
|
+
Project-URL: Repository, https://github.com/avicuna/screen-vision
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: mcp[cli]>=1.0.0
|
|
8
|
+
Requires-Dist: mss>=9.0.0
|
|
9
|
+
Requires-Dist: Pillow>=10.0.0
|
|
10
|
+
Requires-Dist: numpy>=1.24.0
|
|
11
|
+
Requires-Dist: qrcode[pil]>=7.4.0
|
|
12
|
+
Requires-Dist: websockets>=12.0
|
|
13
|
+
Requires-Dist: httpx>=0.27.0
|
|
14
|
+
Provides-Extra: full
|
|
15
|
+
Requires-Dist: paddleocr>=2.7.0; extra == "full"
|
|
16
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "full"
|
|
17
|
+
Requires-Dist: faster-whisper>=1.0.0; extra == "full"
|
|
18
|
+
Requires-Dist: sounddevice>=0.4.6; extra == "full"
|
|
19
|
+
Requires-Dist: opencv-python>=4.8.0; extra == "full"
|
|
20
|
+
Provides-Extra: test
|
|
21
|
+
Requires-Dist: pytest>=8.0.0; extra == "test"
|
|
22
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "test"
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Screen Vision MCP Server
|
|
2
|
+
|
|
3
|
+
> Give Claude Code the ability to see your screen
|
|
4
|
+
|
|
5
|
+
Screen Vision lets Claude capture screenshots, watch your screen in real-time with audio transcription, analyze video files, and read text via OCR. It runs locally as an MCP server — Claude sees what you see, when you ask.
|
|
6
|
+
|
|
7
|
+
## Quick Start
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install screen-vision
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Then add to your Claude Code MCP config (`.mcp.json`):
|
|
14
|
+
```json
|
|
15
|
+
{
|
|
16
|
+
"mcpServers": {
|
|
17
|
+
"screen-vision": {
|
|
18
|
+
"command": "python3",
|
|
19
|
+
"args": ["-m", "screen_vision"]
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
**Other install methods:**
|
|
26
|
+
```bash
|
|
27
|
+
# From PyPI with all optional extras
|
|
28
|
+
pip install "screen-vision[full]"
|
|
29
|
+
|
|
30
|
+
# Local development
|
|
31
|
+
pip install -e ".[full]"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Optional system deps** (not required — tools gracefully degrade without them):
|
|
35
|
+
```bash
|
|
36
|
+
brew install tesseract # Enables OCR (read_screen_text)
|
|
37
|
+
brew install ffmpeg # Enables video analysis (analyze_video)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## What You Can Say
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
"Take a screenshot of my screen" → capture_screen
|
|
44
|
+
"Capture the Chrome window" → capture_window
|
|
45
|
+
"Watch my screen for 1 minute" → watch_screen (with audio transcription)
|
|
46
|
+
"Analyze the video at ~/Downloads/demo.mp4" → analyze_video
|
|
47
|
+
"Read the text on my screen" → read_screen_text
|
|
48
|
+
"What window am I in?" → get_active_context (no screenshot)
|
|
49
|
+
"What's on my screen right now?" → understand_screen (AI analysis)
|
|
50
|
+
"Analyze this photo I AirDropped" → analyze_image
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Tools (10)
|
|
54
|
+
|
|
55
|
+
| Tool | What it does | Needs |
|
|
56
|
+
|------|-------------|-------|
|
|
57
|
+
| `capture_screen` | Full screen capture with delay + multi-monitor | — |
|
|
58
|
+
| `capture_region` | Capture a specific rectangular area | — |
|
|
59
|
+
| `capture_window` | Capture a window by title | — |
|
|
60
|
+
| `list_monitors` | List displays with resolutions | — |
|
|
61
|
+
| `get_active_context` | Window/cursor/monitor info (no image) | — |
|
|
62
|
+
| `read_screen_text` | OCR text extraction from screen | tesseract |
|
|
63
|
+
| `understand_screen` | AI-powered screen analysis | GenAI Gateway |
|
|
64
|
+
| `analyze_image` | Analyze a dropped/AirDropped image file | — |
|
|
65
|
+
| `watch_screen` | Watch screen with frame sampling + audio | ffmpeg (audio) |
|
|
66
|
+
| `analyze_video` | Extract keyframes from video files | ffmpeg |
|
|
67
|
+
|
|
68
|
+
## Security
|
|
69
|
+
|
|
70
|
+
Screen Vision includes security controls for corporate environments:
|
|
71
|
+
|
|
72
|
+
- **PII/PCI scanning** — Detects credit card numbers, SSNs, phone numbers, email addresses in OCR text
|
|
73
|
+
- **App deny-list** — Blocks captures of Slack, Teams, Zoom, banking apps, password managers
|
|
74
|
+
- **Call detection** — Blocks captures during active audio calls
|
|
75
|
+
- **Rate limits** — 200 captures/session, 2s minimum interval, 5min max watch duration
|
|
76
|
+
- **Audit logs** — All captures logged to `~/.screen-vision/audit.log`
|
|
77
|
+
|
|
78
|
+
## Dependencies
|
|
79
|
+
|
|
80
|
+
**Core** (always installed): `mcp[cli]`, `mss`, `Pillow`, `numpy`, `httpx`
|
|
81
|
+
|
|
82
|
+
**Optional** (`pip install screen-vision[full]`):
|
|
83
|
+
- `pytesseract` — OCR (needs `brew install tesseract`)
|
|
84
|
+
- `faster-whisper` — Audio transcription
|
|
85
|
+
- `sounddevice` — Audio recording
|
|
86
|
+
- `opencv-python` — Video processing
|
|
87
|
+
- `paddleocr` — Alternative OCR engine
|
|
88
|
+
|
|
89
|
+
**Python 3.11+ required.**
|
|
90
|
+
|
|
91
|
+
## Development
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install -e ".[full,test]"
|
|
95
|
+
pytest tests/ -v
|
|
96
|
+
ruff check src/
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Project Structure
|
|
100
|
+
```
|
|
101
|
+
src/screen_vision/
|
|
102
|
+
├── server.py # MCP server — tool definitions (see Tools table above)
|
|
103
|
+
├── capture.py # Screen capture (mss)
|
|
104
|
+
├── context.py # Window/cursor/monitor info
|
|
105
|
+
├── ocr.py # OCR (pytesseract + paddleocr)
|
|
106
|
+
├── security.py # PII/PCI scanner + app deny-list
|
|
107
|
+
├── watcher.py # Screen watching + audio transcription
|
|
108
|
+
├── video.py # Video keyframe extraction (ffmpeg)
|
|
109
|
+
├── audio.py # Audio recording + Whisper transcription
|
|
110
|
+
├── analyze.py # Image analysis
|
|
111
|
+
├── understanding.py # AI screen analysis (GenAI Gateway)
|
|
112
|
+
└── config.py # Configuration
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Author
|
|
116
|
+
|
|
117
|
+
**Alex Vicuna** — [github.com/avicuna](https://github.com/avicuna)
|
|
118
|
+
|
|
119
|
+
## Contributing
|
|
120
|
+
|
|
121
|
+
Issues and PRs welcome: https://github.com/avicuna/screen-vision
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "screen-vision"
|
|
7
|
+
version = "0.2.1"
|
|
8
|
+
description = "MCP server that gives Claude Code the ability to see your screen"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"mcp[cli]>=1.0.0",
|
|
12
|
+
"mss>=9.0.0",
|
|
13
|
+
"Pillow>=10.0.0",
|
|
14
|
+
"numpy>=1.24.0",
|
|
15
|
+
"qrcode[pil]>=7.4.0",
|
|
16
|
+
"websockets>=12.0",
|
|
17
|
+
"httpx>=0.27.0",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.optional-dependencies]
|
|
21
|
+
full = [
|
|
22
|
+
"paddleocr>=2.7.0",
|
|
23
|
+
"pytesseract>=0.3.10",
|
|
24
|
+
"faster-whisper>=1.0.0",
|
|
25
|
+
"sounddevice>=0.4.6",
|
|
26
|
+
"opencv-python>=4.8.0",
|
|
27
|
+
]
|
|
28
|
+
test = [
|
|
29
|
+
"pytest>=8.0.0",
|
|
30
|
+
"pytest-asyncio>=0.23.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
screen-vision-mcp = "screen_vision.server:main"
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Repository = "https://github.com/avicuna/screen-vision"
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["src"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Screen Vision MCP Server — see your screen from Claude Code."""
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Image analysis for file-based inputs (AirDrop, file drop)."""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import io
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import datetime, UTC
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from PIL import Image
|
|
11
|
+
|
|
12
|
+
from screen_vision.config import get_config
|
|
13
|
+
from screen_vision.ocr import run_ocr
|
|
14
|
+
from screen_vision.security import SecurityScanner
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
|
|
18
|
+
class AnalyzeResult:
|
|
19
|
+
"""Result of image analysis."""
|
|
20
|
+
|
|
21
|
+
base64_image: str
|
|
22
|
+
resolution: str # e.g., "1920x1080"
|
|
23
|
+
source: str # "file"
|
|
24
|
+
file_name: str
|
|
25
|
+
ocr_text: str
|
|
26
|
+
timestamp: str # ISO 8601 format
|
|
27
|
+
security_redactions: int # Count of PII redactions (work mode only)
|
|
28
|
+
error: Optional[str] = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _error_result(file_path: str, error: str, resolution: str = "0x0") -> AnalyzeResult:
    """Build an empty AnalyzeResult that carries only an error message."""
    return AnalyzeResult(
        base64_image="",
        resolution=resolution,
        source="file",
        file_name=os.path.basename(file_path),
        ocr_text="",
        timestamp=datetime.now(UTC).isoformat(),
        security_redactions=0,
        error=error,
    )


def analyze_image(file_path: str, prompt: str = "") -> AnalyzeResult:
    """Analyze an image file with OCR and security scanning.

    Args:
        file_path: Path to the image file
        prompt: Optional analysis prompt (unused for now, reserved for future)

    Returns:
        AnalyzeResult with image data, OCR text, and security info.
        This function does not raise: all failures (missing file, oversized
        file, decode errors, security blocks) are reported via
        ``AnalyzeResult.error``.
    """
    config = get_config()

    # Validate file exists
    if not os.path.exists(file_path):
        return _error_result(file_path, f"File not found: {file_path}")

    # Check file size (max 50MB)
    max_size_bytes = 50 * 1024 * 1024
    file_size = os.path.getsize(file_path)
    if file_size > max_size_bytes:
        return _error_result(
            file_path,
            f"File size {file_size} bytes exceeds maximum {max_size_bytes} bytes",
        )

    try:
        # Open and convert to RGB (JPEG output requires RGB)
        image = Image.open(file_path)
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Resize if too large (max 2048px on longest side), preserving aspect ratio
        max_dimension = 2048
        width, height = image.size
        if max(width, height) > max_dimension:
            if width > height:
                new_width = max_dimension
                new_height = int(height * (max_dimension / width))
            else:
                new_height = max_dimension
                new_width = int(width * (max_dimension / height))
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # Run OCR
        ocr_result = run_ocr(image)
        ocr_text = ocr_result.text

        # Security scan (work mode only)
        security_redactions = 0
        if config.is_work_mode:
            scanner = SecurityScanner(enabled=True)
            scan_result = scanner.scan_text(ocr_text)

            # Block if PCI or secrets detected
            if scan_result.should_block:
                blocked = len([f for f in scan_result.findings if f.action == "BLOCK"])
                return _error_result(
                    file_path,
                    f"Security scan blocked: {blocked} sensitive items detected",
                    resolution=f"{image.size[0]}x{image.size[1]}",
                )

            # Count PII redactions
            security_redactions = len(
                [f for f in scan_result.findings if f.action == "REDACT"]
            )

        # Encode as base64 JPEG (quality from config, EXIF stripped)
        buffer = io.BytesIO()
        image.save(
            buffer, format="JPEG", quality=config.default_jpeg_quality, exif=b""
        )
        buffer.seek(0)
        base64_image = base64.b64encode(buffer.read()).decode("utf-8")

        return AnalyzeResult(
            base64_image=base64_image,
            resolution=f"{image.size[0]}x{image.size[1]}",
            source="file",
            file_name=os.path.basename(file_path),
            ocr_text=ocr_text,
            timestamp=datetime.now(UTC).isoformat(),
            security_redactions=security_redactions,
            error=None,
        )

    except Exception as e:
        return _error_result(file_path, f"Error processing image: {str(e)}")
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""Audio recording and transcription for Screen Vision.
|
|
2
|
+
|
|
3
|
+
Handles microphone recording, call detection, and speech-to-text transcription.
|
|
4
|
+
Gracefully degrades when optional dependencies (sounddevice, faster-whisper) are missing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import subprocess
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
# Try to import optional audio dependencies
|
|
14
|
+
try:
|
|
15
|
+
import sounddevice as sd
|
|
16
|
+
SOUNDDEVICE_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
sd = None
|
|
19
|
+
SOUNDDEVICE_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
from faster_whisper import WhisperModel
|
|
23
|
+
WHISPER_AVAILABLE = True
|
|
24
|
+
except ImportError:
|
|
25
|
+
WhisperModel = None
|
|
26
|
+
WHISPER_AVAILABLE = False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Lazy singleton for Whisper model to avoid reloading on every transcribe() call
_whisper_model = None


def _get_whisper_model():
    """Return the shared Whisper model, creating it on first use."""
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model
    # "tiny" on CPU with int8 quantization keeps startup and memory cost low.
    _whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
    return _whisper_model
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
|
|
41
|
+
class TranscriptSegment:
|
|
42
|
+
"""A segment of transcribed speech with timing information."""
|
|
43
|
+
|
|
44
|
+
text: str
|
|
45
|
+
start_time: float # seconds
|
|
46
|
+
end_time: float # seconds
|
|
47
|
+
nearest_frame_index: int | None = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Call detection constants
|
|
51
|
+
CALL_APPS = ["zoom.us", "Microsoft Teams", "Slack", "FaceTime", "Google Chrome"]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class AudioRecorder:
|
|
55
|
+
"""Records audio from microphone and transcribes with Whisper."""
|
|
56
|
+
|
|
57
|
+
def __init__(self, sample_rate: int = 16000):
|
|
58
|
+
"""Initialize audio recorder.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
sample_rate: Audio sample rate in Hz (default 16000 for Whisper)
|
|
62
|
+
"""
|
|
63
|
+
self.sample_rate = sample_rate
|
|
64
|
+
self.buffer: np.ndarray | None = None
|
|
65
|
+
self._recording = False
|
|
66
|
+
|
|
67
|
+
def start(self, duration_seconds: float) -> None:
|
|
68
|
+
"""Record microphone audio for specified duration.
|
|
69
|
+
|
|
70
|
+
Args:
|
|
71
|
+
duration_seconds: How long to record in seconds
|
|
72
|
+
|
|
73
|
+
Raises:
|
|
74
|
+
RuntimeError: If sounddevice is not installed
|
|
75
|
+
|
|
76
|
+
Stores audio in self.buffer as mono float32 numpy array.
|
|
77
|
+
This is a blocking call - it waits for the full duration.
|
|
78
|
+
"""
|
|
79
|
+
if not SOUNDDEVICE_AVAILABLE:
|
|
80
|
+
raise RuntimeError(
|
|
81
|
+
"sounddevice not installed. Install with: pip install sounddevice"
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Calculate number of frames needed
|
|
85
|
+
frames = int(duration_seconds * self.sample_rate)
|
|
86
|
+
|
|
87
|
+
# Record audio - blocks until complete
|
|
88
|
+
self._recording = True
|
|
89
|
+
recording = sd.rec(
|
|
90
|
+
frames=frames,
|
|
91
|
+
samplerate=self.sample_rate,
|
|
92
|
+
channels=1, # mono
|
|
93
|
+
dtype='float32'
|
|
94
|
+
)
|
|
95
|
+
sd.wait() # Wait for recording to complete
|
|
96
|
+
self._recording = False
|
|
97
|
+
|
|
98
|
+
# Store as flattened mono array
|
|
99
|
+
self.buffer = recording.flatten()
|
|
100
|
+
|
|
101
|
+
def stop(self) -> None:
|
|
102
|
+
"""Stop recording if active.
|
|
103
|
+
|
|
104
|
+
This is safe to call even if not recording.
|
|
105
|
+
"""
|
|
106
|
+
if self._recording:
|
|
107
|
+
# In blocking mode, this doesn't do much, but provided for API consistency
|
|
108
|
+
self._recording = False
|
|
109
|
+
|
|
110
|
+
def transcribe(self) -> list[TranscriptSegment]:
|
|
111
|
+
"""Transcribe audio buffer using Whisper.
|
|
112
|
+
|
|
113
|
+
Returns:
|
|
114
|
+
List of TranscriptSegment objects with timestamped text.
|
|
115
|
+
Empty list if Whisper is not installed or buffer is empty.
|
|
116
|
+
|
|
117
|
+
Requires faster-whisper to be installed. Falls back gracefully if missing.
|
|
118
|
+
"""
|
|
119
|
+
if not WHISPER_AVAILABLE:
|
|
120
|
+
return []
|
|
121
|
+
|
|
122
|
+
if self.buffer is None or len(self.buffer) == 0:
|
|
123
|
+
return []
|
|
124
|
+
|
|
125
|
+
# Get the singleton Whisper model (avoids reloading on every call)
|
|
126
|
+
model = _get_whisper_model()
|
|
127
|
+
|
|
128
|
+
# Transcribe with word-level timestamps
|
|
129
|
+
segments_iter, _ = model.transcribe(
|
|
130
|
+
self.buffer,
|
|
131
|
+
language="en",
|
|
132
|
+
word_timestamps=False # Segment-level is sufficient
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# Convert to our TranscriptSegment format
|
|
136
|
+
segments = []
|
|
137
|
+
for segment in segments_iter:
|
|
138
|
+
segments.append(
|
|
139
|
+
TranscriptSegment(
|
|
140
|
+
text=segment.text,
|
|
141
|
+
start_time=segment.start,
|
|
142
|
+
end_time=segment.end
|
|
143
|
+
)
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
return segments
|
|
147
|
+
|
|
148
|
+
def clear(self) -> None:
|
|
149
|
+
"""Clear the audio buffer (zero-persistence)."""
|
|
150
|
+
self.buffer = None
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def is_call_active() -> bool:
    """Check if any call application is using the microphone.

    Returns:
        True if a known call app is using the mic, False otherwise.

    Gracefully returns False if process checking fails.
    """
    try:
        return any(
            entry["name"] in CALL_APPS
            for entry in _get_mic_using_processes()
        )
    except Exception:
        # Fail gracefully - assume no call active
        return False
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _get_mic_using_processes() -> list[dict[str, Any]]:
|
|
170
|
+
"""Get processes currently using the microphone.
|
|
171
|
+
|
|
172
|
+
Returns:
|
|
173
|
+
List of dicts with "name" and "pid" keys.
|
|
174
|
+
|
|
175
|
+
Raises:
|
|
176
|
+
subprocess.SubprocessError: If lsof command fails
|
|
177
|
+
"""
|
|
178
|
+
try:
|
|
179
|
+
# On macOS, check which processes are accessing audio devices
|
|
180
|
+
# Using lsof to find processes with audio file descriptors
|
|
181
|
+
result = subprocess.run(
|
|
182
|
+
["lsof", "-c", "zoom", "-c", "Teams", "-c", "Slack", "-c", "FaceTime", "-c", "Chrome"],
|
|
183
|
+
capture_output=True,
|
|
184
|
+
text=True,
|
|
185
|
+
timeout=5
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
processes = []
|
|
189
|
+
if result.returncode == 0 and result.stdout:
|
|
190
|
+
# Parse lsof output
|
|
191
|
+
lines = result.stdout.strip().split('\n')
|
|
192
|
+
seen_pids = set()
|
|
193
|
+
|
|
194
|
+
for line in lines[1:]: # Skip header
|
|
195
|
+
parts = line.split()
|
|
196
|
+
if len(parts) >= 2:
|
|
197
|
+
command = parts[0]
|
|
198
|
+
pid = parts[1]
|
|
199
|
+
|
|
200
|
+
if pid not in seen_pids:
|
|
201
|
+
seen_pids.add(pid)
|
|
202
|
+
processes.append({
|
|
203
|
+
"name": command,
|
|
204
|
+
"pid": int(pid) if pid.isdigit() else 0
|
|
205
|
+
})
|
|
206
|
+
|
|
207
|
+
return processes
|
|
208
|
+
|
|
209
|
+
except (subprocess.SubprocessError, FileNotFoundError, ValueError):
|
|
210
|
+
# lsof not available or failed - return empty list
|
|
211
|
+
return []
|