iflow-mcp_jhead_macos-screen-mcp 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ env/
26
+ ENV/
27
+
28
+ # IDE
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
33
+
34
+ # Logs
35
+ *.log
36
+ logs/
37
+ server.log
38
+ mcp_server.log
39
+
40
+ # Project specific
41
+ *.png
42
+
43
+ # OS specific
44
+ .DS_Store
45
+ Thumbs.db
@@ -0,0 +1,70 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ ## [Unreleased]
6
+
7
+ ### Added
8
+
9
+ - Initial project setup
10
+ - Basic project structure
11
+ - Dependencies configuration
12
+ - Window management module
13
+ - MCP server implementation
14
+ - Screenshot capture functionality
15
+ - Base64 encoding option for image responses
16
+ - Search by owner name option with search_in_owner parameter
17
+ - CORS support for MCP server
18
+ - Health check endpoint for MCP server
19
+ - External connection support (0.0.0.0 binding)
20
+ - Enhanced logging configuration
21
+ - Keyboard control functionality
22
+ - Added KeyboardManager class for handling keyboard input
23
+ - Added send_key tool for sending individual key presses with modifiers
24
+ - Added type_text tool for typing sequences of text
25
+ - Support for common keys and modifier keys (command, shift, control, option)
26
+
27
+ ### Changed
28
+
29
+ - Renamed module from `mcp-window` to `macos-screen-mcp` to better reflect its functionality
30
+ - Updated all documentation and configuration files to use the new module name
31
+ - Enhanced window search to include application name (owner) in search criteria
32
+ - Improved window search to prioritize exact matches in owner field
33
+ - Fixed screenshot capture by using correct Quartz methods for image dimensions
34
+ - Fixed screenshot capture by using correct Quartz methods for data provider access
35
+ - Fixed screenshot capture to only capture the target window by using window bounds
36
+ - Fixed screenshot capture to handle Retina displays by using kCGWindowListOptionIncludingWindow
37
+ - Fixed screenshot capture color accuracy by properly handling BGRA to RGBA conversion
38
+ - Updated server configuration for MCP compatibility
39
+ - Added numpy and python-jose dependencies for enhanced functionality
40
+
41
+ ### Fixed
42
+
43
+ - Fixed incomplete CORS middleware configuration by adding missing allow_headers parameter
44
+ - Fixed truncated logging message in capture_window_screenshot function
45
+ - Fixed uvicorn server configuration in `__main__.py` to include all necessary parameters
46
+ - Fixed MCP server initialization by using correct initialize() method instead of non-existent wait_for_initialization()
47
+ - Added proper error handling for initialization failures
48
+
49
+ ## [1.0.0] - 2024-04-06
50
+
51
+ ### Added
52
+
53
+ - Initial implementation of window screenshot capture functionality
54
+ - Window management and search capabilities
55
+ - Health check endpoint
56
+ - Logging system with file and console output
57
+ - MCP SDK integration for protocol compliance
58
+
59
+ ### Changed
60
+
61
+ - Refactored server to implement MCP protocol using official SDK
62
+ - Updated API endpoints to follow MCP specifications
63
+ - Improved error handling and logging
64
+ - Standardized base64 encoding for image responses
65
+
66
+ ### Fixed
67
+
68
+ - Window screenshot capture reliability
69
+ - Error handling for non-existent windows
70
+ - Base64 encoding consistency
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Justin Head
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,129 @@
1
+ Metadata-Version: 2.4
2
+ Name: iflow-mcp_jhead_macos-screen-mcp
3
+ Version: 1.0.0
4
+ Summary: MCP server for capturing window screenshots and controlling macOS windows
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.8
8
+ Requires-Dist: fastapi>=0.104.0
9
+ Requires-Dist: mcp>=1.6.0
10
+ Requires-Dist: numpy>=1.24.0
11
+ Requires-Dist: pillow>=10.0.0
12
+ Requires-Dist: pydantic>=2.4.2
13
+ Requires-Dist: python-multipart>=0.0.6
14
+ Requires-Dist: uvicorn>=0.24.0
15
+ Provides-Extra: macos
16
+ Requires-Dist: pyobjc-framework-cocoa>=9.2; extra == 'macos'
17
+ Requires-Dist: pyobjc-framework-quartz>=9.2; extra == 'macos'
18
+ Description-Content-Type: text/markdown
19
+
20
+ # macOS Screen View & Control MCP Server
21
+
22
+ A Model Context Protocol server that provides window screenshot capabilities. This server enables LLMs to capture screenshots of specific windows on macOS, either by window title or window ID.
23
+
24
+ ### Available Tools
25
+
26
+ - `capture_window_screenshot` - Captures a screenshot of a specific window by its title or ID
27
+
28
+ - `window_identifier` (string, required): Window title to search for or window ID
29
+ - `search_in_owner` (boolean, optional): Whether to search in window owner names (default: true)
30
+ - `format` (string, optional): Output format (binary or base64) (default: "binary")
31
+
32
+ - `list_windows` - Lists all visible windows
33
+
34
+ - No parameters required
35
+
36
+ - `find_window` - Finds a window by title or owner name
37
+
38
+ - `title` (string, required): Window title or owner name to search for
39
+ - `search_in_owner` (boolean, optional): Whether to search in window owner names (default: true)
40
+
41
+ - `send_key` - Sends a keyboard key press event to the active window
42
+
43
+ - `key` (string, required): The key to press (e.g., 'a', 'return', 'space')
44
+ - `modifiers` (list of strings, optional): List of modifier keys to hold (e.g., ['command', 'shift'])
45
+
46
+ - `type_text` - Types a sequence of text characters
47
+ - `text` (string, required): The text to type
48
+ - `delay` (float, optional): Delay between keystrokes in seconds (default: 0.1)
49
+
50
+ ### Supported Keys
51
+
52
+ The following keys are supported:
53
+
54
+ - Letters: a-z (case-insensitive)
55
+ - Numbers: 0-9
56
+ - Special keys: return, tab, space, delete, escape
57
+ - Arrow keys: up_arrow, down_arrow, left_arrow, right_arrow
58
+ - Modifier keys: command, shift, control, option (also right_shift, right_option, right_control)
59
+
60
+ ### Examples
61
+
62
+ Send a single key press:
63
+
64
+ ```python
65
+ await send_key("return")
66
+ ```
67
+
68
+ Send a key with modifiers:
69
+
70
+ ```python
71
+ await send_key("c", ["command"]) # Command+C (copy)
72
+ ```
73
+
74
+ Type text:
75
+
76
+ ```python
77
+ await type_text("Hello, World!")
78
+ ```
79
+
80
+ ## Installation
81
+
82
+ ### Using pip
83
+
84
+ Install `macos_screen_mcp` via pip:
85
+
86
+ ```bash
87
+ pip install git+ssh://git@github.com/jhead/macos-screen-mcp.git
88
+ ```
89
+
90
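+ After installation, you can run it as a script. It uses the stdio transport by default; pass `--transport sse` to serve over HTTP (port 8000 by default), which the URL-based configuration below requires: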
91
+
92
+ ```bash
93
+ python -m macos_screen_mcp
94
+ ```
95
+
96
+ ## Configuration
97
+
98
+ ### Configure
99
+
100
+ Add to your Claude or Cursor settings:
101
+
102
+ ```json
103
+ "mcpServers": {
104
+ "macos-screen": {
105
+ "name": "macos-screen",
106
+ "url": "http://localhost:8000/sse",
107
+ "description": "MCP server for capturing window screenshots",
108
+ "version": "1.0.0"
109
+ }
110
+ }
111
+ ```
112
+
113
+ ## Debugging
114
+
115
+ You can use the MCP inspector to debug the server:
116
+
117
+ ```bash
118
+ npx @modelcontextprotocol/inspector python -m macos_screen_mcp
119
+ ```
120
+
121
+ ## Contributing
122
+
123
+ We encourage contributions to help expand and improve macos-screen-mcp. Whether you want to add new tools, enhance existing functionality, or improve documentation, your input is valuable.
124
+
125
+ Pull requests are welcome! Feel free to contribute new ideas, bug fixes, or enhancements to make macos-screen-mcp even more powerful and useful.
126
+
127
+ ## License
128
+
129
+ macos-screen-mcp is licensed under the MIT License. This means you are free to use, modify, and distribute the software, subject to the terms and conditions of the MIT License. For more details, please see the LICENSE file in the project repository.
@@ -0,0 +1,110 @@
1
+ # macOS Screen View & Control MCP Server
2
+
3
+ A Model Context Protocol server that provides window screenshot capabilities. This server enables LLMs to capture screenshots of specific windows on macOS, either by window title or window ID.
4
+
5
+ ### Available Tools
6
+
7
+ - `capture_window_screenshot` - Captures a screenshot of a specific window by its title or ID
8
+
9
+ - `window_identifier` (string, required): Window title to search for or window ID
10
+ - `search_in_owner` (boolean, optional): Whether to search in window owner names (default: true)
11
+ - `format` (string, optional): Output format (binary or base64) (default: "binary")
12
+
13
+ - `list_windows` - Lists all visible windows
14
+
15
+ - No parameters required
16
+
17
+ - `find_window` - Finds a window by title or owner name
18
+
19
+ - `title` (string, required): Window title or owner name to search for
20
+ - `search_in_owner` (boolean, optional): Whether to search in window owner names (default: true)
21
+
22
+ - `send_key` - Sends a keyboard key press event to the active window
23
+
24
+ - `key` (string, required): The key to press (e.g., 'a', 'return', 'space')
25
+ - `modifiers` (list of strings, optional): List of modifier keys to hold (e.g., ['command', 'shift'])
26
+
27
+ - `type_text` - Types a sequence of text characters
28
+ - `text` (string, required): The text to type
29
+ - `delay` (float, optional): Delay between keystrokes in seconds (default: 0.1)
30
+
31
+ ### Supported Keys
32
+
33
+ The following keys are supported:
34
+
35
+ - Letters: a-z (case-insensitive)
36
+ - Numbers: 0-9
37
+ - Special keys: return, tab, space, delete, escape
38
+ - Arrow keys: up_arrow, down_arrow, left_arrow, right_arrow
39
+ - Modifier keys: command, shift, control, option (also right_shift, right_option, right_control)
40
+
41
+ ### Examples
42
+
43
+ Send a single key press:
44
+
45
+ ```python
46
+ await send_key("return")
47
+ ```
48
+
49
+ Send a key with modifiers:
50
+
51
+ ```python
52
+ await send_key("c", ["command"]) # Command+C (copy)
53
+ ```
54
+
55
+ Type text:
56
+
57
+ ```python
58
+ await type_text("Hello, World!")
59
+ ```
60
+
61
+ ## Installation
62
+
63
+ ### Using pip
64
+
65
+ Install `macos_screen_mcp` via pip:
66
+
67
+ ```bash
68
+ pip install git+ssh://git@github.com/jhead/macos-screen-mcp.git
69
+ ```
70
+
71
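+ After installation, you can run it as a script. It uses the stdio transport by default; pass `--transport sse` to serve over HTTP (port 8000 by default), which the URL-based configuration below requires: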
72
+
73
+ ```bash
74
+ python -m macos_screen_mcp
75
+ ```
76
+
77
+ ## Configuration
78
+
79
+ ### Configure
80
+
81
+ Add to your Claude or Cursor settings:
82
+
83
+ ```json
84
+ "mcpServers": {
85
+ "macos-screen": {
86
+ "name": "macos-screen",
87
+ "url": "http://localhost:8000/sse",
88
+ "description": "MCP server for capturing window screenshots",
89
+ "version": "1.0.0"
90
+ }
91
+ }
92
+ ```
93
+
94
+ ## Debugging
95
+
96
+ You can use the MCP inspector to debug the server:
97
+
98
+ ```bash
99
+ npx @modelcontextprotocol/inspector python -m macos_screen_mcp
100
+ ```
101
+
102
+ ## Contributing
103
+
104
+ We encourage contributions to help expand and improve macos-screen-mcp. Whether you want to add new tools, enhance existing functionality, or improve documentation, your input is valuable.
105
+
106
+ Pull requests are welcome! Feel free to contribute new ideas, bug fixes, or enhancements to make macos-screen-mcp even more powerful and useful.
107
+
108
+ ## License
109
+
110
+ macos-screen-mcp is licensed under the MIT License. This means you are free to use, modify, and distribute the software, subject to the terms and conditions of the MIT License. For more details, please see the LICENSE file in the project repository.
@@ -0,0 +1 @@
1
+ python
@@ -0,0 +1 @@
1
+ iflow-mcp_jhead_macos-screen-mcp
@@ -0,0 +1,5 @@
1
+ {
2
+ "push_platform": "github",
3
+ "fork_url": "https://github.com/iflow-mcp/jhead-macos-screen-mcp",
4
+ "fork_branch": "iflow"
5
+ }
@@ -0,0 +1,32 @@
1
[project]
name = "iflow-mcp_jhead_macos-screen-mcp"
version = "1.0.0"
description = "MCP server for capturing window screenshots and controlling macOS windows"
readme = "README.md"
# The `mcp` dependency only supports Python 3.10+, so advertising 3.8 would let
# pip pick this package on interpreters where its dependencies cannot install.
requires-python = ">=3.10"
license = "MIT"
dependencies = [
    "fastapi>=0.104.0",
    "uvicorn>=0.24.0",
    "python-multipart>=0.0.6",
    "pillow>=10.0.0",
    "pydantic>=2.4.2",
    "numpy>=1.24.0",
    "mcp>=1.6.0",
]
17
+
18
+ [project.optional-dependencies]
19
+ macos = [
20
+ "pyobjc-framework-Quartz>=9.2",
21
+ "pyobjc-framework-Cocoa>=9.2"
22
+ ]
23
+
24
+ [project.scripts]
25
+ macos-screen-mcp = "macos_screen_mcp.__main__:main"
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["src/macos_screen_mcp"]
29
+
30
+ [build-system]
31
+ requires = ["hatchling"]
32
+ build-backend = "hatchling.build"
@@ -0,0 +1,6 @@
1
+ """MCP Window Screenshot Server
2
+
3
+ A Model Context Protocol server that provides window screenshot capabilities.
4
+ """
5
+
6
+ __version__ = "1.0.0"
@@ -0,0 +1,36 @@
1
+ import argparse
2
+ import logging
3
+ import uvicorn
4
+ from .server import app, mcp
5
+
6
def main(argv=None):
    """Parse command-line options and start the server.

    With ``--transport stdio`` (the default) the MCP server runs over
    stdin/stdout; with ``--transport sse`` the FastAPI app is served by
    uvicorn on ``--host``/``--port``.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="MCP Window Screenshot Server")
    parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
    parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
    parser.add_argument("--log-level", default="info", help="Logging level")
    parser.add_argument(
        "--transport",
        default="stdio",
        choices=["stdio", "sse"],
        help="Transport protocol (stdio or sse)",
    )
    args = parser.parse_args(argv)

    # Reject unknown level names with a usage error instead of an AttributeError.
    level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(level, int):
        parser.error(f"invalid log level: {args.log_level!r}")

    # Configure logging.
    # force=True is required: importing .server pulls in window_manager, which
    # already calls logging.basicConfig() at import time, so without it this
    # call would be a no-op and --log-level / the format would be ignored.
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        force=True,
    )

    logger = logging.getLogger(__name__)

    if args.transport == "stdio":
        logger.info("Starting MCP Window Server with stdio transport...")
        mcp.run(transport="stdio")
    else:
        logger.info("Starting MCP Window Server with SSE transport...")
        uvicorn.run(
            app,
            host=args.host,
            port=args.port,
            log_level=args.log_level.lower(),
            access_log=True,
        )
34
+
35
+ if __name__ == "__main__":
36
+ main()
@@ -0,0 +1,202 @@
1
+ from typing import List, Dict, Optional
2
+ import logging
3
+ import time
4
+
5
+ try:
6
+ from Quartz import (
7
+ CGEventCreateKeyboardEvent,
8
+ CGEventPost,
9
+ kCGHIDEventTap,
10
+ kCGEventKeyDown,
11
+ kCGEventKeyUp,
12
+ CGEventSetFlags,
13
+ kCGEventFlagMaskCommand,
14
+ kCGEventFlagMaskShift,
15
+ kCGEventFlagMaskControl,
16
+ kCGEventFlagMaskAlternate,
17
+ )
18
+ HAS_MACOS_APIS = True
19
+ except ImportError:
20
+ HAS_MACOS_APIS = False
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
class KeyboardManager:
    """Synthesize keyboard input by posting Quartz keyboard events.

    When the Quartz import failed (``HAS_MACOS_APIS`` is False) the class runs
    in mock mode: requests are logged and reported as successful, but no
    events are posted.

    Key codes are ANSI virtual key codes, so character typing assumes a US
    keyboard layout.
    """

    _initialized = False

    # Key code mapping for common keys
    KEY_CODES = {
        'a': 0x00, 'b': 0x0B, 'c': 0x08, 'd': 0x02, 'e': 0x0E,
        'f': 0x03, 'g': 0x05, 'h': 0x04, 'i': 0x22, 'j': 0x26,
        'k': 0x28, 'l': 0x25, 'm': 0x2E, 'n': 0x2D, 'o': 0x1F,
        'p': 0x23, 'q': 0x0C, 'r': 0x0F, 's': 0x01, 't': 0x11,
        'u': 0x20, 'v': 0x09, 'w': 0x0D, 'x': 0x07, 'y': 0x10,
        'z': 0x06, '1': 0x12, '2': 0x13, '3': 0x14, '4': 0x15,
        '5': 0x17, '6': 0x16, '7': 0x1A, '8': 0x1C, '9': 0x19,
        '0': 0x1D, 'return': 0x24, 'tab': 0x30, 'space': 0x31,
        'delete': 0x33, 'escape': 0x35, 'command': 0x37,
        'shift': 0x38, 'capslock': 0x39, 'option': 0x3A,
        'control': 0x3B, 'right_shift': 0x3C, 'right_option': 0x3D,
        'right_control': 0x3E, 'left_arrow': 0x7B, 'right_arrow': 0x7C,
        'down_arrow': 0x7D, 'up_arrow': 0x7E,
        # Punctuation keys (ANSI layout), needed to type ordinary sentences.
        '=': 0x18, '-': 0x1B, ']': 0x1E, '[': 0x21, "'": 0x27,
        ';': 0x29, '\\': 0x2A, ',': 0x2B, '/': 0x2C, '.': 0x2F,
        '`': 0x32,
    }

    # Symbols produced by holding shift, mapped to the unshifted key (US layout).
    _SHIFTED_CHARS = {
        '!': '1', '@': '2', '#': '3', '$': '4', '%': '5', '^': '6',
        '&': '7', '*': '8', '(': '9', ')': '0', '_': '-', '+': '=',
        '{': '[', '}': ']', '|': '\\', ':': ';', '"': "'", '<': ',',
        '>': '.', '?': '/', '~': '`',
    }

    # Whitespace characters mapped to the named keys that produce them.
    _CHAR_ALIASES = {' ': 'space', '\n': 'return', '\t': 'tab'}

    @classmethod
    def initialize(cls) -> bool:
        """Initialize the keyboard manager.

        Returns:
            bool: True if initialization successful, False otherwise
        """
        if cls._initialized:
            return True

        if not HAS_MACOS_APIS:
            logger.warning("macOS APIs not available, using mock mode")
            cls._initialized = True
            return True

        try:
            # Creating (not posting) a throwaway event checks that keyboard
            # events can be created in this process.
            test_event = CGEventCreateKeyboardEvent(None, 0x00, True)
        except Exception:
            logger.exception("Error initializing KeyboardManager")
            return False

        if test_event is None:
            logger.error("Failed to create test keyboard event")
            return False

        logger.info("Successfully initialized KeyboardManager")
        cls._initialized = True
        return True

    @classmethod
    def ensure_initialized(cls) -> bool:
        """Ensure the keyboard manager is initialized.

        Returns:
            bool: True if initialized or initialization successful, False otherwise
        """
        return True if cls._initialized else cls.initialize()

    @classmethod
    def get_modifiers(cls):
        """Get modifier key mapping based on platform availability.

        Returns:
            dict: modifier name -> Quartz event flag mask, or an empty dict
            when the macOS APIs are unavailable.
        """
        if not HAS_MACOS_APIS:
            return {}
        return {
            'command': kCGEventFlagMaskCommand,
            'shift': kCGEventFlagMaskShift,
            'control': kCGEventFlagMaskControl,
            'option': kCGEventFlagMaskAlternate,
        }

    @classmethod
    def _keystroke_for_char(cls, char: str) -> Optional[tuple]:
        """Translate one text character into a ``(key, modifiers)`` pair.

        Args:
            char: A single character of the text being typed.

        Returns:
            A tuple of a ``KEY_CODES`` name and a list of modifier names, or
            None when the character cannot be produced with the known keys.
        """
        alias = cls._CHAR_ALIASES.get(char)
        if alias is not None:
            return alias, []
        base = cls._SHIFTED_CHARS.get(char)
        if base is not None:
            return base, ['shift']
        lowered = char.lower()
        if lowered not in cls.KEY_CODES or len(lowered) != 1:
            # Multi-character names such as 'return' are keys, not characters.
            return None
        return lowered, (['shift'] if char.isupper() else [])

    @classmethod
    def send_key(cls, key: str, modifiers: Optional[List[str]] = None) -> bool:
        """Send a keyboard key press event.

        Args:
            key: The key to press (e.g., 'a', 'return', 'space')
            modifiers: List of modifier keys to hold (e.g., ['command', 'shift'])

        Returns:
            bool: True if successful, False otherwise
        """
        if not cls.ensure_initialized():
            return False

        if not HAS_MACOS_APIS:
            logger.warning(f"Mock mode: send key '{key}' with modifiers {modifiers}")
            return True

        try:
            # Key names are matched case-insensitively.
            key = key.lower()
            key_code = cls.KEY_CODES.get(key)
            if key_code is None:  # 0x00 ('a') is a valid code, so test for None
                logger.error(f"Unknown key: {key}")
                return False

            # Combine the requested modifiers into one flag mask.
            flags = 0
            if modifiers:
                modifier_map = cls.get_modifiers()
                for mod in modifiers:
                    flag = modifier_map.get(mod.lower())
                    if flag is None:
                        logger.warning("Ignoring unknown modifier: %s", mod)
                        continue
                    flags |= flag

            event_down = CGEventCreateKeyboardEvent(None, key_code, True)
            if event_down is None:
                logger.error("Failed to create key down event")
                return False

            event_up = CGEventCreateKeyboardEvent(None, key_code, False)
            if event_up is None:
                logger.error("Failed to create key up event")
                return False

            if flags:
                CGEventSetFlags(event_down, flags)
                CGEventSetFlags(event_up, flags)

            # Post events
            CGEventPost(kCGHIDEventTap, event_down)
            time.sleep(0.01)  # Small delay between down and up events
            CGEventPost(kCGHIDEventTap, event_up)

            logger.info(f"Successfully sent key '{key}' with modifiers {modifiers if modifiers else 'none'}")
            return True

        except Exception:
            logger.exception("Error sending keyboard input")
            return False

    @classmethod
    def type_text(cls, text: str, delay: float = 0.1) -> bool:
        """Type a sequence of text characters.

        Letters, digits, spaces, tabs, newlines and US-layout punctuation are
        supported; typing stops at the first character that cannot be mapped
        to a key.

        Args:
            text: The text to type
            delay: Delay between keystrokes in seconds (default: 0.1)

        Returns:
            bool: True if successful, False otherwise
        """
        if not cls.ensure_initialized():
            return False

        if not HAS_MACOS_APIS:
            logger.warning(f"Mock mode: type text '{text}'")
            return True

        try:
            for char in text:
                keystroke = cls._keystroke_for_char(char)
                if keystroke is None:
                    logger.error("Cannot type unsupported character: %r", char)
                    return False
                key, modifiers = keystroke
                if not cls.send_key(key, modifiers or None):
                    return False
                time.sleep(delay)

            logger.info(f"Successfully typed text: {text}")
            return True

        except Exception:
            logger.exception("Error typing text")
            return False
@@ -0,0 +1,278 @@
1
+ import logging
2
+ import os
3
+ import uuid
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.staticfiles import StaticFiles
9
+ from pydantic import BaseModel
10
+ from typing import List, Dict, Optional
11
+ from .window_manager import WindowManager
12
+ from .keyboard_manager import KeyboardManager
13
+ from mcp.server.fastmcp import FastMCP
14
+
15
+ # Configure logging
16
+ logger = logging.getLogger(__name__)
17
+
18
# MCP server instance; the tool functions below register on it via @mcp.tool()
mcp = FastMCP(name="window-screenshot")

# FastAPI application served when the SSE transport is used
app = FastAPI(
    title="MCP Window Server",
    description="MCP server for window management and screenshot capture",
    version="1.0.0",
)

# CORS: every origin, method and header is allowed
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Screenshot directory (relative to the working directory), created at import time
SCREENSHOTS_DIR = Path("data") / "screenshots"
SCREENSHOTS_DIR.mkdir(parents=True, exist_ok=True)

# Serve saved screenshots as static files under /screenshots
app.mount(
    "/screenshots",
    StaticFiles(directory=str(SCREENSHOTS_DIR)),
    name="screenshots",
)
45
+
46
# Pydantic model with the same four fields as the window dictionaries
# produced by WindowManager.get_window_list().
class WindowInfo(BaseModel):
    id: int       # window number
    name: str     # window title
    owner: str    # name of the owning application
    bounds: Dict  # window bounds mapping
51
+
52
@mcp.tool()
async def capture_window_screenshot(
    window_identifier: str,
    format: str = "binary",
    search_in_owner: bool = True,
) -> Dict:
    """Capture a screenshot of a specific window by its title or ID.

    The PNG is always saved under the screenshots directory and its URL is
    returned. With format "base64" the response also carries the image data
    as a base64 string under "screenshot_base64".

    Args:
        window_identifier: Window title to search for or window ID
        format: Output format (binary or base64) (default: "binary")
        search_in_owner: Whether to search in window owner names (default: true)
    """
    import base64  # local import: only needed for the base64 output format

    try:
        logger.info(f"Attempting to capture screenshot for window identifier: {window_identifier}")

        # A numeric identifier is a window ID; anything else is searched by title.
        try:
            window_id = int(window_identifier)
        except ValueError:
            window_id = WindowManager.find_window_by_title(
                window_identifier, search_in_owner=search_in_owner
            )
            if window_id is None:
                search_type = "title or owner" if search_in_owner else "title"
                raise HTTPException(
                    status_code=404,
                    detail=f"No window found with {search_type} containing '{window_identifier}'"
                )

        # Capture the screenshot
        screenshot = WindowManager.capture_window_screenshot(window_id)
        if screenshot is None:
            raise HTTPException(
                status_code=404,
                detail=f"Failed to capture screenshot for window {window_id}"
            )

        # Look up the window name for the response
        windows = WindowManager.get_window_list()
        window_info = next((w for w in windows if w['id'] == window_id), None)
        window_name = window_info['name'] if window_info else "Unknown Window"

        # Timestamp plus a short random suffix keeps filenames unique
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        unique_id = str(uuid.uuid4())[:8]
        filename = f"{timestamp}_{unique_id}.png"
        filepath = SCREENSHOTS_DIR / filename

        with open(filepath, "wb") as f:
            f.write(screenshot)

        logger.info(f"Successfully captured screenshot for window {window_id} ({window_name}) at {filepath}")

        result = {
            "window_id": window_id,
            "window_name": window_name,
            "screenshot_url": f"/screenshots/{filename}"
        }
        # Honour the documented base64 option; any other value keeps the
        # URL-only response that existing callers rely on.
        if format.lower() == "base64":
            result["screenshot_base64"] = base64.b64encode(screenshot).decode("ascii")
        return result

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error capturing window screenshot: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=str(e))
115
+
116
@mcp.tool()
async def list_windows() -> List[Dict]:
    """List all visible windows."""
    try:
        return WindowManager.get_window_list()
    except Exception as exc:
        # Report any window-enumeration failure as an HTTP 500
        logger.error(f"Error listing windows: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
125
+
126
@mcp.tool()
async def find_window(
    title: str,
    search_in_owner: bool = True
) -> Dict:
    """Find a window by title or owner name.

    Args:
        title: Window title or owner name to search for
        search_in_owner: Whether to search in window owner names (default: true)
    """
    try:
        found_id = WindowManager.find_window_by_title(title, search_in_owner)
        if found_id is not None:
            return {"window_id": found_id}

        # No match: describe which fields were searched in the error detail
        searched = "title" if not search_in_owner else "title or owner"
        raise HTTPException(
            status_code=404,
            detail=f"No window found with {searched} containing '{title}'",
        )
    except HTTPException:
        # The 404 above passes through unchanged
        raise
    except Exception as exc:
        logger.error(f"Error searching for window: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
151
+
152
@mcp.tool()
async def send_key(
    key: str,
    modifiers: Optional[List[str]] = None
) -> Dict:
    """Send a keyboard key press event to the active window.

    Args:
        key: The key to press (e.g., 'a', 'return', 'space')
        modifiers: List of modifier keys to hold (e.g., ['command', 'shift'])
    """
    try:
        success = KeyboardManager.send_key(key, modifiers)
        if not success:
            raise HTTPException(
                status_code=500,
                detail=f"Failed to send key '{key}' with modifiers {modifiers if modifiers else 'none'}"
            )
        return {
            "status": "success",
            "key": key,
            "modifiers": modifiers if modifiers else []
        }
    except HTTPException:
        # Pass the failure above through unchanged, as the other tools do,
        # instead of logging it again and re-wrapping it in a second exception.
        raise
    except Exception as e:
        logger.error(f"Error sending key: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=str(e))
179
+
180
@mcp.tool()
async def type_text(
    text: str,
    delay: float = 0.1
) -> Dict:
    """Type a sequence of text characters.

    Args:
        text: The text to type
        delay: Delay between keystrokes in seconds (default: 0.1)
    """
    try:
        success = KeyboardManager.type_text(text, delay)
        if not success:
            raise HTTPException(
                status_code=500,
                detail=f"Failed to type text: {text}"
            )
        return {
            "status": "success",
            "text": text,
            "delay": delay
        }
    except HTTPException:
        # Pass the failure above through unchanged, as the other tools do,
        # instead of logging it again and re-wrapping it in a second exception.
        raise
    except Exception as e:
        logger.error(f"Error typing text: {e}")
        logger.exception("Full traceback:")
        raise HTTPException(status_code=500, detail=str(e))
207
+
208
# Manager initialization, awaited from the startup hook
async def initialize_managers():
    """Initialize the window and keyboard managers, re-raising on any failure."""
    logger.info("Initializing managers...")

    # Window manager: listing the windows once serves as the check
    try:
        window_count = len(WindowManager.get_window_list())
        logger.info(f"Window manager initialized, found {window_count} windows")
    except Exception as exc:
        logger.error(f"Failed to initialize window manager: {exc}")
        raise

    # Keyboard manager: a False result from initialize() is turned into an error
    try:
        keyboard_ready = KeyboardManager.initialize()
        if not keyboard_ready:
            raise RuntimeError("Failed to initialize keyboard manager")
        logger.info("Keyboard manager initialized successfully")
    except Exception as exc:
        logger.error(f"Failed to initialize keyboard manager: {exc}")
        raise

    logger.info("All managers initialized successfully")
231
+
232
# Startup hook registered on the FastAPI app
@app.on_event("startup")
async def startup_event():
    """Run manager initialization on startup; log and re-raise any failure."""
    try:
        await initialize_managers()
    except Exception as exc:
        logger.error(f"Failed to initialize server: {exc}")
        logger.exception("Full traceback:")
        raise
    else:
        logger.info("Server initialization complete")
244
+
245
# The health route must be registered BEFORE the catch-all mount below:
# routes are matched in registration order and a mount at "/" matches every
# path, so a route added after it would never be reached.
@app.get("/health")
async def health_check():
    """Health check endpoint.

    Returns:
        dict: status, number of windows found and the server version.

    Raises:
        HTTPException: 503 when the window list cannot be obtained.
    """
    try:
        windows = WindowManager.get_window_list()
        return {
            "status": "healthy",
            "windows_found": len(windows),
            "version": "1.0.0"
        }
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(
            status_code=503,
            detail=f"Service unhealthy: {str(e)}"
        )

# Create MCP app instance
mcp_app = mcp.sse_app()

# Mount MCP server at /
# IT MUST BE MOUNTED AT / or else it will not work
# Keep this as the last route registration (see the note above health_check).
app.mount("/", mcp_app)
268
+
269
# Running this module directly serves the FastAPI app with fixed settings
if __name__ == "__main__":
    import uvicorn

    logger.info("Starting MCP Window Server...")
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info", access_log=True)
@@ -0,0 +1,204 @@
1
+ import logging
2
+ from typing import List, Dict, Optional, Tuple
3
+
4
+ try:
5
+ from Quartz import (
6
+ CGWindowListCopyWindowInfo,
7
+ kCGWindowListOptionOnScreenOnly,
8
+ kCGNullWindowID,
9
+ CGWindowListCreateImage,
10
+ CGRectNull,
11
+ kCGWindowImageDefault,
12
+ CGWindowListCreateDescriptionFromArray,
13
+ CGImageGetWidth,
14
+ CGImageGetHeight,
15
+ CGImageGetDataProvider,
16
+ CGDataProviderCopyData,
17
+ CGRectMake,
18
+ kCGWindowImageBoundsIgnoreFraming,
19
+ kCGWindowListOptionIncludingWindow,
20
+ CGImageGetBitsPerComponent,
21
+ CGImageGetBytesPerRow,
22
+ CGImageGetBitsPerPixel,
23
+ )
24
+ from Foundation import NSArray, NSDictionary
25
+ HAS_MACOS_APIS = True
26
+ except ImportError:
27
+ HAS_MACOS_APIS = False
28
+
29
+ from PIL import Image
30
+ import io
31
+ import numpy as np
32
+
33
+ logging.basicConfig(level=logging.INFO)
34
+ logger = logging.getLogger(__name__)
35
+
36
class WindowManager:
    """Static helpers for listing, finding and capturing on-screen windows.

    When the Quartz bindings could not be imported (``HAS_MACOS_APIS`` is
    false) the public methods return mock data instead of querying the
    window server.
    """

    @staticmethod
    def get_window_list() -> List[Dict]:
        """Get a list of all visible windows that have a non-empty name.

        Returns:
            One dict per window with the keys ``id``, ``name``, ``owner`` and
            ``bounds``. An empty list is returned if the query raises.
        """
        if not HAS_MACOS_APIS:
            logger.warning("macOS APIs not available, returning mock window list")
            return [
                {'id': 1, 'name': 'Mock Window 1', 'owner': 'Test App', 'bounds': {'X': 0, 'Y': 0, 'Width': 1920, 'Height': 1080}},
                {'id': 2, 'name': 'Mock Window 2', 'owner': 'Another App', 'bounds': {'X': 100, 'Y': 100, 'Width': 800, 'Height': 600}},
            ]

        try:
            window_list = CGWindowListCopyWindowInfo(
                kCGWindowListOptionOnScreenOnly, kCGNullWindowID
            )
            windows = []

            for window in window_list:
                window_dict = dict(window)
                if window_dict.get('kCGWindowName'):  # Only include windows with names
                    windows.append({
                        'id': window_dict.get('kCGWindowNumber'),
                        'name': window_dict.get('kCGWindowName'),
                        # May be None: the owner name is read with .get()
                        'owner': window_dict.get('kCGWindowOwnerName'),
                        'bounds': window_dict.get('kCGWindowBounds'),
                    })

            logger.info(f"Found {len(windows)} visible windows")
            return windows
        except Exception as e:
            # Best effort: callers get an empty list rather than an exception
            logger.error(f"Error getting window list: {e}")
            return []

    @staticmethod
    def _bgra_buffer_to_rgba(image_data, width: int, height: int, bytes_per_row: int):
        """Convert a raw 4-bytes-per-pixel buffer into an RGBA pixel array.

        Args:
            image_data: Buffer-protocol object holding the pixel rows.
            width: Number of real pixels per row.
            height: Number of rows.
            bytes_per_row: Row stride in bytes; may exceed ``width * 4``.

        Returns:
            A contiguous ``(height, width, 4)`` uint8 array with the first and
            third channel swapped (the source is assumed to be BGRA).

        Raises:
            ValueError: If the buffer is too small for ``height`` rows or the
                stride is not a multiple of four bytes.
        """
        buffer = np.frombuffer(image_data, dtype=np.uint8)
        # Ignore any bytes after the last row, then view as rows of 4-byte pixels
        rows = buffer[:height * bytes_per_row].reshape(height, bytes_per_row // 4, 4)
        # Drop the stride padding at the end of each row and swap B <-> R
        return np.ascontiguousarray(rows[:, :width, [2, 1, 0, 3]])

    @staticmethod
    def capture_window_screenshot(window_id: int) -> Optional[bytes]:
        """Capture a screenshot of a specific window by its ID.

        Args:
            window_id: The window number as reported by ``get_window_list``.

        Returns:
            PNG-encoded image bytes, or None if the window is not found, has
            no bounds, uses an unsupported pixel layout, or the capture fails.
        """
        if not HAS_MACOS_APIS:
            logger.warning("macOS APIs not available, returning mock screenshot")
            # Create a simple mock screenshot
            img = Image.new('RGB', (800, 600), color='lightgray')
            img_byte_arr = io.BytesIO()
            img.save(img_byte_arr, format='PNG')
            return img_byte_arr.getvalue()

        try:
            logger.info(f"Attempting to capture screenshot for window {window_id}")

            # Get window info to get bounds
            window_list = CGWindowListCopyWindowInfo(
                kCGWindowListOptionOnScreenOnly, kCGNullWindowID
            )
            target_window = None
            for window in window_list:
                window_dict = dict(window)
                if window_dict.get('kCGWindowNumber') == window_id:
                    target_window = window_dict
                    break

            if not target_window:
                logger.error(f"Window {window_id} not found")
                return None

            # Get window bounds
            bounds = target_window.get('kCGWindowBounds')
            if not bounds:
                logger.error(f"No bounds found for window {window_id}")
                return None

            # Create CGRect from bounds using original dimensions
            window_bounds = CGRectMake(
                bounds['X'],
                bounds['Y'],
                bounds['Width'],
                bounds['Height']
            )
            logger.info(f"Window bounds: X={bounds['X']}, Y={bounds['Y']}, Width={bounds['Width']}, Height={bounds['Height']}")

            # Get the window image using only the target window
            logger.info("Creating window image...")
            image = CGWindowListCreateImage(
                window_bounds,
                kCGWindowListOptionIncludingWindow,  # Only include the target window
                window_id,
                kCGWindowImageDefault
            )
            logger.info(f"Window image created: {image is not None}")

            if image is None:
                logger.error(f"Failed to capture screenshot for window {window_id}")
                return None

            # Get image properties
            width = int(CGImageGetWidth(image))
            height = int(CGImageGetHeight(image))
            bits_per_component = CGImageGetBitsPerComponent(image)
            bytes_per_row = CGImageGetBytesPerRow(image)
            bits_per_pixel = CGImageGetBitsPerPixel(image)

            logger.info(f"Image properties: {width}x{height}, {bits_per_component} bits/component, {bits_per_pixel} bits/pixel, {bytes_per_row} bytes/row")

            # The conversion below only understands 4 x 8-bit channels per
            # pixel; anything else would be decoded into a garbage image.
            if bits_per_component != 8 or bits_per_pixel != 32:
                logger.error(f"Unsupported pixel format for window {window_id}: {bits_per_component} bits/component, {bits_per_pixel} bits/pixel")
                return None

            # Create a new PIL Image from the CGImage
            logger.info("Converting to PIL Image...")
            data_provider = CGImageGetDataProvider(image)
            if data_provider is None:
                logger.error("Failed to get data provider from image")
                return None

            image_data = CGDataProviderCopyData(data_provider)
            if image_data is None:
                logger.error("Failed to copy image data from provider")
                return None

            # Convert BGRA rows to RGBA, cropping the per-row stride padding so
            # the output is exactly `width` pixels wide
            array = WindowManager._bgra_buffer_to_rgba(
                image_data, width, height, bytes_per_row
            )

            pil_image = Image.fromarray(array, mode='RGBA')
            logger.info("Successfully converted to PIL Image")

            # Convert to bytes
            img_byte_arr = io.BytesIO()
            pil_image.save(img_byte_arr, format='PNG')
            png_bytes = img_byte_arr.getvalue()

            logger.info(f"Successfully captured screenshot for window {window_id}")
            return png_bytes

        except Exception as e:
            logger.error(f"Error capturing window screenshot: {e}")
            logger.exception("Full traceback:")
            return None

    @staticmethod
    def find_window_by_title(title: str, search_in_owner: bool = True) -> Optional[int]:
        """
        Find a window ID by its title or owner name (partial match).

        An exact, case-insensitive owner match wins over any partial match.
        Windows with a missing name or owner are treated as having an empty
        string for that field.

        Args:
            title: The search term to look for
            search_in_owner: Whether to also search in the owner field (default: True)

        Returns:
            The window ID if found, None otherwise
        """
        windows = WindowManager.get_window_list()
        search_term = title.lower()

        def lowered(window: Dict, key: str) -> str:
            # The field can be None (e.g. no owner name reported); never call
            # .lower() on None.
            return (window.get(key) or '').lower()

        # First try exact match in owner field if search_in_owner is True
        if search_in_owner:
            for window in windows:
                if lowered(window, 'owner') == search_term:
                    logger.info(f"Found exact match in owner: '{window['owner']}'")
                    return window['id']

        # Then try partial match in either field
        for window in windows:
            name_matches = search_term in lowered(window, 'name')
            owner_matches = search_in_owner and search_term in lowered(window, 'owner')

            if name_matches or owner_matches:
                logger.info(f"Found window with title '{title}' in name '{window['name']}' or owner '{window['owner']}'")
                return window['id']

        logger.warning(f"No window found with title or owner containing '{title}'")
        return None