PyPI - iflow-mcp_jhead_macos-screen-mcp - Versions diffs - 1.0.0__py3-none-any.whl - Mend

iflow-mcp_jhead_macos-screen-mcp 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/METADATA +129 -0
iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/RECORD +10 -0
iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/WHEEL +4 -0
iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/entry_points.txt +2 -0
iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
macos_screen_mcp/__init__.py +6 -0
macos_screen_mcp/__main__.py +36 -0
macos_screen_mcp/keyboard_manager.py +202 -0
macos_screen_mcp/server.py +278 -0
macos_screen_mcp/window_manager.py +204 -0

iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,129 @@
+Metadata-Version: 2.4
+Name: iflow-mcp_jhead_macos-screen-mcp
+Version: 1.0.0
+Summary: MCP server for capturing window screenshots and controlling macOS windows
+License-Expression: MIT
+License-File: LICENSE
+Requires-Python: >=3.8
+Requires-Dist: fastapi>=0.104.0
+Requires-Dist: mcp>=1.6.0
+Requires-Dist: numpy>=1.24.0
+Requires-Dist: pillow>=10.0.0
+Requires-Dist: pydantic>=2.4.2
+Requires-Dist: python-multipart>=0.0.6
+Requires-Dist: uvicorn>=0.24.0
+Provides-Extra: macos
+Requires-Dist: pyobjc-framework-cocoa>=9.2; extra == 'macos'
+Requires-Dist: pyobjc-framework-quartz>=9.2; extra == 'macos'
+Description-Content-Type: text/markdown
+# macOS Screen View & Control MCP Server
+A Model Context Protocol server that provides window screenshot and keyboard input capabilities. This server enables LLMs to capture screenshots of specific windows on macOS, either by window title or window ID, and to send key presses or typed text to the active window.
+### Available Tools
+- `capture_window_screenshot` - Captures a screenshot of a specific window by its title or ID
+  - `window_identifier` (string, required): Window title to search for or window ID
+  - Window owner names are always searched as well when `window_identifier` is a title (this tool does not take a `search_in_owner` parameter)
+  - `format` (string, optional): Output format (binary or base64) (default: "binary")
+- `list_windows` - Lists all visible windows
+  - No parameters required
+- `find_window` - Finds a window by title or owner name
+  - `title` (string, required): Window title or owner name to search for
+  - `search_in_owner` (boolean, optional): Whether to search in window owner names (default: true)
+- `send_key` - Sends a keyboard key press event to the active window
+  - `key` (string, required): The key to press (e.g., 'a', 'return', 'space')
+  - `modifiers` (list of strings, optional): List of modifier keys to hold (e.g., ['command', 'shift'])
+- `type_text` - Types a sequence of text characters
+  - `text` (string, required): The text to type
+  - `delay` (float, optional): Delay between keystrokes in seconds (default: 0.1)
+### Supported Keys
+The following keys are supported:
+- Letters: a-z (case-insensitive)
+- Numbers: 0-9
+- Special keys: return, tab, space, delete, escape
+- Arrow keys: up_arrow, down_arrow, left_arrow, right_arrow
+- Modifier keys: command, shift, control, option (also right_shift, right_option, right_control)
+### Examples
+Send a single key press:
+```python
+await send_key("return")
+```
+Send a key with modifiers:
+```python
+await send_key("c", ["command"])  # Command+C (copy)
+```
+Type text:
+```python
+await type_text("Hello, World!")
+```
+## Installation
+### Using pip
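+Install `macos_screen_mcp` via pip. The macOS bindings (`pyobjc-framework-quartz`, `pyobjc-framework-cocoa`) are declared under the optional `macos` extra; without them the server runs in mock mode and returns placeholder windows and screenshots: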
+```bash
+pip install git+ssh://git@github.com/jhead/macos-screen-mcp.git
+```
+After installation, you can run it as a script using:
+```bash
+python -m macos_screen_mcp
+```
+## Configuration
+### Configure
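+Start the server with `python -m macos_screen_mcp --transport sse` (the default transport is stdio, which does not listen on a port), then add to your Claude or Cursor settings: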
+```json
+"mcpServers": {
+  "macos-screen": {
+    "name": "macos-screen",
+    "url": "http://localhost:8000/sse",
+    "description": "MCP server for capturing window screenshots",
+    "version": "1.0.0"
+  }
+}
+```
+## Debugging
+You can use the MCP inspector to debug the server:
+```bash
+npx @modelcontextprotocol/inspector python -m macos_screen_mcp
+```
+## Contributing
+We encourage contributions to help expand and improve macos-screen-mcp. Whether you want to add new tools, enhance existing functionality, or improve documentation, your input is valuable.
+Pull requests are welcome! Feel free to contribute new ideas, bug fixes, or enhancements to make macos-screen-mcp even more powerful and useful.
+## License
+macos-screen-mcp is licensed under the MIT License. This means you are free to use, modify, and distribute the software, subject to the terms and conditions of the MIT License. For more details, please see the LICENSE file in the project repository.

iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+macos_screen_mcp/__init__.py,sha256=Fjgt3pNlsKKVBoUOvdVwTTuTAp-yupvifrG9gwVHgw0,138
+macos_screen_mcp/__main__.py,sha256=eKgeScehjaT3e0L6scX8_Owak1cF-Umx-CVsniKvHHg,1230
+macos_screen_mcp/keyboard_manager.py,sha256=tZZWR7ztc0q0M_WJZljOB-ZeEkA3HNl5axsWMLJRatM,6874
+macos_screen_mcp/server.py,sha256=BgK8Oqx-bum2HdJcVKEC7jF-FKjhePlhVJu0J7tnxmg,8675
+macos_screen_mcp/window_manager.py,sha256=bLtX3CfQmoC_XtnAevWUvLgLXf6rrjl5YMUH2WRWNyY,8322
+iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/METADATA,sha256=4on6yS4QdoLh_YY0mQaNr-R_TLSSafC-DA9Ovkz2U0Y,3825
+iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/entry_points.txt,sha256=OCoLGcqGwVL633hl4gj_SJcuwPWWTfvJXEfnlsNZ-Xc,68
+iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/licenses/LICENSE,sha256=YGSwFVDc-2Lwm4uQmbTSrA7vISOMhSnRES4uzzW9NOA,1068
+iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/RECORD,,

iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.28.0
+Root-Is-Purelib: true
+Tag: py3-none-any

iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ macos-screen-mcp = macos_screen_mcp.__main__:main

iflow_mcp_jhead_macos_screen_mcp-1.0.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Justin Head
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

macos_screen_mcp/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""MCP Window Screenshot Server
+A Model Context Protocol server that provides window screenshot capabilities.
+"""
+# NOTE(review): the same version string is hard-coded in server.py (FastAPI app
+# metadata and the /health payload) — keep them in sync when bumping.
+__version__ = "1.0.0"

macos_screen_mcp/__main__.py ADDED Viewed

@@ -0,0 +1,36 @@
+import argparse
+import logging
+import uvicorn
+from .server import app, mcp
+def main():
+    parser = argparse.ArgumentParser(description="MCP Window Screenshot Server")
+    parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
+    parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
+    parser.add_argument("--log-level", default="info", help="Logging level")
+    parser.add_argument("--transport", default="stdio", choices=["stdio", "sse"], help="Transport protocol (stdio or sse)")
+    args = parser.parse_args()
+    # Configure logging
+    logging.basicConfig(
+        level=getattr(logging, args.log_level.upper()),
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+    logger = logging.getLogger(__name__)
+    if args.transport == "stdio":
+        logger.info("Starting MCP Window Server with stdio transport...")
+        mcp.run(transport="stdio")
+    else:
+        logger.info("Starting MCP Window Server with SSE transport...")
+        uvicorn.run(
+            app,
+            host=args.host,
+            port=args.port,
+            log_level=args.log_level.lower(),
+            access_log=True
+        )
+if __name__ == "__main__":
+    main()

macos_screen_mcp/keyboard_manager.py ADDED Viewed

@@ -0,0 +1,202 @@
+from typing import List, Dict, Optional
+import logging
+import time
+try:
+    from Quartz import (
+        CGEventCreateKeyboardEvent,
+        CGEventPost,
+        kCGHIDEventTap,
+        kCGEventKeyDown,
+        kCGEventKeyUp,
+        CGEventSetFlags,
+        kCGEventFlagMaskCommand,
+        kCGEventFlagMaskShift,
+        kCGEventFlagMaskControl,
+        kCGEventFlagMaskAlternate,
+    )
+    HAS_MACOS_APIS = True
+except ImportError:
+    HAS_MACOS_APIS = False
+logger = logging.getLogger(__name__)
+class KeyboardManager:
+    _initialized = False
+    @classmethod
+    def initialize(cls) -> bool:
+        """Initialize the keyboard manager.
+        Returns:
+            bool: True if initialization successful, False otherwise
+        """
+        if cls._initialized:
+            return True
+        if not HAS_MACOS_APIS:
+            logger.warning("macOS APIs not available, using mock mode")
+            cls._initialized = True
+            return True
+        try:
+            # Test keyboard event creation
+            test_event = CGEventCreateKeyboardEvent(None, 0x00, True)
+            if test_event is None:
+                logger.error("Failed to create test keyboard event")
+                return False
+            logger.info("Successfully initialized KeyboardManager")
+            cls._initialized = True
+            return True
+        except Exception as e:
+            logger.error(f"Error initializing KeyboardManager: {e}")
+            logger.exception("Full traceback:")
+            return False
+    @classmethod
+    def ensure_initialized(cls) -> bool:
+        """Ensure the keyboard manager is initialized.
+        Returns:
+            bool: True if initialized or initialization successful, False otherwise
+        """
+        if not cls._initialized:
+            return cls.initialize()
+        return True
+    # Key code mapping for common keys
+    KEY_CODES = {
+        'a': 0x00, 'b': 0x0B, 'c': 0x08, 'd': 0x02, 'e': 0x0E,
+        'f': 0x03, 'g': 0x05, 'h': 0x04, 'i': 0x22, 'j': 0x26,
+        'k': 0x28, 'l': 0x25, 'm': 0x2E, 'n': 0x2D, 'o': 0x1F,
+        'p': 0x23, 'q': 0x0C, 'r': 0x0F, 's': 0x01, 't': 0x11,
+        'u': 0x20, 'v': 0x09, 'w': 0x0D, 'x': 0x07, 'y': 0x10,
+        'z': 0x06, '1': 0x12, '2': 0x13, '3': 0x14, '4': 0x15,
+        '5': 0x17, '6': 0x16, '7': 0x1A, '8': 0x1C, '9': 0x19,
+        '0': 0x1D, 'return': 0x24, 'tab': 0x30, 'space': 0x31,
+        'delete': 0x33, 'escape': 0x35, 'command': 0x37,
+        'shift': 0x38, 'capslock': 0x39, 'option': 0x3A,
+        'control': 0x3B, 'right_shift': 0x3C, 'right_option': 0x3D,
+        'right_control': 0x3E, 'left_arrow': 0x7B, 'right_arrow': 0x7C,
+        'down_arrow': 0x7D, 'up_arrow': 0x7E,
+    }
+    @classmethod
+    def get_modifiers(cls):
+        """Get modifier key mapping based on platform availability."""
+        if HAS_MACOS_APIS:
+            return {
+                'command': kCGEventFlagMaskCommand,
+                'shift': kCGEventFlagMaskShift,
+                'control': kCGEventFlagMaskControl,
+                'option': kCGEventFlagMaskAlternate,
+            }
+        return {}
+    @classmethod
+    def send_key(cls, key: str, modifiers: Optional[List[str]] = None) -> bool:
+        """Send a keyboard key press event.
+        Args:
+            key: The key to press (e.g., 'a', 'return', 'space')
+            modifiers: List of modifier keys to hold (e.g., ['command', 'shift'])
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        if not cls.ensure_initialized():
+            return False
+        if not HAS_MACOS_APIS:
+            logger.warning(f"Mock mode: send key '{key}' with modifiers {modifiers}")
+            return True
+        try:
+            # Convert key to lowercase for consistency
+            key = key.lower()
+            # Get key code
+            if key not in cls.KEY_CODES:
+                logger.error(f"Unknown key: {key}")
+                return False
+            key_code = cls.KEY_CODES[key]
+            # Calculate modifier flags
+            flags = 0
+            if modifiers:
+                modifier_map = cls.get_modifiers()
+                for mod in modifiers:
+                    mod = mod.lower()
+                    if mod in modifier_map:
+                        flags |= modifier_map[mod]
+            # Create key down event
+            event_down = CGEventCreateKeyboardEvent(None, key_code, True)
+            if event_down is None:
+                logger.error("Failed to create key down event")
+                return False
+            if flags:
+                CGEventSetFlags(event_down, flags)
+            # Create key up event
+            event_up = CGEventCreateKeyboardEvent(None, key_code, False)
+            if event_up is None:
+                logger.error("Failed to create key up event")
+                return False
+            if flags:
+                CGEventSetFlags(event_up, flags)
+            # Post events
+            CGEventPost(kCGHIDEventTap, event_down)
+            time.sleep(0.01)  # Small delay between down and up events
+            CGEventPost(kCGHIDEventTap, event_up)
+            logger.info(f"Successfully sent key '{key}' with modifiers {modifiers if modifiers else 'none'}")
+            return True
+        except Exception as e:
+            logger.error(f"Error sending keyboard input: {e}")
+            logger.exception("Full traceback:")
+            return False
+    @classmethod
+    def type_text(cls, text: str, delay: float = 0.1) -> bool:
+        """Type a sequence of text characters.
+        Args:
+            text: The text to type
+            delay: Delay between keystrokes in seconds (default: 0.1)
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        if not cls.ensure_initialized():
+            return False
+        if not HAS_MACOS_APIS:
+            logger.warning(f"Mock mode: type text '{text}'")
+            return True
+        try:
+            for char in text:
+                # Handle uppercase letters
+                if char.isupper():
+                    if not cls.send_key(char.lower(), ['shift']):
+                        return False
+                else:
+                    if not cls.send_key(char.lower()):
+                        return False
+                time.sleep(delay)
+            logger.info(f"Successfully typed text: {text}")
+            return True
+        except Exception as e:
+            logger.error(f"Error typing text: {e}")
+            logger.exception("Full traceback:")
+            return False

macos_screen_mcp/server.py ADDED Viewed

@@ -0,0 +1,278 @@
+import logging
+import os
+import uuid
+from datetime import datetime
+from pathlib import Path
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
+from typing import List, Dict, Optional
+from .window_manager import WindowManager
+from .keyboard_manager import KeyboardManager
+from mcp.server.fastmcp import FastMCP
+# Configure logging
+logger = logging.getLogger(__name__)
+# Initialize MCP Server
+# FastMCP instance on which the @mcp.tool() functions below are registered.
+mcp = FastMCP(
+    name="window-screenshot"
+)
+# Create FastAPI app
+# ASGI app that the static screenshot mount, the MCP SSE app and /health are
+# attached to further down in this module.
+app = FastAPI(
+    title="MCP Window Server",
+    description="MCP server for window management and screenshot capture",
+    version="1.0.0"
+)
+# Add CORS middleware
+# NOTE(review): every origin, method and header is allowed, with credentials
+# enabled. This server can capture the screen and inject keystrokes, so
+# confirm that this openness is intended.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
+# Configure paths
+# Relative path: resolved against the process working directory, and the
+# directory is created as a side effect of importing this module.
+SCREENSHOTS_DIR = Path("data/screenshots")
+SCREENSHOTS_DIR.mkdir(parents=True, exist_ok=True)
+# Mount static files handler for screenshots
+# Files written by capture_window_screenshot are served under
+# /screenshots/<filename>.
+app.mount("/screenshots", StaticFiles(directory=str(SCREENSHOTS_DIR)), name="screenshots")
+class WindowInfo(BaseModel):
+    """Schema for one window entry.
+    The fields mirror the keys of the dicts built by
+    WindowManager.get_window_list() (id, name, owner, bounds).
+    NOTE(review): not referenced by any tool in the visible source — confirm
+    whether it is still needed.
+    """
+    id: int
+    name: str
+    owner: str
+    bounds: Dict
+@mcp.tool()
+async def capture_window_screenshot(
+    window_identifier: str,
+    format: str = "binary"
+) -> Dict:
+    """Capture a screenshot of a specific window by its title or ID.
+    Args:
+        window_identifier: Window title to search for or window ID
+        format: Output format (binary or base64) (default: "binary")
+    """
+    try:
+        logger.info(f"Attempting to capture screenshot for window identifier: {window_identifier}")
+        # Try to parse as window ID first
+        try:
+            window_id = int(window_identifier)
+        except ValueError:
+            # If not a number, search by title
+            window_id = WindowManager.find_window_by_title(window_identifier, search_in_owner=True)
+            if window_id is None:
+                raise HTTPException(
+                    status_code=404,
+                    detail=f"No window found with title or owner containing '{window_identifier}'"
+                )
+        # Capture the screenshot
+        screenshot = WindowManager.capture_window_screenshot(window_id)
+        if screenshot is None:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Failed to capture screenshot for window {window_id}"
+            )
+        # Get window info for the response
+        windows = WindowManager.get_window_list()
+        window_info = next((w for w in windows if w['id'] == window_id), None)
+        window_name = window_info['name'] if window_info else "Unknown Window"
+        # Generate unique filename and save screenshot
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        unique_id = str(uuid.uuid4())[:8]
+        filename = f"{timestamp}_{unique_id}.png"
+        filepath = SCREENSHOTS_DIR / filename
+        with open(filepath, "wb") as f:
+            f.write(screenshot)
+        logger.info(f"Successfully captured screenshot for window {window_id} ({window_name}) at {filepath}")
+        # Return URL to the saved screenshot
+        return {
+            "window_id": window_id,
+            "window_name": window_name,
+            "screenshot_url": f"/screenshots/{filename}"
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error capturing window screenshot: {e}")
+        logger.exception("Full traceback:")
+        raise HTTPException(status_code=500, detail=str(e))
+@mcp.tool()
+async def list_windows() -> List[Dict]:
+    """List all visible windows."""
+    try:
+        windows = WindowManager.get_window_list()
+        return windows
+    except Exception as e:
+        logger.error(f"Error listing windows: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@mcp.tool()
+async def find_window(
+    title: str,
+    search_in_owner: bool = True
+) -> Dict:
+    """Find a window by title or owner name.
+    Args:
+        title: Window title or owner name to search for
+        search_in_owner: Whether to search in window owner names (default: true)
+    """
+    try:
+        window_id = WindowManager.find_window_by_title(title, search_in_owner)
+        if window_id is None:
+            search_type = "title or owner" if search_in_owner else "title"
+            raise HTTPException(
+                status_code=404,
+                detail=f"No window found with {search_type} containing '{title}'"
+            )
+        return {"window_id": window_id}
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error searching for window: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@mcp.tool()
+async def send_key(
+    key: str,
+    modifiers: Optional[List[str]] = None
+) -> Dict:
+    """Send a keyboard key press event to the active window.
+    Args:
+        key: The key to press (e.g., 'a', 'return', 'space')
+        modifiers: List of modifier keys to hold (e.g., ['command', 'shift'])
+    """
+    try:
+        success = KeyboardManager.send_key(key, modifiers)
+        if not success:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Failed to send key '{key}' with modifiers {modifiers if modifiers else 'none'}"
+            )
+        return {
+            "status": "success",
+            "key": key,
+            "modifiers": modifiers if modifiers else []
+        }
+    except Exception as e:
+        logger.error(f"Error sending key: {e}")
+        logger.exception("Full traceback:")
+        raise HTTPException(status_code=500, detail=str(e))
+@mcp.tool()
+async def type_text(
+    text: str,
+    delay: float = 0.1
+) -> Dict:
+    """Type a sequence of text characters.
+    Args:
+        text: The text to type
+        delay: Delay between keystrokes in seconds (default: 0.1)
+    """
+    try:
+        success = KeyboardManager.type_text(text, delay)
+        if not success:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Failed to type text: {text}"
+            )
+        return {
+            "status": "success",
+            "text": text,
+            "delay": delay
+        }
+    except Exception as e:
+        logger.error(f"Error typing text: {e}")
+        logger.exception("Full traceback:")
+        raise HTTPException(status_code=500, detail=str(e))
+# Initialize managers before starting server
+async def initialize_managers():
+    """Initialize all managers before starting the server."""
+    logger.info("Initializing managers...")
+    # Initialize window manager
+    try:
+        windows = WindowManager.get_window_list()
+        logger.info(f"Window manager initialized, found {len(windows)} windows")
+    except Exception as e:
+        logger.error(f"Failed to initialize window manager: {e}")
+        raise
+    # Initialize keyboard manager
+    try:
+        if not KeyboardManager.initialize():
+            raise RuntimeError("Failed to initialize keyboard manager")
+        logger.info("Keyboard manager initialized successfully")
+    except Exception as e:
+        logger.error(f"Failed to initialize keyboard manager: {e}")
+        raise
+    logger.info("All managers initialized successfully")
+# Register startup event
+@app.on_event("startup")
+async def startup_event():
+    """Initialize all components on server startup."""
+    try:
+        # Initialize our managers
+        await initialize_managers()
+        logger.info("Server initialization complete")
+    except Exception as e:
+        logger.error(f"Failed to initialize server: {e}")
+        logger.exception("Full traceback:")
+        raise
+# Create MCP app instance
+mcp_app = mcp.sse_app()
+# Mount MCP server at /
+# IT MUST BE MOUNTED AT / or else it will not work
+app.mount("/", mcp_app)
+@app.get("/health")
+async def health_check():
+    """Health check endpoint."""
+    try:
+        windows = WindowManager.get_window_list()
+        return {
+            "status": "healthy",
+            "windows_found": len(windows),
+            "version": "1.0.0"
+        }
+    except Exception as e:
+        logger.error(f"Health check failed: {e}")
+        raise HTTPException(
+            status_code=503,
+            detail=f"Service unhealthy: {str(e)}"
+        )
+# Direct-execution entry point: serves the FastAPI app with a fixed host and
+# port (no CLI parsing here; __main__.py provides the configurable entry).
+if __name__ == "__main__":
+    import uvicorn
+    logger.info("Starting MCP Window Server...")
+    # NOTE(review): 0.0.0.0 listens on every interface — confirm that exposing
+    # the screenshot/keyboard tools beyond localhost is intended.
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=8000,
+        log_level="info",
+        access_log=True
+    )

macos_screen_mcp/window_manager.py ADDED Viewed

@@ -0,0 +1,204 @@
+import logging
+from typing import List, Dict, Optional, Tuple
+try:
+    from Quartz import (
+        CGWindowListCopyWindowInfo,
+        kCGWindowListOptionOnScreenOnly,
+        kCGNullWindowID,
+        CGWindowListCreateImage,
+        CGRectNull,
+        kCGWindowImageDefault,
+        CGWindowListCreateDescriptionFromArray,
+        CGImageGetWidth,
+        CGImageGetHeight,
+        CGImageGetDataProvider,
+        CGDataProviderCopyData,
+        CGRectMake,
+        kCGWindowImageBoundsIgnoreFraming,
+        kCGWindowListOptionIncludingWindow,
+        CGImageGetBitsPerComponent,
+        CGImageGetBytesPerRow,
+        CGImageGetBitsPerPixel,
+    )
+    from Foundation import NSArray, NSDictionary
+    HAS_MACOS_APIS = True
+except ImportError:
+    HAS_MACOS_APIS = False
+from PIL import Image
+import io
+import numpy as np
+# NOTE(review): this configures the root logger as a side effect of importing
+# the module, so a later logging.basicConfig() call elsewhere is a no-op
+# unless it passes force=True — consider leaving configuration to the entry point.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class WindowManager:
+    @staticmethod
+    def get_window_list() -> List[Dict]:
+        """Get a list of all visible windows."""
+        if not HAS_MACOS_APIS:
+            logger.warning("macOS APIs not available, returning mock window list")
+            return [
+                {'id': 1, 'name': 'Mock Window 1', 'owner': 'Test App', 'bounds': {'X': 0, 'Y': 0, 'Width': 1920, 'Height': 1080}},
+                {'id': 2, 'name': 'Mock Window 2', 'owner': 'Another App', 'bounds': {'X': 100, 'Y': 100, 'Width': 800, 'Height': 600}},
+            ]
+        try:
+            window_list = CGWindowListCopyWindowInfo(
+                kCGWindowListOptionOnScreenOnly, kCGNullWindowID
+            )
+            windows = []
+            for window in window_list:
+                window_dict = dict(window)
+                if window_dict.get('kCGWindowName'):  # Only include windows with names
+                    windows.append({
+                        'id': window_dict.get('kCGWindowNumber'),
+                        'name': window_dict.get('kCGWindowName'),
+                        'owner': window_dict.get('kCGWindowOwnerName'),
+                        'bounds': window_dict.get('kCGWindowBounds'),
+                    })
+            logger.info(f"Found {len(windows)} visible windows")
+            return windows
+        except Exception as e:
+            logger.error(f"Error getting window list: {e}")
+            return []
+    @staticmethod
+    def capture_window_screenshot(window_id: int) -> Optional[bytes]:
+        """Capture a screenshot of a specific window by its ID."""
+        if not HAS_MACOS_APIS:
+            logger.warning("macOS APIs not available, returning mock screenshot")
+            # Create a simple mock screenshot
+            img = Image.new('RGB', (800, 600), color='lightgray')
+            img_byte_arr = io.BytesIO()
+            img.save(img_byte_arr, format='PNG')
+            return img_byte_arr.getvalue()
+        try:
+            logger.info(f"Attempting to capture screenshot for window {window_id}")
+            # Get window info to get bounds
+            window_list = CGWindowListCopyWindowInfo(
+                kCGWindowListOptionOnScreenOnly, kCGNullWindowID
+            )
+            target_window = None
+            for window in window_list:
+                window_dict = dict(window)
+                if window_dict.get('kCGWindowNumber') == window_id:
+                    target_window = window_dict
+                    break
+            if not target_window:
+                logger.error(f"Window {window_id} not found")
+                return None
+            # Get window bounds
+            bounds = target_window.get('kCGWindowBounds')
+            if not bounds:
+                logger.error(f"No bounds found for window {window_id}")
+                return None
+            # Create CGRect from bounds using original dimensions
+            window_bounds = CGRectMake(
+                bounds['X'],
+                bounds['Y'],
+                bounds['Width'],
+                bounds['Height']
+            )
+            logger.info(f"Window bounds: X={bounds['X']}, Y={bounds['Y']}, Width={bounds['Width']}, Height={bounds['Height']}")
+            # Get the window image using only the target window
+            logger.info("Creating window image...")
+            image = CGWindowListCreateImage(
+                window_bounds,
+                kCGWindowListOptionIncludingWindow,  # Only include the target window
+                window_id,
+                kCGWindowImageDefault
+            )
+            logger.info(f"Window image created: {image is not None}")
+            if image is None:
+                logger.error(f"Failed to capture screenshot for window {window_id}")
+                return None
+            # Get image properties
+            width = int(CGImageGetWidth(image))
+            height = int(CGImageGetHeight(image))
+            bits_per_component = CGImageGetBitsPerComponent(image)
+            bytes_per_row = CGImageGetBytesPerRow(image)
+            bits_per_pixel = CGImageGetBitsPerPixel(image)
+            logger.info(f"Image properties: {width}x{height}, {bits_per_component} bits/component, {bits_per_pixel} bits/pixel, {bytes_per_row} bytes/row")
+            # Create a new PIL Image from the CGImage
+            logger.info("Converting to PIL Image...")
+            data_provider = CGImageGetDataProvider(image)
+            if data_provider is None:
+                logger.error("Failed to get data provider from image")
+                return None
+            image_data = CGDataProviderCopyData(data_provider)
+            if image_data is None:
+                logger.error("Failed to copy image data from provider")
+                return None
+            # Convert image data to numpy array and handle BGRA to RGBA conversion
+            buffer = np.frombuffer(image_data, dtype=np.uint8)
+            array = buffer.reshape(height, bytes_per_row // 4, 4)
+            # Convert BGRA to RGBA by swapping the R and B channels
+            array = array[..., [2, 1, 0, 3]]
+            pil_image = Image.fromarray(array, mode='RGBA')
+            logger.info("Successfully converted to PIL Image")
+            # Convert to bytes
+            img_byte_arr = io.BytesIO()
+            pil_image.save(img_byte_arr, format='PNG')
+            img_byte_arr = img_byte_arr.getvalue()
+            logger.info(f"Successfully captured screenshot for window {window_id}")
+            return img_byte_arr
+        except Exception as e:
+            logger.error(f"Error capturing window screenshot: {e}")
+            logger.exception("Full traceback:")
+            return None
+    @staticmethod
+    def find_window_by_title(title: str, search_in_owner: bool = True) -> Optional[int]:
+        """
+        Find a window ID by its title or owner name (partial match).
+        Args:
+            title: The search term to look for
+            search_in_owner: Whether to also search in the owner field (default: True)
+        Returns:
+            The window ID if found, None otherwise
+        """
+        windows = WindowManager.get_window_list()
+        search_term = title.lower()
+        # First try exact match in owner field if search_in_owner is True
+        if search_in_owner:
+            for window in windows:
+                owner_name = window['owner'].lower()
+                if owner_name == search_term:
+                    logger.info(f"Found exact match in owner: '{window['owner']}'")
+                    return window['id']
+        # Then try partial match in either field
+        for window in windows:
+            window_name = window['name'].lower()
+            owner_name = window['owner'].lower()
+            # Check if the search term matches either the window name or owner
+            if search_term in window_name or (search_in_owner and search_term in owner_name):
+                logger.info(f"Found window with title '{title}' in name '{window['name']}' or owner '{window['owner']}'")
+                return window['id']
+        logger.warning(f"No window found with title or owner containing '{title}'")
+        return None