minitap-mcp 0.1.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
1
+ You will be given _two screenshots_.
2
+
3
+ 1. "Expected screenshot" — this is the design from Figma.
4
+ 2. "Implemented screenshot" — this is the actual phone screen that has been built.
5
+
6
+ Your task is to **compare the two screenshots** in detail, and generate a structured report that includes:
7
+
8
+ - A comprehensive list of **all visible differences** between the expected design and the implemented screen.
9
+ - For each difference, provide:
10
+ - A clear **description** of what changed (for example: "The 'Submit' button label changed from 'Submit' to 'Send'", "The icon moved 8px to the right", "The background colour of header changed from #FFFFFF to #F6F6F6", etc.).
11
+ - The **type of change** (e.g., text change, color change, position/movement, size change, added element, removed element, style change).
12
+ - The **location** of the change (for example: "bottom-centre of screen", "top header area", "to the right of search bar"). If possible, approximate coordinates or bounding box (e.g., "approx. 240×180 px at screen width 1080").
13
+ - The **impact on implementation** (i.e., reasoning about what this means: "The implemented version uses a different text label – so behaviour may differ", "The icon moved and may overlap another element", etc.).
14
+ - A **recommendation** if relevant (e.g., "Should revert to #FFFFFF to match design", "Check alignment of icon relative to search bar", etc.).
15
+
16
+ **Important**:
17
+
18
+ - Assume the screenshots are aligned (same resolution and scale); if they are not aligned, mention that as a difference.
19
+ - Focus on _visible UI differences_ (layout, text, style, iconography) – you do _not_ need to inspect source code, only what is visually rendered.
20
+ - Do _not_ produce generic comments like "looks like a difference" – aim for _precise, actionable descriptions_.
21
+ - **IGNORE dynamic/personal content** that naturally differs between mockups and real implementations:
22
+ - User profile information (names, usernames, email addresses, profile pictures)
23
+ - Time-based information (current time, dates, timestamps, "2 hours ago", etc.)
24
+ - Dynamic data (notification counts, unread badges, live statistics)
25
+ - Sample/placeholder content that varies (e.g., "John Doe" vs "Jane Smith")
26
+ - System status information (battery level, signal strength, network indicators)
27
+ - Only flag these as differences if the _structure, layout, or styling_ of these elements differs, not the content itself.
28
+ - Output in a structured format, for example:
29
+
30
+ ```
31
+
32
+ 1. Location: [top header – full width]
33
+ Change: Background colour changed from #FFFFFF → #F6F6F6
34
+ Type: Colour change
35
+ Impact: The header will appear darker than design; text contrast may be lower.
36
+ Recommendation: Update header background to #FFFFFF as in design.
37
+
38
+ ```
39
+
40
+ - At the end produce a summary with ONLY:
41
+ - Total number of differences found
42
+ - Overall "match score" out of 100 (your estimation of how closely the implementation matches the design)
43
+ - Do NOT include any recap, overview, or macro-level summary of changes - all details are already captured in the differences list above.
44
+
45
+ ### Input:
46
+
47
+ - Screenshot A: Expected (Figma)
48
+ - Screenshot B: Implemented (Phone)
49
+ Provide both screenshots and then the prompt.
50
+
51
+ ### Output:
52
+
53
+ Structured list of differences + summary.
54
+
55
+ Please use the following to start the analysis.
56
+ **Input:**
57
+ First screen is the Figma screenshot (what is expected)
58
+ Second screen is what was implemented (taken from the phone, after the implementation)
59
+
60
+ You will have this data in the next messages sent by the user.
61
+
62
+ Go ahead and generate your report.
@@ -0,0 +1,65 @@
1
+ import asyncio
2
+ from pathlib import Path
3
+ from uuid import uuid4
4
+
5
+ from jinja2 import Template
6
+ from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
7
+ from pydantic import BaseModel
8
+
9
+ from minitap.mcp.core.device import capture_screenshot, find_mobile_device
10
+ from minitap.mcp.core.llm import get_minitap_llm
11
+ from minitap.mcp.core.utils import get_screenshot_message_for_llm
12
+
13
+
14
class CompareScreenshotsOutput(BaseModel):
    """Result of a screenshot comparison: the LLM's text plus both screenshots."""

    # The LLM response content, converted to a string.
    comparison_text: str
    # The expected screenshot, passed through unchanged from the caller.
    expected_screenshot_base64: str
    # The value returned by capture_screenshot() for the connected device.
    current_screenshot_base64: str
18
+
19
+
20
async def compare_screenshots(
    expected_screenshot_base64: str,
) -> CompareScreenshotsOutput:
    """
    Ask the LLM to compare an expected screenshot with the device's current screen.

    The system prompt is loaded from ``compare_screenshots.md`` next to this module,
    a screenshot is captured from the mobile device that was found, and both
    screenshots are sent to the model.

    Args:
        expected_screenshot_base64: The reference screenshot to match.

    Returns:
        CompareScreenshotsOutput with the comparison text and both screenshots.
    """
    prompt_path = Path(__file__).parent.joinpath("compare_screenshots.md")
    system_message = Template(prompt_path.read_text(encoding="utf-8")).render()

    current_screenshot = capture_screenshot(find_mobile_device())

    # Each screenshot is preceded by a short text message telling the model which one it is.
    labelled_screenshots = (
        (
            "Here is the Figma screenshot (what needs to be matched):",
            expected_screenshot_base64,
        ),
        (
            "Here is the screenshot of the mobile device:",
            current_screenshot,
        ),
    )
    messages: list[BaseMessage] = [SystemMessage(content=system_message)]
    for label, screenshot in labelled_screenshots:
        messages.append(HumanMessage(content=label))
        messages.append(get_screenshot_message_for_llm(screenshot))

    llm = get_minitap_llm(
        trace_id=str(uuid4()),
        remote_tracing=True,
        model="google/gemini-2.5-pro",
        temperature=1,
    )
    response = await llm.ainvoke(messages)

    return CompareScreenshotsOutput(
        comparison_text=str(response.content),
        expected_screenshot_base64=expected_screenshot_base64,
        current_screenshot_base64=current_screenshot,
    )
56
+
57
+
58
async def main():
    """Run a comparison against a placeholder input and print the result as JSON."""
    placeholder_screenshot = "Base64 encoded screenshot to compare with."
    comparison = await compare_screenshots(placeholder_screenshot)
    print(comparison.model_dump_json(indent=2))


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,64 @@
1
+ You are an expert at parsing React/TypeScript code to extract asset URLs and generate clean, documented code implementations.
2
+
3
+ Your task is to:
4
+
5
+ 1. Extract all asset URLs from the provided code snippet
6
+ 2. Generate a clean `code_implementation` output that includes the React code with embedded comments referencing implementation and node guidelines
7
+
8
+ **Instructions:**
9
+
10
+ ## Part 1: Extract Asset URLs
11
+
12
+ 1. Look for all constant declarations that contain URLs pointing to assets (images, SVGs, etc.)
13
+ 2. These constants typically follow patterns like:
14
+
15
+ - `const imgVariableName = "http://localhost:3845/assets/[hash].[extension]";`
16
+ - The variable names usually start with `img` followed by a descriptive name in camelCase
17
+
18
+ 3. For each asset URL found, extract:
19
+ - The **variable name** (e.g., `imgSignal`, `imgBatteryThreeQuarters`)
20
+ - The **full URL** (e.g., `http://localhost:3845/assets/685c5ac58caa29556e29737cf8f8c9605d9c8571.svg`)
21
+ - The **file extension** from the URL (e.g., `svg`, `png`, `jpg`)
22
+
23
+ ## Part 2: Generate Code Implementation
24
+
25
+ The `code_implementation` field should contain:
26
+
27
+ 1. The React/TypeScript code with **LOCAL asset imports** instead of HTTP URLs:
28
+
29
+ - Convert `const imgSignal = "http://localhost:3845/assets/[hash].svg";`
30
+ - To `import imgSignal from './assets/imgSignal.svg';` (or appropriate relative path)
31
+ - Use the **exact same variable names** as in the original const declarations
32
+ - **CRITICAL**: Preserve the variable naming convention
33
+
34
+ 2. Preserve all `data-node-id` attributes and other metadata in the code
35
+
36
+ ## Part 3: Return Format
37
+
38
+ Return a JSON object with two fields:
39
+
40
+ - `assets`: Array of extracted asset objects
41
+ - `code_implementation`: String containing the React code with embedded guideline comments
42
+
43
+ ```json
44
+ {
45
+ "assets": [
46
+ {
47
+ "variable_name": "imgSignal",
48
+ "url": "http://localhost:3845/assets/685c5ac58caa29556e29737cf8f8c9605d9c8571.svg",
49
+ "extension": "svg"
50
+ },
51
+ ...
52
+ ],
53
+ "code_implementation": "import ... function ..."
54
+ }
55
+ ```
56
+
57
+ **Important:**
58
+
59
+ - Only extract asset URLs
60
+ - Preserve the exact variable names as they appear in the code
61
+ - DO NOT MISS any assets
62
+ - If no assets are found, return an empty array for `assets`
63
+ - Return ONLY the JSON object with both `assets` and `code_implementation` fields
64
+ - Do NOT include the const declarations of the assets in the code_implementation output - convert them to imports.
@@ -0,0 +1,65 @@
1
+ """Agent to extract Figma asset URLs from design context code."""
2
+
3
+ from pathlib import Path
4
+ from uuid import uuid4
5
+
6
+ from jinja2 import Template
7
+ from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
8
+ from pydantic import BaseModel, Field
9
+
10
+ from minitap.mcp.core.llm import get_minitap_llm
11
+
12
+
13
class FigmaAsset(BaseModel):
    """Represents a single Figma asset.

    Used as the element type of ``ExtractedAssets.assets``.
    """

    variable_name: str = Field(description="The variable name from the code (e.g., imgSignal)")
    url: str = Field(description="The full URL to the asset")
    extension: str = Field(description="The file extension (e.g., svg, png, jpg)")
19
+
20
+
21
class ExtractedAssets(BaseModel):
    """Container for all extracted Figma assets.

    This model is the structured-output schema requested from the LLM, so the
    field descriptions below are part of what the model is asked to produce.
    """

    assets: list[FigmaAsset] = Field(
        default_factory=list,
        description="List of all extracted assets from the Figma design context",
    )
    # The description must agree with the system prompt, which asks for the
    # asset URL const declarations to be replaced by local imports.
    code_implementation: str = Field(
        description=(
            "The React/TypeScript code\n"
            "with the asset URL const declarations turned into local import statements"
        )
    )
34
+
35
+
36
async def extract_figma_assets(design_context_code: str) -> ExtractedAssets:
    """Extract asset URLs from Figma design context code.

    The system prompt is loaded from ``extract_figma_assets.md`` next to this
    module and the code is sent to the LLM inside a fenced block.

    Args:
        design_context_code: The React/TypeScript code from get_design_context

    Returns:
        The structured ``ExtractedAssets`` result produced by the LLM.
    """
    prompt_file = Path(__file__).parent.joinpath("extract_figma_assets.md")
    system_message = Template(prompt_file.read_text(encoding="utf-8")).render()

    user_prompt = f"Here is the code to analyze:\n\n```typescript\n{design_context_code}\n```"
    messages: list[BaseMessage] = [
        SystemMessage(content=system_message),
        HumanMessage(content=user_prompt),
    ]

    base_llm = get_minitap_llm(
        trace_id=str(uuid4()),
        remote_tracing=True,
        model="google/gemini-2.5-pro",
        temperature=0,
    )
    # Bind the output schema so the response is parsed into ExtractedAssets.
    structured_llm = base_llm.with_structured_output(ExtractedAssets)

    extraction: ExtractedAssets = await structured_llm.ainvoke(messages)  # type: ignore
    return extraction
@@ -14,11 +14,14 @@ class MCPSettings(BaseSettings):
14
14
  model_config = SettingsConfigDict(env_file=".env", extra="ignore")
15
15
 
16
16
  # Minitap API configuration
17
- MINITAP_API_KEY: SecretStr
17
+ MINITAP_API_KEY: SecretStr | None = Field(default=None)
18
18
  MINITAP_API_BASE_URL: str = Field(default="https://platform.minitap.ai/api/v1")
19
19
 
20
20
  VISION_MODEL: str = Field(default="qwen/qwen-2.5-vl-7b-instruct")
21
21
 
22
+ # Figma MCP server configuration
23
+ FIGMA_MCP_SERVER_URL: str = Field(default="http://127.0.0.1:3845/mcp")
24
+
22
25
  # MCP server configuration (optional, for remote access)
23
26
  MCP_SERVER_HOST: str = Field(default="0.0.0.0")
24
27
  MCP_SERVER_PORT: int = Field(default=8000)
@@ -0,0 +1,59 @@
1
+ """Core models for the MCP server."""
2
+
3
+ from enum import Enum
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class FigmaAsset(BaseModel):
    """Represents a single Figma asset.

    All three fields are required strings; none has a default.
    """

    variable_name: str = Field(description="The variable name from the code (e.g., imgSignal)")
    url: str = Field(description="The full URL to the asset")
    extension: str = Field(description="The file extension (e.g., svg, png, jpg)")
14
+
15
+
16
class FigmaDesignContextOutput(BaseModel):
    """Output from Figma design context containing code and guidelines.

    Only ``code_implementation`` is required; both guideline fields default to None.
    """

    code_implementation: str = Field(description="The React/TypeScript code implementation")
    code_implementation_guidelines: str | None = Field(
        default=None, description="Guidelines for implementing the code"
    )
    nodes_guidelines: str | None = Field(
        default=None, description="Guidelines specific to the nodes"
    )
26
+
27
+
28
class DownloadStatus(str, Enum):
    """Status of asset download operation.

    Mixing in ``str`` makes each member compare equal to its string value.
    """

    SUCCESS = "success"
    FAILED = "failed"
33
+
34
+
35
class AssetDownloadResult(BaseModel):
    """Result of downloading a single asset.

    ``error`` defaults to None and is meant to carry a message for failed downloads.
    """

    filename: str = Field(description="The filename of the asset")
    status: DownloadStatus = Field(description="The download status")
    error: str | None = Field(default=None, description="Error message if download failed")
41
+
42
+
43
class AssetDownloadSummary(BaseModel):
    """Summary of all asset download operations.

    Results are kept in two separate lists, each starting out empty.
    """

    successful: list[AssetDownloadResult] = Field(
        default_factory=list, description="List of successfully downloaded assets"
    )
    failed: list[AssetDownloadResult] = Field(
        default_factory=list, description="List of failed asset downloads"
    )

    def success_count(self) -> int:
        """Return the number of successful downloads."""
        return len(self.successful)

    def failure_count(self) -> int:
        """Return the number of failed downloads."""
        return len(self.failed)
@@ -0,0 +1,27 @@
1
+ import os
2
+
3
+ from minitap.mobile_use.sdk import Agent
4
+ from minitap.mobile_use.sdk.builders import Builders
5
+
6
+ # Lazy-initialized singleton agent
7
+ _agent: Agent | None = None
8
+
9
+
10
def get_mobile_use_agent() -> Agent:
    """Get or create the mobile-use agent singleton.

    This function lazily initializes the agent on first call, ensuring
    that CLI arguments are parsed before agent creation.

    If the ``ADB_SERVER_SOCKET`` environment variable is set, it must consist of
    exactly three colon-separated parts; the second is used as the ADB server
    host and the third as its port.

    Returns:
        The shared Agent instance.

    Raises:
        ValueError: If ``ADB_SERVER_SOCKET`` is set but does not have three
            colon-separated parts or its port is not an integer.
    """
    global _agent
    if _agent is not None:
        return _agent

    config = Builders.AgentConfig
    custom_adb_socket = os.getenv("ADB_SERVER_SOCKET")
    if custom_adb_socket:
        parts = custom_adb_socket.split(":")
        if len(parts) != 3:
            raise ValueError(f"Invalid ADB server socket: {custom_adb_socket}")
        _, host, port = parts
        try:
            port_number = int(port)
        except ValueError:
            # Report a bad port the same way as a malformed socket string
            # instead of leaking the bare int() parsing error.
            raise ValueError(f"Invalid ADB server socket: {custom_adb_socket}") from None
        config = config.with_adb_server(host=host, port=port_number)

    _agent = Agent(config=config.build())
    return _agent
minitap/mcp/main.py CHANGED
@@ -23,18 +23,56 @@ if sys.platform == "win32":
23
23
 
24
24
 
25
25
  from fastmcp import FastMCP # noqa: E402
26
+ from minitap.mobile_use.config import settings as sdk_settings
26
27
 
27
- from minitap.mcp.core.agents import agent
28
28
  from minitap.mcp.core.config import settings # noqa: E402
29
- from minitap.mcp.core.device import (
30
- DeviceInfo, # noqa: E402
31
- list_available_devices, # noqa: E402; noqa: E402
32
- )
29
+ from minitap.mcp.core.device import DeviceInfo # noqa: E402
30
+ from minitap.mcp.core.device import list_available_devices
33
31
  from minitap.mcp.server.middleware import MaestroCheckerMiddleware
34
32
  from minitap.mcp.server.poller import device_health_poller
35
33
 
36
- logger = logging.getLogger(__name__)
37
34
 
35
def main() -> None:
    """Main entry point for the MCP server.

    Parses the command line, copies ``--api-key`` and ``--llm-profile`` into the
    environment (re-initializing both settings objects after each one), then
    starts the MCP server either over HTTP (``--server``) or in local mode.

    Raises:
        ValueError: If no Minitap API key is configured after applying the CLI options.
    """
    parser = argparse.ArgumentParser(description="Mobile Use MCP Server")
    parser.add_argument("--api-key", type=str, required=False, default=None)
    parser.add_argument("--llm-profile", type=str, required=False, default=None)
    parser.add_argument(
        "--server",
        action="store_true",
        help="Run as network server (uses MCP_SERVER_HOST and MCP_SERVER_PORT from env)",
    )
    args = parser.parse_args()

    # CLI values are exported as environment variables; the settings objects are
    # then re-initialized so they pick up the new values.
    cli_overrides = (
        ("MINITAP_API_KEY", args.api_key),
        ("MINITAP_LLM_PROFILE_NAME", args.llm_profile),
    )
    for env_name, cli_value in cli_overrides:
        if not cli_value:
            continue
        os.environ[env_name] = cli_value
        settings.__init__()
        sdk_settings.__init__()

    if not settings.MINITAP_API_KEY:
        raise ValueError("Minitap API key is required to run the MCP")

    if not args.server:
        logger.info("Starting MCP server in local mode")
        mcp_lifespan()
        return

    # Run MCP server with host/port for remote access
    logger.info(f"Starting MCP server on {settings.MCP_SERVER_HOST}:{settings.MCP_SERVER_PORT}")
    mcp_lifespan(
        transport="http",
        host=settings.MCP_SERVER_HOST,
        port=settings.MCP_SERVER_PORT,
    )
73
+
74
+
75
+ logger = logging.getLogger(__name__)
38
76
 
39
77
  mcp = FastMCP(
40
78
  name="mobile-use-mcp",
@@ -44,12 +82,10 @@ mcp = FastMCP(
44
82
  Call get_available_devices() to list them.
45
83
  """,
46
84
  )
47
-
48
- from minitap.mcp.tools import ( # noqa: E402, F401
49
- analyze_screen,
50
- execute_mobile_command,
51
- go_back,
52
- )
85
+ from minitap.mcp.tools import analyze_screen # noqa: E402, F401
86
+ from minitap.mcp.tools import compare_screenshot_with_figma # noqa: E402, F401
87
+ from minitap.mcp.tools import execute_mobile_command # noqa: E402, F401
88
+ from minitap.mcp.tools import save_figma_assets # noqa: E402, F401
53
89
 
54
90
 
55
91
  @mcp.resource("data://devices")
@@ -59,6 +95,9 @@ def get_available_devices() -> list[DeviceInfo]:
59
95
 
60
96
 
61
97
  def mcp_lifespan(**mcp_run_kwargs):
98
+ from minitap.mcp.core.sdk_agent import get_mobile_use_agent # noqa: E402
99
+
100
+ agent = get_mobile_use_agent()
62
101
  mcp.add_middleware(MaestroCheckerMiddleware(agent))
63
102
 
64
103
  # Start device health poller in background
@@ -70,40 +109,25 @@ def mcp_lifespan(**mcp_run_kwargs):
70
109
  stop_event,
71
110
  agent,
72
111
  ),
112
+ daemon=True,
73
113
  )
74
114
  poller_thread.start()
75
115
 
76
116
  try:
77
117
  mcp.run(**mcp_run_kwargs)
78
118
  except KeyboardInterrupt:
79
- pass
80
-
81
- # Stop device health poller
82
- stop_event.set()
83
- logger.info("Device health poller stopping...")
84
- poller_thread.join()
85
- logger.info("Device health poller stopped")
86
-
87
-
88
- def main() -> None:
89
- """Main entry point for the MCP server."""
90
- parser = argparse.ArgumentParser(description="Mobile Use MCP Server")
91
- parser.add_argument(
92
- "--server",
93
- action="store_true",
94
- help="Run as network server (uses MCP_SERVER_HOST and MCP_SERVER_PORT from env)",
95
- )
96
-
97
- args = parser.parse_args()
98
-
99
- # Run MCP server with optional host/port for remote access
100
- if args.server:
101
- logger.info(f"Starting MCP server on {settings.MCP_SERVER_HOST}:{settings.MCP_SERVER_PORT}")
102
- mcp_lifespan(
103
- transport="http",
104
- host=settings.MCP_SERVER_HOST,
105
- port=settings.MCP_SERVER_PORT,
106
- )
107
- else:
108
- logger.info("Starting MCP server in local mode")
109
- mcp_lifespan()
119
+ logger.info("Keyboard interrupt received, shutting down...")
120
+ except Exception as e:
121
+ logger.error(f"Error running MCP server: {e}")
122
+ finally:
123
+ # Stop device health poller
124
+ logger.info("Stopping device health poller...")
125
+ stop_event.set()
126
+
127
+ # Give the poller thread a reasonable time to stop gracefully
128
+ poller_thread.join(timeout=10.0)
129
+
130
+ if poller_thread.is_alive():
131
+ logger.warning("Device health poller thread did not stop gracefully")
132
+ else:
133
+ logger.info("Device health poller stopped successfully")
@@ -1,38 +1,78 @@
1
1
  """Device health monitoring poller for the MCP server."""
2
2
 
3
+ import asyncio
3
4
  import logging
4
- import time
5
5
  import threading
6
6
 
7
- from minitap.mcp.core.device import list_available_devices
8
7
  from minitap.mobile_use.sdk import Agent
9
8
 
9
+ from minitap.mcp.core.device import list_available_devices
10
+
10
11
  logger = logging.getLogger(__name__)
11
12
 
12
13
 
13
- def device_health_poller(stop_event: threading.Event, agent: Agent) -> None:
14
+ async def _async_device_health_poller(stop_event: threading.Event, agent: Agent) -> None:
14
15
  """
15
- Background poller that monitors device availability and agent health.
16
- Runs every 5 seconds to ensure a device is connected and the agent is healthy.
16
+ Async implementation of device health poller.
17
17
 
18
18
  Args:
19
+ stop_event: Threading event to signal when to stop polling.
19
20
  agent: The Agent instance to monitor and reinitialize if needed.
20
21
  """
21
22
  while not stop_event.is_set():
22
23
  try:
23
- time.sleep(5)
24
+ # Sleep in smaller chunks to be more responsive to stop signal
25
+ for _ in range(50): # 50 * 0.1 = 5 seconds total
26
+ if stop_event.is_set():
27
+ break
28
+ await asyncio.sleep(0.1)
29
+
30
+ if stop_event.is_set():
31
+ break
24
32
 
25
33
  devices = list_available_devices()
26
34
 
27
35
  if len(devices) > 0:
28
36
  if not agent.is_healthy():
29
37
  logger.warning("Agent is not healthy. Reinitializing...")
30
- agent.clean(force=True)
31
- agent.init()
38
+ await agent.clean(force=True)
39
+ await agent.init()
32
40
  logger.info("Agent reinitialized successfully")
33
41
  else:
34
42
  logger.info("No mobile device found, retrying in 5 seconds...")
35
43
 
36
44
  except Exception as e:
37
45
  logger.error(f"Error in device health poller: {e}")
38
- agent.clean(force=True)
46
+
47
+ try:
48
+ await agent.clean(force=True)
49
+ logger.info("Agent cleaned up successfully")
50
+ except Exception as e:
51
+ logger.error(f"Error cleaning up agent: {e}")
52
+
53
+
54
def device_health_poller(stop_event: threading.Event, agent: Agent) -> None:
    """
    Background poller that monitors device availability and agent health.
    Runs every 5 seconds to ensure a device is connected and the agent is healthy.

    This is a sync wrapper that runs the async poller in a new event loop.
    Errors are logged rather than raised, so this function does not propagate
    exceptions to its caller.

    Args:
        stop_event: Threading event to signal when to stop polling.
        agent: The Agent instance to monitor and reinitialize if needed.
    """
    try:
        # asyncio.run creates a fresh event loop, runs the coroutine to completion
        # and always closes the loop afterwards, so no manual loop bookkeeping
        # (and no silently swallowed close() error) is needed here.
        asyncio.run(_async_device_health_poller(stop_event, agent))
    except Exception as e:
        logger.error(f"Error in device health poller thread: {e}")