minitap-mobile-use 2.5.3__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (43)
  1. minitap/mobile_use/agents/contextor/contextor.py +0 -8
  2. minitap/mobile_use/agents/cortex/cortex.md +122 -36
  3. minitap/mobile_use/agents/cortex/cortex.py +32 -17
  4. minitap/mobile_use/agents/cortex/types.py +18 -4
  5. minitap/mobile_use/agents/executor/executor.md +3 -3
  6. minitap/mobile_use/agents/executor/executor.py +10 -3
  7. minitap/mobile_use/agents/hopper/hopper.md +30 -2
  8. minitap/mobile_use/agents/hopper/hopper.py +19 -15
  9. minitap/mobile_use/agents/orchestrator/orchestrator.py +14 -5
  10. minitap/mobile_use/agents/outputter/outputter.py +13 -3
  11. minitap/mobile_use/agents/planner/planner.md +20 -9
  12. minitap/mobile_use/agents/planner/planner.py +12 -5
  13. minitap/mobile_use/agents/screen_analyzer/human.md +16 -0
  14. minitap/mobile_use/agents/screen_analyzer/screen_analyzer.py +111 -0
  15. minitap/mobile_use/clients/ios_client.py +7 -3
  16. minitap/mobile_use/config.py +87 -24
  17. minitap/mobile_use/controllers/mobile_command_controller.py +354 -88
  18. minitap/mobile_use/controllers/platform_specific_commands_controller.py +41 -27
  19. minitap/mobile_use/controllers/types.py +95 -0
  20. minitap/mobile_use/graph/graph.py +55 -11
  21. minitap/mobile_use/graph/state.py +10 -3
  22. minitap/mobile_use/main.py +12 -4
  23. minitap/mobile_use/sdk/agent.py +113 -72
  24. minitap/mobile_use/sdk/examples/smart_notification_assistant.py +59 -10
  25. minitap/mobile_use/sdk/services/platform.py +15 -1
  26. minitap/mobile_use/sdk/types/platform.py +1 -0
  27. minitap/mobile_use/sdk/types/task.py +10 -1
  28. minitap/mobile_use/servers/device_hardware_bridge.py +13 -6
  29. minitap/mobile_use/services/llm.py +5 -2
  30. minitap/mobile_use/tools/index.py +7 -9
  31. minitap/mobile_use/tools/mobile/{clear_text.py → focus_and_clear_text.py} +7 -7
  32. minitap/mobile_use/tools/mobile/{input_text.py → focus_and_input_text.py} +8 -8
  33. minitap/mobile_use/tools/mobile/long_press_on.py +130 -15
  34. minitap/mobile_use/tools/mobile/swipe.py +3 -26
  35. minitap/mobile_use/tools/mobile/tap.py +41 -28
  36. minitap/mobile_use/tools/mobile/wait_for_delay.py +84 -0
  37. minitap/mobile_use/utils/cli_helpers.py +10 -6
  38. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/METADATA +1 -1
  39. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/RECORD +41 -39
  40. minitap/mobile_use/tools/mobile/glimpse_screen.py +0 -74
  41. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +0 -64
  42. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/WHEEL +0 -0
  43. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/entry_points.txt +0 -0
@@ -24,7 +24,13 @@ You work like an agile tech lead: defining the key milestones without locking in
24
24
  - A list of **agent thoughts**, including observations from the device, challenges encountered, and reasoning about what happened
25
25
  - Take into account the agent thoughts/previous plan to update the plan : maybe some steps are not required as we successfully completed them.
26
26
 
27
- Use these inputs to update the plan: removing dead ends, adapting to what we learned, and suggesting new directions.
27
+ Your job is **not to restart from scratch**. Instead:
28
+
29
+ - Exclude subgoals that are already marked completed.
30
+ - Begin the new plan at the **next major action** after the last success.
31
+ - Use **agent thoughts only** as the source of truth when deciding what went wrong and what is possible next.
32
+ - If a subgoal failed or was partially wrong, redefine it based on what the agent thoughts revealed (e.g., pivot to 'search' if a contact wasn't in recent chats).
33
+ - Ensure the replanned steps still drive toward the original user goal, but always flow logically from the **current known state**.
28
34
 
29
35
  ### Output
30
36
 
@@ -56,17 +62,22 @@ Each subgoal should be:
56
62
 
57
63
  #### **Replanning Example**
58
64
 
59
- **Original Plan**: same as above
60
- **Agent Thoughts**:
65
+ **Original Plan**:
66
+ - Open the WhatsApp app to find the contact "Alice" (COMPLETED)
67
+ - Open the conversation with Alice to send a message (FAILED)
68
+ - Type the message "I'm running late" into the message field (NOT_STARTED)
69
+ - Send the message (NOT_STARTED)
61
70
 
62
- - Couldn't find Alice in recent chats
63
- - Search bar was present on top of the chat screen
64
- - Keyboard appeared after tapping search
71
+ **Agent Thoughts**:
72
+ - Successfully launched WhatsApp app
73
+ - Couldn't find Alice in recent chats - scrolled through visible conversations but no match
74
+ - Search bar was present on top of the chat screen with resource-id com.whatsapp:id/menuitem_search
75
+ - Previous approach of manually scrolling through chats is inefficient for this case
65
76
 
66
77
  **New Plan**:
67
-
68
- - Open WhatsApp
69
78
  - Tap the search bar to find a contact
70
79
  - Search for "Alice" in the search field
71
80
  - Select the correct chat to open the conversation
72
- - Type and send "Im running late"
81
+ - Type and send "I'm running late"
82
+
83
+ **Reasoning**: The agent thoughts reveal that WhatsApp is already open (first subgoal completed), but Alice wasn't in recent chats. Rather than restarting, we pivot to using the search feature that was observed, continuing from the current state.
@@ -7,7 +7,7 @@ from minitap.mobile_use.agents.planner.types import PlannerOutput, Subgoal, Subg
7
7
  from minitap.mobile_use.agents.planner.utils import generate_id, one_of_them_is_failure
8
8
  from minitap.mobile_use.context import MobileUseContext
9
9
  from minitap.mobile_use.graph.state import State
10
- from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
10
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
11
11
  from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
12
12
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
13
13
  from minitap.mobile_use.utils.logger import get_logger
@@ -46,10 +46,17 @@ class PlannerNode:
46
46
  HumanMessage(content=human_message),
47
47
  ]
48
48
 
49
- llm = get_llm(ctx=self.ctx, name="planner")
50
- llm = llm.with_structured_output(PlannerOutput)
51
- response: PlannerOutput = await invoke_llm_with_timeout_message(
52
- llm.ainvoke(messages), agent_name="Planner"
49
+ llm = get_llm(ctx=self.ctx, name="planner").with_structured_output(PlannerOutput)
50
+ llm_fallback = get_llm(
51
+ ctx=self.ctx, name="planner", use_fallback=True
52
+ ).with_structured_output(PlannerOutput)
53
+ response: PlannerOutput = await with_fallback(
54
+ main_call=lambda: invoke_llm_with_timeout_message(
55
+ llm.ainvoke(messages), agent_name="Planner"
56
+ ),
57
+ fallback_call=lambda: invoke_llm_with_timeout_message(
58
+ llm_fallback.ainvoke(messages), agent_name="Planner (Fallback)"
59
+ ),
53
60
  ) # type: ignore
54
61
  subgoals_plan = [
55
62
  Subgoal(
@@ -0,0 +1,16 @@
1
+ ## Task
2
+
3
+ Analyze the provided screenshot and answer the following specific question:
4
+
5
+ {{ prompt }}
6
+
7
+ ## Instructions
8
+
9
+ 1. Look carefully at the screenshot
10
+ 2. Provide a concise, direct answer to the question
11
+ 3. Focus only on what is visible in the screenshot
12
+ 4. Be specific and factual
13
+
14
+ ## Output
15
+
16
+ Provide your analysis as a clear, concise text response.
@@ -0,0 +1,111 @@
1
+ from pathlib import Path
2
+
3
+ from jinja2 import Template
4
+ from langchain_core.messages import HumanMessage, SystemMessage
5
+
6
+ from minitap.mobile_use.context import MobileUseContext
7
+ from minitap.mobile_use.controllers.mobile_command_controller import (
8
+ take_screenshot as take_screenshot_controller,
9
+ )
10
+ from minitap.mobile_use.graph.state import State
11
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
12
+ from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
13
+ from minitap.mobile_use.utils.decorators import wrap_with_callbacks
14
+ from minitap.mobile_use.utils.logger import get_logger
15
+ from minitap.mobile_use.utils.media import compress_base64_jpeg
16
+
17
+ logger = get_logger(__name__)
18
+
19
+
20
+ class ScreenAnalyzerNode:
21
+ def __init__(self, ctx: MobileUseContext):
22
+ self.ctx = ctx
23
+
24
+ @wrap_with_callbacks(
25
+ before=lambda: logger.info("Starting Screen Analyzer Agent..."),
26
+ on_success=lambda _: logger.success("Screen Analyzer Agent"),
27
+ on_failure=lambda _: logger.error("Screen Analyzer Agent"),
28
+ )
29
+ async def __call__(self, state: State):
30
+ # Check if there's a screen analysis request
31
+ if not state.screen_analysis_prompt:
32
+ logger.info("No screen analysis prompt, skipping")
33
+ return {}
34
+
35
+ prompt = state.screen_analysis_prompt
36
+ analysis_result = "Analysis failed"
37
+ has_failed = False
38
+
39
+ try:
40
+ # Take a fresh screenshot
41
+ screenshot_output = take_screenshot_controller(ctx=self.ctx)
42
+ compressed_image_base64 = compress_base64_jpeg(screenshot_output)
43
+
44
+ # Invoke the screen_analyzer
45
+ analysis_result = await screen_analyzer(
46
+ ctx=self.ctx, screenshot_base64=compressed_image_base64, prompt=prompt
47
+ )
48
+
49
+ except Exception as e:
50
+ logger.error(f"Screen analysis failed: {e}")
51
+ analysis_result = f"Failed to analyze screen: {str(e)}"
52
+ has_failed = True
53
+
54
+ # Create outcome message
55
+ if has_failed:
56
+ agent_outcome = f"Screen analysis failed: {analysis_result}"
57
+ else:
58
+ agent_outcome = f"Screen analysis result: {analysis_result}"
59
+
60
+ return await state.asanitize_update(
61
+ ctx=self.ctx,
62
+ update={
63
+ "agents_thoughts": [agent_outcome],
64
+ "screen_analysis_prompt": None,
65
+ },
66
+ agent="screen_analyzer",
67
+ )
68
+
69
+
70
+ async def screen_analyzer(ctx: MobileUseContext, screenshot_base64: str, prompt: str) -> str:
71
+ """
72
+ Analyzes a screenshot using a VLM and returns a textual description based on the prompt.
73
+
74
+ Args:
75
+ ctx: The mobile use context
76
+ screenshot_base64: Base64 encoded screenshot
77
+ prompt: The specific question or instruction for analyzing the screenshot
78
+
79
+ Returns:
80
+ A concise textual description answering the prompt
81
+ """
82
+ logger.info("Starting Screen Analyzer Agent")
83
+
84
+ system_message = (
85
+ "You are a visual analysis assistant. "
86
+ "Your task is to examine screenshots and provide accurate, "
87
+ "concise answers to specific questions about what you see."
88
+ )
89
+
90
+ human_message = Template(
91
+ Path(__file__).parent.joinpath("human.md").read_text(encoding="utf-8")
92
+ ).render(prompt=prompt)
93
+
94
+ messages = [
95
+ SystemMessage(content=system_message),
96
+ get_screenshot_message_for_llm(screenshot_base64),
97
+ HumanMessage(content=human_message),
98
+ ]
99
+
100
+ llm = get_llm(ctx=ctx, name="screen_analyzer", temperature=0)
101
+ llm_fallback = get_llm(ctx=ctx, name="screen_analyzer", use_fallback=True, temperature=0)
102
+
103
+ response = await with_fallback(
104
+ main_call=lambda: invoke_llm_with_timeout_message(
105
+ llm.ainvoke(messages), agent_name="ScreenAnalyzer"
106
+ ),
107
+ fallback_call=lambda: invoke_llm_with_timeout_message(
108
+ llm_fallback.ainvoke(messages), agent_name="ScreenAnalyzer (Fallback)"
109
+ ),
110
+ )
111
+ return response.content # type: ignore
@@ -27,9 +27,13 @@ def get_ios_devices() -> tuple[bool, list[str], str]:
27
27
 
28
28
  for runtime, devices in devices_dict.items():
29
29
  if "ios" in runtime.lower(): # e.g. "com.apple.CoreSimulator.SimRuntime.iOS-17-0"
30
- for dev in devices:
31
- if "udid" in dev:
32
- serials.append(dev["udid"])
30
+ for device in devices:
31
+ if device.get("state") != "Booted":
32
+ continue
33
+ device_udid = device.get("udid")
34
+ if not device_udid:
35
+ continue
36
+ serials.append(device_udid)
33
37
 
34
38
  return True, serials, ""
35
39
 
@@ -94,8 +94,9 @@ def record_events(output_path: Path | None, events: list[str] | BaseModel | Any)
94
94
 
95
95
  LLMProvider = Literal["openai", "google", "openrouter", "xai", "vertexai", "minitap"]
96
96
  LLMUtilsNode = Literal["outputter", "hopper"]
97
- AgentNode = Literal["planner", "orchestrator", "cortex", "executor"]
98
- AgentNodeWithFallback = Literal["cortex"]
97
+ LLMUtilsNodeWithFallback = LLMUtilsNode
98
+ AgentNode = Literal["planner", "orchestrator", "cortex", "screen_analyzer", "executor"]
99
+ AgentNodeWithFallback = AgentNode
99
100
 
100
101
  ROOT_DIR = Path(__file__).parent.parent.parent
101
102
  DEFAULT_LLM_CONFIG_FILENAME = "llm-config.defaults.jsonc"
@@ -149,21 +150,23 @@ class LLMWithFallback(LLM):
149
150
 
150
151
 
151
152
  class LLMConfigUtils(BaseModel):
152
- outputter: LLM
153
- hopper: LLM
153
+ outputter: LLMWithFallback
154
+ hopper: LLMWithFallback
154
155
 
155
156
 
156
157
  class LLMConfig(BaseModel):
157
- planner: LLM
158
- orchestrator: LLM
158
+ planner: LLMWithFallback
159
+ orchestrator: LLMWithFallback
159
160
  cortex: LLMWithFallback
160
- executor: LLM
161
+ screen_analyzer: LLMWithFallback
162
+ executor: LLMWithFallback
161
163
  utils: LLMConfigUtils
162
164
 
163
165
  def validate_providers(self):
164
166
  self.planner.validate_provider("Planner")
165
167
  self.orchestrator.validate_provider("Orchestrator")
166
168
  self.cortex.validate_provider("Cortex")
169
+ self.screen_analyzer.validate_provider("ScreenAnalyzer")
167
170
  self.executor.validate_provider("Executor")
168
171
  self.utils.outputter.validate_provider("Outputter")
169
172
  self.utils.hopper.validate_provider("Hopper")
@@ -173,16 +176,17 @@ class LLMConfig(BaseModel):
173
176
  📃 Planner: {self.planner}
174
177
  🎯 Orchestrator: {self.orchestrator}
175
178
  🧠 Cortex: {self.cortex}
179
+ 👁️ ScreenAnalyzer: {self.screen_analyzer}
176
180
  🛠️ Executor: {self.executor}
177
181
  🧩 Utils:
178
182
  🔽 Hopper: {self.utils.hopper}
179
183
  📝 Outputter: {self.utils.outputter}
180
184
  """
181
185
 
182
- def get_agent(self, item: AgentNode) -> LLM:
186
+ def get_agent(self, item: AgentNode) -> LLMWithFallback:
183
187
  return getattr(self, item)
184
188
 
185
- def get_utils(self, item: LLMUtilsNode) -> LLM:
189
+ def get_utils(self, item: LLMUtilsNode) -> LLMWithFallback:
186
190
  return getattr(self.utils, item)
187
191
 
188
192
 
@@ -196,17 +200,42 @@ def get_default_llm_config() -> LLMConfig:
196
200
  except Exception as e:
197
201
  logger.error(f"Failed to load default llm config: {e}. Falling back to hardcoded config")
198
202
  return LLMConfig(
199
- planner=LLM(provider="openai", model="gpt-4.1"),
200
- orchestrator=LLM(provider="openai", model="gpt-4.1"),
203
+ planner=LLMWithFallback(
204
+ provider="openai",
205
+ model="gpt-5-nano",
206
+ fallback=LLM(provider="openai", model="gpt-5-mini"),
207
+ ),
208
+ orchestrator=LLMWithFallback(
209
+ provider="openai",
210
+ model="gpt-5-nano",
211
+ fallback=LLM(provider="openai", model="gpt-5-mini"),
212
+ ),
201
213
  cortex=LLMWithFallback(
202
214
  provider="openai",
203
- model="o3",
204
- fallback=LLM(provider="openai", model="gpt-5"),
215
+ model="gpt-5",
216
+ fallback=LLM(provider="openai", model="o4-mini"),
217
+ ),
218
+ screen_analyzer=LLMWithFallback(
219
+ provider="openai",
220
+ model="gpt-4o",
221
+ fallback=LLM(provider="openai", model="gpt-5-mini"),
222
+ ),
223
+ executor=LLMWithFallback(
224
+ provider="openai",
225
+ model="gpt-5-nano",
226
+ fallback=LLM(provider="openai", model="gpt-5-mini"),
205
227
  ),
206
- executor=LLM(provider="openai", model="gpt-4.1"),
207
228
  utils=LLMConfigUtils(
208
- outputter=LLM(provider="openai", model="gpt-5-nano"),
209
- hopper=LLM(provider="openai", model="gpt-4.1"),
229
+ outputter=LLMWithFallback(
230
+ provider="openai",
231
+ model="gpt-5-nano",
232
+ fallback=LLM(provider="openai", model="gpt-5-mini"),
233
+ ),
234
+ hopper=LLMWithFallback(
235
+ provider="openai",
236
+ model="gpt-5-nano",
237
+ fallback=LLM(provider="openai", model="gpt-5-mini"),
238
+ ),
210
239
  ),
211
240
  )
212
241
 
@@ -223,26 +252,60 @@ def get_default_minitap_llm_config() -> LLMConfig | None:
223
252
  return None
224
253
 
225
254
  return LLMConfig(
226
- planner=LLM(provider="minitap", model="meta-llama/llama-4-scout"),
227
- orchestrator=LLM(provider="minitap", model="openai/gpt-oss-120b"),
255
+ planner=LLMWithFallback(
256
+ provider="minitap",
257
+ model="meta-llama/llama-4-scout",
258
+ fallback=LLM(provider="minitap", model="meta-llama/llama-4-maverick"),
259
+ ),
260
+ orchestrator=LLMWithFallback(
261
+ provider="minitap",
262
+ model="openai/gpt-oss-120b",
263
+ fallback=LLM(provider="minitap", model="meta-llama/llama-4-maverick"),
264
+ ),
228
265
  cortex=LLMWithFallback(
229
266
  provider="minitap",
230
267
  model="google/gemini-2.5-pro",
231
268
  fallback=LLM(provider="minitap", model="openai/gpt-5"),
232
269
  ),
233
- executor=LLM(provider="minitap", model="meta-llama/llama-3.3-70b-instruct"),
270
+ screen_analyzer=LLMWithFallback(
271
+ provider="minitap",
272
+ model="meta-llama/llama-3.2-90b-vision-instruct",
273
+ fallback=LLM(provider="minitap", model="openai/gpt-4o"),
274
+ ),
275
+ executor=LLMWithFallback(
276
+ provider="minitap",
277
+ model="meta-llama/llama-3.3-70b-instruct",
278
+ fallback=LLM(provider="minitap", model="openai/gpt-5-mini"),
279
+ ),
234
280
  utils=LLMConfigUtils(
235
- outputter=LLM(provider="minitap", model="openai/gpt-4.1"),
236
- hopper=LLM(provider="minitap", model="openai/gpt-5-nano"),
281
+ outputter=LLMWithFallback(
282
+ provider="minitap",
283
+ model="openai/gpt-5-nano",
284
+ fallback=LLM(provider="minitap", model="openai/gpt-5-mini"),
285
+ ),
286
+ hopper=LLMWithFallback(
287
+ provider="minitap",
288
+ model="openai/gpt-5-nano",
289
+ fallback=LLM(provider="minitap", model="openai/gpt-5-mini"),
290
+ ),
237
291
  ),
238
292
  )
239
293
 
240
294
 
241
295
  def deep_merge_llm_config(default: LLMConfig, override: dict) -> LLMConfig:
242
- def _deep_merge_dict(base: dict, extra: dict):
296
+ def _deep_merge_dict(base: dict, extra: dict, path: str = ""):
243
297
  for key, value in extra.items():
244
- if isinstance(value, dict):
245
- _deep_merge_dict(base[key], value)
298
+ current_path = f"{path}.{key}" if path else key
299
+
300
+ if key not in base:
301
+ logger.warning(
302
+ f"Unsupported config key '{current_path}' found in override config. "
303
+ f"Ignoring this key."
304
+ )
305
+ continue
306
+
307
+ if isinstance(value, dict) and isinstance(base[key], dict):
308
+ _deep_merge_dict(base[key], value, current_path)
246
309
  else:
247
310
  base[key] = value
248
311