minitap-mobile-use 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. minitap/mobile_use/__init__.py +0 -0
  2. minitap/mobile_use/agents/contextor/contextor.md +55 -0
  3. minitap/mobile_use/agents/contextor/contextor.py +175 -0
  4. minitap/mobile_use/agents/contextor/types.py +36 -0
  5. minitap/mobile_use/agents/cortex/cortex.md +135 -0
  6. minitap/mobile_use/agents/cortex/cortex.py +152 -0
  7. minitap/mobile_use/agents/cortex/types.py +15 -0
  8. minitap/mobile_use/agents/executor/executor.md +42 -0
  9. minitap/mobile_use/agents/executor/executor.py +87 -0
  10. minitap/mobile_use/agents/executor/tool_node.py +152 -0
  11. minitap/mobile_use/agents/hopper/hopper.md +15 -0
  12. minitap/mobile_use/agents/hopper/hopper.py +44 -0
  13. minitap/mobile_use/agents/orchestrator/human.md +12 -0
  14. minitap/mobile_use/agents/orchestrator/orchestrator.md +21 -0
  15. minitap/mobile_use/agents/orchestrator/orchestrator.py +134 -0
  16. minitap/mobile_use/agents/orchestrator/types.py +11 -0
  17. minitap/mobile_use/agents/outputter/human.md +25 -0
  18. minitap/mobile_use/agents/outputter/outputter.py +85 -0
  19. minitap/mobile_use/agents/outputter/test_outputter.py +167 -0
  20. minitap/mobile_use/agents/planner/human.md +14 -0
  21. minitap/mobile_use/agents/planner/planner.md +126 -0
  22. minitap/mobile_use/agents/planner/planner.py +101 -0
  23. minitap/mobile_use/agents/planner/types.py +51 -0
  24. minitap/mobile_use/agents/planner/utils.py +70 -0
  25. minitap/mobile_use/agents/summarizer/summarizer.py +35 -0
  26. minitap/mobile_use/agents/video_analyzer/__init__.py +5 -0
  27. minitap/mobile_use/agents/video_analyzer/human.md +5 -0
  28. minitap/mobile_use/agents/video_analyzer/video_analyzer.md +37 -0
  29. minitap/mobile_use/agents/video_analyzer/video_analyzer.py +111 -0
  30. minitap/mobile_use/clients/browserstack_client.py +477 -0
  31. minitap/mobile_use/clients/idb_client.py +429 -0
  32. minitap/mobile_use/clients/ios_client.py +332 -0
  33. minitap/mobile_use/clients/ios_client_config.py +141 -0
  34. minitap/mobile_use/clients/ui_automator_client.py +330 -0
  35. minitap/mobile_use/clients/wda_client.py +526 -0
  36. minitap/mobile_use/clients/wda_lifecycle.py +367 -0
  37. minitap/mobile_use/config.py +413 -0
  38. minitap/mobile_use/constants.py +3 -0
  39. minitap/mobile_use/context.py +106 -0
  40. minitap/mobile_use/controllers/__init__.py +0 -0
  41. minitap/mobile_use/controllers/android_controller.py +524 -0
  42. minitap/mobile_use/controllers/controller_factory.py +46 -0
  43. minitap/mobile_use/controllers/device_controller.py +182 -0
  44. minitap/mobile_use/controllers/ios_controller.py +436 -0
  45. minitap/mobile_use/controllers/platform_specific_commands_controller.py +199 -0
  46. minitap/mobile_use/controllers/types.py +106 -0
  47. minitap/mobile_use/controllers/unified_controller.py +193 -0
  48. minitap/mobile_use/graph/graph.py +160 -0
  49. minitap/mobile_use/graph/state.py +115 -0
  50. minitap/mobile_use/main.py +309 -0
  51. minitap/mobile_use/sdk/__init__.py +12 -0
  52. minitap/mobile_use/sdk/agent.py +1294 -0
  53. minitap/mobile_use/sdk/builders/__init__.py +10 -0
  54. minitap/mobile_use/sdk/builders/agent_config_builder.py +307 -0
  55. minitap/mobile_use/sdk/builders/index.py +15 -0
  56. minitap/mobile_use/sdk/builders/task_request_builder.py +236 -0
  57. minitap/mobile_use/sdk/constants.py +1 -0
  58. minitap/mobile_use/sdk/examples/README.md +83 -0
  59. minitap/mobile_use/sdk/examples/__init__.py +1 -0
  60. minitap/mobile_use/sdk/examples/app_lock_messaging.py +54 -0
  61. minitap/mobile_use/sdk/examples/platform_manual_task_example.py +67 -0
  62. minitap/mobile_use/sdk/examples/platform_minimal_example.py +48 -0
  63. minitap/mobile_use/sdk/examples/simple_photo_organizer.py +76 -0
  64. minitap/mobile_use/sdk/examples/smart_notification_assistant.py +225 -0
  65. minitap/mobile_use/sdk/examples/video_transcription_example.py +117 -0
  66. minitap/mobile_use/sdk/services/cloud_mobile.py +656 -0
  67. minitap/mobile_use/sdk/services/platform.py +434 -0
  68. minitap/mobile_use/sdk/types/__init__.py +51 -0
  69. minitap/mobile_use/sdk/types/agent.py +84 -0
  70. minitap/mobile_use/sdk/types/exceptions.py +138 -0
  71. minitap/mobile_use/sdk/types/platform.py +183 -0
  72. minitap/mobile_use/sdk/types/task.py +269 -0
  73. minitap/mobile_use/sdk/utils.py +29 -0
  74. minitap/mobile_use/services/accessibility.py +100 -0
  75. minitap/mobile_use/services/llm.py +247 -0
  76. minitap/mobile_use/services/telemetry.py +421 -0
  77. minitap/mobile_use/tools/index.py +67 -0
  78. minitap/mobile_use/tools/mobile/back.py +52 -0
  79. minitap/mobile_use/tools/mobile/erase_one_char.py +56 -0
  80. minitap/mobile_use/tools/mobile/focus_and_clear_text.py +317 -0
  81. minitap/mobile_use/tools/mobile/focus_and_input_text.py +153 -0
  82. minitap/mobile_use/tools/mobile/launch_app.py +86 -0
  83. minitap/mobile_use/tools/mobile/long_press_on.py +169 -0
  84. minitap/mobile_use/tools/mobile/open_link.py +62 -0
  85. minitap/mobile_use/tools/mobile/press_key.py +83 -0
  86. minitap/mobile_use/tools/mobile/stop_app.py +62 -0
  87. minitap/mobile_use/tools/mobile/swipe.py +156 -0
  88. minitap/mobile_use/tools/mobile/tap.py +154 -0
  89. minitap/mobile_use/tools/mobile/video_recording.py +177 -0
  90. minitap/mobile_use/tools/mobile/wait_for_delay.py +81 -0
  91. minitap/mobile_use/tools/scratchpad.py +147 -0
  92. minitap/mobile_use/tools/test_utils.py +413 -0
  93. minitap/mobile_use/tools/tool_wrapper.py +16 -0
  94. minitap/mobile_use/tools/types.py +35 -0
  95. minitap/mobile_use/tools/utils.py +336 -0
  96. minitap/mobile_use/utils/app_launch_utils.py +173 -0
  97. minitap/mobile_use/utils/cli_helpers.py +37 -0
  98. minitap/mobile_use/utils/cli_selection.py +143 -0
  99. minitap/mobile_use/utils/conversations.py +31 -0
  100. minitap/mobile_use/utils/decorators.py +124 -0
  101. minitap/mobile_use/utils/errors.py +6 -0
  102. minitap/mobile_use/utils/file.py +13 -0
  103. minitap/mobile_use/utils/logger.py +183 -0
  104. minitap/mobile_use/utils/media.py +186 -0
  105. minitap/mobile_use/utils/recorder.py +52 -0
  106. minitap/mobile_use/utils/requests_utils.py +37 -0
  107. minitap/mobile_use/utils/shell_utils.py +20 -0
  108. minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
  109. minitap/mobile_use/utils/time.py +6 -0
  110. minitap/mobile_use/utils/ui_hierarchy.py +132 -0
  111. minitap/mobile_use/utils/video.py +281 -0
  112. minitap_mobile_use-3.3.0.dist-info/METADATA +329 -0
  113. minitap_mobile_use-3.3.0.dist-info/RECORD +115 -0
  114. minitap_mobile_use-3.3.0.dist-info/WHEEL +4 -0
  115. minitap_mobile_use-3.3.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,167 @@
1
+ import sys
2
+ from unittest.mock import AsyncMock, Mock, patch
3
+
4
+ import pytest
5
+ from pydantic import BaseModel
6
+
7
# Register Mock stand-ins for these modules before the project imports below run.
# NOTE(review): presumably this keeps the test importable without the optional
# LLM provider packages installed -- confirm.
sys.modules.update(
    {
        stubbed_name: Mock()
        for stubbed_name in (
            "langgraph.prebuilt.chat_agent_executor",
            "minitap.mobile_use.graph.state",
            "langchain_google_vertexai",
            "langchain_google_genai",
            "langchain_openai",
            "langchain_cerebras",
        )
    }
)
13
+
14
+ from minitap.mobile_use.agents.outputter.outputter import outputter # noqa: E402
15
+ from minitap.mobile_use.config import LLM, OutputConfig # noqa: E402
16
+ from minitap.mobile_use.context import MobileUseContext # noqa: E402
17
+ from minitap.mobile_use.utils.logger import get_logger # noqa: E402
18
+
19
+ logger = get_logger(__name__)
20
+
21
+
22
# Pydantic schema handed to OutputConfig as the structured-output target in the
# Pydantic-model test below. (A comment is used instead of a docstring so the
# model's generated schema is left untouched.)
class MockPydanticSchema(BaseModel):
    color: str
    price: float
    currency_symbol: str
    website_url: str
27
+
28
+
29
# Plain-dict value passed as ``structured_output`` in the dictionary test.
mock_dict = dict(
    color="green",
    price=20,
    currency_symbol="$",
    website_url="http://superwebsite.fr",
)
35
+
36
+
37
class DummyState:
    """Lightweight object passed to the outputter as ``graph_output`` in these tests.

    It only stores the three values it is constructed with.
    """

    def __init__(self, messages, initial_goal, agents_thoughts):
        # Tuple assignment binds left to right, so attribute order is unchanged.
        self.messages, self.initial_goal, self.agents_thoughts = (
            messages,
            initial_goal,
            agents_thoughts,
        )
42
+
43
+
44
# Module-level sample state: empty message list, a goal, and four agent thoughts.
mocked_state = DummyState(
    [],
    "Find a green product on my website",
    [
        "Going on http://superwebsite.fr",
        "Searching for products",
        "Filtering by color",
        "Color 'green' found for a 20 dollars product",
    ],
)
54
+
55
+
56
@pytest.fixture
def mock_context():
    """Return a ``MobileUseContext``-spec'd mock with an LLM config and a mock device."""
    context = Mock(spec=MobileUseContext)
    context.device = Mock()
    # One separate LLM instance per agent name, all using the same provider/model.
    context.llm_config = {
        agent_name: LLM(provider="openai", model="gpt-5-nano")
        for agent_name in ("executor", "cortex", "planner", "orchestrator")
    }
    return context
68
+
69
+
70
@pytest.fixture
def mock_state():
    """Return a new ``DummyState`` populated with a sample goal and agent thoughts."""
    sample_thoughts = [
        "Going on http://superwebsite.fr",
        "Searching for products",
        "Filtering by color",
        "Color 'green' found for a 20 dollars product",
    ]
    return DummyState(
        messages=[],
        initial_goal="Find a green product on my website",
        agents_thoughts=sample_thoughts,
    )
83
+
84
+
85
@patch("minitap.mobile_use.agents.outputter.outputter.get_llm")
@pytest.mark.asyncio
async def test_outputter_with_pydantic_model(mock_get_llm, mock_context, mock_state):
    """Outputter yields a dict when ``structured_output`` is a Pydantic model class."""
    # Value the structured LLM will hand back when awaited.
    llm_answer = MockPydanticSchema(
        color="green", price=20, currency_symbol="$", website_url="http://superwebsite.fr"
    )
    structured_llm = AsyncMock()
    structured_llm.ainvoke.return_value = llm_answer

    # Base LLM returned by the patched get_llm; it exposes the structured variant.
    base_llm = Mock()
    base_llm.with_structured_output.return_value = structured_llm
    mock_get_llm.return_value = base_llm

    output_config = OutputConfig(
        structured_output=MockPydanticSchema,
        output_description=None,
    )

    result = await outputter(
        ctx=mock_context, output_config=output_config, graph_output=mock_state
    )

    assert isinstance(result, dict)
    assert result.get("color") == "green"
110
+
111
@patch("minitap.mobile_use.agents.outputter.outputter.get_llm")
@pytest.mark.asyncio
async def test_outputter_with_dict(mock_get_llm, mock_context, mock_state):
    """Outputter yields the expected dict when ``structured_output`` is a plain dict."""
    llm_answer = {
        "color": "green",
        "price": 20,
        "currency_symbol": "$",
        "website_url": "http://superwebsite.fr",
    }

    # Structured LLM whose awaited ainvoke() resolves to the dict above.
    structured_llm = AsyncMock()
    structured_llm.ainvoke.return_value = llm_answer

    # Base LLM returned by the patched get_llm.
    base_llm = Mock()
    base_llm.with_structured_output.return_value = structured_llm
    mock_get_llm.return_value = base_llm

    output_config = OutputConfig(
        structured_output=mock_dict,
        output_description=None,
    )

    result = await outputter(
        ctx=mock_context, output_config=output_config, graph_output=mock_state
    )

    assert isinstance(result, dict)
    assert result.get("color") == "green"
    assert result.get("price") == 20
    assert result.get("currency_symbol") == "$"
    assert result.get("website_url") == "http://superwebsite.fr"
142
+
143
+
144
@patch("minitap.mobile_use.agents.outputter.outputter.get_llm")
@pytest.mark.asyncio
async def test_outputter_with_natural_language_output(mock_get_llm, mock_context, mock_state):
    """Outputter yields a dict when only a natural-language output description is given."""
    raw_json_reply = '{"color": "green", "price": 20, "currency_symbol": "$", "website_url": "http://superwebsite.fr"}'

    # No structured output here: the LLM itself is awaited and answers with a
    # message-like object whose ``content`` is the JSON text.
    plain_llm = AsyncMock()
    plain_llm.ainvoke.return_value = Mock(content=raw_json_reply)
    mock_get_llm.return_value = plain_llm

    description = "A JSON object with a color, a price, a currency_symbol and a website_url key"
    output_config = OutputConfig(
        structured_output=None,
        output_description=description,
    )

    result = await outputter(
        ctx=mock_context, output_config=output_config, graph_output=mock_state
    )

    assert isinstance(result, dict)
    assert result.get("color") == "green"
    assert result.get("price") == 20
    assert result.get("currency_symbol") == "$"
    assert result.get("website_url") == "http://superwebsite.fr"
@@ -0,0 +1,14 @@
1
+ Here is your input.
2
+
3
+ ---
4
+
5
+ **Action (plan or replan)**: {{ action }}
6
+
7
+ **Initial Goal**: {{ initial_goal }}
8
+
9
+ {% if action == "replan" %}
10
+ Relevant only if action is replan:
11
+
12
+ **Previous Plan**: {{ previous_plan }}
13
+ **Agent Thoughts**: {{ agent_thoughts }}
14
+ {% endif %}
@@ -0,0 +1,126 @@
1
+ ## You are the **Planner**
2
+
3
+ Break down user goals into **sequential subgoals** for {{ platform }} mobile execution.
4
+
5
+ ---
6
+
7
+ ## 🚨 Critical Rules
8
+
9
+ {% if current_foreground_app %}
10
+ ### App Already Open: `{{ current_foreground_app }}`
11
+ **NEVER** create "Open {{ current_foreground_app }}" subgoal. Start with first action INSIDE the app.
12
+ {% endif %}
13
+ {% if locked_app_package %}
14
+ ### App Lock: `{{ locked_app_package }}`
15
+ All actions must stay within this app (except OAuth flows).
16
+ {% endif %}
17
+
18
+ ---
19
+
20
+ ## Planning Guidelines
21
+
22
+ **Subgoals should be:**
23
+ - **Purpose-driven**: "Open conversation with Alice to send message" not just "Tap chat"
24
+ - **Sequential**: Each step prepares the next
25
+ - **Not too granular**: High-level milestones, not button-by-button
26
+ - **No loops**: Instead of "repeat 3 times", write 3 separate subgoals
27
+ - **Self-Correcting**: If the goal has specific formatting constraints (e.g., "add a new line", "sorted list"), include a final subgoal to **verify the result and fix it** if necessary.
28
+
29
+ **Shortcuts**: Always prefer `launch_app` to open apps (instead of navigating the app drawer manually) and `open_link` to open URLs.
30
+ {% if video_recording_enabled %}
31
+ ### 🎥 Video Recording Pattern
32
+
33
+ When a goal involves capturing, transcribing, or analyzing video content, follow this execution order:
34
+
35
+ 1. **Start recording BEFORE interacting with the video** — The recording must be active before any click that triggers playback
36
+ 2. **Perform video interactions** — Open/play the video and wait for it to complete
37
+ 3. **Stop recording AFTER the video ends** — Only stop once playback is fully finished
38
+
39
+ ⚠️ **Critical:** Each recording action (start/stop) must be its own dedicated subgoal. This prevents truncation due to short videos or inference latency.
40
+
41
+ **Example subgoals for "transcribe video X":**
42
+ 1. Navigate to video X location
43
+ 2. Start screen recording
44
+ 3. Play video X and wait for completion
45
+ 4. Stop recording and extract transcription
46
+ {% endif %}
47
+
48
+ Available tools: {{ executor_tools_list }}
49
+
50
+ ---
51
+
52
+ ## Replanning
53
+
54
+ When revising a failed plan:
55
+ 1. **Keep completed subgoals** - don't restart from scratch
56
+ 2. **Use agent thoughts** as source of truth for what happened
57
+ 3. **Pivot strategy** based on observations (e.g., use search if scrolling failed)
58
+ 4. **Continue from current state**, not from beginning
59
+
60
+ ---
61
+
62
+ ## Output Format
63
+
64
+ ```json
65
+ {
66
+ "subgoals": [
67
+ {"description": "First subgoal description"},
68
+ {"description": "Second subgoal description"}
69
+ ]
70
+ }
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Examples
76
+
77
+ **Goal:** "Send 'I'm running late' to Alice on WhatsApp"
78
+
79
+ ❌ **Bad subgoals (overlapping/vague):**
80
+ ```
81
+ - Open WhatsApp to find Alice ← What does "find" mean?
82
+ - Open conversation with Alice ← Might already be done if "find" included tapping
83
+ ```
84
+
85
+ ✅ **Good subgoals (atomic, non-overlapping):**
86
+ ```
87
+ - Open WhatsApp
88
+ - Navigate to Alice's conversation
89
+ - Send the message "I'm running late"
90
+ ```
91
+
92
+ **Key principle:** Each subgoal = one clear checkpoint. The Cortex decides HOW, the Planner defines WHAT milestone to reach.
93
+
94
+ ---
95
+
96
+ **Replanning after failure:**
97
+ ```
98
+ Original: "Navigate to Alice's conversation" (FAILED)
99
+ Agent thoughts: Alice not in visible chats, search bar available
100
+
101
+ New plan:
102
+ - Search for "Alice" using search bar
103
+ - Open conversation from search results
104
+ - Send message
105
+ ```
106
+
107
+ **Cross-app data transfer (using note tools):**
108
+ ```
109
+ Goal: "Copy the recipe ingredients from RecipeApp and add them to my shopping list in ShoppingApp"
110
+
111
+ ✅ Correct subgoals:
112
+ - Open RecipeApp and navigate to the recipe
113
+ - Save the ingredients list using the `save_note` tool
114
+ - Open ShoppingApp
115
+ - Read the saved note using the `read_note` tool and add items to shopping list
116
+ ```
117
+ {% if current_foreground_app %}
118
+
119
+ **Foreground app already open (`{{ current_foreground_app }}`):**
120
+ ```
121
+ Goal: "Send message to Bob"
122
+
123
+ ✅ Correct: Navigate to Bob's chat → Send message
124
+ ❌ Wrong: Open WhatsApp → ... (app already open!)
125
+ ```
126
+ {% endif %}
@@ -0,0 +1,101 @@
1
+ from pathlib import Path
2
+
3
+ from jinja2 import Template
4
+ from langchain_core.messages import HumanMessage, SystemMessage
5
+
6
+ from minitap.mobile_use.agents.planner.types import PlannerOutput, Subgoal, SubgoalStatus
7
+ from minitap.mobile_use.agents.planner.utils import generate_id, one_of_them_is_failure
8
+ from minitap.mobile_use.context import MobileUseContext
9
+ from minitap.mobile_use.controllers.platform_specific_commands_controller import (
10
+ get_current_foreground_package,
11
+ )
12
+ from minitap.mobile_use.graph.state import State
13
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
14
+ from minitap.mobile_use.tools.index import (
15
+ EXECUTOR_WRAPPERS_TOOLS,
16
+ VIDEO_RECORDING_WRAPPERS,
17
+ format_tools_list,
18
+ )
19
+ from minitap.mobile_use.utils.decorators import wrap_with_callbacks
20
+ from minitap.mobile_use.utils.logger import get_logger
21
+
22
+ logger = get_logger(__name__)
23
+
24
+
25
class PlannerNode:
    """Graph node that asks the "planner" LLM for a list of subgoals.

    The node renders ``planner.md`` (system prompt) and ``human.md`` (human
    prompt) located next to this module, invokes the LLM with a fallback, and
    returns a state update carrying the freshly generated ``subgoal_plan``.
    """

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Planner Agent..."),
        on_success=lambda _: logger.success("Planner Agent"),
        on_failure=lambda _: logger.error("Planner Agent"),
    )
    async def __call__(self, state: State):
        """Generate a plan (or a replan) and return the sanitized state update.

        A replan is requested when any subgoal of ``state.subgoal_plan`` has
        failed. Every returned subgoal gets a new generated id and starts in
        ``NOT_STARTED``. If ``ctx.on_plan_changes`` is set it is awaited with
        the new plan and the replan flag before the update is returned.
        """
        ctx = self.ctx
        prompts_dir = Path(__file__).parent

        is_replan = one_of_them_is_failure(state.subgoal_plan)

        locked_app = ctx.execution_setup.get_locked_app_package() if ctx.execution_setup else None
        foreground_app = get_current_foreground_package(ctx)

        # Copy the base tool wrappers so the shared sequence is never mutated.
        tool_wrappers = list(EXECUTOR_WRAPPERS_TOOLS)
        if ctx.video_recording_enabled:
            tool_wrappers.extend(VIDEO_RECORDING_WRAPPERS)

        system_template = Template(prompts_dir.joinpath("planner.md").read_text(encoding="utf-8"))
        system_prompt = system_template.render(
            platform=ctx.device.mobile_platform.value,
            executor_tools_list=format_tools_list(ctx=ctx, wrappers=tool_wrappers),
            locked_app_package=locked_app,
            current_foreground_app=foreground_app,
            video_recording_enabled=ctx.video_recording_enabled,
        )

        human_template = Template(prompts_dir.joinpath("human.md").read_text(encoding="utf-8"))
        human_prompt = human_template.render(
            action="replan" if is_replan else "plan",
            initial_goal=state.initial_goal,
            previous_plan="\n".join(str(s) for s in state.subgoal_plan),
            agent_thoughts="\n".join(state.agents_thoughts),
        )

        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=human_prompt),
        ]

        # Both the main and the fallback model are bound to the PlannerOutput schema.
        primary_llm = get_llm(ctx=ctx, name="planner").with_structured_output(PlannerOutput)
        fallback_llm = get_llm(ctx=ctx, name="planner", use_fallback=True).with_structured_output(
            PlannerOutput
        )
        response: PlannerOutput = await with_fallback(
            main_call=lambda: invoke_llm_with_timeout_message(
                primary_llm.ainvoke(messages),
            ),
            fallback_call=lambda: invoke_llm_with_timeout_message(
                fallback_llm.ainvoke(messages),
            ),
        )  # type: ignore

        new_plan = [
            Subgoal(
                id=generate_id(),
                description=planned.description,
                status=SubgoalStatus.NOT_STARTED,
                completion_reason=None,
            )
            for planned in response.subgoals
        ]
        logger.info("📜 Generated plan:")
        logger.info("\n".join(str(s) for s in new_plan))

        if ctx.on_plan_changes:
            await ctx.on_plan_changes(new_plan, is_replan)

        return await state.asanitize_update(
            ctx=ctx,
            update={
                "subgoal_plan": new_plan,
            },
            agent="planner",
        )
@@ -0,0 +1,51 @@
1
+ from datetime import datetime
2
+ from enum import Enum
3
+ from typing import Annotated
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
# One subgoal entry of the planner's structured output; it carries only a
# description. (Comment rather than docstring so the model's generated schema
# is left untouched.)
class PlannerSubgoalOutput(BaseModel):
    description: str
10
+
11
+
12
# Top-level structured output of the planner: an ordered list of subgoal
# entries. (Comment rather than docstring so the model's generated schema is
# left untouched.)
class PlannerOutput(BaseModel):
    subgoals: list[PlannerSubgoalOutput]
14
+
15
+
16
# Status values a subgoal can take; each member's value is its own name.
class SubgoalStatus(Enum):
    NOT_STARTED = "NOT_STARTED"  # not picked up yet
    PENDING = "PENDING"  # the subgoal currently in progress
    SUCCESS = "SUCCESS"
    FAILURE = "FAILURE"
21
+
22
+
23
# A single step of the plan together with its status and timing information.
# (Comment rather than docstring so the model's generated schema is untouched.)
class Subgoal(BaseModel):
    id: Annotated[str, "Unique identifier of the subgoal"]
    description: Annotated[str, "Description of the subgoal"]
    completion_reason: Annotated[
        str | None, "Reason why the subgoal was completed (failure or success)"
    ] = None
    status: SubgoalStatus
    started_at: Annotated[datetime | None, "When the subgoal started"] = None
    ended_at: Annotated[datetime | None, "When the subgoal ended"] = None

    def __str__(self):
        """Render the subgoal as a one-line bullet: id, description, status marker.

        The completion reason is appended only when it is set (truthy).
        """
        markers = {
            SubgoalStatus.SUCCESS: "✅",
            SubgoalStatus.FAILURE: "❌",
            SubgoalStatus.PENDING: "⏳",
            SubgoalStatus.NOT_STARTED: "(not started yet)",
        }
        # Any status outside the table falls back to the question-mark marker.
        status_emoji = markers.get(self.status, "❓")

        pieces = [f"- [ID:{self.id}]: {self.description} : {status_emoji}."]
        if self.completion_reason:
            pieces.append(f" Completion reason: {self.completion_reason}")
        return "".join(pieces)

    def __repr__(self):
        """Use the same human-readable rendering as ``__str__``."""
        return self.__str__()
@@ -0,0 +1,70 @@
1
+ import random
2
+ import string
3
+
4
+ from minitap.mobile_use.agents.planner.types import Subgoal, SubgoalStatus
5
+ from datetime import datetime, UTC
6
+
7
+
8
def get_current_subgoal(subgoals: list[Subgoal]) -> Subgoal | None:
    """Return the first subgoal whose status is PENDING, or ``None`` if there is none."""
    for candidate in subgoals:
        if candidate.status == SubgoalStatus.PENDING:
            return candidate
    return None
10
+
11
+
12
def get_subgoals_by_ids(subgoals: list[Subgoal], ids: list[str]) -> list[Subgoal]:
    """Return the subgoals whose ``id`` appears in ``ids``, in their original order.

    Args:
        subgoals: Subgoals to filter.
        ids: Identifiers to keep; duplicates and unknown ids are harmless.

    Returns:
        A new list with the matching subgoals (empty if nothing matches).
    """
    # Build the lookup once: constant-time membership per subgoal, and ``ids``
    # is consumed a single time even if a one-shot iterable is passed.
    wanted = set(ids)
    return [subgoal for subgoal in subgoals if subgoal.id in wanted]
14
+
15
+
16
def get_next_subgoal(subgoals: list[Subgoal]) -> Subgoal | None:
    """Return the first subgoal still in NOT_STARTED, or ``None`` if there is none."""
    for candidate in subgoals:
        if candidate.status == SubgoalStatus.NOT_STARTED:
            return candidate
    return None
18
+
19
+
20
def nothing_started(subgoals: list[Subgoal]) -> bool:
    """Tell whether every subgoal is still NOT_STARTED (``True`` for an empty list)."""
    for subgoal in subgoals:
        if not subgoal.status == SubgoalStatus.NOT_STARTED:
            return False
    return True
22
+
23
+
24
def complete_current_subgoal(subgoals: list[Subgoal]) -> list[Subgoal]:
    """Mark the current (PENDING) subgoal as SUCCESS and stamp its end time in UTC.

    The subgoal is modified in place; the same list is returned. When no
    subgoal is pending the list is returned unchanged.
    """
    pending = get_current_subgoal(subgoals)
    if pending:
        pending.status = SubgoalStatus.SUCCESS
        pending.ended_at = datetime.now(UTC)
    return subgoals
31
+
32
+
33
+ def complete_subgoals_by_ids(subgoals: list[Subgoal], ids: list[str]) -> list[Subgoal]:
34
+ for subgoal in subgoals:
35
+ if subgoal.id in ids:
36
+ subgoal.status = SubgoalStatus.SUCCESS
37
+ subgoal.ended_at = datetime.now(UTC)
38
+ return subgoals
39
+
40
+
41
def fail_current_subgoal(subgoals: list[Subgoal]) -> list[Subgoal]:
    """Mark the current (PENDING) subgoal as FAILURE and stamp its end time in UTC.

    The subgoal is modified in place; the same list is returned. When no
    subgoal is pending the list is returned unchanged.
    """
    pending = get_current_subgoal(subgoals)
    if pending:
        pending.status = SubgoalStatus.FAILURE
        pending.ended_at = datetime.now(UTC)
    return subgoals
48
+
49
+
50
def all_completed(subgoals: list[Subgoal]) -> bool:
    """Tell whether every subgoal has status SUCCESS (``True`` for an empty list)."""
    for subgoal in subgoals:
        if not subgoal.status == SubgoalStatus.SUCCESS:
            return False
    return True
52
+
53
+
54
def one_of_them_is_failure(subgoals: list[Subgoal]) -> bool:
    """Tell whether at least one subgoal has status FAILURE (``False`` for an empty list)."""
    for subgoal in subgoals:
        if subgoal.status == SubgoalStatus.FAILURE:
            return True
    return False
56
+
57
+
58
def start_next_subgoal(subgoals: list[Subgoal]) -> list[Subgoal]:
    """Move the first NOT_STARTED subgoal to PENDING and stamp its start time in UTC.

    The subgoal is modified in place; the same list is returned. When nothing
    is left to start the list is returned unchanged.
    """
    upcoming = get_next_subgoal(subgoals)
    if upcoming:
        upcoming.status = SubgoalStatus.PENDING
        upcoming.started_at = datetime.now(UTC)
    return subgoals
65
+
66
+
67
def generate_id(length: int = 6) -> str:
    """Return a random ID made of ``length`` lowercase ASCII letters and digits.

    Characters are drawn independently with ``random.choice``; uniqueness is
    not checked, so collisions are possible (unlikely for short-lived plans).
    """
    alphabet = string.ascii_lowercase + string.digits
    # One random.choice call per character, in order.
    return "".join(map(random.choice, [alphabet] * length))
@@ -0,0 +1,35 @@
1
+ from langchain_core.messages import (
2
+ HumanMessage,
3
+ RemoveMessage,
4
+ ToolMessage,
5
+ )
6
+
7
+ from minitap.mobile_use.constants import MAX_MESSAGES_IN_HISTORY
8
+ from minitap.mobile_use.context import MobileUseContext
9
+ from minitap.mobile_use.graph.state import State
10
+
11
+
12
class SummarizerNode:
    """Graph node that trims old messages once the history exceeds MAX_MESSAGES_IN_HISTORY."""

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    async def __call__(self, state: State):
        """Return a state update that removes the oldest surplus messages.

        Nothing is done (an empty dict is returned) while the history holds at
        most ``MAX_MESSAGES_IN_HISTORY`` messages. Otherwise the surplus
        messages at the start of the history are scanned from newest to oldest;
        removal begins at the first ``ToolMessage`` or ``HumanMessage`` found
        and covers it plus every older message that has an id.
        """
        surplus = len(state.messages) - MAX_MESSAGES_IN_HISTORY
        if surplus <= 0:
            return {}

        removals = []
        removing = False
        for message in reversed(state.messages[:surplus]):
            # Once a tool/human message has been seen, the flag stays set.
            removing = removing or isinstance(message, (ToolMessage, HumanMessage))
            if removing and message.id:
                removals.append(RemoveMessage(id=message.id))

        return await state.asanitize_update(
            ctx=self.ctx,
            update={
                "messages": removals,
            },
        )
@@ -0,0 +1,5 @@
1
+ """Video analyzer utility for analyzing video content with Gemini models."""
2
+
3
+ from minitap.mobile_use.agents.video_analyzer.video_analyzer import analyze_video
4
+
5
+ __all__ = ["analyze_video"]
@@ -0,0 +1,5 @@
1
+ Please analyze the following video recording and respond to my request.
2
+
3
+ ---
4
+
5
+ **My Request**: {{ prompt }}
@@ -0,0 +1,37 @@
1
+ ## You are a **Video Analysis Assistant**
2
+
3
+ You analyze video recordings of mobile device screens and provide accurate, detailed responses based on what you observe.
4
+
5
+ ---
6
+
7
+ ## Your Focus Areas
8
+
9
+ When analyzing videos, pay attention to:
10
+
11
+ - **UI elements** and their states (buttons, text fields, toggles, etc.)
12
+ - **Text content** displayed on screen
13
+ - **Actions that occur** (taps, scrolls, transitions, animations)
14
+ - **Notifications or dialogs** that appear
15
+ - **Changes in the interface** over time
16
+ - **Audio content** if present (transcribe speech, describe sounds)
17
+
18
+ ---
19
+
20
+ ## Guidelines
21
+
22
+ - **Be precise and factual** - Only describe what you can actually see or hear
23
+ - **Note uncertainty** - If you cannot clearly see or determine something, say so
24
+ - **Be thorough** - Capture all relevant details that relate to the user's question
25
+ - **Use timestamps** when describing sequences of events (e.g., "At 0:05, the user taps...")
26
+ - **Structure your response** clearly when there's a lot of information
27
+
28
+ ---
29
+
30
+ ## Response Format
31
+
32
+ Adapt your response format to the user's request:
33
+
34
+ - For **transcription requests**: Provide clean, readable text of what was spoken or displayed
35
+ - For **description requests**: Give a chronological narrative of events
36
+ - For **specific questions**: Answer directly and concisely
37
+ - For **extraction requests**: List items clearly (e.g., notifications, text content)