droidrun 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
droidrun/__init__.py CHANGED
@@ -6,17 +6,21 @@ __version__ = "0.3.0"
 
  # Import main classes for easier access
  from droidrun.agent.utils.llm_picker import load_llm
- from droidrun.adb.manager import DeviceManager
  from droidrun.tools import Tools, AdbTools, IOSTools
  from droidrun.agent.droid import DroidAgent
 
+ # Import macro functionality
+ from droidrun.macro import MacroPlayer, replay_macro_file, replay_macro_folder
+
 
  # Make main components available at package level
  __all__ = [
  "DroidAgent",
- "DeviceManager",
  "load_llm",
  "Tools",
  "AdbTools",
  "IOSTools",
+ "MacroPlayer",
+ "replay_macro_file",
+ "replay_macro_folder",
  ]
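
The package root now exports the macro replay helpers (plus `MacroPlayer`) in place of the removed `DeviceManager`. A minimal usage sketch, assuming the helpers take a path to a recorded macro; the call signatures and whether they must be awaited are not shown in this diff:

```python
# Hypothetical usage of the new 0.3.3 exports; only the names are confirmed by this diff,
# the arguments (and whether the helpers are async) are assumptions.
from droidrun import replay_macro_file, replay_macro_folder

# Replay a single recorded macro (path is illustrative).
replay_macro_file("macros/login_flow.json")

# Replay every macro found in a folder.
replay_macro_folder("macros/")
```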
@@ -97,6 +97,7 @@ class CodeActAgent(Workflow):
  loop=asyncio.get_event_loop(),
  locals={},
  tools=self.tool_list,
+ tools_instance=tools_instance,
  globals={"__builtins__": __builtins__},
  )
 
@@ -169,7 +170,7 @@ class CodeActAgent(Workflow):
  "[yellow]DeepSeek doesnt support images. Disabling screenshots[/]"
  )
  elif self.vision == True and context == "screenshot":
- screenshot = (await self.tools.take_screenshot())[1]
+ screenshot = (self.tools.take_screenshot())[1]
  ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
 
  await ctx.set("screenshot", screenshot)
@@ -177,7 +178,7 @@ class CodeActAgent(Workflow):
 
  if context == "ui_state":
  try:
- state = await self.tools.get_state()
+ state = self.tools.get_state()
  await ctx.set("ui_state", state["a11y_tree"])
  chat_history = await chat_utils.add_ui_text_block(
  state["a11y_tree"], chat_history
@@ -189,7 +190,7 @@ class CodeActAgent(Workflow):
 
  if context == "packages":
  chat_history = await chat_utils.add_packages_block(
- await self.tools.list_packages(include_system_apps=True),
+ self.tools.list_packages(include_system_apps=True),
  chat_history,
  )
 
@@ -242,7 +243,7 @@ class CodeActAgent(Workflow):
  code = ev.code
  assert code, "Code cannot be empty."
  logger.info(f"⚡ Executing action...")
- logger.debug(f"Code to execute:\n```python\n{code}\n```")
+ logger.info(f"Code to execute:\n```python\n{code}\n```")
 
  try:
  self.code_exec_counter += 1
@@ -398,13 +399,13 @@ class CodeActAgent(Workflow):
  ui_state = None
 
  try:
- _, screenshot_bytes = await self.tools.take_screenshot()
+ _, screenshot_bytes = self.tools.take_screenshot()
  screenshot = screenshot_bytes
  except Exception as e:
  logger.warning(f"Failed to capture final screenshot: {e}")
 
  try:
- (a11y_tree, phone_state) = await self.tools.get_state()
+ (a11y_tree, phone_state) = self.tools.get_state()
  except Exception as e:
  logger.warning(f"Failed to capture final UI state: {e}")
 
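
The hunks above drop the `await` in front of `take_screenshot`, `get_state`, and `list_packages`, so the `Tools` methods are now called synchronously. A hedged sketch of the new call pattern from the caller's side; the return shapes are inferred only from the surrounding diff lines:

```python
# Inferred from the diff context: take_screenshot() returns a (format, bytes) pair and
# get_state() returns a dict with an "a11y_tree" key; both shapes are assumptions.
def capture_context(tools):
    _, screenshot_bytes = tools.take_screenshot()             # no await in 0.3.3
    state = tools.get_state()                                 # no await in 0.3.3
    packages = tools.list_packages(include_system_apps=True)  # no await in 0.3.3
    return screenshot_bytes, state["a11y_tree"], packages
```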
@@ -1,4 +1,47 @@
  from llama_index.core.workflow import Event
 
  class ScreenshotEvent(Event):
- screenshot: bytes
+ screenshot: bytes
+
+ class MacroEvent(Event):
+ """Base class for coordinate-based action events"""
+ action_type: str
+ description: str
+
+ class TapActionEvent(MacroEvent):
+ """Event for tap actions with coordinates"""
+ x: int
+ y: int
+ element_index: int = None
+ element_text: str = ""
+ element_bounds: str = ""
+
+ class SwipeActionEvent(MacroEvent):
+ """Event for swipe actions with coordinates"""
+ start_x: int
+ start_y: int
+ end_x: int
+ end_y: int
+ duration_ms: int
+
+ class DragActionEvent(MacroEvent):
+ """Event for drag actions with coordinates"""
+ start_x: int
+ start_y: int
+ end_x: int
+ end_y: int
+ duration_ms: int
+
+ class InputTextActionEvent(MacroEvent):
+ """Event for text input actions"""
+ text: str
+
+ class KeyPressActionEvent(MacroEvent):
+ """Event for key press actions"""
+ keycode: int
+ key_name: str = ""
+
+ class StartAppEvent(MacroEvent):
+ """"Event for starting an app"""
+ package: str
+ activity: str = None
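
The new classes above are plain `llama_index` workflow events that carry the coordinates of replayable actions. A small sketch of constructing and inspecting one, using only the fields declared above; how droidrun itself emits these events is not part of this hunk:

```python
# Sketch only: the field names come from the class definitions above, the dispatch
# helper is illustrative rather than droidrun's own code.
from droidrun.agent.common.events import MacroEvent, TapActionEvent, SwipeActionEvent

tap = TapActionEvent(
    action_type="tap",
    description="Tap the Wi-Fi entry",
    x=540,
    y=1200,
    element_index=3,
    element_text="Wi-Fi",
)

def describe(ev: MacroEvent) -> str:
    # Rebuild a human-readable step from a recorded action event.
    if isinstance(ev, TapActionEvent):
        return f"tap({ev.x}, {ev.y}) on {ev.element_text!r}"
    if isinstance(ev, SwipeActionEvent):
        return f"swipe ({ev.start_x}, {ev.start_y}) -> ({ev.end_x}, {ev.end_y}) in {ev.duration_ms} ms"
    return ev.description

print(describe(tap))
```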
@@ -1,9 +1,11 @@
  from .default import DEFAULT
  from .ui_expert import UI_EXPERT
  from .app_starter import APP_STARTER_EXPERT
+ from .big_agent import BIG_AGENT
 
  __all__ = [
  'DEFAULT',
  'UI_EXPERT',
  'APP_STARTER_EXPERT',
+ 'BIG_AGENT',
  ]
@@ -0,0 +1,96 @@
+ from droidrun.agent.context.agent_persona import AgentPersona
+ from droidrun.tools import Tools
+
+ BIG_AGENT = AgentPersona(
+ name="Big Agent",
+ description="Big Agent. Use this as your Big Agent",
+ expertise_areas=[
+ "UI navigation", "button interactions", "text input",
+ "menu navigation", "form filling", "scrolling", "app launching"
+ ],
+ allowed_tools=[
+ Tools.swipe.__name__,
+ Tools.input_text.__name__,
+ Tools.press_key.__name__,
+ Tools.drag.__name__,
+ Tools.tap_by_index.__name__,
+ Tools.start_app.__name__,
+ Tools.list_packages.__name__,
+ Tools.remember.__name__,
+ Tools.complete.__name__
+ ],
+ required_context=[
+ "ui_state",
+ "screenshot",
+ ],
+ user_prompt="""
+ **Current Request:**
+ {goal}
+ **Is the precondition met? What is your reasoning and the next step to address this request?**
+ Explain your thought process then provide code in ```python ... ``` tags if needed.
+ """"",
+
+ system_prompt="""
+ You are a helpful AI assistant that can write and execute Python code to solve problems.
+
+ You will be given a task to perform. You should output:
+ - Python code wrapped in ``` tags that provides the solution to the task, or a step towards the solution.
+ - If there is a precondition for the task, you MUST check if it is met.
+ - If a goal's precondition is unmet, fail the task by calling `complete(success=False, reason='...')` with an explanation.
+ - If you task is complete, you should use the complete(success:bool, reason:str) function within a code block to mark it as finished. The success parameter should be True if the task was completed successfully, and False otherwise. The reason parameter should be a string explaining the reason for failure if failed.
+
+
+ ## Context:
+ The following context is given to you for analysis:
+ - **ui_state**: A list of all currently visible UI elements with their indices. Use this to understand what interactive elements are available on the screen.
+ - **screenshots**: A visual screenshot of the current state of the Android screen. This provides visual context for what the user sees. screenshots won't be saved in the chat history. So, make sure to describe what you see and explain the key parts of your plan in your thoughts, as those will be saved and used to assist you in future steps.
+ - **phone_state**: The current app you are navigating in. This tells you which application context you're working within.
+ - **chat history**: You are also given the history of your actions (if any) from your previous steps.
+ - **execution result**: The result of your last Action
+ NOTE: you don't have access to these inputs in your tool calling context
+
+ ## Response Format:
+ Example of proper code format:
+ **Task Assignment:**
+ **Task:** "Precondition: Settings app is open. Goal: Navigate to Wi-Fi settings and connect to the network 'HomeNetwork'."
+
+ **(Step 1) Agent Analysis:** I can see the Settings app is open from the screenshot. This is a multi-step task that requires me to first navigate to Wi-Fi settings, then ensure Wi-Fi is enabled, and finally connect to 'HomeNetwork'. Let me start by finding and tapping on the Wi-Fi option in the settings menu. Looking at the UI elements, I can see "Wi-Fi" option at index 3.
+
+ **(Step 1) Agent Action:**
+ ```python
+ # First step: Navigate to Wi-Fi settings
+ tap_by_index(3)
+ ```
+
+ **(Step 2) Agent Analysis:** Good! I've successfully navigated to the Wi-Fi settings screen. Now I can see the Wi-Fi settings interface. I notice that Wi-Fi appears to be turned off based on the toggle switch at index 1. I need to turn it on before I can see available networks and connect to 'HomeNetwork'.
+
+ **(Step 2) Agent Action:**
+ ```python
+ # Second step: Turn on Wi-Fi to see available networks
+ tap_by_index(1)
+ ```
+
+ **(Step 3) Agent Analysis:** Excellent! Wi-Fi is now enabled and I can see a list of available networks appearing on the screen. I can see 'HomeNetwork' in the list at index 5. This is the final step - I need to tap on it to initiate the connection, which will complete my assigned task.
+
+ **(Step 3) Agent Action:**
+ ```python
+ # Final step: Connect to the target network
+ tap_by_index(5)
+ complete(success=True, reason="Successfully navigated to Wi-Fi settings and initiated connection to HomeNetwork")
+ ```
+ ```
+
+ ## Tools:
+ In addition to the Python Standard Library and any functions you have already written, you can use the following functions:
+ {tool_descriptions}
+
+
+ ## Final Answer Guidelines:
+ - When providing a final answer, focus on directly answering the user's question in the response format given
+ - Present the results clearly and concisely as if you computed them directly
+ - Structure your response like you're directly answering the user's query, not explaining how you solved it
+
+ Reminder: Always place your Python code between ```...``` tags when you want to run code.
+ """
+
+ )
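
`BIG_AGENT` is a plain `AgentPersona` value, so its fields can be inspected before handing it to an agent (the DroidAgent hunks below pass personas straight into `ContextInjectionManager`). A small sketch; the import path is an assumption, since this diff only shows `from .big_agent import BIG_AGENT` inside an unnamed prompts package:

```python
# The import path below is a guess; only the relative import is visible in this diff.
from droidrun.agent.context.personas import BIG_AGENT

print(BIG_AGENT.name)              # "Big Agent"
print(BIG_AGENT.allowed_tools)     # ['swipe', 'input_text', 'press_key', 'drag', ...]
print(BIG_AGENT.required_context)  # ['ui_state', 'screenshot']
```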
@@ -13,6 +13,7 @@ UI_EXPERT = AgentPersona(
  Tools.input_text.__name__,
  Tools.press_key.__name__,
  Tools.tap_by_index.__name__,
+ Tools.drag.__name__,
  Tools.remember.__name__,
  Tools.complete.__name__
  ],
@@ -16,7 +16,7 @@ from droidrun.agent.planner import PlannerAgent
  from droidrun.agent.context.task_manager import TaskManager
  from droidrun.agent.utils.trajectory import Trajectory
  from droidrun.tools import Tools, describe_tools
- from droidrun.agent.common.events import ScreenshotEvent
+ from droidrun.agent.common.events import ScreenshotEvent, MacroEvent
  from droidrun.agent.common.default import MockWorkflow
  from droidrun.agent.context import ContextInjectionManager
  from droidrun.agent.context.agent_persona import AgentPersona
@@ -69,6 +69,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
  enable_tracing: bool = False,
  debug: bool = False,
  save_trajectories: bool = False,
+ excluded_tools: List[str] = None,
  *args,
  **kwargs
  ):
@@ -87,6 +88,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
  debug: Whether to enable verbose debug logging
  **kwargs: Additional keyword arguments to pass to the agents
  """
+ self.user_id = kwargs.pop("user_id", None)
  super().__init__(timeout=timeout ,*args,**kwargs)
  # Configure default logging if not already configured
  self._configure_default_logging(debug=debug)
@@ -114,15 +116,17 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
  self.event_counter = 0
  self.save_trajectories = save_trajectories
 
- self.trajectory = Trajectory()
+ self.trajectory = Trajectory(goal=goal)
  self.task_manager = TaskManager()
  self.task_iter = None
+
+
  self.cim = ContextInjectionManager(personas=personas)
  self.current_episodic_memory = None
 
  logger.info("🤖 Initializing DroidAgent...")
 
- self.tool_list = describe_tools(tools)
+ self.tool_list = describe_tools(tools, excluded_tools)
  self.tools_instance = tools
 
 
@@ -162,7 +166,8 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
  enable_tracing=enable_tracing,
  debug=debug,
  save_trajectories=save_trajectories,
- )
+ ),
+ self.user_id
  )
 
 
@@ -369,7 +374,8 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
  success=ev.success,
  output=ev.output,
  steps=ev.steps,
- )
+ ),
+ self.user_id
  )
  flush()
 
@@ -391,13 +397,16 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
  if isinstance(ev, EpisodicMemoryEvent):
  self.current_episodic_memory = ev.episodic_memory
  return
+
+
 
  if not isinstance(ev, StopEvent):
  ctx.write_event_to_stream(ev)
 
  if isinstance(ev, ScreenshotEvent):
  self.trajectory.screenshots.append(ev.screenshot)
-
+ elif isinstance(ev, MacroEvent):
+ self.trajectory.macro.append(ev)
  else:
  self.trajectory.events.append(ev)
 
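
Taken together, the DroidAgent hunks add an `excluded_tools` keyword, pop a `user_id` out of `**kwargs`, seed the trajectory with the goal, and collect `MacroEvent`s alongside screenshots. A hedged construction sketch; apart from `excluded_tools`, `save_trajectories`, and `user_id`, the argument names are assumptions based on the attributes referenced in this diff:

```python
# Illustrative only: goal/llm/tools keyword names are assumptions, the rest are visible above.
from droidrun import AdbTools, DroidAgent, load_llm

llm = load_llm("GoogleGenAI", model="gemini-2.5-flash")  # provider and model are placeholders
tools = AdbTools()                                       # assumed default construction

agent = DroidAgent(
    goal="Open Settings and enable Wi-Fi",
    llm=llm,
    tools=tools,
    excluded_tools=["drag"],  # new in 0.3.3: filtered in describe_tools(tools, excluded_tools)
    save_trajectories=True,
    user_id="demo-user",      # new in 0.3.3: popped from **kwargs before super().__init__()
)
```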
@@ -131,12 +131,12 @@ class PlannerAgent(Workflow):
  logger.info(f"🧠 Thinking about how to plan the goal...")
 
  if self.vision:
- screenshot = (await self.tools_instance.take_screenshot())[1]
+ screenshot = (self.tools_instance.take_screenshot())[1]
  ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
  await ctx.set("screenshot", screenshot)
 
  try:
- state = await self.tools_instance.get_state()
+ state = self.tools_instance.get_state()
  await ctx.set("ui_state", state["a11y_tree"])
  await ctx.set("phone_state", state["phone_state"])
  except Exception as e:
@@ -9,6 +9,7 @@ from llama_index.core.workflow import Context
  import asyncio
  from asyncio import AbstractEventLoop
  import threading
+ from droidrun.tools.adb import AdbTools
 
  logger = logging.getLogger("droidrun")
 
@@ -29,6 +30,7 @@ class SimpleCodeExecutor:
  locals: Dict[str, Any] = {},
  globals: Dict[str, Any] = {},
  tools={},
+ tools_instance=None,
  use_same_scope: bool = True,
  ):
  """
@@ -38,8 +40,11 @@ class SimpleCodeExecutor:
  locals: Local variables to use in the execution context
  globals: Global variables to use in the execution context
  tools: List of tools available for execution
+ tools_instance: Original tools instance (e.g., AdbTools instance)
  """
 
+ self.tools_instance = tools_instance
+
  # loop throught tools and add them to globals, but before that check if tool value is async, if so convert it to sync. tools is a dictionary of tool name: function
  # e.g. tools = {'tool_name': tool_function}
 
@@ -74,6 +79,7 @@ class SimpleCodeExecutor:
  self.locals = locals
  self.loop = loop
  self.use_same_scope = use_same_scope
+ self.tools = tools
  if self.use_same_scope:
  # If using the same scope, set the globals and locals to the same dictionary
  self.globals = self.locals = {
@@ -93,8 +99,10 @@ class SimpleCodeExecutor:
  """
  # Update UI elements before execution
  self.globals['ui_state'] = await ctx.get("ui_state", None)
-
- # Capture stdout and stderr
+
+ if self.tools_instance and isinstance(self.tools_instance, AdbTools):
+ self.tools_instance._set_context(ctx)
+
  stdout = io.StringIO()
  stderr = io.StringIO()
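
Putting the executor changes together: the constructor keeps a reference to the original tools object and, before executing code, hands the workflow `Context` to an `AdbTools` instance via `_set_context`. A hedged construction sketch using the parameters visible above; the module path and how `AdbTools` is constructed are assumptions:

```python
# Sketch based on the constructor parameters shown in this diff; the earlier CodeActAgent
# hunk wires this up for real via tools_instance=tools_instance.
import asyncio
from droidrun.tools.adb import AdbTools
from droidrun.agent.utils.executer import SimpleCodeExecutor  # module path is an assumption

adb = AdbTools()  # assumed default construction

executor = SimpleCodeExecutor(
    loop=asyncio.get_event_loop(),
    locals={},
    globals={"__builtins__": __builtins__},
    tools={"tap_by_index": adb.tap_by_index},  # name -> callable, per the constructor comments
    tools_instance=adb,  # new in 0.3.3: lets execute() call adb._set_context(ctx)
)
```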