droidrun 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -45,6 +45,7 @@ class CodeActAgent(Workflow):
45
45
  self,
46
46
  llm: LLM,
47
47
  persona: AgentPersona,
48
+ vision: bool,
48
49
  tools_instance: "Tools",
49
50
  all_tools_list: Dict[str, Callable[..., Any]],
50
51
  max_steps: int = 5,
@@ -62,6 +63,8 @@ class CodeActAgent(Workflow):
62
63
  self.user_prompt = persona.user_prompt
63
64
  self.no_thoughts_prompt = None
64
65
 
66
+ self.vision = vision
67
+
65
68
  self.chat_memory = None
66
69
  self.episodic_memory = EpisodicMemory(persona=persona)
67
70
  self.remembered_info = None
@@ -161,22 +164,28 @@ class CodeActAgent(Workflow):
161
164
  chat_history = await chat_utils.add_memory_block(self.remembered_info, chat_history)
162
165
 
163
166
  for context in self.required_context:
164
- if context == "screenshot" and model != "DeepSeek":
167
+ if model == "DeepSeek":
168
+ logger.warning(
169
+ "[yellow]DeepSeek doesnt support images. Disabling screenshots[/]"
170
+ )
171
+ elif self.vision == True and context == "screenshot":
165
172
  screenshot = (await self.tools.take_screenshot())[1]
166
173
  ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
167
174
 
168
175
  await ctx.set("screenshot", screenshot)
169
176
  chat_history = await chat_utils.add_screenshot_image_block(screenshot, chat_history)
170
177
 
171
- if context == "phone_state":
172
- chat_history = await chat_utils.add_phone_state_block(await self.tools.get_phone_state(), chat_history)
173
-
174
178
  if context == "ui_state":
175
- ui_state = await self.tools.get_clickables()
176
- await ctx.set("ui_state", ui_state)
177
- chat_history = await chat_utils.add_ui_text_block(
178
- ui_state, chat_history
179
- )
179
+ try:
180
+ state = await self.tools.get_state()
181
+ await ctx.set("ui_state", state["a11y_tree"])
182
+ chat_history = await chat_utils.add_ui_text_block(
183
+ state["a11y_tree"], chat_history
184
+ )
185
+ chat_history = await chat_utils.add_phone_state_block(state["phone_state"], chat_history)
186
+ except Exception as e:
187
+ logger.warning(f"⚠️ Error retrieving state from the connected device. Is the Accessibility Service enabled?")
188
+
180
189
 
181
190
  if context == "packages":
182
191
  chat_history = await chat_utils.add_packages_block(
@@ -394,7 +403,7 @@ class CodeActAgent(Workflow):
394
403
  logger.warning(f"Failed to capture final screenshot: {e}")
395
404
 
396
405
  try:
397
- ui_state = await self.tools.get_clickables()
406
+ (a11y_tree, phone_state) = await self.tools.get_state()
398
407
  except Exception as e:
399
408
  logger.warning(f"Failed to capture final UI state: {e}")
400
409
 
@@ -402,7 +411,7 @@ class CodeActAgent(Workflow):
402
411
  final_chat_history = [{"role": "system", "content": "Final state observation after task completion"}]
403
412
  final_response = {
404
413
  "role": "user",
405
- "content": f"Final State Observation:\nUI State: {ui_state}\nScreenshot: {'Available' if screenshot else 'Not available'}"
414
+ "content": f"Final State Observation:\nUI State: {a11y_tree}\nScreenshot: {'Available' if screenshot else 'Not available'}"
406
415
  }
407
416
 
408
417
  # Create final episodic memory step
@@ -21,7 +21,6 @@ DEFAULT = AgentPersona(
21
21
  required_context=[
22
22
  "ui_state",
23
23
  "screenshot",
24
- "phone_state"
25
24
  ],
26
25
  user_prompt="""
27
26
  **Current Request:**
@@ -46,6 +45,7 @@ DEFAULT = AgentPersona(
46
45
  - **screenshots**: A visual screenshot of the current state of the Android screen. This provides visual context for what the user sees. screenshots won't be saved in the chat history. So, make sure to describe what you see and explain the key parts of your plan in your thoughts, as those will be saved and used to assist you in future steps.
47
46
  - **phone_state**: The current app you are navigating in. This tells you which application context you're working within.
48
47
  - **chat history**: You are also given the history of your actions (if any) from your previous steps.
48
+ - **execution result**: The result of your last Action
49
49
  NOTE: you don't have access to these inputs in your tool calling context
50
50
 
51
51
  ## Response Format:
@@ -61,6 +61,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
61
61
  personas: List[AgentPersona] = [DEFAULT],
62
62
  max_steps: int = 15,
63
63
  timeout: int = 1000,
64
+ vision: bool = False,
64
65
  reasoning: bool = False,
65
66
  reflection: bool = False,
66
67
  enable_tracing: bool = False,
@@ -101,6 +102,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
101
102
 
102
103
  self.goal = goal
103
104
  self.llm = llm
105
+ self.vision = vision
104
106
  self.max_steps = max_steps
105
107
  self.max_codeact_steps = max_steps
106
108
  self.timeout = timeout
@@ -128,6 +130,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
128
130
  self.planner_agent = PlannerAgent(
129
131
  goal=goal,
130
132
  llm=llm,
133
+ vision=vision,
131
134
  personas=personas,
132
135
  task_manager=self.task_manager,
133
136
  tools_instance=tools,
@@ -171,6 +174,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
171
174
  codeact_agent = CodeActAgent(
172
175
  llm=self.llm,
173
176
  persona=persona,
177
+ vision=self.vision,
174
178
  max_steps=self.max_codeact_steps,
175
179
  all_tools_list=self.tool_list,
176
180
  tools_instance=self.tools_instance,
@@ -200,7 +204,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
200
204
  if self.debug:
201
205
  import traceback
202
206
  logger.error(traceback.format_exc())
203
- return CodeActResultEvent(success=False, reason=f"Error: {str(e)}", task=task, steps=result["codeact_steps"])
207
+ return CodeActResultEvent(success=False, reason=f"Error: {str(e)}", task=task, steps=[])
204
208
 
205
209
  @step
206
210
  async def handle_codeact_execute(self, ctx: Context, ev: CodeActResultEvent) -> FinalizeEvent | ReflectionEvent:
@@ -306,6 +310,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
306
310
  Dict containing the execution result
307
311
  """
308
312
  logger.info(f"🚀 Running DroidAgent to achieve goal: {self.goal}")
313
+ ctx.write_event_to_stream(ev)
309
314
 
310
315
  self.step_counter = 0
311
316
  self.retry_counter = 0
@@ -42,6 +42,7 @@ class PlannerAgent(Workflow):
42
42
  self,
43
43
  goal: str,
44
44
  llm: LLM,
45
+ vision: bool,
45
46
  personas: List[AgentPersona],
46
47
  task_manager: TaskManager,
47
48
  tools_instance: Tools,
@@ -57,6 +58,7 @@ class PlannerAgent(Workflow):
57
58
  self.goal = goal
58
59
  self.task_manager = task_manager
59
60
  self.debug = debug
61
+ self.vision = vision
60
62
 
61
63
  self.chat_memory = None
62
64
  self.remembered_info = None
@@ -128,12 +130,19 @@ class PlannerAgent(Workflow):
128
130
  self.steps_counter += 1
129
131
  logger.info(f"🧠 Thinking about how to plan the goal...")
130
132
 
131
- screenshot = (await self.tools_instance.take_screenshot())[1]
132
- ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
133
- await ctx.set("screenshot", screenshot)
133
+ if self.vision:
134
+ screenshot = (await self.tools_instance.take_screenshot())[1]
135
+ ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
136
+ await ctx.set("screenshot", screenshot)
137
+
138
+ try:
139
+ state = await self.tools_instance.get_state()
140
+ await ctx.set("ui_state", state["a11y_tree"])
141
+ await ctx.set("phone_state", state["phone_state"])
142
+ except Exception as e:
143
+ logger.warning(f"⚠️ Error retrieving state from the connected device. Is the Accessibility Service enabled?")
144
+
134
145
 
135
- await ctx.set("ui_state", await self.tools_instance.get_clickables())
136
- await ctx.set("phone_state", await self.tools_instance.get_phone_state())
137
146
  await ctx.set("remembered_info", self.remembered_info)
138
147
  await ctx.set("reflection", self.reflection)
139
148
 
@@ -187,7 +196,11 @@ class PlannerAgent(Workflow):
187
196
  await self.chat_memory.aput(
188
197
  ChatMessage(
189
198
  role="user",
190
- content=f"Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.",
199
+ content="""Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.
200
+ wrap your code inside this:
201
+ ```python
202
+ <YOUR CODE HERE>
203
+ ```""",
191
204
  )
192
205
  )
193
206
  logger.debug("🔄 Waiting for next plan or completion.")
@@ -196,7 +209,11 @@ class PlannerAgent(Workflow):
196
209
  await self.chat_memory.aput(
197
210
  ChatMessage(
198
211
  role="user",
199
- content=f"Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.",
212
+ content="""Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.
213
+ wrap your code inside this:
214
+ ```python
215
+ <YOUR CODE HERE>
216
+ ```""",
200
217
  )
201
218
  )
202
219
  logger.debug("🔄 Waiting for next plan or completion.")
@@ -224,15 +241,18 @@ class PlannerAgent(Workflow):
224
241
  logger.debug(f" - Sending {len(chat_history)} messages to LLM.")
225
242
 
226
243
  model = self.llm.class_name()
227
- if model != "DeepSeek":
228
- chat_history = await chat_utils.add_screenshot_image_block(
229
- await ctx.get("screenshot"), chat_history
230
- )
231
- else:
244
+ if model == "DeepSeek":
232
245
  logger.warning(
233
246
  "[yellow]DeepSeek doesnt support images. Disabling screenshots[/]"
234
247
  )
235
248
 
249
+ elif self.vision == True:
250
+ chat_history = await chat_utils.add_screenshot_image_block(
251
+ await ctx.get("screenshot"), chat_history
252
+ )
253
+
254
+
255
+
236
256
  chat_history = await chat_utils.add_task_history_block(
237
257
  self.task_manager.get_completed_tasks(),
238
258
  self.task_manager.get_failed_tasks(),
@@ -132,24 +132,21 @@ async def add_phone_state_block(phone_state, chat_history: List[ChatMessage]) ->
132
132
 
133
133
  # Format the phone state data nicely
134
134
  if isinstance(phone_state, dict) and 'error' not in phone_state:
135
- current_app = phone_state.get('currentApp', 'Unknown')
135
+ current_app = phone_state.get('currentApp', '')
136
136
  package_name = phone_state.get('packageName', 'Unknown')
137
137
  keyboard_visible = phone_state.get('keyboardVisible', False)
138
138
  focused_element = phone_state.get('focusedElement')
139
139
 
140
140
  # Format the focused element
141
141
  if focused_element:
142
- element_text = focused_element.get('text', 'No text')
143
- element_class = focused_element.get('className', 'Unknown')
144
- element_bounds = focused_element.get('bounds', 'Unknown')
145
- element_type = focused_element.get('type', 'unknown')
142
+ element_text = focused_element.get('text', '')
143
+ element_class = focused_element.get('className', '')
146
144
  element_resource_id = focused_element.get('resourceId', '')
147
145
 
148
146
  # Build focused element description
149
- focused_desc = f"'{element_text}' ({element_class})"
147
+ focused_desc = f"'{element_text}' {element_class}"
150
148
  if element_resource_id:
151
149
  focused_desc += f" | ID: {element_resource_id}"
152
- focused_desc += f" | Bounds: {element_bounds} | Type: {element_type}"
153
150
  else:
154
151
  focused_desc = "None"
155
152
 
droidrun/cli/main.py CHANGED
@@ -11,7 +11,7 @@ from rich.console import Console
11
11
  from droidrun.agent.droid import DroidAgent
12
12
  from droidrun.agent.utils.llm_picker import load_llm
13
13
  from droidrun.adb import DeviceManager
14
- from droidrun.tools import AdbTools, IOSTools, Tools
14
+ from droidrun.tools import AdbTools, IOSTools
15
15
  from functools import wraps
16
16
  from droidrun.cli.logs import LogHandler
17
17
 
@@ -59,6 +59,8 @@ async def run_command(
59
59
  model: str,
60
60
  steps: int,
61
61
  base_url: str,
62
+ api_base: str,
63
+ vision: bool,
62
64
  reasoning: bool,
63
65
  reflection: bool,
64
66
  tracing: bool,
@@ -101,7 +103,7 @@ async def run_command(
101
103
  # LLM setup
102
104
  log_handler.update_step("Initializing LLM...")
103
105
  llm = load_llm(
104
- provider_name=provider, model=model, base_url=base_url, **kwargs
106
+ provider_name=provider, model=model, base_url=base_url, api_base=api_base, **kwargs
105
107
  )
106
108
  logger.info(f"🧠 LLM ready: {provider}/{model}")
107
109
 
@@ -120,6 +122,7 @@ async def run_command(
120
122
  tools=tools,
121
123
  max_steps=steps,
122
124
  timeout=1000,
125
+ vision=vision,
123
126
  reasoning=reasoning,
124
127
  reflection=reflection,
125
128
  enable_tracing=tracing,
@@ -176,14 +179,14 @@ class DroidRunCLI(click.Group):
176
179
  @click.option(
177
180
  "--provider",
178
181
  "-p",
179
- help="LLM provider (OpenAI, Ollama, Anthropic, Gemini, DeepSeek)",
180
- default="Gemini",
182
+ help="LLM provider (OpenAI, Ollama, Anthropic, GoogleGenAI, DeepSeek)",
183
+ default="GoogleGenAI",
181
184
  )
182
185
  @click.option(
183
186
  "--model",
184
187
  "-m",
185
188
  help="LLM model name",
186
- default="models/gemini-2.5-pro",
189
+ default="models/gemini-2.5-flash",
187
190
  )
188
191
  @click.option("--temperature", type=float, help="Temperature for LLM", default=0.2)
189
192
  @click.option("--steps", type=int, help="Maximum number of steps", default=15)
@@ -194,7 +197,15 @@ class DroidRunCLI(click.Group):
194
197
  default=None,
195
198
  )
196
199
  @click.option(
197
- "--reasoning", is_flag=True, help="Enable/disable planning with reasoning", default=False
200
+ "--api_base",
201
+ help="Base URL for API (e.g., OpenAI, OpenAI-Like)",
202
+ default=None,
203
+ )
204
+ @click.option(
205
+ "--vision", is_flag=True, help="Enable vision capabilites by using screenshots", default=False
206
+ )
207
+ @click.option(
208
+ "--reasoning", is_flag=True, help="Enable planning with reasoning", default=False
198
209
  )
199
210
  @click.option(
200
211
  "--reflection", is_flag=True, help="Enable reflection step for higher reasoning", default=False
@@ -218,7 +229,9 @@ def cli(
218
229
  model: str,
219
230
  steps: int,
220
231
  base_url: str,
232
+ api_base: str,
221
233
  temperature: float,
234
+ vision: bool,
222
235
  reasoning: bool,
223
236
  reflection: bool,
224
237
  tracing: bool,
@@ -235,14 +248,14 @@ def cli(
235
248
  @click.option(
236
249
  "--provider",
237
250
  "-p",
238
- help="LLM provider (OpenAI, Ollama, Anthropic, Gemini, DeepSeek)",
239
- default="Gemini",
251
+ help="LLM provider (OpenAI, Ollama, Anthropic, GoogleGenAI, DeepSeek)",
252
+ default="GoogleGenAI",
240
253
  )
241
254
  @click.option(
242
255
  "--model",
243
256
  "-m",
244
257
  help="LLM model name",
245
- default="models/gemini-2.5-pro",
258
+ default="models/gemini-2.5-flash",
246
259
  )
247
260
  @click.option("--temperature", type=float, help="Temperature for LLM", default=0.2)
248
261
  @click.option("--steps", type=int, help="Maximum number of steps", default=15)
@@ -253,7 +266,15 @@ def cli(
253
266
  default=None,
254
267
  )
255
268
  @click.option(
256
- "--reasoning", is_flag=True, help="Enable/disable planning with reasoning", default=False
269
+ "--api_base",
270
+ help="Base URL for API (e.g., OpenAI or OpenAI-Like)",
271
+ default=None,
272
+ )
273
+ @click.option(
274
+ "--vision", is_flag=True, help="Enable vision capabilites by using screenshots", default=False
275
+ )
276
+ @click.option(
277
+ "--reasoning", is_flag=True, help="Enable planning with reasoning", default=False
257
278
  )
258
279
  @click.option(
259
280
  "--reflection", is_flag=True, help="Enable reflection step for higher reasoning", default=False
@@ -280,7 +301,9 @@ def run(
280
301
  model: str,
281
302
  steps: int,
282
303
  base_url: str,
304
+ api_base: str,
283
305
  temperature: float,
306
+ vision: bool,
284
307
  reasoning: bool,
285
308
  reflection: bool,
286
309
  tracing: bool,
@@ -297,6 +320,8 @@ def run(
297
320
  model,
298
321
  steps,
299
322
  base_url,
323
+ api_base,
324
+ vision,
300
325
  reasoning,
301
326
  reflection,
302
327
  tracing,
@@ -381,9 +406,9 @@ async def setup(path: str, device: str | None):
381
406
  f"[bold red]Error:[/] Could not get device object for {device}"
382
407
  )
383
408
  return
384
- tools = Tools(serial=device)
409
+
385
410
  console.print(f"[bold blue]Step 1/2: Installing APK:[/] {path}")
386
- result = await tools.install_app(path, False, True)
411
+ result = await device_obj.install_app(path, False, True)
387
412
 
388
413
  if "Error" in result:
389
414
  console.print(f"[bold red]Installation failed:[/] {result}")
@@ -449,13 +474,15 @@ if __name__ == "__main__":
449
474
  provider = "GoogleGenAI"
450
475
  model = "models/gemini-2.5-flash"
451
476
  temperature = 0
452
- api_key = os.getenv("GEMINI_API_KEY")
477
+ api_key = os.getenv("GOOGLE_API_KEY")
453
478
  steps = 15
479
+ vision = True
454
480
  reasoning = True
455
481
  reflection = False
456
482
  tracing = True
457
483
  debug = True
458
484
  base_url = None
485
+ api_base = None
459
486
  ios = False
460
487
  run_command(
461
488
  command=command,
@@ -464,11 +491,13 @@ if __name__ == "__main__":
464
491
  model=model,
465
492
  steps=steps,
466
493
  temperature=temperature,
494
+ vision=vision,
467
495
  reasoning=reasoning,
468
496
  reflection=reflection,
469
497
  tracing=tracing,
470
498
  debug=debug,
471
499
  base_url=base_url,
500
+ api_base=api_base,
472
501
  api_key=api_key,
473
502
  ios=ios
474
503
  )