droidrun-0.3.0-py3-none-any.whl → droidrun-0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- droidrun/agent/codeact/codeact_agent.py +20 -11
- droidrun/agent/context/personas/default.py +1 -1
- droidrun/agent/droid/droid_agent.py +6 -1
- droidrun/agent/planner/planner_agent.py +32 -12
- droidrun/agent/utils/chat_utils.py +4 -7
- droidrun/cli/main.py +42 -13
- droidrun/tools/adb.py +219 -291
- droidrun/tools/ios.py +4 -2
- droidrun/tools/tools.py +1 -5
- {droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/METADATA +3 -2
- {droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/RECORD +14 -16
- droidrun/agent/context/todo.txt +0 -4
- droidrun/run.py +0 -105
- {droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/WHEEL +0 -0
- {droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/entry_points.txt +0 -0
- {droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -45,6 +45,7 @@ class CodeActAgent(Workflow):
|
|
45
45
|
self,
|
46
46
|
llm: LLM,
|
47
47
|
persona: AgentPersona,
|
48
|
+
vision: bool,
|
48
49
|
tools_instance: "Tools",
|
49
50
|
all_tools_list: Dict[str, Callable[..., Any]],
|
50
51
|
max_steps: int = 5,
|
@@ -62,6 +63,8 @@ class CodeActAgent(Workflow):
|
|
62
63
|
self.user_prompt = persona.user_prompt
|
63
64
|
self.no_thoughts_prompt = None
|
64
65
|
|
66
|
+
self.vision = vision
|
67
|
+
|
65
68
|
self.chat_memory = None
|
66
69
|
self.episodic_memory = EpisodicMemory(persona=persona)
|
67
70
|
self.remembered_info = None
|
@@ -161,22 +164,28 @@ class CodeActAgent(Workflow):
|
|
161
164
|
chat_history = await chat_utils.add_memory_block(self.remembered_info, chat_history)
|
162
165
|
|
163
166
|
for context in self.required_context:
|
164
|
-
if
|
167
|
+
if model == "DeepSeek":
|
168
|
+
logger.warning(
|
169
|
+
"[yellow]DeepSeek doesnt support images. Disabling screenshots[/]"
|
170
|
+
)
|
171
|
+
elif self.vision == True and context == "screenshot":
|
165
172
|
screenshot = (await self.tools.take_screenshot())[1]
|
166
173
|
ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
|
167
174
|
|
168
175
|
await ctx.set("screenshot", screenshot)
|
169
176
|
chat_history = await chat_utils.add_screenshot_image_block(screenshot, chat_history)
|
170
177
|
|
171
|
-
if context == "phone_state":
|
172
|
-
chat_history = await chat_utils.add_phone_state_block(await self.tools.get_phone_state(), chat_history)
|
173
|
-
|
174
178
|
if context == "ui_state":
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
179
|
+
try:
|
180
|
+
state = await self.tools.get_state()
|
181
|
+
await ctx.set("ui_state", state["a11y_tree"])
|
182
|
+
chat_history = await chat_utils.add_ui_text_block(
|
183
|
+
state["a11y_tree"], chat_history
|
184
|
+
)
|
185
|
+
chat_history = await chat_utils.add_phone_state_block(state["phone_state"], chat_history)
|
186
|
+
except Exception as e:
|
187
|
+
logger.warning(f"⚠️ Error retrieving state from the connected device. Is the Accessibility Service enabled?")
|
188
|
+
|
180
189
|
|
181
190
|
if context == "packages":
|
182
191
|
chat_history = await chat_utils.add_packages_block(
|
@@ -394,7 +403,7 @@ class CodeActAgent(Workflow):
|
|
394
403
|
logger.warning(f"Failed to capture final screenshot: {e}")
|
395
404
|
|
396
405
|
try:
|
397
|
-
|
406
|
+
(a11y_tree, phone_state) = await self.tools.get_state()
|
398
407
|
except Exception as e:
|
399
408
|
logger.warning(f"Failed to capture final UI state: {e}")
|
400
409
|
|
@@ -402,7 +411,7 @@ class CodeActAgent(Workflow):
|
|
402
411
|
final_chat_history = [{"role": "system", "content": "Final state observation after task completion"}]
|
403
412
|
final_response = {
|
404
413
|
"role": "user",
|
405
|
-
"content": f"Final State Observation:\nUI State: {
|
414
|
+
"content": f"Final State Observation:\nUI State: {a11y_tree}\nScreenshot: {'Available' if screenshot else 'Not available'}"
|
406
415
|
}
|
407
416
|
|
408
417
|
# Create final episodic memory step
|
@@ -21,7 +21,6 @@ DEFAULT = AgentPersona(
|
|
21
21
|
required_context=[
|
22
22
|
"ui_state",
|
23
23
|
"screenshot",
|
24
|
-
"phone_state"
|
25
24
|
],
|
26
25
|
user_prompt="""
|
27
26
|
**Current Request:**
|
@@ -46,6 +45,7 @@ DEFAULT = AgentPersona(
|
|
46
45
|
- **screenshots**: A visual screenshot of the current state of the Android screen. This provides visual context for what the user sees. screenshots won't be saved in the chat history. So, make sure to describe what you see and explain the key parts of your plan in your thoughts, as those will be saved and used to assist you in future steps.
|
47
46
|
- **phone_state**: The current app you are navigating in. This tells you which application context you're working within.
|
48
47
|
- **chat history**: You are also given the history of your actions (if any) from your previous steps.
|
48
|
+
- **execution result**: The result of your last Action
|
49
49
|
NOTE: you don't have access to these inputs in your tool calling context
|
50
50
|
|
51
51
|
## Response Format:
|
@@ -61,6 +61,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
|
|
61
61
|
personas: List[AgentPersona] = [DEFAULT],
|
62
62
|
max_steps: int = 15,
|
63
63
|
timeout: int = 1000,
|
64
|
+
vision: bool = False,
|
64
65
|
reasoning: bool = False,
|
65
66
|
reflection: bool = False,
|
66
67
|
enable_tracing: bool = False,
|
@@ -101,6 +102,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
|
|
101
102
|
|
102
103
|
self.goal = goal
|
103
104
|
self.llm = llm
|
105
|
+
self.vision = vision
|
104
106
|
self.max_steps = max_steps
|
105
107
|
self.max_codeact_steps = max_steps
|
106
108
|
self.timeout = timeout
|
@@ -128,6 +130,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
|
|
128
130
|
self.planner_agent = PlannerAgent(
|
129
131
|
goal=goal,
|
130
132
|
llm=llm,
|
133
|
+
vision=vision,
|
131
134
|
personas=personas,
|
132
135
|
task_manager=self.task_manager,
|
133
136
|
tools_instance=tools,
|
@@ -171,6 +174,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
|
|
171
174
|
codeact_agent = CodeActAgent(
|
172
175
|
llm=self.llm,
|
173
176
|
persona=persona,
|
177
|
+
vision=self.vision,
|
174
178
|
max_steps=self.max_codeact_steps,
|
175
179
|
all_tools_list=self.tool_list,
|
176
180
|
tools_instance=self.tools_instance,
|
@@ -200,7 +204,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
|
|
200
204
|
if self.debug:
|
201
205
|
import traceback
|
202
206
|
logger.error(traceback.format_exc())
|
203
|
-
return CodeActResultEvent(success=False, reason=f"Error: {str(e)}", task=task, steps=
|
207
|
+
return CodeActResultEvent(success=False, reason=f"Error: {str(e)}", task=task, steps=[])
|
204
208
|
|
205
209
|
@step
|
206
210
|
async def handle_codeact_execute(self, ctx: Context, ev: CodeActResultEvent) -> FinalizeEvent | ReflectionEvent:
|
@@ -306,6 +310,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
|
|
306
310
|
Dict containing the execution result
|
307
311
|
"""
|
308
312
|
logger.info(f"🚀 Running DroidAgent to achieve goal: {self.goal}")
|
313
|
+
ctx.write_event_to_stream(ev)
|
309
314
|
|
310
315
|
self.step_counter = 0
|
311
316
|
self.retry_counter = 0
|
@@ -42,6 +42,7 @@ class PlannerAgent(Workflow):
|
|
42
42
|
self,
|
43
43
|
goal: str,
|
44
44
|
llm: LLM,
|
45
|
+
vision: bool,
|
45
46
|
personas: List[AgentPersona],
|
46
47
|
task_manager: TaskManager,
|
47
48
|
tools_instance: Tools,
|
@@ -57,6 +58,7 @@ class PlannerAgent(Workflow):
|
|
57
58
|
self.goal = goal
|
58
59
|
self.task_manager = task_manager
|
59
60
|
self.debug = debug
|
61
|
+
self.vision = vision
|
60
62
|
|
61
63
|
self.chat_memory = None
|
62
64
|
self.remembered_info = None
|
@@ -128,12 +130,19 @@ class PlannerAgent(Workflow):
|
|
128
130
|
self.steps_counter += 1
|
129
131
|
logger.info(f"🧠 Thinking about how to plan the goal...")
|
130
132
|
|
131
|
-
|
132
|
-
|
133
|
-
|
133
|
+
if self.vision:
|
134
|
+
screenshot = (await self.tools_instance.take_screenshot())[1]
|
135
|
+
ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
|
136
|
+
await ctx.set("screenshot", screenshot)
|
137
|
+
|
138
|
+
try:
|
139
|
+
state = await self.tools_instance.get_state()
|
140
|
+
await ctx.set("ui_state", state["a11y_tree"])
|
141
|
+
await ctx.set("phone_state", state["phone_state"])
|
142
|
+
except Exception as e:
|
143
|
+
logger.warning(f"⚠️ Error retrieving state from the connected device. Is the Accessibility Service enabled?")
|
144
|
+
|
134
145
|
|
135
|
-
await ctx.set("ui_state", await self.tools_instance.get_clickables())
|
136
|
-
await ctx.set("phone_state", await self.tools_instance.get_phone_state())
|
137
146
|
await ctx.set("remembered_info", self.remembered_info)
|
138
147
|
await ctx.set("reflection", self.reflection)
|
139
148
|
|
@@ -187,7 +196,11 @@ class PlannerAgent(Workflow):
|
|
187
196
|
await self.chat_memory.aput(
|
188
197
|
ChatMessage(
|
189
198
|
role="user",
|
190
|
-
content=
|
199
|
+
content="""Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.
|
200
|
+
wrap your code inside this:
|
201
|
+
```python
|
202
|
+
<YOUR CODE HERE>
|
203
|
+
```""",
|
191
204
|
)
|
192
205
|
)
|
193
206
|
logger.debug("🔄 Waiting for next plan or completion.")
|
@@ -196,7 +209,11 @@ class PlannerAgent(Workflow):
|
|
196
209
|
await self.chat_memory.aput(
|
197
210
|
ChatMessage(
|
198
211
|
role="user",
|
199
|
-
content=
|
212
|
+
content="""Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.
|
213
|
+
wrap your code inside this:
|
214
|
+
```python
|
215
|
+
<YOUR CODE HERE>
|
216
|
+
```""",
|
200
217
|
)
|
201
218
|
)
|
202
219
|
logger.debug("🔄 Waiting for next plan or completion.")
|
@@ -224,15 +241,18 @@ class PlannerAgent(Workflow):
|
|
224
241
|
logger.debug(f" - Sending {len(chat_history)} messages to LLM.")
|
225
242
|
|
226
243
|
model = self.llm.class_name()
|
227
|
-
if model
|
228
|
-
chat_history = await chat_utils.add_screenshot_image_block(
|
229
|
-
await ctx.get("screenshot"), chat_history
|
230
|
-
)
|
231
|
-
else:
|
244
|
+
if model == "DeepSeek":
|
232
245
|
logger.warning(
|
233
246
|
"[yellow]DeepSeek doesnt support images. Disabling screenshots[/]"
|
234
247
|
)
|
235
248
|
|
249
|
+
elif self.vision == True:
|
250
|
+
chat_history = await chat_utils.add_screenshot_image_block(
|
251
|
+
await ctx.get("screenshot"), chat_history
|
252
|
+
)
|
253
|
+
|
254
|
+
|
255
|
+
|
236
256
|
chat_history = await chat_utils.add_task_history_block(
|
237
257
|
self.task_manager.get_completed_tasks(),
|
238
258
|
self.task_manager.get_failed_tasks(),
|
@@ -132,24 +132,21 @@ async def add_phone_state_block(phone_state, chat_history: List[ChatMessage]) ->
|
|
132
132
|
|
133
133
|
# Format the phone state data nicely
|
134
134
|
if isinstance(phone_state, dict) and 'error' not in phone_state:
|
135
|
-
current_app = phone_state.get('currentApp', '
|
135
|
+
current_app = phone_state.get('currentApp', '')
|
136
136
|
package_name = phone_state.get('packageName', 'Unknown')
|
137
137
|
keyboard_visible = phone_state.get('keyboardVisible', False)
|
138
138
|
focused_element = phone_state.get('focusedElement')
|
139
139
|
|
140
140
|
# Format the focused element
|
141
141
|
if focused_element:
|
142
|
-
element_text = focused_element.get('text', '
|
143
|
-
element_class = focused_element.get('className', '
|
144
|
-
element_bounds = focused_element.get('bounds', 'Unknown')
|
145
|
-
element_type = focused_element.get('type', 'unknown')
|
142
|
+
element_text = focused_element.get('text', '')
|
143
|
+
element_class = focused_element.get('className', '')
|
146
144
|
element_resource_id = focused_element.get('resourceId', '')
|
147
145
|
|
148
146
|
# Build focused element description
|
149
|
-
focused_desc = f"'{element_text}'
|
147
|
+
focused_desc = f"'{element_text}' {element_class}"
|
150
148
|
if element_resource_id:
|
151
149
|
focused_desc += f" | ID: {element_resource_id}"
|
152
|
-
focused_desc += f" | Bounds: {element_bounds} | Type: {element_type}"
|
153
150
|
else:
|
154
151
|
focused_desc = "None"
|
155
152
|
|
droidrun/cli/main.py
CHANGED
@@ -11,7 +11,7 @@ from rich.console import Console
|
|
11
11
|
from droidrun.agent.droid import DroidAgent
|
12
12
|
from droidrun.agent.utils.llm_picker import load_llm
|
13
13
|
from droidrun.adb import DeviceManager
|
14
|
-
from droidrun.tools import AdbTools, IOSTools
|
14
|
+
from droidrun.tools import AdbTools, IOSTools
|
15
15
|
from functools import wraps
|
16
16
|
from droidrun.cli.logs import LogHandler
|
17
17
|
|
@@ -59,6 +59,8 @@ async def run_command(
|
|
59
59
|
model: str,
|
60
60
|
steps: int,
|
61
61
|
base_url: str,
|
62
|
+
api_base: str,
|
63
|
+
vision: bool,
|
62
64
|
reasoning: bool,
|
63
65
|
reflection: bool,
|
64
66
|
tracing: bool,
|
@@ -101,7 +103,7 @@ async def run_command(
|
|
101
103
|
# LLM setup
|
102
104
|
log_handler.update_step("Initializing LLM...")
|
103
105
|
llm = load_llm(
|
104
|
-
provider_name=provider, model=model, base_url=base_url, **kwargs
|
106
|
+
provider_name=provider, model=model, base_url=base_url, api_base=api_base, **kwargs
|
105
107
|
)
|
106
108
|
logger.info(f"🧠 LLM ready: {provider}/{model}")
|
107
109
|
|
@@ -120,6 +122,7 @@ async def run_command(
|
|
120
122
|
tools=tools,
|
121
123
|
max_steps=steps,
|
122
124
|
timeout=1000,
|
125
|
+
vision=vision,
|
123
126
|
reasoning=reasoning,
|
124
127
|
reflection=reflection,
|
125
128
|
enable_tracing=tracing,
|
@@ -176,14 +179,14 @@ class DroidRunCLI(click.Group):
|
|
176
179
|
@click.option(
|
177
180
|
"--provider",
|
178
181
|
"-p",
|
179
|
-
help="LLM provider (OpenAI, Ollama, Anthropic,
|
180
|
-
default="
|
182
|
+
help="LLM provider (OpenAI, Ollama, Anthropic, GoogleGenAI, DeepSeek)",
|
183
|
+
default="GoogleGenAI",
|
181
184
|
)
|
182
185
|
@click.option(
|
183
186
|
"--model",
|
184
187
|
"-m",
|
185
188
|
help="LLM model name",
|
186
|
-
default="models/gemini-2.5-
|
189
|
+
default="models/gemini-2.5-flash",
|
187
190
|
)
|
188
191
|
@click.option("--temperature", type=float, help="Temperature for LLM", default=0.2)
|
189
192
|
@click.option("--steps", type=int, help="Maximum number of steps", default=15)
|
@@ -194,7 +197,15 @@ class DroidRunCLI(click.Group):
|
|
194
197
|
default=None,
|
195
198
|
)
|
196
199
|
@click.option(
|
197
|
-
"--
|
200
|
+
"--api_base",
|
201
|
+
help="Base URL for API (e.g., OpenAI, OpenAI-Like)",
|
202
|
+
default=None,
|
203
|
+
)
|
204
|
+
@click.option(
|
205
|
+
"--vision", is_flag=True, help="Enable vision capabilites by using screenshots", default=False
|
206
|
+
)
|
207
|
+
@click.option(
|
208
|
+
"--reasoning", is_flag=True, help="Enable planning with reasoning", default=False
|
198
209
|
)
|
199
210
|
@click.option(
|
200
211
|
"--reflection", is_flag=True, help="Enable reflection step for higher reasoning", default=False
|
@@ -218,7 +229,9 @@ def cli(
|
|
218
229
|
model: str,
|
219
230
|
steps: int,
|
220
231
|
base_url: str,
|
232
|
+
api_base: str,
|
221
233
|
temperature: float,
|
234
|
+
vision: bool,
|
222
235
|
reasoning: bool,
|
223
236
|
reflection: bool,
|
224
237
|
tracing: bool,
|
@@ -235,14 +248,14 @@ def cli(
|
|
235
248
|
@click.option(
|
236
249
|
"--provider",
|
237
250
|
"-p",
|
238
|
-
help="LLM provider (OpenAI, Ollama, Anthropic,
|
239
|
-
default="
|
251
|
+
help="LLM provider (OpenAI, Ollama, Anthropic, GoogleGenAI, DeepSeek)",
|
252
|
+
default="GoogleGenAI",
|
240
253
|
)
|
241
254
|
@click.option(
|
242
255
|
"--model",
|
243
256
|
"-m",
|
244
257
|
help="LLM model name",
|
245
|
-
default="models/gemini-2.5-
|
258
|
+
default="models/gemini-2.5-flash",
|
246
259
|
)
|
247
260
|
@click.option("--temperature", type=float, help="Temperature for LLM", default=0.2)
|
248
261
|
@click.option("--steps", type=int, help="Maximum number of steps", default=15)
|
@@ -253,7 +266,15 @@ def cli(
|
|
253
266
|
default=None,
|
254
267
|
)
|
255
268
|
@click.option(
|
256
|
-
"--
|
269
|
+
"--api_base",
|
270
|
+
help="Base URL for API (e.g., OpenAI or OpenAI-Like)",
|
271
|
+
default=None,
|
272
|
+
)
|
273
|
+
@click.option(
|
274
|
+
"--vision", is_flag=True, help="Enable vision capabilites by using screenshots", default=False
|
275
|
+
)
|
276
|
+
@click.option(
|
277
|
+
"--reasoning", is_flag=True, help="Enable planning with reasoning", default=False
|
257
278
|
)
|
258
279
|
@click.option(
|
259
280
|
"--reflection", is_flag=True, help="Enable reflection step for higher reasoning", default=False
|
@@ -280,7 +301,9 @@ def run(
|
|
280
301
|
model: str,
|
281
302
|
steps: int,
|
282
303
|
base_url: str,
|
304
|
+
api_base: str,
|
283
305
|
temperature: float,
|
306
|
+
vision: bool,
|
284
307
|
reasoning: bool,
|
285
308
|
reflection: bool,
|
286
309
|
tracing: bool,
|
@@ -297,6 +320,8 @@ def run(
|
|
297
320
|
model,
|
298
321
|
steps,
|
299
322
|
base_url,
|
323
|
+
api_base,
|
324
|
+
vision,
|
300
325
|
reasoning,
|
301
326
|
reflection,
|
302
327
|
tracing,
|
@@ -381,9 +406,9 @@ async def setup(path: str, device: str | None):
|
|
381
406
|
f"[bold red]Error:[/] Could not get device object for {device}"
|
382
407
|
)
|
383
408
|
return
|
384
|
-
|
409
|
+
|
385
410
|
console.print(f"[bold blue]Step 1/2: Installing APK:[/] {path}")
|
386
|
-
result = await
|
411
|
+
result = await device_obj.install_app(path, False, True)
|
387
412
|
|
388
413
|
if "Error" in result:
|
389
414
|
console.print(f"[bold red]Installation failed:[/] {result}")
|
@@ -449,13 +474,15 @@ if __name__ == "__main__":
|
|
449
474
|
provider = "GoogleGenAI"
|
450
475
|
model = "models/gemini-2.5-flash"
|
451
476
|
temperature = 0
|
452
|
-
api_key = os.getenv("
|
477
|
+
api_key = os.getenv("GOOGLE_API_KEY")
|
453
478
|
steps = 15
|
479
|
+
vision = True
|
454
480
|
reasoning = True
|
455
481
|
reflection = False
|
456
482
|
tracing = True
|
457
483
|
debug = True
|
458
484
|
base_url = None
|
485
|
+
api_base = None
|
459
486
|
ios = False
|
460
487
|
run_command(
|
461
488
|
command=command,
|
@@ -464,11 +491,13 @@ if __name__ == "__main__":
|
|
464
491
|
model=model,
|
465
492
|
steps=steps,
|
466
493
|
temperature=temperature,
|
494
|
+
vision=vision,
|
467
495
|
reasoning=reasoning,
|
468
496
|
reflection=reflection,
|
469
497
|
tracing=tracing,
|
470
498
|
debug=debug,
|
471
499
|
base_url=base_url,
|
500
|
+
api_base=api_base,
|
472
501
|
api_key=api_key,
|
473
502
|
ios=ios
|
474
503
|
)
|