droidrun 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,556 @@
"""
ReAct Agent - Reasoning + Acting agent for controlling Android devices.

This module implements a ReAct agent that can control Android devices through
reasoning about the current state and taking appropriate actions.
"""

import time
import logging
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple, Callable, Union

# Import tools
from droidrun.tools import (
    DeviceManager,
    tap,
    swipe,
    input_text,
    press_key,
    start_app,
    install_app,
    uninstall_app,
    take_screenshot,
    list_packages,
    get_clickables,
    complete,
    extract,
)

# Import LLM reasoning
from .llm_reasoning import LLMReasoner

# Set up logger
logger = logging.getLogger("droidrun")


class ReActStepType(Enum):
    """Types of steps in a ReAct agent's reasoning and acting process."""
    THOUGHT = "thought"            # Internal reasoning step
    ACTION = "action"              # Taking an action
    OBSERVATION = "observation"    # Observing the result
    PLAN = "plan"                  # Planning future steps
    GOAL = "goal"                  # Setting or refining the goal


class ReActStep:
    """A single step in the ReAct agent's process."""

    def __init__(
        self,
        step_type: ReActStepType,
        content: str,
    ):
        """Initialize a ReAct step.

        Args:
            step_type: The type of step (thought, action, observation)
            content: The content of the step
        """
        self.step_type = step_type
        self.content = content
        self.timestamp = time.time()

    def to_dict(self) -> Dict[str, Any]:
        """Convert the step to a dictionary.

        Returns:
            Dict representation of the step
        """
        return {
            "type": self.step_type.value,
            "content": self.content,
            "timestamp": self.timestamp
        }

    def __str__(self) -> str:
        """String representation of the step.

        Returns:
            Formatted string representation
        """
        type_str = self.step_type.value.upper()

        # Format based on step type
        if self.step_type == ReActStepType.THOUGHT:
            return f"🤔 THOUGHT: {self.content}"
        elif self.step_type == ReActStepType.ACTION:
            return f"🔄 ACTION: {self.content}"
        elif self.step_type == ReActStepType.OBSERVATION:
            return f"👁️ OBSERVATION: {self.content}"
        elif self.step_type == ReActStepType.PLAN:
            return f"📝 PLAN: {self.content}"
        elif self.step_type == ReActStepType.GOAL:
            return f"🎯 GOAL: {self.content}"

        return f"{type_str}: {self.content}"


class ReActAgent:
    """ReAct agent for Android device automation."""

    def __init__(
        self,
        task: Optional[str] = None,
        llm: Any = None,
        device_serial: Optional[str] = None,
        max_steps: int = 100,
        vision: bool = False
    ):
        """Initialize the ReAct agent.

        Args:
            task: The automation task to perform (same as goal)
            llm: LLM instance to use for reasoning
            device_serial: Serial number of the Android device to control
            max_steps: Maximum number of steps to take
            vision: Whether to enable vision capabilities (screenshot tool)
        """
        self.device_serial = device_serial
        self.goal = task  # Store task as goal for backward compatibility
        self.max_steps = max_steps
        self.llm_instance = llm
        self.vision = vision

        # Initialize steps list
        self.steps: List[ReActStep] = []

        # Initialize screenshot storage
        self._last_screenshot: Optional[bytes] = None

        # Configure logging
        logging.basicConfig(level=logging.INFO)

        # Define available tools and their functions
        self.tools: Dict[str, Callable] = {
            # UI interaction
            "tap": tap,
            "swipe": swipe,
            "input_text": input_text,
            "press_key": press_key,

            # App management
            "start_app": start_app,
            "install_app": install_app,
            "uninstall_app": uninstall_app,
            "list_packages": list_packages,

            # UI analysis
            "get_clickables": get_clickables,

            # Data extraction
            "extract": extract,

            # Goal management
            "complete": complete,
        }
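        # Each name above is what the LLM emits in its "action" field;
        # execute_tool() dispatches on these names and injects the device
        # serial for tools whose signatures accept one.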

        # Add screenshot tool only if vision is enabled
        if vision:
            self.tools["take_screenshot"] = take_screenshot
            logger.info("Vision capabilities enabled: screenshot tool available")
        else:
            logger.info("Vision capabilities disabled: screenshot tool not available")

        # Initialize device manager
        self.device_manager = DeviceManager()

        # Initialize LLM reasoner based on provided LLM or defaults
        try:
            # If an llm instance is provided, use it or adapt it
            if llm is not None:
                # Check if the llm is already an LLMReasoner instance
                if hasattr(llm, "reason") and callable(getattr(llm, "reason")):
                    # If it's already an LLMReasoner or compatible, use it directly
                    self.reasoner = llm
                    logger.info(f"Using provided LLM reasoner: provider={getattr(llm, 'llm_provider', 'unknown')}, model={getattr(llm, 'model_name', 'unknown')}")
                else:
                    # Detect the type of LLM and use appropriate initialization
                    llm_provider, model_name, api_key = self._detect_llm_type(llm)

                    self.reasoner = LLMReasoner(
                        llm_provider=llm_provider,
                        model_name=model_name,
                        api_key=api_key,
                        temperature=0.2,
                        max_tokens=2000,
                        vision=self.vision
                    )
            else:
                # Use default OpenAI if no LLM provided
                self.reasoner = LLMReasoner(
                    llm_provider="openai",
                    temperature=0.2,
                    max_tokens=2000,
                    vision=self.vision
                )

            self.use_llm = True
        except (ImportError, ValueError) as e:
            logger.warning(f"LLM reasoning not available: {e}")
            self.reasoner = None
            self.use_llm = False

    def _detect_llm_type(self, llm_instance: Any) -> Tuple[str, Optional[str], Optional[str]]:
        """Detect the type of LLM instance and return appropriate configuration.

        Args:
            llm_instance: The LLM instance provided

        Returns:
            Tuple of (provider_name, model_name, api_key)
        """
        # First check if it's our own LLMReasoner
        if hasattr(llm_instance, "llm_provider") and hasattr(llm_instance, "model_name"):
            # It's likely our own LLMReasoner, use its attributes directly
            return (
                llm_instance.llm_provider,
                llm_instance.model_name,
                llm_instance.api_key if hasattr(llm_instance, "api_key") else None
            )

        # Default values if not a recognized type
        provider = "openai"
        model = None
        api_key = None

        # Check for common attributes to determine provider
        instance_type = str(type(llm_instance))

        if "openai" in instance_type.lower():
            provider = "openai"
            if hasattr(llm_instance, "model_name"):
                model = llm_instance.model_name
            elif hasattr(llm_instance, "model"):
                model = llm_instance.model

            # Try to extract API key if available
            if hasattr(llm_instance, "api_key"):
                api_key = llm_instance.api_key

        elif "anthropic" in instance_type.lower() or "claude" in instance_type.lower():
            provider = "anthropic"
            if hasattr(llm_instance, "model_name"):
                model = llm_instance.model_name
            elif hasattr(llm_instance, "model"):
                model = llm_instance.model

            # Try to extract API key if available
            if hasattr(llm_instance, "api_key"):
                api_key = llm_instance.api_key

        elif "gemini" in instance_type.lower():
            provider = "gemini"
            if hasattr(llm_instance, "model_name"):
                model = llm_instance.model_name
            elif hasattr(llm_instance, "model"):
                model = llm_instance.model

            # Try to extract API key if available
            if hasattr(llm_instance, "api_key"):
                api_key = llm_instance.api_key

        logger.info(f"Detected LLM type: {provider}, model: {model}")
        return provider, model, api_key

    async def connect(self) -> bool:
        """Connect to the specified device.

        Returns:
            True if connection successful, False otherwise
        """
        try:
            devices = await self.device_manager.list_devices()

            if not self.device_serial:
                # If no device specified, use the first one available
                if not devices:
                    logger.error("No devices found")
                    return False

                self.device_serial = devices[0].serial
                logger.info(f"Using first available device: {self.device_serial}")

            # Check if the specified device exists
            device_exists = any(device.serial == self.device_serial for device in devices)

            if not device_exists:
                logger.error(f"Device {self.device_serial} not found")
                return False

            logger.info(f"Connected to device: {self.device_serial}")
            return True

        except Exception as e:
            logger.error(f"Error connecting to device: {e}")
            return False

    async def add_step(
        self,
        step_type: ReActStepType,
        content: str,
    ) -> ReActStep:
        """Add a step to the agent's reasoning process.

        Args:
            step_type: Type of step
            content: Content of the step

        Returns:
            The created ReActStep
        """
        # Create the step
        step = ReActStep(step_type, content)

        # Add to steps list
        self.steps.append(step)

        # Log the step
        logger.info(str(step))

        return step

    async def execute_tool(self, tool_name: str, **kwargs) -> Any:
        """Execute a tool by name with the given arguments.

        Args:
            tool_name: Name of the tool to execute
            **kwargs: Arguments to pass to the tool

        Returns:
            The result of tool execution

        Raises:
            ValueError: If tool not found or parameter validation fails
        """
        import inspect

        if tool_name not in self.tools:
            # Clean up tool name by removing extra parentheses
            cleaned_tool_name = tool_name.replace("()", "")
            if cleaned_tool_name in self.tools:
                tool_name = cleaned_tool_name
            else:
                raise ValueError(f"Tool {tool_name} not found")

        tool_func = self.tools[tool_name]

        # Add serial number if needed and not provided
        sig = inspect.signature(tool_func)
        if 'serial' in sig.parameters and 'serial' not in kwargs:
            kwargs['serial'] = self.device_serial

        try:
            # Execute the tool and capture the result
            result = await tool_func(**kwargs)

            # Special handling for formatted results
            if tool_name == "list_packages" and isinstance(result, dict):
                # Format package list for better readability
                message = result.get("message", "")
                packages = result.get("packages", [])
                package_list = "\n".join([f"- {pkg.get('package', '')}" for pkg in packages])

                return f"{message}\n{package_list}"
+ elif tool_name == "get_clickables" and isinstance(result, dict):
367
+ # Format clickable elements for better readability
368
+ message = result.get("message", "")
369
+ clickable = result.get("clickable_elements", [])
370
+ return clickable
371
+
372
+
373
+ elif tool_name == "take_screenshot" and isinstance(result, tuple) and len(result) >= 2:
374
+ # For screenshots, store the image data for the LLM and return the path
375
+ path, image_data = result
376
+ # Store the screenshot data for the next LLM call
377
+ self._last_screenshot = image_data
378
+ return f"Screenshot captured and available for analysis"
379
+ else:
380
+ return result
381
+
382
+ except Exception as e:
383
+ logger.error(f"Error executing tool {tool_name}: {e}")
384
+ return f"Error: {str(e)}"
385
+

    async def run(self) -> List[ReActStep]:
        """Run the ReAct agent to achieve the goal.

        Returns:
            List of steps taken during execution
        """
        if not self.goal:
            raise ValueError("No goal specified")

        # Connect to device
        if not await self.connect():
            await self.add_step(
                ReActStepType.OBSERVATION,
                "Failed to connect to device"
            )
            return self.steps

        # Add initial goal step
        await self.add_step(ReActStepType.GOAL, self.goal)

        # Continue with ReAct loop
        step_count = 0
        goal_achieved = False
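
        # Each iteration runs one ReAct cycle: the LLM proposes a thought and
        # an action (reasoning), the agent executes the matching tool (acting),
        # and the result is appended as an observation that feeds the next
        # reasoning call.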
        while step_count < self.max_steps and not goal_achieved:
            # Generate next step using LLM reasoning
            if self.use_llm and self.reasoner:
                try:
                    # Convert steps to dictionaries for the LLM
                    history = [step.to_dict() for step in self.steps]

                    # Get available tool names
                    available_tools = list(self.tools.keys())

                    # Get LLM reasoning, passing the last screenshot if available
                    reasoning_result = await self.reasoner.reason(
                        goal=self.goal,
                        history=history,
                        available_tools=available_tools,
                        screenshot_data=self._last_screenshot
                    )

                    # Clear the screenshot after using it
                    self._last_screenshot = None

                    # Extract thought, action, and parameters
                    thought = reasoning_result.get("thought", "")
                    action = reasoning_result.get("action", "")
                    parameters = reasoning_result.get("parameters", {})

                    # Add thought step
                    await self.add_step(
                        ReActStepType.THOUGHT,
                        thought,
                    )

                    # Add action step
                    action_description = f"{action}({', '.join(f'{k}={v}' for k, v in parameters.items())})"
                    await self.add_step(ReActStepType.ACTION, action_description)

                    # Execute the action if it's a valid tool
                    result = "No action taken"
                    if action in self.tools:
                        try:
                            # Execute the tool
                            result = await self.execute_tool(action, **parameters)

                            # Check if the complete tool was called
                            if action == "complete":
                                goal_achieved = True
                                # Report token usage with a rough flat-rate
                                # estimate ($0.10 per 1M tokens); actual pricing
                                # varies by provider and model
                                stats = self.reasoner.get_token_usage_stats()
                                total_tokens = stats['total_tokens']
                                cost = (total_tokens / 1_000_000) * 0.10

                                print("\n===== Final Token Usage and Cost =====")
                                print(f"Total Tokens Used: {total_tokens:,}")
                                print(f"Total API Calls: {stats['api_calls']}")
                                print(f"Estimated Cost: ${cost:.4f}")
                                print("======================================\n")

                                print(f"Summary: {result}")

                            if isinstance(result, bytes):
                                result = f"Binary data ({len(result)} bytes)"
                            elif isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], bytes):
                                # For screenshots, which return (path, bytes)
                                result = f"Screenshot saved to {result[0]} ({len(result[1])} bytes)"
                        except Exception as e:
                            result = f"Error: {str(e)}"
                    else:
                        result = f"Invalid action: {action}"

                    # Add the observation step with the result
                    await self.add_step(
                        ReActStepType.OBSERVATION,
                        str(result)
                    )

                    # Check if goal is achieved (let the LLM determine this)
                    if "goal achieved" in thought.lower() or "goal complete" in thought.lower():
                        goal_achieved = True

                except Exception as e:
                    logger.error(f"Error in LLM reasoning: {e}")
                    await self.add_step(
                        ReActStepType.OBSERVATION,
                        f"Error in LLM reasoning: {e}"
                    )
            else:
                # Without a reasoner there is nothing to drive the loop, so
                # stop instead of spinning until max_steps
                await self.add_step(
                    ReActStepType.OBSERVATION,
                    "LLM reasoning is not available; stopping."
                )
                break

            # Increment step count
            step_count += 1

        # Add final step if goal achieved
        if goal_achieved:
            await self.add_step(
                ReActStepType.OBSERVATION,
                f"Goal achieved in {step_count} steps."
            )
        elif step_count >= self.max_steps:
            await self.add_step(
                ReActStepType.OBSERVATION,
                f"Maximum steps ({self.max_steps}) reached without achieving goal."
            )

        return self.steps


async def run_agent(
    task: str,
    llm: Any = None,
    device_serial: Optional[str] = None,
    llm_provider: Optional[str] = None,
    model_name: Optional[str] = None,
    api_key: Optional[str] = None,
    vision: bool = False
) -> List[ReActStep]:
    """Run the ReAct agent on the given task.

    Args:
        task: The automation task to perform
        llm: LLM instance to use for reasoning
        device_serial: Serial number of the Android device
        llm_provider: LLM provider name (openai, anthropic, or gemini)
        model_name: Name of the LLM model to use
        api_key: API key for accessing the LLM service
        vision: Whether to enable vision capabilities (screenshot tool)

    Returns:
        List of ReAct steps taken
    """
    # If no LLM instance was given but provider settings were, build an LLMReasoner
    if llm is None and llm_provider and model_name and api_key:
        logger.info(f"Creating LLMReasoner with provider={llm_provider}, model={model_name}")
        llm = LLMReasoner(
            llm_provider=llm_provider,
            model_name=model_name,
            api_key=api_key,
            temperature=0.2,
            max_tokens=2000,
            vision=vision
        )

    agent = ReActAgent(
        task=task,
        llm=llm,
        device_serial=device_serial,
        vision=vision
    )

    steps = await agent.run()
    return steps
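
For reference, a minimal sketch of how run_agent might be invoked from an async
entry point (the import path and model name here are illustrative assumptions,
not taken from the package):

    import asyncio

    # Hypothetical import path; the module's actual location inside the
    # droidrun package is not shown in this diff.
    from droidrun.agent import run_agent

    async def main():
        steps = await run_agent(
            task="Open the Settings app and enable Wi-Fi",
            llm_provider="openai",
            model_name="gpt-4o",       # assumed model name
            api_key="<YOUR_API_KEY>",  # placeholder
            vision=True,
        )
        for step in steps:
            print(step)  # uses ReActStep.__str__ formatting

    asyncio.run(main())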
@@ -0,0 +1,9 @@
"""
DroidRun CLI Module.

This module provides command-line interfaces for interacting with Android devices.
"""

from .main import cli

__all__ = ["cli"]
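
A minimal sketch of how this package-level export might be used (assuming cli
is a standard callable entry point; the commands it defines live in .main and
are not shown in this diff):

    # hypothetical launcher script
    from droidrun.cli import cli

    if __name__ == "__main__":
        cli()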