PyPI - droidrun - Versions diffs - 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend

droidrun 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

droidrun/agent/codeact/codeact_agent.py +20 -11
droidrun/agent/context/personas/default.py +1 -1
droidrun/agent/droid/droid_agent.py +6 -1
droidrun/agent/planner/planner_agent.py +32 -12
droidrun/agent/utils/chat_utils.py +4 -7
droidrun/cli/main.py +42 -13
droidrun/tools/adb.py +219 -291
droidrun/tools/ios.py +4 -2
droidrun/tools/tools.py +1 -5
{droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/METADATA +3 -2
{droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/RECORD +14 -16
droidrun/agent/context/todo.txt +0 -4
droidrun/run.py +0 -105
{droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/WHEEL +0 -0
{droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/entry_points.txt +0 -0
{droidrun-0.3.0.dist-info → droidrun-0.3.1.dist-info}/licenses/LICENSE +0 -0

droidrun/agent/codeact/codeact_agent.py CHANGED Viewed

@@ -45,6 +45,7 @@ class CodeActAgent(Workflow):
         self,
         llm: LLM,
         persona: AgentPersona,
+        vision: bool,
         tools_instance: "Tools",
         all_tools_list: Dict[str, Callable[..., Any]],
         max_steps: int = 5,
@@ -62,6 +63,8 @@ class CodeActAgent(Workflow):
         self.user_prompt = persona.user_prompt
         self.no_thoughts_prompt = None
+        self.vision = vision
         self.chat_memory = None
         self.episodic_memory = EpisodicMemory(persona=persona)
         self.remembered_info = None
@@ -161,22 +164,28 @@ class CodeActAgent(Workflow):
             chat_history = await chat_utils.add_memory_block(self.remembered_info, chat_history)
         for context in self.required_context:
-            if context == "screenshot" and model != "DeepSeek":
+            if model == "DeepSeek":
+                logger.warning(
+                    "[yellow]DeepSeek doesnt support images. Disabling screenshots[/]"
+                )
+            elif self.vision == True and context == "screenshot":
                 screenshot = (await self.tools.take_screenshot())[1]
                 ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
                 await ctx.set("screenshot", screenshot)
                 chat_history = await chat_utils.add_screenshot_image_block(screenshot, chat_history)
-            if context == "phone_state":
-                chat_history = await chat_utils.add_phone_state_block(await self.tools.get_phone_state(), chat_history)
             if context == "ui_state":
-                ui_state = await self.tools.get_clickables()
-                await ctx.set("ui_state", ui_state)
-                chat_history = await chat_utils.add_ui_text_block(
-                    ui_state, chat_history
-                )
+                try:
+                    state = await self.tools.get_state()
+                    await ctx.set("ui_state", state["a11y_tree"])
+                    chat_history = await chat_utils.add_ui_text_block(
+                        state["a11y_tree"], chat_history
+                    )
+                    chat_history = await chat_utils.add_phone_state_block(state["phone_state"], chat_history)
+                except Exception as e:
+                    logger.warning(f"⚠️ Error retrieving state from the connected device. Is the Accessibility Service enabled?")
             if context == "packages":
                 chat_history = await chat_utils.add_packages_block(
@@ -394,7 +403,7 @@ class CodeActAgent(Workflow):
                 logger.warning(f"Failed to capture final screenshot: {e}")
             try:
-                ui_state = await self.tools.get_clickables()
+                (a11y_tree, phone_state) = await self.tools.get_state()
             except Exception as e:
                 logger.warning(f"Failed to capture final UI state: {e}")
@@ -402,7 +411,7 @@ class CodeActAgent(Workflow):
             final_chat_history = [{"role": "system", "content": "Final state observation after task completion"}]
             final_response = {
                 "role": "user",
-                "content": f"Final State Observation:\nUI State: {ui_state}\nScreenshot: {'Available' if screenshot else 'Not available'}"
+                "content": f"Final State Observation:\nUI State: {a11y_tree}\nScreenshot: {'Available' if screenshot else 'Not available'}"
             }
             # Create final episodic memory step

droidrun/agent/context/personas/default.py CHANGED Viewed

@@ -21,7 +21,6 @@ DEFAULT = AgentPersona(
     required_context=[
         "ui_state",
         "screenshot",
-        "phone_state"
     ],
     user_prompt="""
     **Current Request:**
@@ -46,6 +45,7 @@ DEFAULT = AgentPersona(
     - **screenshots**: A visual screenshot of the current state of the Android screen. This provides visual context for what the user sees. screenshots won't be saved in the chat history. So, make sure to describe what you see and explain the key parts of your plan in your thoughts, as those will be saved and used to assist you in future steps.
     - **phone_state**: The current app you are navigating in. This tells you which application context you're working within.
     - **chat history**: You are also given the history of your actions (if any) from your previous steps.
+    - **execution result**: The result of your last Action
     NOTE: you don't have access to these inputs in your tool calling context
     ## Response Format:

droidrun/agent/droid/droid_agent.py CHANGED Viewed

@@ -61,6 +61,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
         personas: List[AgentPersona] = [DEFAULT],
         max_steps: int = 15,
         timeout: int = 1000,
+        vision: bool = False,
         reasoning: bool = False,
         reflection: bool = False,
         enable_tracing: bool = False,
@@ -101,6 +102,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
         self.goal = goal
         self.llm = llm
+        self.vision = vision
         self.max_steps = max_steps
         self.max_codeact_steps = max_steps
         self.timeout = timeout
@@ -128,6 +130,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
             self.planner_agent = PlannerAgent(
                 goal=goal,
                 llm=llm,
+                vision=vision,
                 personas=personas,
                 task_manager=self.task_manager,
                 tools_instance=tools,
@@ -171,6 +174,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
             codeact_agent = CodeActAgent(
                 llm=self.llm,
                 persona=persona,
+                vision=self.vision,
                 max_steps=self.max_codeact_steps,
                 all_tools_list=self.tool_list,
                 tools_instance=self.tools_instance,
@@ -200,7 +204,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
             if self.debug:
                 import traceback
                 logger.error(traceback.format_exc())
-            return CodeActResultEvent(success=False, reason=f"Error: {str(e)}", task=task, steps=result["codeact_steps"])
+            return CodeActResultEvent(success=False, reason=f"Error: {str(e)}", task=task, steps=[])
     @step
     async def handle_codeact_execute(self, ctx: Context, ev: CodeActResultEvent) -> FinalizeEvent | ReflectionEvent:
@@ -306,6 +310,7 @@ A wrapper class that coordinates between PlannerAgent (creates plans) and
             Dict containing the execution result
         """
         logger.info(f"🚀 Running DroidAgent to achieve goal: {self.goal}")
+        ctx.write_event_to_stream(ev)
         self.step_counter = 0
         self.retry_counter = 0

droidrun/agent/planner/planner_agent.py CHANGED Viewed

@@ -42,6 +42,7 @@ class PlannerAgent(Workflow):
         self,
         goal: str,
         llm: LLM,
+        vision: bool,
         personas: List[AgentPersona],
         task_manager: TaskManager,
         tools_instance: Tools,
@@ -57,6 +58,7 @@ class PlannerAgent(Workflow):
         self.goal = goal
         self.task_manager = task_manager
         self.debug = debug
+        self.vision = vision
         self.chat_memory = None
         self.remembered_info = None
@@ -128,12 +130,19 @@ class PlannerAgent(Workflow):
         self.steps_counter += 1
         logger.info(f"🧠 Thinking about how to plan the goal...")
-        screenshot = (await self.tools_instance.take_screenshot())[1]
-        ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
-        await ctx.set("screenshot", screenshot)
+        if self.vision:
+            screenshot = (await self.tools_instance.take_screenshot())[1]
+            ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
+            await ctx.set("screenshot", screenshot)
+        try:
+            state = await self.tools_instance.get_state()
+            await ctx.set("ui_state", state["a11y_tree"])
+            await ctx.set("phone_state", state["phone_state"])
+        except Exception as e:
+            logger.warning(f"⚠️ Error retrieving state from the connected device. Is the Accessibility Service enabled?")
-        await ctx.set("ui_state", await self.tools_instance.get_clickables())
-        await ctx.set("phone_state", await self.tools_instance.get_phone_state())
         await ctx.set("remembered_info", self.remembered_info)
         await ctx.set("reflection", self.reflection)
@@ -187,7 +196,11 @@ class PlannerAgent(Workflow):
                 await self.chat_memory.aput(
                     ChatMessage(
                         role="user",
-                        content=f"Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.",
+                        content="""Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.
+wrap your code inside this:
+```python
+<YOUR CODE HERE>
+```""",
                     )
                 )
                 logger.debug("🔄 Waiting for next plan or completion.")
@@ -196,7 +209,11 @@ class PlannerAgent(Workflow):
             await self.chat_memory.aput(
                 ChatMessage(
                     role="user",
-                    content=f"Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.",
+                    content="""Please either set new tasks using set_tasks_with_agents() or mark the goal as complete using complete_goal() if done.
+wrap your code inside this:
+```python
+<YOUR CODE HERE>
+```""",
                 )
             )
             logger.debug("🔄 Waiting for next plan or completion.")
@@ -224,15 +241,18 @@ class PlannerAgent(Workflow):
             logger.debug(f"  - Sending {len(chat_history)} messages to LLM.")
             model = self.llm.class_name()
-            if model != "DeepSeek":
-                chat_history = await chat_utils.add_screenshot_image_block(
-                    await ctx.get("screenshot"), chat_history
-                )
-            else:
+            if model == "DeepSeek":
                 logger.warning(
                     "[yellow]DeepSeek doesnt support images. Disabling screenshots[/]"
                 )
+            elif self.vision == True:
+                chat_history = await chat_utils.add_screenshot_image_block(
+                    await ctx.get("screenshot"), chat_history
+                )
             chat_history = await chat_utils.add_task_history_block(
                 self.task_manager.get_completed_tasks(),
                 self.task_manager.get_failed_tasks(),

droidrun/agent/utils/chat_utils.py CHANGED Viewed

@@ -132,24 +132,21 @@ async def add_phone_state_block(phone_state, chat_history: List[ChatMessage]) ->
     # Format the phone state data nicely
     if isinstance(phone_state, dict) and 'error' not in phone_state:
-        current_app = phone_state.get('currentApp', 'Unknown')
+        current_app = phone_state.get('currentApp', '')
         package_name = phone_state.get('packageName', 'Unknown')
         keyboard_visible = phone_state.get('keyboardVisible', False)
         focused_element = phone_state.get('focusedElement')
         # Format the focused element
         if focused_element:
-            element_text = focused_element.get('text', 'No text')
-            element_class = focused_element.get('className', 'Unknown')
-            element_bounds = focused_element.get('bounds', 'Unknown')
-            element_type = focused_element.get('type', 'unknown')
+            element_text = focused_element.get('text', '')
+            element_class = focused_element.get('className', '')
             element_resource_id = focused_element.get('resourceId', '')
             # Build focused element description
-            focused_desc = f"'{element_text}' ({element_class})"
+            focused_desc = f"'{element_text}' {element_class}"
             if element_resource_id:
                 focused_desc += f" | ID: {element_resource_id}"
-            focused_desc += f" | Bounds: {element_bounds} | Type: {element_type}"
         else:
             focused_desc = "None"

droidrun/cli/main.py CHANGED Viewed

@@ -11,7 +11,7 @@ from rich.console import Console
 from droidrun.agent.droid import DroidAgent
 from droidrun.agent.utils.llm_picker import load_llm
 from droidrun.adb import DeviceManager
-from droidrun.tools import AdbTools, IOSTools, Tools
+from droidrun.tools import AdbTools, IOSTools
 from functools import wraps
 from droidrun.cli.logs import LogHandler
@@ -59,6 +59,8 @@ async def run_command(
     model: str,
     steps: int,
     base_url: str,
+    api_base: str,
+    vision: bool,
     reasoning: bool,
     reflection: bool,
     tracing: bool,
@@ -101,7 +103,7 @@ async def run_command(
             # LLM setup
             log_handler.update_step("Initializing LLM...")
             llm = load_llm(
-                provider_name=provider, model=model, base_url=base_url, **kwargs
+                provider_name=provider, model=model, base_url=base_url, api_base=api_base, **kwargs
             )
             logger.info(f"🧠 LLM ready: {provider}/{model}")
@@ -120,6 +122,7 @@ async def run_command(
                 tools=tools,
                 max_steps=steps,
                 timeout=1000,
+                vision=vision,
                 reasoning=reasoning,
                 reflection=reflection,
                 enable_tracing=tracing,
@@ -176,14 +179,14 @@ class DroidRunCLI(click.Group):
 @click.option(
     "--provider",
     "-p",
-    help="LLM provider (OpenAI, Ollama, Anthropic, Gemini, DeepSeek)",
-    default="Gemini",
+    help="LLM provider (OpenAI, Ollama, Anthropic, GoogleGenAI, DeepSeek)",
+    default="GoogleGenAI",
 )
 @click.option(
     "--model",
     "-m",
     help="LLM model name",
-    default="models/gemini-2.5-pro",
+    default="models/gemini-2.5-flash",
 )
 @click.option("--temperature", type=float, help="Temperature for LLM", default=0.2)
 @click.option("--steps", type=int, help="Maximum number of steps", default=15)
@@ -194,7 +197,15 @@ class DroidRunCLI(click.Group):
     default=None,
 )
 @click.option(
-    "--reasoning", is_flag=True, help="Enable/disable planning with reasoning", default=False
+    "--api_base",
+    help="Base URL for API (e.g., OpenAI, OpenAI-Like)",
+    default=None,
+)
+@click.option(
+    "--vision", is_flag=True, help="Enable vision capabilites by using screenshots", default=False
+)
+@click.option(
+    "--reasoning", is_flag=True, help="Enable planning with reasoning", default=False
 )
 @click.option(
     "--reflection", is_flag=True, help="Enable reflection step for higher reasoning", default=False
@@ -218,7 +229,9 @@ def cli(
     model: str,
     steps: int,
     base_url: str,
+    api_base: str,
     temperature: float,
+    vision: bool,
     reasoning: bool,
     reflection: bool,
     tracing: bool,
@@ -235,14 +248,14 @@ def cli(
 @click.option(
     "--provider",
     "-p",
-    help="LLM provider (OpenAI, Ollama, Anthropic, Gemini, DeepSeek)",
-    default="Gemini",
+    help="LLM provider (OpenAI, Ollama, Anthropic, GoogleGenAI, DeepSeek)",
+    default="GoogleGenAI",
 )
 @click.option(
     "--model",
     "-m",
     help="LLM model name",
-    default="models/gemini-2.5-pro",
+    default="models/gemini-2.5-flash",
 )
 @click.option("--temperature", type=float, help="Temperature for LLM", default=0.2)
 @click.option("--steps", type=int, help="Maximum number of steps", default=15)
@@ -253,7 +266,15 @@ def cli(
     default=None,
 )
 @click.option(
-    "--reasoning", is_flag=True, help="Enable/disable planning with reasoning", default=False
+    "--api_base",
+    help="Base URL for API (e.g., OpenAI or OpenAI-Like)",
+    default=None,
+)
+@click.option(
+    "--vision", is_flag=True, help="Enable vision capabilites by using screenshots", default=False
+)
+@click.option(
+    "--reasoning", is_flag=True, help="Enable planning with reasoning", default=False
 )
 @click.option(
     "--reflection", is_flag=True, help="Enable reflection step for higher reasoning", default=False
@@ -280,7 +301,9 @@ def run(
     model: str,
     steps: int,
     base_url: str,
+    api_base: str,
     temperature: float,
+    vision: bool,
     reasoning: bool,
     reflection: bool,
     tracing: bool,
@@ -297,6 +320,8 @@ def run(
         model,
         steps,
         base_url,
+        api_base,
+        vision,
         reasoning,
         reflection,
         tracing,
@@ -381,9 +406,9 @@ async def setup(path: str, device: str | None):
                 f"[bold red]Error:[/] Could not get device object for {device}"
             )
             return
-        tools = Tools(serial=device)
         console.print(f"[bold blue]Step 1/2: Installing APK:[/] {path}")
-        result = await tools.install_app(path, False, True)
+        result = await device_obj.install_app(path, False, True)
         if "Error" in result:
             console.print(f"[bold red]Installation failed:[/] {result}")
@@ -449,13 +474,15 @@ if __name__ == "__main__":
     provider = "GoogleGenAI"
     model = "models/gemini-2.5-flash"
     temperature = 0
-    api_key = os.getenv("GEMINI_API_KEY")
+    api_key = os.getenv("GOOGLE_API_KEY")
     steps = 15
+    vision = True
     reasoning = True
     reflection = False
     tracing = True
     debug = True
     base_url = None
+    api_base = None
     ios = False
     run_command(
         command=command,
@@ -464,11 +491,13 @@ if __name__ == "__main__":
         model=model,
         steps=steps,
         temperature=temperature,
+        vision=vision,
         reasoning=reasoning,
         reflection=reflection,
         tracing=tracing,
         debug=debug,
         base_url=base_url,
+        api_base=api_base,
         api_key=api_key,
         ios=ios
     )

droidrun 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

droidrun 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl