droidrun 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. droidrun/__init__.py +16 -11
  2. droidrun/__main__.py +1 -1
  3. droidrun/adb/__init__.py +3 -3
  4. droidrun/adb/device.py +1 -1
  5. droidrun/adb/manager.py +2 -2
  6. droidrun/agent/__init__.py +6 -0
  7. droidrun/agent/codeact/__init__.py +2 -4
  8. droidrun/agent/codeact/codeact_agent.py +321 -235
  9. droidrun/agent/codeact/events.py +12 -20
  10. droidrun/agent/codeact/prompts.py +0 -52
  11. droidrun/agent/common/default.py +5 -0
  12. droidrun/agent/common/events.py +4 -0
  13. droidrun/agent/context/__init__.py +23 -0
  14. droidrun/agent/context/agent_persona.py +15 -0
  15. droidrun/agent/context/context_injection_manager.py +66 -0
  16. droidrun/agent/context/episodic_memory.py +15 -0
  17. droidrun/agent/context/personas/__init__.py +11 -0
  18. droidrun/agent/context/personas/app_starter.py +44 -0
  19. droidrun/agent/context/personas/default.py +95 -0
  20. droidrun/agent/context/personas/extractor.py +52 -0
  21. droidrun/agent/context/personas/ui_expert.py +107 -0
  22. droidrun/agent/context/reflection.py +20 -0
  23. droidrun/agent/context/task_manager.py +124 -0
  24. droidrun/agent/context/todo.txt +4 -0
  25. droidrun/agent/droid/__init__.py +2 -2
  26. droidrun/agent/droid/droid_agent.py +264 -325
  27. droidrun/agent/droid/events.py +28 -0
  28. droidrun/agent/oneflows/reflector.py +265 -0
  29. droidrun/agent/planner/__init__.py +2 -4
  30. droidrun/agent/planner/events.py +9 -13
  31. droidrun/agent/planner/planner_agent.py +268 -0
  32. droidrun/agent/planner/prompts.py +33 -53
  33. droidrun/agent/utils/__init__.py +3 -0
  34. droidrun/agent/utils/async_utils.py +1 -40
  35. droidrun/agent/utils/chat_utils.py +268 -48
  36. droidrun/agent/utils/executer.py +49 -14
  37. droidrun/agent/utils/llm_picker.py +14 -10
  38. droidrun/agent/utils/trajectory.py +184 -0
  39. droidrun/cli/__init__.py +1 -1
  40. droidrun/cli/logs.py +283 -0
  41. droidrun/cli/main.py +333 -439
  42. droidrun/run.py +105 -0
  43. droidrun/tools/__init__.py +5 -10
  44. droidrun/tools/{actions.py → adb.py} +279 -238
  45. droidrun/tools/ios.py +594 -0
  46. droidrun/tools/tools.py +99 -0
  47. droidrun-0.3.0.dist-info/METADATA +149 -0
  48. droidrun-0.3.0.dist-info/RECORD +52 -0
  49. droidrun/agent/planner/task_manager.py +0 -355
  50. droidrun/agent/planner/workflow.py +0 -371
  51. droidrun/tools/device.py +0 -29
  52. droidrun/tools/loader.py +0 -60
  53. droidrun-0.2.0.dist-info/METADATA +0 -373
  54. droidrun-0.2.0.dist-info/RECORD +0 -32
  55. {droidrun-0.2.0.dist-info → droidrun-0.3.0.dist-info}/WHEEL +0 -0
  56. {droidrun-0.2.0.dist-info → droidrun-0.3.0.dist-info}/entry_points.txt +0 -0
  57. {droidrun-0.2.0.dist-info → droidrun-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -11,13 +11,15 @@ import asyncio
11
11
  import aiofiles
12
12
  import contextlib
13
13
  from typing import Optional, Dict, Tuple, List, Any
14
- from ..adb import Device, DeviceManager
14
+ from droidrun.adb.device import Device
15
+ from droidrun.adb.manager import DeviceManager
16
+ from droidrun.tools.tools import Tools
15
17
 
16
18
 
17
- class Tools:
19
+ class AdbTools(Tools):
18
20
  """Core UI interaction tools for Android device control."""
19
21
 
20
- def __init__(self, serial: str) -> None:
22
+ def __init__(self, serial: str = "emulator-5554") -> None:
21
23
  # Instance‐level cache for clickable elements (index-based tapping)
22
24
  self.clickable_elements_cache: List[Dict[str, Any]] = []
23
25
  self.serial = serial
@@ -28,30 +30,29 @@ class Tools:
28
30
  self.finished = False
29
31
  # Memory storage for remembering important information
30
32
  self.memory: List[str] = []
33
+ # Store all screenshots with timestamps
34
+ self.screenshots: List[Dict[str, Any]] = []
31
35
 
32
36
  def get_device_serial(self) -> str:
33
37
  """Get the device serial from the instance or environment variable."""
34
38
  # First try using the instance's serial
35
39
  if self.serial:
36
40
  return self.serial
37
-
38
- # Fall back to environment variable if not set on the instance
39
- return os.environ.get("DROIDRUN_DEVICE_SERIAL", "")
40
41
 
41
42
  async def get_device(self) -> Optional[Device]:
42
43
  """Get the device instance using the instance's serial or from environment variable.
43
-
44
+
44
45
  Returns:
45
46
  Device instance or None if not found
46
47
  """
47
48
  serial = self.get_device_serial()
48
49
  if not serial:
49
- raise ValueError("No device serial specified - set DROIDRUN_DEVICE_SERIAL environment variable or provide device_serial parameter")
50
-
50
+ raise ValueError("No device serial specified - set device_serial parameter")
51
+
51
52
  device = await self.device_manager.get_device(serial)
52
53
  if not device:
53
54
  raise ValueError(f"Device {serial} not found")
54
-
55
+
55
56
  return device
56
57
 
57
58
  def parse_package_list(self, output: str) -> List[Dict[str, str]]:
@@ -76,255 +77,257 @@ class Tools:
76
77
  async def get_clickables(self, serial: Optional[str] = None) -> str:
77
78
  """
78
79
  Get all clickable UI elements from the device using the custom TopViewService.
79
-
80
+
80
81
  This function interacts with the TopViewService app installed on the device
81
82
  to capture UI elements. The service writes UI data to a JSON file on the device,
82
83
  which is then pulled to the host. If no elements are found initially, it will
83
84
  retry for up to 30 seconds.
84
-
85
+
85
86
  Args:
86
87
  serial: Optional device serial number
87
-
88
+
88
89
  Returns:
89
90
  JSON string containing UI elements extracted from the device screen
90
91
  """
91
92
  try:
92
93
  # Get the device
93
94
  if serial:
94
- from droidrun.adb import DeviceManager
95
95
  device_manager = DeviceManager()
96
96
  device = await device_manager.get_device(serial)
97
97
  if not device:
98
98
  raise ValueError(f"Device {serial} not found")
99
99
  else:
100
100
  device = await self.get_device()
101
-
101
+
102
102
  # Create a temporary file for the JSON
103
103
  with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as temp:
104
104
  local_path = temp.name
105
-
105
+
106
106
  try:
107
107
  # Set retry parameters
108
108
  max_total_time = 30 # Maximum total time to try in seconds
109
109
  retry_interval = 1.0 # Time between retries in seconds
110
110
  start_total_time = asyncio.get_event_loop().time()
111
-
111
+
112
112
  while True:
113
113
  # Check if we've exceeded total time
114
114
  current_time = asyncio.get_event_loop().time()
115
115
  if current_time - start_total_time > max_total_time:
116
- raise ValueError(f"Failed to get UI elements after {max_total_time} seconds of retries")
117
-
116
+ raise ValueError(
117
+ f"Failed to get UI elements after {max_total_time} seconds of retries"
118
+ )
119
+
118
120
  # Clear logcat to make it easier to find our output
119
121
  await device._adb.shell(device._serial, "logcat -c")
120
-
122
+
121
123
  # Trigger the custom service via broadcast to get only interactive elements
122
- await device._adb.shell(device._serial, "am broadcast -a com.droidrun.portal.GET_ELEMENTS")
123
-
124
+ await device._adb.shell(
125
+ device._serial,
126
+ "am broadcast -a com.droidrun.portal.GET_ELEMENTS",
127
+ )
128
+
124
129
  # Poll for the JSON file path
125
130
  start_time = asyncio.get_event_loop().time()
126
131
  max_wait_time = 10 # Maximum wait time in seconds
127
132
  poll_interval = 0.2 # Check every 200ms
128
-
133
+
129
134
  device_path = None
130
135
  while asyncio.get_event_loop().time() - start_time < max_wait_time:
131
136
  # Check logcat for the file path
132
- logcat_output = await device._adb.shell(device._serial, "logcat -d | grep \"DROIDRUN_FILE\" | grep \"JSON data written to\" | tail -1")
133
-
137
+ logcat_output = await device._adb.shell(
138
+ device._serial,
139
+ 'logcat -d | grep "DROIDRUN_FILE" | grep "JSON data written to" | tail -1',
140
+ )
141
+
134
142
  # Parse the file path if present
135
143
  match = re.search(r"JSON data written to: (.*)", logcat_output)
136
144
  if match:
137
145
  device_path = match.group(1).strip()
138
146
  break
139
-
147
+
140
148
  # Wait before polling again
141
149
  await asyncio.sleep(poll_interval)
142
-
150
+
143
151
  # Check if we found the file path
144
152
  if not device_path:
145
153
  await asyncio.sleep(retry_interval)
146
154
  continue
147
-
155
+
148
156
  # Pull the JSON file from the device
149
157
  await device._adb.pull_file(device._serial, device_path, local_path)
150
-
158
+
151
159
  # Read the JSON file
152
160
  async with aiofiles.open(local_path, "r", encoding="utf-8") as f:
153
161
  json_content = await f.read()
154
-
162
+
155
163
  # Try to parse the JSON
156
164
  try:
157
165
  ui_data = json.loads(json_content)
158
-
166
+
159
167
  # Filter out the "type" attribute from all elements
160
168
  filtered_data = []
161
169
  for element in ui_data:
162
170
  # Create a copy of the element without the "type" attribute
163
- filtered_element = {k: v for k, v in element.items() if k != "type"}
164
-
171
+ filtered_element = {
172
+ k: v for k, v in element.items() if k != "type"
173
+ }
174
+
165
175
  # Also filter children if present
166
176
  if "children" in filtered_element:
167
177
  filtered_element["children"] = [
168
178
  {k: v for k, v in child.items() if k != "type"}
169
179
  for child in filtered_element["children"]
170
180
  ]
171
-
181
+
172
182
  filtered_data.append(filtered_element)
173
-
183
+
174
184
  # If we got elements, store them and return
175
185
  if filtered_data:
176
186
  # Store the filtered UI data in cache
177
187
  global CLICKABLE_ELEMENTS_CACHE
178
188
  CLICKABLE_ELEMENTS_CACHE = filtered_data
179
-
189
+
180
190
  # Add a small sleep to ensure UI is fully loaded/processed
181
191
  await asyncio.sleep(0.5) # 500ms sleep
182
-
192
+
183
193
  # Convert the dictionary to a JSON string before returning
184
- result = {
185
- "clickable_elements": filtered_data,
186
- "count": len(filtered_data),
187
- "message": f"Found {len(filtered_data)} UI elements after retrying"
188
- }
189
-
190
- return result
191
-
194
+
195
+ return filtered_data
196
+
192
197
  # If no elements found, wait and retry
193
198
  await asyncio.sleep(retry_interval)
194
-
199
+
195
200
  except json.JSONDecodeError:
196
201
  # If JSON parsing failed, wait and retry
197
202
  await asyncio.sleep(retry_interval)
198
203
  continue
199
-
204
+
200
205
  except Exception as e:
201
206
  # Clean up in case of error
202
207
  with contextlib.suppress(OSError):
203
208
  os.unlink(local_path)
204
209
  raise ValueError(f"Error retrieving clickable elements: {e}")
205
-
210
+
206
211
  except Exception as e:
207
212
  raise ValueError(f"Error getting clickable elements: {e}")
208
213
 
209
-
210
214
  async def tap_by_index(self, index: int, serial: Optional[str] = None) -> str:
211
215
  """
212
216
  Tap on a UI element by its index.
213
-
217
+
214
218
  This function uses the cached clickable elements
215
219
  to find the element with the given index and tap on its center coordinates.
216
-
220
+
217
221
  Args:
218
222
  index: Index of the element to tap
219
-
223
+
220
224
  Returns:
221
225
  Result message
222
226
  """
223
-
227
+
224
228
  def collect_all_indices(elements):
225
229
  """Recursively collect all indices from elements and their children."""
226
230
  indices = []
227
231
  for item in elements:
228
- if item.get('index') is not None:
229
- indices.append(item.get('index'))
232
+ if item.get("index") is not None:
233
+ indices.append(item.get("index"))
230
234
  # Check children if present
231
- children = item.get('children', [])
235
+ children = item.get("children", [])
232
236
  indices.extend(collect_all_indices(children))
233
237
  return indices
234
238
 
235
239
  def find_element_by_index(elements, target_index):
236
240
  """Recursively find an element with the given index."""
237
241
  for item in elements:
238
- if item.get('index') == target_index:
242
+ if item.get("index") == target_index:
239
243
  return item
240
244
  # Check children if present
241
- children = item.get('children', [])
245
+ children = item.get("children", [])
242
246
  result = find_element_by_index(children, target_index)
243
247
  if result:
244
248
  return result
245
249
  return None
246
-
250
+
247
251
  try:
248
252
  # Check if we have cached elements
249
253
  if not CLICKABLE_ELEMENTS_CACHE:
250
254
  return "Error: No UI elements cached. Call get_clickables first."
251
-
255
+
252
256
  # Find the element with the given index (including in children)
253
257
  element = find_element_by_index(CLICKABLE_ELEMENTS_CACHE, index)
254
-
258
+
255
259
  if not element:
256
260
  # List available indices to help the user
257
261
  indices = sorted(collect_all_indices(CLICKABLE_ELEMENTS_CACHE))
258
262
  indices_str = ", ".join(str(idx) for idx in indices[:20])
259
263
  if len(indices) > 20:
260
264
  indices_str += f"... and {len(indices) - 20} more"
261
-
265
+
262
266
  return f"Error: No element found with index {index}. Available indices: {indices_str}"
263
-
267
+
264
268
  # Get the bounds of the element
265
- bounds_str = element.get('bounds')
269
+ bounds_str = element.get("bounds")
266
270
  if not bounds_str:
267
- element_text = element.get('text', 'No text')
268
- element_type = element.get('type', 'unknown')
269
- element_class = element.get('className', 'Unknown class')
271
+ element_text = element.get("text", "No text")
272
+ element_type = element.get("type", "unknown")
273
+ element_class = element.get("className", "Unknown class")
270
274
  return f"Error: Element with index {index} ('{element_text}', {element_class}, type: {element_type}) has no bounds and cannot be tapped"
271
-
275
+
272
276
  # Parse the bounds (format: "left,top,right,bottom")
273
277
  try:
274
- left, top, right, bottom = map(int, bounds_str.split(','))
278
+ left, top, right, bottom = map(int, bounds_str.split(","))
275
279
  except ValueError:
276
280
  return f"Error: Invalid bounds format for element with index {index}: {bounds_str}"
277
-
281
+
278
282
  # Calculate the center of the element
279
283
  x = (left + right) // 2
280
284
  y = (top + bottom) // 2
281
-
285
+
282
286
  # Get the device and tap at the coordinates
283
287
  if serial:
284
- from droidrun.adb import DeviceManager
285
288
  device_manager = DeviceManager()
286
289
  device = await device_manager.get_device(serial)
287
290
  if not device:
288
291
  return f"Error: Device {serial} not found"
289
292
  else:
290
293
  device = await self.get_device()
291
-
294
+
292
295
  await device.tap(x, y)
293
-
296
+
294
297
  # Add a small delay to allow UI to update
295
298
  await asyncio.sleep(0.5)
296
-
297
-
299
+
298
300
  # Create a descriptive response
299
301
  response_parts = []
300
302
  response_parts.append(f"Tapped element with index {index}")
301
303
  response_parts.append(f"Text: '{element.get('text', 'No text')}'")
302
304
  response_parts.append(f"Class: {element.get('className', 'Unknown class')}")
303
305
  response_parts.append(f"Type: {element.get('type', 'unknown')}")
304
-
306
+
305
307
  # Add information about children if present
306
- children = element.get('children', [])
308
+ children = element.get("children", [])
307
309
  if children:
308
- child_texts = [child.get('text') for child in children if child.get('text')]
310
+ child_texts = [
311
+ child.get("text") for child in children if child.get("text")
312
+ ]
309
313
  if child_texts:
310
314
  response_parts.append(f"Contains text: {' | '.join(child_texts)}")
311
-
315
+
312
316
  response_parts.append(f"Coordinates: ({x}, {y})")
313
-
317
+
314
318
  return " | ".join(response_parts)
315
319
  except ValueError as e:
316
320
  return f"Error: {str(e)}"
317
321
 
318
-
319
322
  # Rename the old tap function to tap_by_coordinates for backward compatibility
320
323
  async def tap_by_coordinates(self, x: int, y: int) -> bool:
321
324
  """
322
- Tap on the device screen at specific coordinates.
323
-
325
+ Tap on the device screen at specific coordinates.
326
+
324
327
  Args:
325
328
  x: X coordinate
326
329
  y: Y coordinate
327
-
330
+
328
331
  Returns:
329
332
  Bool indicating success or failure
330
333
  """
@@ -336,7 +339,7 @@ class Tools:
336
339
  return f"Error: Device {self.serial} not found"
337
340
  else:
338
341
  device = await self.get_device()
339
-
342
+
340
343
  await device.tap(x, y)
341
344
  print(f"Tapped at coordinates ({x}, {y})")
342
345
  return True
@@ -348,29 +351,24 @@ class Tools:
348
351
  async def tap(self, index: int) -> str:
349
352
  """
350
353
  Tap on a UI element by its index.
351
-
354
+
352
355
  This function uses the cached clickable elements from the last get_clickables call
353
356
  to find the element with the given index and tap on its center coordinates.
354
-
357
+
355
358
  Args:
356
359
  index: Index of the element to tap
357
-
360
+
358
361
  Returns:
359
362
  Result message
360
363
  """
361
364
  return await self.tap_by_index(index)
362
365
 
363
366
  async def swipe(
364
- self,
365
- start_x: int,
366
- start_y: int,
367
- end_x: int,
368
- end_y: int,
369
- duration_ms: int = 300
367
+ self, start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int = 300
370
368
  ) -> bool:
371
369
  """
372
370
  Performs a straight-line swipe gesture on the device screen.
373
- To perform a hold (long press), set the start and end coordinates to the same values and increase the duration as needed.
371
+ To perform a hold (long press), set the start and end coordinates to the same values and increase the duration as needed.
374
372
  Args:
375
373
  start_x: Starting X coordinate
376
374
  start_y: Starting Y coordinate
@@ -388,8 +386,9 @@ class Tools:
388
386
  return f"Error: Device {self.serial} not found"
389
387
  else:
390
388
  device = await self.get_device()
391
-
389
+
392
390
  await device.swipe(start_x, start_y, end_x, end_y, duration_ms)
391
+ await asyncio.sleep(1)
393
392
  print(f"Swiped from ({start_x}, {start_y}) to ({end_x}, {end_y}) in {duration_ms}ms")
394
393
  return True
395
394
  except ValueError as e:
@@ -398,12 +397,12 @@ class Tools:
398
397
 
399
398
  async def input_text(self, text: str, serial: Optional[str] = None) -> str:
400
399
  """
401
- Input text on the device using Base64 encoding and broadcast intent.
402
-
400
+ Input text on the device.
401
+ Always make sure that the Focused Element is not None before inputting text.
402
+
403
403
  Args:
404
404
  text: Text to input. Can contain spaces, newlines, and special characters including non-ASCII.
405
- serial: Optional device serial (for backward compatibility)
406
-
405
+
407
406
  Returns:
408
407
  Result message
409
408
  """
@@ -415,53 +414,75 @@ class Tools:
415
414
  return f"Error: Device {serial} not found"
416
415
  else:
417
416
  device = await self.get_device()
418
-
417
+
419
418
  # Save the current keyboard
420
- original_ime = await device._adb.shell(device._serial, "settings get secure default_input_method")
419
+ original_ime = await device._adb.shell(
420
+ device._serial, "settings get secure default_input_method"
421
+ )
421
422
  original_ime = original_ime.strip()
422
-
423
+
423
424
  # Enable the Droidrun keyboard
424
- await device._adb.shell(device._serial, "ime enable com.droidrun.portal/.DroidrunKeyboardIME")
425
-
425
+ await device._adb.shell(
426
+ device._serial, "ime enable com.droidrun.portal/.DroidrunKeyboardIME"
427
+ )
428
+
426
429
  # Set the Droidrun keyboard as the default
427
- await device._adb.shell(device._serial, "ime set com.droidrun.portal/.DroidrunKeyboardIME")
428
-
430
+ await device._adb.shell(
431
+ device._serial, "ime set com.droidrun.portal/.DroidrunKeyboardIME"
432
+ )
433
+
429
434
  # Wait for keyboard to change
430
435
  await asyncio.sleep(0.2)
431
-
436
+
432
437
  # Encode the text to Base64
433
438
  import base64
439
+
434
440
  encoded_text = base64.b64encode(text.encode()).decode()
435
-
436
- # Send the broadcast intent with the Base64-encoded text
437
- cmd = f'am broadcast -a DROIDRUN_INPUT_B64 --es msg "{encoded_text}"'
441
+
442
+ cmd = f'am broadcast -a com.droidrun.portal.DROIDRUN_INPUT_B64 --es msg "{encoded_text}" -p com.droidrun.portal'
438
443
  await device._adb.shell(device._serial, cmd)
439
-
444
+
440
445
  # Wait for text input to complete
441
446
  await asyncio.sleep(0.5)
442
-
447
+
443
448
  # Restore the original keyboard
444
449
  if original_ime and "com.droidrun.portal" not in original_ime:
445
450
  await device._adb.shell(device._serial, f"ime set {original_ime}")
446
-
451
+
447
452
  return f"Text input completed: {text[:50]}{'...' if len(text) > 50 else ''}"
448
453
  except ValueError as e:
449
454
  return f"Error: {str(e)}"
450
455
  except Exception as e:
451
456
  return f"Error sending text input: {str(e)}"
452
457
 
458
+ async def back(self) -> str:
459
+ """
460
+ Go back on the current view.
461
+ This presses the Android back button.
462
+ """
463
+ try:
464
+ if self.serial:
465
+ device_manager = DeviceManager()
466
+ device = await device_manager.get_device(self.serial)
467
+ if not device:
468
+ return f"Error: Device {self.serial} not found"
469
+ else:
470
+ device = await self.get_device()
471
+
472
+ await device.press_key(3)
473
+ return f"Pressed key BACK"
474
+ except ValueError as e:
475
+ return f"Error: {str(e)}"
476
+
453
477
  async def press_key(self, keycode: int) -> str:
454
478
  """
455
- Press a key on the device.
456
-
479
+ Press a key on the Android device.
480
+
457
481
  Common keycodes:
458
- - 3: HOME
459
482
  - 4: BACK
460
- - 24: VOLUME UP
461
- - 25: VOLUME DOWN
462
- - 26: POWER
463
- - 82: MENU
464
-
483
+ - 66: ENTER
484
+ - 67: DELETE
485
+
465
486
  Args:
466
487
  keycode: Android keycode to press
467
488
  """
@@ -473,30 +494,23 @@ class Tools:
473
494
  return f"Error: Device {self.serial} not found"
474
495
  else:
475
496
  device = await self.get_device()
476
-
497
+
477
498
  key_names = {
478
- 3: "HOME",
499
+ 66: "ENTER",
479
500
  4: "BACK",
480
- 24: "VOLUME UP",
481
- 25: "VOLUME DOWN",
482
- 26: "POWER",
483
- 82: "MENU",
501
+ 67: "DELETE",
484
502
  }
485
503
  key_name = key_names.get(keycode, str(keycode))
486
-
504
+
487
505
  await device.press_key(keycode)
488
506
  return f"Pressed key {key_name}"
489
507
  except ValueError as e:
490
508
  return f"Error: {str(e)}"
491
509
 
492
- async def start_app(
493
- self,
494
- package: str,
495
- activity: str = ""
496
- ) -> str:
510
+ async def start_app(self, package: str, activity: str = "") -> str:
497
511
  """
498
512
  Start an app on the device.
499
-
513
+
500
514
  Args:
501
515
  package: Package name (e.g., "com.android.settings")
502
516
  activity: Optional activity name
@@ -509,21 +523,18 @@ class Tools:
509
523
  return f"Error: Device {self.serial} not found"
510
524
  else:
511
525
  device = await self.get_device()
512
-
526
+
513
527
  result = await device.start_app(package, activity)
514
528
  return result
515
529
  except ValueError as e:
516
530
  return f"Error: {str(e)}"
517
531
 
518
532
  async def install_app(
519
- self,
520
- apk_path: str,
521
- reinstall: bool = False,
522
- grant_permissions: bool = True
533
+ self, apk_path: str, reinstall: bool = False, grant_permissions: bool = True
523
534
  ) -> str:
524
535
  """
525
536
  Install an app on the device.
526
-
537
+
527
538
  Args:
528
539
  apk_path: Path to the APK file
529
540
  reinstall: Whether to reinstall if app exists
@@ -537,25 +548,20 @@ class Tools:
537
548
  return f"Error: Device {self.serial} not found"
538
549
  else:
539
550
  device = await self.get_device()
540
-
551
+
541
552
  if not os.path.exists(apk_path):
542
553
  return f"Error: APK file not found at {apk_path}"
543
-
554
+
544
555
  result = await device.install_app(apk_path, reinstall, grant_permissions)
545
556
  return result
546
557
  except ValueError as e:
547
558
  return f"Error: {str(e)}"
548
559
 
549
- async def take_screenshot(self) -> bool:
560
+ async def take_screenshot(self) -> Tuple[str, bytes]:
550
561
  """
551
562
  Take a screenshot of the device.
552
-
553
563
  This function captures the current screen and adds the screenshot to context in the next message.
554
-
555
- This does not save the screenshot anywhere on the phone, it just attaches it to the next message.
556
-
557
- Returns:
558
- True if successful, False otherwise
564
+ Also stores the screenshot in the screenshots list with timestamp for later GIF creation.
559
565
  """
560
566
  try:
561
567
  if self.serial:
@@ -567,20 +573,26 @@ class Tools:
567
573
  device = await self.get_device()
568
574
  screen_tuple = await device.take_screenshot()
569
575
  self.last_screenshot = screen_tuple[1]
570
- return True
576
+
577
+ # Store screenshot with timestamp
578
+ self.screenshots.append(
579
+ {
580
+ "timestamp": time.time(),
581
+ "image_data": screen_tuple[1],
582
+ "format": screen_tuple[0], # Usually 'PNG'
583
+ }
584
+ )
585
+ return screen_tuple
571
586
  except ValueError as e:
572
587
  raise ValueError(f"Error taking screenshot: {str(e)}")
573
588
 
574
- async def list_packages(
575
- self,
576
- include_system_apps: bool = False
577
- ) -> List[str]:
589
+ async def list_packages(self, include_system_apps: bool = False) -> List[str]:
578
590
  """
579
591
  List installed packages on the device.
580
-
592
+
581
593
  Args:
582
594
  include_system_apps: Whether to include system apps (default: False)
583
-
595
+
584
596
  Returns:
585
597
  List of package names
586
598
  """
@@ -592,32 +604,32 @@ class Tools:
592
604
  raise ValueError(f"Device {self.serial} not found")
593
605
  else:
594
606
  device = await self.get_device()
595
-
607
+
596
608
  # Use the direct ADB command to get packages with paths
597
609
  cmd = ["pm", "list", "packages", "-f"]
598
610
  if not include_system_apps:
599
611
  cmd.append("-3")
600
-
612
+
601
613
  output = await device._adb.shell(device._serial, " ".join(cmd))
602
-
614
+
603
615
  # Parse the package list using the function
604
- packages = self.parse_package_list(output)
616
+ packages = self.parse_package_list(output)
605
617
  # Format package list for better readability
606
618
  package_list = [pack["package"] for pack in packages]
607
- print(f"Returning {len(package_list)} packages")
619
+ print(f"Returning {len(package_list)} packages")
608
620
  return package_list
609
621
  except ValueError as e:
610
622
  raise ValueError(f"Error listing packages: {str(e)}")
611
623
 
612
624
  async def extract(self, filename: Optional[str] = None) -> str:
613
625
  """Extract and save the current UI state to a JSON file.
614
-
626
+
615
627
  This function captures the current UI state including all UI elements
616
628
  and saves it to a JSON file for later analysis or reference.
617
-
629
+
618
630
  Args:
619
631
  filename: Optional filename to save the UI state (defaults to ui_state_TIMESTAMP.json)
620
-
632
+
621
633
  Returns:
622
634
  Path to the saved JSON file
623
635
  """
@@ -626,32 +638,32 @@ class Tools:
626
638
  if not filename:
627
639
  timestamp = int(time.time())
628
640
  filename = f"ui_state_{timestamp}.json"
629
-
641
+
630
642
  # Ensure the filename ends with .json
631
643
  if not filename.endswith(".json"):
632
644
  filename += ".json"
633
-
645
+
634
646
  # Get the UI elements
635
647
  ui_elements = await self.get_all_elements(self.serial)
636
-
648
+
637
649
  # Save to file
638
650
  save_path = os.path.abspath(filename)
639
651
  async with aiofiles.open(save_path, "w", encoding="utf-8") as f:
640
652
  await f.write(json.dumps(ui_elements, indent=2))
641
-
653
+
642
654
  return f"UI state extracted and saved to {save_path}"
643
-
655
+
644
656
  except Exception as e:
645
657
  return f"Error extracting UI state: {e}"
646
658
 
647
659
  async def get_all_elements(self) -> Dict[str, Any]:
648
660
  """
649
661
  Get all UI elements from the device, including non-interactive elements.
650
-
662
+
651
663
  This function interacts with the TopViewService app installed on the device
652
664
  to capture all UI elements, even those that are not interactive. This provides
653
665
  a complete view of the UI hierarchy for analysis or debugging purposes.
654
-
666
+
655
667
  Returns:
656
668
  Dictionary containing all UI elements extracted from the device screen
657
669
  """
@@ -661,85 +673,98 @@ class Tools:
661
673
  device = await device_manager.get_device(self.serial)
662
674
  if not device:
663
675
  raise ValueError(f"Device {self.serial} not found")
664
-
676
+
665
677
  # Create a temporary file for the JSON
666
678
  with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as temp:
667
679
  local_path = temp.name
668
-
680
+
669
681
  try:
670
682
  # Clear logcat to make it easier to find our output
671
683
  await device._adb.shell(device._serial, "logcat -c")
672
-
684
+
673
685
  # Trigger the custom service via broadcast to get ALL elements
674
- await device._adb.shell(device._serial, "am broadcast -a com.droidrun.portal.GET_ALL_ELEMENTS")
675
-
686
+ await device._adb.shell(
687
+ device._serial,
688
+ "am broadcast -a com.droidrun.portal.GET_ALL_ELEMENTS",
689
+ )
690
+
676
691
  # Poll for the JSON file path
677
692
  start_time = asyncio.get_event_loop().time()
678
693
  max_wait_time = 10 # Maximum wait time in seconds
679
694
  poll_interval = 0.2 # Check every 200ms
680
-
695
+
681
696
  device_path = None
682
697
  while asyncio.get_event_loop().time() - start_time < max_wait_time:
683
698
  # Check logcat for the file path
684
- logcat_output = await device._adb.shell(device._serial, "logcat -d | grep \"DROIDRUN_FILE\" | grep \"JSON data written to\" | tail -1")
685
-
699
+ logcat_output = await device._adb.shell(
700
+ device._serial,
701
+ 'logcat -d | grep "DROIDRUN_FILE" | grep "JSON data written to" | tail -1',
702
+ )
703
+
686
704
  # Parse the file path if present
687
705
  match = re.search(r"JSON data written to: (.*)", logcat_output)
688
706
  if match:
689
707
  device_path = match.group(1).strip()
690
708
  break
691
-
709
+
692
710
  # Wait before polling again
693
711
  await asyncio.sleep(poll_interval)
694
-
712
+
695
713
  # Check if we found the file path
696
714
  if not device_path:
697
- raise ValueError(f"Failed to find the JSON file path in logcat after {max_wait_time} seconds")
698
-
715
+ raise ValueError(
716
+ f"Failed to find the JSON file path in logcat after {max_wait_time} seconds"
717
+ )
718
+
699
719
  # Pull the JSON file from the device
700
720
  await device._adb.pull_file(device._serial, device_path, local_path)
701
-
721
+
702
722
  # Read the JSON file
703
723
  async with aiofiles.open(local_path, "r", encoding="utf-8") as f:
704
724
  json_content = await f.read()
705
-
725
+
706
726
  # Clean up the temporary file
707
727
  with contextlib.suppress(OSError):
708
728
  os.unlink(local_path)
709
-
729
+
710
730
  # Try to parse the JSON
711
731
  import json
732
+
712
733
  try:
713
734
  ui_data = json.loads(json_content)
714
-
735
+
715
736
  return {
716
737
  "all_elements": ui_data,
717
- "count": len(ui_data) if isinstance(ui_data, list) else sum(1 for _ in ui_data.get("elements", [])),
718
- "message": "Retrieved all UI elements from the device screen"
738
+ "count": (
739
+ len(ui_data)
740
+ if isinstance(ui_data, list)
741
+ else sum(1 for _ in ui_data.get("elements", []))
742
+ ),
743
+ "message": "Retrieved all UI elements from the device screen",
719
744
  }
720
745
  except json.JSONDecodeError:
721
746
  raise ValueError("Failed to parse UI elements JSON data")
722
-
747
+
723
748
  except Exception as e:
724
749
  # Clean up in case of error
725
750
  with contextlib.suppress(OSError):
726
751
  os.unlink(local_path)
727
752
  raise ValueError(f"Error retrieving all UI elements: {e}")
728
-
753
+
729
754
  except Exception as e:
730
755
  raise ValueError(f"Error getting all UI elements: {e}")
731
-
756
+
732
757
  def complete(self, success: bool, reason: str = ""):
733
758
  """
734
759
  Mark the task as finished.
735
760
 
736
761
  Args:
737
762
  success: Indicates if the task was successful.
738
- reason: Reason for failure, if any. (required if success is False)
763
+ reason: Reason for failure/success
739
764
  """
740
765
  if success:
741
766
  self.success = True
742
- self.reason = self.reason or "Task completed successfully."
767
+ self.reason = reason or "Task completed successfully."
743
768
  self.finished = True
744
769
  else:
745
770
  self.success = False
@@ -748,14 +773,13 @@ class Tools:
748
773
  self.reason = reason
749
774
  self.finished = True
750
775
 
751
-
752
776
  async def get_phone_state(self, serial: Optional[str] = None) -> Dict[str, Any]:
753
777
  """
754
778
  Get the current phone state including current activity and keyboard visibility.
755
-
779
+
756
780
  Args:
757
781
  serial: Optional device serial number
758
-
782
+
759
783
  Returns:
760
784
  Dictionary with current phone state information
761
785
  """
@@ -768,71 +792,88 @@ class Tools:
768
792
  raise ValueError(f"Device {serial} not found")
769
793
  else:
770
794
  device = await self.get_device()
771
-
772
- # Get the top resumed activity
773
- activity_output = await device._adb.shell(device._serial, "dumpsys activity activities | grep topResumedActivity")
774
-
775
- if not activity_output:
776
- # Try alternative command for older Android versions
777
- activity_output = await device._adb.shell(device._serial, "dumpsys activity activities | grep ResumedActivity")
778
-
779
- # Get keyboard visibility state
780
- keyboard_output = await device._adb.shell(device._serial, "dumpsys input_method | grep mInputShown")
781
-
782
- # Process activity information
783
- current_activity = "Unable to determine current activity"
784
- if activity_output:
785
- current_activity = activity_output.strip()
786
-
787
- # Process keyboard information
788
- is_keyboard_shown = False
789
- if keyboard_output:
790
- is_keyboard_shown = "mInputShown=true" in keyboard_output
791
-
792
- # Return combined state
795
+
796
+ # Clear logcat to make it easier to find our output
797
+ await device._adb.shell(device._serial, "logcat -c")
798
+
799
+ # Trigger the custom service via broadcast to get phone state
800
+ await device._adb.shell(
801
+ device._serial, "am broadcast -a com.droidrun.portal.GET_PHONE_STATE"
802
+ )
803
+
804
+ # Poll for the phone state data in logcat
805
+ start_time = asyncio.get_event_loop().time()
806
+ max_wait_time = 10 # Maximum wait time in seconds
807
+ poll_interval = 0.2 # Check every 200ms
808
+
809
+ while asyncio.get_event_loop().time() - start_time < max_wait_time:
810
+ # Check logcat for the phone state data
811
+ logcat_output = await device._adb.shell(
812
+ device._serial,
813
+ 'logcat -d | grep "DROIDRUN_PHONE_STATE_DATA" | tail -1',
814
+ )
815
+
816
+ # Parse the JSON data if present
817
+ if "CHUNK|" in logcat_output:
818
+ # Format: DROIDRUN_PHONE_STATE_DATA: CHUNK|0|1|{json_data}
819
+ # Extract the JSON part after the last |
820
+ parts = logcat_output.split("|")
821
+ if len(parts) >= 4:
822
+ json_data = "|".join(
823
+ parts[3:]
824
+ ) # In case JSON contains | characters
825
+ try:
826
+ phone_state = json.loads(json_data)
827
+ return phone_state
828
+ except json.JSONDecodeError:
829
+ # If JSON parsing failed, wait and retry
830
+ await asyncio.sleep(poll_interval)
831
+ continue
832
+
833
+ # Wait before polling again
834
+ await asyncio.sleep(poll_interval)
835
+
836
+ # If we couldn't get the phone state, return error
793
837
  return {
794
- "current_activity": current_activity,
795
- "keyboard_shown": is_keyboard_shown,
838
+ "error": "Timeout",
839
+ "message": f"Failed to get phone state data after {max_wait_time} seconds",
796
840
  }
797
-
841
+
798
842
  except Exception as e:
799
- return {
800
- "error": str(e),
801
- "message": f"Error getting phone state: {str(e)}"
802
- }
843
+ return {"error": str(e), "message": f"Error getting phone state: {str(e)}"}
803
844
 
804
845
  async def remember(self, information: str) -> str:
805
846
  """
806
847
  Store important information to remember for future context.
807
-
808
- This information will be included in future LLM prompts to help maintain context
848
+
849
+ This information will be extracted and included into your next steps to maintain context
809
850
  across interactions. Use this for critical facts, observations, or user preferences
810
851
  that should influence future decisions.
811
-
852
+
812
853
  Args:
813
854
  information: The information to remember
814
-
855
+
815
856
  Returns:
816
857
  Confirmation message
817
858
  """
818
859
  if not information or not isinstance(information, str):
819
860
  return "Error: Please provide valid information to remember."
820
-
861
+
821
862
  # Add the information to memory
822
863
  self.memory.append(information.strip())
823
-
864
+
824
865
  # Limit memory size to prevent context overflow (keep most recent items)
825
866
  max_memory_items = 10
826
867
  if len(self.memory) > max_memory_items:
827
868
  self.memory = self.memory[-max_memory_items:]
828
-
869
+
829
870
  return f"Remembered: {information}"
830
-
871
+
831
872
  def get_memory(self) -> List[str]:
832
873
  """
833
874
  Retrieve all stored memory items.
834
-
875
+
835
876
  Returns:
836
877
  List of stored memory items
837
878
  """
838
- return self.memory.copy()
879
+ return self.memory.copy()