droidrun 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- droidrun/__init__.py +16 -11
- droidrun/__main__.py +1 -1
- droidrun/adb/__init__.py +3 -3
- droidrun/adb/device.py +1 -1
- droidrun/adb/manager.py +2 -2
- droidrun/agent/__init__.py +6 -0
- droidrun/agent/codeact/__init__.py +2 -4
- droidrun/agent/codeact/codeact_agent.py +321 -235
- droidrun/agent/codeact/events.py +12 -20
- droidrun/agent/codeact/prompts.py +0 -52
- droidrun/agent/common/default.py +5 -0
- droidrun/agent/common/events.py +4 -0
- droidrun/agent/context/__init__.py +23 -0
- droidrun/agent/context/agent_persona.py +15 -0
- droidrun/agent/context/context_injection_manager.py +66 -0
- droidrun/agent/context/episodic_memory.py +15 -0
- droidrun/agent/context/personas/__init__.py +11 -0
- droidrun/agent/context/personas/app_starter.py +44 -0
- droidrun/agent/context/personas/default.py +95 -0
- droidrun/agent/context/personas/extractor.py +52 -0
- droidrun/agent/context/personas/ui_expert.py +107 -0
- droidrun/agent/context/reflection.py +20 -0
- droidrun/agent/context/task_manager.py +124 -0
- droidrun/agent/context/todo.txt +4 -0
- droidrun/agent/droid/__init__.py +2 -2
- droidrun/agent/droid/droid_agent.py +264 -325
- droidrun/agent/droid/events.py +28 -0
- droidrun/agent/oneflows/reflector.py +265 -0
- droidrun/agent/planner/__init__.py +2 -4
- droidrun/agent/planner/events.py +9 -13
- droidrun/agent/planner/planner_agent.py +268 -0
- droidrun/agent/planner/prompts.py +33 -53
- droidrun/agent/utils/__init__.py +3 -0
- droidrun/agent/utils/async_utils.py +1 -40
- droidrun/agent/utils/chat_utils.py +268 -48
- droidrun/agent/utils/executer.py +49 -14
- droidrun/agent/utils/llm_picker.py +14 -10
- droidrun/agent/utils/trajectory.py +184 -0
- droidrun/cli/__init__.py +1 -1
- droidrun/cli/logs.py +283 -0
- droidrun/cli/main.py +333 -439
- droidrun/run.py +105 -0
- droidrun/tools/__init__.py +5 -10
- droidrun/tools/{actions.py → adb.py} +279 -238
- droidrun/tools/ios.py +594 -0
- droidrun/tools/tools.py +99 -0
- droidrun-0.3.0.dist-info/METADATA +149 -0
- droidrun-0.3.0.dist-info/RECORD +52 -0
- droidrun/agent/planner/task_manager.py +0 -355
- droidrun/agent/planner/workflow.py +0 -371
- droidrun/tools/device.py +0 -29
- droidrun/tools/loader.py +0 -60
- droidrun-0.2.0.dist-info/METADATA +0 -373
- droidrun-0.2.0.dist-info/RECORD +0 -32
- {droidrun-0.2.0.dist-info → droidrun-0.3.0.dist-info}/WHEEL +0 -0
- {droidrun-0.2.0.dist-info → droidrun-0.3.0.dist-info}/entry_points.txt +0 -0
- {droidrun-0.2.0.dist-info → droidrun-0.3.0.dist-info}/licenses/LICENSE +0 -0
droidrun/tools/ios.py
ADDED
@@ -0,0 +1,594 @@
|
|
1
|
+
"""
|
2
|
+
UI Actions - Core UI interaction tools for iOS device control.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import re
|
6
|
+
import time
|
7
|
+
import asyncio
|
8
|
+
from typing import Optional, Dict, Tuple, List, Any
|
9
|
+
import logging
|
10
|
+
import aiohttp
|
11
|
+
from droidrun.tools.tools import Tools
|
12
|
+
|
13
|
+
logger = logging.getLogger("IOS")
|
14
|
+
|
15
|
+
# Bundle identifiers that IOSTools.list_packages merges into its result when it
# is called with include_system_apps=True.
SYSTEM_BUNDLE_IDENTIFIERS = [
    "ai.droidrun.droidrun-ios-portal",
    "com.apple.Bridge",
    "com.apple.DocumentsApp",
    "com.apple.Fitness",
    "com.apple.Health",
    "com.apple.Maps",
    "com.apple.MobileAddressBook",
    "com.apple.MobileSMS",
    "com.apple.Passbook",
    "com.apple.Passwords",
    "com.apple.Preferences",
    "com.apple.PreviewShell",
    "com.apple.mobilecal",
    "com.apple.mobilesafari",
    "com.apple.mobileslideshow",
    "com.apple.news",
    "com.apple.reminders",
    "com.apple.shortcuts",
    "com.apple.webapp",
]
|
36
|
+
|
37
|
+
|
38
|
+
class IOSTools(Tools):
|
39
|
+
"""Core UI interaction tools for iOS device control."""
|
40
|
+
|
41
|
+
def __init__(
    self, url: str, bundle_identifiers: Optional[List[str]] = None
) -> None:
    """
    Set up the tool state for one iOS endpoint.

    Args:
        url: Base URL that all requests issued by this instance are built on.
        bundle_identifiers: Additional bundle identifiers reported by
            list_packages. When omitted, a fresh empty list is used so that
            instances never share one default list object.
    """
    # Elements returned by the most recent successful get_clickables call;
    # tap_by_index resolves indices against this list.
    self.clickable_elements_cache: List[Dict[str, Any]] = []
    self.url = url
    self.last_screenshot = None
    # Outcome fields, filled in by complete().
    self.reason = None
    self.success = None
    self.finished = False
    self.memory: List[str] = []
    self.screenshots: List[Dict[str, Any]] = []
    # "x,y,width,height" of the last tapped element; input_text sends it
    # along as the rect of the typing request.
    self.last_tapped_rect: Optional[str] = None
    # A list passed by the caller is stored as-is (same object as before);
    # only the missing-argument case gets a new list.
    self.bundle_identifiers: List[str] = (
        bundle_identifiers if bundle_identifiers is not None else []
    )
    logger.info(f"iOS device URL: {url}")
|
55
|
+
|
56
|
+
async def get_clickables(
    self, serial: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Fetch the accessibility tree from the device and return its interactive elements.

    The elements are also stored in ``self.clickable_elements_cache`` so that
    tap_by_index can resolve indices later.

    Args:
        serial: Unused; requests always go to ``self.url``.

    Returns:
        The parsed elements on success. On any failure (non-200 response,
        transport error, unexpected payload) the error is logged and an empty
        list is returned; the previous cache content is left untouched.
    """
    a11y_url = f"{self.url}/vision/a11y"
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(a11y_url) as response:
                if response.status != 200:
                    logger.error(
                        f"Failed to get accessibility data: HTTP {response.status}"
                    )
                    raise ValueError(
                        f"Failed to get accessibility data: HTTP {response.status}"
                    )
                a11y_data = await response.json()

        # Parse the iOS accessibility tree format
        elements = self._parse_ios_accessibility_tree(
            a11y_data["accessibilityTree"]
        )
    except Exception as e:
        # Best-effort by design: failures are logged, not propagated. Return
        # an empty list instead of falling through to an implicit None so the
        # result always matches the annotated return type.
        logger.error(f"Error getting clickable elements: {e}")
        return []

    # Cache the elements for tap_by_index usage
    self.clickable_elements_cache = elements
    return self.clickable_elements_cache
|
95
|
+
|
96
|
+
def _parse_ios_accessibility_tree(self, a11y_data: str) -> List[Dict[str, Any]]:
    """
    Convert the textual iOS accessibility dump into a list of element dicts.

    Only lines that contain a ``{{x, y}, {width, height}}`` frame and whose
    element type contains one of the interactive type names are kept.
    Indices are assigned consecutively, starting at 0, to the kept elements.

    Args:
        a11y_data: Raw accessibility dump, one element per line.

    Returns:
        One dict per interactive element with its type, texts, frame and
        centre point.
    """
    header_prefixes = (
        "Attributes:",
        "Element subtree:",
        "Path to element:",
        "Query chain:",
    )
    interactive_markers = (
        "Button",
        "SearchField",
        "TextField",
        "Cell",
        "Switch",
        "Slider",
        "Stepper",
        "Picker",
        "Link",
    )
    # Frame format: {{x, y}, {width, height}}
    frame_re = re.compile(
        r"\{\{([0-9.]+),\s*([0-9.]+)\},\s*\{([0-9.]+),\s*([0-9.]+)\}\}"
    )

    def first_group(pattern: str, text: str) -> str:
        """Return group 1 of the first match of ``pattern``, or ""."""
        found = re.search(pattern, text)
        return found.group(1) if found else ""

    parsed: List[Dict[str, Any]] = []

    for raw in a11y_data.strip().split("\n"):
        # Blank lines and section headers carry no element.
        if not raw.strip() or raw.startswith(header_prefixes):
            continue

        frame = frame_re.search(raw)
        if frame is None:
            continue
        x, y, width, height = (float(part) for part in frame.groups())

        # The element type is whatever precedes the first comma, minus any
        # leading arrows/whitespace.
        head = re.match(r"\s*(.+?),", raw)
        kind = head.group(1).strip() if head else "Unknown"
        kind = re.sub(r"^[→\s]+", "", kind)

        if not any(marker in kind for marker in interactive_markers):
            continue

        label = first_group(r"label:\s*'([^']*)'", raw)
        identifier = first_group(r"identifier:\s*'([^']*)'", raw)
        placeholder = first_group(r"placeholderValue:\s*'([^']*)'", raw)
        value = first_group(r"value:\s*([^,}]+)", raw).strip()

        parsed.append(
            {
                # Every kept element gets the next free index.
                "index": len(parsed),
                "type": kind,
                "className": kind,
                "text": label or identifier or placeholder or "",
                "label": label,
                "identifier": identifier,
                "placeholder": placeholder,
                "value": value,
                # left,top,right,bottom format for compatibility
                "bounds": f"{x},{y},{x+width},{y+height}",
                # x,y,width,height format for iOS API
                "rect": f"{x},{y},{width},{height}",
                "x": x,
                "y": y,
                "width": width,
                "height": height,
                "center_x": x + width / 2,
                "center_y": y + height / 2,
            }
        )

    return parsed
|
202
|
+
|
203
|
+
async def tap_by_index(self, index: int, serial: Optional[str] = None) -> str:
    """
    Tap the cached UI element that carries the given index.

    The element is looked up in ``self.clickable_elements_cache`` and its frame
    is posted to the ``/gestures/tap`` endpoint. The frame is also remembered in
    ``self.last_tapped_rect`` for a following input_text call.

    Args:
        index: Index of the element to tap
        serial: Unused; requests always go to ``self.url``.

    Returns:
        A description of the tapped element, or a message starting with
        "Error:" when the lookup or the request fails.
    """

    def locate(nodes, wanted):
        """Depth-first search for the node whose "index" equals ``wanted``."""
        for node in nodes:
            if node.get("index") == wanted:
                return node
            # Descend into children if present
            nested = locate(node.get("children", []), wanted)
            if nested:
                return nested
        return None

    try:
        cached = self.clickable_elements_cache
        if not cached:
            return "Error: No UI elements cached. Call get_clickables first."

        target = locate(cached, index)

        if not target:
            # Tell the caller which indices exist (first 20 only).
            known = [
                node.get("index")
                for node in cached
                if node.get("index") is not None
            ]
            indices_str = ", ".join(str(idx) for idx in sorted(known)[:20])
            if len(known) > 20:
                indices_str += f"... and {len(known) - 20} more"
            return f"Error: No element found with index {index}. Available indices: {indices_str}"

        x = target.get("x", 0)
        y = target.get("y", 0)
        width = target.get("width", 0)
        height = target.get("height", 0)

        if x is None or y is None or width is None or height is None:
            element_text = target.get("text", "No text")
            element_class = target.get("className", "Unknown class")
            return f"Error: Element with index {index} ('{element_text}', {element_class}) has no coordinates and cannot be tapped"

        # Rect in iOS format: {{x,y},{w,h}}
        ios_rect = f"{{{{{x},{y}}},{{{width},{height}}}}}"

        # Plain "x,y,w,h" form is what input_text sends later.
        self.last_tapped_rect = f"{x},{y},{width},{height}"

        async with aiohttp.ClientSession() as session:
            tap_url = f"{self.url}/gestures/tap"
            payload = {"rect": ios_rect, "count": 1, "longPress": False}

            logger.info(f"payload {payload}")

            async with session.post(tap_url, json=payload) as response:
                if response.status != 200:
                    return f"Error: Failed to tap element. HTTP {response.status}"

                # Short pause before reporting success, to allow the UI to update
                await asyncio.sleep(0.5)

                return " | ".join(
                    [
                        f"Tapped element with index {index}",
                        f"Text: '{target.get('text', 'No text')}'",
                        f"Class: {target.get('className', 'Unknown class')}",
                        f"Rect: {ios_rect}",
                    ]
                )

    except Exception as e:
        return f"Error: {str(e)}"
|
298
|
+
|
299
|
+
"""async def tap_by_coordinates(self, x: int, y: int) -> bool:
|
300
|
+
# Format rect in iOS format: {{x,y},{w,h}}
|
301
|
+
width = 1
|
302
|
+
height = 1
|
303
|
+
ios_rect = f"{{{{{x},{y}}},{{{width},{height}}}}}"
|
304
|
+
|
305
|
+
# Make the tap request
|
306
|
+
async with aiohttp.ClientSession() as session:
|
307
|
+
tap_url = f"{self.url}/gestures/tap"
|
308
|
+
payload = {"rect": ios_rect, "count": 1, "longPress": False}
|
309
|
+
|
310
|
+
logger.info(f"payload {payload}")
|
311
|
+
|
312
|
+
async with session.post(tap_url, json=payload) as response:
|
313
|
+
if response.status == 200:
|
314
|
+
return True
|
315
|
+
else:
|
316
|
+
return False"""
|
317
|
+
|
318
|
+
async def tap(self, index: int) -> str:
    """
    Tap a UI element by index; thin alias that delegates to tap_by_index.

    Args:
        index: Index of the element to tap

    Returns:
        Whatever tap_by_index returns for this index.
    """
    outcome = await self.tap_by_index(index)
    return outcome
|
332
|
+
|
333
|
+
async def swipe(
    self, start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int = 300
) -> bool:
    """
    Swipe on the device screen.

    Only the start point and a direction derived from the two points are sent
    to the ``/gestures/swipe`` endpoint. The direction is horizontal when the
    horizontal distance is strictly larger than the vertical one, otherwise
    vertical; identical start and end points therefore yield "up".

    Args:
        start_x: Starting X coordinate
        start_y: Starting Y coordinate
        end_x: Ending X coordinate
        end_y: Ending Y coordinate
        duration_ms: Not used by this implementation
    Returns:
        True when the endpoint answers HTTP 200, False on any other status or
        on any exception (which is logged).
    """
    try:
        horizontal = end_x - start_x
        vertical = end_y - start_y

        # Pick the dominant axis, then the sign along it.
        if abs(horizontal) > abs(vertical):
            if horizontal > 0:
                direction = "right"
            else:
                direction = "left"
        elif vertical > 0:
            direction = "down"
        else:
            direction = "up"

        async with aiohttp.ClientSession() as session:
            swipe_url = f"{self.url}/gestures/swipe"
            payload = {"x": float(start_x), "y": float(start_y), "dir": direction}

            async with session.post(swipe_url, json=payload) as response:
                if response.status != 200:
                    logger.error(f"Failed to swipe: HTTP {response.status}")
                    return False

                logger.info(
                    f"Swiped from ({start_x}, {start_y}) to ({end_x}, {end_y}) direction: {direction}"
                )
                return True

    except Exception as e:
        logger.error(f"Error performing swipe: {e}")
        return False
|
376
|
+
|
377
|
+
async def input_text(self, text: str, serial: Optional[str] = None) -> str:
    """
    Type text on the device through the ``/inputs/type`` endpoint.

    The request carries the rect of the last tapped element, or
    "0,0,100,100" when nothing has been tapped yet.

    Args:
        text: Text to send.
        serial: Unused; requests always go to ``self.url``.

    Returns:
        A confirmation containing at most the first 50 characters of the text,
        or an error message.
    """
    try:
        # Fall back to a default rect when no element has been tapped.
        rect = self.last_tapped_rect or "0,0,100,100"

        async with aiohttp.ClientSession() as session:
            type_url = f"{self.url}/inputs/type"

            async with session.post(
                type_url, json={"rect": rect, "text": text}
            ) as response:
                if response.status != 200:
                    return f"Error: Failed to input text. HTTP {response.status}"

                # Short pause before reporting success
                await asyncio.sleep(0.5)
                return f"Text input completed: {text[:50]}{'...' if len(text) > 50 else ''}"

    except Exception as e:
        return f"Error sending text input: {str(e)}"
|
405
|
+
|
406
|
+
async def back(self) -> str:
    """
    Placeholder for the back action.

    Raises:
        NotImplementedError: Always.
    """
    message = "Back is not yet implemented for iOS"
    raise NotImplementedError(message)
|
408
|
+
|
409
|
+
async def press_key(self, keycode: int) -> str:
    """
    Send a key press to the device through the ``/inputs/key`` endpoint.

    iOS Key codes with a known name:
    - 0: HOME
    - 4: ACTION
    - 5: CAMERA

    Args:
        keycode: iOS keycode to press

    Returns:
        "Pressed key <name>" on HTTP 200 (the numeric code is used as name for
        unlisted codes), otherwise an error message.
    """
    # TODO: refactor this. its not about physical keys but BACK, ENTER, DELETE, etc.
    try:
        readable = {0: "HOME", 4: "ACTION", 5: "CAMERA"}.get(keycode, str(keycode))

        async with aiohttp.ClientSession() as session:
            key_url = f"{self.url}/inputs/key"

            async with session.post(key_url, json={"key": keycode}) as response:
                if response.status != 200:
                    return f"Error: Failed to press key. HTTP {response.status}"
                return f"Pressed key {readable}"

    except Exception as e:
        return f"Error pressing key: {str(e)}"
|
438
|
+
|
439
|
+
async def start_app(self, package: str, activity: str = "") -> str:
    """
    Launch an app through the ``/inputs/launch`` endpoint.

    Args:
        package: Bundle identifier (e.g., "com.apple.MobileSMS")
        activity: Not used by this implementation

    Returns:
        A success message on HTTP 200, otherwise an error message.
    """
    try:
        async with aiohttp.ClientSession() as session:
            launch_url = f"{self.url}/inputs/launch"
            body = {"bundleIdentifier": package}

            async with session.post(launch_url, json=body) as response:
                if response.status != 200:
                    return f"Error: Failed to launch app {package}. HTTP {response.status}"

                # Give the app a moment before reporting success
                await asyncio.sleep(1.0)
                return f"Successfully launched app: {package}"

    except Exception as e:
        return f"Error launching app: {str(e)}"
|
461
|
+
|
462
|
+
async def take_screenshot(self) -> Tuple[str, bytes]:
    """
    Fetch a screenshot from the ``/vision/screenshot`` endpoint.

    The raw bytes are appended, together with a timestamp, to
    ``self.screenshots`` and stored as ``self.last_screenshot``.

    Returns:
        The tuple ("PNG", <response body bytes>).

    Raises:
        ValueError: On a non-200 response or any other failure; the original
            error text is embedded in the message.
    """
    try:
        async with aiohttp.ClientSession() as session:
            screenshot_url = f"{self.url}/vision/screenshot"
            async with session.get(screenshot_url) as response:
                if response.status != 200:
                    logger.error(
                        f"Failed to capture screenshot: HTTP {response.status}"
                    )
                    raise ValueError(
                        f"Failed to capture screenshot: HTTP {response.status}"
                    )

                image_bytes = await response.read()

                # Keep a timestamped history alongside the latest capture.
                self.screenshots.append(
                    {"timestamp": time.time(), "data": image_bytes}
                )
                self.last_screenshot = image_bytes

                logger.info(
                    f"Screenshot captured successfully, size: {len(image_bytes)} bytes"
                )
                return ("PNG", image_bytes)

    except Exception as e:
        logger.error(f"Error capturing screenshot: {e}")
        raise ValueError(f"Error taking screenshot: {str(e)}")
|
498
|
+
|
499
|
+
async def get_phone_state(self, serial: Optional[str] = None) -> Dict[str, Any]:
    """
    Query the ``/vision/state`` endpoint for the current device state.

    Args:
        serial: Unused; requests always go to ``self.url``.

    Returns:
        A dict that always contains "current_activity" and "keyboard_shown".
        On HTTP 200 they are taken from the "activity" and "keyboardShown"
        fields of the JSON response. On any failure they are "Unknown" and
        False, and an "error" entry describes the problem (exceptions also
        add a "message" entry).
    """
    try:
        async with aiohttp.ClientSession() as session:
            state_url = f"{self.url}/vision/state"
            async with session.get(state_url) as response:
                if response.status == 200:
                    state_data = await response.json()

                    return {
                        "current_activity": state_data["activity"],
                        "keyboard_shown": state_data["keyboardShown"],
                    }

                return {
                    "error": f"Failed to get device state: HTTP {response.status}",
                    "current_activity": "Unknown",
                    "keyboard_shown": False,
                }

    except Exception as e:
        # Same fallback keys as the HTTP-error branch, so callers can index
        # the result without special-casing exceptions.
        return {
            "error": str(e),
            "message": f"Error getting phone state: {str(e)}",
            "current_activity": "Unknown",
            "keyboard_shown": False,
        }
|
530
|
+
|
531
|
+
async def list_packages(self, include_system_apps: bool = True) -> List[str]:
    """
    Return the known bundle identifiers, de-duplicated and sorted.

    Args:
        include_system_apps: When true, SYSTEM_BUNDLE_IDENTIFIERS is merged
            into the identifiers configured on this instance.

    Returns:
        Sorted list of unique bundle identifiers.
    """
    identifiers = set(self.bundle_identifiers)
    if include_system_apps:
        identifiers |= set(SYSTEM_BUNDLE_IDENTIFIERS)
    return sorted(identifiers)
|
536
|
+
|
537
|
+
async def extract(self, filename: str | None = None) -> str:
|
538
|
+
# TODO
|
539
|
+
return "not implemented"
|
540
|
+
|
541
|
+
async def remember(self, information: str) -> str:
    """
    Append a piece of information to the instance memory.

    The stripped text is stored; only the 10 most recent entries are kept.

    Args:
        information: The information to remember

    Returns:
        "Remembered: <information>" on success, or an error message when the
        argument is empty or not a string.
    """
    if not (information and isinstance(information, str)):
        return "Error: Please provide valid information to remember."

    self.memory.append(information.strip())

    # Drop the oldest entries once the cap is exceeded.
    max_memory_items = 10
    overflow = len(self.memory) - max_memory_items
    if overflow > 0:
        self.memory = self.memory[overflow:]

    return f"Remembered: {information}"
|
567
|
+
|
568
|
+
def get_memory(self) -> List[str]:
    """
    Return a snapshot of the stored memory items.

    Returns:
        A new list with the current items; changing it does not affect the
        instance.
    """
    snapshot = list(self.memory)
    return snapshot
|
576
|
+
|
577
|
+
def complete(self, success: bool, reason: str = ""):
    """
    Mark the task as finished.

    Args:
        success: Indicates if the task was successful.
        reason: Reason for failure/success. Optional on success (a default
            text is stored), required on failure.

    Raises:
        ValueError: If ``success`` is false and no reason is given. The
            instance is left unchanged in that case.
    """
    # Validate before touching any state, so a rejected call cannot leave
    # success=False behind while finished is still False.
    if not success and not reason:
        raise ValueError("Reason for failure is required if success is False.")

    if success:
        self.success = True
        self.reason = reason or "Task completed successfully."
    else:
        self.success = False
        self.reason = reason
    self.finished = True
|
droidrun/tools/tools.py
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import List, Optional, Dict, Any
|
3
|
+
import logging
|
4
|
+
from typing import Tuple, Dict, Callable, Any, Optional
|
5
|
+
|
6
|
+
# Get a logger for this module
|
7
|
+
logger = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
class Tools(ABC):
    """Abstract interface that every device tool set has to implement."""

    @abstractmethod
    async def get_clickables(self) -> str:
        """Return the clickable UI elements of the current screen."""

    @abstractmethod
    async def tap_by_index(self, index: int) -> bool:
        """Tap the UI element identified by ``index``."""

    # Disabled for now:
    #@abstractmethod
    #async def tap_by_coordinates(self, x: int, y: int) -> bool:
    #    pass

    @abstractmethod
    async def swipe(
        self, start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int = 300
    ) -> bool:
        """Swipe from the start point to the end point."""

    @abstractmethod
    async def input_text(self, text: str) -> bool:
        """Type ``text`` on the device."""

    @abstractmethod
    async def back(self) -> bool:
        """Perform the back action."""

    @abstractmethod
    async def press_key(self, keycode: int) -> bool:
        """Press the key identified by ``keycode``."""

    @abstractmethod
    async def start_app(self, package: str, activity: str = "") -> bool:
        """Start the app identified by ``package``."""

    @abstractmethod
    async def take_screenshot(self) -> Tuple[str, bytes]:
        """Capture the screen and return it as a (format, data) tuple."""

    @abstractmethod
    async def get_phone_state(self) -> Dict[str, Any]:
        """Return a dict describing the current device state."""

    @abstractmethod
    async def list_packages(self, include_system_apps: bool = False) -> List[str]:
        """Return the identifiers of the available apps."""

    @abstractmethod
    async def remember(self, information: str) -> str:
        """Store ``information`` for later retrieval."""

    @abstractmethod
    async def get_memory(self) -> List[str]:
        """Return the stored memory items."""

    @abstractmethod
    async def extract(self, filename: Optional[str] = None) -> str:
        """Extract data, optionally into ``filename``."""

    @abstractmethod
    def complete(self, success: bool, reason: str = "") -> bool:
        """Mark the task as finished with the given outcome."""
|
72
|
+
|
73
|
+
|
74
|
+
def describe_tools(tools: Tools) -> Dict[str, Callable[..., Any]]:
    """
    Build the table of tool callables exposed by a Tools instance.

    Args:
        tools: The Tools instance whose methods are exposed.

    Returns:
        A dictionary mapping each tool name to the bound method of ``tools``
        with the same name.
    """
    exposed = (
        # UI interaction
        "swipe",
        "input_text",
        "press_key",
        "tap_by_index",
        # "tap_by_coordinates" is currently not exposed
        # App management
        "start_app",
        "list_packages",
        # state management
        "extract",
        "remember",
        "complete",
    )
    return {name: getattr(tools, name) for name in exposed}
|