droidrun 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
droidrun/tools/ios.py CHANGED
@@ -4,10 +4,9 @@ UI Actions - Core UI interaction tools for iOS device control.
4
4
 
5
5
  import re
6
6
  import time
7
- import asyncio
8
7
  from typing import Optional, Dict, Tuple, List, Any
9
8
  import logging
10
- import aiohttp
9
+ import requests
11
10
  from droidrun.tools.tools import Tools
12
11
 
13
12
  logger = logging.getLogger("IOS")
@@ -39,6 +38,12 @@ class IOSTools(Tools):
39
38
  """Core UI interaction tools for iOS device control."""
40
39
 
41
40
  def __init__(self, url: str, bundle_identifiers: List[str] = []) -> None:
41
+ """Initialize the IOSTools instance.
42
+
43
+ Args:
44
+ url: iOS device URL. This is the URL of the iOS device. It is used to send requests to the iOS device.
45
+ bundle_identifiers: List of bundle identifiers to include in the list of packages
46
+ """
42
47
  self.clickable_elements_cache: List[Dict[str, Any]] = []
43
48
  self.url = url
44
49
  self.last_screenshot = None
@@ -53,43 +58,39 @@ class IOSTools(Tools):
53
58
  self.bundle_identifiers = bundle_identifiers
54
59
  logger.info(f"iOS device URL: {url}")
55
60
 
56
- async def get_state(
57
- self, serial: Optional[str] = None
58
- ) -> List[Dict[str, Any]]:
61
+ def get_state(self) -> List[Dict[str, Any]]:
59
62
  """
60
63
  Get all clickable UI elements from the iOS device using accessibility API.
61
64
 
62
- Args:
63
- serial: Optional device URL (not used for iOS, uses instance URL)
64
-
65
65
  Returns:
66
66
  List of dictionaries containing UI elements extracted from the device screen
67
67
  """
68
68
  try:
69
- async with aiohttp.ClientSession() as session:
70
- a11y_url = f"{self.url}/vision/a11y"
71
- async with session.get(a11y_url) as response:
72
- if response.status == 200:
73
- a11y_data = await response.json()
74
-
75
- # Parse the iOS accessibility tree format
76
- elements = self._parse_ios_accessibility_tree(
77
- a11y_data["accessibilityTree"]
78
- )
79
-
80
- # Cache the elements for tap_by_index usage
81
- self.clickable_elements_cache = elements
82
-
83
- return {
84
- "a11y_tree":self.clickable_elements_cache
85
- }
86
- else:
87
- logger.error(
88
- f"Failed to get accessibility data: HTTP {response.status}"
89
- )
90
- raise ValueError(
91
- f"Failed to get accessibility data: HTTP {response.status}"
92
- )
69
+ a11y_url = f"{self.url}/vision/a11y"
70
+ response = requests.get(a11y_url)
71
+
72
+ if response.status_code == 200:
73
+ a11y_data = response.json()
74
+
75
+ # Parse the iOS accessibility tree format
76
+ elements = self._parse_ios_accessibility_tree(
77
+ a11y_data["accessibilityTree"]
78
+ )
79
+
80
+ # Cache the elements for tap_by_index usage
81
+ self.clickable_elements_cache = elements
82
+
83
+ return {
84
+ "a11y_tree": self.clickable_elements_cache,
85
+ "phone_state": self._get_phone_state(),
86
+ }
87
+ else:
88
+ logger.error(
89
+ f"Failed to get accessibility data: HTTP {response.status_code}"
90
+ )
91
+ raise ValueError(
92
+ f"Failed to get accessibility data: HTTP {response.status_code}"
93
+ )
93
94
 
94
95
  except Exception as e:
95
96
  logger.error(f"Error getting clickable elements: {e}")
@@ -202,7 +203,7 @@ class IOSTools(Tools):
202
203
 
203
204
  return elements
204
205
 
205
- async def tap_by_index(self, index: int, serial: Optional[str] = None) -> str:
206
+ def tap_by_index(self, index: int) -> str:
206
207
  """
207
208
  Tap on a UI element by its index.
208
209
 
@@ -269,55 +270,51 @@ class IOSTools(Tools):
269
270
  self.last_tapped_rect = f"{x},{y},{width},{height}"
270
271
 
271
272
  # Make the tap request
272
- async with aiohttp.ClientSession() as session:
273
- tap_url = f"{self.url}/gestures/tap"
274
- payload = {"rect": ios_rect, "count": 1, "longPress": False}
275
-
276
- logger.info(f"payload {payload}")
277
-
278
- async with session.post(tap_url, json=payload) as response:
279
- if response.status == 200:
280
- # Add a small delay to allow UI to update
281
- await asyncio.sleep(0.5)
282
-
283
- # Create a descriptive response
284
- response_parts = []
285
- response_parts.append(f"Tapped element with index {index}")
286
- response_parts.append(
287
- f"Text: '{element.get('text', 'No text')}'"
288
- )
289
- response_parts.append(
290
- f"Class: {element.get('className', 'Unknown class')}"
291
- )
292
- response_parts.append(f"Rect: {ios_rect}")
293
-
294
- return " | ".join(response_parts)
295
- else:
296
- return f"Error: Failed to tap element. HTTP {response.status}"
273
+ tap_url = f"{self.url}/gestures/tap"
274
+ payload = {"rect": ios_rect, "count": 1, "longPress": False}
275
+
276
+ logger.info(f"payload {payload}")
277
+
278
+ response = requests.post(tap_url, json=payload)
279
+ if response.status_code == 200:
280
+ # Add a small delay to allow UI to update
281
+ time.sleep(0.5)
282
+
283
+ # Create a descriptive response
284
+ response_parts = []
285
+ response_parts.append(f"Tapped element with index {index}")
286
+ response_parts.append(f"Text: '{element.get('text', 'No text')}'")
287
+ response_parts.append(
288
+ f"Class: {element.get('className', 'Unknown class')}"
289
+ )
290
+ response_parts.append(f"Rect: {ios_rect}")
291
+
292
+ return " | ".join(response_parts)
293
+ else:
294
+ return f"Error: Failed to tap element. HTTP {response.status_code}"
297
295
 
298
296
  except Exception as e:
299
297
  return f"Error: {str(e)}"
300
298
 
301
- """async def tap_by_coordinates(self, x: int, y: int) -> bool:
299
+ """def tap_by_coordinates(self, x: int, y: int) -> bool:
302
300
  # Format rect in iOS format: {{x,y},{w,h}}
303
301
  width = 1
304
302
  height = 1
305
303
  ios_rect = f"{{{{{x},{y}}},{{{width},{height}}}}}"
306
304
 
307
305
  # Make the tap request
308
- async with aiohttp.ClientSession() as session:
309
- tap_url = f"{self.url}/gestures/tap"
310
- payload = {"rect": ios_rect, "count": 1, "longPress": False}
306
+ tap_url = f"{self.url}/gestures/tap"
307
+ payload = {"rect": ios_rect, "count": 1, "longPress": False}
311
308
 
312
- logger.info(f"payload {payload}")
309
+ logger.info(f"payload {payload}")
313
310
 
314
- async with session.post(tap_url, json=payload) as response:
315
- if response.status == 200:
316
- return True
317
- else:
318
- return False"""
311
+ response = requests.post(tap_url, json=payload)
312
+ if response.status_code == 200:
313
+ return True
314
+ else:
315
+ return False"""
319
316
 
320
- async def tap(self, index: int) -> str:
317
+ def tap(self, index: int) -> str:
321
318
  """
322
319
  Tap on a UI element by its index.
323
320
 
@@ -330,9 +327,9 @@ class IOSTools(Tools):
330
327
  Returns:
331
328
  Result message
332
329
  """
333
- return await self.tap_by_index(index)
330
+ return self.tap_by_index(index)
334
331
 
335
- async def swipe(
332
+ def swipe(
336
333
  self, start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int = 300
337
334
  ) -> bool:
338
335
  """
@@ -358,31 +355,47 @@ class IOSTools(Tools):
358
355
  else:
359
356
  direction = "down" if dy > 0 else "up"
360
357
 
361
- async with aiohttp.ClientSession() as session:
362
- swipe_url = f"{self.url}/gestures/swipe"
363
- payload = {"x": float(start_x), "y": float(start_y), "dir": direction}
358
+ swipe_url = f"{self.url}/gestures/swipe"
359
+ payload = {"x": float(start_x), "y": float(start_y), "dir": direction}
364
360
 
365
- async with session.post(swipe_url, json=payload) as response:
366
- if response.status == 200:
367
- logger.info(
368
- f"Swiped from ({start_x}, {start_y}) to ({end_x}, {end_y}) direction: {direction}"
369
- )
370
- return True
371
- else:
372
- logger.error(f"Failed to swipe: HTTP {response.status}")
373
- return False
361
+ response = requests.post(swipe_url, json=payload)
362
+ if response.status_code == 200:
363
+ logger.info(
364
+ f"Swiped from ({start_x}, {start_y}) to ({end_x}, {end_y}) direction: {direction}"
365
+ )
366
+ return True
367
+ else:
368
+ logger.error(f"Failed to swipe: HTTP {response.status_code}")
369
+ return False
374
370
 
375
371
  except Exception as e:
376
372
  logger.error(f"Error performing swipe: {e}")
377
373
  return False
378
374
 
379
- async def input_text(self, text: str, serial: Optional[str] = None) -> str:
375
+ def drag(
376
+ self, start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int = 3000
377
+ ) -> bool:
378
+ """
379
+ Drag from the given start coordinates to the given end coordinates.
380
+ Args:
381
+ start_x: Starting X coordinate
382
+ start_y: Starting Y coordinate
383
+ end_x: Ending X coordinate
384
+ end_y: Ending Y coordinate
385
+ duration_ms: Duration of swipe in milliseconds
386
+ Returns:
387
+ Bool indicating success or failure
388
+ """
389
+ # TODO: implement this
390
+ logger.info(f"Drag action FAILED! Not implemented for iOS")
391
+ return False
392
+
393
+ def input_text(self, text: str) -> str:
380
394
  """
381
395
  Input text on the iOS device.
382
396
 
383
397
  Args:
384
398
  text: Text to input. Can contain spaces, newlines, and special characters including non-ASCII.
385
- serial: Optional device serial (not used for iOS, uses instance URL)
386
399
 
387
400
  Returns:
388
401
  Result message
@@ -391,24 +404,23 @@ class IOSTools(Tools):
391
404
  # Use the last tapped element's rect if available, otherwise use a default
392
405
  rect = self.last_tapped_rect if self.last_tapped_rect else "0,0,100,100"
393
406
 
394
- async with aiohttp.ClientSession() as session:
395
- type_url = f"{self.url}/inputs/type"
396
- payload = {"rect": rect, "text": text}
407
+ type_url = f"{self.url}/inputs/type"
408
+ payload = {"rect": rect, "text": text}
397
409
 
398
- async with session.post(type_url, json=payload) as response:
399
- if response.status == 200:
400
- await asyncio.sleep(0.5) # Wait for text input to complete
401
- return f"Text input completed: {text[:50]}{'...' if len(text) > 50 else ''}"
402
- else:
403
- return f"Error: Failed to input text. HTTP {response.status}"
410
+ response = requests.post(type_url, json=payload)
411
+ if response.status_code == 200:
412
+ time.sleep(0.5) # Wait for text input to complete
413
+ return f"Text input completed: {text[:50]}{'...' if len(text) > 50 else ''}"
414
+ else:
415
+ return f"Error: Failed to input text. HTTP {response.status_code}"
404
416
 
405
417
  except Exception as e:
406
418
  return f"Error sending text input: {str(e)}"
407
419
 
408
- async def back(self) -> str:
420
+ def back(self) -> str:
409
421
  raise NotImplementedError("Back is not yet implemented for iOS")
410
422
 
411
- async def press_key(self, keycode: int) -> str:
423
+ def press_key(self, keycode: int) -> str:
412
424
  # TODO: refactor this. its not about physical keys but BACK, ENTER, DELETE, etc.
413
425
  """
414
426
  Press a key on the iOS device.
@@ -425,20 +437,19 @@ class IOSTools(Tools):
425
437
  key_names = {0: "HOME", 4: "ACTION", 5: "CAMERA"}
426
438
  key_name = key_names.get(keycode, str(keycode))
427
439
 
428
- async with aiohttp.ClientSession() as session:
429
- key_url = f"{self.url}/inputs/key"
430
- payload = {"key": keycode}
440
+ key_url = f"{self.url}/inputs/key"
441
+ payload = {"key": keycode}
431
442
 
432
- async with session.post(key_url, json=payload) as response:
433
- if response.status == 200:
434
- return f"Pressed key {key_name}"
435
- else:
436
- return f"Error: Failed to press key. HTTP {response.status}"
443
+ response = requests.post(key_url, json=payload)
444
+ if response.status_code == 200:
445
+ return f"Pressed key {key_name}"
446
+ else:
447
+ return f"Error: Failed to press key. HTTP {response.status_code}"
437
448
 
438
449
  except Exception as e:
439
450
  return f"Error pressing key: {str(e)}"
440
451
 
441
- async def start_app(self, package: str, activity: str = "") -> str:
452
+ def start_app(self, package: str, activity: str = "") -> str:
442
453
  """
443
454
  Start an app on the iOS device.
444
455
 
@@ -447,100 +458,92 @@ class IOSTools(Tools):
447
458
  activity: Optional activity name (not used on iOS)
448
459
  """
449
460
  try:
450
- async with aiohttp.ClientSession() as session:
451
- launch_url = f"{self.url}/inputs/launch"
452
- payload = {"bundleIdentifier": package}
461
+ launch_url = f"{self.url}/inputs/launch"
462
+ payload = {"bundleIdentifier": package}
453
463
 
454
- async with session.post(launch_url, json=payload) as response:
455
- if response.status == 200:
456
- await asyncio.sleep(1.0) # Wait for app to launch
457
- return f"Successfully launched app: {package}"
458
- else:
459
- return f"Error: Failed to launch app {package}. HTTP {response.status}"
464
+ response = requests.post(launch_url, json=payload)
465
+ if response.status_code == 200:
466
+ time.sleep(1.0) # Wait for app to launch
467
+ return f"Successfully launched app: {package}"
468
+ else:
469
+ return f"Error: Failed to launch app {package}. HTTP {response.status_code}"
460
470
 
461
471
  except Exception as e:
462
472
  return f"Error launching app: {str(e)}"
463
473
 
464
- async def take_screenshot(self) -> Tuple[str, bytes]:
474
+ def take_screenshot(self) -> Tuple[str, bytes]:
465
475
  """
466
476
  Take a screenshot of the iOS device.
467
477
  This function captures the current screen and adds the screenshot to context in the next message.
468
478
  Also stores the screenshot in the screenshots list with timestamp for later GIF creation.
469
479
  """
470
480
  try:
471
- async with aiohttp.ClientSession() as session:
472
- screenshot_url = f"{self.url}/vision/screenshot"
473
- async with session.get(screenshot_url) as response:
474
- if response.status == 200:
475
- screenshot_data = await response.read()
476
-
477
- # Store screenshot with timestamp
478
- screenshot_info = {
479
- "timestamp": time.time(),
480
- "data": screenshot_data,
481
- }
482
- self.screenshots.append(screenshot_info)
483
- self.last_screenshot = screenshot_data
484
-
485
- logger.info(
486
- f"Screenshot captured successfully, size: {len(screenshot_data)} bytes"
487
- )
488
- return ("PNG", screenshot_data)
489
- else:
490
- logger.error(
491
- f"Failed to capture screenshot: HTTP {response.status}"
492
- )
493
- raise ValueError(
494
- f"Failed to capture screenshot: HTTP {response.status}"
495
- )
481
+ screenshot_url = f"{self.url}/vision/screenshot"
482
+ response = requests.get(screenshot_url)
483
+
484
+ if response.status_code == 200:
485
+ screenshot_data = response.content
486
+
487
+ # Store screenshot with timestamp
488
+ screenshot_info = {
489
+ "timestamp": time.time(),
490
+ "data": screenshot_data,
491
+ }
492
+ self.screenshots.append(screenshot_info)
493
+ self.last_screenshot = screenshot_data
494
+
495
+ logger.info(
496
+ f"Screenshot captured successfully, size: {len(screenshot_data)} bytes"
497
+ )
498
+ return ("PNG", screenshot_data)
499
+ else:
500
+ logger.error(
501
+ f"Failed to capture screenshot: HTTP {response.status_code}"
502
+ )
503
+ raise ValueError(
504
+ f"Failed to capture screenshot: HTTP {response.status_code}"
505
+ )
496
506
 
497
507
  except Exception as e:
498
508
  logger.error(f"Error capturing screenshot: {e}")
499
509
  raise ValueError(f"Error taking screenshot: {str(e)}")
500
510
 
501
- async def get_phone_state(self, serial: Optional[str] = None) -> Dict[str, Any]:
511
+ def _get_phone_state(self) -> Dict[str, Any]:
502
512
  """
503
513
  Get the current phone state including current activity and keyboard visibility.
504
514
 
505
- Args:
506
- serial: Optional device serial number (not used for iOS)
507
-
508
515
  Returns:
509
516
  Dictionary with current phone state information
510
517
  """
511
518
  try:
512
519
  # For iOS, we can get some state info from the accessibility API
513
- async with aiohttp.ClientSession() as session:
514
- a11y_url = f"{self.url}/vision/state"
515
- async with session.get(a11y_url) as response:
516
- if response.status == 200:
517
- state_data = await response.json()
518
-
519
- return {
520
- "current_activity": state_data["activity"],
521
- "keyboard_shown": state_data["keyboardShown"],
522
- }
523
- else:
524
- return {
525
- "error": f"Failed to get device state: HTTP {response.status}",
526
- "current_activity": "Unknown",
527
- "keyboard_shown": False,
528
- }
520
+ a11y_url = f"{self.url}/vision/state"
521
+ response = requests.get(a11y_url)
522
+
523
+ if response.status_code == 200:
524
+ state_data = response.json()
525
+
526
+ return {
527
+ "current_activity": state_data["activity"],
528
+ "keyboard_shown": state_data["keyboardShown"],
529
+ }
530
+ else:
531
+ return {
532
+ "error": f"Failed to get device state: HTTP {response.status_code}",
533
+ "current_activity": "Unknown",
534
+ "keyboard_shown": False,
535
+ }
529
536
 
530
537
  except Exception as e:
531
538
  return {"error": str(e), "message": f"Error getting phone state: {str(e)}"}
532
539
 
533
- async def list_packages(self, include_system_apps: bool = True) -> List[str]:
540
+ def list_packages(self, include_system_apps: bool = True) -> List[str]:
534
541
  all_packages = set(self.bundle_identifiers)
535
542
  if include_system_apps:
536
543
  all_packages.update(SYSTEM_BUNDLE_IDENTIFIERS)
537
544
  return sorted(list(all_packages))
538
545
 
539
- async def extract(self, filename: str | None = None) -> str:
540
- # TODO
541
- return "not implemented"
542
-
543
- async def remember(self, information: str) -> str:
546
+ def remember(self, information: str) -> str:
544
547
  """
545
548
  Store important information to remember for future context.
546
549
 
droidrun/tools/tools.py CHANGED
@@ -8,12 +8,23 @@ logger = logging.getLogger(__name__)
8
8
 
9
9
 
10
10
  class Tools(ABC):
11
+ """
12
+ Abstract base class for all tools.
13
+ This class provides a common interface for all tools to implement.
14
+ """
15
+
11
16
  @abstractmethod
12
- async def get_state(self) -> Dict[str, Any]:
17
+ def get_state(self) -> Dict[str, Any]:
18
+ """
19
+ Get the current state of the tool.
20
+ """
13
21
  pass
14
22
 
15
23
  @abstractmethod
16
- async def tap_by_index(self, index: int) -> bool:
24
+ def tap_by_index(self, index: int) -> str:
25
+ """
26
+ Tap the element at the given index.
27
+ """
17
28
  pass
18
29
 
19
30
  #@abstractmethod
@@ -21,75 +32,117 @@ class Tools(ABC):
21
32
  # pass
22
33
 
23
34
  @abstractmethod
24
- async def swipe(
35
+ def swipe(
25
36
  self, start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int = 300
26
37
  ) -> bool:
38
+ """
39
+ Swipe from the given start coordinates to the given end coordinates.
40
+ """
27
41
  pass
28
42
 
29
43
  @abstractmethod
30
- async def input_text(self, text: str) -> bool:
44
+ def drag(
45
+ self, start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int = 3000
46
+ ) -> bool:
47
+ """
48
+ Drag from the given start coordinates to the given end coordinates.
49
+ """
31
50
  pass
32
51
 
33
52
  @abstractmethod
34
- async def back(self) -> bool:
53
+ def input_text(self, text: str) -> str:
54
+ """
55
+ Input the given text into a focused input field.
56
+ """
35
57
  pass
36
58
 
37
59
  @abstractmethod
38
- async def press_key(self, keycode: int) -> bool:
60
+ def back(self) -> str:
61
+ """
62
+ Press the back button.
63
+ """
39
64
  pass
40
65
 
41
66
  @abstractmethod
42
- async def start_app(self, package: str, activity: str = "") -> bool:
67
+ def press_key(self, keycode: int) -> str:
68
+ """
69
+ Enter the given keycode.
70
+ """
43
71
  pass
44
72
 
45
73
  @abstractmethod
46
- async def take_screenshot(self) -> Tuple[str, bytes]:
74
+ def start_app(self, package: str, activity: str = "") -> str:
75
+ """
76
+ Start the given app.
77
+ """
47
78
  pass
48
79
 
49
80
  @abstractmethod
50
- async def list_packages(self, include_system_apps: bool = False) -> List[str]:
81
+ def take_screenshot(self) -> Tuple[str, bytes]:
82
+ """
83
+ Take a screenshot of the device.
84
+ """
51
85
  pass
52
86
 
53
87
  @abstractmethod
54
- async def remember(self, information: str) -> str:
88
+ def list_packages(self, include_system_apps: bool = False) -> List[str]:
89
+ """
90
+ List all packages on the device.
91
+ """
55
92
  pass
56
93
 
57
94
  @abstractmethod
58
- async def get_memory(self) -> List[str]:
95
+ def remember(self, information: str) -> str:
96
+ """
97
+ Remember the given information. This is used to store information in the tool's memory.
98
+ """
59
99
  pass
60
100
 
61
101
  @abstractmethod
62
- async def extract(self, filename: Optional[str] = None) -> str:
102
+ def get_memory(self) -> List[str]:
103
+ """
104
+ Get the memory of the tool.
105
+ """
63
106
  pass
64
107
 
65
108
  @abstractmethod
66
- def complete(self, success: bool, reason: str = "") -> bool:
109
+ def complete(self, success: bool, reason: str = "") -> None:
110
+ """
111
+ Complete the tool. This is used to indicate that the tool has completed its task.
112
+ """
67
113
  pass
68
114
 
69
115
 
70
- def describe_tools(tools: Tools) -> Dict[str, Callable[..., Any]]:
116
+ def describe_tools(tools: Tools, exclude_tools: Optional[List[str]] = None) -> Dict[str, Callable[..., Any]]:
71
117
  """
72
118
  Describe the tools available for the given Tools instance.
73
119
 
74
120
  Args:
75
121
  tools: The Tools instance to describe.
122
+ exclude_tools: List of tool names to exclude from the description.
76
123
 
77
124
  Returns:
78
125
  A dictionary mapping tool names to their descriptions.
79
126
  """
127
+ exclude_tools = exclude_tools or []
80
128
 
81
- return {
129
+ description = {
82
130
  # UI interaction
83
131
  "swipe": tools.swipe,
84
132
  "input_text": tools.input_text,
85
133
  "press_key": tools.press_key,
86
134
  "tap_by_index": tools.tap_by_index,
87
- # "tap_by_coordinates": tools_instance.tap_by_coordinates,
135
+ "drag": tools.drag,
88
136
  # App management
89
137
  "start_app": tools.start_app,
90
138
  "list_packages": tools.list_packages,
91
139
  # state management
92
- "extract": tools.extract,
93
140
  "remember": tools.remember,
94
141
  "complete": tools.complete,
95
142
  }
143
+
144
+ # Remove excluded tools
145
+ for tool_name in exclude_tools:
146
+ description.pop(tool_name, None)
147
+
148
+ return description