oagi 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oagi might be problematic. Click here for more details.

oagi/pil_image.py ADDED
@@ -0,0 +1,98 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) OpenAGI Foundation
3
+ # All rights reserved.
4
+ #
5
+ # This file is part of the official API project.
6
+ # Licensed under the MIT License.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ import io
10
+ from typing import Optional
11
+
12
+ import pyautogui
13
+ from PIL import Image as PILImageLib
14
+
15
+ from .types.models.image_config import ImageConfig
16
+
17
+
18
class PILImage:
    """PIL image wrapper with transformation capabilities.

    Holds a ``PIL.Image.Image`` together with the ``ImageConfig`` that controls
    how ``read()`` encodes it. The encoded bytes are produced on the first
    ``read()`` call and cached on the instance afterwards.
    """

    # Modes that Pillow's JPEG writer accepts as-is; every other mode has to be
    # converted before saving, otherwise ``save`` raises ``OSError``.
    _JPEG_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})
    # Modes that carry an alpha channel and must be flattened for JPEG.
    _ALPHA_MODES = frozenset({"RGBA", "LA", "PA"})

    def __init__(self, image: PILImageLib.Image, config: ImageConfig | None = None):
        """Initialize with a PIL image and optional config.

        Args:
            image: The PIL image to wrap.
            config: Encoding/resize settings; a default ``ImageConfig`` is
                created when omitted.
        """
        self.image = image
        self.config = config or ImageConfig()
        self._cached_bytes: Optional[bytes] = None

    @classmethod
    def from_file(cls, path: str, config: ImageConfig | None = None) -> "PILImage":
        """Create PILImage from file path.

        The pixel data is loaded eagerly so the underlying file handle is
        released before this method returns.
        """
        # Image.open() is lazy: without load() the file stays open until the
        # first operation that needs pixel data.
        with PILImageLib.open(path) as image:
            image.load()
        return cls(image, config)

    @classmethod
    def from_bytes(cls, data: bytes, config: ImageConfig | None = None) -> "PILImage":
        """Create PILImage from raw (encoded) image bytes."""
        image = PILImageLib.open(io.BytesIO(data))
        return cls(image, config)

    @classmethod
    def from_screenshot(cls, config: ImageConfig | None = None) -> "PILImage":
        """Create PILImage from a screenshot taken with pyautogui."""
        screenshot = pyautogui.screenshot()
        return cls(screenshot, config)

    def transform(self, config: ImageConfig) -> "PILImage":
        """Apply transformations (resize) based on config and return new PILImage.

        Only resizing happens here; format conversion is deferred to ``read()``
        of the returned object, which carries ``config``.
        """
        transformed = self._resize(self.image, config)
        return PILImage(transformed, config)

    def _resize(
        self, image: PILImageLib.Image, config: ImageConfig
    ) -> PILImageLib.Image:
        """Resize image to the exact dimensions requested by config.

        A missing (``None`` or ``0``) width or height falls back to the
        image's own dimension. When neither is set the image is returned
        unchanged.
        """
        if not (config.width or config.height):
            return image

        target_width = config.width or image.width
        target_height = config.height or image.height

        # Map resample string to PIL constant
        resample_map = {
            "NEAREST": PILImageLib.NEAREST,
            "BILINEAR": PILImageLib.BILINEAR,
            "BICUBIC": PILImageLib.BICUBIC,
            "LANCZOS": PILImageLib.LANCZOS,
        }
        resample = resample_map[config.resample]

        return image.resize((target_width, target_height), resample)

    def _prepare_for_jpeg(self, image: PILImageLib.Image) -> PILImageLib.Image:
        """Return an image whose mode the JPEG writer can save.

        Images with transparency are composited onto a white background;
        other unsupported modes (palette, integer, float, ...) are converted
        to RGB. Images already in a JPEG-compatible mode are returned as-is.
        """
        has_alpha = image.mode in self._ALPHA_MODES or (
            image.mode == "P" and "transparency" in image.info
        )
        if has_alpha:
            rgba = image.convert("RGBA")
            flattened = PILImageLib.new("RGB", rgba.size, (255, 255, 255))
            flattened.paste(rgba, mask=rgba.split()[3])
            return flattened
        if image.mode not in self._JPEG_MODES:
            return image.convert("RGB")
        return image

    def _convert_format(self, image: PILImageLib.Image) -> bytes:
        """Encode image to the configured format (PNG or JPEG).

        Raises:
            ValueError: If the configured format is neither "PNG" nor "JPEG".
        """
        buffer = io.BytesIO()
        fmt = self.config.format

        if fmt == "JPEG":
            self._prepare_for_jpeg(image).save(
                buffer, format=fmt, quality=self.config.quality
            )
        elif fmt == "PNG":
            image.save(buffer, format=fmt, optimize=self.config.optimize)
        else:
            # Returning empty bytes here would silently hand callers an
            # invalid image.
            raise ValueError(f"Unsupported image format: {fmt!r}")

        return buffer.getvalue()

    def read(self) -> bytes:
        """Read image as bytes with current config (implements Image protocol).

        The result is cached, so later changes to ``self.config`` or
        ``self.image`` do not affect what is returned.
        """
        if self._cached_bytes is None:
            self._cached_bytes = self._convert_format(self.image)
        return self._cached_bytes
@@ -10,10 +10,28 @@ import re
10
10
  import time
11
11
 
12
12
  import pyautogui
13
+ from pydantic import BaseModel, Field
13
14
 
14
15
  from .types import Action, ActionType
15
16
 
16
17
 
18
class PyautoguiConfig(BaseModel):
    """Configuration for PyautoguiActionHandler.

    All values are plain tunables read by the handler when it executes
    actions; none of them is validated against the screen or the OS.
    """

    drag_duration: float = Field(
        default=0.5, description="Duration for drag operations in seconds"
    )
    # The handler applies the sign itself (amount for "up", -amount for
    # "down"), so this is a magnitude rather than a signed delta.
    scroll_amount: int = Field(
        default=30,
        description=(
            "Amount to scroll per scroll action "
            "(magnitude; the direction comes from the action)"
        ),
    )
    # Passed straight to time.sleep(), which rejects negative values, so
    # reject them here at construction time instead.
    wait_duration: float = Field(
        default=1.0, ge=0, description="Duration for wait actions in seconds"
    )
    action_pause: float = Field(
        default=0.1, description="Pause between PyAutoGUI actions in seconds"
    )
33
+
34
+
17
35
  class PyautoguiActionHandler:
18
36
  """
19
37
  Handles actions to be executed using PyAutoGUI.
@@ -29,11 +47,13 @@ class PyautoguiActionHandler:
29
47
  actions (list[Action]): List of actions to be processed and executed.
30
48
  """
31
49
 
32
- def __init__(self):
50
+ def __init__(self, config: PyautoguiConfig | None = None):
51
+ # Use default config if none provided
52
+ self.config = config or PyautoguiConfig()
33
53
  # Get screen dimensions for coordinate denormalization
34
54
  self.screen_width, self.screen_height = pyautogui.size()
35
55
  # Set default delay between actions
36
- pyautogui.PAUSE = 0.1
56
+ pyautogui.PAUSE = self.config.action_pause
37
57
 
38
58
  def _denormalize_coords(self, x: float, y: float) -> tuple[int, int]:
39
59
  """Convert coordinates from 0-1000 range to actual screen coordinates."""
@@ -82,59 +102,70 @@ class PyautoguiActionHandler:
82
102
  keys = [key.strip() for key in args_str.split("+")]
83
103
  return keys
84
104
 
105
def _execute_single_action(self, action: Action) -> None:
    """Run one action a single time through PyAutoGUI.

    The action's argument string is parsed according to the action type and
    the matching PyAutoGUI call is issued. Unknown action types are only
    reported via ``print``; nothing is raised for them.
    """
    # Strip any parentheses surrounding the raw argument string.
    raw = action.argument.strip("()")
    kind = action.type

    if kind == ActionType.CLICK:
        px, py = self._parse_coords(raw)
        pyautogui.click(px, py)

    elif kind == ActionType.LEFT_DOUBLE:
        px, py = self._parse_coords(raw)
        pyautogui.doubleClick(px, py)

    elif kind == ActionType.RIGHT_SINGLE:
        px, py = self._parse_coords(raw)
        pyautogui.rightClick(px, py)

    elif kind == ActionType.DRAG:
        start_x, start_y, end_x, end_y = self._parse_drag_coords(raw)
        pyautogui.moveTo(start_x, start_y)
        pyautogui.dragTo(
            end_x, end_y, duration=self.config.drag_duration, button="left"
        )

    elif kind == ActionType.HOTKEY:
        combo = self._parse_hotkey(raw)
        pyautogui.hotkey(*combo)

    elif kind == ActionType.TYPE:
        # Surrounding single/double quotes are not part of the typed text.
        pyautogui.typewrite(raw.strip("\"'"))

    elif kind == ActionType.SCROLL:
        px, py, direction = self._parse_scroll(raw)
        pyautogui.moveTo(px, py)
        # The configured amount is a magnitude; any direction other than
        # "up" scrolls the opposite way.
        if direction == "up":
            clicks = self.config.scroll_amount
        else:
            clicks = -self.config.scroll_amount
        pyautogui.scroll(clicks)

    elif kind == ActionType.FINISH:
        # Task completion - nothing to execute.
        pass

    elif kind == ActionType.WAIT:
        time.sleep(self.config.wait_duration)

    elif kind == ActionType.CALL_USER:
        # Placeholder: only announces that the user is needed.
        print("User intervention requested")

    else:
        print(f"Unknown action type: {action.type}")
162
+
85
163
  def _execute_action(self, action: Action) -> None:
86
- """Execute a single action."""
164
+ """Execute an action, potentially multiple times."""
87
165
  count = action.count or 1
88
- arg = action.argument.strip("()") # Remove outer parentheses if present
89
166
 
90
167
  for _ in range(count):
91
- match action.type:
92
- case ActionType.CLICK:
93
- x, y = self._parse_coords(arg)
94
- pyautogui.click(x, y)
95
-
96
- case ActionType.LEFT_DOUBLE:
97
- x, y = self._parse_coords(arg)
98
- pyautogui.doubleClick(x, y)
99
-
100
- case ActionType.RIGHT_SINGLE:
101
- x, y = self._parse_coords(arg)
102
- pyautogui.rightClick(x, y)
103
-
104
- case ActionType.DRAG:
105
- x1, y1, x2, y2 = self._parse_drag_coords(arg)
106
- pyautogui.moveTo(x1, y1)
107
- pyautogui.dragTo(x2, y2, duration=0.5, button="left")
108
-
109
- case ActionType.HOTKEY:
110
- keys = self._parse_hotkey(arg)
111
- pyautogui.hotkey(*keys)
112
-
113
- case ActionType.TYPE:
114
- # Remove quotes if present
115
- text = arg.strip("\"'")
116
- pyautogui.typewrite(text)
117
-
118
- case ActionType.SCROLL:
119
- x, y, direction = self._parse_scroll(arg)
120
- pyautogui.moveTo(x, y)
121
- scroll_amount = 5 if direction == "up" else -5
122
- pyautogui.scroll(scroll_amount)
123
-
124
- case ActionType.FINISH:
125
- # Task completion - no action needed
126
- pass
127
-
128
- case ActionType.WAIT:
129
- # Wait for a short period
130
- time.sleep(1)
131
-
132
- case ActionType.CALL_USER:
133
- # Call user - implementation depends on requirements
134
- print("User intervention requested")
135
-
136
- case _:
137
- print(f"Unknown action type: {action.type}")
168
+ self._execute_single_action(action)
138
169
 
139
170
  def __call__(self, actions: list[Action]) -> None:
140
171
  """Execute the provided list of actions."""
oagi/screenshot_maker.py CHANGED
@@ -6,68 +6,36 @@
6
6
  # Licensed under the MIT License.
7
7
  # -----------------------------------------------------------------------------
8
8
 
9
- import io
10
9
  from typing import Optional
11
10
 
12
- import pyautogui
13
-
11
+ from .pil_image import PILImage
14
12
  from .types import Image
15
-
16
-
17
- class FileImage:
18
- def __init__(self, path: str):
19
- self.path = path
20
- with open(path, "rb") as f:
21
- self.data = f.read()
22
-
23
- def read(self) -> bytes:
24
- return self.data
25
-
26
-
27
- class MockImage:
28
- def read(self) -> bytes:
29
- return b"mock screenshot data"
30
-
31
-
32
- class ScreenshotImage:
33
- """Image class that wraps a pyautogui screenshot."""
34
-
35
- def __init__(self, screenshot):
36
- """Initialize with a PIL Image from pyautogui."""
37
- self.screenshot = screenshot
38
- self._cached_bytes: Optional[bytes] = None
39
-
40
- def read(self) -> bytes:
41
- """Convert the screenshot to bytes (PNG format)."""
42
- if self._cached_bytes is None:
43
- # Convert PIL Image to bytes
44
- buffer = io.BytesIO()
45
- self.screenshot.save(buffer, format="PNG")
46
- self._cached_bytes = buffer.getvalue()
47
- return self._cached_bytes
13
+ from .types.models.image_config import ImageConfig
48
14
 
49
15
 
50
16
  class ScreenshotMaker:
51
17
  """Takes screenshots using pyautogui."""
52
18
 
53
- def __init__(self):
54
- self._last_screenshot: Optional[ScreenshotImage] = None
19
+ def __init__(self, config: ImageConfig | None = None):
20
+ self.config = config or ImageConfig()
21
+ self._last_image: Optional[PILImage] = None
55
22
 
56
23
  def __call__(self) -> Image:
57
- """Take a screenshot and return it as an Image."""
58
- # Take a screenshot using pyautogui
59
- screenshot = pyautogui.screenshot()
24
+ """Take and process a screenshot."""
25
+ # Create PILImage from screenshot
26
+ pil_image = PILImage.from_screenshot()
60
27
 
61
- # Wrap it in our ScreenshotImage class
62
- screenshot_image = ScreenshotImage(screenshot)
28
+ # Apply transformation if config is set
29
+ if self.config:
30
+ pil_image = pil_image.transform(self.config)
63
31
 
64
- # Store as the last screenshot
65
- self._last_screenshot = screenshot_image
32
+ # Store as the last image
33
+ self._last_image = pil_image
66
34
 
67
- return screenshot_image
35
+ return pil_image
68
36
 
69
37
  def last_image(self) -> Image:
70
38
  """Return the last screenshot taken, or take a new one if none exists."""
71
- if self._last_screenshot is None:
39
+ if self._last_image is None:
72
40
  return self()
73
- return self._last_screenshot
41
+ return self._last_image
oagi/short_task.py CHANGED
@@ -16,6 +16,14 @@ logger = get_logger("short_task")
16
16
  class ShortTask(Task):
17
17
  """Task implementation with automatic mode for short-duration tasks."""
18
18
 
19
def __init__(
    self,
    api_key: str | None = None,
    base_url: str | None = None,
    model: str = "vision-model-v1",
):
    """Create a short task; every argument is forwarded to the base class."""
    super().__init__(
        api_key=api_key,
        base_url=base_url,
        model=model,
    )
26
+
19
27
  def auto_mode(
20
28
  self,
21
29
  task_desc: str,
oagi/single_step.py CHANGED
@@ -8,6 +8,7 @@
8
8
 
9
9
  from pathlib import Path
10
10
 
11
+ from .pil_image import PILImage
11
12
  from .task import Task
12
13
  from .types import Image, Step
13
14
 
@@ -59,12 +60,12 @@ def single_step(
59
60
  ... screenshot=image
60
61
  ... )
61
62
  """
62
- # Convert file paths to bytes
63
+ # Convert file paths to bytes using PILImage
63
64
  if isinstance(screenshot, (str, Path)):
64
65
  path = Path(screenshot) if isinstance(screenshot, str) else screenshot
65
66
  if path.exists():
66
- with open(path, "rb") as f:
67
- screenshot_bytes = f.read()
67
+ pil_image = PILImage.from_file(str(path))
68
+ screenshot_bytes = pil_image.read()
68
69
  else:
69
70
  raise FileNotFoundError(f"Screenshot file not found: {path}")
70
71
  elif isinstance(screenshot, bytes):
oagi/task.py CHANGED
@@ -16,13 +16,18 @@ logger = get_logger("task")
16
16
  class Task:
17
17
  """Base class for task automation with the OAGI API."""
18
18
 
19
- def __init__(self, api_key: str | None = None, base_url: str | None = None):
19
+ def __init__(
20
+ self,
21
+ api_key: str | None = None,
22
+ base_url: str | None = None,
23
+ model: str = "vision-model-v1",
24
+ ):
20
25
  self.client = SyncClient(base_url=base_url, api_key=api_key)
21
26
  self.api_key = self.client.api_key
22
27
  self.base_url = self.client.base_url
23
28
  self.task_id: str | None = None
24
29
  self.task_description: str | None = None
25
- self.model = "vision-model-v1" # default model
30
+ self.model = model
26
31
 
27
32
  def init_task(self, task_desc: str, max_steps: int = 5):
28
33
  """Initialize a new task with the given description."""
oagi/types/__init__.py CHANGED
@@ -7,8 +7,20 @@
7
7
  # -----------------------------------------------------------------------------
8
8
 
9
9
  from .action_handler import ActionHandler
10
+ from .async_action_handler import AsyncActionHandler
11
+ from .async_image_provider import AsyncImageProvider
10
12
  from .image import Image
11
13
  from .image_provider import ImageProvider
12
- from .models import Action, ActionType, Step
14
+ from .models import Action, ActionType, ImageConfig, Step
13
15
 
14
- __all__ = ["Action", "ActionType", "Image", "Step", "ActionHandler", "ImageProvider"]
16
+ __all__ = [
17
+ "Action",
18
+ "ActionType",
19
+ "Image",
20
+ "ImageConfig",
21
+ "Step",
22
+ "ActionHandler",
23
+ "AsyncActionHandler",
24
+ "ImageProvider",
25
+ "AsyncImageProvider",
26
+ ]
oagi/types/async_action_handler.py ADDED
@@ -0,0 +1,30 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) OpenAGI Foundation
3
+ # All rights reserved.
4
+ #
5
+ # This file is part of the official API project.
6
+ # Licensed under the MIT License.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ from typing import Protocol
10
+
11
+ from .models import Action
12
+
13
+
14
class AsyncActionHandler(Protocol):
    """Structural type for awaitable callables that execute actions."""

    async def __call__(self, actions: list[Action]) -> None:
        """
        Execute the given actions asynchronously.

        Implementations receive a list of `Action` objects and carry them
        out. Nothing is returned; the effect of the call is whatever the
        actions do to the system.

        Parameters:
            actions (list[Action]): The `Action` objects to execute. Each one
                describes the operation that should be applied during the
                call.

        Raises:
            RuntimeError: If an error occurs during the execution of the actions.
        """
        ...
oagi/types/async_image_provider.py ADDED
@@ -0,0 +1,37 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) OpenAGI Foundation
3
+ # All rights reserved.
4
+ #
5
+ # This file is part of the official API project.
6
+ # Licensed under the MIT License.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ from typing import Protocol
10
+
11
+ from .image import Image
12
+
13
+
14
class AsyncImageProvider(Protocol):
    """Structural type for awaitable image sources."""

    async def __call__(self) -> Image:
        """
        Provide an image asynchronously.

        Implementations capture, generate, or retrieve an image that can be
        used for task execution or analysis, and return it as an object that
        implements the Image protocol.

        Returns:
            Image: An object implementing the Image protocol that represents
                the captured or generated image.

        Raises:
            RuntimeError: If an error occurs during image capture or generation.
        """
        ...

    async def last_image(self) -> Image:
        """
        Return the most recently captured image asynchronously.

        Returns:
            Image: The last captured image.
        """
        ...
oagi/types/models/__init__.py CHANGED
@@ -7,6 +7,7 @@
7
7
  # -----------------------------------------------------------------------------
8
8
 
9
9
  from .action import Action, ActionType
10
+ from .image_config import ImageConfig
10
11
  from .step import Step
11
12
 
12
- __all__ = ["Action", "ActionType", "Step"]
13
+ __all__ = ["Action", "ActionType", "ImageConfig", "Step"]
oagi/types/models/image_config.py ADDED
@@ -0,0 +1,47 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) OpenAGI Foundation
3
+ # All rights reserved.
4
+ #
5
+ # This file is part of the official API project.
6
+ # Licensed under the MIT License.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ from typing import Literal
10
+
11
+ from pydantic import BaseModel, Field, field_validator
12
+
13
+
14
class ImageConfig(BaseModel):
    """Configuration for image capture and processing.

    Describes the output encoding (format, JPEG quality, PNG optimization)
    and the optional resize target (width, height, resampling filter).
    """

    format: Literal["PNG", "JPEG"] = Field(
        default="JPEG", description="Image format for encoding"
    )
    quality: int = Field(
        default=85,
        ge=1,
        le=100,
        description="JPEG quality (1-100, only applies to JPEG format)",
    )
    # Negative sizes can never be resized to, so they are rejected here
    # rather than failing later inside the resize call. 0 stays accepted
    # because the resize step treats a falsy dimension as "not specified".
    width: int | None = Field(
        default=1260,
        ge=0,
        description="Target width in pixels (will resize to exact size)",
    )
    height: int | None = Field(
        default=700,
        ge=0,
        description="Target height in pixels (will resize to exact size)",
    )
    optimize: bool = Field(
        default=False,
        description="Enable PNG optimization (only applies to PNG format)",
    )
    resample: Literal["NEAREST", "BILINEAR", "BICUBIC", "LANCZOS"] = Field(
        default="LANCZOS", description="Resampling filter for resizing"
    )

    @field_validator("quality")
    @classmethod
    def validate_quality(cls, v: int, info) -> int:
        """Normalize quality based on format.

        Quality only applies to JPEG. When the already-validated format is
        PNG, whatever value was supplied is replaced by the default of 85;
        otherwise the value is returned unchanged.
        """
        # info.data holds the fields validated before this one; format is
        # declared ahead of quality, so it is present when it was valid.
        if info.data.get("format") == "PNG":
            return 85
        return v