oagi-core 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oagi/client/sync.py CHANGED
@@ -9,28 +9,24 @@
9
9
  from functools import wraps
10
10
 
11
11
  import httpx
12
- from httpx import Response
12
+ from httpx import HTTPTransport
13
+ from openai import OpenAI
13
14
 
14
15
  from ..constants import (
15
- API_HEALTH_ENDPOINT,
16
16
  API_V1_FILE_UPLOAD_ENDPOINT,
17
17
  API_V1_GENERATE_ENDPOINT,
18
- API_V2_MESSAGE_ENDPOINT,
18
+ DEFAULT_MAX_RETRIES,
19
19
  HTTP_CLIENT_TIMEOUT,
20
20
  )
21
21
  from ..logging import get_logger
22
22
  from ..types import Image
23
- from ..types.models import GenerateResponse, LLMResponse, UploadFileResponse
23
+ from ..types.models import GenerateResponse, UploadFileResponse, Usage
24
+ from ..types.models.step import Step
24
25
  from .base import BaseClient
25
26
 
26
27
  logger = get_logger("sync_client")
27
28
 
28
29
 
29
- def _log_trace_id(response: Response):
30
- logger.error(f"Request Id: {response.headers.get('x-request-id', '')}")
31
- logger.error(f"Trace Id: {response.headers.get('x-trace-id', '')}")
32
-
33
-
34
30
  def log_trace_on_failure(func):
35
31
  """Decorator that logs trace ID when a method fails."""
36
32
 
@@ -41,7 +37,7 @@ def log_trace_on_failure(func):
41
37
  except Exception as e:
42
38
  # Try to get response from the exception if it has one
43
39
  if (response := getattr(e, "response", None)) is not None:
44
- _log_trace_id(response)
40
+ BaseClient._log_trace_id(response)
45
41
  raise
46
42
 
47
43
  return wrapper
@@ -50,113 +46,70 @@ def log_trace_on_failure(func):
50
46
  class SyncClient(BaseClient[httpx.Client]):
51
47
  """Synchronous HTTP client for the OAGI API."""
52
48
 
53
- def __init__(self, base_url: str | None = None, api_key: str | None = None):
54
- super().__init__(base_url, api_key)
55
- self.client = httpx.Client(base_url=self.base_url)
56
- self.upload_client = httpx.Client(timeout=HTTP_CLIENT_TIMEOUT)
49
+ def __init__(
50
+ self,
51
+ base_url: str | None = None,
52
+ api_key: str | None = None,
53
+ max_retries: int = DEFAULT_MAX_RETRIES,
54
+ ):
55
+ super().__init__(base_url, api_key, max_retries)
56
+
57
+ # OpenAI client for chat completions (with retries)
58
+ self.openai_client = OpenAI(
59
+ api_key=self.api_key,
60
+ base_url=f"{self.base_url}/v1",
61
+ max_retries=self.max_retries,
62
+ )
63
+
64
+ # httpx clients for S3 uploads and other endpoints (with retries)
65
+ transport = HTTPTransport(retries=self.max_retries)
66
+ self.http_client = httpx.Client(transport=transport, base_url=self.base_url)
67
+ self.upload_client = httpx.Client(
68
+ transport=transport, timeout=HTTP_CLIENT_TIMEOUT
69
+ )
70
+
57
71
  logger.info(f"SyncClient initialized with base_url: {self.base_url}")
58
72
 
59
73
  def __enter__(self):
60
74
  return self
61
75
 
62
76
  def __exit__(self, exc_type, exc_val, exc_tb):
63
- self.client.close()
64
- self.upload_client.close()
77
+ self.close()
65
78
 
66
79
  def close(self):
67
- """Close the underlying httpx clients."""
68
- self.client.close()
80
+ """Close the underlying clients."""
81
+ self.openai_client.close()
82
+ self.http_client.close()
69
83
  self.upload_client.close()
70
84
 
71
- @log_trace_on_failure
72
- def create_message(
85
+ def chat_completion(
73
86
  self,
74
87
  model: str,
75
- screenshot: bytes | None = None,
76
- screenshot_url: str | None = None,
77
- task_description: str | None = None,
78
- task_id: str | None = None,
79
- instruction: str | None = None,
80
- messages_history: list | None = None,
88
+ messages: list,
81
89
  temperature: float | None = None,
82
- api_version: str | None = None,
83
- ) -> LLMResponse | None:
90
+ task_id: str | None = None,
91
+ ) -> tuple[Step, str, Usage | None]:
84
92
  """
85
- Call the /v2/message endpoint to analyze task and screenshot
93
+ Call OpenAI-compatible /v1/chat/completions endpoint.
86
94
 
87
95
  Args:
88
- model: The model to use for task analysis
89
- screenshot: Screenshot image bytes (mutually exclusive with screenshot_url)
90
- screenshot_url: Direct URL to screenshot (mutually exclusive with screenshot)
91
- task_description: Description of the task (required for new sessions)
92
- task_id: Task ID for continuing existing task
93
- instruction: Additional instruction when continuing a session
94
- messages_history: OpenAI-compatible chat message history
95
- temperature: Sampling temperature (0.0-2.0) for LLM inference
96
- api_version: API version header
96
+ model: Model to use for inference
97
+ messages: Full message history (OpenAI-compatible format)
98
+ temperature: Sampling temperature (0.0-2.0)
99
+ task_id: Optional task ID for multi-turn conversations
97
100
 
98
101
  Returns:
99
- LLMResponse: The response from the API
100
-
101
- Raises:
102
- ValueError: If both or neither screenshot and screenshot_url are provided
103
- httpx.HTTPStatusError: For HTTP error responses
102
+ Tuple of (Step, raw_output, Usage)
103
+ - Step: Parsed actions and reasoning
104
+ - raw_output: Raw model output string (for message history)
105
+ - Usage: Token usage statistics (or None if not available)
104
106
  """
105
- # Validate that exactly one is provided
106
- if (screenshot is None) == (screenshot_url is None):
107
- raise ValueError(
108
- "Exactly one of 'screenshot' or 'screenshot_url' must be provided"
109
- )
110
-
111
- self._log_request_info(model, task_description, task_id)
112
-
113
- # Upload screenshot to S3 if bytes provided, otherwise use URL directly
114
- upload_file_response = None
115
- if screenshot is not None:
116
- upload_file_response = self.put_s3_presigned_url(screenshot, api_version)
117
-
118
- # Prepare message payload
119
- headers, payload = self._prepare_message_payload(
120
- model=model,
121
- upload_file_response=upload_file_response,
122
- task_description=task_description,
123
- task_id=task_id,
124
- instruction=instruction,
125
- messages_history=messages_history,
126
- temperature=temperature,
127
- api_version=api_version,
128
- screenshot_url=screenshot_url,
107
+ logger.info(f"Making chat completion request with model: {model}")
108
+ kwargs = self._build_chat_completion_kwargs(
109
+ model, messages, temperature, task_id
129
110
  )
130
-
131
- # Make request
132
- try:
133
- response = self.client.post(
134
- API_V2_MESSAGE_ENDPOINT,
135
- json=payload,
136
- headers=headers,
137
- timeout=self.timeout,
138
- )
139
- return self._process_response(response)
140
- except (httpx.TimeoutException, httpx.NetworkError) as e:
141
- self._handle_upload_http_errors(e)
142
-
143
- def health_check(self) -> dict:
144
- """
145
- Call the /health endpoint for health check
146
-
147
- Returns:
148
- dict: Health check response
149
- """
150
- logger.debug("Making health check request")
151
- try:
152
- response = self.client.get(API_HEALTH_ENDPOINT)
153
- response.raise_for_status()
154
- result = response.json()
155
- logger.debug("Health check successful")
156
- return result
157
- except httpx.HTTPStatusError as e:
158
- logger.warning(f"Health check failed: {e}")
159
- raise
111
+ response = self.openai_client.chat.completions.create(**kwargs)
112
+ return self._parse_chat_completion_response(response)
160
113
 
161
114
  def get_s3_presigned_url(
162
115
  self,
@@ -175,7 +128,7 @@ class SyncClient(BaseClient[httpx.Client]):
175
128
 
176
129
  try:
177
130
  headers = self._build_headers(api_version)
178
- response = self.client.get(
131
+ response = self.http_client.get(
179
132
  API_V1_FILE_UPLOAD_ENDPOINT, headers=headers, timeout=self.timeout
180
133
  )
181
134
  return self._process_upload_response(response)
@@ -295,7 +248,7 @@ class SyncClient(BaseClient[httpx.Client]):
295
248
 
296
249
  # Make request
297
250
  try:
298
- response = self.client.post(
251
+ response = self.http_client.post(
299
252
  API_V1_GENERATE_ENDPOINT,
300
253
  json=payload,
301
254
  headers=headers,
oagi/constants.py CHANGED
@@ -9,10 +9,8 @@
9
9
  # URLs & API Endpoints
10
10
  DEFAULT_BASE_URL = "https://api.agiopen.org"
11
11
  API_KEY_HELP_URL = "https://developer.agiopen.org/api-keys"
12
- API_V2_MESSAGE_ENDPOINT = "/v2/message"
13
12
  API_V1_FILE_UPLOAD_ENDPOINT = "/v1/file/upload"
14
13
  API_V1_GENERATE_ENDPOINT = "/v1/generate"
15
- API_HEALTH_ENDPOINT = "/health"
16
14
 
17
15
  # Model identifiers
18
16
  MODEL_ACTOR = "lux-actor-1"
@@ -28,6 +26,10 @@ DEFAULT_MAX_STEPS = 20
28
26
  DEFAULT_MAX_STEPS_THINKER = 100
29
27
  DEFAULT_MAX_STEPS_TASKER = 60
30
28
 
29
+ # Maximum allowed steps per model (hard limits)
30
+ MAX_STEPS_ACTOR = 30
31
+ MAX_STEPS_THINKER = 120
32
+
31
33
  # Reflection intervals
32
34
  DEFAULT_REFLECTION_INTERVAL = 4
33
35
  DEFAULT_REFLECTION_INTERVAL_TASKER = 20
@@ -41,3 +43,6 @@ DEFAULT_TEMPERATURE_LOW = 0.1
41
43
 
42
44
  # Timeout Values
43
45
  HTTP_CLIENT_TIMEOUT = 60
46
+
47
+ # Retry Configuration
48
+ DEFAULT_MAX_RETRIES = 2
oagi/handler/__init__.py CHANGED
@@ -5,28 +5,43 @@
5
5
  # This file is part of the official API project.
6
6
  # Licensed under the MIT License.
7
7
  # -----------------------------------------------------------------------------
8
- from oagi.handler.async_pyautogui_action_handler import AsyncPyautoguiActionHandler
9
- from oagi.handler.async_screenshot_maker import AsyncScreenshotMaker
10
- from oagi.handler.pil_image import PILImage
11
- from oagi.handler.pyautogui_action_handler import (
12
- PyautoguiActionHandler,
13
- PyautoguiConfig,
14
- )
15
- from oagi.handler.screenshot_maker import ScreenshotMaker
16
-
17
-
18
- def reset_handler(handler) -> None:
19
- """Reset handler state if supported.
20
-
21
- Uses duck-typing to check if the handler has a reset() method.
22
- This allows handlers to reset their internal state (e.g., capslock state)
23
- at the start of a new automation task.
24
-
25
- Args:
26
- handler: The action handler to reset
27
- """
28
- if hasattr(handler, "reset"):
29
- handler.reset()
8
+ import importlib
9
+ from typing import TYPE_CHECKING
10
+
11
+ from .utils import reset_handler
12
+
13
+ # Lazy imports for pyautogui-dependent modules to avoid import errors on headless systems
14
+ _LAZY_IMPORTS: dict[str, str] = {
15
+ "AsyncPyautoguiActionHandler": "oagi.handler.async_pyautogui_action_handler",
16
+ "AsyncScreenshotMaker": "oagi.handler.async_screenshot_maker",
17
+ "PILImage": "oagi.handler.pil_image",
18
+ "PyautoguiActionHandler": "oagi.handler.pyautogui_action_handler",
19
+ "PyautoguiConfig": "oagi.handler.pyautogui_action_handler",
20
+ "ScreenshotMaker": "oagi.handler.screenshot_maker",
21
+ }
22
+
23
+ if TYPE_CHECKING:
24
+ from oagi.handler.async_pyautogui_action_handler import AsyncPyautoguiActionHandler
25
+ from oagi.handler.async_screenshot_maker import AsyncScreenshotMaker
26
+ from oagi.handler.pil_image import PILImage
27
+ from oagi.handler.pyautogui_action_handler import (
28
+ PyautoguiActionHandler,
29
+ PyautoguiConfig,
30
+ )
31
+ from oagi.handler.screenshot_maker import ScreenshotMaker
32
+
33
+
34
def __getattr__(name: str):
    """Resolve lazily exported handler names on first attribute access.

    Looks ``name`` up in ``_LAZY_IMPORTS``; when present, imports the mapped
    module and returns the attribute of the same name from it.

    Args:
        name: Attribute requested on this package.

    Returns:
        The object called ``name`` from the module mapped in ``_LAZY_IMPORTS``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    module_path = _LAZY_IMPORTS.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # The import is deferred until here so that importing the package itself
    # does not pull in the pyautogui-dependent modules.
    return getattr(importlib.import_module(module_path), name)
40
+
41
+
42
def __dir__() -> list[str]:
    """List the package's public names, lazily imported ones included.

    Returns:
        Sorted, de-duplicated union of ``__all__`` and the keys of
        ``_LAZY_IMPORTS``.
    """
    public_names = {*__all__, *_LAZY_IMPORTS}
    return sorted(public_names)
30
45
 
31
46
 
32
47
  __all__ = [
oagi/handler/utils.py ADDED
@@ -0,0 +1,21 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) OpenAGI Foundation
3
+ # All rights reserved.
4
+ #
5
+ # This file is part of the official API project.
6
+ # Licensed under the MIT License.
7
+ # -----------------------------------------------------------------------------
8
+
9
+
10
def reset_handler(handler) -> None:
    """Call ``handler.reset()`` when the handler exposes such an attribute.

    The check is duck-typed: any object with a ``reset`` attribute has it
    invoked with no arguments; objects without one are left untouched.

    Args:
        handler: The action handler whose state should be reset.
    """
    if not hasattr(handler, "reset"):
        # Handlers without a reset hook are simply skipped.
        return
    handler.reset()
oagi/task/__init__.py CHANGED
@@ -6,16 +6,30 @@
6
6
  # Licensed under the MIT License.
7
7
  # -----------------------------------------------------------------------------
8
8
 
9
- from .async_ import AsyncActor, AsyncTask
10
- from .async_short import AsyncShortTask
11
- from .short import ShortTask
12
- from .sync import Actor, Task
9
+ """Deprecated: Use oagi.actor instead. This module will be removed in a future version."""
10
+
11
+ import warnings
12
+
13
+ from oagi.actor import (
14
+ Actor,
15
+ AsyncActor,
16
+ AsyncShortTask,
17
+ AsyncTask,
18
+ ShortTask,
19
+ Task,
20
+ )
21
+
22
+ warnings.warn(
23
+ "oagi.task is deprecated, use oagi.actor instead",
24
+ DeprecationWarning,
25
+ stacklevel=2,
26
+ )
13
27
 
14
28
  __all__ = [
15
29
  "Actor",
16
30
  "AsyncActor",
17
- "Task", # Deprecated: Use Actor instead
18
- "AsyncTask", # Deprecated: Use AsyncActor instead
19
- "ShortTask", # Deprecated
20
- "AsyncShortTask", # Deprecated
31
+ "Task",
32
+ "AsyncTask",
33
+ "ShortTask",
34
+ "AsyncShortTask",
21
35
  ]
@@ -17,7 +17,6 @@ from .client import (
17
17
  ErrorDetail,
18
18
  ErrorResponse,
19
19
  GenerateResponse,
20
- LLMResponse,
21
20
  UploadFileResponse,
22
21
  Usage,
23
22
  )
@@ -31,7 +30,6 @@ __all__ = [
31
30
  "ErrorResponse",
32
31
  "GenerateResponse",
33
32
  "ImageConfig",
34
- "LLMResponse",
35
33
  "Step",
36
34
  "UploadFileResponse",
37
35
  "Usage",
@@ -81,4 +81,7 @@ def parse_scroll(args_str: str) -> tuple[int, int, str] | None:
81
81
  match = re.match(r"(\d+),\s*(\d+),\s*(\w+)", args_str)
82
82
  if not match:
83
83
  return None
84
- return int(match.group(1)), int(match.group(2)), match.group(3).lower()
84
+ direction = match.group(3).lower()
85
+ if direction not in ("up", "down"):
86
+ return None
87
+ return int(match.group(1)), int(match.group(2)), direction
@@ -8,8 +8,6 @@
8
8
 
9
9
  from pydantic import BaseModel, Field
10
10
 
11
- from .action import Action
12
-
13
11
 
14
12
  class Usage(BaseModel):
15
13
  prompt_tokens: int
@@ -30,21 +28,6 @@ class ErrorResponse(BaseModel):
30
28
  error: ErrorDetail | None
31
29
 
32
30
 
33
- class LLMResponse(BaseModel):
34
- id: str
35
- task_id: str
36
- object: str = "task.completion"
37
- created: int
38
- model: str
39
- task_description: str
40
- is_complete: bool
41
- actions: list[Action]
42
- reason: str | None = None
43
- usage: Usage
44
- error: ErrorDetail | None = None
45
- raw_output: str | None = None
46
-
47
-
48
31
  class UploadFileResponse(BaseModel):
49
32
  """Response from S3 presigned URL upload."""
50
33
 
@@ -66,3 +49,4 @@ class GenerateResponse(BaseModel):
66
49
  deprecated=True,
67
50
  description="This field is deprecated",
68
51
  )
52
+ request_id: str | None = None
@@ -35,6 +35,7 @@ class StepEvent(BaseEvent):
35
35
  step_num: int
36
36
  image: bytes | str
37
37
  step: Step
38
+ task_id: str | None = None
38
39
 
39
40
 
40
41
  class ActionEvent(BaseEvent):
@@ -68,6 +69,7 @@ class PlanEvent(BaseEvent):
68
69
  image: bytes | str | None = None
69
70
  reasoning: str
70
71
  result: str | None = None
72
+ request_id: str | None = None
71
73
 
72
74
 
73
75
  ObserverEvent = ImageEvent | StepEvent | ActionEvent | LogEvent | SplitEvent | PlanEvent
oagi/utils/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) OpenAGI Foundation
3
+ # All rights reserved.
4
+ #
5
+ # This file is part of the official API project.
6
+ # Licensed under the MIT License.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ from .output_parser import parse_raw_output
10
+ from .prompt_builder import build_prompt
11
+
12
+ __all__ = ["build_prompt", "parse_raw_output"]
@@ -0,0 +1,166 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) OpenAGI Foundation
3
+ # All rights reserved.
4
+ #
5
+ # This file is part of the official API project.
6
+ # Licensed under the MIT License.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ import re
10
+
11
+ from ..types.models.action import Action, ActionType
12
+ from ..types.models.step import Step
13
+
14
+
15
def parse_raw_output(raw_output: str) -> Step:
    """Convert raw LLM text into a ``Step`` with reasoning and actions.

    The text is expected to look like:
        <|think_start|> reasoning text <|think_end|>
        <|action_start|> action1(args) & action2(args) & ... <|action_end|>

    Args:
        raw_output: Raw text output from the LLM.

    Returns:
        Step whose ``reason`` is the stripped think section ("" when absent),
        whose ``actions`` are the successfully parsed actions in order, and
        whose ``stop`` is True when any parsed action is of type FINISH.
    """
    # Reasoning section; DOTALL lets it span several lines.
    think_match = re.search(
        r"<\|think_start\|>(.*?)<\|think_end\|>", raw_output, re.DOTALL
    )
    reason = "" if not think_match else think_match.group(1).strip()

    # Action section; without it the step carries no actions.
    action_match = re.search(
        r"<\|action_start\|>(.*?)<\|action_end\|>", raw_output, re.DOTALL
    )
    if not action_match:
        return Step(reason=reason, actions=[], stop=False)

    block = action_match.group(1).strip()
    candidates = [_parse_action(text.strip()) for text in _split_actions(block)]
    # Entries that failed to parse come back as None and are dropped.
    actions: list[Action] = [action for action in candidates if action]
    stop = any(action.type == ActionType.FINISH for action in actions)

    return Step(reason=reason, actions=actions, stop=stop)
52
+
53
+
54
+ def _split_actions(action_block: str) -> list[str]:
55
+ """Split action block by & separator, but only when & is outside parentheses.
56
+
57
+ Note: This parser does NOT handle '&' inside quoted strings.
58
+ E.g., type("a&b") would incorrectly split. The LLM should avoid
59
+ this pattern by using alternative escape sequences.
60
+
61
+ Args:
62
+ action_block: String containing one or more actions separated by &
63
+
64
+ Returns:
65
+ List of individual action strings
66
+ """
67
+ actions: list[str] = []
68
+ current_action: list[str] = []
69
+ paren_level = 0
70
+
71
+ for char in action_block:
72
+ if char == "(":
73
+ paren_level += 1
74
+ current_action.append(char)
75
+ elif char == ")":
76
+ paren_level -= 1
77
+ current_action.append(char)
78
+ elif char == "&" and paren_level == 0:
79
+ action_str = "".join(current_action).strip()
80
+ if action_str:
81
+ actions.append(action_str)
82
+ current_action = []
83
+ else:
84
+ current_action.append(char)
85
+
86
+ # Add the last action
87
+ action_str = "".join(current_action).strip()
88
+ if action_str:
89
+ actions.append(action_str)
90
+
91
+ return actions
92
+
93
+
94
def _parse_action(action_text: str) -> Action | None:
    """Parse one action string into an ``Action`` object.

    Expected formats:
        - click(x, y)                 # left-click at position
        - left_double(x, y)           # left-double-click at position
        - left_triple(x, y)           # left-triple-click at position
        - right_single(x, y)          # right-click at position
        - drag(x1, y1, x2, y2)        # drag from (x1, y1) to (x2, y2)
        - hotkey(key, c)              # press key c times
        - type(text)                  # type text string
        - scroll(x, y, direction, c)  # scroll at position
        - wait()                      # wait for a while
        - finish()                    # indicate task is finished

    For ``hotkey`` and ``scroll`` the trailing repeat count is moved out of
    the argument string into ``count``; a count that is not an integer falls
    back to 1. All other actions keep their argument string and a count of 1.

    Args:
        action_text: String representation of a single action.

    Returns:
        Action object, or None if the text is not of the form ``name(args)``
        or ``name`` is not a valid ``ActionType`` value.
    """
    # DOTALL so that arguments spanning several lines (e.g. multi-line text
    # passed to type(...)) still match instead of the action being dropped.
    match = re.match(r"(\w+)\((.*)\)", action_text.strip(), re.DOTALL)
    if not match:
        return None

    action_type = match.group(1).lower()
    arguments = match.group(2).strip()

    # Validate and map action type to enum
    try:
        action_enum = ActionType(action_type)
    except ValueError:
        return None

    # Default repeat count; only hotkey and scroll can override it.
    count = 1

    if action_enum == ActionType.HOTKEY:
        # hotkey(key, c): split on the LAST comma so the key part is kept whole.
        parts = arguments.rsplit(",", 1)
        if len(parts) >= 2 and parts[1].strip():
            key = parts[0].strip()
            try:
                count = int(parts[1].strip())
            except ValueError:
                count = 1
        else:
            # No count given (or an empty one): the whole argument is the key.
            key = arguments.strip()
        arguments = key

    elif action_enum == ActionType.SCROLL:
        # scroll(x, y, direction, c): with fewer than four fields the
        # arguments are left exactly as written.
        parts = arguments.split(",")
        if len(parts) >= 4:
            x = parts[0].strip()
            y = parts[1].strip()
            direction = parts[2].strip()
            try:
                count = int(parts[3].strip())
            except ValueError:
                count = 1
            # Reconstruct arguments without count
            arguments = f"{x},{y},{direction}"

    return Action(type=action_enum, argument=arguments, count=count)
@@ -0,0 +1,44 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Copyright (c) OpenAGI Foundation
3
+ # All rights reserved.
4
+ #
5
+ # This file is part of the official API project.
6
+ # Licensed under the MIT License.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ instruction_template = """You are a Desktop Agent completing computer use tasks from a user instruction.
10
+
11
+ Every step, you will look at the screenshot and output the desired actions in a format as:
12
+
13
+ <|think_start|> brief description of your intent and reasoning <|think_end|>
14
+ <|action_start|> one of the allowed actions as below <|action_end|>
15
+
16
+ In the action field, you have the following action formats:
17
+ 1. click(x, y) # left-click at the position (x, y), where x and y are integers normalized between 0 and 1000
18
+ 2. left_double(x, y) # left-double-click at the position (x, y), where x and y are integers normalized between 0 and 1000
19
+ 3. left_triple(x, y) # left-triple-click at the position (x, y), where x and y are integers normalized between 0 and 1000
20
+ 4. right_single(x, y) # right-click at the position (x, y), where x and y are integers normalized between 0 and 1000
21
+ 5. drag(x1, y1, x2, y2) # drag the mouse from (x1, y1) to (x2, y2) to select or move contents, where x1, y1, x2, y2 are integers normalized between 0 and 1000
22
+ 6. hotkey(key, c) # press the key for c times
23
+ 7. type(text) # type a text string on the keyboard
24
+ 8. scroll(x, y, direction, c) # scroll the mouse at position (x, y) in the direction of up or down for c times, where x and y are integers normalized between 0 and 1000
25
+ 9. wait() # wait for a while
26
+ 10. finish() # indicate the task is finished
27
+
28
+ Directly output the text beginning with <|think_start|>, no additional text is needed for this scenario.
29
+
30
+ The user instruction is:
31
+ {instruction}
32
+ """
33
+
34
+
35
def build_prompt(task_description: str) -> str:
    """Render the OAGI instruction prompt for a given task.

    Args:
        task_description: Text substituted for the ``{instruction}``
            placeholder of ``instruction_template``.

    Returns:
        The instruction template with the task description filled in.
    """
    substitutions = {"instruction": task_description}
    return instruction_template.format_map(substitutions)