oagi-core 0.10.3__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. oagi/__init__.py +1 -3
  2. oagi/actor/__init__.py +21 -0
  3. oagi/{task → actor}/async_.py +23 -7
  4. oagi/{task → actor}/async_short.py +1 -1
  5. oagi/actor/base.py +222 -0
  6. oagi/{task → actor}/short.py +1 -1
  7. oagi/{task → actor}/sync.py +21 -5
  8. oagi/agent/default.py +5 -0
  9. oagi/agent/factories.py +75 -3
  10. oagi/agent/observer/exporters.py +6 -0
  11. oagi/agent/observer/report_template.html +19 -0
  12. oagi/agent/tasker/planner.py +31 -19
  13. oagi/agent/tasker/taskee_agent.py +26 -7
  14. oagi/agent/tasker/tasker_agent.py +4 -0
  15. oagi/cli/agent.py +54 -30
  16. oagi/client/async_.py +54 -96
  17. oagi/client/base.py +81 -133
  18. oagi/client/sync.py +52 -99
  19. oagi/constants.py +7 -2
  20. oagi/handler/__init__.py +16 -0
  21. oagi/handler/_macos.py +137 -0
  22. oagi/handler/_windows.py +101 -0
  23. oagi/handler/async_pyautogui_action_handler.py +8 -0
  24. oagi/handler/capslock_manager.py +55 -0
  25. oagi/handler/pyautogui_action_handler.py +21 -39
  26. oagi/server/session_store.py +3 -3
  27. oagi/server/socketio_server.py +4 -4
  28. oagi/task/__init__.py +22 -8
  29. oagi/types/__init__.py +2 -1
  30. oagi/types/models/__init__.py +0 -2
  31. oagi/types/models/action.py +4 -1
  32. oagi/types/models/client.py +1 -17
  33. oagi/types/step_observer.py +2 -0
  34. oagi/types/url.py +25 -0
  35. oagi/utils/__init__.py +12 -0
  36. oagi/utils/output_parser.py +166 -0
  37. oagi/utils/prompt_builder.py +44 -0
  38. {oagi_core-0.10.3.dist-info → oagi_core-0.12.0.dist-info}/METADATA +90 -10
  39. oagi_core-0.12.0.dist-info/RECORD +76 -0
  40. oagi/task/base.py +0 -158
  41. oagi_core-0.10.3.dist-info/RECORD +0 -70
  42. {oagi_core-0.10.3.dist-info → oagi_core-0.12.0.dist-info}/WHEEL +0 -0
  43. {oagi_core-0.10.3.dist-info → oagi_core-0.12.0.dist-info}/entry_points.txt +0 -0
  44. {oagi_core-0.10.3.dist-info → oagi_core-0.12.0.dist-info}/licenses/LICENSE +0 -0
oagi/agent/tasker/planner.py CHANGED
@@ -11,7 +11,7 @@ from typing import Any
 
 from ...client import AsyncClient
 from ...constants import DEFAULT_REFLECTION_INTERVAL
-from ...types import URL, Image
+from ...types import URL, Image, extract_uuid_from_url
 from .memory import PlannerMemory
 from .models import Action, PlannerOutput, ReflectionOutput
 
@@ -122,7 +122,7 @@ class Planner:
         screenshot: Image | URL | None = None,
         memory: PlannerMemory | None = None,
         todo_index: int | None = None,
-    ) -> PlannerOutput:
+    ) -> tuple[PlannerOutput, str | None]:
         """Generate initial plan for a todo.
 
         Args:
@@ -133,16 +133,21 @@
             todo_index: Optional todo index for formatting internal context
 
         Returns:
-            PlannerOutput with instruction, reasoning, and optional subtodos
+            Tuple of (PlannerOutput, request_id) where request_id is from API response
         """
         # Ensure we have a client
         client = self._ensure_client()
 
-        # Upload screenshot if provided
+        # Get screenshot UUID - either extract from URL or upload
         screenshot_uuid = None
         if screenshot:
-            upload_response = await client.put_s3_presigned_url(screenshot)
-            screenshot_uuid = upload_response.uuid
+            # Check if screenshot is already a URL (already uploaded to S3)
+            if isinstance(screenshot, str):
+                screenshot_uuid = extract_uuid_from_url(screenshot)
+            # If not a URL or UUID extraction failed, upload the image
+            if not screenshot_uuid:
+                upload_response = await client.put_s3_presigned_url(screenshot)
+                screenshot_uuid = upload_response.uuid
 
         # Extract memory data if provided
         (
@@ -165,8 +170,8 @@
             current_screenshot=screenshot_uuid,
         )
 
-        # Parse response
-        return self._parse_planner_output(response.response)
+        # Parse response and return with request_id
+        return self._parse_planner_output(response.response), response.request_id
 
     async def reflect(
         self,
@@ -177,7 +182,7 @@
         todo_index: int | None = None,
         current_instruction: str | None = None,
         reflection_interval: int = DEFAULT_REFLECTION_INTERVAL,
-    ) -> ReflectionOutput:
+    ) -> tuple[ReflectionOutput, str | None]:
         """Reflect on recent actions and progress.
 
         Args:
@@ -190,16 +195,21 @@
             reflection_interval: Window size for recent actions/screenshots
 
         Returns:
-            ReflectionOutput with continuation decision and reasoning
+            Tuple of (ReflectionOutput, request_id) where request_id is from API response
         """
         # Ensure we have a client
         client = self._ensure_client()
 
-        # Upload screenshot if provided
+        # Get screenshot UUID - either extract from URL or upload
         result_screenshot_uuid = None
         if screenshot:
-            upload_response = await client.put_s3_presigned_url(screenshot)
-            result_screenshot_uuid = upload_response.uuid
+            # Check if screenshot is already a URL (already uploaded to S3)
+            if isinstance(screenshot, str):
+                result_screenshot_uuid = extract_uuid_from_url(screenshot)
+            # If not a URL or UUID extraction failed, upload the image
+            if not result_screenshot_uuid:
+                upload_response = await client.put_s3_presigned_url(screenshot)
+                result_screenshot_uuid = upload_response.uuid
 
         # Extract memory data if provided
         (
@@ -250,8 +260,8 @@
             prior_notes=prior_notes,
         )
 
-        # Parse response
-        return self._parse_reflection_output(response.response)
+        # Parse response and return with request_id
+        return self._parse_reflection_output(response.response), response.request_id
 
     async def summarize(
         self,
@@ -259,7 +269,7 @@
         context: dict[str, Any],
         memory: PlannerMemory | None = None,
         todo_index: int | None = None,
-    ) -> str:
+    ) -> tuple[str, str | None]:
         """Generate execution summary.
 
         Args:
@@ -269,7 +279,7 @@
             todo_index: Optional todo index for formatting internal context
 
         Returns:
-            String summary of the execution
+            Tuple of (summary string, request_id) where request_id is from API response
         """
         # Ensure we have a client
         client = self._ensure_client()
@@ -304,9 +314,11 @@
         # Parse response and extract summary
         try:
             result = json.loads(response.response)
-            return result.get("task_summary", response.response)
+            summary = result.get("task_summary", response.response)
         except json.JSONDecodeError:
-            return response.response
+            summary = response.response
+
+        return summary, response.request_id
 
     def _format_execution_notes(self, context: dict[str, Any]) -> str:
         """Format execution history notes.
oagi/agent/tasker/taskee_agent.py CHANGED
@@ -19,6 +19,7 @@ from oagi.constants import (
     DEFAULT_TEMPERATURE,
     MODEL_ACTOR,
 )
+from oagi.handler import reset_handler
 from oagi.types import (
     URL,
     ActionEvent,
@@ -28,6 +29,7 @@ from oagi.types import (
     Image,
     PlanEvent,
     StepEvent,
+    extract_uuid_from_url,
 )
 
 from ..protocol import AsyncAgent
@@ -121,6 +123,9 @@ class TaskeeAgent(AsyncAgent):
         Returns:
             True if successful, False otherwise
         """
+        # Reset handler state at todo execution start
+        reset_handler(action_handler)
+
         self.current_todo = instruction
         self.actions = []
         self.total_actions = 0
@@ -195,7 +200,7 @@
         context = self._get_context()
 
         # Generate plan using LLM planner
-        plan_output = await self.planner.initial_plan(
+        plan_output, request_id = await self.planner.initial_plan(
             self.current_todo,
             context,
             screenshot,
@@ -219,6 +224,7 @@
                 image=_serialize_image(screenshot),
                 reasoning=plan_output.reasoning,
                 result=plan_output.instruction,
+                request_id=request_id,
             )
         )
 
@@ -256,11 +262,21 @@
         # Capture screenshot
         screenshot = await image_provider()
 
-        # Upload screenshot first to get UUID (avoids re-upload in actor.step)
+        # Get screenshot UUID - either extract from URL or upload
         try:
-            upload_response = await client.put_s3_presigned_url(screenshot)
-            screenshot_uuid = upload_response.uuid
-            screenshot_url = upload_response.download_url
+            screenshot_uuid = None
+            screenshot_url = None
+
+            # Check if screenshot is already a URL (from SocketIOImageProvider)
+            if isinstance(screenshot, str):
+                screenshot_uuid = extract_uuid_from_url(screenshot)
+                screenshot_url = screenshot
+
+            # If not a URL or UUID extraction failed, upload the image
+            if not screenshot_uuid:
+                upload_response = await client.put_s3_presigned_url(screenshot)
+                screenshot_uuid = upload_response.uuid
+                screenshot_url = upload_response.download_url
         except Exception as e:
             logger.error(f"Error uploading screenshot: {e}")
             self._record_action(
@@ -294,6 +310,7 @@
                 step_num=self.total_actions + 1,
                 image=_serialize_image(screenshot),
                 step=step,
+                task_id=self.actor.task_id,
             )
         )
 
@@ -378,7 +395,7 @@
         recent_actions = self.actions[-self.since_reflection :]
 
         # Reflect using planner
-        reflection = await self.planner.reflect(
+        reflection, request_id = await self.planner.reflect(
             recent_actions,
             context,
             screenshot,
@@ -409,6 +426,7 @@
                 image=_serialize_image(screenshot),
                 reasoning=reflection.reasoning,
                 result=decision,
+                request_id=request_id,
             )
         )
 
@@ -441,7 +459,7 @@
         context = self._get_context()
         context["current_todo"] = self.current_todo
 
-        summary = await self.planner.summarize(
+        summary, request_id = await self.planner.summarize(
             self.actions,
             context,
             memory=self.external_memory,
@@ -463,6 +481,7 @@
                 image=None,
                 reasoning=summary,
                 result=None,
+                request_id=request_id,
             )
         )
 
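
Note (not part of the diff): TaskeeAgent now unpacks the planner's `(output, request_id)` tuple and forwards `request_id` into the observer events it emits. A hedged sketch of that wiring is below; `PlanEvent` and its field names come from the hunks above, but the planner/observer objects, the `initial_plan` call signature, the observer invocation, and `image=None` are assumptions for the sake of a self-contained example.

```python
# Illustrative sketch only, not code from the package.
from oagi.types import PlanEvent


async def plan_and_notify(planner, observer, todo: str, context: dict, screenshot):
    # The planner now returns the request_id of the underlying API call.
    plan_output, request_id = await planner.initial_plan(todo, context, screenshot)
    await observer(
        PlanEvent(
            step_num=1,
            image=None,
            reasoning=plan_output.reasoning,
            result=plan_output.instruction,
            request_id=request_id,  # new field: correlates the event with the API request
        )
    )
    return plan_output, request_id
```
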
oagi/agent/tasker/tasker_agent.py CHANGED
@@ -16,6 +16,7 @@ from oagi.constants import (
     DEFAULT_TEMPERATURE,
     MODEL_ACTOR,
 )
+from oagi.handler import reset_handler
 from oagi.types import AsyncActionHandler, AsyncImageProvider, AsyncObserver, SplitEvent
 
 from ..protocol import AsyncAgent
@@ -112,6 +113,9 @@ class TaskerAgent(AsyncAgent):
         Returns:
             True if all todos completed successfully, False otherwise
         """
+        # Reset handler state at automation start
+        reset_handler(action_handler)
+
         overall_success = True
 
         # Execute todos until none remain
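
Note (not part of the diff): both TaskerAgent and TaskeeAgent now call `reset_handler` on the action handler before executing any actions. A minimal sketch of the same pattern for a standalone caller; `reset_handler` comes from the hunks above, while the handler class's import path and the agent's run method are assumptions.

```python
# Illustrative sketch only, not code from the package.
from oagi.handler import reset_handler
from oagi.handler.async_pyautogui_action_handler import AsyncPyautoguiActionHandler


async def run(agent, image_provider):
    action_handler = AsyncPyautoguiActionHandler()
    # Clear any leftover handler state before the first action is executed,
    # mirroring the reset added at the start of a TaskerAgent/TaskeeAgent run.
    reset_handler(action_handler)
    return await agent.execute(action_handler, image_provider)
```
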
oagi/cli/agent.py CHANGED
@@ -17,12 +17,9 @@ from oagi.agent.observer import AsyncAgentObserver
 from oagi.constants import (
     API_KEY_HELP_URL,
     DEFAULT_BASE_URL,
-    DEFAULT_MAX_STEPS,
     DEFAULT_MAX_STEPS_THINKER,
     DEFAULT_STEP_DELAY,
-    DEFAULT_TEMPERATURE,
     MODE_ACTOR,
-    MODEL_ACTOR,
     MODEL_THINKER,
 )
 from oagi.exceptions import check_optional_dependency
@@ -40,22 +37,30 @@ def add_agent_parser(subparsers: argparse._SubParsersAction) -> None:
         "run", help="Run an agent with the given instruction"
     )
     run_parser.add_argument(
-        "instruction", type=str, help="Task instruction for the agent to execute"
+        "instruction",
+        type=str,
+        nargs="?",
+        default="",
+        help="Task instruction for the agent to execute (optional for pre-configured modes)",
     )
     run_parser.add_argument(
-        "--model", type=str, help=f"Model to use (default: {MODEL_ACTOR})"
+        "--model", type=str, help="Model to use (default: determined by mode)"
     )
     run_parser.add_argument(
-        "--max-steps", type=int, help="Maximum number of steps (default: 20)"
+        "--max-steps",
+        type=int,
+        help="Maximum number of steps (default: determined by mode)",
     )
     run_parser.add_argument(
-        "--temperature", type=float, help="Sampling temperature (default: 0.5)"
+        "--temperature",
+        type=float,
+        help="Sampling temperature (default: determined by mode)",
     )
     run_parser.add_argument(
         "--mode",
         type=str,
         default=MODE_ACTOR,
-        help=f"Agent mode to use (default: {MODE_ACTOR}). Available modes: actor, planner",
+        help=f"Agent mode to use (default: {MODE_ACTOR}). Use 'oagi agent modes' to list available modes",
     )
     run_parser.add_argument(
         "--oagi-api-key", type=str, help="OAGI API key (default: OAGI_API_KEY env var)"
@@ -82,6 +87,9 @@ def add_agent_parser(subparsers: argparse._SubParsersAction) -> None:
         help=f"Delay in seconds after each step before next screenshot (default: {DEFAULT_STEP_DELAY})",
     )
 
+    # agent modes command
+    agent_subparsers.add_parser("modes", help="List available agent modes")
+
     # agent permission command
     agent_subparsers.add_parser(
         "permission",
@@ -92,10 +100,22 @@ def add_agent_parser(subparsers: argparse._SubParsersAction) -> None:
 def handle_agent_command(args: argparse.Namespace) -> None:
     if args.agent_command == "run":
         run_agent(args)
+    elif args.agent_command == "modes":
+        list_modes()
     elif args.agent_command == "permission":
         check_permissions()
 
 
+def list_modes() -> None:
+    """List all available agent modes."""
+    from oagi.agent import list_agent_modes  # noqa: PLC0415
+
+    modes = list_agent_modes()
+    print("Available agent modes:")
+    for mode in modes:
+        print(f" - {mode}")
+
+
 def check_permissions() -> None:
     """Check and request macOS permissions for screen recording and accessibility.
 
@@ -207,14 +227,6 @@ def run_agent(args: argparse.Namespace) -> None:
         sys.exit(1)
 
     base_url = args.oagi_base_url or os.getenv("OAGI_BASE_URL", DEFAULT_BASE_URL)
-    model = args.model or MODEL_ACTOR
-    default_max_steps = (
-        DEFAULT_MAX_STEPS_THINKER if model == MODEL_THINKER else DEFAULT_MAX_STEPS
-    )
-    max_steps = args.max_steps or default_max_steps
-    temperature = (
-        args.temperature if args.temperature is not None else DEFAULT_TEMPERATURE
-    )
     mode = args.mode or MODE_ACTOR
     step_delay = args.step_delay if args.step_delay is not None else DEFAULT_STEP_DELAY
     export_format = args.export
@@ -233,26 +245,38 @@
 
     observer = CombinedObserver()
 
-    # Create agent with observer
-    agent = create_agent(
-        mode=mode,
-        api_key=api_key,
-        base_url=base_url,
-        model=model,
-        max_steps=max_steps,
-        temperature=temperature,
-        step_observer=observer,
-        step_delay=step_delay,
-    )
+    # Build agent kwargs - only pass explicitly provided values, let factory use defaults
+    agent_kwargs = {
+        "mode": mode,
+        "api_key": api_key,
+        "base_url": base_url,
+        "step_observer": observer,
+        "step_delay": step_delay,
+    }
+    if args.model:
+        agent_kwargs["model"] = args.model
+        # If thinker model specified without max_steps, use thinker's default
+        if args.model == MODEL_THINKER and not args.max_steps:
+            agent_kwargs["max_steps"] = DEFAULT_MAX_STEPS_THINKER
+    if args.max_steps:
+        agent_kwargs["max_steps"] = args.max_steps
+    if args.temperature is not None:
+        agent_kwargs["temperature"] = args.temperature
+
+    # Create agent
+    agent = create_agent(**agent_kwargs)
 
     # Create handlers
     action_handler = AsyncPyautoguiActionHandler()
     image_provider = AsyncScreenshotMaker()
 
-    print(f"Starting agent with instruction: {args.instruction}")
+    if args.instruction:
+        print(f"Starting agent with instruction: {args.instruction}")
+    else:
+        print(f"Starting agent with mode: {mode} (using pre-configured instruction)")
     print(
-        f"Mode: {mode}, Model: {model}, Max steps: {max_steps}, "
-        f"Temperature: {temperature}, Step delay: {step_delay}s"
+        f"Mode: {mode}, Model: {agent.model}, Max steps: {agent.max_steps}, "
+        f"Temperature: {agent.temperature}, Step delay: {step_delay}s"
    )
     print("-" * 60)
 
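
Note (not part of the diff): the CLI now forwards only the options the user actually set, so `create_agent` can apply per-mode defaults for the rest. A condensed sketch of that pattern; `create_agent` and the option names come from the hunk above, while the import path and wrapper function are assumptions.

```python
# Illustrative sketch only; mirrors the kwargs-building logic in run_agent above.
from oagi.agent import create_agent  # import path is an assumption


def build_agent(mode: str, api_key: str, base_url: str,
                model: str | None = None,
                max_steps: int | None = None,
                temperature: float | None = None):
    kwargs = {"mode": mode, "api_key": api_key, "base_url": base_url}
    # Only forward options the caller set explicitly, so the factory can
    # fill in mode-specific defaults for the rest.
    if model:
        kwargs["model"] = model
    if max_steps:
        kwargs["max_steps"] = max_steps
    if temperature is not None:  # 0.0 is a valid temperature
        kwargs["temperature"] = temperature
    return create_agent(**kwargs)
```
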
oagi/client/async_.py CHANGED
@@ -9,17 +9,19 @@
 from functools import wraps
 
 import httpx
+from httpx import AsyncHTTPTransport
+from openai import AsyncOpenAI
 
 from ..constants import (
-    API_HEALTH_ENDPOINT,
     API_V1_FILE_UPLOAD_ENDPOINT,
     API_V1_GENERATE_ENDPOINT,
-    API_V2_MESSAGE_ENDPOINT,
+    DEFAULT_MAX_RETRIES,
     HTTP_CLIENT_TIMEOUT,
 )
 from ..logging import get_logger
 from ..types import Image
-from ..types.models import GenerateResponse, LLMResponse, UploadFileResponse
+from ..types.models import GenerateResponse, UploadFileResponse, Usage
+from ..types.models.step import Step
 from .base import BaseClient
 
 logger = get_logger("async_client")
@@ -35,8 +37,7 @@ def async_log_trace_on_failure(func):
         except Exception as e:
             # Try to get response from the exception if it has one
             if (response := getattr(e, "response", None)) is not None:
-                logger.error(f"Request Id: {response.headers.get('x-request-id', '')}")
-                logger.error(f"Trace Id: {response.headers.get('x-trace-id', '')}")
+                BaseClient._log_trace_id(response)
             raise
 
     return wrapper
@@ -45,115 +46,72 @@
 class AsyncClient(BaseClient[httpx.AsyncClient]):
     """Asynchronous HTTP client for the OAGI API."""
 
-    def __init__(self, base_url: str | None = None, api_key: str | None = None):
-        super().__init__(base_url, api_key)
-        self.client = httpx.AsyncClient(base_url=self.base_url)
-        self.upload_client = httpx.AsyncClient(timeout=HTTP_CLIENT_TIMEOUT)
+    def __init__(
+        self,
+        base_url: str | None = None,
+        api_key: str | None = None,
+        max_retries: int = DEFAULT_MAX_RETRIES,
+    ):
+        super().__init__(base_url, api_key, max_retries)
+
+        # OpenAI client for chat completions (with retries)
+        self.openai_client = AsyncOpenAI(
+            api_key=self.api_key,
+            base_url=f"{self.base_url}/v1",
+            max_retries=self.max_retries,
+        )
+
+        # httpx clients for S3 uploads and other endpoints (with retries)
+        transport = AsyncHTTPTransport(retries=self.max_retries)
+        self.http_client = httpx.AsyncClient(
+            transport=transport, base_url=self.base_url
+        )
+        self.upload_client = httpx.AsyncClient(
+            transport=transport, timeout=HTTP_CLIENT_TIMEOUT
+        )
+
         logger.info(f"AsyncClient initialized with base_url: {self.base_url}")
 
     async def __aenter__(self):
         return self
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):
-        await self.client.aclose()
-        await self.upload_client.aclose()
+        await self.close()
 
     async def close(self):
-        """Close the underlying httpx async clients."""
-        await self.client.aclose()
+        """Close the underlying async clients."""
+        await self.openai_client.close()
+        await self.http_client.aclose()
         await self.upload_client.aclose()
 
-    @async_log_trace_on_failure
-    async def create_message(
+    async def chat_completion(
         self,
         model: str,
-        screenshot: bytes | None = None,
-        screenshot_url: str | None = None,
-        task_description: str | None = None,
-        task_id: str | None = None,
-        instruction: str | None = None,
-        messages_history: list | None = None,
+        messages: list,
         temperature: float | None = None,
-        api_version: str | None = None,
-    ) -> "LLMResponse":
+        task_id: str | None = None,
+    ) -> tuple[Step, str, Usage | None]:
         """
-        Call the /v2/message endpoint to analyze task and screenshot
+        Call OpenAI-compatible /v1/chat/completions endpoint.
 
         Args:
-            model: The model to use for task analysis
-            screenshot: Screenshot image bytes (mutually exclusive with screenshot_url)
-            screenshot_url: Direct URL to screenshot (mutually exclusive with screenshot)
-            task_description: Description of the task (required for new sessions)
-            task_id: Task ID for continuing existing task
-            instruction: Additional instruction when continuing a session
-            messages_history: OpenAI-compatible chat message history
-            temperature: Sampling temperature (0.0-2.0) for LLM inference
-            api_version: API version header
+            model: Model to use for inference
+            messages: Full message history (OpenAI-compatible format)
+            temperature: Sampling temperature (0.0-2.0)
+            task_id: Optional task ID for multi-turn conversations
 
         Returns:
-            LLMResponse: The response from the API
-
-        Raises:
-            ValueError: If both or neither screenshot and screenshot_url are provided
-            httpx.HTTPStatusError: For HTTP error responses
+            Tuple of (Step, raw_output, Usage)
+            - Step: Parsed actions and reasoning
+            - raw_output: Raw model output string (for message history)
+            - Usage: Token usage statistics (or None if not available)
         """
-        # Validate that exactly one is provided
-        if (screenshot is None) == (screenshot_url is None):
-            raise ValueError(
-                "Exactly one of 'screenshot' or 'screenshot_url' must be provided"
-            )
-
-        self._log_request_info(model, task_description, task_id)
-
-        # Upload screenshot to S3 if bytes provided, otherwise use URL directly
-        upload_file_response = None
-        if screenshot is not None:
-            upload_file_response = await self.put_s3_presigned_url(
-                screenshot, api_version
-            )
-
-        # Prepare message payload
-        headers, payload = self._prepare_message_payload(
-            model=model,
-            upload_file_response=upload_file_response,
-            task_description=task_description,
-            task_id=task_id,
-            instruction=instruction,
-            messages_history=messages_history,
-            temperature=temperature,
-            api_version=api_version,
-            screenshot_url=screenshot_url,
+        logger.info(f"Making async chat completion request with model: {model}")
+        kwargs = self._build_chat_completion_kwargs(
+            model, messages, temperature, task_id
         )
-
-        # Make request
-        try:
-            response = await self.client.post(
-                API_V2_MESSAGE_ENDPOINT,
-                json=payload,
-                headers=headers,
-                timeout=self.timeout,
-            )
-            return self._process_response(response)
-        except (httpx.TimeoutException, httpx.NetworkError) as e:
-            self._handle_upload_http_errors(e)
-
-    async def health_check(self) -> dict:
-        """
-        Call the /health endpoint for health check
-
-        Returns:
-            dict: Health check response
-        """
-        logger.debug("Making async health check request")
-        try:
-            response = await self.client.get(API_HEALTH_ENDPOINT)
-            response.raise_for_status()
-            result = response.json()
-            logger.debug("Async health check successful")
-            return result
-        except httpx.HTTPStatusError as e:
-            logger.warning(f"Async health check failed: {e}")
-            raise
+        response = await self.openai_client.chat.completions.create(**kwargs)
+        return self._parse_chat_completion_response(response)
 
     async def get_s3_presigned_url(
         self,
@@ -172,7 +130,7 @@ class AsyncClient(BaseClient[httpx.AsyncClient]):
 
         try:
             headers = self._build_headers(api_version)
-            response = await self.client.get(
+            response = await self.http_client.get(
                 API_V1_FILE_UPLOAD_ENDPOINT, headers=headers, timeout=self.timeout
             )
             return self._process_upload_response(response)
@@ -292,7 +250,7 @@ class AsyncClient(BaseClient[httpx.AsyncClient]):
 
         # Make request
        try:
-            response = await self.client.post(
+            response = await self.http_client.post(
                 API_V1_GENERATE_ENDPOINT,
                 json=payload,
                 headers=headers,
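
Note (not part of the diff): the `/v2/message` and `/health` client methods above were replaced by an OpenAI-compatible `chat_completion` call. For reference, a minimal sketch of how a caller might exercise the new surface; `AsyncClient`, `chat_completion`, the context-manager protocol, and the `(Step, raw_output, Usage)` return shape come from the hunks above, while the URL, key, model name, and message contents are placeholders and the exact message schema the endpoint expects is an assumption.

```python
# Illustrative sketch only, not code from the package.
import asyncio

from oagi.client import AsyncClient


async def main() -> None:
    async with AsyncClient(base_url="https://example.invalid", api_key="YOUR_KEY") as client:
        messages = [
            {"role": "system", "content": "You are a GUI automation agent."},
            {"role": "user", "content": "Open the settings panel."},
        ]
        step, raw_output, usage = await client.chat_completion(
            model="your-model-name",
            messages=messages,
            temperature=0.5,
        )
        print(step, usage)
        # raw_output can be appended to `messages` for the next turn, and
        # task_id can be passed to keep a multi-turn conversation together.


asyncio.run(main())
```
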