hud-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
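
A diff like this can be reproduced locally from the two published wheels. Below is a minimal, stdlib-only sketch; the wheel filenames and the pip download step are assumptions, not the tool that generated this page:

import difflib
import zipfile

# Download the artifacts first, e.g.:
#   pip download hud-python==0.2.2 --no-deps
#   pip download hud-python==0.2.4 --no-deps

def read_lines(archive: zipfile.ZipFile, name: str) -> list[str]:
    # A missing entry means the file was added or removed between versions.
    if name not in archive.namelist():
        return []
    return archive.read(name).decode("utf-8", errors="replace").splitlines()

def diff_wheels(old_wheel: str, new_wheel: str) -> None:
    with zipfile.ZipFile(old_wheel) as old, zipfile.ZipFile(new_wheel) as new:
        for name in sorted(set(old.namelist()) | set(new.namelist())):
            for line in difflib.unified_diff(
                read_lines(old, name),
                read_lines(new, name),
                fromfile=name,
                tofile=name,
                lineterm="",
            ):
                print(line)

diff_wheels("hud_python-0.2.2-py3-none-any.whl", "hud_python-0.2.4-py3-none-any.whl")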

Potentially problematic release.

This version of hud-python might be problematic.

Files changed (58)
  1. hud/__init__.py +4 -3
  2. hud/adapters/claude/adapter.py +5 -14
  3. hud/adapters/common/adapter.py +3 -3
  4. hud/adapters/common/tests/__init__.py +0 -0
  5. hud/adapters/common/tests/test_adapter.py +277 -0
  6. hud/adapters/common/types.py +3 -3
  7. hud/adapters/operator/adapter.py +16 -23
  8. hud/agent/__init__.py +8 -1
  9. hud/agent/base.py +28 -28
  10. hud/agent/claude.py +69 -60
  11. hud/agent/langchain.py +32 -26
  12. hud/agent/operator.py +75 -67
  13. hud/env/__init__.py +5 -5
  14. hud/env/client.py +2 -2
  15. hud/env/docker_client.py +37 -39
  16. hud/env/environment.py +91 -66
  17. hud/env/local_docker_client.py +5 -7
  18. hud/env/remote_client.py +39 -32
  19. hud/env/remote_docker_client.py +13 -3
  20. hud/evaluators/__init__.py +2 -3
  21. hud/evaluators/base.py +4 -3
  22. hud/evaluators/inspect.py +3 -8
  23. hud/evaluators/judge.py +34 -58
  24. hud/evaluators/match.py +42 -49
  25. hud/evaluators/remote.py +13 -26
  26. hud/evaluators/tests/__init__.py +0 -0
  27. hud/evaluators/tests/test_inspect.py +12 -0
  28. hud/evaluators/tests/test_judge.py +231 -0
  29. hud/evaluators/tests/test_match.py +115 -0
  30. hud/evaluators/tests/test_remote.py +98 -0
  31. hud/exceptions.py +167 -0
  32. hud/gym.py +9 -7
  33. hud/job.py +179 -109
  34. hud/server/__init__.py +2 -2
  35. hud/server/requests.py +148 -186
  36. hud/server/tests/__init__.py +0 -0
  37. hud/server/tests/test_requests.py +275 -0
  38. hud/settings.py +3 -2
  39. hud/task.py +9 -19
  40. hud/taskset.py +44 -11
  41. hud/trajectory.py +6 -9
  42. hud/types.py +12 -9
  43. hud/utils/__init__.py +2 -2
  44. hud/utils/common.py +36 -15
  45. hud/utils/config.py +45 -30
  46. hud/utils/progress.py +34 -21
  47. hud/utils/telemetry.py +10 -11
  48. hud/utils/tests/__init__.py +0 -0
  49. hud/utils/tests/test_common.py +52 -0
  50. hud/utils/tests/test_config.py +129 -0
  51. hud/utils/tests/test_progress.py +225 -0
  52. hud/utils/tests/test_telemetry.py +37 -0
  53. hud/utils/tests/test_version.py +8 -0
  54. {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/METADATA +9 -6
  55. hud_python-0.2.4.dist-info/RECORD +62 -0
  56. hud_python-0.2.2.dist-info/RECORD +0 -46
  57. {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/WHEEL +0 -0
  58. {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
hud/agent/claude.py CHANGED
@@ -1,5 +1,4 @@
 import logging
-import os
 from typing import Any, cast
 
 from anthropic import AsyncAnthropic
@@ -14,52 +13,48 @@ from anthropic.types.beta import (
 from hud.adapters import Adapter
 from hud.agent.base import Agent
 from hud.adapters.claude import ClaudeAdapter
-from hud.env.environment import Observation
+from hud.utils.common import Observation
 from hud.settings import settings
 
 logger = logging.getLogger(__name__)
 
+
 def base64_to_content_block(base64: str) -> BetaImageBlockParam:
     return {
         "type": "image",
-        "source": {
-            "type": "base64",
-            "media_type": "image/png",
-            "data": base64
-        }
+        "source": {"type": "base64", "media_type": "image/png", "data": base64},
     }
 
+
 def text_to_content_block(text: str) -> BetaTextBlockParam:
-    return {
-        "type": "text",
-        "text": text
-    }
+    return {"type": "text", "text": text}
+
+
+def tool_use_content_block(
+    tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]
+) -> BetaToolResultBlockParam:
+    return {"type": "tool_result", "tool_use_id": tool_use_id, "content": content}
 
-def tool_use_content_block(tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]) -> BetaToolResultBlockParam:
-    return {
-        "type": "tool_result",
-        "tool_use_id": tool_use_id,
-        "content": content
-    }
 
 # Claude's Computer Use Tool definition
 COMPUTER_TOOL: BetaToolComputerUse20250124Param = {
-    "type": "computer_20250124",
-    "name": "computer",
-    "display_width_px": 1024,
-    "display_height_px": 768
+    "type": "computer_20250124",
+    "name": "computer",
+    "display_width_px": 1024,
+    "display_height_px": 768,
 }
 
+
 class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     """
     An agent implementation using Anthropic's Claude API with Computer Use.
-
+
     This agent interacts with HUD environments using Claude's Computer Use API
     through the ClaudeAdapter which converts actions to the format expected by HUD.
     """
-
+
     def __init__(
-        self, 
+        self,
         client: AsyncAnthropic | None = None,
         adapter: Adapter | None = None,
         model: str = "claude-3-7-sonnet-20250219",
@@ -68,7 +63,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     ):
         """
         Initialize the ClaudeAgent.
-
+
         Args:
             client: The AsyncAnthropic client for API calls (optional, created automatically if not provided)
             adapter: The adapter to use for preprocessing and postprocessing
@@ -81,28 +76,30 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             # Get API key from settings
             api_key = settings.anthropic_api_key
             if not api_key:
-                raise ValueError("Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY.")
-
+                raise ValueError(
+                    "Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY."
+                )
+
             # Create client
             client = AsyncAnthropic(api_key=api_key)
 
         adapter = adapter or ClaudeAdapter()
-
+
         super().__init__(client=client, adapter=adapter)
-
+
         self.model = model
         self.max_tokens = max_tokens
         self.max_iterations = max_iterations
-
+
         # Default dimensions - will be updated if adapter is provided
         self.width_px = 1024
         self.height_px = 768
-
+
         # Update dimensions if adapter is provided
        if self.adapter:
            self.width_px = self.adapter.agent_width
            self.height_px = self.adapter.agent_height
-
+
         # Message history
         self.messages: list[BetaMessageParam] = []
         self.pending_computer_use_tool_id = None
@@ -110,17 +107,17 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
     async def fetch_response(self, observation: Observation) -> tuple[list[Any], bool]:
         """
         Fetch a response from Claude based on the observation.
-
+
         Args:
             observation: The preprocessed observation
-
+
         Returns:
             tuple[list[Any], bool]: A tuple containing the list of raw actions and a
             boolean indicating if the agent believes the task is complete
         """
         if not self.client:
             raise ValueError("Client is required")
-
+
         # Prepare the user content for Claude
         user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
 
@@ -128,7 +125,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
         if observation.text:
             logger.info("Adding text to user content: %s", observation.text)
             user_content.append(text_to_content_block(str(observation.text)))
-
+
         # Add screenshot if present
         if observation.screenshot:
             logger.info("Adding screenshot to user content")
@@ -136,20 +133,28 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
                 logger.info("Adding screenshot to user content, no tool id")
                 user_content.append(base64_to_content_block(observation.screenshot))
             else:
-                logger.info("Adding screenshot to user content, tool id: %s", self.pending_computer_use_tool_id)
+                logger.info(
+                    "Adding screenshot to user content, tool id: %s",
+                    self.pending_computer_use_tool_id,
+                )
                 user_content.append(
                     tool_use_content_block(
-                        self.pending_computer_use_tool_id, 
-                        [base64_to_content_block(observation.screenshot)]
+                        self.pending_computer_use_tool_id,
+                        [base64_to_content_block(observation.screenshot)],
                     )
                 )
                 self.pending_computer_use_tool_id = None
 
         # Add the user content to the messages
-        self.messages.append(cast(BetaMessageParam, {
-            "role": "user",
-            "content": user_content,
-        }))
+        self.messages.append(
+            cast(
+                BetaMessageParam,
+                {
+                    "role": "user",
+                    "content": user_content,
+                },
+            )
+        )
 
         # Call Claude API using async client
         response = await self.client.beta.messages.create(
@@ -158,30 +163,35 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             messages=self.messages,
             tools=[COMPUTER_TOOL],
             betas=["computer-use-2025-01-24"],
-            tool_choice={"type": "auto", "disable_parallel_tool_use": True}
+            tool_choice={"type": "auto", "disable_parallel_tool_use": True},
         )
 
         # Add Claude's response to the conversation history
         response_content = response.content
-        self.messages.append(cast(BetaMessageParam, {
-            "role": "assistant",
-            "content": response_content,
-        }))
+        self.messages.append(
+            cast(
+                BetaMessageParam,
+                {
+                    "role": "assistant",
+                    "content": response_content,
+                },
+            )
+        )
 
         # Process tool use
         actions: list[Any] = []
         done = True  # Assume we're done unless we find a tool use
-
+
         for block in response_content:
             logger.info("Processing block: %s", block)
             if block.type == "tool_use":
                 logger.info("Processing tool use: %s", block)
                 assert block.name == "computer"
-
+
                 # Store the raw action
                 actions.append(block.input)
                 self.pending_computer_use_tool_id = block.id
-
+
                 # If we found a tool use, we're not done
                 done = False
                 break
@@ -192,16 +202,15 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
             for block in response_content:
                 if block.type == "text":
                     final_text_response += block.text
-
+
             if final_text_response.strip():
-                logger.info(f"No tool use found. Using final text as response: {final_text_response}")
-                actions = [{
-                    "action": "response",
-                    "text": final_text_response.strip()
-                }]
+                logger.info(
+                    f"No tool use found. Using final text as response: {final_text_response}"
+                )
+                actions = [{"action": "response", "text": final_text_response.strip()}]
                 # Keep done = True
             else:
-                logger.info("No tool use and no final text block found.")
-                # Keep done = True, actions remains empty
-
+                logger.info("No tool use and no final text block found.")
+                # Keep done = True, actions remains empty
+
         return actions, done
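
The only change here that affects callers is the Observation import moving from hud.env.environment to hud.utils.common; the rest is formatter-style cleanup. A minimal usage sketch under the new import path (the Observation constructor fields are inferred from the attributes read in the diff and may differ):

import asyncio

from hud.agent.claude import ClaudeAgent
from hud.utils.common import Observation  # import path as of 0.2.4

async def main() -> None:
    # ClaudeAgent builds its own AsyncAnthropic client and ClaudeAdapter when
    # none are passed; ANTHROPIC_API_KEY must be set in the environment.
    agent = ClaudeAgent()
    actions, done = await agent.fetch_response(
        Observation(text="Open the settings page", screenshot=None)
    )
    print(actions, done)

asyncio.run(main())
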
hud/agent/langchain.py CHANGED
@@ -10,9 +10,8 @@ from pydantic import Field, BaseModel
 # HUD imports
 from hud.adapters import Adapter
 from hud.agent.base import Agent
-from hud.env.environment import Observation
+from hud.utils.common import Observation
 from hud.adapters.common.types import (
-    CLA,
     ClickAction,
     TypeAction,
     ScrollAction,
@@ -44,17 +43,23 @@ SingleCLAction = Union[
     ResponseAction,
 ]
 
+
 # Define a Pydantic model to wrap the single action, potentially making it
 # easier for the LLM to consistently output the desired structure.
 class StepAction(BaseModel):
     """Wrapper model requesting a single concrete CLA action from the Langchain model."""
-    action: SingleCLAction = Field(..., description="The single CLA action to perform for this step.")
+
+    action: SingleCLAction = Field(
+        ..., description="The single CLA action to perform for this step."
+    )
+
 
 # Generic Type for the Langchain Model/Runnable
 # Allows flexibility in what the user provides (model, chain, etc.)
 # Bound to BaseLanguageModel as .with_structured_output is expected
 LangchainModelOrRunnable = TypeVar("LangchainModelOrRunnable", bound=BaseLanguageModel)
 
+
 class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
     """
     An agent that uses an arbitrary Langchain model or runnable, leveraging
@@ -80,8 +85,8 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
             system_prompt: An optional system prompt to guide the Langchain model.
                 If None, a default prompt encouraging single CLA output is used.
         """
-        super().__init__(client=langchain_model, adapter=adapter) # Store model as 'client'
-        self.langchain_model = langchain_model # Also store with specific name
+        super().__init__(client=langchain_model, adapter=adapter)  # Store model as 'client'
+        self.langchain_model = langchain_model  # Also store with specific name
 
         self.system_prompt_str = system_prompt or self._get_default_system_prompt()
         self.history: List[BaseMessage] = []
@@ -97,7 +102,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
             "If you believe the task is complete based on the user's prompt and the observations, use the 'ResponseAction'."
         )
 
-    async def fetch_response(self, observation: Observation) -> tuple[CLA | None, bool]:
+    async def fetch_response(self, observation: Observation) -> tuple[list[dict], bool]:
         """
         Fetches a response from the configured Langchain model, expecting a single
         structured CLA action.
@@ -117,17 +122,17 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
         if observation.screenshot:
             # Assuming the Langchain model/chain can handle base64 images
             # This might need adjustment based on the specific model used.
-            human_content.append({
-                "type": "image_url",
-                "image_url": {
-                    "url": f"data:image/png;base64,{observation.screenshot}"
+            human_content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{observation.screenshot}"},
                 }
-            })
-
+            )
+
         if not human_content:
-            logger.warning("LangchainAgent received an observation with no text or screenshot.")
-            # Decide how to handle empty observation - perhaps return no action?
-            return [], False # Or raise an error?
+            logger.warning("LangchainAgent received an observation with no text or screenshot.")
+            # Decide how to handle empty observation - perhaps return no action?
+            return [], False  # Or raise an error?
 
         current_human_message = HumanMessage(content=human_content)
 
@@ -142,8 +147,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
         # We ask for the StepAction wrapper, which contains the actual SingleCLAAction
         # Explicitly use method="function_calling" to handle schemas with default values
         structured_llm = self.langchain_model.with_structured_output(
-            schema=StepAction,
-            method="function_calling"
+            schema=StepAction, method="function_calling"
         )
 
         # 4. Invoke Langchain model asynchronously
@@ -156,7 +160,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
 
         # 5. Process the structured response
         is_done = False
-        ai_message_content_for_history = "" # For storing in history
+        ai_message_content_for_history = ""  # For storing in history
 
         if isinstance(ai_response_structured, StepAction):
             # Successfully got the wrapper, extract the actual action
@@ -164,22 +168,24 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
             ai_message_content_for_history = actual_action.model_dump()
             if isinstance(actual_action, ResponseAction):
                 is_done = True
-                logger.info(f"LangchainAgent determined task is done with response: {actual_action.text[:100]}...")
+                logger.info(
+                    f"LangchainAgent determined task is done with response: {actual_action.text[:100]}..."
+                )
             else:
-                logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
+                logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
 
         else:
             logger.warning(
                 f"Langchain model did not return the expected StepAction structure. "
                 f"Received type: {type(ai_response_structured)}. Value: {ai_response_structured!r}"
             )
-            # Attempt to add raw response to history for debugging
+            # Attempt to add raw response to history for debugging
             if isinstance(ai_response_structured, BaseMessage):
-                ai_message_content_for_history = ai_response_structured.content
+                ai_message_content_for_history = ai_response_structured.content
             elif isinstance(ai_response_structured, str):
-                ai_message_content_for_history = ai_response_structured
+                ai_message_content_for_history = ai_response_structured
             else:
-                ai_message_content_for_history = repr(ai_response_structured)
+                ai_message_content_for_history = repr(ai_response_structured)
             # Return no action as we didn't get the expected structure
             return [], False
 
@@ -192,7 +198,7 @@ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
 
         if actual_action:
             # Return the single action dictionary within a list
-            return [actual_action], is_done
+            return [actual_action.model_dump()], is_done
         else:
             # Should ideally not happen if structure validation worked, but as a fallback
-            return [], is_done
+            return [], is_done
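
Beyond the relocated Observation import, the visible behavioral change in LangchainAgent is that fetch_response now returns plain dicts (via model_dump()) instead of CLA objects, and the unused CLA import is dropped. A rough sketch of what calling code sees; the constructor keyword and the choice of ChatOpenAI as the Langchain model are assumptions:

import asyncio

from langchain_openai import ChatOpenAI
from hud.agent.langchain import LangchainAgent
from hud.utils.common import Observation

async def main() -> None:
    # Any Langchain BaseLanguageModel that supports structured output should work here.
    agent = LangchainAgent(langchain_model=ChatOpenAI(model="gpt-4o"))
    actions, done = await agent.fetch_response(
        Observation(text="Click the blue 'Submit' button", screenshot=None)
    )
    # In 0.2.2 this was a list of CLA objects; in 0.2.4 it is a list of dicts.
    print(actions, done)

asyncio.run(main())
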
hud/agent/operator.py CHANGED
@@ -10,36 +10,37 @@ from openai.types.responses import (
     ResponseInputItemParam,
     ResponseOutputMessage,
     ResponseComputerToolCall,
-    ResponseOutputText
+    ResponseOutputText,
 )
 
 from hud.adapters import Adapter
 from hud.agent.base import Agent
 from hud.adapters.operator import OperatorAdapter
-from hud.env.environment import Observation
+from hud.utils.common import Observation
 from hud.settings import settings
 
 logger = logging.getLogger(__name__)
 
+
 class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
     """
     An agent implementation using OpenAI's Computer Use API.
-
+
     This agent interacts with HUD environments using OpenAI's Computer Use API
     through the OperatorAdapter which converts actions to the format expected by HUD.
     """
-
+
     def __init__(
-        self, 
+        self,
         client: OpenAI | None = None,
         model: str = "computer-use-preview",
         environment: Literal["windows", "mac", "linux", "browser"] = "windows",
         adapter: Adapter | None = None,
-        max_iterations: int = 8
+        max_iterations: int = 8,
     ):
         """
         Initialize the OperatorAgent.
-
+
         Args:
             client: The OpenAI client for API calls (optional, created automatically if not provided)
             model: The model to use for computer use
@@ -52,28 +53,30 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
             # Get API key from settings
             api_key = settings.openai_api_key
             if not api_key:
-                raise ValueError("OpenAI API key not found in settings or environment variables. Set OPENAI_API_KEY.")
-
+                raise ValueError(
+                    "OpenAI API key not found in settings or environment variables. Set OPENAI_API_KEY."
+                )
+
             # Create synchronous client
             client = OpenAI(api_key=api_key)
 
         adapter = adapter or OperatorAdapter()
-
+
         super().__init__(client=client, adapter=adapter)
-
+
         self.model = model
         self.environment = environment
         self.max_iterations = max_iterations
-
+
         # Default dimensions
         self.width = 1024
         self.height = 768
-
+
         # Update dimensions if adapter is provided
         if self.adapter:
             self.width = self.adapter.agent_width
             self.height = self.adapter.agent_height
-
+
         # Message history and state tracking
         self.last_response_id = None
         self.pending_call_id = None
@@ -82,86 +85,91 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
     async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
         """
         Fetch a response from the model based on the observation.
-
+
         Args:
             observation: The preprocessed observation
-
+
         Returns:
             tuple[list[dict[str, Any]], bool]: A tuple containing the list of raw actions and a
             boolean indicating if the agent believes the task is complete
         """
         if not self.client:
             raise ValueError("Client is required")
-
+
         # Define the computer use tool with correct type using cast
-        computer_tool = cast(ToolParam, {
-            "type": "computer_use_preview",
-            "display_width": self.width,
-            "display_height": self.height,
-            "environment": self.environment
-        })
-
+        computer_tool = cast(
+            ToolParam,
+            {
+                "type": "computer_use_preview",
+                "display_width": self.width,
+                "display_height": self.height,
+                "environment": self.environment,
+            },
+        )
+
         # Process the observation based on whether it's the first one or a response to an action
         if self.pending_call_id is None and self.last_response_id is None:
             # This is the first observation, store and send the prompt
             self.initial_prompt = observation.text
-
+
             # Create the initial request following the required structure
             input_content: list[dict[str, Any]] = [
                 {"type": "input_text", "text": observation.text or ""}
             ]
-
+
             # Add screenshot if present
             if observation.screenshot:
-                input_content.append({
-                    "type": "input_image",
-                    "image_url": f"data:image/png;base64,{observation.screenshot}"
-                })
-
+                input_content.append(
+                    {
+                        "type": "input_image",
+                        "image_url": f"data:image/png;base64,{observation.screenshot}",
+                    }
+                )
+
             # Structure the input correctly for the API using cast
-            input_param = cast(ResponseInputParam, [{
-                "role": "user",
-                "content": input_content
-            }])
-
+            input_param = cast(ResponseInputParam, [{"role": "user", "content": input_content}])
+
             # Call OpenAI API for the initial prompt (synchronous call)
             response = self.client.responses.create(
-                model=self.model,
-                tools=[computer_tool],
-                input=input_param,
-                truncation="auto"
+                model=self.model, tools=[computer_tool], input=input_param, truncation="auto"
             )
-
+
         else:
             # This is a response to a previous action
             if not observation.screenshot:
                 logger.warning("No screenshot provided for response to action")
                 return [], True
-
+
             # Create a response to the previous action with the new screenshot
-            input_param_followup = cast(ResponseInputParam, [
-                cast(ResponseInputItemParam, {
-                    "call_id": self.pending_call_id,
-                    "type": "computer_call_output",
-                    "output": {
-                        "type": "input_image",
-                        "image_url": f"data:image/png;base64,{observation.screenshot}"
-                    }
-                })
-            ])
-
+            input_param_followup = cast(
+                ResponseInputParam,
+                [
+                    cast(
+                        ResponseInputItemParam,
+                        {
+                            "call_id": self.pending_call_id,
+                            "type": "computer_call_output",
+                            "output": {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{observation.screenshot}",
+                            },
+                        },
+                    )
+                ],
+            )
+
             # Call OpenAI API for follow-up (synchronous call)
             response = self.client.responses.create(
                 model=self.model,
                 previous_response_id=self.last_response_id,
                 tools=[computer_tool],
                 input=input_param_followup,
-                truncation="auto"
+                truncation="auto",
             )
-
+
         # Store the response ID for the next call
         self.last_response_id = response.id
-
+
         # Process the response to extract actions or final text
         actions = []
         done = True  # Assume done unless a computer call is found
@@ -169,17 +177,18 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
 
         # Check for computer calls first
         computer_calls = [
-            item for item in response.output
+            item
+            for item in response.output
             if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
         ]
-
+
         if computer_calls:
             # If computer calls exist, process them and set done=False
             done = False
             for computer_call in computer_calls:
                 self.pending_call_id = computer_call.call_id
                 action = computer_call.action
-                actions.append(action.model_dump()) # Convert Pydantic model to dict
+                actions.append(action.model_dump())  # Convert Pydantic model to dict
                 logger.info(f"Computer call action: {action}")
         else:
             # No computer calls, check for a final text message
@@ -188,21 +197,20 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
             for item in response.output:
                 if isinstance(item, ResponseOutputMessage) and item.type == "message":
                     # Extract text from content blocks within the message
-                    full_text = "".join([c.text for c in item.content if isinstance(c, ResponseOutputText)])
+                    full_text = "".join(
+                        [c.text for c in item.content if isinstance(c, ResponseOutputText)]
+                    )
                     if full_text:
                         final_text_response = full_text
                         logger.info(f"Final text message: {final_text_response}")
-                        break # Stop after finding the first text message
-
+                        break  # Stop after finding the first text message
+
             # If we found final text, package it as a 'response' action
             if final_text_response:
-                actions = [{
-                    "type": "response",
-                    "text": final_text_response
-                }]
+                actions = [{"type": "response", "text": final_text_response}]
                 # Keep done = True
             else:
                 logger.info("No computer calls and no final text message found.")
                 # Keep done = True, actions remains empty
 
-        return actions, done
+        return actions, done
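
OperatorAgent follows the same pattern as the other agents in this release: the Observation import now comes from hud.utils.common, and the remaining edits are mechanical reformatting. A minimal sketch under the same assumptions as above (OPENAI_API_KEY set, Observation fields as read in the diff):

import asyncio

from hud.agent.operator import OperatorAgent
from hud.utils.common import Observation

async def main() -> None:
    # The agent creates its own OpenAI client and OperatorAdapter when none are given.
    agent = OperatorAgent(environment="browser")
    actions, done = await agent.fetch_response(
        Observation(text="Search PyPI for hud-python", screenshot=None)
    )
    print(actions, done)  # computer_call dicts, or a single 'response' action

asyncio.run(main())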