hud-python 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (46)
  1. hud/__init__.py +16 -12
  2. hud/adapters/__init__.py +4 -2
  3. hud/adapters/claude/adapter.py +9 -2
  4. hud/adapters/common/adapter.py +11 -10
  5. hud/adapters/common/types.py +34 -13
  6. hud/adapters/operator/__init__.py +5 -0
  7. hud/adapters/operator/adapter.py +97 -0
  8. hud/agent/__init__.py +7 -0
  9. hud/agent/base.py +109 -0
  10. hud/agent/claude.py +207 -0
  11. hud/agent/operator.py +208 -0
  12. hud/env/__init__.py +11 -0
  13. hud/env/client.py +35 -0
  14. hud/env/docker_client.py +306 -0
  15. hud/env/environment.py +354 -0
  16. hud/env/local_docker_client.py +251 -0
  17. hud/env/remote_client.py +185 -0
  18. hud/env/remote_docker_client.py +221 -0
  19. hud/evaluators/__init__.py +10 -0
  20. hud/evaluators/base.py +31 -0
  21. hud/evaluators/inspect.py +29 -0
  22. hud/evaluators/judge.py +213 -0
  23. hud/evaluators/match.py +163 -0
  24. hud/evaluators/remote.py +78 -0
  25. hud/gym.py +101 -15
  26. hud/job.py +185 -0
  27. hud/server/__init__.py +2 -2
  28. hud/server/requests.py +87 -0
  29. hud/settings.py +13 -2
  30. hud/task.py +144 -0
  31. hud/taskset.py +103 -0
  32. hud/trajectory.py +90 -0
  33. hud/types.py +65 -0
  34. hud/utils/__init__.py +4 -2
  35. hud/utils/common.py +96 -0
  36. hud/utils/config.py +91 -4
  37. hud/utils/telemetry.py +67 -0
  38. hud_python-0.2.1.dist-info/METADATA +181 -0
  39. hud_python-0.2.1.dist-info/RECORD +44 -0
  40. {hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/licenses/LICENSE +1 -1
  41. hud/client.py +0 -200
  42. hud/environment.py +0 -318
  43. hud/run.py +0 -208
  44. hud_python-0.1.5.dist-info/METADATA +0 -125
  45. hud_python-0.1.5.dist-info/RECORD +0 -21
  46. {hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/WHEEL +0 -0
hud/__init__.py CHANGED
@@ -4,19 +4,23 @@ HUD Gym SDK - A Python SDK for interacting with HUD environments.
 
 from __future__ import annotations
 
-from hud.client import HUDClient
-from hud.environment import Environment, EvalSet, Observation, TaskResult
-from hud.gym import Gym
-from hud.run import Run
+from . import agent, env, gym, settings, task, taskset, types, utils
+from .job import create_job, job, load_job
+from .taskset import load_taskset
 
-__version__ = "0.1.5"
+__version__ = "0.2.1"
 
 __all__ = [
-    "Environment",
-    "EvalSet",
-    "Gym",
-    "HUDClient",
-    "Observation",
-    "Run",
-    "TaskResult",
+    "agent",
+    "create_job",
+    "env",
+    "gym",
+    "job",
+    "load_job",
+    "load_taskset",
+    "settings",
+    "task",
+    "taskset",
+    "types",
+    "utils",
 ]
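
For readers migrating from 0.1.5: the client-centric entry points (HUDClient, Environment, Run) are gone from the top level, replaced by submodules and loader functions. A minimal sketch of the new surface follows; the loader signatures are not part of this diff, so the commented calls are assumptions for illustration only.

import hud

print(hud.__version__)  # "0.2.1"

# Hypothetical usage of the new loaders (signatures not shown in this diff):
# taskset = await hud.load_taskset("example-taskset")
# job = await hud.create_job(name="example-run")
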
hud/adapters/__init__.py CHANGED
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
-from .common import Adapter
+from .claude import ClaudeAdapter
+from .common import CLA, Adapter
+from .operator import OperatorAdapter
 
-__all__ = ["Adapter"]
+__all__ = ["CLA", "Adapter", "ClaudeAdapter", "OperatorAdapter"]
hud/adapters/claude/adapter.py CHANGED
@@ -13,6 +13,7 @@ from hud.adapters.common.types import (
     Point,
     PositionFetch,
     PressAction,
+    ResponseAction,
     ScreenshotFetch,
     ScrollAction,
     TypeAction,
@@ -21,7 +22,10 @@
 
 
 class ClaudeAdapter(Adapter):
-    KEY_MAP: ClassVar[dict[str, CLAKey]] = {"Return": "enter"}
+    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
+        "Return": "enter",
+        "Super": "win",
+    }
 
     def __init__(self) -> None:
         super().__init__()
@@ -31,7 +35,6 @@ class ClaudeAdapter(Adapter):
     def _map_key(self, key: str) -> CLAKey:
         """Map a key to its standardized form."""
         return self.KEY_MAP.get(key, key.lower())  # type: ignore
-
     def convert(self, data: Any) -> CLA:
         try:
             action_type = data.get("action")
@@ -152,6 +155,10 @@ class ClaudeAdapter(Adapter):
             elif action_type == "wait":
                 assert "duration" in data
                 return WaitAction(time=data["duration"])
+
+            elif action_type == "response":
+                return ResponseAction(text=data.get("text", ""))
+
             else:
                 raise ValueError(f"Unsupported action type: {action_type}")
         except AssertionError:
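
A short sketch of the new ClaudeAdapter behavior, based only on the branches shown above: the "response" action wraps Claude's final text in a ResponseAction, and "Super" now maps to the CLA "win" key.

from hud.adapters.claude import ClaudeAdapter
from hud.adapters.common.types import ResponseAction

adapter = ClaudeAdapter()

# New "response" branch: a final textual answer becomes a ResponseAction.
action = adapter.convert({"action": "response", "text": "The form has been submitted."})
assert isinstance(action, ResponseAction)

# New KEY_MAP entry: "Super" -> "win".
assert adapter._map_key("Super") == "win"
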
hud/adapters/common/adapter.py CHANGED
@@ -2,16 +2,16 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
+import numpy as np
 from PIL import Image
 from pydantic import TypeAdapter, ValidationError
 
 from .types import CLA
 
 if TYPE_CHECKING:
-    import numpy as np  # type: ignore
-    from typing_extensions import TypeAlias, TypeIs
+    from typing_extensions import TypeIs
 
-    ImageType: TypeAlias = "np.ndarray[Any, Any] | Image.Image | str | None"
+    ImageType = np.ndarray[Any, Any] | Image.Image | str | None
 
 
 def _is_numpy_array(observation: Any) -> TypeIs[np.ndarray]:
@@ -61,7 +61,7 @@ class Adapter:
             observation: Image data, which can be:
                 - numpy array
                 - PIL Image
-                - base64 string (PNG)
+                - base64 string (PNG)  # TODO: JPG
 
         Returns:
             Base64-encoded string of the resized image (PNG format)
@@ -146,7 +146,7 @@ class Adapter:
 
         return processed_action
 
-    def adapt(self, action: Any) -> dict[str, Any]:
+    def adapt(self, action: Any) -> CLA:
        # any preprocessing steps
        action = self.preprocess(action)
 
@@ -154,14 +154,15 @@ class Adapter:
        action = self.convert(action)
        self.memory.append(action)
 
-       # convert to json
+       # convert to json and apply coordinate rescaling
        action_dict = self.json(action)
-
-       # apply coordinate rescaling
        rescaled_action = self.postprocess_action(action_dict)
-       return rescaled_action
 
-    def adapt_list(self, actions: list[Any]) -> list[dict[str, Any]]:
+       # convert back to CLA
+       return TypeAdapter(CLA).validate_python(rescaled_action)
+
+    def adapt_list(self, actions: list[Any]) -> list[CLA]:
        if not isinstance(actions, list):
            raise ValueError("Please provide a list of actions")
+
        return [self.adapt(action) for action in actions]
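
The practical effect of the adapt()/adapt_list() signature change is that callers now receive validated CLA models instead of plain dicts; the final step re-parses the rescaled dict through the discriminated union. A minimal sketch of that last step (the exact required fields of ClickAction are not shown in this hunk, so the dict is illustrative):

from pydantic import TypeAdapter
from hud.adapters.common.types import CLA

# What adapt() now does after coordinate rescaling: validate the dict back
# into the CLA union, dispatching on the "type" discriminator.
rescaled = {"type": "click", "point": {"x": 100, "y": 200}, "button": "left"}
action = TypeAdapter(CLA).validate_python(rescaled)
print(type(action).__name__)  # ClickAction
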
hud/adapters/common/types.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Annotated, Literal, Union
+from typing import Annotated, Literal
 
 from pydantic import BaseModel, Field
 
@@ -32,10 +32,23 @@ class PressAction(CLAAction):
     keys: list[CLAKey]
 
 
+# KEYDOWN ACTION for key presses/hotkeys
+class KeyDownAction(CLAAction):
+    type: Literal["keydown"] = "keydown"
+    keys: list[CLAKey]
+
+
+# KEYUP ACTION for key presses/hotkeys
+class KeyUpAction(CLAAction):
+    type: Literal["keyup"] = "keyup"
+    keys: list[CLAKey]
+
+
 # TYPE ACTION for text typing
 class TypeAction(CLAAction):
     type: Literal["type"] = "type"
     text: str
+    selector: str | None = None
     enter_after: bool | None = False
 
 
@@ -69,6 +82,12 @@ class DragAction(CLAAction):
     hold_keys: list[CLAKey] | None = None
 
 
+# RESPONSE ACTION from agent
+class ResponseAction(CLAAction):
+    type: Literal["response"] = "response"
+    text: str  # The final textual response from the agent
+
+
 # SCREENSHOT ACTION
 class ScreenshotFetch(CLAAction):
     type: Literal["screenshot"] = "screenshot"
@@ -82,20 +101,22 @@ class CustomAction(CLAAction):
     type: Literal["custom"] = "custom"
     action: str
 
+
 # Union of all possible actions
 CLA = Annotated[
-    Union[
-        ClickAction,
-        PressAction,
-        TypeAction,
-        ScrollAction,
-        MoveAction,
-        WaitAction,
-        DragAction,
-        ScreenshotFetch,
-        PositionFetch,
-        CustomAction,
-    ],
+    ClickAction
+    | PressAction
+    | KeyDownAction
+    | KeyUpAction
+    | TypeAction
+    | ResponseAction
+    | ScrollAction
+    | MoveAction
+    | WaitAction
+    | DragAction
+    | CustomAction
+    | ScreenshotFetch
+    | PositionFetch,
    Field(discriminator="type"),
 ]
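
Because the union is discriminated on the type field, serialized actions round-trip cleanly into the new models; a small sketch exercising the additions above (field defaults beyond those shown are assumed):

from pydantic import TypeAdapter
from hud.adapters.common.types import CLA, KeyDownAction, ResponseAction

cla = TypeAdapter(CLA)

# New in 0.2.1: keydown/keyup plus a terminal "response" action.
down = cla.validate_python({"type": "keydown", "keys": ["enter"]})
assert isinstance(down, KeyDownAction)

resp = cla.validate_python({"type": "response", "text": "done"})
assert isinstance(resp, ResponseAction)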
 
hud/adapters/operator/__init__.py ADDED
@@ -0,0 +1,5 @@
+from __future__ import annotations
+
+from .adapter import OperatorAdapter
+
+__all__ = ["OperatorAdapter"]
hud/adapters/operator/adapter.py ADDED
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+from typing import Any, ClassVar
+
+from hud.adapters.common import CLA, Adapter
+from hud.adapters.common.types import (
+    CLAKey,
+    ClickAction,
+    DragAction,
+    MoveAction,
+    Point,
+    PressAction,
+    ResponseAction,
+    ScreenshotFetch,
+    ScrollAction,
+    TypeAction,
+    WaitAction,
+)
+
+
+class OperatorAdapter(Adapter):
+    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
+        "Return": "enter",
+        "ArrowUp": "up",
+        "ArrowDown": "down",
+        "ArrowLeft": "left",
+        "ArrowRight": "right",
+    }
+
+    def __init__(self) -> None:
+        super().__init__()
+        # OpenAI Computer Use default dimensions
+        self.agent_width = 1024
+        self.agent_height = 768
+
+    def _map_key(self, key: str) -> CLAKey:
+        """Map a key to its standardized form."""
+        return self.KEY_MAP.get(key, key.lower())  # type: ignore
+
+    def convert(self, data: Any) -> CLA:
+        """Convert a Computer Use action to a HUD action"""
+        try:
+            action_type = data.get("type")
+
+            if action_type == "click":
+                x, y = data.get("x", 0), data.get("y", 0)
+                button = data.get("button", "left")
+                return ClickAction(point=Point(x=x, y=y), button=button)
+
+            elif action_type == "double_click":
+                x, y = data.get("x", 0), data.get("y", 0)
+                return ClickAction(
+                    point=Point(x=x, y=y),
+                    button="left",
+                    pattern=[100]
+                )
+
+            elif action_type == "scroll":
+                x, y = data.get("x", 0), data.get("y", 0)
+                scroll_x = data.get("scroll_x", 0)
+                scroll_y = data.get("scroll_y", 0)
+                return ScrollAction(
+                    point=Point(x=x, y=y),
+                    scroll=Point(x=scroll_x, y=scroll_y)
+                )
+
+            elif action_type == "type":
+                text = data.get("text", "")
+                return TypeAction(text=text, enter_after=False)
+
+            elif action_type == "wait":
+                ms = data.get("ms", 1000)
+                return WaitAction(time=ms)
+
+            elif action_type == "move":
+                x, y = data.get("x", 0), data.get("y", 0)
+                return MoveAction(point=Point(x=x, y=y))
+
+            elif action_type == "keypress":
+                keys = data.get("keys", [])
+                return PressAction(keys=[self._map_key(k) for k in keys])
+
+            elif action_type == "drag":
+                path = data.get("path", [])
+                points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path]
+                return DragAction(path=points)
+
+            elif action_type == "screenshot":
+                return ScreenshotFetch()
+
+            elif action_type == "response":
+                return ResponseAction(text=data.get("text", ""))
+            else:
+                raise ValueError(f"Unsupported action type: {action_type}")
+
+        except Exception as e:
+            raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e
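
For illustration, feeding a couple of OpenAI Computer Use action dicts through the new adapter; the input shapes mirror exactly what convert() handles above.

from hud.adapters.operator import OperatorAdapter

adapter = OperatorAdapter()

# A left click at (100, 200).
click = adapter.convert({"type": "click", "x": 100, "y": 200, "button": "left"})

# Arrow keys and Return are remapped via KEY_MAP ("ArrowUp" -> "up", "Return" -> "enter").
press = adapter.convert({"type": "keypress", "keys": ["ArrowUp", "Return"]})
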
hud/agent/__init__.py ADDED
@@ -0,0 +1,7 @@
+from .base import Agent
+from .claude import ClaudeAgent
+from .operator import OperatorAgent
+
+from hud.adapters import OperatorAdapter, ClaudeAdapter
+
+__all__ = ["Agent", "ClaudeAgent", "OperatorAgent", "OperatorAdapter", "ClaudeAdapter"]
hud/agent/base.py ADDED
@@ -0,0 +1,109 @@
+from abc import ABC, abstractmethod
+from typing import Sequence, TypeVar, Generic
+
+from hud.adapters import Adapter, CLA
+from hud.env.environment import Observation
+
+# Generic type for different client types (Anthropic, OpenAI, etc.)
+ClientT = TypeVar('ClientT')
+ActionT = TypeVar('ActionT')
+
+class Agent(Generic[ClientT, ActionT], ABC):
+    """
+    Base class for all agents.
+
+    Implements a three-stage prediction process:
+    1. preprocess - Prepare observation data (e.g., rescale screenshot)
+    2. fetch_response - Make API calls to get model response
+    3. postprocess - Convert model actions to HUD format
+
+    Subclasses only need to implement the fetch_response method.
+    """
+
+    def __init__(self, client: ClientT | None = None, adapter: Adapter | None = None):
+        """
+        Initialize the agent.
+
+        Args:
+            client: The client to use for API calls
+            adapter: The adapter to use for preprocessing and postprocessing
+        """
+        self.client = client
+        self.adapter = adapter
+
+    def preprocess(self, observation: Observation) -> Observation:
+        """
+        Preprocess the observation before sending to the model.
+
+        Args:
+            observation: The raw observation from the environment
+
+        Returns:
+            Observation: The processed observation ready for the model
+        """
+        if not self.adapter or not observation.screenshot:
+            return observation
+
+        # Create a new observation with the rescaled screenshot
+        processed_obs = Observation(
+            text=observation.text,
+            screenshot=self.adapter.rescale(observation.screenshot)
+        )
+        return processed_obs
+
+    @abstractmethod
+    async def fetch_response(self, observation: Observation) -> tuple[list[ActionT], bool]:
+        """
+        Fetch a response from the model based on the observation.
+
+        Args:
+            observation: The preprocessed observation
+
+        Returns:
+            tuple[list[ActionT], bool]: A tuple containing the list of raw actions and a
+                                        boolean indicating if the agent believes it has
+                                        completed the task
+        """
+        pass
+
+    def postprocess(self, actions: list[ActionT]) -> list[CLA]:
+        """
+        Convert model actions to HUD actions.
+
+        Args:
+            actions: The raw actions from the model
+
+        Returns:
+            Sequence[CLA]: The actions converted to HUD format
+        """
+        if not self.adapter:
+            raise ValueError("Cannot postprocess actions without an adapter")
+
+        return self.adapter.adapt_list(actions)
+
+    async def predict(self, observation: Observation) -> tuple[list[CLA] | list[ActionT], bool]:
+        """
+        Predict the next action based on the observation.
+
+        Implements the full three-stage prediction process.
+
+        Args:
+            observation: The observation from the environment
+
+        Returns:
+            tuple[list[CLA] | list[ActionT], bool]: A tuple containing the list of actions and a boolean
+                                                    indicating if the agent believes it has completed the task
+        """
+        # Stage 1: Preprocess the observation
+        processed_obs = self.preprocess(observation)
+
+        # Stage 2: Fetch response from the model
+        actions, done = await self.fetch_response(processed_obs)
+
+        # Stage 3: Postprocess the actions if we have an adapter
+        if self.adapter and actions:
+            hud_actions = self.postprocess(actions)
+            return hud_actions, done
+
+        # If no adapter, return actions as is
+        return actions, done
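
Since subclasses only need to implement fetch_response, a toy sketch of the contract is shown below; the scripted action and the bare Observation construction are illustrative assumptions, not part of this diff.

from hud.agent.base import Agent
from hud.env.environment import Observation

class ScriptedAgent(Agent[None, dict]):
    """Toy agent: emits one hard-coded raw action, then reports done."""

    def __init__(self) -> None:
        super().__init__(client=None, adapter=None)
        self.step = 0

    async def fetch_response(self, observation: Observation) -> tuple[list[dict], bool]:
        if self.step == 0:
            self.step += 1
            return [{"type": "screenshot"}], False
        return [], True

# With no adapter configured, predict() returns the raw actions unchanged:
# actions, done = await ScriptedAgent().predict(Observation(text="hello"))
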
hud/agent/claude.py ADDED
@@ -0,0 +1,207 @@
+import logging
+import os
+from typing import Any, cast
+
+from anthropic import AsyncAnthropic
+from anthropic.types.beta import (
+    BetaMessageParam,
+    BetaToolResultBlockParam,
+    BetaToolComputerUse20250124Param,
+    BetaTextBlockParam,
+    BetaImageBlockParam,
+)
+
+from hud.adapters import Adapter
+from hud.agent.base import Agent
+from hud.adapters.claude import ClaudeAdapter
+from hud.env.environment import Observation
+from hud.settings import settings
+
+logger = logging.getLogger(__name__)
+
+def base64_to_content_block(base64: str) -> BetaImageBlockParam:
+    return {
+        "type": "image",
+        "source": {
+            "type": "base64",
+            "media_type": "image/png",
+            "data": base64
+        }
+    }
+
+def text_to_content_block(text: str) -> BetaTextBlockParam:
+    return {
+        "type": "text",
+        "text": text
+    }
+
+def tool_use_content_block(tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]) -> BetaToolResultBlockParam:
+    return {
+        "type": "tool_result",
+        "tool_use_id": tool_use_id,
+        "content": content
+    }
+
+# Claude's Computer Use Tool definition
+COMPUTER_TOOL: BetaToolComputerUse20250124Param = {
+    "type": "computer_20250124",
+    "name": "computer",
+    "display_width_px": 1024,
+    "display_height_px": 768
+}
+
+class ClaudeAgent(Agent[AsyncAnthropic, Any]):
+    """
+    An agent implementation using Anthropic's Claude API with Computer Use.
+
+    This agent interacts with HUD environments using Claude's Computer Use API
+    through the ClaudeAdapter which converts actions to the format expected by HUD.
+    """
+
+    def __init__(
+        self,
+        client: AsyncAnthropic | None = None,
+        adapter: Adapter | None = None,
+        model: str = "claude-3-7-sonnet-20250219",
+        max_tokens: int = 4096,
+        max_iterations: int = 10,
+    ):
+        """
+        Initialize the ClaudeAgent.
+
+        Args:
+            client: The AsyncAnthropic client for API calls (optional, created automatically if not provided)
+            adapter: The adapter to use for preprocessing and postprocessing
+            model: The Claude model to use
+            max_tokens: Maximum tokens for Claude's response
+            max_iterations: Maximum number of iterations for the agent
+        """
+        # Initialize client if not provided
+        if client is None:
+            # Get API key from settings
+            api_key = settings.anthropic_api_key
+            if not api_key:
+                raise ValueError("Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY.")
+
+            # Create client
+            client = AsyncAnthropic(api_key=api_key)
+
+        adapter = adapter or ClaudeAdapter()
+
+        super().__init__(client=client, adapter=adapter)
+
+        self.model = model
+        self.max_tokens = max_tokens
+        self.max_iterations = max_iterations
+
+        # Default dimensions - will be updated if adapter is provided
+        self.width_px = 1024
+        self.height_px = 768
+
+        # Update dimensions if adapter is provided
+        if self.adapter:
+            self.width_px = self.adapter.agent_width
+            self.height_px = self.adapter.agent_height
+
+        # Message history
+        self.messages: list[BetaMessageParam] = []
+        self.pending_computer_use_tool_id = None
+
+    async def fetch_response(self, observation: Observation) -> tuple[list[Any], bool]:
+        """
+        Fetch a response from Claude based on the observation.
+
+        Args:
+            observation: The preprocessed observation
+
+        Returns:
+            tuple[list[Any], bool]: A tuple containing the list of raw actions and a
+                                    boolean indicating if the agent believes the task is complete
+        """
+        if not self.client:
+            raise ValueError("Client is required")
+
+        # Prepare the user content for Claude
+        user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
+
+        # Add text instruction if present
+        if observation.text:
+            logger.info("Adding text to user content: %s", observation.text)
+            user_content.append(text_to_content_block(str(observation.text)))
+
+        # Add screenshot if present
+        if observation.screenshot:
+            logger.info("Adding screenshot to user content")
+            if not self.pending_computer_use_tool_id:
+                logger.info("Adding screenshot to user content, no tool id")
+                user_content.append(base64_to_content_block(observation.screenshot))
+            else:
+                logger.info("Adding screenshot to user content, tool id: %s", self.pending_computer_use_tool_id)
+                user_content.append(
+                    tool_use_content_block(
+                        self.pending_computer_use_tool_id,
+                        [base64_to_content_block(observation.screenshot)]
+                    )
+                )
+                self.pending_computer_use_tool_id = None
+
+        # Add the user content to the messages
+        self.messages.append(cast(BetaMessageParam, {
+            "role": "user",
+            "content": user_content,
+        }))
+
+        # Call Claude API using async client
+        response = await self.client.beta.messages.create(
+            model=self.model,
+            max_tokens=self.max_tokens,
+            messages=self.messages,
+            tools=[COMPUTER_TOOL],
+            betas=["computer-use-2025-01-24"],
+            tool_choice={"type": "auto", "disable_parallel_tool_use": True}
+        )
+
+        # Add Claude's response to the conversation history
+        response_content = response.content
+        self.messages.append(cast(BetaMessageParam, {
+            "role": "assistant",
+            "content": response_content,
+        }))
+
+        # Process tool use
+        actions: list[Any] = []
+        done = True  # Assume we're done unless we find a tool use
+
+        for block in response_content:
+            logger.info("Processing block: %s", block)
+            if block.type == "tool_use":
+                logger.info("Processing tool use: %s", block)
+                assert block.name == "computer"
+
+                # Store the raw action
+                actions.append(block.input)
+                self.pending_computer_use_tool_id = block.id
+
+                # If we found a tool use, we're not done
+                done = False
+                break
+
+        # If no tool use action was found, check for a final text response
+        if not actions and done:
+            final_text_response = ""
+            for block in response_content:
+                if block.type == "text":
+                    final_text_response += block.text
+
+            if final_text_response.strip():
+                logger.info(f"No tool use found. Using final text as response: {final_text_response}")
+                actions = [{
+                    "action": "response",
+                    "text": final_text_response.strip()
+                }]
+                # Keep done = True
+            else:
+                logger.info("No tool use and no final text block found.")
+                # Keep done = True, actions remains empty
+
+        return actions, done
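
Putting the pieces together, a hedged sketch of a single prediction step with ClaudeAgent; it assumes ANTHROPIC_API_KEY is set and that Observation can be built from a text instruction alone, which this diff does not show.

import asyncio

from hud.agent import ClaudeAgent
from hud.env.environment import Observation

async def main() -> None:
    agent = ClaudeAgent()  # builds an AsyncAnthropic client from hud.settings
    obs = Observation(text="Open the settings page")  # screenshot omitted (assumed optional)
    actions, done = await agent.predict(obs)
    print(actions, done)

# asyncio.run(main())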