hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic.

Files changed (59)
  1. hud/__init__.py +5 -3
  2. hud/adapters/__init__.py +2 -1
  3. hud/adapters/claude/adapter.py +13 -17
  4. hud/adapters/common/adapter.py +3 -3
  5. hud/adapters/common/tests/__init__.py +0 -0
  6. hud/adapters/common/tests/test_adapter.py +277 -0
  7. hud/adapters/common/types.py +3 -6
  8. hud/adapters/operator/adapter.py +22 -29
  9. hud/agent/__init__.py +9 -1
  10. hud/agent/base.py +28 -28
  11. hud/agent/claude.py +69 -60
  12. hud/agent/langchain.py +204 -0
  13. hud/agent/operator.py +75 -67
  14. hud/env/__init__.py +5 -5
  15. hud/env/client.py +2 -2
  16. hud/env/docker_client.py +37 -39
  17. hud/env/environment.py +91 -66
  18. hud/env/local_docker_client.py +5 -7
  19. hud/env/remote_client.py +40 -29
  20. hud/env/remote_docker_client.py +13 -3
  21. hud/evaluators/__init__.py +2 -3
  22. hud/evaluators/base.py +4 -3
  23. hud/evaluators/inspect.py +3 -8
  24. hud/evaluators/judge.py +34 -58
  25. hud/evaluators/match.py +42 -49
  26. hud/evaluators/remote.py +13 -26
  27. hud/evaluators/tests/__init__.py +0 -0
  28. hud/evaluators/tests/test_inspect.py +12 -0
  29. hud/evaluators/tests/test_judge.py +231 -0
  30. hud/evaluators/tests/test_match.py +115 -0
  31. hud/evaluators/tests/test_remote.py +98 -0
  32. hud/exceptions.py +167 -0
  33. hud/gym.py +12 -10
  34. hud/job.py +525 -47
  35. hud/server/__init__.py +2 -2
  36. hud/server/requests.py +148 -186
  37. hud/server/tests/__init__.py +0 -0
  38. hud/server/tests/test_requests.py +275 -0
  39. hud/settings.py +3 -2
  40. hud/task.py +12 -22
  41. hud/taskset.py +44 -11
  42. hud/trajectory.py +6 -9
  43. hud/types.py +14 -9
  44. hud/utils/__init__.py +2 -2
  45. hud/utils/common.py +37 -13
  46. hud/utils/config.py +44 -29
  47. hud/utils/progress.py +149 -0
  48. hud/utils/telemetry.py +10 -11
  49. hud/utils/tests/__init__.py +0 -0
  50. hud/utils/tests/test_common.py +52 -0
  51. hud/utils/tests/test_config.py +129 -0
  52. hud/utils/tests/test_progress.py +225 -0
  53. hud/utils/tests/test_telemetry.py +37 -0
  54. hud/utils/tests/test_version.py +8 -0
  55. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/METADATA +44 -21
  56. hud_python-0.2.3.dist-info/RECORD +62 -0
  57. hud_python-0.2.1.dist-info/RECORD +0 -44
  58. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/WHEEL +0 -0
  59. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/licenses/LICENSE +0 -0
hud/agent/base.py CHANGED
@@ -2,108 +2,108 @@ from abc import ABC, abstractmethod
  from typing import Sequence, TypeVar, Generic
 
  from hud.adapters import Adapter, CLA
- from hud.env.environment import Observation
+ from hud.utils.common import Observation
 
  # Generic type for different client types (Anthropic, OpenAI, etc.)
- ClientT = TypeVar('ClientT')
- ActionT = TypeVar('ActionT')
+ ClientT = TypeVar("ClientT")
+ ActionT = TypeVar("ActionT")
+
 
  class Agent(Generic[ClientT, ActionT], ABC):
      """
      Base class for all agents.
-
+
      Implements a three-stage prediction process:
      1. preprocess - Prepare observation data (e.g., rescale screenshot)
      2. fetch_response - Make API calls to get model response
      3. postprocess - Convert model actions to HUD format
-
+
      Subclasses only need to implement the fetch_response method.
      """
-
+
      def __init__(self, client: ClientT | None = None, adapter: Adapter | None = None):
          """
          Initialize the agent.
-
+
          Args:
              client: The client to use for API calls
              adapter: The adapter to use for preprocessing and postprocessing
          """
          self.client = client
          self.adapter = adapter
-
+
      def preprocess(self, observation: Observation) -> Observation:
          """
          Preprocess the observation before sending to the model.
-
+
          Args:
              observation: The raw observation from the environment
-
+
          Returns:
              Observation: The processed observation ready for the model
          """
          if not self.adapter or not observation.screenshot:
              return observation
-
+
          # Create a new observation with the rescaled screenshot
          processed_obs = Observation(
-             text=observation.text,
-             screenshot=self.adapter.rescale(observation.screenshot)
+             text=observation.text, screenshot=self.adapter.rescale(observation.screenshot)
          )
          return processed_obs
-
+
      @abstractmethod
      async def fetch_response(self, observation: Observation) -> tuple[list[ActionT], bool]:
          """
          Fetch a response from the model based on the observation.
-
+
          Args:
              observation: The preprocessed observation
-
+
          Returns:
              tuple[list[ActionT], bool]: A tuple containing the list of raw actions and a
                                          boolean indicating if the agent believes it has
                                          completed the task
          """
          pass
-
+
      def postprocess(self, actions: list[ActionT]) -> list[CLA]:
          """
          Convert model actions to HUD actions.
-
+
          Args:
              actions: The raw actions from the model
-
+
          Returns:
              Sequence[CLA]: The actions converted to HUD format
          """
          if not self.adapter:
              raise ValueError("Cannot postprocess actions without an adapter")
-
+
          return self.adapter.adapt_list(actions)
-
+
      async def predict(self, observation: Observation) -> tuple[list[CLA] | list[ActionT], bool]:
          """
          Predict the next action based on the observation.
-
+
          Implements the full three-stage prediction process.
-
+
          Args:
              observation: The observation from the environment
-
+
          Returns:
              tuple[list[CLA] | list[ActionT], bool]: A tuple containing the list of actions and a boolean
                                                      indicating if the agent believes it has completed the task
          """
          # Stage 1: Preprocess the observation
          processed_obs = self.preprocess(observation)
-
+
          # Stage 2: Fetch response from the model
          actions, done = await self.fetch_response(processed_obs)
-
+
          # Stage 3: Postprocess the actions if we have an adapter
          if self.adapter and actions:
              hud_actions = self.postprocess(actions)
              return hud_actions, done
-
+
          # If no adapter, return actions as is
-         return actions, done
+         return actions, done
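
For orientation, here is a minimal sketch (not part of this diff) of how a subclass plugs into the three-stage predict flow defined in hud/agent/base.py. The EchoAgent name and its trivial action format are hypothetical, and the example assumes Observation accepts text and screenshot keyword arguments (with screenshot allowed to be None), as the preprocess method above suggests.

import asyncio

from hud.agent.base import Agent
from hud.utils.common import Observation


class EchoAgent(Agent[None, dict]):
    # Hypothetical subclass: only fetch_response needs to be implemented.
    # preprocess/postprocess come from the base class and are effectively
    # no-ops here because no adapter is supplied.
    async def fetch_response(self, observation: Observation) -> tuple[list[dict], bool]:
        # Return one raw action and signal that the task is complete.
        return [{"action": "response", "text": observation.text or ""}], True


# predict() runs preprocess -> fetch_response; postprocess is skipped without an adapter.
actions, done = asyncio.run(EchoAgent().predict(Observation(text="hello", screenshot=None)))
print(actions, done)
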
hud/agent/claude.py CHANGED
@@ -1,5 +1,4 @@
  import logging
- import os
  from typing import Any, cast
 
  from anthropic import AsyncAnthropic
@@ -14,52 +13,48 @@ from anthropic.types.beta import (
  from hud.adapters import Adapter
  from hud.agent.base import Agent
  from hud.adapters.claude import ClaudeAdapter
- from hud.env.environment import Observation
+ from hud.utils.common import Observation
  from hud.settings import settings
 
  logger = logging.getLogger(__name__)
 
+
  def base64_to_content_block(base64: str) -> BetaImageBlockParam:
      return {
          "type": "image",
-         "source": {
-             "type": "base64",
-             "media_type": "image/png",
-             "data": base64
-         }
+         "source": {"type": "base64", "media_type": "image/png", "data": base64},
      }
 
+
  def text_to_content_block(text: str) -> BetaTextBlockParam:
-     return {
-         "type": "text",
-         "text": text
-     }
+     return {"type": "text", "text": text}
+
+
+ def tool_use_content_block(
+     tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]
+ ) -> BetaToolResultBlockParam:
+     return {"type": "tool_result", "tool_use_id": tool_use_id, "content": content}
 
- def tool_use_content_block(tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]) -> BetaToolResultBlockParam:
-     return {
-         "type": "tool_result",
-         "tool_use_id": tool_use_id,
-         "content": content
-     }
 
  # Claude's Computer Use Tool definition
  COMPUTER_TOOL: BetaToolComputerUse20250124Param = {
-     "type": "computer_20250124",
-     "name": "computer",
-     "display_width_px": 1024,
-     "display_height_px": 768
+     "type": "computer_20250124",
+     "name": "computer",
+     "display_width_px": 1024,
+     "display_height_px": 768,
  }
 
+
  class ClaudeAgent(Agent[AsyncAnthropic, Any]):
      """
      An agent implementation using Anthropic's Claude API with Computer Use.
-
+
      This agent interacts with HUD environments using Claude's Computer Use API
      through the ClaudeAdapter which converts actions to the format expected by HUD.
      """
-
+
      def __init__(
-         self,
+         self,
          client: AsyncAnthropic | None = None,
          adapter: Adapter | None = None,
          model: str = "claude-3-7-sonnet-20250219",
@@ -68,7 +63,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
      ):
          """
          Initialize the ClaudeAgent.
-
+
          Args:
              client: The AsyncAnthropic client for API calls (optional, created automatically if not provided)
              adapter: The adapter to use for preprocessing and postprocessing
@@ -81,28 +76,30 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
          # Get API key from settings
          api_key = settings.anthropic_api_key
          if not api_key:
-             raise ValueError("Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY.")
-
+             raise ValueError(
+                 "Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY."
+             )
+
          # Create client
          client = AsyncAnthropic(api_key=api_key)
 
          adapter = adapter or ClaudeAdapter()
-
+
          super().__init__(client=client, adapter=adapter)
-
+
          self.model = model
          self.max_tokens = max_tokens
          self.max_iterations = max_iterations
-
+
          # Default dimensions - will be updated if adapter is provided
          self.width_px = 1024
          self.height_px = 768
-
+
          # Update dimensions if adapter is provided
          if self.adapter:
              self.width_px = self.adapter.agent_width
              self.height_px = self.adapter.agent_height
-
+
          # Message history
          self.messages: list[BetaMessageParam] = []
          self.pending_computer_use_tool_id = None
@@ -110,17 +107,17 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
      async def fetch_response(self, observation: Observation) -> tuple[list[Any], bool]:
          """
          Fetch a response from Claude based on the observation.
-
+
          Args:
              observation: The preprocessed observation
-
+
          Returns:
              tuple[list[Any], bool]: A tuple containing the list of raw actions and a
                                      boolean indicating if the agent believes the task is complete
          """
          if not self.client:
              raise ValueError("Client is required")
-
+
          # Prepare the user content for Claude
          user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
 
@@ -128,7 +125,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
          if observation.text:
              logger.info("Adding text to user content: %s", observation.text)
              user_content.append(text_to_content_block(str(observation.text)))
-
+
          # Add screenshot if present
          if observation.screenshot:
              logger.info("Adding screenshot to user content")
@@ -136,20 +133,28 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
                  logger.info("Adding screenshot to user content, no tool id")
                  user_content.append(base64_to_content_block(observation.screenshot))
              else:
-                 logger.info("Adding screenshot to user content, tool id: %s", self.pending_computer_use_tool_id)
+                 logger.info(
+                     "Adding screenshot to user content, tool id: %s",
+                     self.pending_computer_use_tool_id,
+                 )
                  user_content.append(
                      tool_use_content_block(
-                         self.pending_computer_use_tool_id,
-                         [base64_to_content_block(observation.screenshot)]
+                         self.pending_computer_use_tool_id,
+                         [base64_to_content_block(observation.screenshot)],
                      )
                  )
                  self.pending_computer_use_tool_id = None
 
          # Add the user content to the messages
-         self.messages.append(cast(BetaMessageParam, {
-             "role": "user",
-             "content": user_content,
-         }))
+         self.messages.append(
+             cast(
+                 BetaMessageParam,
+                 {
+                     "role": "user",
+                     "content": user_content,
+                 },
+             )
+         )
 
          # Call Claude API using async client
          response = await self.client.beta.messages.create(
@@ -158,30 +163,35 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
              messages=self.messages,
              tools=[COMPUTER_TOOL],
              betas=["computer-use-2025-01-24"],
-             tool_choice={"type": "auto", "disable_parallel_tool_use": True}
+             tool_choice={"type": "auto", "disable_parallel_tool_use": True},
          )
 
          # Add Claude's response to the conversation history
          response_content = response.content
-         self.messages.append(cast(BetaMessageParam, {
-             "role": "assistant",
-             "content": response_content,
-         }))
+         self.messages.append(
+             cast(
+                 BetaMessageParam,
+                 {
+                     "role": "assistant",
+                     "content": response_content,
+                 },
+             )
+         )
 
          # Process tool use
          actions: list[Any] = []
          done = True  # Assume we're done unless we find a tool use
-
+
          for block in response_content:
              logger.info("Processing block: %s", block)
              if block.type == "tool_use":
                  logger.info("Processing tool use: %s", block)
                  assert block.name == "computer"
-
+
                  # Store the raw action
                  actions.append(block.input)
                  self.pending_computer_use_tool_id = block.id
-
+
                  # If we found a tool use, we're not done
                  done = False
                  break
@@ -192,16 +202,15 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
          for block in response_content:
              if block.type == "text":
                  final_text_response += block.text
-
+
          if final_text_response.strip():
-             logger.info(f"No tool use found. Using final text as response: {final_text_response}")
-             actions = [{
-                 "action": "response",
-                 "text": final_text_response.strip()
-             }]
+             logger.info(
+                 f"No tool use found. Using final text as response: {final_text_response}"
+             )
+             actions = [{"action": "response", "text": final_text_response.strip()}]
              # Keep done = True
          else:
-             logger.info("No tool use and no final text block found.")
-             # Keep done = True, actions remains empty
-
+             logger.info("No tool use and no final text block found.")
+             # Keep done = True, actions remains empty
+
          return actions, done
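
A short, hedged usage sketch for the reworked ClaudeAgent (not part of this diff). It assumes an Anthropic API key is available through hud settings (ANTHROPIC_API_KEY) and that, in a real run, the observation would come from a HUD environment rather than being constructed by hand.

import asyncio

from hud.agent.claude import ClaudeAgent
from hud.utils.common import Observation


async def main() -> None:
    # With no client or adapter passed, ClaudeAgent creates an AsyncAnthropic client
    # and a ClaudeAdapter itself, reading the API key from settings.
    agent = ClaudeAgent(model="claude-3-7-sonnet-20250219")

    # First step is text-only; later steps would attach the environment screenshot,
    # which fetch_response pairs with the pending computer-use tool id.
    obs = Observation(text="Open the browser settings page", screenshot=None)
    actions, done = await agent.predict(obs)
    print(actions, done)


asyncio.run(main())
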
hud/agent/langchain.py ADDED
@@ -0,0 +1,204 @@
+ import logging
+ from typing import Any, Generic, List, Optional, TypeVar, Union, cast
+
+ # Langchain imports
+ from langchain_core.language_models import BaseLanguageModel
+ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
+ from langchain_core.runnables import Runnable, RunnableSerializable
+ from pydantic import Field, BaseModel
+
+ # HUD imports
+ from hud.adapters import Adapter
+ from hud.agent.base import Agent
+ from hud.utils.common import Observation
+ from hud.adapters.common.types import (
+     ClickAction,
+     TypeAction,
+     ScrollAction,
+     MoveAction,
+     DragAction,
+     PressAction,
+     KeyDownAction,
+     KeyUpAction,
+     WaitAction,
+     ResponseAction,
+     CustomAction,
+     # Exclude ScreenshotFetch, PositionFetch as they are internal
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Define a Pydantic Union type representing exactly ONE possible CLA action
+ # This is what we'll ask the Langchain model to output.
+ SingleCLAction = Union[
+     ClickAction,
+     TypeAction,
+     ScrollAction,
+     MoveAction,
+     DragAction,
+     PressAction,
+     KeyDownAction,
+     KeyUpAction,
+     WaitAction,
+     ResponseAction,
+ ]
+
+
+ # Define a Pydantic model to wrap the single action, potentially making it
+ # easier for the LLM to consistently output the desired structure.
+ class StepAction(BaseModel):
+     """Wrapper model requesting a single concrete CLA action from the Langchain model."""
+
+     action: SingleCLAction = Field(
+         ..., description="The single CLA action to perform for this step."
+     )
+
+
+ # Generic Type for the Langchain Model/Runnable
+ # Allows flexibility in what the user provides (model, chain, etc.)
+ # Bound to BaseLanguageModel as .with_structured_output is expected
+ LangchainModelOrRunnable = TypeVar("LangchainModelOrRunnable", bound=BaseLanguageModel)
+
+
+ class LangchainAgent(Agent[LangchainModelOrRunnable, Any], Generic[LangchainModelOrRunnable]):
+     """
+     An agent that uses an arbitrary Langchain model or runnable, leveraging
+     Langchain's structured output capabilities to produce a single CLA action per step.
+     """
+
+     def __init__(
+         self,
+         langchain_model: LangchainModelOrRunnable,
+         adapter: Optional[Adapter] = None,
+         system_prompt: str | None = None,
+     ):
+         """
+         Initialize the LangchainAgent.
+
+         Args:
+             langchain_model: The Langchain language model or runnable chain to use.
+                              Must support asynchronous invocation (`ainvoke`) and
+                              `.with_structured_output()`.
+             adapter: An optional HUD adapter. If provided, it will be used for
+                      preprocessing observations (rescaling) and postprocessing
+                      the single CLA action (coordinate rescaling).
+             system_prompt: An optional system prompt to guide the Langchain model.
+                            If None, a default prompt encouraging single CLA output is used.
+         """
+         super().__init__(client=langchain_model, adapter=adapter)  # Store model as 'client'
+         self.langchain_model = langchain_model  # Also store with specific name
+
+         self.system_prompt_str = system_prompt or self._get_default_system_prompt()
+         self.history: List[BaseMessage] = []
+
+     def _get_default_system_prompt(self) -> str:
+         # TODO: Refine this prompt based on testing.
+         # It needs to strongly encourage outputting *only* the StepAction structure.
+         return (
+             "You are an agent interacting with a computer environment (either a web browser or an OS desktop). "
+             "Your goal is to follow the user's instructions based on the provided text and screenshot observations."
+             "For each step, you must choose exactly ONE action to perform from the available CLA action types."
+             "Output your chosen action using the provided 'StepAction' tool/function."
+             "If you believe the task is complete based on the user's prompt and the observations, use the 'ResponseAction'."
+         )
+
+     async def fetch_response(self, observation: Observation) -> tuple[list[dict], bool]:
+         """
+         Fetches a response from the configured Langchain model, expecting a single
+         structured CLA action.
+
+         Args:
+             observation: The preprocessed observation (screenshot potentially rescaled by adapter).
+
+         Returns:
+             A tuple containing:
+             - A list with a single dictionary representing the raw CLA action (before adapter postprocessing).
+             - A boolean indicating if the agent chose ResponseAction (task completion).
+         """
+         # 1. Format observation into Langchain message(s)
+         human_content: List[Union[str, dict]] = []
+         if observation.text:
+             human_content.append(observation.text)
+         if observation.screenshot:
+             # Assuming the Langchain model/chain can handle base64 images
+             # This might need adjustment based on the specific model used.
+             human_content.append(
+                 {
+                     "type": "image_url",
+                     "image_url": {"url": f"data:image/png;base64,{observation.screenshot}"},
+                 }
+             )
+
+         if not human_content:
+             logger.warning("LangchainAgent received an observation with no text or screenshot.")
+             # Decide how to handle empty observation - perhaps return no action?
+             return [], False  # Or raise an error?
+
+         current_human_message = HumanMessage(content=human_content)
+
+         # 2. Prepare message history for the model
+         messages_for_llm: List[BaseMessage] = [
+             SystemMessage(content=self.system_prompt_str),
+             *self.history,
+             current_human_message,
+         ]
+
+         # 3. Configure structured output
+         # We ask for the StepAction wrapper, which contains the actual SingleCLAAction
+         # Explicitly use method="function_calling" to handle schemas with default values
+         structured_llm = self.langchain_model.with_structured_output(
+             schema=StepAction, method="function_calling"
+         )
+
+         # 4. Invoke Langchain model asynchronously
+         try:
+             ai_response_structured = await structured_llm.ainvoke(messages_for_llm)
+         except Exception as e:
+             logger.error(f"Langchain model invocation failed: {e}", exc_info=True)
+             # Decide how to handle LLM errors - maybe retry or return empty action?
+             return [], False
+
+         # 5. Process the structured response
+         is_done = False
+         ai_message_content_for_history = ""  # For storing in history
+
+         if isinstance(ai_response_structured, StepAction):
+             # Successfully got the wrapper, extract the actual action
+             actual_action = ai_response_structured.action
+             ai_message_content_for_history = actual_action.model_dump()
+             if isinstance(actual_action, ResponseAction):
+                 is_done = True
+                 logger.info(
+                     f"LangchainAgent determined task is done with response: {actual_action.text[:100]}..."
+                 )
+             else:
+                 logger.info(f"LangchainAgent produced action: {type(actual_action).__name__}")
+
+         else:
+             logger.warning(
+                 f"Langchain model did not return the expected StepAction structure. "
+                 f"Received type: {type(ai_response_structured)}. Value: {ai_response_structured!r}"
+             )
+             # Attempt to add raw response to history for debugging
+             if isinstance(ai_response_structured, BaseMessage):
+                 ai_message_content_for_history = ai_response_structured.content
+             elif isinstance(ai_response_structured, str):
+                 ai_message_content_for_history = ai_response_structured
+             else:
+                 ai_message_content_for_history = repr(ai_response_structured)
+             # Return no action as we didn't get the expected structure
+             return [], False
+
+         # 6. Update history
+         self.history.append(current_human_message)
+         # Add the AI response (containing the structured action dict) to history
+         # Convert dict to string representation for AIMessage content
+         self.history.append(AIMessage(content=repr(ai_message_content_for_history)))
+         # TODO: Consider history truncation/summarization if it grows too long
+
+         if actual_action:
+             # Return the single action dictionary within a list
+             return [actual_action.model_dump()], is_done
+         else:
+             # Should ideally not happen if structure validation worked, but as a fallback
+             return [], is_done
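
A hedged usage sketch for the new LangchainAgent (not part of this diff). ChatOpenAI and the "gpt-4o" model name are stand-ins for any Langchain chat model that supports .with_structured_output() and ainvoke(); an OPENAI_API_KEY would need to be configured for this particular choice, and the Observation constructor arguments are assumed as in the earlier examples.

import asyncio

from langchain_openai import ChatOpenAI  # example model; any BaseLanguageModel with structured output works

from hud.agent.langchain import LangchainAgent
from hud.utils.common import Observation


async def main() -> None:
    # The agent wraps the model and asks it for exactly one StepAction per step,
    # reporting done=True when the model chooses a ResponseAction.
    agent = LangchainAgent(langchain_model=ChatOpenAI(model="gpt-4o"))

    obs = Observation(text="Click the blue 'Submit' button", screenshot=None)
    raw_actions, done = await agent.fetch_response(obs)
    print(raw_actions, done)


asyncio.run(main())
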