hud-python 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python might be problematic.
- hud/agents/base.py +30 -11
- hud/agents/misc/response_agent.py +1 -1
- hud/agents/openai_chat_generic.py +167 -34
- hud/datasets/execution/parallel.py +113 -37
- hud/otel/exporters.py +3 -0
- hud/otel/processors.py +3 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.17.dist-info → hud_python-0.4.19.dist-info}/METADATA +1 -1
- {hud_python-0.4.17.dist-info → hud_python-0.4.19.dist-info}/RECORD +13 -13
- {hud_python-0.4.17.dist-info → hud_python-0.4.19.dist-info}/WHEEL +0 -0
- {hud_python-0.4.17.dist-info → hud_python-0.4.19.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.17.dist-info → hud_python-0.4.19.dist-info}/licenses/LICENSE +0 -0
hud/agents/base.py
CHANGED
@@ -30,9 +30,19 @@ class MCPAgent(ABC):
     """
     Base class for MCP-enabled agents.
 
-    …
+    Provides common behavior for agents that interact with MCP servers, including:
+    - Client management: accepts an `AgentMCPClient` or auto-creates one at
+      runtime when `run()` is called with a `Task` that includes `mcp_config`.
+    - Tool lifecycle: discovery, filtering (`allowed_tools`, `disallowed_tools`),
+      and automatic marking of lifecycle tools (setup/evaluate) from a `Task`.
+    - Messaging: system prompt handling, optional inclusion of setup output on
+      the first turn, and control over initial screenshots.
+    - Telemetry & UX: standardized logging/printing via `HUDDesign` and optional
+      automatic tracing (`auto_trace`).
+
+    Subclasses implement provider-specific formatting and response fetching
+    by overriding these abstract methods: `get_system_messages`, `get_response`,
+    `format_blocks`, and `format_tool_results`.
     """
 
     metadata: dict[str, Any]
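The four abstract methods named above are the whole provider surface. A minimal sketch of a conforming subclass, assuming the plain-dict message shapes that GenericOpenAIChatAgent (below) uses; the `EchoAgent` class and its echo behavior are invented for illustration:

```python
from typing import Any

import mcp.types as types

from hud.agents.base import MCPAgent
from hud.types import AgentResponse, MCPToolCall, MCPToolResult


class EchoAgent(MCPAgent):
    """Toy subclass: echoes the last user message and never calls tools."""

    async def get_system_messages(self) -> list[Any]:
        return [{"role": "system", "content": self.system_prompt}]

    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
        texts = [b.text for b in blocks if isinstance(b, types.TextContent)]
        return [{"role": "user", "content": "".join(texts)}]

    async def get_response(self, messages: list[Any]) -> AgentResponse:
        last = messages[-1]["content"] if messages else ""
        # done=True ends the agent loop after a single turn.
        return AgentResponse(content=f"echo: {last}", tool_calls=[], done=True)

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[Any]:
        return [
            {"role": "tool", "tool_call_id": call.id, "content": str(res.content)}
            for call, res in zip(tool_calls, tool_results, strict=False)
        ]
```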
@@ -59,14 +69,23 @@ class MCPAgent(ABC):
         Initialize the base MCP agent.
 
         Args:
-            mcp_client: …
+            mcp_client: Client for connecting to MCP servers. If None, a client
+                is auto-created at runtime when `run()` is called with a `Task`
+                that provides `mcp_config`.
+            allowed_tools: Names of tools to allow (None means allow all).
+            disallowed_tools: Names of tools to always exclude.
+            lifecycle_tools: Tools reserved for lifecycle phases (e.g., setup,
+                evaluate). These are hidden from normal tool calling.
+            system_prompt: System prompt to seed the conversation.
+            append_setup_output: Whether to append setup tool output to the
+                first turn's messages.
+            initial_screenshot: Whether to include an initial screenshot before
+                the first prompt (when supported by the environment).
+            model_name: Label used in telemetry/logging to identify the model.
+            response_agent: Optional automation that can respond to the model's
+                outputs to keep the loop going (e.g., auto-continue/stop).
+            auto_trace: If True, automatically creates a trace/span for runs.
+            verbose: If True, increases logging verbosity for developer UX.
         """
 
         self.mcp_client = mcp_client
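Read together with the class docstring, these arguments compose like this; a hedged sketch in which the `Task` import path and field names (`prompt`, the `mcp_config` shape) are assumptions, not confirmed by this diff:

```python
import asyncio

from openai import AsyncOpenAI

from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
from hud.datasets import Task  # assumed import path


async def main() -> None:
    # No mcp_client passed: per the new docstring, one is auto-created
    # at run() time because the task carries an mcp_config.
    agent = GenericOpenAIChatAgent(
        openai_client=AsyncOpenAI(),  # uses OPENAI_API_KEY from the environment
        system_prompt="You are a careful agent.",
        disallowed_tools=["delete_everything"],  # illustrative tool name
        auto_trace=True,
    )
    task = Task(
        prompt="Check the homepage title",                   # illustrative field
        mcp_config={"local": {"command": "my-mcp-server"}},  # illustrative shape
    )
    result = await agent.run(task)
    print(result)


asyncio.run(main())
```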
hud/agents/openai_chat_generic.py
CHANGED

@@ -7,7 +7,7 @@ through the existing :class:`hud.agent.MCPAgent` scaffolding.
 Key points:
 - Stateless, no special server-side conversation state is assumed.
 - Accepts an :class:`openai.AsyncOpenAI` client, caller can supply their own
-  base_url / api_key (e.g. …)
+  base_url / api_key (e.g. llama.cpp, together.ai, …)
 - All HUD features (step_count, OTel spans, tool filtering, screenshots, …)
   come from the ``MCPAgent`` base class, we only implement the three abstract
   methods

@@ -21,6 +21,7 @@ from typing import TYPE_CHECKING, Any, cast
 
 import mcp.types as types
 
+from hud import instrument
 from hud.types import AgentResponse, MCPToolCall, MCPToolResult
 
 from .base import MCPAgent

@@ -29,8 +30,6 @@ if TYPE_CHECKING:
     from openai import AsyncOpenAI
     from openai.types.chat import ChatCompletionToolParam
 
-    from hud.clients import AgentMCPClient
-
 logger = logging.getLogger(__name__)
 
 
@@ -39,19 +38,20 @@ class GenericOpenAIChatAgent(MCPAgent):
 
     def __init__(
         self,
-        mcp_client: AgentMCPClient,
         *,
         openai_client: AsyncOpenAI,
        model_name: str = "gpt-4o-mini",
        parallel_tool_calls: bool = False,
-        …
+        completion_kwargs: dict[str, Any] | None = None,
        **agent_kwargs: Any,
    ) -> None:
-        …
+        # Accept base-agent settings via **agent_kwargs (e.g., mcp_client, system_prompt, etc.)
+        super().__init__(**agent_kwargs)
        self.oai = openai_client
        self.model_name = model_name
        self.parallel_tool_calls = parallel_tool_calls
-        self.…
+        self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
+        self.conversation_history = []
 
     @staticmethod
     def _oai_to_mcp(tool_call: Any) -> MCPToolCall:  # type: ignore[valid-type]
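With `mcp_client` gone from the signature, base-agent options ride in `**agent_kwargs` and provider knobs go in `completion_kwargs`. A sketch against a local llama.cpp-style endpoint (URL, model name, and kwarg values are illustrative):

```python
from openai import AsyncOpenAI

from hud.agents.openai_chat_generic import GenericOpenAIChatAgent

# Any OpenAI-compatible endpoint works; llama.cpp's server is one example.
client = AsyncOpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

agent = GenericOpenAIChatAgent(
    openai_client=client,
    model_name="qwen2.5-7b-instruct",  # whatever the server exposes
    completion_kwargs={
        "temperature": 0.2,   # forwarded verbatim to chat.completions.create...
        "max_tokens": 512,
        "model": "ignored",   # ...except protected keys, dropped in get_response below
    },
    # mcp_client, system_prompt, allowed_tools, etc. now travel via **agent_kwargs
    system_prompt="Answer briefly.",
)
```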
@@ -64,54 +64,144 @@ class GenericOpenAIChatAgent(MCPAgent):
 
     async def get_system_messages(self) -> list[Any]:
         """Get system messages for OpenAI."""
-        return [
-            {"role": "system", "content": self.system_prompt},
-        ]
+        return [{"role": "system", "content": self.system_prompt}]
 
     async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
         """Format blocks for OpenAI."""
-        …
+        content = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                content.append({"type": "text", "text": block.text})
+            elif isinstance(block, types.ImageContent):
+                content.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"},
+                    }
+                )
+
+        return [{"role": "user", "content": content}]
+
+    def _sanitize_schema_for_openai(self, schema: dict) -> dict:
+        """Convert MCP JSON Schema to OpenAI-compatible format.
+
+        Handles unsupported features like anyOf and prefixItems.
+        """
+        if not isinstance(schema, dict):
+            return schema
+
+        sanitized = {}
+
+        for key, value in schema.items():
+            if key == "anyOf" and isinstance(value, list):
+                # Handle anyOf patterns (usually for nullable fields)
+                non_null_types = [
+                    v for v in value if not (isinstance(v, dict) and v.get("type") == "null")
+                ]
+                if non_null_types:
+                    # Use the first non-null type
+                    sanitized.update(self._sanitize_schema_for_openai(non_null_types[0]))
+                else:
+                    sanitized["type"] = "string"  # Fallback
+
+            elif key == "prefixItems":
+                # Convert prefixItems to simple items
+                sanitized["type"] = "array"
+                if isinstance(value, list) and value:
+                    # Use the type from the first item as the items schema
+                    first_item = value[0]
+                    if isinstance(first_item, dict):
+                        sanitized["items"] = {"type": first_item.get("type", "string")}
+                    else:
+                        sanitized["items"] = {"type": "string"}
+
+            elif key == "properties" and isinstance(value, dict):
+                # Recursively sanitize property schemas
+                sanitized[key] = {
+                    prop_name: self._sanitize_schema_for_openai(prop_schema)
+                    for prop_name, prop_schema in value.items()
+                }
+
+            elif key == "items" and isinstance(value, dict):
+                # Recursively sanitize items schema
+                sanitized[key] = self._sanitize_schema_for_openai(value)
+
+            elif key in (
+                "type",
+                "description",
+                "enum",
+                "required",
+                "default",
+                "minimum",
+                "maximum",
+                "minItems",
+                "maxItems",
+            ):
+                # These are supported by OpenAI
+                sanitized[key] = value
+
+        return sanitized or {"type": "object"}
 
     def get_tool_schemas(self) -> list[dict]:
         tool_schemas = super().get_tool_schemas()
         openai_tools = []
         for schema in tool_schemas:
+            parameters = schema.get("parameters", {})
+
+            if parameters:
+                sanitized_params = self._sanitize_schema_for_openai(parameters)
+            else:
+                sanitized_params = {"type": "object", "properties": {}}
+
             openai_tool = {
                 "type": "function",
                 "function": {
                     "name": schema["name"],
                     "description": schema.get("description", ""),
-                    "parameters": …
+                    "parameters": sanitized_params,
                 },
             }
             openai_tools.append(openai_tool)
         return openai_tools
 
+    @instrument(
+        span_type="agent",
+        record_args=False,
+        record_result=True,
+    )
     async def get_response(self, messages: list[Any]) -> AgentResponse:
         """Send chat request to OpenAI and convert the response."""
+
         # Convert MCP tool schemas to OpenAI format
         mcp_schemas = self.get_tool_schemas()
 
+        protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
+        extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
+
         response = await self.oai.chat.completions.create(
             model=self.model_name,
             messages=messages,
             tools=cast("list[ChatCompletionToolParam]", mcp_schemas),
             parallel_tool_calls=self.parallel_tool_calls,
-            …
+            **extra,
         )
 
         choice = response.choices[0]
         msg = choice.message
 
+        assistant_msg: dict[str, Any] = {"role": "assistant"}
+
+        if msg.content:
+            assistant_msg["content"] = msg.content
+
+        if msg.tool_calls:
+            assistant_msg["tool_calls"] = msg.tool_calls
+
+        messages.append(assistant_msg)
+
+        # Store the complete conversation history
+        self.conversation_history = messages.copy()
+
         tool_calls = []
         if msg.tool_calls:
             for tc in msg.tool_calls:
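To trace `_sanitize_schema_for_openai`, here is a hypothetical MCP parameter schema (not taken from the package) and the output the branches above would produce:

```python
# Hypothetical MCP schema fed through the sanitizer:
mcp_schema = {
    "type": "object",
    "properties": {
        "name": {"anyOf": [{"type": "string"}, {"type": "null"}]},            # nullable field
        "point": {"prefixItems": [{"type": "number"}, {"type": "number"}]},   # tuple-like
    },
    "required": ["name"],
}

# Expected result: anyOf collapses to its first non-null branch, and
# prefixItems becomes a plain items array typed after its first element.
expected = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "point": {"type": "array", "items": {"type": "number"}},
    },
    "required": ["name"],
}
```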
@@ -123,7 +213,7 @@ class GenericOpenAIChatAgent(MCPAgent):
         return AgentResponse(
             content=msg.content or "",
             tool_calls=tool_calls,
-            done=choice.finish_reason …
+            done=choice.finish_reason in ("stop", "length"),
             raw=response,  # Include raw response for access to Choice objects
         )
@@ -132,23 +222,66 @@ class GenericOpenAIChatAgent(MCPAgent):
         tool_calls: list[MCPToolCall],
         tool_results: list[MCPToolResult],
     ) -> list[Any]:
-        """Render MCP tool results as OpenAI …
+        """Render MCP tool results as OpenAI messages.
+
+        Note: OpenAI tool messages only support string content.
+        When images are present, we return both a tool message and a user message.
+        """
         rendered: list[dict[str, Any]] = []
         for call, res in zip(tool_calls, tool_results, strict=False):
-            if …
+            # Use structuredContent.result if available, otherwise use content
+            items = res.content
+            if res.structuredContent and isinstance(res.structuredContent, dict):
+                items = res.structuredContent.get("result", res.content)
+
+            # Separate text and image content
+            text_parts = []
+            image_parts = []
+
+            for item in items:
+                if isinstance(item, dict):
+                    if item.get("type") == "text":
+                        text_parts.append(item.get("text", ""))
+                    elif item.get("type") == "image":
+                        mime_type = item.get("mimeType", "image/png")
+                        data = item.get("data", "")
+                        image_parts.append(
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:{mime_type};base64,{data}"},
+                            }
+                        )
+                elif isinstance(item, types.TextContent):
+                    text_parts.append(item.text)
+                elif isinstance(item, types.ImageContent):
+                    image_parts.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"},
+                        }
+                    )
+
+            text_content = "".join(text_parts) if text_parts else "Tool executed successfully"
             rendered.append(
                 {
                     "role": "tool",
                     "tool_call_id": call.id,
-                    "content": …
+                    "content": text_content,
                 }
             )
+
+            # If there are images, add them as a separate user message
+            if image_parts:
+                # Add a user message with the images
+                content_with_images = [
+                    {"type": "text", "text": "Tool returned the following:"},
+                    *image_parts,
+                ]
+                rendered.append(
+                    {
+                        "role": "user",
+                        "content": content_with_images,
+                    }
+                )
+
         return rendered
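Concretely, a tool result carrying text plus a screenshot now renders as two messages, since OpenAI tool messages cannot carry images; all values below are invented:

```python
# Hypothetical rendered output for one tool call returning text plus an image:
rendered = [
    {
        "role": "tool",
        "tool_call_id": "call_abc123",    # invented id
        "content": "Clicked the button",  # text parts joined
    },
    {
        "role": "user",  # images cannot ride on a tool message
        "content": [
            {"type": "text", "text": "Tool returned the following:"},
            {
                "type": "image_url",
                "image_url": {"url": "data:image/png;base64,iVBORw0KGgo..."},
            },
        ],
    },
]
```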
hud/datasets/execution/parallel.py
CHANGED

@@ -40,6 +40,7 @@ def _process_worker(
     2. Creates its own event loop
     3. Processes a batch of tasks asynchronously
     4. Returns results with their original indices
+    5. Handles interruption signals gracefully
 
     Args:
         task_batch: List of (index, task_dict) tuples

@@ -58,6 +59,7 @@ def _process_worker(
         List of (index, result) tuples
     """
     # Import inside worker to avoid pickling issues
+    import signal
     import sys
 
     import hud

@@ -72,6 +74,14 @@ def _process_worker(
     except AttributeError:
         pass
 
+    # Set up signal handler for clean interruption
+    def signal_handler(signum: int, frame: Any) -> None:
+        logger.warning("Worker %s: Received interrupt signal", worker_id)
+        # Raise KeyboardInterrupt to actually interrupt the worker
+        raise KeyboardInterrupt(f"Worker {worker_id} interrupted by user")
+
+    signal.signal(signal.SIGINT, signal_handler)
+
     # Reinitialize telemetry in this process
     configure_telemetry()
 

@@ -157,8 +167,25 @@ def _process_worker(
     # Process all tasks in parallel within this process
     tasks = [process_single_task(idx, task_dict) for idx, task_dict in task_batch]
 
-    …
+    try:
+        results = await asyncio.gather(*tasks, return_exceptions=False)
+        return results
+    except asyncio.CancelledError:
+        logger.info("Worker %s: Tasks cancelled due to interruption", worker_id)
+        # Return error results for all tasks
+        return [
+            (
+                idx,
+                {
+                    "error": "Task cancelled (Ctrl+C)",
+                    "isError": True,
+                    "reward": 0.0,
+                    "done": False,
+                    "content": "Task cancelled",
+                },
+            )
+            for idx, _ in task_batch
+        ]
 
     try:
         # Run the async batch processing

@@ -180,6 +207,24 @@ def _process_worker(
         logger.warning("Worker %s: Telemetry flush timed out", worker_id)
 
         return results
+    except KeyboardInterrupt:
+        logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id)
+        # Return partial results for tasks that completed
+        partial_results = []
+        for idx, _ in task_batch:
+            partial_results.append(
+                (
+                    idx,
+                    {
+                        "error": "Worker interrupted by user (Ctrl+C)",
+                        "isError": True,
+                        "reward": 0.0,
+                        "done": False,
+                        "content": "Task interrupted",
+                    },
+                )
+            )
+        return partial_results
     except Exception as e:
         logger.error("[Worker %s] Batch processing failed: %s", worker_id, e)
         logger.error("Worker %s batch processing failed: %s", worker_id, e)
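The worker-side pattern is generic: pool children receive SIGINT too, and converting it to KeyboardInterrupt routes the interrupt through the ordinary except clauses that build structured error results. A standalone sketch of the same pattern, with invented names:

```python
import signal
import time
from concurrent.futures import ProcessPoolExecutor


def worker(worker_id: int) -> str:
    # Convert SIGINT into KeyboardInterrupt so normal exception handling runs.
    def on_sigint(signum, frame):
        raise KeyboardInterrupt(f"worker {worker_id} interrupted")

    signal.signal(signal.SIGINT, on_sigint)
    try:
        time.sleep(2)  # stand-in for real task processing
        return f"worker {worker_id}: finished"
    except KeyboardInterrupt:
        return f"worker {worker_id}: stopped cleanly"


if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as pool:
        for f in [pool.submit(worker, i) for i in range(2)]:
            print(f.result())  # Ctrl+C during the sleep yields "stopped cleanly"
```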
@@ -365,7 +410,8 @@ async def run_dataset_parallel_manual(
     )
 
     # Process batches in parallel using ProcessPoolExecutor
-    …
+    executor = ProcessPoolExecutor(max_workers=max_workers)
+    try:
         # Submit all batches to workers
         future_to_batch = {
             executor.submit(worker_func, batch, worker_id=i): batch

@@ -377,48 +423,78 @@ async def run_dataset_parallel_manual(
         total = len(task_dicts)
 
         # Process results as they complete
-        …
+        try:
+            for future in as_completed(future_to_batch):
+                batch = future_to_batch[future]
+
+                try:
+                    # Get results from this worker
+                    batch_results = future.result()
+
+                    # Place results in correct positions
+                    for index, result in batch_results:
+                        results[index] = result
+                        completed += 1
+
+                    # Calculate success rate so far
+                    successful_so_far = sum(
+                        1
+                        for r in results[:completed]
+                        if r is not None and getattr(r, "reward", 0) > 0
+                    )
 
-        …
+                    progress_msg = (
+                        f"Progress: {completed}/{total} tasks completed "
+                        f"({100 * completed / total:.1f}%) | "
+                        f"Success rate: {successful_so_far}/{completed} "
+                        f"({100 * successful_so_far / completed:.1f}%)"
+                    )
 
-        …
+                    logger.info(progress_msg)
 
-        …
+                except Exception as e:
+                    # Handle worker failure
+                    logger.error(
+                        "Worker failed with exception: %s\n%s", e, traceback.format_exc()
+                    )
 
-        …
+                    # Mark all tasks in this batch as failed
+                    for index, _ in batch:
+                        results[index] = {
+                            "error": f"Worker process failed: {e}",
+                            "isError": True,
+                            "reward": 0.0,
+                            "done": False,
+                            "content": f"Worker process failed: {e}",
+                        }
+                        completed += 1
+
+        except KeyboardInterrupt:
+            logger.warning("\n⚠️ Parallel evaluation interrupted by user (Ctrl+C)")
+            logger.info("Cancelling pending tasks...")
+
+            # Cancel all pending futures
+            for future in future_to_batch:
+                if not future.done():
+                    future.cancel()
+
+            # Mark uncompleted tasks as interrupted
+            for i, r in enumerate(results):
+                if r is None:
+                    results[i] = {
+                        "error": "Evaluation interrupted by user",
                         "isError": True,
                         "reward": 0.0,
                         "done": False,
-                        "content": …
+                        "content": "Task interrupted (Ctrl+C)",
                     }
-        …
+
+            logger.info("Interrupted after %s/%s tasks", completed, total)
+            raise  # Re-raise to propagate the interrupt
+
+    finally:
+        # Always shutdown the executor properly
+        executor.shutdown(wait=False, cancel_futures=True)
 
     # Verify all results are populated
     missing = [i for i, r in enumerate(results) if r is None]
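And the driver-side counterpart: the executor is created outside a `with` block precisely so the `finally` can call `shutdown(wait=False, cancel_futures=True)` whether the loop finishes or Ctrl+C lands mid-`as_completed`. A condensed stdlib-only sketch of that control flow (batch contents and worker function are invented):

```python
from concurrent.futures import ProcessPoolExecutor, as_completed


def run_batches(batches):
    results = {}
    executor = ProcessPoolExecutor(max_workers=4)
    try:
        futures = {executor.submit(sum, b): i for i, b in enumerate(batches)}
        try:
            for fut in as_completed(futures):
                results[futures[fut]] = fut.result()
        except KeyboardInterrupt:
            for fut in futures:
                fut.cancel()  # pending batches never start
            raise             # propagate the interrupt to the caller
    finally:
        # Runs on success and on interrupt alike; don't wait for stragglers.
        executor.shutdown(wait=False, cancel_futures=True)
    return results


if __name__ == "__main__":
    print(run_batches([[1, 2], [3, 4], [5, 6]]))  # {0: 3, 1: 7, 2: 11}
```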
hud/otel/exporters.py
CHANGED
@@ -14,6 +14,7 @@ from __future__ import annotations
 import contextlib
 import json
 import logging
+import time
 from collections import defaultdict
 from datetime import UTC, datetime
 from typing import TYPE_CHECKING, Any

@@ -362,5 +363,7 @@ class HudSpanExporter(SpanExporter):
         pass
 
     def force_flush(self, timeout_millis: int | None = None) -> bool:  # type: ignore[override]
+        if timeout_millis:
+            time.sleep(timeout_millis / 1000)
         # Synchronous export, nothing buffered here
         return True
hud/otel/processors.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+import time
 from typing import Any
 
 from opentelemetry import baggage

@@ -115,4 +116,6 @@ class HudEnrichmentProcessor(SpanProcessor):
         pass
 
     def force_flush(self, timeout_millis: int | None = None) -> bool:  # type: ignore[override]
+        if timeout_millis:
+            time.sleep(timeout_millis / 1000)
         return True
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED
{hud_python-0.4.17.dist-info → hud_python-0.4.19.dist-info}/RECORD
CHANGED

@@ -2,15 +2,15 @@ hud/__init__.py,sha256=BjAhZtsHbGN371Q8t3o4v4jltedkmDE85xW0yOILU9g,397
 hud/__main__.py,sha256=YR8Dq8OhINOsVfQ55PmRXXg4fEK84Rt_-rMtJ5rvhWo,145
 hud/settings.py,sha256=q9aZiHjvbL4oLE-N8AttTW4rmzS8zPMnsca-iMGyEGc,2362
 hud/types.py,sha256=gNnyS1G7aYHIR5sT3k3bOfSTFnPylUO6lNGLWbjbeYk,5149
-hud/version.py,sha256=…
+hud/version.py,sha256=TspylpJFduiccfFe6aqVOAungquP27FSDJSivDPvJ5E,105
 hud/agents/__init__.py,sha256=UoIkljWdbq4bM0LD-mSaw6w826EqdEjOk7r6glNYwYQ,286
-hud/agents/base.py,sha256=…
+hud/agents/base.py,sha256=t3bPRTKzGuejhSeo1jLNprlUv6zNU9ezQfP16tX_pXw,29562
 hud/agents/claude.py,sha256=_eD_XKZhVJ6grkHQfbS6JskztueomQcmJeGJMbfNdmE,14534
 hud/agents/langchain.py,sha256=1EgCy8jfjunsWxlPC5XfvfLS6_XZVrIF1ZjtHcrvhYw,9584
 hud/agents/openai.py,sha256=tvFYsZ5yaoLkfjMnHe-COxRttMsLRXBLPdSqgeipQRk,14257
-hud/agents/openai_chat_generic.py,sha256=…
+hud/agents/openai_chat_generic.py,sha256=PQAD4GGE6sHs8R95qpgDBHEbSOJ7WXCYGYFmd3Nic1g,10628
 hud/agents/misc/__init__.py,sha256=BYi4Ytp9b_vycpZFXnr5Oyw6ncKLNNGml8Jrb7bWUb4,136
-hud/agents/misc/response_agent.py,sha256=…
+hud/agents/misc/response_agent.py,sha256=pnaomb4H-QJm1YKU3tC1YnZXxOlDbTHIXaIH-6Nkb6I,3102
 hud/agents/tests/__init__.py,sha256=W-O-_4i34d9TTyEHV-O_q1Ai1gLhzwDaaPo02_TWQIY,34
 hud/agents/tests/test_base.py,sha256=F39ajSqASGUbPyPoWSY9KARFav62qNTK74W11Tr1Tg4,28970
 hud/agents/tests/test_claude.py,sha256=wqEKlzEvx8obz1sSm4NY0j-Zyt1qWNfDOmRqYIuAEd0,13069

@@ -79,7 +79,7 @@ hud/datasets/__init__.py,sha256=74T4mrjELKtE04XkZKwU8QAJcg2wjqXLqRO9s4GlPr4,678
 hud/datasets/task.py,sha256=V82HzRb2_c2MO9EG5ZcY-PMsLt3234Uks7WlkMta5HY,3615
 hud/datasets/utils.py,sha256=3hKvZTkZuCRkTeITB86nNdA1dtHZAqFfAdSPMtcTUhs,4275
 hud/datasets/execution/__init__.py,sha256=4m1AEpMQaUSJFVN_iAXvY6zFttVgZKwE6oQtC0Rrk7U,330
-hud/datasets/execution/parallel.py,sha256=…
+hud/datasets/execution/parallel.py,sha256=4aL1XpS3vOBqZjgs0vrMZJ4eAoi86Td8C-m5SUtVxMs,25231
 hud/datasets/execution/runner.py,sha256=EEvb90vvAqFXXx8NyVKLfK5p-gtsfJqiFJAoqSjyfXg,4695
 hud/misc/__init__.py,sha256=m_pprQQ-G-Y0Sd0NEiR8MtAMbElnuFZ2OWT8TXrw7c4,43
 hud/misc/claude_plays_pokemon.py,sha256=IthAkjDVr2Q-GNvX-QLJyMzN7-0pHqqJbagGNv2m7yo,10453

@@ -87,9 +87,9 @@ hud/otel/__init__.py,sha256=ii17ayoWiS5vAhA7UAmZ8TkmP52gs2pWyHsD46-uYbE,1003
 hud/otel/collector.py,sha256=jLZymZ8r7xt2VDuWexfbnT7PY1-0aiyLMgjBy8KDY1M,4497
 hud/otel/config.py,sha256=6np_C2UXhtKHHjY41HQxZElua2Eh_EUCBiRB_YuiSuc,6249
 hud/otel/context.py,sha256=C9MvO99cRSNNDEDC7ehO3eoTPnb6J7AemUYvEp57yEU,17774
-hud/otel/exporters.py,sha256=…
+hud/otel/exporters.py,sha256=RLAjWa8b2DJEU21740Idq4fmeIuabLEqGGUspcFDcH4,14331
 hud/otel/instrumentation.py,sha256=xTjrkn2p490lJ8UlSD1SfzkPZsD8XKDocQqYQfwMMKo,3775
-hud/otel/processors.py,sha256=…
+hud/otel/processors.py,sha256=-gGRbwifplcExDQBLfx_9tqWreDImULJNcENgO9q7VU,4700
 hud/otel/tests/__init__.py,sha256=VNJKBMaxTtbn7trW-1Ph50zCvCok_wTSGcI1HD6GOLA,43
 hud/otel/tests/test_processors.py,sha256=np0R4ssd9j6LJSJykJ5bNjl0POwNYNhgb7BqOZHwcMY,6778
 hud/server/__init__.py,sha256=8LUwgsXO8xiViWP7uImDwcOsWLu01r5F4r8U8qH3rSY,91

@@ -157,10 +157,10 @@ hud/utils/tests/test_init.py,sha256=2QLQSGgyP9wJhOvPCusm_zjJad0qApOZi1BXpxcdHXQ,
 hud/utils/tests/test_mcp.py,sha256=0pUa16mL-bqbZDXp5NHBnt1gO5o10BOg7zTMHZ1DNPM,4023
 hud/utils/tests/test_progress.py,sha256=QSF7Kpi03Ff_l3mAeqW9qs1nhK50j9vBiSobZq7T4f4,7394
 hud/utils/tests/test_telemetry.py,sha256=5jl7bEx8C8b-FfFUko5pf4UY-mPOR-9HaeL98dGtVHM,2781
-hud/utils/tests/test_version.py,sha256=…
+hud/utils/tests/test_version.py,sha256=bB6kVxiVIBfXJAEJpmnhn0ml3FG8Gk5ByfSd2fgoARc,160
 hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hud_python-0.4.17.dist-info/METADATA,sha256=…
-hud_python-0.4.17.dist-info/WHEEL,sha256=…
-hud_python-0.4.17.dist-info/entry_points.txt,sha256=…
-hud_python-0.4.17.dist-info/licenses/LICENSE,sha256=…
-hud_python-0.4.17.dist-info/RECORD,,
+hud_python-0.4.19.dist-info/METADATA,sha256=T-D9DILS-I5e6xdOmJOIU6wOPpXn3yY_zxk0bKFfPts,20287
+hud_python-0.4.19.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hud_python-0.4.19.dist-info/entry_points.txt,sha256=jJbodNFg1m0-CDofe5AHvB4zKBq7sSdP97-ohaQ3ae4,63
+hud_python-0.4.19.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
+hud_python-0.4.19.dist-info/RECORD,,

{hud_python-0.4.17.dist-info → hud_python-0.4.19.dist-info}/WHEEL: file without changes
{hud_python-0.4.17.dist-info → hud_python-0.4.19.dist-info}/entry_points.txt: file without changes
{hud_python-0.4.17.dist-info → hud_python-0.4.19.dist-info}/licenses/LICENSE: file without changes