hud-python 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

hud/agents/claude.py CHANGED
@@ -6,7 +6,7 @@ import copy
6
6
  import logging
7
7
  from typing import TYPE_CHECKING, Any, ClassVar, cast
8
8
 
9
- from anthropic import AsyncAnthropic, BadRequestError
9
+ from anthropic import Anthropic, AsyncAnthropic, BadRequestError
10
10
  from anthropic.types.beta import BetaContentBlockParam, BetaImageBlockParam, BetaTextBlockParam
11
11
 
12
12
  import hud
@@ -54,6 +54,7 @@ class ClaudeAgent(MCPAgent):
54
54
  model: str = "claude-sonnet-4-20250514",
55
55
  max_tokens: int = 4096,
56
56
  use_computer_beta: bool = True,
57
+ validate_api_key: bool = True,
57
58
  **kwargs: Any,
58
59
  ) -> None:
59
60
  """
@@ -75,6 +76,13 @@ class ClaudeAgent(MCPAgent):
75
76
  raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY.")
76
77
  model_client = AsyncAnthropic(api_key=api_key)
77
78
 
79
+ # validate api key if requested
80
+ if validate_api_key:
81
+ try:
82
+ Anthropic(api_key=model_client.api_key).models.list()
83
+ except Exception as e:
84
+ raise ValueError(f"Anthropic API key is invalid: {e}") from e
85
+
78
86
  self.anthropic_client = model_client
79
87
  self.model = model
80
88
  self.max_tokens = max_tokens
hud/agents/openai.py CHANGED
@@ -6,7 +6,7 @@ import logging
6
6
  from typing import Any, ClassVar, Literal
7
7
 
8
8
  import mcp.types as types
9
- from openai import AsyncOpenAI
9
+ from openai import AsyncOpenAI, OpenAI
10
10
  from openai.types.responses import (
11
11
  ResponseComputerToolCall,
12
12
  ResponseInputMessageContentListParam,
@@ -45,6 +45,7 @@ class OperatorAgent(MCPAgent):
45
45
  model_client: AsyncOpenAI | None = None,
46
46
  model: str = "computer-use-preview",
47
47
  environment: Literal["windows", "mac", "linux", "browser"] = "linux",
48
+ validate_api_key: bool = True,
48
49
  **kwargs: Any,
49
50
  ) -> None:
50
51
  """
@@ -76,6 +77,13 @@ class OperatorAgent(MCPAgent):
76
77
  self.pending_call_id: str | None = None
77
78
  self.pending_safety_checks: list[Any] = []
78
79
 
80
+ # validate api key if requested
81
+ if validate_api_key:
82
+ try:
83
+ OpenAI(api_key=self.openai_client.api_key).models.list()
84
+ except Exception as e:
85
+ raise ValueError(f"OpenAI API key is invalid: {e}") from e
86
+
79
87
  self.model_name = "openai-" + self.model
80
88
 
81
89
  # Append OpenAI-specific instructions to the base system prompt
hud/cli/build.py CHANGED
@@ -489,7 +489,7 @@ def build_environment(
489
489
  hud_console.warning("Could not retrieve image ID for lock file")
490
490
 
491
491
  # Remove temp image after we're done
492
- subprocess.run(["docker", "rmi", temp_tag], capture_output=True) # noqa: S603, S607
492
+ subprocess.run(["docker", "rmi", "-f", temp_tag], capture_output=True) # noqa: S603, S607
493
493
 
494
494
  # Add to local registry
495
495
  if image_id:
hud/cli/eval.py CHANGED
@@ -295,7 +295,7 @@ async def run_full_dataset(
295
295
  agent_type: Literal["claude", "openai", "vllm"] = "claude",
296
296
  model: str | None = None,
297
297
  allowed_tools: list[str] | None = None,
298
- max_concurrent: int = 50,
298
+ max_concurrent: int = 30,
299
299
  max_steps: int = 10,
300
300
  parallel: bool = False,
301
301
  max_workers: int | None = None,
hud/datasets/runner.py CHANGED
@@ -22,7 +22,7 @@ async def run_dataset(
22
22
  dataset: str | Dataset | list[dict[str, Any]],
23
23
  agent_class: type[MCPAgent],
24
24
  agent_config: dict[str, Any] | None = None,
25
- max_concurrent: int = 50,
25
+ max_concurrent: int = 30,
26
26
  metadata: dict[str, Any] | None = None,
27
27
  max_steps: int = 10,
28
28
  split: str = "train",
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.4.34"
8
+ assert hud.__version__ == "0.4.35"
@@ -10,7 +10,8 @@ def _is_call_like(obj: Any) -> bool:
10
10
  return True
11
11
  if len(obj) == 1:
12
12
  _, v = next(iter(obj.items()))
13
- return isinstance(v, dict)
13
+ if isinstance(v, dict):
14
+ return "name" in v or (len(v) == 1 and isinstance(next(iter(v.values())), dict))
14
15
  return False
15
16
 
16
17
 
@@ -19,9 +20,9 @@ def _to_call_dict(obj: Any) -> Any:
19
20
 
20
21
  Rules:
21
22
  - If obj is a dict with {name, arguments}: return {name, arguments: recurse(arguments)}
22
- - Else if obj is a single-key dict {k: v}: return {name: k, arguments: recurse(v)}
23
+ - Else if obj is a single-key dict {k: v} where v looks call-like: return {name: k, arguments: recurse(v)}
23
24
  - Else: return obj unchanged (leaf arguments/value)
24
- """
25
+ """ # noqa: E501
25
26
  if isinstance(obj, dict):
26
27
  if "name" in obj and "arguments" in obj:
27
28
  args = obj.get("arguments")
@@ -31,8 +32,10 @@ def _to_call_dict(obj: Any) -> Any:
31
32
  return {"name": obj.get("name"), "arguments": args}
32
33
  if len(obj) == 1:
33
34
  k, v = next(iter(obj.items()))
34
- if isinstance(v, dict):
35
+ # Only convert single-key dicts if the value looks like it could be a call
36
+ if isinstance(v, dict) and _is_call_like(v):
35
37
  return {"name": k, "arguments": _to_call_dict(v)}
38
+ # Otherwise, leave it as-is (this is the innermost arguments dict)
36
39
  return obj
37
40
  return obj
38
41
 
hud/version.py CHANGED
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.4.34"
7
+ __version__ = "0.4.35"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.34
3
+ Version: 0.4.35
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -2,13 +2,13 @@ hud/__init__.py,sha256=JMDFUE1pP0J1Xl_miBdt7ERvoffZmTzSFe8yxz512A8,552
2
2
  hud/__main__.py,sha256=YR8Dq8OhINOsVfQ55PmRXXg4fEK84Rt_-rMtJ5rvhWo,145
3
3
  hud/settings.py,sha256=sMS31iW1m-5VpWk-Blhi5-obLcUA0fwxWE1GgJz-vqU,2708
4
4
  hud/types.py,sha256=RtNM2fPU1NAujTmZLOydQIU-ybk3gVRCoJ2TM2hJOlw,10752
5
- hud/version.py,sha256=yDsOrKQyUBp8e8KHUcrT2U56qKvsk4D-58XSWaUqJMU,105
5
+ hud/version.py,sha256=FINeU2_U4IFvIW-XEPRMxtXONropSKKTWBc10NjEGws,105
6
6
  hud/agents/__init__.py,sha256=UoIkljWdbq4bM0LD-mSaw6w826EqdEjOk7r6glNYwYQ,286
7
7
  hud/agents/base.py,sha256=_u1zR3gXzZ1RlTCUYdMcvgHqdJBC4-AB1lZt0yBx8lg,35406
8
- hud/agents/claude.py,sha256=wHiw8iAnjnRmZyKRKcOhagCDQMhz9Z6rlSBWqH1X--M,15781
8
+ hud/agents/claude.py,sha256=TGhm5gE2ltINDAdEsDxKuT9iGMQ5G87R6kmabU3KPt8,16101
9
9
  hud/agents/grounded_openai.py,sha256=U-FHjB2Nh1_o0gmlxY5F17lWJ3oHsNRIB2a7z-IKB64,11231
10
10
  hud/agents/langchain.py,sha256=1EgCy8jfjunsWxlPC5XfvfLS6_XZVrIF1ZjtHcrvhYw,9584
11
- hud/agents/openai.py,sha256=ovARRWNuHqKkZ2Q_OCYSVCIZckrh8XY2jUB2p2x1m88,14259
11
+ hud/agents/openai.py,sha256=O1xV1h1l-W8lmnmXqTYr5CwnmnaniMqOxAZbl2CTTng,14576
12
12
  hud/agents/openai_chat_generic.py,sha256=7n7timn3fvNRnL2xzWyOTeNTchej2r9cAL1mU6YnFdY,11605
13
13
  hud/agents/misc/__init__.py,sha256=BYi4Ytp9b_vycpZFXnr5Oyw6ncKLNNGml8Jrb7bWUb4,136
14
14
  hud/agents/misc/response_agent.py,sha256=uMuRDkz5QgaMQliNzBRepond5sb7KyqIiKm3LstjVnw,3753
@@ -21,11 +21,11 @@ hud/agents/tests/test_openai.py,sha256=1S5IZuc3O3moSp70gqVGjc6m-_b49dCfz2fgX5IGv
21
21
  hud/cli/__init__.py,sha256=xL1l5MfdWubd9AWe-cpW64WFS1SVsTgI8fdNdTZhIvs,40259
22
22
  hud/cli/__main__.py,sha256=fDH7XITyuDITwSDIVwRso06aouADO0CzTHKqp5TOwJE,143
23
23
  hud/cli/analyze.py,sha256=4u5oYfJMquOjT9PzzRTYVcTZDxDi0ilNP_g532_hpOU,14716
24
- hud/cli/build.py,sha256=X8ykInqvDpsBU0rOU_x_sbp27YwCd160hplZLcwZFEg,18479
24
+ hud/cli/build.py,sha256=cCsCgUD-vX7ZL5h14dGadig_PWRdcQKBdj1MV0C9CTk,18485
25
25
  hud/cli/clone.py,sha256=AwVDIuhr8mHb1oT2Af2HrD25SiTdwATpE6zd93vzLgA,6099
26
26
  hud/cli/debug.py,sha256=jtFW8J5F_3rhq1Hf1_SkJ7aLS3wjnyIs_LsC8k5cnzc,14200
27
27
  hud/cli/dev.py,sha256=56vQdH9oe_XGnOcRcFbNIsLEoBnpCl1eANlRFUeddHQ,31734
28
- hud/cli/eval.py,sha256=W_eY4uoIQwHcSCvxNaQeRfWC10uQA1UhBWiNQzQPuXM,22694
28
+ hud/cli/eval.py,sha256=53Xx2Yv6yJrNqvU242qBb8hs2Twh1RIoizNvYy6dGKY,22694
29
29
  hud/cli/get.py,sha256=sksKrdzBGZa7ZuSoQkc0haj-CvOGVSSikoVXeaUd3N4,6274
30
30
  hud/cli/init.py,sha256=McZwpxZMXD-It_PXINCUy-SwUaPiQ7jdpSU5-F-caO8,19671
31
31
  hud/cli/list_func.py,sha256=EVi2Vc3Lb3glBNJxFx4MPnZknZ4xmuJz1OFg_dc8a_E,7177
@@ -88,7 +88,7 @@ hud/clients/utils/retry.py,sha256=mMs2T_mAlb8AYhSqMR4AmCw7838gqCC4mdG3zjMAYM4,57
88
88
  hud/clients/utils/retry_transport.py,sha256=Rsq25eiKKt_pM1bas78QEZvO0illK97X_3opmaS3A3w,6809
89
89
  hud/datasets/__init__.py,sha256=-g05iDy76CU4JiRHjKBBhgh3STtiIjmWhUfPqgf5hJE,697
90
90
  hud/datasets/parallel.py,sha256=m7_z2QwjaRuM9gJFYyiPIJUwrlTxZSvFMAd9L2IDZEo,25772
91
- hud/datasets/runner.py,sha256=2KhGEDzYW_qrSCaNJmsKqiAYZE_-h5VaQ7kv8rSe7Fw,4687
91
+ hud/datasets/runner.py,sha256=43Ua1PUQgnb6cdO9YXJM7kxdlmxPeSV4478Azy5HVGU,4687
92
92
  hud/datasets/utils.py,sha256=hdZfjWH5l3FVJaWBSHEEpjujAG7DqEam_vHgslL8MLs,4279
93
93
  hud/misc/__init__.py,sha256=m_pprQQ-G-Y0Sd0NEiR8MtAMbElnuFZ2OWT8TXrw7c4,43
94
94
  hud/misc/claude_plays_pokemon.py,sha256=IthAkjDVr2Q-GNvX-QLJyMzN7-0pHqqJbagGNv2m7yo,10453
@@ -194,17 +194,17 @@ hud/utils/pretty_errors.py,sha256=WGeL4CTHtlA6KgPuV_JSX5l6H4-xbuTp6Y6tw1bkiFg,24
194
194
  hud/utils/progress.py,sha256=suikwFM8sdSfkV10nAOEaInDhG4XKgOSvFePg4jSj1A,5927
195
195
  hud/utils/tasks.py,sha256=JwFIq0cpPMpMYnICUmx_G4CF6uy9MtiCmmmN7eA6FsA,4682
196
196
  hud/utils/telemetry.py,sha256=hrVIx2rUjSGyy9IVxTZ_3Jii83PiHjyFRd5ls2whimM,1863
197
- hud/utils/tool_shorthand.py,sha256=nWo-Z7D4w8qF1lWKP7TkXMHZiU3vj4jAwfcBXkwrpnE,1833
197
+ hud/utils/tool_shorthand.py,sha256=_haLgK3yazLR2Y0jlEHUUQjw9uZCxi9yTipAwdOAJ70,2148
198
198
  hud/utils/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
199
199
  hud/utils/tests/test_async_utils.py,sha256=RkdSnYErRV3Jn7dfg6CPlcE1RSUL__2B627oIqAyy1s,5945
200
200
  hud/utils/tests/test_init.py,sha256=2QLQSGgyP9wJhOvPCusm_zjJad0qApOZi1BXpxcdHXQ,383
201
201
  hud/utils/tests/test_mcp.py,sha256=0pUa16mL-bqbZDXp5NHBnt1gO5o10BOg7zTMHZ1DNPM,4023
202
202
  hud/utils/tests/test_progress.py,sha256=QSF7Kpi03Ff_l3mAeqW9qs1nhK50j9vBiSobZq7T4f4,7394
203
203
  hud/utils/tests/test_telemetry.py,sha256=5jl7bEx8C8b-FfFUko5pf4UY-mPOR-9HaeL98dGtVHM,2781
204
- hud/utils/tests/test_version.py,sha256=YmDs3a476rPIo4rLodsx3zqGeIIeDCXLjenmKK4-rkE,160
204
+ hud/utils/tests/test_version.py,sha256=gVwJvjGLJ5VNZYJPFRHyfTPWah6I0M4JS0sYTWLoGM4,160
205
205
  hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
206
- hud_python-0.4.34.dist-info/METADATA,sha256=8ju6prkWIMJ5ChDPU-evkWqWucKZ1QyU3XTM2717RYA,20861
207
- hud_python-0.4.34.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
208
- hud_python-0.4.34.dist-info/entry_points.txt,sha256=jJbodNFg1m0-CDofe5AHvB4zKBq7sSdP97-ohaQ3ae4,63
209
- hud_python-0.4.34.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
210
- hud_python-0.4.34.dist-info/RECORD,,
206
+ hud_python-0.4.35.dist-info/METADATA,sha256=bSffhIrX5P4LCM-rTGuZz71gwsIPkoqzFYu-wPcH2SE,20861
207
+ hud_python-0.4.35.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
208
+ hud_python-0.4.35.dist-info/entry_points.txt,sha256=jJbodNFg1m0-CDofe5AHvB4zKBq7sSdP97-ohaQ3ae4,63
209
+ hud_python-0.4.35.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
210
+ hud_python-0.4.35.dist-info/RECORD,,