hud-python 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. More details are available on the original registry diff page.

Files changed (192)
  1. hud/__init__.py +22 -89
  2. hud/agents/__init__.py +15 -0
  3. hud/agents/art.py +101 -0
  4. hud/agents/base.py +599 -0
  5. hud/{mcp → agents}/claude.py +373 -321
  6. hud/{mcp → agents}/langchain.py +250 -250
  7. hud/agents/misc/__init__.py +7 -0
  8. hud/{agent → agents}/misc/response_agent.py +80 -80
  9. hud/{mcp → agents}/openai.py +352 -334
  10. hud/agents/openai_chat_generic.py +154 -0
  11. hud/{mcp → agents}/tests/__init__.py +1 -1
  12. hud/agents/tests/test_base.py +742 -0
  13. hud/agents/tests/test_claude.py +324 -0
  14. hud/{mcp → agents}/tests/test_client.py +363 -324
  15. hud/{mcp → agents}/tests/test_openai.py +237 -238
  16. hud/cli/__init__.py +617 -0
  17. hud/cli/__main__.py +8 -0
  18. hud/cli/analyze.py +371 -0
  19. hud/cli/analyze_metadata.py +230 -0
  20. hud/cli/build.py +427 -0
  21. hud/cli/clone.py +185 -0
  22. hud/cli/cursor.py +92 -0
  23. hud/cli/debug.py +392 -0
  24. hud/cli/docker_utils.py +83 -0
  25. hud/cli/init.py +281 -0
  26. hud/cli/interactive.py +353 -0
  27. hud/cli/mcp_server.py +756 -0
  28. hud/cli/pull.py +336 -0
  29. hud/cli/push.py +370 -0
  30. hud/cli/remote_runner.py +311 -0
  31. hud/cli/runner.py +160 -0
  32. hud/cli/tests/__init__.py +3 -0
  33. hud/cli/tests/test_analyze.py +284 -0
  34. hud/cli/tests/test_cli_init.py +265 -0
  35. hud/cli/tests/test_cli_main.py +27 -0
  36. hud/cli/tests/test_clone.py +142 -0
  37. hud/cli/tests/test_cursor.py +253 -0
  38. hud/cli/tests/test_debug.py +453 -0
  39. hud/cli/tests/test_mcp_server.py +139 -0
  40. hud/cli/tests/test_utils.py +388 -0
  41. hud/cli/utils.py +263 -0
  42. hud/clients/README.md +143 -0
  43. hud/clients/__init__.py +16 -0
  44. hud/clients/base.py +379 -0
  45. hud/clients/fastmcp.py +222 -0
  46. hud/clients/mcp_use.py +278 -0
  47. hud/clients/tests/__init__.py +1 -0
  48. hud/clients/tests/test_client_integration.py +111 -0
  49. hud/clients/tests/test_fastmcp.py +342 -0
  50. hud/clients/tests/test_protocol.py +188 -0
  51. hud/clients/utils/__init__.py +1 -0
  52. hud/clients/utils/retry_transport.py +160 -0
  53. hud/datasets.py +322 -192
  54. hud/misc/__init__.py +1 -0
  55. hud/{agent → misc}/claude_plays_pokemon.py +292 -283
  56. hud/otel/__init__.py +35 -0
  57. hud/otel/collector.py +142 -0
  58. hud/otel/config.py +164 -0
  59. hud/otel/context.py +536 -0
  60. hud/otel/exporters.py +366 -0
  61. hud/otel/instrumentation.py +97 -0
  62. hud/otel/processors.py +118 -0
  63. hud/otel/tests/__init__.py +1 -0
  64. hud/otel/tests/test_processors.py +197 -0
  65. hud/server/__init__.py +5 -5
  66. hud/server/context.py +114 -0
  67. hud/server/helper/__init__.py +5 -0
  68. hud/server/low_level.py +132 -0
  69. hud/server/server.py +166 -0
  70. hud/server/tests/__init__.py +3 -0
  71. hud/settings.py +73 -79
  72. hud/shared/__init__.py +5 -0
  73. hud/{exceptions.py → shared/exceptions.py} +180 -180
  74. hud/{server → shared}/requests.py +264 -264
  75. hud/shared/tests/test_exceptions.py +157 -0
  76. hud/{server → shared}/tests/test_requests.py +275 -275
  77. hud/telemetry/__init__.py +25 -30
  78. hud/telemetry/instrument.py +379 -0
  79. hud/telemetry/job.py +309 -141
  80. hud/telemetry/replay.py +74 -0
  81. hud/telemetry/trace.py +83 -0
  82. hud/tools/__init__.py +33 -34
  83. hud/tools/base.py +365 -65
  84. hud/tools/bash.py +161 -137
  85. hud/tools/computer/__init__.py +15 -13
  86. hud/tools/computer/anthropic.py +437 -420
  87. hud/tools/computer/hud.py +376 -334
  88. hud/tools/computer/openai.py +295 -292
  89. hud/tools/computer/settings.py +82 -0
  90. hud/tools/edit.py +314 -290
  91. hud/tools/executors/__init__.py +30 -30
  92. hud/tools/executors/base.py +539 -532
  93. hud/tools/executors/pyautogui.py +621 -619
  94. hud/tools/executors/tests/__init__.py +1 -1
  95. hud/tools/executors/tests/test_base_executor.py +338 -338
  96. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  97. hud/tools/executors/xdo.py +511 -503
  98. hud/tools/{playwright_tool.py → playwright.py} +412 -379
  99. hud/tools/tests/__init__.py +3 -3
  100. hud/tools/tests/test_base.py +282 -0
  101. hud/tools/tests/test_bash.py +158 -152
  102. hud/tools/tests/test_bash_extended.py +197 -0
  103. hud/tools/tests/test_computer.py +425 -52
  104. hud/tools/tests/test_computer_actions.py +34 -34
  105. hud/tools/tests/test_edit.py +259 -240
  106. hud/tools/tests/test_init.py +27 -27
  107. hud/tools/tests/test_playwright_tool.py +183 -183
  108. hud/tools/tests/test_tools.py +145 -157
  109. hud/tools/tests/test_utils.py +156 -156
  110. hud/tools/types.py +72 -0
  111. hud/tools/utils.py +50 -50
  112. hud/types.py +136 -89
  113. hud/utils/__init__.py +10 -16
  114. hud/utils/async_utils.py +65 -0
  115. hud/utils/design.py +168 -0
  116. hud/utils/mcp.py +55 -0
  117. hud/utils/progress.py +149 -149
  118. hud/utils/telemetry.py +66 -66
  119. hud/utils/tests/test_async_utils.py +173 -0
  120. hud/utils/tests/test_init.py +17 -21
  121. hud/utils/tests/test_progress.py +261 -225
  122. hud/utils/tests/test_telemetry.py +82 -37
  123. hud/utils/tests/test_version.py +8 -8
  124. hud/version.py +7 -7
  125. hud_python-0.4.1.dist-info/METADATA +476 -0
  126. hud_python-0.4.1.dist-info/RECORD +132 -0
  127. hud_python-0.4.1.dist-info/entry_points.txt +3 -0
  128. {hud_python-0.3.5.dist-info → hud_python-0.4.1.dist-info}/licenses/LICENSE +21 -21
  129. hud/adapters/__init__.py +0 -8
  130. hud/adapters/claude/__init__.py +0 -5
  131. hud/adapters/claude/adapter.py +0 -180
  132. hud/adapters/claude/tests/__init__.py +0 -1
  133. hud/adapters/claude/tests/test_adapter.py +0 -519
  134. hud/adapters/common/__init__.py +0 -6
  135. hud/adapters/common/adapter.py +0 -178
  136. hud/adapters/common/tests/test_adapter.py +0 -289
  137. hud/adapters/common/types.py +0 -446
  138. hud/adapters/operator/__init__.py +0 -5
  139. hud/adapters/operator/adapter.py +0 -108
  140. hud/adapters/operator/tests/__init__.py +0 -1
  141. hud/adapters/operator/tests/test_adapter.py +0 -370
  142. hud/agent/__init__.py +0 -19
  143. hud/agent/base.py +0 -126
  144. hud/agent/claude.py +0 -271
  145. hud/agent/langchain.py +0 -215
  146. hud/agent/misc/__init__.py +0 -3
  147. hud/agent/operator.py +0 -268
  148. hud/agent/tests/__init__.py +0 -1
  149. hud/agent/tests/test_base.py +0 -202
  150. hud/env/__init__.py +0 -11
  151. hud/env/client.py +0 -35
  152. hud/env/docker_client.py +0 -349
  153. hud/env/environment.py +0 -446
  154. hud/env/local_docker_client.py +0 -358
  155. hud/env/remote_client.py +0 -212
  156. hud/env/remote_docker_client.py +0 -292
  157. hud/gym.py +0 -130
  158. hud/job.py +0 -773
  159. hud/mcp/__init__.py +0 -17
  160. hud/mcp/base.py +0 -631
  161. hud/mcp/client.py +0 -312
  162. hud/mcp/tests/test_base.py +0 -512
  163. hud/mcp/tests/test_claude.py +0 -294
  164. hud/task.py +0 -149
  165. hud/taskset.py +0 -237
  166. hud/telemetry/_trace.py +0 -347
  167. hud/telemetry/context.py +0 -230
  168. hud/telemetry/exporter.py +0 -575
  169. hud/telemetry/instrumentation/__init__.py +0 -3
  170. hud/telemetry/instrumentation/mcp.py +0 -259
  171. hud/telemetry/instrumentation/registry.py +0 -59
  172. hud/telemetry/mcp_models.py +0 -270
  173. hud/telemetry/tests/__init__.py +0 -1
  174. hud/telemetry/tests/test_context.py +0 -210
  175. hud/telemetry/tests/test_trace.py +0 -312
  176. hud/tools/helper/README.md +0 -56
  177. hud/tools/helper/__init__.py +0 -9
  178. hud/tools/helper/mcp_server.py +0 -78
  179. hud/tools/helper/server_initialization.py +0 -115
  180. hud/tools/helper/utils.py +0 -58
  181. hud/trajectory.py +0 -94
  182. hud/utils/agent.py +0 -37
  183. hud/utils/common.py +0 -256
  184. hud/utils/config.py +0 -120
  185. hud/utils/deprecation.py +0 -115
  186. hud/utils/misc.py +0 -53
  187. hud/utils/tests/test_common.py +0 -277
  188. hud/utils/tests/test_config.py +0 -129
  189. hud_python-0.3.5.dist-info/METADATA +0 -284
  190. hud_python-0.3.5.dist-info/RECORD +0 -120
  191. hud/{adapters/common → shared}/tests/__init__.py +0 -0
  192. {hud_python-0.3.5.dist-info → hud_python-0.4.1.dist-info}/WHEEL +0 -0
hud/__init__.py CHANGED
@@ -1,89 +1,22 @@
1
- """
2
- HUD SDK for interacting with the HUD evaluation platform.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- import warnings
8
- from typing import Any
9
-
10
- from . import agent, datasets, env, gym, settings, task, taskset, types, utils
11
- from .adapters import ResponseAction as Response
12
- from .datasets import run_dataset, to_taskconfigs
13
- from .job import create_job, load_job, run_job
14
-
15
- # Import deprecated items with deferred warning
16
- from .task import Task as _Task
17
- from .taskset import load_taskset as _load_taskset
18
- from .telemetry import flush, job, trace, trace_open # New context-based job
19
- from .version import __version__
20
-
21
-
22
- def __getattr__(name: str) -> Any:
23
- """Emit deprecation warnings for deprecated imports."""
24
- if name == "Task":
25
- warnings.warn(
26
- "Importing Task from hud is deprecated. "
27
- "Use hud.datasets.TaskConfig instead. "
28
- "Task will be removed in v0.4.0.",
29
- DeprecationWarning,
30
- stacklevel=2,
31
- )
32
- return _Task
33
- elif name == "load_taskset":
34
- warnings.warn(
35
- "Importing load_taskset from hud is deprecated. "
36
- "Use hud-evals HuggingFace datasets instead. "
37
- "load_taskset will be removed in v0.4.0.",
38
- DeprecationWarning,
39
- stacklevel=2,
40
- )
41
- return _load_taskset
42
- raise AttributeError(f"module 'hud' has no attribute '{name}'")
43
-
44
-
45
- def init_telemetry() -> None:
46
- from .telemetry import init_telemetry as _init_telemetry
47
-
48
- _init_telemetry()
49
-
50
-
51
- if settings.settings.fancy_logging:
52
- import logging
53
- import sys
54
-
55
- hud_logger = logging.getLogger("hud")
56
- hud_logger.setLevel(logging.INFO)
57
-
58
- if not hud_logger.handlers:
59
- # Use the configured stream (defaults to stderr)
60
- stream = sys.stderr if settings.settings.log_stream.lower() == "stderr" else sys.stdout
61
- handler = logging.StreamHandler(stream)
62
- formatter = logging.Formatter("[%(levelname)s] %(asctime)s | %(name)s | %(message)s")
63
- handler.setFormatter(formatter)
64
- hud_logger.addHandler(handler)
65
- hud_logger.propagate = False
66
-
67
- __all__ = [
68
- "Response",
69
- "__version__",
70
- "agent",
71
- "create_job",
72
- "datasets",
73
- "env",
74
- "flush",
75
- "gym",
76
- "init_telemetry",
77
- "job",
78
- "load_job",
79
- "run_dataset",
80
- "run_job",
81
- "settings",
82
- "task",
83
- "taskset",
84
- "to_taskconfigs",
85
- "trace",
86
- "trace_open",
87
- "types",
88
- "utils",
89
- ]
1
+ """hud-python.
2
+
3
+ tools for building, evaluating, and training AI agents.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from .telemetry import clear_trace, create_job, get_trace, instrument, job, trace
9
+
10
+ __all__ = [
11
+ "clear_trace",
12
+ "create_job",
13
+ "get_trace",
14
+ "instrument",
15
+ "job",
16
+ "trace",
17
+ ]
18
+
19
+ try:
20
+ from .version import __version__
21
+ except ImportError:
22
+ __version__ = "unknown"
hud/agents/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ from .art import ArtHUDAgent
4
+ from .base import MCPAgent
5
+ from .claude import ClaudeAgent
6
+ from .openai import OperatorAgent
7
+ from .openai_chat_generic import GenericOpenAIChatAgent
8
+
9
+ __all__ = [
10
+ "ArtHUDAgent",
11
+ "ClaudeAgent",
12
+ "GenericOpenAIChatAgent",
13
+ "MCPAgent",
14
+ "OperatorAgent",
15
+ ]
hud/agents/art.py ADDED
@@ -0,0 +1,101 @@
1
+ """Adapter that plugs a *Trainable* ART model into the HUD MCPAgent stack.
2
+
3
+ This extends GenericOpenAIChatAgent to collect messages_and_choices during
4
+ execution for ART training.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ import hud
13
+
14
+ from .openai_chat_generic import GenericOpenAIChatAgent
15
+
16
+ if TYPE_CHECKING:
17
+ import mcp.types as types
18
+
19
+ from hud.clients import AgentMCPClient
20
+ from hud.types import AgentResponse
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ system_prompt = (
26
+ "You are an MCP (Model Context Protocol) agent.\n\n"
27
+ "Use MCP tools through the server to complete your task.\n\n"
28
+ "You have a total of {MAX_STEPS} steps."
29
+ )
30
+
31
+
32
+ class ArtHUDAgent(GenericOpenAIChatAgent):
33
+ """Use an ART *TrainableModel* as the LLM behind a HUD `MCPAgent`.
34
+
35
+ This agent collects messages_and_choices during execution for ART training.
36
+ """
37
+
38
+ def __init__(self, art_model: Any, mcp_client: AgentMCPClient, **agent_kwargs: Any) -> None:
39
+ # Use ART's openai_client() method to get proper timeouts and patching
40
+ openai_client = art_model.openai_client()
41
+
42
+ super().__init__(
43
+ mcp_client=mcp_client,
44
+ openai_client=openai_client,
45
+ model_name=art_model.get_inference_name(),
46
+ logprobs=True,
47
+ **agent_kwargs,
48
+ )
49
+ self.system_prompt = system_prompt
50
+
51
+ self.art_model = art_model
52
+ self.messages_and_choices: list[Any] = [] # Collect for ART training
53
+
54
+ logger.info(
55
+ "ArtHUDAgent initialised with model '%s' (project=%s)",
56
+ art_model.name,
57
+ getattr(art_model, "project", "unknown"),
58
+ )
59
+
60
+ async def get_system_messages(self) -> list[Any]:
61
+ """Get system messages for ART."""
62
+ messages = await super().get_system_messages()
63
+ # Store initial messages as dicts for ART
64
+ self.messages_and_choices.extend(messages)
65
+ return messages
66
+
67
+ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
68
+ """Format blocks for ART."""
69
+ messages = await super().format_blocks(blocks)
70
+ # Store initial messages as dicts for ART
71
+ self.messages_and_choices.extend(messages)
72
+ return messages
73
+
74
+ @hud.instrument(
75
+ span_type="agent",
76
+ record_args=False, # Messages can be large
77
+ record_result=True,
78
+ )
79
+ async def get_response(self, messages: list[Any]) -> AgentResponse:
80
+ """Get model response and store the Choice for ART."""
81
+ # Call parent's get_model_response
82
+ result = await super().get_response(messages)
83
+
84
+ # Extract and store the Choice from the raw response
85
+ if result.raw and hasattr(result.raw, "choices") and result.raw.choices:
86
+ choice = result.raw.choices[0]
87
+ # Ensure the message has content (required for ART tokenization)
88
+ if choice.message and choice.message.content is None:
89
+ choice.message.content = ""
90
+ self.messages_and_choices.append(choice)
91
+
92
+ return result
93
+
94
+ async def format_tool_results(
95
+ self, tool_calls: list[Any], tool_results: list[Any]
96
+ ) -> list[Any]:
97
+ """Format tool results and store them for ART."""
98
+ tool_messages = await super().format_tool_results(tool_calls, tool_results)
99
+ # Store tool messages for ART
100
+ self.messages_and_choices.extend(tool_messages)
101
+ return tool_messages