hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/misc/claude_plays_pokemon.py
DELETED
@@ -1,292 +0,0 @@
-# pyright: reportGeneralTypeIssues=false
-from __future__ import annotations
-
-import json
-import logging
-from typing import TYPE_CHECKING, Any, cast
-
-from anthropic import AsyncAnthropic
-
-from hud.adapters import Adapter
-from hud.adapters.common.types import CLA
-
-# Update import to current API; if this script is legacy, keep it optional
-try:
-    from hud.agents import MCPAgent as Agent  # type: ignore[assignment]
-except Exception:  # pragma: no cover - optional example script
-    from hud.agents import MCPAgent as Agent  # fallback
-from hud.settings import settings
-
-if TYPE_CHECKING:
-    from anthropic.types.beta import (
-        BetaImageBlockParam,
-        BetaMessageParam,
-        BetaTextBlockParam,
-    )
-
-    from hud.env.environment import Observation
-
-logger = logging.getLogger(__name__)
-
-# Constants
-DEFAULT_MODEL = "claude-3-7-sonnet-20250219"
-DEFAULT_MAX_TOKENS = 4096
-DEFAULT_MAX_ITERATIONS = 10
-DEFAULT_TEMPERATURE = 0.7
-DEFAULT_MAX_MESSAGE_MEMORY = 20
-
-
-def generate_system_prompt(game_name: str) -> str:
-    """Generate the system prompt for the AI agent.
-
-    Args:
-        game_name: Name of the game being played
-
-    Returns:
-        str: The system prompt for the AI agent
-    """
-    return """You are a specialized AI assistant designed to play Pokémon games via screenshot analysis and text instructions. Your task is to understand the current game state from visual input, determine appropriate actions, and respond with structured outputs that control the game.
-
-For each turn, you will receive:
-1. A screenshot of the current game state
-2. Contextual information about the game progress, recent events, and objectives
-
-Based on this information, you must analyze the situation, determine the best course of action, and provide a structured JSON response.
-
-## Response Format
-Your response MUST follow this exact JSON format with no additional markers, tags, or block delimiters:
-
-{
-  "analysis": "Brief analysis of the current game situation, visible UI elements, and important context (1-3 sentences)",
-  "current_objective": "The immediate goal based on the game state (single sentence)",
-  "reasoning": "Step-by-step logic explaining your chosen action sequence (2-4 sentences)",
-  "progress_assessment": "Evaluation of whether previous action(s) achieved their intended goal and why/why not (1-2 sentences)",
-  "actions": [
-    {
-      "type": "press",
-      "keys": ["up"|"down"|"left"|"right"|"a"|"b"|"start"|"select"|"pause"]
-    },
-    {
-      "type": "wait",
-      "time": milliseconds_to_wait
-    }
-  ]
-}
-
-IMPORTANT: Do not include any conversation markers like <<ASSISTANT_CONVERSATION_START>> or <<ASSISTANT_CONVERSATION_END>> around your response. Provide only the clean JSON object.
-
-## Action Types
-- Button presses: {"type": "press", "keys": ["button_name"]} - Valid buttons are: up, down, left, right, a, b, start, select, pause
-- Wait for processing: {"type": "wait", "time": milliseconds}
-
-## Important Rules
-1. Never use "wait" commands while the game is paused. The game state will not change while paused, so waiting is ineffective.
-2. If you detect the game is paused, your next action should be to unpause by using {"type": "press", "keys": ["pause"]} before attempting other actions.
-3. Maintain awareness of whether the game is in a paused state based on visual cues in the screenshot.
-
-## Game Play Guidelines
-1. **Navigation**: Use directional buttons to move the character or navigate menus
-2. **Interaction**: Use 'a' to confirm selections and interact with objects/NPCs, 'b' to cancel or exit menus
-3. **Menu Access**: Use 'start' to access the game menu
-4. **Battle Strategy**: Analyze Pokémon types, moves, and stats to make optimal battle decisions
-5. **Progressive Play**: Work toward completing the current objective while being mindful of longer-term goals like leveling Pokémon, collecting badges, and advancing the story
-6. **Resource Management**: Monitor and manage HP, PP, items, and Pokéballs effectively
-7. **Memory**: Maintain awareness of the game history and your previous actions to avoid repetitive behaviors
-
-Always provide thoughtful analysis and clear reasoning for your decisions. If you're uncertain about the best course of action, prioritize safe moves that gather more information.
-"""  # noqa: E501
-
-
-def extract_action_from_response_block(block: dict[str, Any]) -> list[dict[str, Any]]:
-    """Extract actions from a response block.
-
-    Args:
-        block: The response block containing actions
-
-    Returns:
-        list[dict[str, Any]]: List of actions extracted from the block
-    """
-    if "actions" in block:
-        actions = block["actions"]
-        if isinstance(actions, list):
-            return actions
-    return []
-
-
-def extract_json_from_response(response: str) -> str:
-    """Extract JSON from a response string.
-
-    Args:
-        response: The response string containing JSON
-
-    Returns:
-        str: The extracted JSON string
-    """
-    # Try to find JSON block with markdown code block markers
-    start = response.find("```json")
-    end = response.rfind("```")
-    if start != -1 and end != -1:
-        start += len("```json")
-        return response[start:end].strip()
-
-    # Try to find JSON object directly
-    start = response.find("{")
-    end = response.rfind("}")
-    if start != -1 and end != -1:
-        return response[start : end + 1].strip()
-
-    return response.strip()
-
-
-class ClaudePlaysPokemon(Agent[AsyncAnthropic, CLA]):
-    """AI agent that plays Pokémon games using Claude."""
-
-    def __init__(
-        self,
-        client: AsyncAnthropic | None = None,
-        adapter: Adapter | None = None,
-        model: str = DEFAULT_MODEL,
-        max_tokens: int = DEFAULT_MAX_TOKENS,
-        max_iterations: int = DEFAULT_MAX_ITERATIONS,
-        temperature: float = DEFAULT_TEMPERATURE,
-        max_message_memory: int = DEFAULT_MAX_MESSAGE_MEMORY,
-    ) -> None:
-        """Initialize the Claude Plays Pokémon agent.
-
-        Args:
-            client: Anthropic API client
-            adapter: Game adapter
-            model: Claude model to use
-            max_tokens: Maximum tokens for response
-            max_iterations: Maximum number of iterations
-            temperature: Response temperature
-            max_message_memory: Maximum number of messages to remember
-
-        Raises:
-            ValueError: If API key is not provided
-        """
-        if client is None:
-            api_key = settings.anthropic_api_key
-            if not api_key:
-                raise ValueError("Anthropic API key is required")
-            client = AsyncAnthropic(api_key=api_key)
-
-        if adapter is None:
-            adapter = Adapter()
-
-        super().__init__(
-            client=client,
-            adapter=adapter,
-        )
-
-        self.model = model
-        self.max_tokens = max_tokens
-        self.max_iterations = max_iterations
-        self.temperature = temperature
-        self.max_message_memory = max_message_memory
-
-        self.system_prompts: list[BetaMessageParam] = [
-            {
-                "role": "assistant",
-                "content": generate_system_prompt("Pokemon Red"),
-            }
-        ]
-
-        self.messages: list[BetaMessageParam] = []
-
-    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
-        """Fetch a response from Claude based on the current observation.
-
-        Args:
-            observation: The current game observation
-
-        Returns:
-            tuple[list[dict[str, Any]], bool, list[LogType] | None]: List of actions, whether the game is done, and a list of strings or dictionaries of logs.
-
-        Raises:
-            ValueError: If client is not initialized
-        """  # noqa: E501
-        if not self.client:
-            raise ValueError("Client is not initialized")
-
-        user_content: list[BetaTextBlockParam | BetaImageBlockParam] = []
-
-        if observation.text:
-            user_content.append(
-                {
-                    "type": "text",
-                    "text": observation.text,
-                }
-            )
-
-        if observation.screenshot:
-            logger.debug("Processing screenshot data")
-            user_content.append(
-                {
-                    "type": "image",
-                    "source": {
-                        "type": "base64",
-                        "media_type": "image/png",
-                        "data": observation.screenshot,
-                    },
-                }
-            )
-
-        self.messages.append(
-            {
-                "role": "user",
-                "content": user_content,
-            }
-        )
-
-        logger.debug(
-            "Sending messages to Claude", extra={"messages": self.system_prompts + self.messages}
-        )
-
-        response = await self.client.beta.messages.create(
-            model=self.model,
-            messages=self.system_prompts + self.messages,
-            temperature=self.temperature,
-            max_tokens=self.max_tokens,
-        )
-
-        response_content = response.content
-        self.messages.append(
-            cast(
-                "BetaMessageParam",
-                {
-                    "role": "user",
-                    "content": response_content,
-                },
-            )
-        )
-
-        # Maintain message memory limit
-        while len(self.messages) > self.max_message_memory:
-            self.messages.pop(0)
-
-        action_list: list[dict[str, Any]] = []
-
-        # Parse response content to extract actions
-        for block in response_content:
-            if block.type == "text":
-                text_json = extract_json_from_response(block.text)
-                try:
-                    text = json.loads(text_json)
-                    if not isinstance(text, dict):
-                        logger.error("Invalid response format", extra={"text": text})
-                        raise ValueError("Response is not a dictionary")
-
-                    action_list.extend(extract_action_from_response_block(text))
-
-                except json.JSONDecodeError as e:
-                    logger.error(
-                        "Failed to parse response", extra={"error": str(e), "text": text_json}
-                    )
-
-            else:
-                logger.error("Unexpected block type", extra={"type": type(block)})
-
-        logger.debug("Extracted actions", extra={"actions": action_list})
-
-        return action_list, False
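For context on what this removal drops, a minimal sketch of how the two parsing helpers above behaved under 0.4.45 (the import path only exists in the old version, and the reply string is a made-up example):

import json
from hud.misc.claude_plays_pokemon import (
    extract_action_from_response_block,
    extract_json_from_response,
)

# Hypothetical Claude reply wrapping the action JSON in a fenced block.
reply = '```json\n{"analysis": "Title screen", "actions": [{"type": "press", "keys": ["start"]}]}\n```'
block = json.loads(extract_json_from_response(reply))   # strips the ```json fence
print(extract_action_from_response_block(block))        # [{'type': 'press', 'keys': ['start']}]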
hud/otel/__init__.py
DELETED
@@ -1,35 +0,0 @@
-"""HUD OpenTelemetry integration.
-
-This package provides the internal OpenTelemetry implementation for HUD telemetry.
-Users should interact with the telemetry APIs through hud.telemetry instead.
-
-Internal Components:
-- config: OpenTelemetry configuration and setup
-- context: Trace context management and utilities
-- processors: Span enrichment with HUD context
-- exporters: Sending spans to HUD backend
-- collector: In-memory span collection for replay
-- instrumentation: Auto-instrumentation for agents and MCP
-"""
-
-from __future__ import annotations
-
-from .collector import enable_trace_collection
-from .config import configure_telemetry, is_telemetry_configured, shutdown_telemetry
-from .context import (
-    get_current_task_run_id,
-    is_root_trace,
-    span_context,
-    trace,
-)
-
-__all__ = [
-    "configure_telemetry",
-    "enable_trace_collection",
-    "get_current_task_run_id",
-    "is_root_trace",
-    "is_telemetry_configured",
-    "shutdown_telemetry",
-    "span_context",
-    "trace",
-]
hud/otel/collector.py
DELETED
@@ -1,142 +0,0 @@
-"""Global span collector for building in-memory traces.
-
-This module provides a way to collect spans during execution
-and retrieve them as a Trace object, enabling replay functionality
-without modifying agent code.
-"""
-
-from __future__ import annotations
-
-import logging
-import threading
-from contextvars import ContextVar
-from typing import TYPE_CHECKING
-
-from opentelemetry import trace
-from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
-
-from hud.types import Trace
-
-if TYPE_CHECKING:
-    from opentelemetry.sdk.trace import ReadableSpan
-
-logger = logging.getLogger(__name__)
-
-# Global storage for collected spans by task_run_id
-_TRACE_STORAGE: dict[str, TraceCollector] = {}
-_LOCK = threading.Lock()
-
-# Context variable to track if collection is enabled
-_collecting_enabled: ContextVar[bool] = ContextVar("collecting_enabled", default=False)
-
-
-class TraceCollector:
-    """Collects spans for a single task run."""
-
-    def __init__(self, task_run_id: str) -> None:
-        self.task_run_id = task_run_id
-        self.spans: list[ReadableSpan] = []
-        self._lock = threading.Lock()
-
-    def add_span(self, span: ReadableSpan) -> None:
-        """Thread-safe span addition."""
-        with self._lock:
-            self.spans.append(span)
-
-    def to_trace(self) -> Trace:
-        """Convert collected spans to a Trace object."""
-        from .exporters import HudSpan, _span_to_dict
-
-        trace = Trace()
-
-        # Convert spans to TraceSteps
-        for span in self.spans:
-            try:
-                # Use the same conversion logic as the exporter
-                span_dict = _span_to_dict(span)
-                hud_span = HudSpan.model_validate(span_dict)
-
-                # The attributes field is already a TraceStep
-                step = hud_span.attributes
-                # Add timing from the span itself
-                step.start_timestamp = hud_span.start_time
-                step.end_timestamp = hud_span.end_time
-                trace.append(step)
-
-            except Exception as e:
-                # Log but don't fail the whole trace
-                logger.debug("Failed to convert span: %s", e)
-
-        return trace
-
-
-class CollectingSpanExporter(SpanExporter):
-    """A span exporter that collects spans in memory for replay."""
-
-    def export(self, spans: list[ReadableSpan]) -> SpanExportResult:
-        """Collect spans if collection is enabled."""
-        if not _collecting_enabled.get():
-            return SpanExportResult.SUCCESS
-
-        for span in spans:
-            # Extract task_run_id from span
-            task_run_id = span.attributes.get("hud.task_run_id") if span.attributes else None
-            if not task_run_id or not isinstance(task_run_id, str):
-                continue
-
-            # Get or create collector
-            with _LOCK:
-                if task_run_id not in _TRACE_STORAGE:
-                    _TRACE_STORAGE[task_run_id] = TraceCollector(task_run_id)
-                collector = _TRACE_STORAGE[task_run_id]
-
-            # Add span
-            collector.add_span(span)
-
-        return SpanExportResult.SUCCESS
-
-    def shutdown(self) -> None:
-        """Clean up resources."""
-        with _LOCK:
-            _TRACE_STORAGE.clear()
-
-
-def enable_trace_collection(enabled: bool = True) -> None:
-    """Enable or disable in-memory trace collection."""
-    _collecting_enabled.set(enabled)
-
-
-def get_trace(task_run_id: str) -> Trace | None:
-    """Retrieve collected trace for a task run ID.
-
-    Returns None if no trace was collected or collection was disabled.
-    """
-    with _LOCK:
-        collector = _TRACE_STORAGE.get(task_run_id)
-        if collector:
-            return collector.to_trace()
-        return None
-
-
-def clear_trace(task_run_id: str) -> None:
-    """Clear collected trace for a task run ID."""
-    with _LOCK:
-        _TRACE_STORAGE.pop(task_run_id, None)
-
-
-def install_collector() -> None:
-    """Install the collecting span exporter.
-
-    This should be called after configure_telemetry().
-    """
-    provider = trace.get_tracer_provider()
-    # Guard for SDK tracer providers only
-    if hasattr(provider, "add_span_processor"):
-        from opentelemetry.sdk.trace.export import SimpleSpanProcessor
-
-        exporter = CollectingSpanExporter()
-        processor = SimpleSpanProcessor(exporter)
-        try:
-            provider.add_span_processor(processor)  # type: ignore[attr-defined]
-        except Exception:
-            logger.warning("Failed to add span processor")
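For reference, a minimal sketch of the 0.4.45 collect-and-replay flow this module provided, using only the functions defined above plus the configure_telemetry entry point shown in the config module that follows (the task-run id is a placeholder):

from hud.otel import configure_telemetry
from hud.otel.collector import clear_trace, get_trace

# configure_telemetry() installs the CollectingSpanExporter and enables collection
# by default (enable_collection=True); assumes HUD_API_KEY is set or enable_otlp=True.
configure_telemetry(service_name="replay-demo")

# ... run traced work; spans carrying a hud.task_run_id attribute are gathered ...

trace = get_trace("example-task-run-id")  # Trace | None (placeholder id)
clear_trace("example-task-run-id")        # drop the stored spans when done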
hud/otel/config.py
DELETED
@@ -1,181 +0,0 @@
-"""Central configuration for OpenTelemetry inside HUD SDK.
-
-This file is responsible for
-1. creating the global ``TracerProvider``
-2. attaching span processors (HUD enrichment, batch + exporter)
-3. activating the community MCP instrumentation so that *every* MCP
-   request/response/notification is traced automatically.
-
-It is *idempotent*: calling :func:`configure_telemetry` more than once
-returns the same provider and does nothing.
-"""
-
-from __future__ import annotations
-
-import logging
-from typing import Any
-
-from opentelemetry import trace
-from opentelemetry.sdk.resources import Resource
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
-
-from hud.settings import settings
-
-from .collector import enable_trace_collection, install_collector
-from .exporters import HudSpanExporter
-from .instrumentation import install_mcp_instrumentation
-from .processors import HudEnrichmentProcessor
-
-logger = logging.getLogger(__name__)
-
-# Global singleton provider so multiple calls do not create duplicates
-_TRACER_PROVIDER: TracerProvider | None = None
-
-
-def is_telemetry_configured() -> bool:
-    """Check if telemetry has been configured."""
-    return _TRACER_PROVIDER is not None
-
-
-# ---------------------------------------------------------------------------
-# Public API
-# ---------------------------------------------------------------------------
-
-
-def configure_telemetry(
-    *,
-    service_name: str = "hud-sdk",
-    service_version: str | None = None,
-    environment: str | None = None,
-    extra_resource_attributes: dict[str, Any] | None = None,
-    enable_otlp: bool = False,
-    otlp_endpoint: str | None = None,
-    otlp_headers: dict[str, str] | None = None,
-    enable_collection: bool = True,
-) -> TracerProvider:
-    """Initialise OpenTelemetry for the current Python process.
-
-    It is safe to call this in every entry-point; the provider will only
-    be created once.
-    """
-    global _TRACER_PROVIDER
-
-    if _TRACER_PROVIDER is not None:
-        return _TRACER_PROVIDER
-
-    # ------------------------------------------------------------------
-    # 1. Resource (identity of this service)
-    # ------------------------------------------------------------------
-    res_attrs: dict[str, Any] = {
-        "service.name": service_name,
-        "telemetry.sdk.name": "hud-otel",
-        "telemetry.sdk.language": "python",
-    }
-    if service_version:
-        res_attrs["service.version"] = service_version
-    if environment:
-        res_attrs["deployment.environment"] = environment
-    if extra_resource_attributes:
-        res_attrs.update(extra_resource_attributes)
-
-    resource = Resource.create(res_attrs)
-
-    # ------------------------------------------------------------------
-    # 2. Provider
-    # ------------------------------------------------------------------
-    provider = TracerProvider(resource=resource)
-    _TRACER_PROVIDER = provider
-
-    # ------------------------------------------------------------------
-    # 3. Processors / exporters
-    # ------------------------------------------------------------------
-    provider.add_span_processor(HudEnrichmentProcessor())
-
-    # HUD exporter (only if enabled and API key is available)
-    if settings.telemetry_enabled and settings.api_key:
-        exporter = HudSpanExporter(
-            telemetry_url=settings.hud_telemetry_url, api_key=settings.api_key
-        )
-        # Export more continuously to avoid big end flushes
-        provider.add_span_processor(
-            BatchSpanProcessor(
-                exporter,
-                schedule_delay_millis=1000,
-                max_queue_size=8192,
-                max_export_batch_size=256,
-                export_timeout_millis=30000,
-            )
-        )
-    elif settings.telemetry_enabled and not settings.api_key and not enable_otlp:
-        # Error if no exporters are configured
-        raise ValueError(
-            "No telemetry backend configured. Either:\n"
-            "1. Set HUD_API_KEY environment variable for HUD telemetry (https://hud.so)\n"
-            "2. Use enable_otlp=True with configure_telemetry() for alternative backends (e.g., Jaeger)\n"  # noqa: E501
-        )
-    elif not settings.telemetry_enabled:
-        logger.info("HUD telemetry disabled via HUD_TELEMETRY_ENABLED=false")
-
-    # OTLP exporter (optional - for standard OTel viewers)
-    if enable_otlp:
-        try:
-            from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-
-            otlp_config = {}
-            if otlp_endpoint:
-                otlp_config["endpoint"] = otlp_endpoint
-                # Default to HTTP endpoint if not specified
-                if not otlp_endpoint.startswith(("http://", "https://")):
-                    otlp_config["endpoint"] = f"http://{otlp_endpoint}/v1/traces"
-            else:
-                # Default HTTP endpoint
-                otlp_config["endpoint"] = "http://localhost:4318/v1/traces"
-
-            if otlp_headers:
-                otlp_config["headers"] = otlp_headers
-
-            otlp_exporter = OTLPSpanExporter(**otlp_config)
-            provider.add_span_processor(
-                BatchSpanProcessor(
-                    otlp_exporter,
-                    schedule_delay_millis=1000,
-                    max_queue_size=8192,
-                    max_export_batch_size=256,
-                    export_timeout_millis=30000,
-                )
-            )
-            logger.info("OTLP HTTP exporter enabled - endpoint: %s", otlp_config["endpoint"])
-        except ImportError:
-            logger.warning(
-                "OTLP export requested but opentelemetry-exporter-otlp-proto-http not installed. "
-                "Install with: pip install 'hud-python[agent]'"
-            )
-
-    # ------------------------------------------------------------------
-    # 4. Activate provider and instrumentation
-    # ------------------------------------------------------------------
-    trace.set_tracer_provider(provider)
-    install_mcp_instrumentation(provider)
-
-    # Install in-memory collector if requested
-    if enable_collection:
-        install_collector()
-        enable_trace_collection(True)
-        logger.debug("In-memory trace collection enabled")
-
-    # Agent instrumentation now handled by @hud.instrument decorators
-    logger.debug("OpenTelemetry configuration completed")
-
-    logger.debug("OpenTelemetry configured (provider id=%s)", id(provider))
-    return provider
-
-
-def shutdown_telemetry() -> None:
-    """Flush and shutdown the global provider (if configured)."""
-    global _TRACER_PROVIDER
-    if _TRACER_PROVIDER is None:
-        return
-    _TRACER_PROVIDER.shutdown()  # type: ignore[arg-type]
-    _TRACER_PROVIDER = None
-    logger.debug("OpenTelemetry shutdown complete")
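Finally, a minimal sketch of the removed configure_telemetry/shutdown_telemetry pair, with keyword arguments taken from the deleted signature above (the OTLP endpoint is illustrative only):

from hud.otel import configure_telemetry, shutdown_telemetry

# Idempotent: repeated calls return the same global TracerProvider.
# A bare host:port endpoint is rewritten to http://<host:port>/v1/traces by the
# code above, so this mirrors spans to a local OTLP collector.
provider = configure_telemetry(
    service_name="my-agent",
    enable_otlp=True,
    otlp_endpoint="localhost:4318",
)

# ... run traced work ...

shutdown_telemetry()  # flush and drop the global provider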