hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/otel/exporters.py DELETED
@@ -1,369 +0,0 @@
1
- """Custom OpenTelemetry exporter that sends spans to the existing HUD telemetry
2
- HTTP endpoint (/trace/<id>/telemetry-upload).
3
-
4
- The exporter groups spans by ``hud.task_run_id`` baggage / attribute so we keep
5
- exactly the same semantics the old async worker in ``hud.telemetry.exporter``
6
- implemented.
7
-
8
- This exporter is *synchronous* (derives from :class:`SpanExporter`). We rely on
9
- ``hud.shared.make_request_sync`` which already contains retry & auth logic.
10
- """
11
-
12
- from __future__ import annotations
13
-
14
- import contextlib
15
- import json
16
- import logging
17
- import time
18
- from collections import defaultdict
19
- from datetime import UTC, datetime
20
- from typing import TYPE_CHECKING, Any
21
-
22
- from mcp.types import ClientRequest, ServerResult
23
- from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
24
- from pydantic import BaseModel, ConfigDict, Field
25
-
26
- from hud.shared import make_request_sync
27
- from hud.types import TraceStep as HudSpanAttributes
28
-
29
- if TYPE_CHECKING:
30
- from opentelemetry.sdk.trace import ReadableSpan
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
-
35
- # ---------------------------------------------------------------------------
36
- # Models
37
- # ---------------------------------------------------------------------------
38
-
39
-
40
- class HudSpan(BaseModel):
41
- """A telemetry span ready for export."""
42
-
43
- name: str
44
- trace_id: str = Field(pattern=r"^[0-9a-fA-F]{32}$")
45
- span_id: str = Field(pattern=r"^[0-9a-fA-F]{16}$")
46
- parent_span_id: str | None = Field(None, pattern=r"^[0-9a-fA-F]{16}$")
47
-
48
- start_time: str # ISO format
49
- end_time: str # ISO format
50
-
51
- status_code: str # "UNSET", "OK", "ERROR"
52
- status_message: str | None = None
53
-
54
- attributes: HudSpanAttributes
55
- exceptions: list[dict[str, Any]] | None = None
56
-
57
- model_config = ConfigDict(extra="forbid")
58
-
59
-
60
- def extract_span_attributes(
61
- attrs: dict[str, Any], method_name: str | None = None, span_name: str | None = None
62
- ) -> HudSpanAttributes:
63
- """Extract and parse span attributes into typed model.
64
-
65
- This handles:
66
- - Detecting span type (MCP vs Agent)
67
- - Renaming verbose OpenTelemetry semantic conventions
68
- - Parsing JSON strings to MCP types
69
- """
70
- # Start with core attributes - map to TraceStep field names
71
- result_attrs = {
72
- "task_run_id": attrs.get(
73
- "hud.task_run_id"
74
- ), # TraceStep expects task_run_id, not hud.task_run_id
75
- "job_id": attrs.get("hud.job_id"), # TraceStep expects job_id, not hud.job_id
76
- "type": attrs.get("span.kind", "CLIENT"), # TraceStep expects type, not span.kind
77
- }
78
-
79
- # Determine span type based on presence of agent or MCP attributes
80
- # Note: The input attrs might already have "category" set
81
- existing_category = attrs.get("category")
82
-
83
- if existing_category:
84
- # Use the explicit category if provided
85
- result_attrs["category"] = existing_category
86
- elif span_name and span_name.startswith("agent."):
87
- # Legacy support for spans named "agent.*"
88
- result_attrs["category"] = "agent"
89
- else:
90
- result_attrs["category"] = "mcp" # Default to MCP
91
-
92
- # No special processing needed for different categories
93
- # The backend will handle them based on the category field
94
-
95
- # Add method_name and request_id for MCP spans
96
- if result_attrs["category"] == "mcp":
97
- if method_name:
98
- result_attrs["method_name"] = method_name
99
- # Check for request_id with and without semconv_ai prefix
100
- request_id = attrs.get("semconv_ai.mcp.request_id") or attrs.get("mcp.request.id")
101
- if request_id:
102
- result_attrs["request_id"] = request_id
103
-
104
- # Parse input/output - check both with and without semconv_ai prefix
105
- input_str = attrs.get("semconv_ai.traceloop.entity.input") or attrs.get(
106
- "traceloop.entity.input"
107
- )
108
- output_str = attrs.get("semconv_ai.traceloop.entity.output") or attrs.get(
109
- "traceloop.entity.output"
110
- )
111
-
112
- logger.debug(
113
- "Category: %s, has input: %s, has output: %s",
114
- result_attrs.get("category"),
115
- bool(input_str),
116
- bool(output_str),
117
- )
118
-
119
- # Check for direct request/result attributes first
120
- if "request" in attrs and not result_attrs.get("request"):
121
- req = attrs["request"]
122
- if isinstance(req, str):
123
- with contextlib.suppress(json.JSONDecodeError):
124
- req = json.loads(req)
125
- result_attrs["request"] = req
126
-
127
- if "result" in attrs and not result_attrs.get("result"):
128
- res = attrs["result"]
129
- if isinstance(res, str):
130
- with contextlib.suppress(json.JSONDecodeError):
131
- res = json.loads(res)
132
- result_attrs["result"] = res
133
-
134
- # Process input/output from MCP instrumentation
135
- if input_str and not result_attrs.get("request"):
136
- try:
137
- input_data = json.loads(input_str) if isinstance(input_str, str) else input_str
138
-
139
- # For MCP category, try to parse as ClientRequest to extract the root
140
- if result_attrs["category"] == "mcp" and isinstance(input_data, dict):
141
- try:
142
- if "method" in input_data and "params" in input_data:
143
- client_request = ClientRequest.model_validate(input_data)
144
- result_attrs["request"] = client_request.root
145
- else:
146
- result_attrs["request"] = input_data
147
- except Exception:
148
- result_attrs["request"] = input_data
149
- else:
150
- # For all other categories, just store the data
151
- result_attrs["request"] = input_data
152
- except Exception as e:
153
- logger.debug("Failed to parse request JSON: %s", e)
154
-
155
- if output_str and not result_attrs.get("result"):
156
- try:
157
- output_data = json.loads(output_str) if isinstance(output_str, str) else output_str
158
-
159
- # For MCP category, try to parse as ServerResult to extract the root
160
- if result_attrs["category"] == "mcp" and isinstance(output_data, dict):
161
- # Check for error
162
- if "error" in output_data:
163
- result_attrs["mcp_error"] = True
164
- try:
165
- server_result = ServerResult.model_validate(output_data)
166
- result_attrs["result"] = server_result.root
167
- # Check for isError in the result
168
- if getattr(server_result.root, "isError", False):
169
- result_attrs["mcp_error"] = True
170
- except Exception:
171
- result_attrs["result"] = output_data
172
- else:
173
- # For all other categories, just store the data
174
- result_attrs["result"] = output_data
175
- except Exception as e:
176
- logger.debug("Failed to parse result JSON: %s", e)
177
-
178
- # Don't include the verbose attributes or ones we've already processed
179
- exclude_keys = {
180
- "hud.task_run_id",
181
- "hud.job_id",
182
- "span.kind",
183
- "semconv_ai.mcp.method_name",
184
- "mcp.method.name", # Also exclude non-prefixed version
185
- "semconv_ai.mcp.request_id",
186
- "mcp.request.id", # Also exclude non-prefixed version
187
- "semconv_ai.traceloop.entity.input",
188
- "semconv_ai.traceloop.entity.output",
189
- "traceloop.entity.input", # Also exclude non-prefixed versions
190
- "traceloop.entity.output",
191
- "mcp_request", # Exclude to prevent overwriting parsed values
192
- "mcp_result", # Exclude to prevent overwriting parsed values
193
- "request", # Exclude to prevent overwriting parsed values
194
- "result", # Exclude to prevent overwriting parsed values
195
- "category", # Already handled above
196
- }
197
-
198
- # Add any extra attributes
199
- for key, value in attrs.items():
200
- if key not in exclude_keys:
201
- result_attrs[key] = value # noqa: PERF403
202
-
203
- logger.debug(
204
- """Final result_attrs before creating HudSpanAttributes:
205
- request=%s,
206
- result=%s""",
207
- result_attrs.get("request"),
208
- result_attrs.get("result"),
209
- )
210
- return HudSpanAttributes(**result_attrs)
211
-
212
-
213
- # ---------------------------------------------------------------------------
214
- # Helpers
215
- # ---------------------------------------------------------------------------
216
-
217
-
218
- def _ts_ns_to_iso(ts_ns: int) -> str:
219
- """Convert a ``Span`` timestamp (nanoseconds) to ISO-8601 string."""
220
- # OpenTelemetry times are epoch nanoseconds
221
- dt = datetime.fromtimestamp(ts_ns / 1_000_000_000, tz=UTC)
222
- return dt.isoformat().replace("+00:00", "Z")
223
-
224
-
225
- def _span_to_dict(span: ReadableSpan) -> dict[str, Any]:
226
- """Convert an OpenTelemetry span to a dict using typed models."""
227
-
228
- attrs = dict(span.attributes or {})
229
-
230
- # Extract method name from span name if not in attributes
231
- # Check both with and without semconv_ai prefix
232
- raw_method = attrs.get("semconv_ai.mcp.method_name") or attrs.get("mcp.method.name")
233
- method_name: str | None = None
234
- if isinstance(raw_method, str):
235
- method_name = raw_method
236
- if method_name is None and isinstance(span.name, str) and span.name.endswith(".mcp"):
237
- method_name = span.name[:-4] # Remove .mcp suffix
238
-
239
- # Create typed attributes
240
- typed_attrs = extract_span_attributes(attrs, method_name, str(span.name))
241
-
242
- # Record span kind as extra attribute (TraceStep allows extras)
243
- try:
244
- typed_attrs.span_kind = span.kind.name # type: ignore[attr-defined]
245
- except Exception:
246
- logger.warning("Failed to set span kind attribute")
247
-
248
- # Build typed span
249
- # Guard context/parent/timestamps
250
- context = getattr(span, "context", None)
251
- trace_id_hex = (
252
- format(context.trace_id, "032x") if context and hasattr(context, "trace_id") else "0" * 32
253
- )
254
- span_id_hex = (
255
- format(context.span_id, "016x") if context and hasattr(context, "span_id") else "0" * 16
256
- )
257
- parent = getattr(span, "parent", None)
258
- parent_id_hex = (
259
- format(parent.span_id, "016x") if parent and hasattr(parent, "span_id") else None
260
- )
261
- start_ns = span.start_time or 0
262
- end_ns = span.end_time or start_ns
263
-
264
- typed_span = HudSpan(
265
- name=span.name,
266
- trace_id=trace_id_hex,
267
- span_id=span_id_hex,
268
- parent_span_id=parent_id_hex,
269
- start_time=_ts_ns_to_iso(int(start_ns)),
270
- end_time=_ts_ns_to_iso(int(end_ns)),
271
- status_code=span.status.status_code.name if span.status else "UNSET",
272
- status_message=span.status.description if span.status else None,
273
- attributes=typed_attrs,
274
- exceptions=None,
275
- )
276
-
277
- # Add error information if present
278
- if span.events:
279
- exceptions = []
280
- exceptions = [
281
- {
282
- "timestamp": _ts_ns_to_iso(event.timestamp),
283
- "attributes": dict(event.attributes or {}),
284
- }
285
- for event in span.events
286
- ]
287
- if exceptions:
288
- typed_span.exceptions = exceptions
289
-
290
- # Convert to dict for export
291
- return typed_span.model_dump(mode="json", by_alias=True, exclude_none=True)
292
-
293
-
294
- # ---------------------------------------------------------------------------
295
- # Exporter
296
- # ---------------------------------------------------------------------------
297
-
298
-
299
- class HudSpanExporter(SpanExporter):
300
- """Exporter that forwards spans to HUD backend using existing endpoint."""
301
-
302
- def __init__(self, *, telemetry_url: str, api_key: str) -> None:
303
- super().__init__()
304
- self._telemetry_url = telemetry_url.rstrip("/")
305
- self._api_key = api_key
306
-
307
- # ------------------------------------------------------------------
308
- # Core API
309
- # ------------------------------------------------------------------
310
- def export(self, spans: list[ReadableSpan]) -> SpanExportResult: # type: ignore[override]
311
- if not spans:
312
- return SpanExportResult.SUCCESS
313
-
314
- # Group spans by hud.task_run_id attribute
315
- grouped: dict[str, list[ReadableSpan]] = defaultdict(list)
316
- for span in spans:
317
- run_id = span.attributes.get("hud.task_run_id") if span.attributes else None
318
- if not run_id:
319
- # Skip spans that are outside HUD traces
320
- continue
321
- grouped[str(run_id)].append(span)
322
-
323
- # Send each group synchronously (retry inside make_request_sync)
324
- for run_id, span_batch in grouped.items():
325
- try:
326
- url = f"{self._telemetry_url}/trace/{run_id}/telemetry-upload"
327
- telemetry_spans = [_span_to_dict(s) for s in span_batch]
328
- # Include current step count in metadata
329
- metadata = {}
330
- # Get the HIGHEST step count from the batch (most recent)
331
- step_count = 0
332
- for span in span_batch:
333
- if span.attributes and "hud.step_count" in span.attributes:
334
- current_step = span.attributes["hud.step_count"]
335
- if isinstance(current_step, int) and current_step > step_count:
336
- step_count = current_step
337
-
338
- payload = {
339
- "metadata": metadata,
340
- "telemetry": telemetry_spans,
341
- }
342
-
343
- # Only include step_count if we found any steps
344
- if step_count > 0:
345
- payload["step_count"] = step_count
346
-
347
- logger.debug("HUD exporter sending %d spans to %s", len(span_batch), url)
348
- make_request_sync(
349
- method="POST",
350
- url=url,
351
- json=payload,
352
- api_key=self._api_key,
353
- )
354
- except Exception as exc:
355
- logger.exception("HUD exporter failed to send spans for task %s: %s", run_id, exc)
356
- # If *any* group fails we return FAILURE so the OTEL SDK can retry
357
- return SpanExportResult.FAILURE
358
-
359
- return SpanExportResult.SUCCESS
360
-
361
- def shutdown(self) -> None: # type: ignore[override]
362
- # Nothing to cleanup, httpx handled inside make_request_sync
363
- pass
364
-
365
- def force_flush(self, timeout_millis: int | None = None) -> bool: # type: ignore[override]
366
- if timeout_millis:
367
- time.sleep(timeout_millis / 1000)
368
- # Synchronous export, nothing buffered here
369
- return True
hud/otel/instrumentation.py DELETED
@@ -1,135 +0,0 @@
1
- """MCP instrumentation support for HUD.
2
-
3
- This module provides functions to enable MCP OpenTelemetry instrumentation
4
- for automatic tracing of MCP protocol communication.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import logging
10
- from typing import TYPE_CHECKING, Any
11
-
12
- if TYPE_CHECKING:
13
- from collections.abc import AsyncGenerator, Callable
14
-
15
- from opentelemetry.trace import TracerProvider
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- def install_mcp_instrumentation(provider: TracerProvider) -> None:
21
- """Enable community MCP OpenTelemetry instrumentation if present.
22
-
23
- Args:
24
- provider: The TracerProvider to use for instrumentation
25
- """
26
- import logging
27
-
28
- logger = logging.getLogger(__name__)
29
-
30
- try:
31
- # First, patch the _instruments to use our fork
32
- import opentelemetry.instrumentation.mcp.instrumentation as mcp_inst
33
-
34
- mcp_inst._instruments = ("hud-mcp-python-sdk >= 3.13.1",)
35
-
36
- from opentelemetry.instrumentation.mcp.instrumentation import (
37
- McpInstrumentor,
38
- )
39
-
40
- # Then, patch the instrumentation to handle 3-value transports correctly
41
- _patch_mcp_instrumentation()
42
-
43
- McpInstrumentor().instrument(tracer_provider=provider)
44
- logger.debug("MCP instrumentation installed with fastmcp compatibility patch")
45
- except ImportError:
46
- logger.debug("opentelemetry-instrumentation-mcp not available, skipping")
47
- except Exception as exc:
48
- logger.warning("Failed to install MCP instrumentation: %s", exc)
49
-
50
-
51
- def _patch_mcp_instrumentation() -> None:
52
- """Patch MCP instrumentation to handle 3-value transport yields correctly."""
53
- from contextlib import asynccontextmanager
54
-
55
- try:
56
- from opentelemetry.instrumentation.mcp.instrumentation import McpInstrumentor
57
-
58
- # First, patch the get_error_type function to handle invalid HTTP status codes
59
- _patch_get_error_type()
60
-
61
- def patched_transport_wrapper(self: Any, tracer: Any) -> Callable[..., Any]:
62
- @asynccontextmanager
63
- async def traced_method(
64
- wrapped: Callable[..., Any], instance: Any, args: Any, kwargs: Any
65
- ) -> AsyncGenerator[Any, None]:
66
- async with wrapped(*args, **kwargs) as result:
67
- # Check if we got a tuple with 3 values
68
- if isinstance(result, tuple) and len(result) == 3:
69
- read_stream, write_stream, third_value = result
70
- # Import here to avoid circular imports
71
- from opentelemetry.instrumentation.mcp.instrumentation import (
72
- InstrumentedStreamReader,
73
- InstrumentedStreamWriter,
74
- )
75
-
76
- yield (
77
- InstrumentedStreamReader(read_stream, tracer),
78
- InstrumentedStreamWriter(write_stream, tracer),
79
- third_value,
80
- )
81
- else:
82
- # Fall back to 2-value case
83
- read_stream, write_stream = result
84
- from opentelemetry.instrumentation.mcp.instrumentation import (
85
- InstrumentedStreamReader,
86
- InstrumentedStreamWriter,
87
- )
88
-
89
- yield (
90
- InstrumentedStreamReader(read_stream, tracer),
91
- InstrumentedStreamWriter(write_stream, tracer),
92
- )
93
-
94
- return traced_method
95
-
96
- # Apply the patch
97
- McpInstrumentor._transport_wrapper = patched_transport_wrapper
98
-
99
- except Exception as e:
100
- import logging
101
-
102
- logger = logging.getLogger(__name__)
103
- logger.warning("Failed to patch MCP instrumentation: %s", e)
104
-
105
-
106
- def _patch_get_error_type() -> None:
107
- """Patch get_error_type to handle invalid HTTP status codes gracefully."""
108
- import re
109
- from http import HTTPStatus
110
-
111
- try:
112
- import opentelemetry.instrumentation.mcp.instrumentation as mcp_inst
113
-
114
- def patched_get_error_type(error_message: str) -> str | None:
115
- """Extract HTTP status from error message, handling invalid codes."""
116
- if not isinstance(error_message, str):
117
- return None
118
- match = re.search(r"\b(4\d{2}|5\d{2})\b", error_message)
119
- if match:
120
- num = int(match.group())
121
- try:
122
- # Only return if it's a valid HTTPStatus
123
- if 400 <= num <= 599:
124
- return HTTPStatus(num).name
125
- except ValueError:
126
- # Not a valid HTTP status code
127
- logger.debug("Ignoring invalid HTTP status code: %s", num)
128
- return None
129
-
130
- # Apply the patch
131
- mcp_inst.get_error_type = patched_get_error_type
132
- logger.debug("Patched get_error_type to handle invalid HTTP status codes")
133
-
134
- except Exception as e:
135
- logger.warning("Failed to patch get_error_type: %s", e)
hud/otel/processors.py DELETED
@@ -1,121 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import logging
4
- import time
5
- from typing import Any
6
-
7
- from opentelemetry import baggage
8
- from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
9
-
10
- from .context import (
11
- get_agent_steps,
12
- get_base_mcp_steps,
13
- get_mcp_tool_steps,
14
- increment_agent_steps,
15
- increment_base_mcp_steps,
16
- increment_mcp_tool_steps,
17
- )
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- class HudEnrichmentProcessor(SpanProcessor):
23
- """Span processor that enriches every span with HUD-specific context.
24
-
25
- • Adds ``hud.task_run_id`` attribute if available.
26
- • Adds ``hud.job_id`` attribute if available in baggage.
27
- • Adds ``hud.step_count`` attribute if available in baggage.
28
- """
29
-
30
- def __init__(self) -> None:
31
- # No state, everything comes from context vars
32
- super().__init__()
33
-
34
- # --- callback hooks -------------------------------------------------
35
- def on_start(self, span: Span, parent_context: Any) -> None: # type: ignore[override]
36
- try:
37
- # Get task_run_id from baggage in parent context
38
- run_id = baggage.get_baggage("hud.task_run_id", context=parent_context)
39
- if run_id and span.is_recording():
40
- span.set_attribute("hud.task_run_id", str(run_id))
41
-
42
- # Get job_id from baggage if available
43
- job_id = baggage.get_baggage("hud.job_id", context=parent_context)
44
- if job_id and span.is_recording():
45
- span.set_attribute("hud.job_id", str(job_id))
46
-
47
- # Check what type of step this is and increment appropriate counters
48
- if span.is_recording():
49
- step_type = self._get_step_type(span)
50
-
51
- if step_type == "agent":
52
- # Increment agent steps
53
- new_agent_count = increment_agent_steps()
54
- span.set_attribute("hud.agent_steps", new_agent_count)
55
- logger.debug("Incremented agent steps to %d", new_agent_count)
56
-
57
- elif step_type == "base_mcp":
58
- # Increment base MCP steps
59
- new_base_count = increment_base_mcp_steps()
60
- span.set_attribute("hud.base_mcp_steps", new_base_count)
61
- logger.debug("Incremented base MCP steps to %d", new_base_count)
62
-
63
- elif step_type == "mcp_tool":
64
- # Increment both base MCP and MCP tool steps
65
- new_base_count = increment_base_mcp_steps()
66
- new_tool_count = increment_mcp_tool_steps()
67
- span.set_attribute("hud.base_mcp_steps", new_base_count)
68
- span.set_attribute("hud.mcp_tool_steps", new_tool_count)
69
- logger.debug(
70
- "Incremented MCP steps to base=%d, tool=%d", new_base_count, new_tool_count
71
- )
72
-
73
- # Always set all current step counts on the span
74
- span.set_attribute("hud.base_mcp_steps", get_base_mcp_steps())
75
- span.set_attribute("hud.mcp_tool_steps", get_mcp_tool_steps())
76
- span.set_attribute("hud.agent_steps", get_agent_steps())
77
-
78
- except Exception as exc: # defensive; never fail the tracer
79
- logger.debug("HudEnrichmentProcessor.on_start error: %s", exc, exc_info=False)
80
-
81
- def _get_step_type(self, span: Span) -> str | None:
82
- """Determine what type of step this span represents.
83
-
84
- Returns:
85
- 'base_mcp' for any MCP span
86
- 'mcp_tool' for MCP tool calls (tools/call.mcp)
87
- 'agent' for agent spans
88
- None if not a step
89
- """
90
- # Check span attributes
91
- attrs = span.attributes or {}
92
- span_name = span.name
93
-
94
- # Check for agent steps (instrumented with span_type="agent")
95
- if attrs.get("category") == "agent":
96
- return "agent"
97
-
98
- # Check span name pattern for MCP calls
99
- if span_name:
100
- # tools/call.mcp is an mcp_tool step
101
- if span_name == "tools/call.mcp":
102
- return "mcp_tool"
103
-
104
- # Any other .mcp suffixed span is a base MCP step
105
- elif span_name.endswith(".mcp"):
106
- return "base_mcp"
107
-
108
- return None
109
-
110
- def on_end(self, span: ReadableSpan) -> None:
111
- # Nothing to do enrichment is on_start only
112
- pass
113
-
114
- # Required to fully implement abstract base, but we don't batch spans
115
- def shutdown(self) -> None: # type: ignore[override]
116
- pass
117
-
118
- def force_flush(self, timeout_millis: int | None = None) -> bool: # type: ignore[override]
119
- if timeout_millis:
120
- time.sleep(timeout_millis / 1000)
121
- return True
hud/otel/tests/__init__.py DELETED
@@ -1 +0,0 @@
1
- """Tests for OpenTelemetry integration."""