hud-python 0.4.51__py3-none-any.whl → 0.4.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +13 -1
- hud/agents/base.py +14 -3
- hud/agents/lite_llm.py +1 -1
- hud/agents/openai_chat_generic.py +15 -3
- hud/agents/tests/test_base.py +9 -2
- hud/agents/tests/test_base_runtime.py +164 -0
- hud/cli/__init__.py +18 -25
- hud/cli/build.py +35 -27
- hud/cli/dev.py +11 -29
- hud/cli/eval.py +114 -145
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +26 -3
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +134 -0
- hud/cli/tests/test_eval.py +4 -0
- hud/cli/tests/test_mcp_server.py +8 -7
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/utils/docker.py +120 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +257 -0
- hud/clients/base.py +1 -1
- hud/clients/mcp_use.py +3 -1
- hud/datasets/parallel.py +2 -2
- hud/datasets/runner.py +85 -24
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_runner.py +106 -0
- hud/datasets/tests/test_utils.py +228 -0
- hud/otel/config.py +8 -6
- hud/otel/context.py +4 -4
- hud/otel/exporters.py +231 -57
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_instrumentation.py +207 -0
- hud/rl/learner.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/shared/exceptions.py +35 -9
- hud/shared/hints.py +25 -0
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +39 -30
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +30 -6
- hud/telemetry/async_context.py +331 -0
- hud/telemetry/job.py +51 -12
- hud/telemetry/tests/test_async_context.py +242 -0
- hud/telemetry/tests/test_instrument.py +414 -0
- hud/telemetry/tests/test_job.py +609 -0
- hud/telemetry/tests/test_trace.py +184 -6
- hud/telemetry/trace.py +16 -17
- hud/tools/computer/qwen.py +4 -1
- hud/tools/computer/settings.py +2 -2
- hud/tools/executors/base.py +4 -2
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/types.py +7 -1
- hud/utils/agent_factories.py +1 -3
- hud/utils/mcp.py +1 -1
- hud/utils/task_tracking.py +223 -0
- hud/utils/tests/test_agent_factories.py +60 -0
- hud/utils/tests/test_mcp.py +4 -6
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tasks.py +187 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/METADATA +48 -48
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/RECORD +88 -47
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/WHEEL +0 -0
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.51.dist-info → hud_python-0.4.53.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/__init__.py
CHANGED
|
@@ -1,14 +1,36 @@
|
|
|
1
|
-
"""HUD Telemetry -
|
|
1
|
+
"""HUD Telemetry - Tracing and job management for agent execution.
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
3
|
+
Provides telemetry APIs for tracking agent execution and experiments.
|
|
4
|
+
|
|
5
|
+
Standard Usage:
|
|
6
|
+
>>> import hud
|
|
7
|
+
>>> with hud.trace("My Task"):
|
|
8
|
+
... do_work()
|
|
9
|
+
|
|
10
|
+
>>> with hud.job("My Job") as job:
|
|
11
|
+
... with hud.trace("Task", job_id=job.id):
|
|
12
|
+
... do_work()
|
|
13
|
+
|
|
14
|
+
High-Concurrency Usage (200+ parallel tasks):
|
|
15
|
+
>>> import hud
|
|
16
|
+
>>> async with hud.async_job("Evaluation") as job:
|
|
17
|
+
... async with hud.async_trace("Task", job_id=job.id):
|
|
18
|
+
... await do_async_work()
|
|
19
|
+
|
|
20
|
+
APIs:
|
|
21
|
+
- trace(), job() - Standard context managers (for typical usage)
|
|
22
|
+
- async_trace(), async_job() - Async context managers (for high concurrency)
|
|
23
|
+
- instrument() - Decorator for instrumenting functions
|
|
24
|
+
- get_trace() - Retrieve collected traces for replay
|
|
25
|
+
|
|
26
|
+
Note:
|
|
27
|
+
Use async_trace/async_job only for high-concurrency scenarios (200+ tasks).
|
|
28
|
+
The run_dataset() function uses them automatically.
|
|
8
29
|
"""
|
|
9
30
|
|
|
10
31
|
from __future__ import annotations
|
|
11
32
|
|
|
33
|
+
from .async_context import async_job, async_trace
|
|
12
34
|
from .instrument import instrument
|
|
13
35
|
from .job import Job, create_job, job
|
|
14
36
|
from .replay import clear_trace, get_trace
|
|
@@ -17,6 +39,8 @@ from .trace import Trace, trace
|
|
|
17
39
|
__all__ = [
|
|
18
40
|
"Job",
|
|
19
41
|
"Trace",
|
|
42
|
+
"async_job",
|
|
43
|
+
"async_trace",
|
|
20
44
|
"clear_trace",
|
|
21
45
|
"create_job",
|
|
22
46
|
"get_trace",
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
"""Async context managers for HUD telemetry.
|
|
2
|
+
|
|
3
|
+
Provides async versions of trace and job context managers for high-concurrency
|
|
4
|
+
async code. These prevent event loop blocking by using async I/O operations.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
>>> import hud
|
|
8
|
+
>>> async with hud.async_job("My Job") as job:
|
|
9
|
+
... async with hud.async_trace("Task", job_id=job.id) as trace:
|
|
10
|
+
... await do_work()
|
|
11
|
+
|
|
12
|
+
When to use:
|
|
13
|
+
- High-concurrency scenarios (200+ parallel tasks)
|
|
14
|
+
- Custom async evaluation loops
|
|
15
|
+
- Async frameworks with HUD telemetry integration
|
|
16
|
+
|
|
17
|
+
When NOT to use:
|
|
18
|
+
- Typical scripts/notebooks → use `hud.trace()` and `hud.job()`
|
|
19
|
+
- Low concurrency (< 30 tasks) → standard context managers are fine
|
|
20
|
+
- Synchronous code → must use `hud.trace()` and `hud.job()`
|
|
21
|
+
|
|
22
|
+
Note:
|
|
23
|
+
The `run_dataset()` function automatically uses these async context managers
|
|
24
|
+
internally, so most users don't need to use them directly.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import logging
|
|
30
|
+
import uuid
|
|
31
|
+
from typing import TYPE_CHECKING, Any
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from types import TracebackType
|
|
35
|
+
|
|
36
|
+
from hud.otel import configure_telemetry
|
|
37
|
+
from hud.otel.context import (
|
|
38
|
+
_print_trace_complete_url,
|
|
39
|
+
_print_trace_url,
|
|
40
|
+
_update_task_status_async,
|
|
41
|
+
)
|
|
42
|
+
from hud.otel.context import (
|
|
43
|
+
trace as OtelTrace,
|
|
44
|
+
)
|
|
45
|
+
from hud.settings import settings
|
|
46
|
+
from hud.shared import make_request
|
|
47
|
+
from hud.telemetry.job import Job, _print_job_complete_url, _print_job_url
|
|
48
|
+
from hud.telemetry.trace import Trace
|
|
49
|
+
from hud.utils.task_tracking import track_task
|
|
50
|
+
|
|
51
|
+
logger = logging.getLogger(__name__)
|
|
52
|
+
|
|
53
|
+
# Module exports
|
|
54
|
+
__all__ = ["AsyncJob", "AsyncTrace", "async_job", "async_trace"]
|
|
55
|
+
|
|
56
|
+
# Global state for current job
|
|
57
|
+
_current_job: Job | None = None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class AsyncTrace:
    """Async context manager that tracks a single HUD trace (task run).

    Async counterpart of `hud.trace()`. On entry it configures telemetry,
    opens an OpenTelemetry trace context and, for root traces with telemetry
    enabled and an API key set, schedules a "running" status update through
    `track_task`. On exit it schedules a "completed" or "error" update and
    closes the OpenTelemetry context.

    The OpenTelemetry context is always closed once it has been entered,
    even if scheduling the status update or printing the URL raises.

    Use the `async_trace()` helper function instead of instantiating directly.
    """

    def __init__(
        self,
        name: str = "Test task from hud",
        *,
        root: bool = True,
        attrs: dict[str, Any] | None = None,
        job_id: str | None = None,
        task_id: str | None = None,
    ) -> None:
        """Store the trace configuration and allocate a fresh task_run_id.

        Args:
            name: Descriptive name for this trace/task.
            root: Whether this is a root trace (root traces send status updates).
            attrs: Additional attributes passed to the OpenTelemetry trace.
            job_id: Optional job ID to associate with this trace.
            task_id: Optional task ID for custom task identifiers.
        """
        self.name = name
        self.root = root
        self.attrs = attrs or {}
        self.job_id = job_id
        self.task_id = task_id
        self.task_run_id = str(uuid.uuid4())
        self.trace_obj = Trace(self.task_run_id, name, job_id, task_id)
        # Set while the OpenTelemetry context is open, None otherwise.
        self._otel_trace = None

    def _should_report_status(self) -> bool:
        """Return True when this trace must send status updates to HUD."""
        return bool(self.root and settings.telemetry_enabled and settings.api_key)

    async def __aenter__(self) -> Trace:
        """Enter the async trace context.

        Returns:
            The `Trace` object created for this task run.
        """
        # Ensure telemetry is configured
        configure_telemetry()

        # Start the OpenTelemetry span
        otel_trace = OtelTrace(
            self.task_run_id,
            is_root=self.root,
            span_name=self.name,
            attributes=self.attrs,
            job_id=self.job_id,
            task_id=self.task_id,
        )
        otel_trace.__enter__()
        self._otel_trace = otel_trace

        try:
            # Send async status update if this is a root trace
            if self._should_report_status():
                track_task(
                    _update_task_status_async(
                        self.task_run_id,
                        "running",
                        job_id=self.job_id,
                        trace_name=self.name,
                        task_id=self.task_id,
                    ),
                    name=f"trace-status-{self.task_run_id[:8]}",
                )

                # Print trace URL if not part of a job
                # NOTE(review): nesting under the root check is reconstructed
                # from a flattened diff - confirm against the upstream file.
                if not self.job_id:
                    _print_trace_url(self.task_run_id)
        except BaseException as exc:
            # The span is already open; close it so a failed enter does not
            # leave it dangling (__aexit__ is never called in that case).
            otel_trace.__exit__(type(exc), exc, exc.__traceback__)
            self._otel_trace = None
            raise

        logger.debug("Started trace: %s (%s)", self.name, self.task_run_id)
        return self.trace_obj

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Exit the async trace context.

        Schedules the final status update ("error" when an exception is
        propagating, otherwise "completed") and closes the OpenTelemetry
        context. Exceptions from the body are never suppressed.
        """
        try:
            # Send async status update if this is a root trace
            if self._should_report_status():
                status = "error" if exc_type is not None else "completed"

                track_task(
                    _update_task_status_async(
                        self.task_run_id,
                        status,
                        job_id=self.job_id,
                        # Identity check: an exception's truthiness can be
                        # overridden, its presence cannot.
                        error_message=str(exc_val) if exc_val is not None else None,
                        trace_name=self.name,
                        task_id=self.task_id,
                    ),
                    name=f"trace-status-{self.task_run_id[:8]}-{status}",
                )

                # Print completion message if not part of a job
                # NOTE(review): nesting under the root check is reconstructed
                # from a flattened diff - confirm against the upstream file.
                if not self.job_id:
                    _print_trace_complete_url(self.task_run_id, error_occurred=bool(exc_type))
        finally:
            # Close the OpenTelemetry span no matter what happened above
            otel_trace = self._otel_trace
            self._otel_trace = None
            if otel_trace:
                otel_trace.__exit__(exc_type, exc_val, exc_tb)

        logger.debug("Ended trace: %s (%s)", self.name, self.task_run_id)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class AsyncJob:
    """Async context manager that tracks a HUD job (a group of traces).

    Async counterpart of `hud.job()`. On entry it makes its `Job` the
    module-level current job and, when telemetry is enabled, schedules a
    "running" status POST through `track_task`. On exit it schedules a
    "completed" or "failed" POST and restores the previously current job.

    The previous job is restored even if scheduling the final status update
    or printing the completion message raises.

    Use the `async_job()` helper function instead of instantiating directly.
    """

    def __init__(
        self,
        name: str,
        metadata: dict[str, Any] | None = None,
        job_id: str | None = None,
        dataset_link: str | None = None,
    ) -> None:
        """Create the underlying `Job`, generating a job ID if none is given.

        Args:
            name: Human-readable job name.
            metadata: Optional metadata dictionary.
            job_id: Optional job ID (a UUID4 string is generated when omitted).
            dataset_link: Optional dataset link sent with status updates.
        """
        self.job_id = job_id or str(uuid.uuid4())
        self.job = Job(self.job_id, name, metadata, dataset_link)
        # Job that was current before __aenter__; defined here so __aexit__
        # never hits an AttributeError if enter did not run.
        self._old_job: Job | None = None

    def _post_status(self, status: str) -> None:
        """Schedule a POST of `status` for this job via `track_task`."""
        payload = {
            "name": self.job.name,
            "status": status,
            "metadata": self.job.metadata,
        }
        if self.job.dataset_link:
            payload["dataset_link"] = self.job.dataset_link

        track_task(
            make_request(
                method="POST",
                url=f"{settings.hud_telemetry_url}/jobs/{self.job.id}/status",
                json=payload,
                api_key=settings.api_key,
            ),
            name=f"job-status-{self.job.id[:8]}-{status}",
        )

    async def __aenter__(self) -> Job:
        """Enter the async job context.

        Returns:
            The `Job` object managed by this context.
        """
        global _current_job

        # Save previous job and set this as current
        self._old_job = _current_job
        _current_job = self.job

        # Send async status update
        if settings.telemetry_enabled:
            self._post_status("running")

        # NOTE(review): printed unconditionally, mirroring the sync `job()`
        # context manager; nesting is reconstructed from a flattened diff.
        _print_job_url(self.job.id, self.job.name)
        logger.debug("Started job: %s (%s)", self.job.name, self.job.id)
        return self.job

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Exit the async job context.

        Schedules the final status update ("failed" when an exception is
        propagating, otherwise "completed"), prints the completion message
        and restores the previous current job. Exceptions from the body are
        never suppressed.
        """
        global _current_job

        try:
            # Send async status update
            if settings.telemetry_enabled:
                self._post_status("failed" if exc_type is not None else "completed")

            _print_job_complete_url(self.job.id, self.job.name, error_occurred=bool(exc_type))
        finally:
            # Restore previous job even if reporting above failed
            _current_job = self._old_job

        logger.debug("Ended job: %s (%s)", self.job.name, self.job.id)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def async_trace(
    name: str = "Test task from hud",
    *,
    root: bool = True,
    attrs: dict[str, Any] | None = None,
    job_id: str | None = None,
    task_id: str | None = None,
) -> AsyncTrace:
    """Build an `AsyncTrace` context manager for one task run.

    Async counterpart of `hud.trace()`. The arguments are forwarded
    unchanged to `AsyncTrace`; nothing is sent until the returned object is
    entered with `async with`.

    Args:
        name: Descriptive name for this trace/task.
        root: Whether this is a root trace (root traces update task status).
        attrs: Additional attributes to attach to the trace.
        job_id: Optional job ID to associate with this trace.
        task_id: Optional task ID for custom task identifiers.

    Returns:
        An `AsyncTrace` context manager.

    Example:
        >>> import hud
        >>> async with hud.async_trace("Process Data") as trace:
        ...     result = await process_async()
        ...     await trace.log({"items_processed": len(result)})

    Note:
        `hud.trace()` is sufficient for typical usage. Reach for this async
        variant in high-concurrency scenarios (200+ parallel tasks) or when
        writing a custom async evaluation framework.
    """
    options = {
        "root": root,
        "attrs": attrs,
        "job_id": job_id,
        "task_id": task_id,
    }
    return AsyncTrace(name, **options)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def async_job(
    name: str,
    metadata: dict[str, Any] | None = None,
    job_id: str | None = None,
    dataset_link: str | None = None,
) -> AsyncJob:
    """Build an `AsyncJob` context manager that groups related traces.

    Async counterpart of `hud.job()`. The arguments are forwarded unchanged
    to `AsyncJob`; nothing is sent until the returned object is entered with
    `async with`.

    Args:
        name: Human-readable job name.
        metadata: Optional metadata dictionary.
        job_id: Optional job ID (auto-generated if not provided).
        dataset_link: Optional HuggingFace dataset identifier.

    Returns:
        An `AsyncJob` context manager.

    Example:
        >>> import hud
        >>> async with hud.async_job("Batch Processing") as job:
        ...     for item in items:
        ...         async with hud.async_trace(f"Process {item.id}", job_id=job.id):
        ...             await process(item)

    Note:
        `hud.job()` is sufficient for typical usage. Reach for this async
        variant in high-concurrency scenarios (200+ parallel tasks) or when
        writing a custom async evaluation framework.
    """
    options = {
        "metadata": metadata,
        "job_id": job_id,
        "dataset_link": dataset_link,
    }
    return AsyncJob(name, **options)
|
hud/telemetry/job.py
CHANGED
|
@@ -89,6 +89,33 @@ class Job:
|
|
|
89
89
|
except Exception as e:
|
|
90
90
|
logger.warning("Failed to update job status: %s", e)
|
|
91
91
|
|
|
92
|
+
def update_status_fire_and_forget(self, status: str) -> None:
    """Record `status` on the job and report it without awaiting the request.

    The new status is always stored on `self.status`. When telemetry is
    enabled, a coroutine that POSTs the status to the job status endpoint is
    handed to `fire_and_forget`; a failure inside that coroutine is logged as
    a warning rather than raised.

    Args:
        status: New job status value to store and report.
    """
    self.status = status
    if not settings.telemetry_enabled:
        return

    # Imported lazily, as in the original implementation.
    from hud.utils.async_utils import fire_and_forget

    async def _post_status() -> None:
        try:
            body = {
                "name": self.name,
                "status": status,
                "metadata": self.metadata,
            }
            if self.dataset_link:
                body["dataset_link"] = self.dataset_link

            await make_request(
                method="POST",
                url=f"{settings.hud_telemetry_url}/jobs/{self.id}/status",
                json=body,
                api_key=settings.api_key,
            )
        except Exception as err:
            # Best effort: a failed status report must not break the caller.
            logger.warning("Failed to update job status: %s", err)

    description = f"update job {self.id} status to {status}"
    fire_and_forget(_post_status(), description)
|
|
118
|
+
|
|
92
119
|
async def log(self, metrics: dict[str, Any]) -> None:
|
|
93
120
|
"""Log metrics to the job.
|
|
94
121
|
|
|
@@ -214,9 +241,9 @@ def job(
|
|
|
214
241
|
job_id: str | None = None,
|
|
215
242
|
dataset_link: str | None = None,
|
|
216
243
|
) -> Generator[Job, None, None]:
|
|
217
|
-
"""Context manager for job tracking.
|
|
244
|
+
"""Context manager for job tracking and organization.
|
|
218
245
|
|
|
219
|
-
Groups related tasks together under a single job for tracking and
|
|
246
|
+
Groups related tasks together under a single job for tracking and visualization.
|
|
220
247
|
|
|
221
248
|
Args:
|
|
222
249
|
name: Human-readable job name
|
|
@@ -228,10 +255,22 @@ def job(
|
|
|
228
255
|
Job: The job object
|
|
229
256
|
|
|
230
257
|
Example:
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
258
|
+
>>> import hud
|
|
259
|
+
>>> # Synchronous code
|
|
260
|
+
>>> with hud.job("training_run", {"model": "gpt-4"}) as job:
|
|
261
|
+
... for epoch in range(10):
|
|
262
|
+
... with hud.trace(f"epoch_{epoch}", job_id=job.id):
|
|
263
|
+
... train_epoch()
|
|
264
|
+
>>> # For async code with HIGH CONCURRENCY (200+ tasks), use async_job
|
|
265
|
+
>>> async with hud.async_job("batch_processing") as job:
|
|
266
|
+
... for item in items:
|
|
267
|
+
... async with hud.async_trace(f"process_{item}", job_id=job.id):
|
|
268
|
+
... await process(item)
|
|
269
|
+
|
|
270
|
+
Note:
|
|
271
|
+
For simple async code (< 30 parallel tasks), this context manager works fine.
|
|
272
|
+
Use `hud.async_job()` only for high-concurrency scenarios (200+ parallel tasks)
|
|
273
|
+
where event loop blocking becomes an issue.
|
|
235
274
|
"""
|
|
236
275
|
global _current_job
|
|
237
276
|
|
|
@@ -245,18 +284,18 @@ def job(
|
|
|
245
284
|
_current_job = job_obj
|
|
246
285
|
|
|
247
286
|
try:
|
|
248
|
-
# Update status to running
|
|
249
|
-
job_obj.
|
|
287
|
+
# Update status to running (fire-and-forget to avoid blocking)
|
|
288
|
+
job_obj.update_status_fire_and_forget("running")
|
|
250
289
|
# Print the nice job URL box
|
|
251
290
|
_print_job_url(job_obj.id, job_obj.name)
|
|
252
291
|
yield job_obj
|
|
253
|
-
# Update status to completed
|
|
254
|
-
job_obj.
|
|
292
|
+
# Update status to completed (fire-and-forget to avoid blocking)
|
|
293
|
+
job_obj.update_status_fire_and_forget("completed")
|
|
255
294
|
# Print job completion message
|
|
256
295
|
_print_job_complete_url(job_obj.id, job_obj.name, error_occurred=False)
|
|
257
296
|
except Exception:
|
|
258
|
-
# Update status to failed
|
|
259
|
-
job_obj.
|
|
297
|
+
# Update status to failed (fire-and-forget to avoid blocking)
|
|
298
|
+
job_obj.update_status_fire_and_forget("failed")
|
|
260
299
|
# Print job failure message
|
|
261
300
|
_print_job_complete_url(job_obj.id, job_obj.name, error_occurred=True)
|
|
262
301
|
raise
|