hud-python 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. More details are available through the link on the original diff page.

hud/env/remote_client.py CHANGED
@@ -2,7 +2,9 @@ from __future__ import annotations
2
2
 
3
3
  import logging
4
4
  from base64 import b64decode
5
- from typing import TYPE_CHECKING, Any
5
+ from typing import Any
6
+
7
+ from pydantic import BaseModel
6
8
 
7
9
  from hud.env.client import Client
8
10
  from hud.exceptions import HudResponseError
@@ -10,13 +12,18 @@ from hud.server import make_request
10
12
  from hud.settings import settings
11
13
  from hud.types import EnvironmentStatus
12
14
  from hud.utils import ExecuteResult
13
-
14
- if TYPE_CHECKING:
15
- from hud.utils.config import FunctionConfig
15
+ from hud.utils.config import FunctionConfig
16
16
 
17
17
  logger = logging.getLogger("hud.env.remote_env_client")
18
18
 
19
19
 
20
+ class SetupRequest(BaseModel):
21
+ task_id: str | None = None
22
+ setup: FunctionConfig | None = None
23
+ config: dict[str, Any] | None = None
24
+ metadata: dict[str, Any] | None = None
25
+
26
+
20
27
  class RemoteClient(Client):
21
28
  """
22
29
  Remote environment client implementation.
@@ -183,6 +190,17 @@ class RemoteClient(Client):
183
190
 
184
191
  return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])
185
192
 
193
+ async def setup(self, setup_request: SetupRequest) -> dict[str, Any]:
194
+ """
195
+ Setup the environment.
196
+ """
197
+ return await make_request(
198
+ method="POST",
199
+ url=f"{settings.base_url}/v1/environments/{self.env_id}/reset",
200
+ json=setup_request.model_dump(),
201
+ api_key=settings.api_key,
202
+ )
203
+
186
204
  async def close(self) -> None:
187
205
  """
188
206
  Close the remote environment by making a request to the server.
@@ -20,10 +20,14 @@ if TYPE_CHECKING:
20
20
  logger = logging.getLogger("hud.env.remote_env_client")
21
21
 
22
22
 
23
- async def upload_bytes_to_presigned_url(presigned_url: str, data_bytes: bytes) -> None:
23
+ async def upload_bytes_to_presigned_url(
24
+ presigned_url: str,
25
+ data_bytes: bytes,
26
+ timeout: float = 600,
27
+ ) -> None:
24
28
  try:
25
29
  async with httpx.AsyncClient() as client:
26
- response = await client.put(presigned_url, content=data_bytes)
30
+ response = await client.put(presigned_url, content=data_bytes, timeout=timeout)
27
31
  response.raise_for_status()
28
32
  except httpx.HTTPStatusError as e:
29
33
  logger.exception("Failed to upload to presigned URL")
hud/gym.py CHANGED
@@ -9,13 +9,13 @@ from hud.env.local_docker_client import LocalDockerClient
9
9
  from hud.env.remote_client import RemoteClient
10
10
  from hud.env.remote_docker_client import RemoteDockerClient
11
11
  from hud.exceptions import GymMakeException
12
+ from hud.task import Task
12
13
  from hud.telemetry.context import get_current_task_run_id
13
14
  from hud.types import CustomGym, Gym
14
15
  from hud.utils.common import get_gym_id
15
16
 
16
17
  if TYPE_CHECKING:
17
18
  from hud.job import Job
18
- from hud.task import Task
19
19
 
20
20
  logger = logging.getLogger("hud.gym")
21
21
 
@@ -39,9 +39,11 @@ async def make(
39
39
  task = None
40
40
  if isinstance(env_src, str | CustomGym):
41
41
  gym = env_src
42
- else:
42
+ elif isinstance(env_src, Task):
43
43
  gym = env_src.gym
44
44
  task = env_src
45
+ else:
46
+ raise GymMakeException(f"Invalid gym source: {env_src}", {})
45
47
 
46
48
  effective_job_id = None
47
49
  if job is not None:
@@ -89,9 +91,18 @@ async def make(
89
91
 
90
92
  if gym.location == "local":
91
93
  logger.info("Creating local environment")
92
- client = await LocalDockerClient.create(uri)
94
+ if gym.host_config:
95
+ logger.info("Using host config: %s", gym.host_config)
96
+ client = await LocalDockerClient.create(uri, gym.host_config)
97
+ else:
98
+ client = await LocalDockerClient.create(uri)
99
+
93
100
  elif gym.location == "remote":
94
101
  logger.info("Creating remote environment")
102
+
103
+ if gym.host_config:
104
+ raise ValueError("host_config is not supported for remote environments")
105
+
95
106
  client = await RemoteDockerClient.create(
96
107
  image_uri=uri,
97
108
  job_id=effective_job_id,
@@ -105,7 +116,7 @@ async def make(
105
116
  logger.info("Setting source path %s", gym.image_or_build_context)
106
117
  client.set_source_path(gym.image_or_build_context)
107
118
  elif isinstance(gym, str):
108
- logger.info("Creating private environment")
119
+ logger.debug("Creating private environment")
109
120
  true_gym_id = await get_gym_id(gym)
110
121
  client, build_data = await RemoteClient.create(
111
122
  gym_id=true_gym_id,
hud/job.py CHANGED
@@ -18,12 +18,12 @@ from hud.settings import settings
18
18
  from hud.task import Task
19
19
  from hud.taskset import TaskSet
20
20
  from hud.trajectory import Trajectory
21
- from hud.utils.common import Observation
22
21
  from hud.utils.progress import StepProgressTracker
23
22
 
24
23
  if TYPE_CHECKING:
25
24
  from hud.adapters.common import Adapter
26
25
  from hud.agent.base import Agent
26
+ from hud.utils.common import Observation
27
27
 
28
28
  logger = logging.getLogger("hud.job")
29
29
 
@@ -275,7 +275,7 @@ async def _maybe_resample_action(
275
275
  decision = await response_agent.determine_response(response_text)
276
276
  if decision == "CONTINUE":
277
277
  logger.info("ResponseAgent indicated CONTINUE. Retrying...")
278
- obs = Observation(text="Please continue.")
278
+ obs.text = "Please continue."
279
279
  return obs, False
280
280
  elif decision == "CONTINUE":
281
281
  logger.warning("Max continue retries reached. Stopping despite CONTINUE.")
@@ -321,6 +321,12 @@ async def _execute_task(
321
321
  if agent_instance is None:
322
322
  raise RuntimeError("Agent could not be instantiated")
323
323
 
324
+ agent_name = agent_instance.name
325
+ logger.info("Using agent: %s", agent_name)
326
+ if task.metadata is None or not isinstance(task.metadata, dict):
327
+ task.metadata = {}
328
+ task.metadata["agent_name"] = agent_name
329
+
324
330
  # Environment creation with semaphore
325
331
  if env_creation_semaphore:
326
332
  async with env_creation_semaphore:
@@ -328,6 +334,9 @@ async def _execute_task(
328
334
  else:
329
335
  env = await gym.make(task, job=job)
330
336
 
337
+ if not env:
338
+ raise ValueError(f"Environment creation failed for task {task_id}")
339
+
331
340
  obs_tuple = await env.reset()
332
341
  if obs_tuple is None:
333
342
  raise ValueError(f"env.reset() returned None for task {task_id}")
@@ -335,24 +344,45 @@ async def _execute_task(
335
344
 
336
345
  step_error = None
337
346
 
347
+ resampled_actions = 0
348
+
338
349
  for step in range(max_steps_per_task):
339
350
  action, done = (None, False)
340
351
  try:
341
352
  # Agent prediction with semaphore
342
- if agent_predict_semaphore:
343
- async with agent_predict_semaphore:
353
+ try:
354
+ if agent_predict_semaphore:
355
+ async with agent_predict_semaphore:
356
+ action, done = await agent_instance.predict(obs)
357
+ else:
344
358
  action, done = await agent_instance.predict(obs)
345
- else:
346
- action, done = await agent_instance.predict(obs)
359
+ except Exception as e:
360
+ # if agent prediction fails, pass back the error to the agent
361
+ logger.exception("[TR: %s] Agent prediction failed: %s", task_id, e)
362
+ resampled_actions += 1
363
+ if resampled_actions > 5:
364
+ logger.warning(
365
+ "[TR: %s] Resampled action %d times. Stopping.",
366
+ task_id,
367
+ resampled_actions,
368
+ )
369
+ break
370
+ continue
347
371
 
348
372
  if tracker:
349
373
  tracker.increment_step(task_id)
350
374
 
351
- if action is None and not done:
352
- done = True
353
-
354
- if done and response_agent:
375
+ finish = False
376
+ if done and response_agent and action and len(action) > 0:
355
377
  obs, finish = await _maybe_resample_action(obs, action[-1], response_agent)
378
+ resampled_actions += 1
379
+ if resampled_actions > 5:
380
+ logger.warning(
381
+ "[TR: %s] Resampled action %d times. Stopping.",
382
+ task_id,
383
+ resampled_actions,
384
+ )
385
+ break
356
386
  if not finish:
357
387
  continue
358
388
 
@@ -361,14 +391,12 @@ async def _execute_task(
361
391
  terminated = True
362
392
  else:
363
393
  obs, _, terminated, _ = step_result
364
- if terminated or done:
394
+ if terminated or done or finish:
365
395
  break
366
396
 
367
397
  except Exception as agent_step_err:
368
398
  logger.exception(
369
- "[Job: %s/%s, Task: %s] Step %d Error: %s",
370
- job.name,
371
- job.id,
399
+ "[TR: %s] Step %d Error: %s",
372
400
  task_id,
373
401
  step + 1,
374
402
  agent_step_err,
@@ -386,7 +414,7 @@ async def _execute_task(
386
414
  )
387
415
  continue
388
416
  else:
389
- logger.warning("[Job: %s/%s, Task: %s] Max steps reached.", job.name, job.id, task_id)
417
+ logger.warning("[TR: %s] Max steps reached.", task_id)
390
418
 
391
419
  # --- Evaluate Task ---
392
420
  evaluation_result = None
@@ -401,9 +429,7 @@ async def _execute_task(
401
429
  # logger.info("Evaluation result: %s", evaluation_result)
402
430
  except Exception as eval_err:
403
431
  logger.exception(
404
- "[Job: %s/%s, Task: %s] Evaluation Error: %s",
405
- job.name,
406
- job.id,
432
+ "[TR: %s] Evaluation Error: %s",
407
433
  task_id,
408
434
  eval_err,
409
435
  )
@@ -420,7 +446,7 @@ async def _execute_task(
420
446
  )
421
447
 
422
448
  except Exception as e:
423
- logger.exception("[Job: %s/%s, Task: %s] Setup/Run Error: %s", job.name, job.id, task_id, e)
449
+ logger.exception("[TR: %s] Setup/Run Error: %s", task_id, e)
424
450
  status = "error"
425
451
  error_msg = str(e)
426
452
  # Store setup/initialization error in job
@@ -440,9 +466,7 @@ async def _execute_task(
440
466
  try:
441
467
  await env.close()
442
468
  except Exception as close_err:
443
- logger.exception(
444
- "[Job: %s/%s, Task: %s] Close Error: %s", job.name, job.id, task_id, close_err
445
- )
469
+ logger.exception("[TR: %s] Close Error: %s", task_id, close_err)
446
470
  # Store environment close error in job
447
471
  job.errors.append(
448
472
  {
@@ -455,9 +479,7 @@ async def _execute_task(
455
479
 
456
480
  log_suffix = f" Error: {error_msg}" if status == "error" else f" Eval: {evaluation_result}"
457
481
  logger.info(
458
- "[Job: %s/%s, Task: %s] Finished local execution. Status: %s.%s",
459
- job.name,
460
- job.id,
482
+ "[TR: %s] Finished local execution. Status: %s.%s",
461
483
  task_id,
462
484
  status,
463
485
  log_suffix,
@@ -499,6 +521,7 @@ async def run_job(
499
521
  run_parallel: bool = True,
500
522
  job_metadata: dict[str, Any] | None = None,
501
523
  show_progress: bool = True,
524
+ verbose: bool = False,
502
525
  # Concurrency control with semaphores
503
526
  max_concurrent_env_creations: int | None = 30, # Limits gym.make calls
504
527
  max_concurrent_agent_predictions: int | None = None, # No limit on LLM calls
@@ -538,10 +561,16 @@ async def run_job(
538
561
  tasks_to_run: list[Task] = []
539
562
  created_job: Job | None = None
540
563
 
564
+ # Get hud logger
565
+ if not verbose:
566
+ logger = logging.getLogger("hud")
567
+ logger.setLevel(logging.CRITICAL)
568
+ logger = logging.getLogger("hud.job")
569
+
541
570
  evalset_id = None
542
571
  if isinstance(task_or_taskset, TaskSet):
543
572
  evalset_id = task_or_taskset.id
544
- await task_or_taskset.fit(agent_cls)
573
+ task_or_taskset.fit(agent_cls)
545
574
 
546
575
  gym_id = None
547
576
  if isinstance(task_or_taskset, Task):
@@ -706,3 +735,39 @@ async def run_job(
706
735
  num_tasks,
707
736
  )
708
737
  return created_job
738
+
739
+
740
+ """
741
+ c7f85f7d-3730-4c9a-85a3-a1dc436c3bd2
742
+
743
+
744
+ de12c3cc-9d9c-4e90-82cc-1d71d30ede54
745
+ 59104743-0a63-4569-a8b5-1eda1a1b55ac
746
+ ff759429-056c-4cde-8851-11e26729ff03
747
+
748
+
749
+ 7b98ea22-e243-4eeb-a6db-79f4a76da2b3
750
+
751
+ 7aad3f7b-d74f-470d-826d-d817f95fdd67
752
+
753
+ e356ede6-074a-49ef-9fcd-69e5bcfbdec9
754
+
755
+ 26cd1192-3991-4d1b-b599-b2bed1bcb606
756
+
757
+ 31ece277-970f-4763-b0c8-bf19a56f56c7
758
+
759
+
760
+ f9b722a0-5f33-466b-bce0-8ece101f2bc6
761
+ 33d1af33-8952-4945-b901-229bcfd88354
762
+
763
+ 6c3d6557-e745-44ab-bc10-300180a81c79
764
+ 6c3d6557-e745-44ab-bc10-300180a81c79
765
+ 502e02b5-9939-4e57-91af-4fcbcb90a979
766
+
767
+ 7aad3f7b-d74f-470d-826d-d817f95fdd67
768
+
769
+
770
+ 31ece277-970f-4763-b0c8-bf19a56f56c7
771
+
772
+
773
+ e356ede6-074a-49ef-9fcd-69e5bcfbdec9"""
hud/settings.py CHANGED
@@ -44,6 +44,12 @@ class Settings(BaseSettings):
44
44
  validation_alias="TELEMETRY_ENABLED",
45
45
  )
46
46
 
47
+ fancy_logging: bool = Field(
48
+ default=True,
49
+ description="Enable fancy logging for the HUD SDK",
50
+ validation_alias="FANCY_LOGGING",
51
+ )
52
+
47
53
 
48
54
  # Create a singleton instance
49
55
  settings = Settings()
hud/task.py CHANGED
@@ -2,12 +2,12 @@ from __future__ import annotations
2
2
 
3
3
  import tempfile
4
4
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Any
5
+ from typing import TYPE_CHECKING, Any, Literal, cast
6
6
 
7
7
  from inspect_ai.util._sandbox import SandboxEnvironmentSpec
8
- from pydantic import BaseModel
8
+ from pydantic import BaseModel, Field
9
9
 
10
- from hud.types import CustomGym, Gym
10
+ from hud.types import CustomGym, Gym, MetadataKeys, SensitiveData
11
11
  from hud.utils.common import FunctionConfig, FunctionConfigs
12
12
 
13
13
  if TYPE_CHECKING:
@@ -40,28 +40,78 @@ class Task(BaseModel):
40
40
  Attributes:
41
41
  id: The remote task ID (optional if local-only)
42
42
  prompt: The task prompt or instruction
43
+ system_prompt: The system prompt for the evalset (optional)
43
44
  setup: Environment setup configuration (optional)
44
45
  evaluate: Configuration for evaluating responses
45
46
  metadata: Additional task metadata
47
+ sensitive_data: Sensitive data such as API keys, passwords, etc.
46
48
  choices: Multiple choice answer list (for Inspect compatibility)
47
49
  target: Ideal target output (for Inspect compatibility)
48
50
  files: Files that go along with the task (for Inspect compatibility)
49
51
  gym: Environment specification
50
52
  """
51
53
 
52
- id: str | None = None
53
- prompt: str
54
+ id: str | None = None # Remote task ID (optional if local-only)
55
+
56
+ prompt: str # Task prompt or instruction
57
+ system_prompt: str | None = None # System prompt for the evalset (optional)
58
+
59
+ gym: Gym | None = None # Environment specification
60
+
61
+ # Setup and evaluate configurations for the environment (environment specific)
54
62
  setup: FunctionConfigs | None = None
55
63
  evaluate: FunctionConfigs | None = None
56
- gym: Gym | None = None
64
+
65
+ # Overflow configuration for environments that don't conform to the standard
57
66
  config: dict[str, Any] | None = None
58
67
 
68
+ # Sensitive data such as API keys, passwords, etc.
69
+ sensitive_data: SensitiveData = Field(default_factory=dict)
70
+
71
+ # Metadata for the task evaluation, information about the agent (see MetadataKeys)
72
+ metadata: dict[MetadataKeys, Any] = Field(default_factory=dict)
73
+
74
+ # Description of the task, for extra information about its purpose and context
59
75
  description: str | None = None
60
76
 
61
77
  @classmethod
62
78
  def from_dict(cls, data: dict[str, Any]) -> Task:
63
79
  return cls(**data)
64
80
 
81
+ @classmethod
82
+ def from_serialized(cls, data: dict[str, Any]) -> Task:
83
+ gym_data = data.get("gym")
84
+ parsed_gym: Gym | None = gym_data
85
+
86
+ parsed_setup = [(param, entry) for param, entry in data.get("setup", [])]
87
+ parsed_evaluate = [(param, entry) for param, entry in data.get("evaluate", [])]
88
+
89
+ # Convert dict gym data to CustomGym if needed
90
+ if (
91
+ isinstance(gym_data, dict)
92
+ and gym_data.get("type") == "public"
93
+ and gym_data.get("location") in ("local", "remote")
94
+ and gym_data.get("image_or_build_context") is not None
95
+ ):
96
+ parsed_gym = CustomGym(
97
+ type=cast("Literal['public']", gym_data["type"]),
98
+ location=cast("Literal['local', 'remote']", gym_data["location"]),
99
+ image_or_build_context=Path(gym_data["image_or_build_context"]),
100
+ )
101
+
102
+ return cls(
103
+ id=data.get("id"),
104
+ prompt=data.get("prompt", ""),
105
+ system_prompt=data.get("system_prompt"),
106
+ setup=parsed_setup,
107
+ evaluate=parsed_evaluate,
108
+ gym=parsed_gym,
109
+ config=data.get("config"),
110
+ description=data.get("description"),
111
+ sensitive_data=data.get("sensitive_data", {}),
112
+ metadata=data.get("metadata", {}),
113
+ )
114
+
65
115
  @classmethod
66
116
  def from_inspect_sample(cls, sample: Sample) -> Task:
67
117
  """Create a Task from an Inspect dataset sample.
@@ -144,3 +194,31 @@ class Task(BaseModel):
144
194
  if self.gym is None:
145
195
  return
146
196
  self.gym = agent.transfer_gyms.get(self.gym, self.gym)
197
+
198
+ def serialize(self) -> dict[str, Any]:
199
+ if isinstance(self.setup, list):
200
+ parsed_setup = [[param, entry] for param, entry in self.setup]
201
+ else:
202
+ parsed_setup = self.setup
203
+ if isinstance(self.evaluate, list):
204
+ parsed_evaluate = [[param, entry] for param, entry in self.evaluate]
205
+ else:
206
+ parsed_evaluate = self.evaluate
207
+
208
+ if isinstance(self.gym, CustomGym):
209
+ parsed_gym = self.gym.model_dump()
210
+ parsed_gym["image_or_build_context"] = str(parsed_gym["image_or_build_context"])
211
+ else: # is ServerGym
212
+ parsed_gym = self.gym
213
+
214
+ return {
215
+ "id": self.id,
216
+ "prompt": self.prompt,
217
+ "config": self.config,
218
+ "description": self.description,
219
+ "setup": parsed_setup,
220
+ "evaluate": parsed_evaluate,
221
+ "gym": parsed_gym,
222
+ "sensitive_data": self.sensitive_data,
223
+ "metadata": self.metadata,
224
+ }
hud/taskset.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING
3
+ from pathlib import PosixPath
4
+ from typing import TYPE_CHECKING, Any, get_args
4
5
  from venv import logger
5
6
 
6
7
  from pydantic import BaseModel
@@ -9,6 +10,7 @@ from hud.env.environment import create_remote_config
9
10
  from hud.server import make_request
10
11
  from hud.settings import settings
11
12
  from hud.task import Task
13
+ from hud.types import CustomGym, ServerGym
12
14
  from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP
13
15
 
14
16
  if TYPE_CHECKING:
@@ -101,13 +103,30 @@ class TaskSet(BaseModel):
101
103
  else:
102
104
  evaluate_config = None
103
105
 
106
+ if isinstance(task.gym, CustomGym):
107
+ if isinstance(task.gym.image_or_build_context, PosixPath):
108
+ raise ValueError(
109
+ "Local build contexts are not supported for "
110
+ "remote tasksets, attach an image or existing "
111
+ "gym id."
112
+ )
113
+ gym_str = "docker"
114
+ image_uri = task.gym.image_or_build_context
115
+ elif isinstance(task.gym, str) and task.gym in get_args(ServerGym):
116
+ gym_str = task.gym
117
+ image_uri = None
118
+ else:
119
+ raise ValueError(f"Unknown gym type: {type(task.gym)}")
120
+
104
121
  processed_tasks.append(
105
122
  {
106
123
  "prompt": task.prompt,
107
- "gym": task.gym,
124
+ "gym": gym_str,
108
125
  "setup": setup_config,
109
126
  "evaluate": evaluate_config,
110
127
  "config": task.config,
128
+ "image_uri": image_uri,
129
+ "description": task.description,
111
130
  }
112
131
  )
113
132
 
@@ -125,7 +144,15 @@ class TaskSet(BaseModel):
125
144
  "Taskset %s uploaded successfully, see it on app.hud.so/evalsets/%s", name, name
126
145
  )
127
146
 
128
- async def fit(self, agent: Agent | type[Agent]) -> None:
147
+ def _apply(self, dict: dict[str, Any]) -> None:
148
+ """
149
+ Applies a parameter to all tasks in the taskset.
150
+ """
151
+ for task in self.tasks:
152
+ for key, value in dict.items():
153
+ setattr(task, key, value)
154
+
155
+ def fit(self, agent: Agent | type[Agent]) -> None:
129
156
  """
130
157
  Automatically adapts the taskset to the agent's transfer_gyms.
131
158
  """
@@ -133,19 +160,27 @@ class TaskSet(BaseModel):
133
160
  agent = agent()
134
161
 
135
162
  for task in self.tasks:
136
- if task.gym is None:
163
+ if task.gym is None or isinstance(task.gym, CustomGym):
137
164
  continue
138
165
  task.gym = agent.transfer_gyms.get(task.gym, task.gym)
139
166
 
140
167
 
141
- async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
168
+ async def load_taskset(
169
+ taskset_id: str,
170
+ api_key: str | None = None,
171
+ metadata: dict[str, Any] | None = None,
172
+ load_custom_as_local: bool = False,
173
+ system_prompt: str | None = None,
174
+ ) -> TaskSet:
142
175
  """
143
176
  Loads a TaskSet by its ID.
144
177
 
145
178
  Args:
146
179
  taskset_id: The ID of the taskset to load
147
180
  api_key: Optional API key to use for the request
148
-
181
+ metadata: Optional metadata to apply to the taskset
182
+ load_custom_as_local: Whether to load custom gyms as local
183
+ system_prompt: Optional system prompt to override the default
149
184
  Returns:
150
185
  TaskSet: The loaded taskset
151
186
  """
@@ -161,13 +196,33 @@ async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
161
196
 
162
197
  logger.info(f"Taskset {taskset_id} loaded successfully")
163
198
 
164
- return TaskSet.model_validate(
199
+ tasks = data["evalset"]
200
+ for task in tasks:
201
+ if system_prompt:
202
+ task["system_prompt"] = system_prompt
203
+ if task["gym"] == "docker":
204
+ if "image_uri" not in task:
205
+ raise ValueError(
206
+ "No `image_uri` key found. This taskset may be "
207
+ "incompatible with your version of HUD SDK."
208
+ )
209
+
210
+ task["gym"] = CustomGym(
211
+ location="local" if load_custom_as_local else "remote",
212
+ image_or_build_context=task["image_uri"],
213
+ )
214
+
215
+ taskset = TaskSet.model_validate(
165
216
  {
166
217
  "id": taskset_id,
167
- "tasks": data["evalset"],
218
+ "tasks": tasks,
168
219
  }
169
220
  )
170
221
 
222
+ taskset._apply({"metadata": metadata})
223
+
224
+ return taskset
225
+
171
226
 
172
227
  def load_from_inspect(dataset: Dataset) -> TaskSet:
173
228
  """
hud/telemetry/exporter.py CHANGED
@@ -298,12 +298,10 @@ async def _export_trace_payload_async(payload: dict[str, Any]) -> None:
298
298
  "telemetry": payload.get("mcp_calls", []),
299
299
  }
300
300
 
301
- # Ensure mcp_calls is not empty if that's a requirement, or send as is. For now, send as is.
302
- # if not data_to_send["mcp_calls"]:
303
- # logger.debug("No MCP calls in payload for task run %s, skipping specific export if "
304
- # "desired.", task_run_id)
305
- # # Depending on backend, might not want to send empty mcp_calls list, or it's fine.
301
+ await send_telemetry_to_server(task_run_id, data_to_send)
306
302
 
303
+
304
+ async def send_telemetry_to_server(task_run_id: str, data: dict[str, Any]) -> None:
307
305
  telemetry_url = f"{settings.base_url}/v2/task_runs/{task_run_id}/telemetry-upload"
308
306
 
309
307
  try:
@@ -320,7 +318,7 @@ async def _export_trace_payload_async(payload: dict[str, Any]) -> None:
320
318
  )
321
319
  response = await client.post(
322
320
  telemetry_url,
323
- json=data_to_send, # Send the structured attributes and mcp_calls
321
+ json=data, # Send the structured attributes and mcp_calls
324
322
  headers=headers,
325
323
  timeout=30.0,
326
324
  )
hud/trajectory.py CHANGED
@@ -6,6 +6,8 @@ import datetime
6
6
  from IPython.display import HTML, Markdown, display
7
7
  from pydantic import BaseModel, Field
8
8
 
9
+ from .adapters.common.types import LogType
10
+
9
11
 
10
12
  class TrajectoryStep(BaseModel):
11
13
  """Model representing a single task run's trajectory information."""
@@ -13,6 +15,7 @@ class TrajectoryStep(BaseModel):
13
15
  observation_url: str | None = None
14
16
  observation_text: str | None = None
15
17
  actions: list[dict]
18
+ logs: LogType | None = None
16
19
  start_timestamp: str | None = None
17
20
  end_timestamp: str | None = None
18
21