hud-python 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +13 -10
- hud/adapters/claude/adapter.py +30 -18
- hud/adapters/common/adapter.py +0 -1
- hud/adapters/common/types.py +129 -4
- hud/adapters/operator/adapter.py +23 -13
- hud/agent/base.py +5 -4
- hud/agent/claude.py +65 -13
- hud/agent/claude_plays_pokemon.py +2 -2
- hud/agent/langchain.py +8 -2
- hud/agent/operator.py +36 -11
- hud/agent/tests/test_base.py +2 -2
- hud/env/docker_client.py +24 -2
- hud/env/environment.py +86 -40
- hud/env/local_docker_client.py +50 -4
- hud/env/remote_client.py +22 -4
- hud/env/remote_docker_client.py +6 -2
- hud/gym.py +15 -4
- hud/job.py +91 -26
- hud/settings.py +6 -0
- hud/task.py +84 -6
- hud/taskset.py +63 -8
- hud/telemetry/exporter.py +4 -6
- hud/trajectory.py +3 -0
- hud/types.py +28 -2
- hud/utils/agent.py +37 -0
- hud/utils/common.py +142 -26
- hud/utils/config.py +11 -0
- hud/utils/tests/test_common.py +225 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.2.6.dist-info → hud_python-0.2.7.dist-info}/METADATA +9 -6
- {hud_python-0.2.6.dist-info → hud_python-0.2.7.dist-info}/RECORD +34 -33
- {hud_python-0.2.6.dist-info → hud_python-0.2.7.dist-info}/WHEEL +0 -0
- {hud_python-0.2.6.dist-info → hud_python-0.2.7.dist-info}/licenses/LICENSE +0 -0
hud/env/remote_client.py
CHANGED
|
@@ -2,7 +2,9 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
from base64 import b64decode
|
|
5
|
-
from typing import
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
6
8
|
|
|
7
9
|
from hud.env.client import Client
|
|
8
10
|
from hud.exceptions import HudResponseError
|
|
@@ -10,13 +12,18 @@ from hud.server import make_request
|
|
|
10
12
|
from hud.settings import settings
|
|
11
13
|
from hud.types import EnvironmentStatus
|
|
12
14
|
from hud.utils import ExecuteResult
|
|
13
|
-
|
|
14
|
-
if TYPE_CHECKING:
|
|
15
|
-
from hud.utils.config import FunctionConfig
|
|
15
|
+
from hud.utils.config import FunctionConfig
|
|
16
16
|
|
|
17
17
|
logger = logging.getLogger("hud.env.remote_env_client")
|
|
18
18
|
|
|
19
19
|
|
|
20
|
+
class SetupRequest(BaseModel):
|
|
21
|
+
task_id: str | None = None
|
|
22
|
+
setup: FunctionConfig | None = None
|
|
23
|
+
config: dict[str, Any] | None = None
|
|
24
|
+
metadata: dict[str, Any] | None = None
|
|
25
|
+
|
|
26
|
+
|
|
20
27
|
class RemoteClient(Client):
|
|
21
28
|
"""
|
|
22
29
|
Remote environment client implementation.
|
|
@@ -183,6 +190,17 @@ class RemoteClient(Client):
|
|
|
183
190
|
|
|
184
191
|
return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])
|
|
185
192
|
|
|
193
|
+
async def setup(self, setup_request: SetupRequest) -> dict[str, Any]:
|
|
194
|
+
"""
|
|
195
|
+
Setup the environment.
|
|
196
|
+
"""
|
|
197
|
+
return await make_request(
|
|
198
|
+
method="POST",
|
|
199
|
+
url=f"{settings.base_url}/v1/environments/{self.env_id}/reset",
|
|
200
|
+
json=setup_request.model_dump(),
|
|
201
|
+
api_key=settings.api_key,
|
|
202
|
+
)
|
|
203
|
+
|
|
186
204
|
async def close(self) -> None:
|
|
187
205
|
"""
|
|
188
206
|
Close the remote environment by making a request to the server.
|
hud/env/remote_docker_client.py
CHANGED
|
@@ -20,10 +20,14 @@ if TYPE_CHECKING:
|
|
|
20
20
|
logger = logging.getLogger("hud.env.remote_env_client")
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
async def upload_bytes_to_presigned_url(presigned_url: str, data_bytes: bytes) -> None:
|
|
23
|
+
async def upload_bytes_to_presigned_url(
|
|
24
|
+
presigned_url: str,
|
|
25
|
+
data_bytes: bytes,
|
|
26
|
+
timeout: float = 600,
|
|
27
|
+
) -> None:
|
|
24
28
|
try:
|
|
25
29
|
async with httpx.AsyncClient() as client:
|
|
26
|
-
response = await client.put(presigned_url, content=data_bytes)
|
|
30
|
+
response = await client.put(presigned_url, content=data_bytes, timeout=timeout)
|
|
27
31
|
response.raise_for_status()
|
|
28
32
|
except httpx.HTTPStatusError as e:
|
|
29
33
|
logger.exception("Failed to upload to presigned URL")
|
hud/gym.py
CHANGED
|
@@ -9,13 +9,13 @@ from hud.env.local_docker_client import LocalDockerClient
|
|
|
9
9
|
from hud.env.remote_client import RemoteClient
|
|
10
10
|
from hud.env.remote_docker_client import RemoteDockerClient
|
|
11
11
|
from hud.exceptions import GymMakeException
|
|
12
|
+
from hud.task import Task
|
|
12
13
|
from hud.telemetry.context import get_current_task_run_id
|
|
13
14
|
from hud.types import CustomGym, Gym
|
|
14
15
|
from hud.utils.common import get_gym_id
|
|
15
16
|
|
|
16
17
|
if TYPE_CHECKING:
|
|
17
18
|
from hud.job import Job
|
|
18
|
-
from hud.task import Task
|
|
19
19
|
|
|
20
20
|
logger = logging.getLogger("hud.gym")
|
|
21
21
|
|
|
@@ -39,9 +39,11 @@ async def make(
|
|
|
39
39
|
task = None
|
|
40
40
|
if isinstance(env_src, str | CustomGym):
|
|
41
41
|
gym = env_src
|
|
42
|
-
|
|
42
|
+
elif isinstance(env_src, Task):
|
|
43
43
|
gym = env_src.gym
|
|
44
44
|
task = env_src
|
|
45
|
+
else:
|
|
46
|
+
raise GymMakeException(f"Invalid gym source: {env_src}", {})
|
|
45
47
|
|
|
46
48
|
effective_job_id = None
|
|
47
49
|
if job is not None:
|
|
@@ -89,9 +91,18 @@ async def make(
|
|
|
89
91
|
|
|
90
92
|
if gym.location == "local":
|
|
91
93
|
logger.info("Creating local environment")
|
|
92
|
-
|
|
94
|
+
if gym.host_config:
|
|
95
|
+
logger.info("Using host config: %s", gym.host_config)
|
|
96
|
+
client = await LocalDockerClient.create(uri, gym.host_config)
|
|
97
|
+
else:
|
|
98
|
+
client = await LocalDockerClient.create(uri)
|
|
99
|
+
|
|
93
100
|
elif gym.location == "remote":
|
|
94
101
|
logger.info("Creating remote environment")
|
|
102
|
+
|
|
103
|
+
if gym.host_config:
|
|
104
|
+
raise ValueError("host_config is not supported for remote environments")
|
|
105
|
+
|
|
95
106
|
client = await RemoteDockerClient.create(
|
|
96
107
|
image_uri=uri,
|
|
97
108
|
job_id=effective_job_id,
|
|
@@ -105,7 +116,7 @@ async def make(
|
|
|
105
116
|
logger.info("Setting source path %s", gym.image_or_build_context)
|
|
106
117
|
client.set_source_path(gym.image_or_build_context)
|
|
107
118
|
elif isinstance(gym, str):
|
|
108
|
-
logger.
|
|
119
|
+
logger.debug("Creating private environment")
|
|
109
120
|
true_gym_id = await get_gym_id(gym)
|
|
110
121
|
client, build_data = await RemoteClient.create(
|
|
111
122
|
gym_id=true_gym_id,
|
hud/job.py
CHANGED
|
@@ -18,12 +18,12 @@ from hud.settings import settings
|
|
|
18
18
|
from hud.task import Task
|
|
19
19
|
from hud.taskset import TaskSet
|
|
20
20
|
from hud.trajectory import Trajectory
|
|
21
|
-
from hud.utils.common import Observation
|
|
22
21
|
from hud.utils.progress import StepProgressTracker
|
|
23
22
|
|
|
24
23
|
if TYPE_CHECKING:
|
|
25
24
|
from hud.adapters.common import Adapter
|
|
26
25
|
from hud.agent.base import Agent
|
|
26
|
+
from hud.utils.common import Observation
|
|
27
27
|
|
|
28
28
|
logger = logging.getLogger("hud.job")
|
|
29
29
|
|
|
@@ -275,7 +275,7 @@ async def _maybe_resample_action(
|
|
|
275
275
|
decision = await response_agent.determine_response(response_text)
|
|
276
276
|
if decision == "CONTINUE":
|
|
277
277
|
logger.info("ResponseAgent indicated CONTINUE. Retrying...")
|
|
278
|
-
obs =
|
|
278
|
+
obs.text = "Please continue."
|
|
279
279
|
return obs, False
|
|
280
280
|
elif decision == "CONTINUE":
|
|
281
281
|
logger.warning("Max continue retries reached. Stopping despite CONTINUE.")
|
|
@@ -321,6 +321,12 @@ async def _execute_task(
|
|
|
321
321
|
if agent_instance is None:
|
|
322
322
|
raise RuntimeError("Agent could not be instantiated")
|
|
323
323
|
|
|
324
|
+
agent_name = agent_instance.name
|
|
325
|
+
logger.info("Using agent: %s", agent_name)
|
|
326
|
+
if task.metadata is None or not isinstance(task.metadata, dict):
|
|
327
|
+
task.metadata = {}
|
|
328
|
+
task.metadata["agent_name"] = agent_name
|
|
329
|
+
|
|
324
330
|
# Environment creation with semaphore
|
|
325
331
|
if env_creation_semaphore:
|
|
326
332
|
async with env_creation_semaphore:
|
|
@@ -328,6 +334,9 @@ async def _execute_task(
|
|
|
328
334
|
else:
|
|
329
335
|
env = await gym.make(task, job=job)
|
|
330
336
|
|
|
337
|
+
if not env:
|
|
338
|
+
raise ValueError(f"Environment creation failed for task {task_id}")
|
|
339
|
+
|
|
331
340
|
obs_tuple = await env.reset()
|
|
332
341
|
if obs_tuple is None:
|
|
333
342
|
raise ValueError(f"env.reset() returned None for task {task_id}")
|
|
@@ -335,24 +344,45 @@ async def _execute_task(
|
|
|
335
344
|
|
|
336
345
|
step_error = None
|
|
337
346
|
|
|
347
|
+
resampled_actions = 0
|
|
348
|
+
|
|
338
349
|
for step in range(max_steps_per_task):
|
|
339
350
|
action, done = (None, False)
|
|
340
351
|
try:
|
|
341
352
|
# Agent prediction with semaphore
|
|
342
|
-
|
|
343
|
-
|
|
353
|
+
try:
|
|
354
|
+
if agent_predict_semaphore:
|
|
355
|
+
async with agent_predict_semaphore:
|
|
356
|
+
action, done = await agent_instance.predict(obs)
|
|
357
|
+
else:
|
|
344
358
|
action, done = await agent_instance.predict(obs)
|
|
345
|
-
|
|
346
|
-
|
|
359
|
+
except Exception as e:
|
|
360
|
+
# if agent prediction fails, pass back the error to the agent
|
|
361
|
+
logger.exception("[TR: %s] Agent prediction failed: %s", task_id, e)
|
|
362
|
+
resampled_actions += 1
|
|
363
|
+
if resampled_actions > 5:
|
|
364
|
+
logger.warning(
|
|
365
|
+
"[TR: %s] Resampled action %d times. Stopping.",
|
|
366
|
+
task_id,
|
|
367
|
+
resampled_actions,
|
|
368
|
+
)
|
|
369
|
+
break
|
|
370
|
+
continue
|
|
347
371
|
|
|
348
372
|
if tracker:
|
|
349
373
|
tracker.increment_step(task_id)
|
|
350
374
|
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
if done and response_agent:
|
|
375
|
+
finish = False
|
|
376
|
+
if done and response_agent and action and len(action) > 0:
|
|
355
377
|
obs, finish = await _maybe_resample_action(obs, action[-1], response_agent)
|
|
378
|
+
resampled_actions += 1
|
|
379
|
+
if resampled_actions > 5:
|
|
380
|
+
logger.warning(
|
|
381
|
+
"[TR: %s] Resampled action %d times. Stopping.",
|
|
382
|
+
task_id,
|
|
383
|
+
resampled_actions,
|
|
384
|
+
)
|
|
385
|
+
break
|
|
356
386
|
if not finish:
|
|
357
387
|
continue
|
|
358
388
|
|
|
@@ -361,14 +391,12 @@ async def _execute_task(
|
|
|
361
391
|
terminated = True
|
|
362
392
|
else:
|
|
363
393
|
obs, _, terminated, _ = step_result
|
|
364
|
-
if terminated or done:
|
|
394
|
+
if terminated or done or finish:
|
|
365
395
|
break
|
|
366
396
|
|
|
367
397
|
except Exception as agent_step_err:
|
|
368
398
|
logger.exception(
|
|
369
|
-
"[Job: %s/%s, Task: %s] Step %d Error: %s",
|
|
370
|
-
job.name,
|
|
371
|
-
job.id,
|
|
399
|
+
"[TR: %s] Step %d Error: %s",
|
|
372
400
|
task_id,
|
|
373
401
|
step + 1,
|
|
374
402
|
agent_step_err,
|
|
@@ -386,7 +414,7 @@ async def _execute_task(
|
|
|
386
414
|
)
|
|
387
415
|
continue
|
|
388
416
|
else:
|
|
389
|
-
logger.warning("[
|
|
417
|
+
logger.warning("[TR: %s] Max steps reached.", task_id)
|
|
390
418
|
|
|
391
419
|
# --- Evaluate Task ---
|
|
392
420
|
evaluation_result = None
|
|
@@ -401,9 +429,7 @@ async def _execute_task(
|
|
|
401
429
|
# logger.info("Evaluation result: %s", evaluation_result)
|
|
402
430
|
except Exception as eval_err:
|
|
403
431
|
logger.exception(
|
|
404
|
-
"[Job: %s/%s, Task: %s] Evaluation Error: %s",
|
|
405
|
-
job.name,
|
|
406
|
-
job.id,
|
|
432
|
+
"[TR: %s] Evaluation Error: %s",
|
|
407
433
|
task_id,
|
|
408
434
|
eval_err,
|
|
409
435
|
)
|
|
@@ -420,7 +446,7 @@ async def _execute_task(
|
|
|
420
446
|
)
|
|
421
447
|
|
|
422
448
|
except Exception as e:
|
|
423
|
-
logger.exception("[
|
|
449
|
+
logger.exception("[TR: %s] Setup/Run Error: %s", task_id, e)
|
|
424
450
|
status = "error"
|
|
425
451
|
error_msg = str(e)
|
|
426
452
|
# Store setup/initialization error in job
|
|
@@ -440,9 +466,7 @@ async def _execute_task(
|
|
|
440
466
|
try:
|
|
441
467
|
await env.close()
|
|
442
468
|
except Exception as close_err:
|
|
443
|
-
logger.exception(
|
|
444
|
-
"[Job: %s/%s, Task: %s] Close Error: %s", job.name, job.id, task_id, close_err
|
|
445
|
-
)
|
|
469
|
+
logger.exception("[TR: %s] Close Error: %s", task_id, close_err)
|
|
446
470
|
# Store environment close error in job
|
|
447
471
|
job.errors.append(
|
|
448
472
|
{
|
|
@@ -455,9 +479,7 @@ async def _execute_task(
|
|
|
455
479
|
|
|
456
480
|
log_suffix = f" Error: {error_msg}" if status == "error" else f" Eval: {evaluation_result}"
|
|
457
481
|
logger.info(
|
|
458
|
-
"[Job: %s/%s, Task: %s] Finished local execution. Status: %s.%s",
|
|
459
|
-
job.name,
|
|
460
|
-
job.id,
|
|
482
|
+
"[TR: %s] Finished local execution. Status: %s.%s",
|
|
461
483
|
task_id,
|
|
462
484
|
status,
|
|
463
485
|
log_suffix,
|
|
@@ -499,6 +521,7 @@ async def run_job(
|
|
|
499
521
|
run_parallel: bool = True,
|
|
500
522
|
job_metadata: dict[str, Any] | None = None,
|
|
501
523
|
show_progress: bool = True,
|
|
524
|
+
verbose: bool = False,
|
|
502
525
|
# Concurrency control with semaphores
|
|
503
526
|
max_concurrent_env_creations: int | None = 30, # Limits gym.make calls
|
|
504
527
|
max_concurrent_agent_predictions: int | None = None, # No limit on LLM calls
|
|
@@ -538,10 +561,16 @@ async def run_job(
|
|
|
538
561
|
tasks_to_run: list[Task] = []
|
|
539
562
|
created_job: Job | None = None
|
|
540
563
|
|
|
564
|
+
# Get hud logger
|
|
565
|
+
if not verbose:
|
|
566
|
+
logger = logging.getLogger("hud")
|
|
567
|
+
logger.setLevel(logging.CRITICAL)
|
|
568
|
+
logger = logging.getLogger("hud.job")
|
|
569
|
+
|
|
541
570
|
evalset_id = None
|
|
542
571
|
if isinstance(task_or_taskset, TaskSet):
|
|
543
572
|
evalset_id = task_or_taskset.id
|
|
544
|
-
|
|
573
|
+
task_or_taskset.fit(agent_cls)
|
|
545
574
|
|
|
546
575
|
gym_id = None
|
|
547
576
|
if isinstance(task_or_taskset, Task):
|
|
@@ -706,3 +735,39 @@ async def run_job(
|
|
|
706
735
|
num_tasks,
|
|
707
736
|
)
|
|
708
737
|
return created_job
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
"""
|
|
741
|
+
c7f85f7d-3730-4c9a-85a3-a1dc436c3bd2
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
de12c3cc-9d9c-4e90-82cc-1d71d30ede54
|
|
745
|
+
59104743-0a63-4569-a8b5-1eda1a1b55ac
|
|
746
|
+
ff759429-056c-4cde-8851-11e26729ff03
|
|
747
|
+
|
|
748
|
+
|
|
749
|
+
7b98ea22-e243-4eeb-a6db-79f4a76da2b3
|
|
750
|
+
|
|
751
|
+
7aad3f7b-d74f-470d-826d-d817f95fdd67
|
|
752
|
+
|
|
753
|
+
e356ede6-074a-49ef-9fcd-69e5bcfbdec9
|
|
754
|
+
|
|
755
|
+
26cd1192-3991-4d1b-b599-b2bed1bcb606
|
|
756
|
+
|
|
757
|
+
31ece277-970f-4763-b0c8-bf19a56f56c7
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
f9b722a0-5f33-466b-bce0-8ece101f2bc6
|
|
761
|
+
33d1af33-8952-4945-b901-229bcfd88354
|
|
762
|
+
|
|
763
|
+
6c3d6557-e745-44ab-bc10-300180a81c79
|
|
764
|
+
6c3d6557-e745-44ab-bc10-300180a81c79
|
|
765
|
+
502e02b5-9939-4e57-91af-4fcbcb90a979
|
|
766
|
+
|
|
767
|
+
7aad3f7b-d74f-470d-826d-d817f95fdd67
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
31ece277-970f-4763-b0c8-bf19a56f56c7
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
e356ede6-074a-49ef-9fcd-69e5bcfbdec9"""
|
hud/settings.py
CHANGED
|
@@ -44,6 +44,12 @@ class Settings(BaseSettings):
|
|
|
44
44
|
validation_alias="TELEMETRY_ENABLED",
|
|
45
45
|
)
|
|
46
46
|
|
|
47
|
+
fancy_logging: bool = Field(
|
|
48
|
+
default=True,
|
|
49
|
+
description="Enable fancy logging for the HUD SDK",
|
|
50
|
+
validation_alias="FANCY_LOGGING",
|
|
51
|
+
)
|
|
52
|
+
|
|
47
53
|
|
|
48
54
|
# Create a singleton instance
|
|
49
55
|
settings = Settings()
|
hud/task.py
CHANGED
|
@@ -2,12 +2,12 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import tempfile
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import TYPE_CHECKING, Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
6
6
|
|
|
7
7
|
from inspect_ai.util._sandbox import SandboxEnvironmentSpec
|
|
8
|
-
from pydantic import BaseModel
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
9
|
|
|
10
|
-
from hud.types import CustomGym, Gym
|
|
10
|
+
from hud.types import CustomGym, Gym, MetadataKeys, SensitiveData
|
|
11
11
|
from hud.utils.common import FunctionConfig, FunctionConfigs
|
|
12
12
|
|
|
13
13
|
if TYPE_CHECKING:
|
|
@@ -40,28 +40,78 @@ class Task(BaseModel):
|
|
|
40
40
|
Attributes:
|
|
41
41
|
id: The remote task ID (optional if local-only)
|
|
42
42
|
prompt: The task prompt or instruction
|
|
43
|
+
system_prompt: The system prompt for the evalset (optional)
|
|
43
44
|
setup: Environment setup configuration (optional)
|
|
44
45
|
evaluate: Configuration for evaluating responses
|
|
45
46
|
metadata: Additional task metadata
|
|
47
|
+
sensitive_data: Sensitive data such as API keys, passwords, etc.
|
|
46
48
|
choices: Multiple choice answer list (for Inspect compatibility)
|
|
47
49
|
target: Ideal target output (for Inspect compatibility)
|
|
48
50
|
files: Files that go along with the task (for Inspect compatibility)
|
|
49
51
|
gym: Environment specification
|
|
50
52
|
"""
|
|
51
53
|
|
|
52
|
-
id: str | None = None
|
|
53
|
-
|
|
54
|
+
id: str | None = None # Remote task ID (optional if local-only)
|
|
55
|
+
|
|
56
|
+
prompt: str # Task prompt or instruction
|
|
57
|
+
system_prompt: str | None = None # System prompt for the evalset (optional)
|
|
58
|
+
|
|
59
|
+
gym: Gym | None = None # Environment specification
|
|
60
|
+
|
|
61
|
+
# Setup and evaluate configurations for the environment (environment specific)
|
|
54
62
|
setup: FunctionConfigs | None = None
|
|
55
63
|
evaluate: FunctionConfigs | None = None
|
|
56
|
-
|
|
64
|
+
|
|
65
|
+
# Overflow configuration for environments that don't conform to the standard
|
|
57
66
|
config: dict[str, Any] | None = None
|
|
58
67
|
|
|
68
|
+
# Sensitive data such as API keys, passwords, etc.
|
|
69
|
+
sensitive_data: SensitiveData = Field(default_factory=dict)
|
|
70
|
+
|
|
71
|
+
# Metadata for the task evaluation, information about the agent (see MetadataKeys)
|
|
72
|
+
metadata: dict[MetadataKeys, Any] = Field(default_factory=dict)
|
|
73
|
+
|
|
74
|
+
# Description of the task, for extra information about its purpose and context
|
|
59
75
|
description: str | None = None
|
|
60
76
|
|
|
61
77
|
@classmethod
|
|
62
78
|
def from_dict(cls, data: dict[str, Any]) -> Task:
|
|
63
79
|
return cls(**data)
|
|
64
80
|
|
|
81
|
+
@classmethod
|
|
82
|
+
def from_serialized(cls, data: dict[str, Any]) -> Task:
|
|
83
|
+
gym_data = data.get("gym")
|
|
84
|
+
parsed_gym: Gym | None = gym_data
|
|
85
|
+
|
|
86
|
+
parsed_setup = [(param, entry) for param, entry in data.get("setup", [])]
|
|
87
|
+
parsed_evaluate = [(param, entry) for param, entry in data.get("evaluate", [])]
|
|
88
|
+
|
|
89
|
+
# Convert dict gym data to CustomGym if needed
|
|
90
|
+
if (
|
|
91
|
+
isinstance(gym_data, dict)
|
|
92
|
+
and gym_data.get("type") == "public"
|
|
93
|
+
and gym_data.get("location") in ("local", "remote")
|
|
94
|
+
and gym_data.get("image_or_build_context") is not None
|
|
95
|
+
):
|
|
96
|
+
parsed_gym = CustomGym(
|
|
97
|
+
type=cast("Literal['public']", gym_data["type"]),
|
|
98
|
+
location=cast("Literal['local', 'remote']", gym_data["location"]),
|
|
99
|
+
image_or_build_context=Path(gym_data["image_or_build_context"]),
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
return cls(
|
|
103
|
+
id=data.get("id"),
|
|
104
|
+
prompt=data.get("prompt", ""),
|
|
105
|
+
system_prompt=data.get("system_prompt"),
|
|
106
|
+
setup=parsed_setup,
|
|
107
|
+
evaluate=parsed_evaluate,
|
|
108
|
+
gym=parsed_gym,
|
|
109
|
+
config=data.get("config"),
|
|
110
|
+
description=data.get("description"),
|
|
111
|
+
sensitive_data=data.get("sensitive_data", {}),
|
|
112
|
+
metadata=data.get("metadata", {}),
|
|
113
|
+
)
|
|
114
|
+
|
|
65
115
|
@classmethod
|
|
66
116
|
def from_inspect_sample(cls, sample: Sample) -> Task:
|
|
67
117
|
"""Create a Task from an Inspect dataset sample.
|
|
@@ -144,3 +194,31 @@ class Task(BaseModel):
|
|
|
144
194
|
if self.gym is None:
|
|
145
195
|
return
|
|
146
196
|
self.gym = agent.transfer_gyms.get(self.gym, self.gym)
|
|
197
|
+
|
|
198
|
+
def serialize(self) -> dict[str, Any]:
|
|
199
|
+
if isinstance(self.setup, list):
|
|
200
|
+
parsed_setup = [[param, entry] for param, entry in self.setup]
|
|
201
|
+
else:
|
|
202
|
+
parsed_setup = self.setup
|
|
203
|
+
if isinstance(self.evaluate, list):
|
|
204
|
+
parsed_evaluate = [[param, entry] for param, entry in self.evaluate]
|
|
205
|
+
else:
|
|
206
|
+
parsed_evaluate = self.evaluate
|
|
207
|
+
|
|
208
|
+
if isinstance(self.gym, CustomGym):
|
|
209
|
+
parsed_gym = self.gym.model_dump()
|
|
210
|
+
parsed_gym["image_or_build_context"] = str(parsed_gym["image_or_build_context"])
|
|
211
|
+
else: # is ServerGym
|
|
212
|
+
parsed_gym = self.gym
|
|
213
|
+
|
|
214
|
+
return {
|
|
215
|
+
"id": self.id,
|
|
216
|
+
"prompt": self.prompt,
|
|
217
|
+
"config": self.config,
|
|
218
|
+
"description": self.description,
|
|
219
|
+
"setup": parsed_setup,
|
|
220
|
+
"evaluate": parsed_evaluate,
|
|
221
|
+
"gym": parsed_gym,
|
|
222
|
+
"sensitive_data": self.sensitive_data,
|
|
223
|
+
"metadata": self.metadata,
|
|
224
|
+
}
|
hud/taskset.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
3
|
+
from pathlib import PosixPath
|
|
4
|
+
from typing import TYPE_CHECKING, Any, get_args
|
|
4
5
|
from venv import logger
|
|
5
6
|
|
|
6
7
|
from pydantic import BaseModel
|
|
@@ -9,6 +10,7 @@ from hud.env.environment import create_remote_config
|
|
|
9
10
|
from hud.server import make_request
|
|
10
11
|
from hud.settings import settings
|
|
11
12
|
from hud.task import Task
|
|
13
|
+
from hud.types import CustomGym, ServerGym
|
|
12
14
|
from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP
|
|
13
15
|
|
|
14
16
|
if TYPE_CHECKING:
|
|
@@ -101,13 +103,30 @@ class TaskSet(BaseModel):
|
|
|
101
103
|
else:
|
|
102
104
|
evaluate_config = None
|
|
103
105
|
|
|
106
|
+
if isinstance(task.gym, CustomGym):
|
|
107
|
+
if isinstance(task.gym.image_or_build_context, PosixPath):
|
|
108
|
+
raise ValueError(
|
|
109
|
+
"Local build contexts are not supported for "
|
|
110
|
+
"remote tasksets, attach an image or existing "
|
|
111
|
+
"gym id."
|
|
112
|
+
)
|
|
113
|
+
gym_str = "docker"
|
|
114
|
+
image_uri = task.gym.image_or_build_context
|
|
115
|
+
elif isinstance(task.gym, str) and task.gym in get_args(ServerGym):
|
|
116
|
+
gym_str = task.gym
|
|
117
|
+
image_uri = None
|
|
118
|
+
else:
|
|
119
|
+
raise ValueError(f"Unknown gym type: {type(task.gym)}")
|
|
120
|
+
|
|
104
121
|
processed_tasks.append(
|
|
105
122
|
{
|
|
106
123
|
"prompt": task.prompt,
|
|
107
|
-
"gym":
|
|
124
|
+
"gym": gym_str,
|
|
108
125
|
"setup": setup_config,
|
|
109
126
|
"evaluate": evaluate_config,
|
|
110
127
|
"config": task.config,
|
|
128
|
+
"image_uri": image_uri,
|
|
129
|
+
"description": task.description,
|
|
111
130
|
}
|
|
112
131
|
)
|
|
113
132
|
|
|
@@ -125,7 +144,15 @@ class TaskSet(BaseModel):
|
|
|
125
144
|
"Taskset %s uploaded successfully, see it on app.hud.so/evalsets/%s", name, name
|
|
126
145
|
)
|
|
127
146
|
|
|
128
|
-
|
|
147
|
+
def _apply(self, dict: dict[str, Any]) -> None:
|
|
148
|
+
"""
|
|
149
|
+
Applies a parameter to all tasks in the taskset.
|
|
150
|
+
"""
|
|
151
|
+
for task in self.tasks:
|
|
152
|
+
for key, value in dict.items():
|
|
153
|
+
setattr(task, key, value)
|
|
154
|
+
|
|
155
|
+
def fit(self, agent: Agent | type[Agent]) -> None:
|
|
129
156
|
"""
|
|
130
157
|
Automatically adapts the taskset to the agent's transfer_gyms.
|
|
131
158
|
"""
|
|
@@ -133,19 +160,27 @@ class TaskSet(BaseModel):
|
|
|
133
160
|
agent = agent()
|
|
134
161
|
|
|
135
162
|
for task in self.tasks:
|
|
136
|
-
if task.gym is None:
|
|
163
|
+
if task.gym is None or isinstance(task.gym, CustomGym):
|
|
137
164
|
continue
|
|
138
165
|
task.gym = agent.transfer_gyms.get(task.gym, task.gym)
|
|
139
166
|
|
|
140
167
|
|
|
141
|
-
async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
|
|
168
|
+
async def load_taskset(
|
|
169
|
+
taskset_id: str,
|
|
170
|
+
api_key: str | None = None,
|
|
171
|
+
metadata: dict[str, Any] | None = None,
|
|
172
|
+
load_custom_as_local: bool = False,
|
|
173
|
+
system_prompt: str | None = None,
|
|
174
|
+
) -> TaskSet:
|
|
142
175
|
"""
|
|
143
176
|
Loads a TaskSet by its ID.
|
|
144
177
|
|
|
145
178
|
Args:
|
|
146
179
|
taskset_id: The ID of the taskset to load
|
|
147
180
|
api_key: Optional API key to use for the request
|
|
148
|
-
|
|
181
|
+
metadata: Optional metadata to apply to the taskset
|
|
182
|
+
load_custom_as_local: Whether to load custom gyms as local
|
|
183
|
+
system_prompt: Optional system prompt to override the default
|
|
149
184
|
Returns:
|
|
150
185
|
TaskSet: The loaded taskset
|
|
151
186
|
"""
|
|
@@ -161,13 +196,33 @@ async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
|
|
|
161
196
|
|
|
162
197
|
logger.info(f"Taskset {taskset_id} loaded successfully")
|
|
163
198
|
|
|
164
|
-
|
|
199
|
+
tasks = data["evalset"]
|
|
200
|
+
for task in tasks:
|
|
201
|
+
if system_prompt:
|
|
202
|
+
task["system_prompt"] = system_prompt
|
|
203
|
+
if task["gym"] == "docker":
|
|
204
|
+
if "image_uri" not in task:
|
|
205
|
+
raise ValueError(
|
|
206
|
+
"No `image_uri` key found. This taskset may be "
|
|
207
|
+
"incompatible with your version of HUD SDK."
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
task["gym"] = CustomGym(
|
|
211
|
+
location="local" if load_custom_as_local else "remote",
|
|
212
|
+
image_or_build_context=task["image_uri"],
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
taskset = TaskSet.model_validate(
|
|
165
216
|
{
|
|
166
217
|
"id": taskset_id,
|
|
167
|
-
"tasks":
|
|
218
|
+
"tasks": tasks,
|
|
168
219
|
}
|
|
169
220
|
)
|
|
170
221
|
|
|
222
|
+
taskset._apply({"metadata": metadata})
|
|
223
|
+
|
|
224
|
+
return taskset
|
|
225
|
+
|
|
171
226
|
|
|
172
227
|
def load_from_inspect(dataset: Dataset) -> TaskSet:
|
|
173
228
|
"""
|
hud/telemetry/exporter.py
CHANGED
|
@@ -298,12 +298,10 @@ async def _export_trace_payload_async(payload: dict[str, Any]) -> None:
|
|
|
298
298
|
"telemetry": payload.get("mcp_calls", []),
|
|
299
299
|
}
|
|
300
300
|
|
|
301
|
-
|
|
302
|
-
# if not data_to_send["mcp_calls"]:
|
|
303
|
-
# logger.debug("No MCP calls in payload for task run %s, skipping specific export if "
|
|
304
|
-
# "desired.", task_run_id)
|
|
305
|
-
# # Depending on backend, might not want to send empty mcp_calls list, or it's fine.
|
|
301
|
+
await send_telemetry_to_server(task_run_id, data_to_send)
|
|
306
302
|
|
|
303
|
+
|
|
304
|
+
async def send_telemetry_to_server(task_run_id: str, data: dict[str, Any]) -> None:
|
|
307
305
|
telemetry_url = f"{settings.base_url}/v2/task_runs/{task_run_id}/telemetry-upload"
|
|
308
306
|
|
|
309
307
|
try:
|
|
@@ -320,7 +318,7 @@ async def _export_trace_payload_async(payload: dict[str, Any]) -> None:
|
|
|
320
318
|
)
|
|
321
319
|
response = await client.post(
|
|
322
320
|
telemetry_url,
|
|
323
|
-
json=
|
|
321
|
+
json=data, # Send the structured attributes and mcp_calls
|
|
324
322
|
headers=headers,
|
|
325
323
|
timeout=30.0,
|
|
326
324
|
)
|
hud/trajectory.py
CHANGED
|
@@ -6,6 +6,8 @@ import datetime
|
|
|
6
6
|
from IPython.display import HTML, Markdown, display
|
|
7
7
|
from pydantic import BaseModel, Field
|
|
8
8
|
|
|
9
|
+
from .adapters.common.types import LogType
|
|
10
|
+
|
|
9
11
|
|
|
10
12
|
class TrajectoryStep(BaseModel):
|
|
11
13
|
"""Model representing a single task run's trajectory information."""
|
|
@@ -13,6 +15,7 @@ class TrajectoryStep(BaseModel):
|
|
|
13
15
|
observation_url: str | None = None
|
|
14
16
|
observation_text: str | None = None
|
|
15
17
|
actions: list[dict]
|
|
18
|
+
logs: LogType | None = None
|
|
16
19
|
start_timestamp: str | None = None
|
|
17
20
|
end_timestamp: str | None = None
|
|
18
21
|
|