synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic.
- examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +186 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
- examples/multi_step/crafter_rl_lora.md +51 -10
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +7 -1
- examples/swe/task_app/grpo_swe_mini.py +55 -26
- examples/swe/task_app/hosted/rollout.py +40 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/task_app/__init__.py +2 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +21 -46
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +67 -49
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +242 -193
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
- examples/task_apps/pokemon_red/task_app.py +606 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
- examples/warming_up_to_rl/run_eval.py +127 -18
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +41 -1
- synth_ai/api/train/builders.py +73 -29
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +44 -0
- synth_ai/api/train/configs/rl.py +134 -0
- synth_ai/api/train/configs/sft.py +95 -0
- synth_ai/api/train/configs/shared.py +24 -0
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +7 -51
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +49 -43
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/rl_demo.py +86 -106
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/task_apps.py +1710 -186
- synth_ai/demos/core/cli.py +121 -159
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
- synth_ai/environments/examples/crafter_classic/environment.py +16 -0
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +30 -4
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/client.py +82 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +127 -0
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +14 -5
- synth_ai/task/contracts.py +124 -38
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +53 -0
- synth_ai/task/rubrics/loaders.py +133 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +113 -0
- synth_ai/task/rubrics/strict.py +149 -0
- synth_ai/task/server.py +8 -7
- synth_ai/task/validators.py +269 -6
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/native_manager.py +3 -3
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +228 -89
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -1
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
synth_ai/environments/examples/sokoban/taskset.py
CHANGED

@@ -1,5 +1,8 @@
+from __future__ import annotations
+
 import logging
 import os
+from collections.abc import Iterable, Sequence
 from dataclasses import asdict, dataclass, fields
 from typing import List, Tuple
 from uuid import UUID, uuid4
@@ -18,6 +21,7 @@ from synth_ai.environments.tasks.core import (
     TaskInstanceMetadataFilter,
     TaskInstanceSet,
 )
+from synth_ai.task.contracts import TaskInfo
 
 logger = logging.getLogger(__name__)
 
@@ -96,6 +100,118 @@ class SokobanTaskInstance(TaskInstance):
         return cls(**filtered_data)
 
 
+def _base_task_info_template() -> TaskInfo:
+    return TaskInfo(
+        task={"id": "sokoban", "name": "Sokoban", "version": "1.0.0"},
+        environment="sokoban",
+        action_space={
+            "type": "tool_call",
+            "tools": [{"name": "interact", "schema": {"action": "int"}}],
+            "max_calls": 1,
+        },
+        observation={"summary": "Sokoban grid observation", "keys": ["grid", "player"]},
+        dataset={"id": "sokoban", "name": "Sokoban", "version": "1.0.0"},
+        rubric={"version": "1", "criteria_count": 1, "source": "inline"},
+        inference={"supports_proxy": False},
+        capabilities={"supports_rollout": True, "supports_env_lifecycle": True},
+        limits={"max_turns": 200},
+    )
+
+
+class SokobanTaskSet:
+    """Minimal helper compatible with Task App expectations."""
+
+    def __init__(self) -> None:
+        self._taskset: TaskInstanceSet | None = None
+        self._seed_index: dict[int, SokobanTaskInstance] = {}
+        self._base_info = _base_task_info_template()
+
+    async def _ensure_loaded(self) -> TaskInstanceSet:
+        if self._taskset is None:
+            dataset = await create_sokoban_taskset()
+            self._taskset = dataset
+            self._seed_index.clear()
+            for inst in dataset.instances:
+                try:
+                    seed_value = int(getattr(inst.metadata, "seed"))
+                except Exception:
+                    continue
+                # Keep the first instance encountered for a seed
+                self._seed_index.setdefault(seed_value, inst)
+        return self._taskset
+
+    def describe(self) -> dict[str, object]:
+        if not self._taskset:
+            return {"id": "sokoban", "name": "Sokoban"}
+        return {
+            "id": "sokoban",
+            "name": self._taskset.name,
+            "description": self._taskset.description,
+            "instance_count": len(self._taskset.instances),
+        }
+
+    async def provide_task_instances(self, seeds: Sequence[int]) -> Iterable[TaskInfo]:
+        await self._ensure_loaded()
+        if not seeds:
+            return []
+
+        infos: list[TaskInfo] = []
+        for raw_seed in seeds:
+            try:
+                seed_value = int(raw_seed)
+            except Exception:
+                continue
+
+            instance = self._seed_index.get(seed_value)
+            if instance is None:
+                # Attempt to construct on the fly; try configured difficulties in order
+                for difficulty in DIFFICULTY_CONFIGS:
+                    try:
+                        instance = await create_task_instance_from_seed(difficulty, seed_value)
+                        break
+                    except Exception:
+                        continue
+                if instance is None:
+                    continue
+                self._seed_index[seed_value] = instance
+
+            metadata = getattr(instance, "metadata", None)
+            base_info = self._base_info.model_copy(deep=True)
+
+            observation = dict(base_info.observation)
+            dataset_info = dict(base_info.dataset)
+            task_metadata = {"seed": seed_value}
+
+            if metadata is not None:
+                for key in ("difficulty", "num_boxes", "dim_room", "max_steps", "shortest_path_length"):
+                    value = getattr(metadata, key, None)
+                    if value is not None:
+                        observation[key] = value
+                        task_metadata[key] = value
+                dataset_info.update(
+                    {
+                        "seed": getattr(metadata, "seed", seed_value),
+                        "difficulty": getattr(metadata, "difficulty", None),
+                        "num_boxes": getattr(metadata, "num_boxes", None),
+                        "dim_room": getattr(metadata, "dim_room", None),
+                    }
+                )
+                generation_params = getattr(metadata, "generation_params", None)
+                if generation_params is not None:
+                    task_metadata["generation_params"] = generation_params
+
+            infos.append(
+                base_info.model_copy(
+                    update={
+                        "observation": observation,
+                        "dataset": dataset_info,
+                        "task_metadata": task_metadata,
+                    }
+                )
+            )
+        return infos
+
+
 async def create_sokoban_taskset() -> TaskInstanceSet:
     """Generates Sokoban task instances from pre-generated verified puzzles."""
     instances = []

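A minimal usage sketch of the new SokobanTaskSet helper above (not part of the diff). The import path follows the file list, and the `task_metadata` attribute on the returned TaskInfo objects is assumed from the `model_copy(update=...)` call in the diff:

```python
# Illustrative only; assumes create_sokoban_taskset() can run in this environment.
import asyncio

from synth_ai.environments.examples.sokoban.taskset import SokobanTaskSet


async def main() -> None:
    taskset = SokobanTaskSet()
    # Lazily loads the TaskInstanceSet, then returns one TaskInfo per seed with
    # per-seed metadata (difficulty, num_boxes, dim_room, ...) merged in.
    infos = await taskset.provide_task_instances([0, 1, 2])
    for info in infos:
        print(info.task_metadata)


asyncio.run(main())
```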
synth_ai/environments/examples/verilog/engine.py
CHANGED

@@ -67,6 +67,16 @@ class VerilogStepPenaltyComponent(RewardComponent):
         return self.penalty
 
 
+class VerilogSubmitSuccessComponent(RewardComponent):
+    """Reward for successful submission (tests passed)."""
+    async def score(self, state: VerilogPublicState, action: Any) -> float:
+        if hasattr(action, "get") and action.get("type") == "submit":
+            # Check if submission passed
+            if action.get("passed", False):
+                return 10.0  # Large reward for completing the task correctly
+        return 0.0
+
+
 class VerilogEngine(StatefulEngine):
     """
     Stateful Verilog evaluation engine with persistent artifact snapshots.
@@ -81,6 +91,7 @@ class VerilogEngine(StatefulEngine):
             components=[
                 VerilogCompileSuccessComponent(),
                 VerilogSimulationPassComponent(),
+                VerilogSubmitSuccessComponent(),
                 VerilogStepPenaltyComponent(penalty=-0.01),
             ]
         )
@@ -284,13 +295,28 @@ class VerilogEngine(StatefulEngine):
 
     async def submit(self) -> Dict[str, Any]:
         """Submit solution for grading."""
-        #
-        #
+        # Check if the last simulation passed
+        # Parse the last simulation output to determine if tests passed
+        passed = False
+        detail = "No simulation run yet"
+
+        if self._last_simulate_output:
+            stdout = self._last_simulate_output
+            passed = (
+                "ALL_TESTS_PASSED" in stdout
+                or ("Mismatches: 0 " in stdout and "samples" in stdout)
+                or ("no mismatches" in stdout.lower() and "errors" not in stdout.lower())
+            )
+            if passed:
+                detail = "All tests passed"
+            else:
+                detail = "Tests failed - please review simulation output"
+
         return {
             "ok": True,
             "type": "submit",
-            "passed":
-            "detail":
+            "passed": passed,
+            "detail": detail,
             "submitted": True,
         }
 

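For reference, the pass-detection heuristic introduced in `submit()` can be read as a standalone predicate over the captured simulation stdout. This is a sketch mirroring the diff; the sample strings are illustrative, not real testbench output:

```python
def simulation_passed(stdout: str) -> bool:
    """Mirror of the heuristic added to VerilogEngine.submit()."""
    return (
        "ALL_TESTS_PASSED" in stdout
        or ("Mismatches: 0 " in stdout and "samples" in stdout)
        or ("no mismatches" in stdout.lower() and "errors" not in stdout.lower())
    )


# Example classifications under this heuristic (made-up output lines):
assert simulation_passed("Mismatches: 0 in 20 samples")
assert not simulation_passed("Mismatches: 3 in 20 samples")
```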
synth_ai/evals/__init__.py
ADDED

@@ -0,0 +1,15 @@
+from .client import JudgeClient, JudgeOptions, JudgeScoreResponse
+from .types import Judgement, RewardJudgement, RewardMetadata, Track, TrackAggregate
+
+__all__ = [
+    "JudgeClient",
+    "JudgeOptions",
+    "JudgeScoreResponse",
+    "Judgement",
+    "RewardJudgement",
+    "RewardMetadata",
+    "Track",
+    "TrackAggregate",
+]
+
+

synth_ai/evals/client.py
ADDED

@@ -0,0 +1,82 @@
+"""Experimental Judge API client.
+
+This surface is experimental and subject to change without notice.
+Set environment variable `SYNTH_SILENCE_EXPERIMENTAL=1` to silence warnings.
+"""
+
+from __future__ import annotations
+
+import os
+import warnings
+from typing import Any, Literal, TypedDict
+
+from synth_ai.http import AsyncHttpClient, HTTPError
+from synth_ai.tracing_v3.serialization import normalize_for_json
+
+Provider = Literal["groq", "gemini"]
+
+
+class JudgeOptions(TypedDict, total=False):
+    event: bool
+    outcome: bool
+    rubric_id: str
+    rubric_overrides: dict[str, Any]
+    provider: Provider
+    model: str
+    max_concurrency: int
+
+
+class JudgeScoreResponse(TypedDict, total=False):
+    status: str
+    event_rewards: list[dict[str, Any]]
+    outcome_reward: dict[str, Any]
+    details: dict[str, Any]
+
+
+class JudgeClient:
+    def __init__(self, base_url: str, api_key: str, *, timeout: float = 60.0) -> None:
+        _silence = (os.getenv("SYNTH_SILENCE_EXPERIMENTAL") or "").strip().lower()
+        if _silence not in {"1", "true", "t", "yes", "y", "on"}:
+            warnings.warn(
+                "Experimental API: synth_ai.evals.JudgeClient is experimental and may change without notice.",
+                UserWarning,
+                stacklevel=2,
+            )
+        self._base = base_url.rstrip("/")
+        self._key = api_key
+        self._timeout = timeout
+
+    async def score(
+        self,
+        *,
+        trace: dict[str, Any] | Any,
+        policy_name: str,
+        task_app_id: str,
+        options: JudgeOptions,
+        task_app_base_url: str | None = None,
+    ) -> JudgeScoreResponse:
+        body = {
+            "policy_name": policy_name,
+            "task_app": {"id": task_app_id, **({"base_url": task_app_base_url} if task_app_base_url else {})},
+            "trace": normalize_for_json(trace),
+            "options": options or {},
+        }
+        try:
+            async with AsyncHttpClient(self._base, self._key, timeout=self._timeout) as http:
+                js = await http.post_json("/api/judge/v1/score", json=body)
+            if not isinstance(js, dict):
+                raise ValueError("invalid_judge_response_shape")
+            return js  # type: ignore[return-value]
+        except HTTPError as err:  # map to friendlier exceptions
+            status = int(getattr(err, "status", 0) or 0)
+            if status in (400, 422):
+                raise ValueError(f"judge_validation_error: {err.detail}") from err
+            if status in (401, 403):
+                raise PermissionError(f"judge_auth_error: {err.detail}") from err
+            if status == 404:
+                raise FileNotFoundError(f"judge_route_not_found: {err.detail}") from err
+            if status == 429:
+                raise Exception("judge_rate_limited") from err  # replace with RetryLater in future
+            if status >= 500:
+                raise Exception("judge_transient_error") from err  # replace with TransientError in future
+            raise

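A hypothetical call to the experimental client above (not part of the diff). The base URL, API key, task app id, and model name below are placeholders, not values taken from the package:

```python
# Placeholder credentials and identifiers; only the call shape follows the diff.
import asyncio

from synth_ai.evals import JudgeClient


async def main() -> None:
    client = JudgeClient("https://backend.example.com", "sk-demo")
    result = await client.score(
        trace={"event_history": [], "metadata": {}},
        policy_name="crafter-react",
        task_app_id="grpo-crafter",
        options={"provider": "groq", "model": "example-judge-model", "event": True, "outcome": True},
    )
    print(result.get("status"), result.get("outcome_reward"))


asyncio.run(main())
```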
synth_ai/evals/types.py
ADDED

@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from typing import Literal, TypedDict
+
+Track = Literal["process", "reasoning", "progress", "outcome"]
+
+
+class Judgement(TypedDict, total=False):
+    key: str
+    title: str
+    description: str
+    score: float
+    reason: str
+    confidence: float
+    scale: Literal["binary", "bounded", "count", "custom"]
+    source: dict
+
+
+class RewardJudgement(TypedDict, total=False):
+    judgement: Judgement
+    scope: Literal["step", "event", "outcome"]
+    turn: int | None
+    episode_id: str | None
+    reward_value: float | None
+    links: dict
+
+
+class TrackAggregate(TypedDict, total=False):
+    mean: float
+    median: float
+    std: float
+    n: int
+
+
+class RewardMetadata(TypedDict, total=False):
+    per_window: list[RewardJudgement]
+    aggregates: dict[Track, TrackAggregate]
+    overall: dict[str, float]  # {"final_outcome_score": float}
+    rubric: dict  # {"ids": {...}, "hash": "..."}
+    model_info: dict  # {"model": "...", ...}
+
+

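A small sketch of how these TypedDicts compose (the keys exist in the diff; the values are illustrative only):

```python
from synth_ai.evals.types import Judgement, RewardJudgement, RewardMetadata

judgement: Judgement = {
    "key": "progress.collect_wood",
    "score": 1.0,
    "reason": "Agent collected wood within the window.",
    "scale": "binary",
}
reward: RewardJudgement = {
    "judgement": judgement,
    "scope": "event",
    "turn": 3,
    "reward_value": 1.0,
}
metadata: RewardMetadata = {
    "per_window": [reward],
    "aggregates": {"progress": {"mean": 1.0, "n": 1}},
    "overall": {"final_outcome_score": 1.0},
}
```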
synth_ai/jobs/client.py
CHANGED

@@ -1,20 +1,32 @@
 from __future__ import annotations
 
 import importlib
-from
+from collections.abc import Callable
+from typing import Any, cast
 
 try:
-
+    _supported_module = cast(
+        Any, importlib.import_module("synth_ai.api.models.supported")
+    )
+    normalize_model_identifier = cast(
+        Callable[[str], str], _supported_module.normalize_model_identifier
+    )
 except Exception as exc:  # pragma: no cover - critical dependency
     raise RuntimeError("Unable to load supported model utilities") from exc
 
 try:
-
+    _http_module = cast(Any, importlib.import_module("synth_ai.http"))
+    AsyncHttpClient = _http_module.AsyncHttpClient
 except Exception as exc:  # pragma: no cover - critical dependency
     raise RuntimeError("Unable to load HTTP client") from exc
 
 try:
-
+    _sft_config_module = cast(
+        Any, importlib.import_module("synth_ai.learning.sft.config")
+    )
+    prepare_sft_job_payload = cast(
+        Callable[..., dict[str, Any]], _sft_config_module.prepare_sft_job_payload
+    )
 except Exception as exc:  # pragma: no cover - critical dependency
     raise RuntimeError("Unable to load SFT configuration helpers") from exc
 

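The change above swaps plain imports for `importlib.import_module` lookups wrapped in `cast(...)`, so type checkers accept the dynamically resolved attributes while an import failure still surfaces as a clear RuntimeError. A generic sketch of the same pattern, using `json` as a stand-in module rather than the synth_ai internals:

```python
import importlib
from collections.abc import Callable
from typing import Any, cast

try:
    # Resolve the module at import time and give the attribute a typed alias.
    _module = cast(Any, importlib.import_module("json"))
    dumps = cast(Callable[..., str], _module.dumps)
except Exception as exc:  # pragma: no cover - critical dependency
    raise RuntimeError("Unable to load serialization helpers") from exc
```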
synth_ai/judge_schemas.py
ADDED

@@ -0,0 +1,127 @@
+"""
+Judge API Contract Schemas
+
+These schemas define the expected structure for requests and responses
+to the judge scoring endpoint at POST /api/judge/v1/score.
+
+This is the canonical contract that the backend MUST conform to.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, Field
+
+
+class CriterionScorePayload(BaseModel):
+    """Per-criterion score returned by the judge."""
+
+    score: float = Field(..., description="Numeric score for this criterion")
+    reason: str = Field(default="", description="Explanation for the score")
+    weight: float = Field(default=1.0, description="Weight of this criterion")
+    description: str = Field(default="", description="Description of the criterion")
+
+
+class ReviewPayload(BaseModel):
+    """Rubric review (event-level or outcome-level)."""
+
+    criteria: dict[str, CriterionScorePayload] = Field(
+        default_factory=dict,
+        description="Map of criterion keys to their scores"
+    )
+    total: float = Field(default=0.0, description="Aggregated total score")
+    summary: str | None = Field(None, description="Optional text summary")
+
+
+class JudgeScoreResponse(BaseModel):
+    """
+    Response body for POST /api/judge/v1/score.
+
+    This is the canonical contract that judge backends MUST return.
+    """
+
+    status: Literal["ok", "failed"] = Field(default="ok", description="Request status")
+    event_reviews: list[ReviewPayload] = Field(
+        default_factory=list,
+        description="List of per-event rubric reviews (one per step)"
+    )
+    outcome_review: ReviewPayload | None = Field(
+        None,
+        description="Optional outcome-level rubric review"
+    )
+    event_totals: list[float] = Field(
+        default_factory=list,
+        description="List of aggregated scores per event (matches event_reviews length)"
+    )
+    details: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Additional details (provider, latency, etc.)"
+    )
+    metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Request metadata (provider, options, etc.)"
+    )
+
+    def aggregate_event_reward(self) -> float | None:
+        """
+        Aggregate all event totals into a single reward.
+
+        Returns:
+            Sum of all event_totals, or None if empty
+        """
+        if not self.event_totals:
+            return None
+        return sum(self.event_totals)
+
+    def aggregate_outcome_reward(self) -> float | None:
+        """
+        Extract outcome reward from outcome_review.
+
+        Returns:
+            outcome_review.total, or None if no outcome review
+        """
+        if self.outcome_review is None:
+            return None
+        return self.outcome_review.total
+
+
+# Request schemas for completeness
+
+class JudgeTaskApp(BaseModel):
+    """Task application metadata."""
+
+    id: str = Field(..., description="Task app identifier")
+    base_url: str | None = Field(None, description="Optional base URL for task app")
+
+
+class JudgeOptions(BaseModel):
+    """Judge provider and configuration options."""
+
+    provider: str | None = Field(None, description="Judge provider (e.g., 'openai', 'groq')")
+    model: str | None = Field(None, description="Model identifier")
+    rubric_id: str | None = Field(None, description="Rubric identifier")
+    event: bool = Field(True, description="Enable event-level judging")
+    outcome: bool = Field(True, description="Enable outcome-level judging")
+
+
+class JudgeTracePayload(BaseModel):
+    """Trace payload containing trajectory context."""
+
+    event_history: list[dict[str, Any]] = Field(..., description="List of events/steps")
+    markov_blanket_message_history: list[dict[str, Any]] = Field(
+        default_factory=list,
+        description="Optional message history for context"
+    )
+    metadata: dict[str, Any] = Field(default_factory=dict, description="Trace metadata")
+
+
+class JudgeScoreRequest(BaseModel):
+    """Request body for POST /api/judge/v1/score."""
+
+    policy_name: str = Field(..., description="Name of the policy being evaluated")
+    task_app: JudgeTaskApp = Field(..., description="Task application metadata")
+    trace: JudgeTracePayload = Field(..., description="Trajectory trace to evaluate")
+    options: JudgeOptions = Field(default_factory=lambda: JudgeOptions(), description="Judge options")
+    rubric: dict[str, Any] | None = Field(None, description="Optional explicit rubric criteria")
+

synth_ai/py.typed
ADDED

File without changes

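A minimal round-trip through the new response schema, exercising the aggregation helpers (the criterion keys and scores are made up for the example):

```python
from synth_ai.judge_schemas import CriterionScorePayload, JudgeScoreResponse, ReviewPayload

response = JudgeScoreResponse(
    status="ok",
    event_reviews=[
        ReviewPayload(
            criteria={"process.tool_use": CriterionScorePayload(score=0.5, reason="partial")},
            total=0.5,
        )
    ],
    outcome_review=ReviewPayload(criteria={}, total=1.0, summary="Task completed"),
    event_totals=[0.5],
)

# Event reward sums event_totals; outcome reward reads outcome_review.total.
assert response.aggregate_event_reward() == 0.5
assert response.aggregate_outcome_reward() == 1.0
```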
synth_ai/task/__init__.py
CHANGED

@@ -5,6 +5,9 @@ from .auth import (
 )
 from .client import TaskAppClient
 from .contracts import (
+    DatasetInfo,
+    InferenceInfo,
+    LimitsInfo,
     RolloutEnvSpec,
     RolloutMetrics,
     RolloutPolicySpec,
@@ -14,8 +17,10 @@ from .contracts import (
     RolloutSafetyConfig,
     RolloutStep,
     RolloutTrajectory,
-
+    RubricInfo,
+    RubricSection,
     TaskAppEndpoints,
+    TaskDescriptor,
     TaskInfo,
 )
 from .datasets import TaskDatasetRegistry, TaskDatasetSpec
@@ -23,7 +28,6 @@ from .errors import error_payload, http_exception, json_error_response
 from .health import task_app_health
 from .json import to_jsonable
 from .proxy import (
-    INTERACT_TOOL_SCHEMA,
     extract_message_text,
     inject_system_hint,
     parse_tool_call_from_text,
@@ -46,7 +50,7 @@ from .server import (
     create_task_app,
     run_task_app,
 )
-from .validators import validate_task_app_url
+from .validators import validate_task_app_endpoint, validate_task_app_url
 from .vendors import (
     get_groq_key_or_503,
     get_openai_key_or_503,
@@ -55,8 +59,8 @@ from .vendors import (
 
 __all__ = [
     "validate_task_app_url",
+    "validate_task_app_endpoint",
     "task_app_health",
-    "TaskAppContract",
     "TaskAppEndpoints",
     "RolloutEnvSpec",
     "RolloutPolicySpec",
@@ -67,6 +71,12 @@ __all__ = [
     "RolloutTrajectory",
     "RolloutStep",
     "RolloutMetrics",
+    "TaskDescriptor",
+    "DatasetInfo",
+    "RubricInfo",
+    "RubricSection",
+    "InferenceInfo",
+    "LimitsInfo",
     "TaskInfo",
     "to_jsonable",
     "normalize_environment_api_key",
@@ -75,7 +85,6 @@ __all__ = [
     "normalize_vendor_keys",
     "get_openai_key_or_503",
     "get_groq_key_or_503",
-    "INTERACT_TOOL_SCHEMA",
     "prepare_for_openai",
     "prepare_for_groq",
     "inject_system_hint",