synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
- examples/swe/task_app/grpo_swe_mini.py +55 -26
- examples/swe/task_app/hosted/rollout.py +40 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/task_app/__init__.py +2 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
- examples/task_apps/pokemon_red/task_app.py +606 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +2 -2
- synth_ai/api/train/builders.py +25 -11
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +10 -10
- synth_ai/api/train/configs/rl.py +5 -4
- synth_ai/api/train/configs/sft.py +4 -3
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +7 -51
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +14 -7
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/rl_demo.py +8 -7
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/task_apps.py +1707 -186
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +30 -4
- synth_ai/evals/client.py +58 -61
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +16 -16
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +14 -5
- synth_ai/task/contracts.py +124 -38
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +53 -0
- synth_ai/task/rubrics/loaders.py +133 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +113 -0
- synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
- synth_ai/task/server.py +8 -7
- synth_ai/task/validators.py +269 -6
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +5 -5
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/native_manager.py +3 -3
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
- examples/agora_ex/README_MoE.md +0 -224
- examples/agora_ex/__init__.py +0 -7
- examples/agora_ex/agora_ex.py +0 -65
- examples/agora_ex/agora_ex_task_app.py +0 -590
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
- examples/agora_ex/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/system_prompt_CURRENT.md +0 -63
- examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
- examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
- synth_ai/rubrics/__init__.py +0 -22
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import requests
|
|
3
|
-
from typing import Any, Dict, Union
|
|
4
|
-
|
|
5
|
-
# ---------------------------------------------------------------------------
|
|
6
|
-
# Run configuration defaults (override via kwargs when invoking reward_fn)
|
|
7
|
-
# ---------------------------------------------------------------------------
|
|
8
|
-
RUN_TYPE: str = "rl_training_human"
|
|
9
|
-
RUN_VERSION: float = 3.5
|
|
10
|
-
MODEL_NAME: str = "Qwen3-30B-A3B-Instruct"
|
|
11
|
-
EXPERIMENT_NAME: str = "qwen3_30b_human"
|
|
12
|
-
USER_PROMPT_VERSION: str = "5.0"
|
|
13
|
-
SYSTEM_PROMPT_VERSION: str = "4.0"
|
|
14
|
-
|
|
15
|
-
logger = logging.getLogger(__name__)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def _coerce_step_value(value: Any) -> Union[int, None]:
|
|
19
|
-
try:
|
|
20
|
-
if value is None:
|
|
21
|
-
return None
|
|
22
|
-
return int(value)
|
|
23
|
-
except (TypeError, ValueError):
|
|
24
|
-
return None
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def _build_metadata(kwargs: Dict[str, Any]) -> Dict[str, Any]:
|
|
28
|
-
"""Compose the metadata payload sent with the evaluation request."""
|
|
29
|
-
base_metadata: Dict[str, Any] = {
|
|
30
|
-
"model": kwargs.get("model", MODEL_NAME),
|
|
31
|
-
"experiment": kwargs.get("experiment", EXPERIMENT_NAME),
|
|
32
|
-
"step_number": kwargs.get("step_number"),
|
|
33
|
-
"user_prompt": kwargs.get("user_prompt", USER_PROMPT_VERSION),
|
|
34
|
-
"batch_number": kwargs.get("batch_number"),
|
|
35
|
-
"prompt_index": kwargs.get("prompt_index"),
|
|
36
|
-
"rollout_group": kwargs.get("rollout_group"),
|
|
37
|
-
"system_prompt": kwargs.get("system_prompt", SYSTEM_PROMPT_VERSION),
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
step_metadata = kwargs.get("metadata") or {}
|
|
41
|
-
if isinstance(step_metadata, dict):
|
|
42
|
-
# Map common harness metadata onto our schema when present
|
|
43
|
-
step_value = _coerce_step_value(
|
|
44
|
-
step_metadata.get("step") or step_metadata.get("step_number")
|
|
45
|
-
)
|
|
46
|
-
if step_value is not None:
|
|
47
|
-
base_metadata["step_number"] = step_value
|
|
48
|
-
|
|
49
|
-
rollout_value = step_metadata.get("rollout_group")
|
|
50
|
-
if rollout_value is not None:
|
|
51
|
-
base_metadata["rollout_group"] = rollout_value
|
|
52
|
-
|
|
53
|
-
extras = step_metadata.get("extras")
|
|
54
|
-
if extras:
|
|
55
|
-
base_metadata["extras"] = extras
|
|
56
|
-
|
|
57
|
-
# Preserve any additional custom metadata fields
|
|
58
|
-
for key, value in step_metadata.items():
|
|
59
|
-
if key in {"step", "step_number", "rollout_group", "extras"}:
|
|
60
|
-
continue
|
|
61
|
-
if key not in base_metadata or base_metadata.get(key) is None:
|
|
62
|
-
base_metadata[key] = value
|
|
63
|
-
|
|
64
|
-
# Strip keys that remain None so the JSON is clean
|
|
65
|
-
return {key: value for key, value in base_metadata.items() if value is not None}
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def reward_fn(
|
|
69
|
-
completion: str,
|
|
70
|
-
**kwargs,
|
|
71
|
-
) -> float:
|
|
72
|
-
"""Evaluate the model response and return a reward score (0.0-1.0)."""
|
|
73
|
-
run_type = kwargs.get("run_type", RUN_TYPE)
|
|
74
|
-
run_version = kwargs.get("run_version", RUN_VERSION)
|
|
75
|
-
metadata = _build_metadata(kwargs)
|
|
76
|
-
|
|
77
|
-
payload = {
|
|
78
|
-
"code": completion,
|
|
79
|
-
"run_type": run_type,
|
|
80
|
-
"run_version": run_version,
|
|
81
|
-
"metadata": metadata,
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
try:
|
|
85
|
-
response = requests.post(
|
|
86
|
-
"https://eames-judge-api-769874896543.us-central1.run.app/evaluations-human",
|
|
87
|
-
json=payload,
|
|
88
|
-
timeout=1800, # 30 minute timeout for screenshot generation
|
|
89
|
-
)
|
|
90
|
-
response.raise_for_status()
|
|
91
|
-
result = response.json()
|
|
92
|
-
|
|
93
|
-
logger.info("Evaluation complete:")
|
|
94
|
-
logger.info(" Score: %s", result.get("score", 0.0))
|
|
95
|
-
logger.info(" Feedback: %s", result.get("explanation", "N/A"))
|
|
96
|
-
logger.info(" Processing Time (ms): %s", result.get("processing_time_ms", "N/A"))
|
|
97
|
-
logger.info(" Worker ID: %s", result.get("worker_id", "N/A"))
|
|
98
|
-
logger.info(" Success: %s", result.get("success", False))
|
|
99
|
-
if metadata:
|
|
100
|
-
logger.info(" Metadata sent: %s", metadata)
|
|
101
|
-
|
|
102
|
-
screenshot_urls = result.get("screenshot_urls", {}) or {}
|
|
103
|
-
if screenshot_urls:
|
|
104
|
-
logger.info(" Screenshot URLs:")
|
|
105
|
-
for key, url in screenshot_urls.items():
|
|
106
|
-
logger.info(" %s: %s", key.capitalize(), url)
|
|
107
|
-
|
|
108
|
-
score = result.get("score", 0.0)
|
|
109
|
-
if not isinstance(score, (int, float)):
|
|
110
|
-
logger.warning("Invalid score type: %s. Defaulting to 0.0", type(score))
|
|
111
|
-
return 0.0
|
|
112
|
-
|
|
113
|
-
return max(0.0, min(1.0, float(score)))
|
|
114
|
-
|
|
115
|
-
except requests.exceptions.Timeout:
|
|
116
|
-
logger.error("Request to evaluation server timed out")
|
|
117
|
-
return 0.0
|
|
118
|
-
|
|
119
|
-
except requests.exceptions.RequestException as exc:
|
|
120
|
-
logger.error("Request to evaluation server failed: %s", exc)
|
|
121
|
-
return 0.0
|
|
122
|
-
|
|
123
|
-
except (KeyError, ValueError, TypeError) as exc:
|
|
124
|
-
logger.error("Error parsing evaluation server response: %s", exc)
|
|
125
|
-
return 0.0
|
|
126
|
-
|
|
127
|
-
except Exception as exc: # pylint: disable=broad-except
|
|
128
|
-
logger.error("Unexpected error in reward_fn: %s", exc)
|
|
129
|
-
return 0.0
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
You are a powerful agentic AI coding assistant called Eames working with a Next.js + Shadcn/UI TypeScript project to generate exactly one complete landing page file for SaaS/Software products.
|
|
2
|
-
|
|
3
|
-
## OUTPUT FORMAT
|
|
4
|
-
|
|
5
|
-
Return ONLY a single TypeScript React component file.
|
|
6
|
-
- Wrap in one code fence: ```tsx ... ```
|
|
7
|
-
- No explanations, no additional text before or after the fence
|
|
8
|
-
- If you cannot generate valid code, return nothing
|
|
9
|
-
|
|
10
|
-
## FILE SPECIFICATION
|
|
11
|
-
|
|
12
|
-
File: app/page.tsx
|
|
13
|
-
Length: Target 800-1200 lines (flexible based on content richness)
|
|
14
|
-
Tech: Next.js 14 App Router, TypeScript, React, Tailwind CSS only
|
|
15
|
-
**IMPORTANT:** DO NOT use images, image imports, or next/image. Use SVG icons, Tailwind patterns, gradients, or CSS shapes instead.
|
|
16
|
-
|
|
17
|
-
## REQUIRED STRUCTURE (in this exact order)
|
|
18
|
-
|
|
19
|
-
1. Imports: NONE (do not import next/image or any other libraries)
|
|
20
|
-
|
|
21
|
-
2. Metadata export:
|
|
22
|
-
```tsx
|
|
23
|
-
export const metadata = {
|
|
24
|
-
title: "Page Title (max 80 chars)",
|
|
25
|
-
description: "Page description (max 160 chars)"
|
|
26
|
-
}
|
|
27
|
-
```
|
|
28
|
-
3. Helper components (if needed): Define small inline components AFTER the Page export
|
|
29
|
-
- Examples: FeatureCard, PricingCard, TestimonialCard, LogoItem
|
|
30
|
-
- Keep minimal, no deep nesting
|
|
31
|
-
|
|
32
|
-
## DESIGN GUIDELINES
|
|
33
|
-
Ship something interesting rather than boring, but never ugly.
|
|
34
|
-
Include images and SVGs that are relevant to the category of business.
|
|
35
|
-
|
|
36
|
-
## TECHNICAL CONSTRAINTS
|
|
37
|
-
|
|
38
|
-
✓ Server component by default (no "use client" unless interactive state/events needed)
|
|
39
|
-
✓ Tailwind utility classes for ALL styling
|
|
40
|
-
✓ Semantic HTML5: `<main>`, `<section>`, `<header>`, `<footer>`, `<h1>`-`<h6>` hierarchy
|
|
41
|
-
✓ Use inline SVG for icons (simple shapes: circles, squares, arrows, checkmarks, etc.)
|
|
42
|
-
✓ Use Tailwind gradients, borders, and shadows for visual elements
|
|
43
|
-
✓ Use CSS shapes and patterns instead of images
|
|
44
|
-
|
|
45
|
-
✗ NO images - do not use `<img>`, `<Image>`, or any image imports
|
|
46
|
-
✗ NO next/image imports
|
|
47
|
-
✗ No data fetching (fetch, axios, server actions)
|
|
48
|
-
✗ No lorem ipsum - write real, specific copy
|
|
49
|
-
|
|
50
|
-
## VALIDATION
|
|
51
|
-
|
|
52
|
-
Your output must:
|
|
53
|
-
1. Be a single valid .tsx file with NO imports whatsoever
|
|
54
|
-
2. Include `export const metadata`
|
|
55
|
-
3. Include `export default function Page()`
|
|
56
|
-
4. Include all 8 required sections in order: Navbar, Hero, Logos, Features, Testimonials, Pricing, Final CTA, Footer
|
|
57
|
-
5. Use only Tailwind for styling
|
|
58
|
-
6. Be deployable in Next.js 14 App Router without errors
|
|
59
|
-
7. Use proper TypeScript syntax
|
|
60
|
-
8. Follow the specific product category requested in the user prompt
|
|
61
|
-
9. **NO images** - use SVG icons and Tailwind styling only
|
|
62
|
-
|
|
63
|
-
Given the user's prompt describing the website theme/product, generate the code immediately.
|
synth_ai/rubrics/__init__.py
DELETED
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Rubric utilities.
|
|
3
|
-
|
|
4
|
-
Exposes helpers for validating rubric specifications that are used across
|
|
5
|
-
Crafter-style judge configurations.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from .validators import (
|
|
9
|
-
RubricCriterion,
|
|
10
|
-
RubricSpec,
|
|
11
|
-
ValidationError,
|
|
12
|
-
validate_rubric_dict,
|
|
13
|
-
validate_rubric_file,
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
__all__ = [
|
|
17
|
-
"RubricCriterion",
|
|
18
|
-
"RubricSpec",
|
|
19
|
-
"ValidationError",
|
|
20
|
-
"validate_rubric_dict",
|
|
21
|
-
"validate_rubric_file",
|
|
22
|
-
]
|
synth_ai/task/rubrics.py
DELETED
|
@@ -1,219 +0,0 @@
|
|
|
1
|
-
"""Rubric schema, loading, and scoring helpers for Task Apps."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import json
|
|
6
|
-
from collections.abc import Iterable
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import Any
|
|
9
|
-
|
|
10
|
-
from pydantic import BaseModel, Field, field_validator
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class Criterion(BaseModel):
|
|
14
|
-
id: str
|
|
15
|
-
description: str
|
|
16
|
-
weight: float = 1.0
|
|
17
|
-
required: bool = False
|
|
18
|
-
|
|
19
|
-
@field_validator("weight")
|
|
20
|
-
@classmethod
|
|
21
|
-
def _validate_weight(cls, value: float) -> float:
|
|
22
|
-
if value <= 0:
|
|
23
|
-
raise ValueError("criterion weight must be positive")
|
|
24
|
-
return value
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class Rubric(BaseModel):
|
|
28
|
-
version: str
|
|
29
|
-
goal_text: str | None = None
|
|
30
|
-
criteria: list[Criterion] = Field(default_factory=list)
|
|
31
|
-
aggregation: str = "weighted_sum"
|
|
32
|
-
|
|
33
|
-
@field_validator("aggregation")
|
|
34
|
-
@classmethod
|
|
35
|
-
def _validate_aggregation(cls, value: str) -> str:
|
|
36
|
-
allowed = {"sum", "weighted_sum", "custom", "inherit"}
|
|
37
|
-
if value not in allowed:
|
|
38
|
-
raise ValueError(f"aggregation must be one of {sorted(allowed)}")
|
|
39
|
-
return value
|
|
40
|
-
|
|
41
|
-
@field_validator("criteria")
|
|
42
|
-
@classmethod
|
|
43
|
-
def _validate_criteria(cls, criteria: list[Criterion]) -> list[Criterion]:
|
|
44
|
-
seen = set()
|
|
45
|
-
for criterion in criteria:
|
|
46
|
-
if criterion.id in seen:
|
|
47
|
-
raise ValueError(f"duplicate criterion id: {criterion.id}")
|
|
48
|
-
seen.add(criterion.id)
|
|
49
|
-
return criteria
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def _load_text(source: str) -> tuple[str, str | None]:
|
|
53
|
-
path = Path(source)
|
|
54
|
-
if path.exists():
|
|
55
|
-
return path.read_text(encoding="utf-8"), path.suffix.lower()
|
|
56
|
-
return source, None
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def _parse_structured(text: str, suffix: str | None) -> dict[str, Any]:
|
|
60
|
-
text = text.strip()
|
|
61
|
-
if not text:
|
|
62
|
-
raise ValueError("Rubric source is empty")
|
|
63
|
-
if suffix in (".yaml", ".yml"):
|
|
64
|
-
try:
|
|
65
|
-
import yaml # type: ignore
|
|
66
|
-
except Exception as exc: # pragma: no cover - optional dependency
|
|
67
|
-
raise RuntimeError("PyYAML is required to load YAML rubrics") from exc
|
|
68
|
-
data = yaml.safe_load(text)
|
|
69
|
-
if not isinstance(data, dict):
|
|
70
|
-
raise ValueError("Rubric YAML must produce a mapping") from None
|
|
71
|
-
return data
|
|
72
|
-
if text.startswith("{"):
|
|
73
|
-
return json.loads(text)
|
|
74
|
-
if text.startswith("http://") or text.startswith("https://"):
|
|
75
|
-
import requests # type: ignore
|
|
76
|
-
|
|
77
|
-
response = requests.get(text, timeout=15)
|
|
78
|
-
response.raise_for_status()
|
|
79
|
-
return _parse_structured(response.text, suffix)
|
|
80
|
-
try:
|
|
81
|
-
return json.loads(text)
|
|
82
|
-
except json.JSONDecodeError:
|
|
83
|
-
try:
|
|
84
|
-
import yaml # type: ignore
|
|
85
|
-
except Exception as exc: # pragma: no cover - optional dependency
|
|
86
|
-
raise RuntimeError("PyYAML is required to load rubric text") from exc
|
|
87
|
-
data = yaml.safe_load(text)
|
|
88
|
-
if not isinstance(data, dict):
|
|
89
|
-
raise ValueError("Rubric text must decode to a mapping") from None
|
|
90
|
-
return data
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def load_rubric(source: str | dict[str, Any] | Rubric | None) -> Rubric | None:
|
|
94
|
-
if source is None:
|
|
95
|
-
return None
|
|
96
|
-
if isinstance(source, Rubric):
|
|
97
|
-
return source
|
|
98
|
-
if isinstance(source, dict):
|
|
99
|
-
return Rubric.model_validate(source)
|
|
100
|
-
text, suffix = _load_text(str(source))
|
|
101
|
-
data = _parse_structured(text, suffix)
|
|
102
|
-
return Rubric.model_validate(data)
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def _merge_weights(base: Criterion, override: Criterion) -> float:
|
|
106
|
-
if override.weight != 1.0 and base.weight != 1.0:
|
|
107
|
-
return base.weight * override.weight
|
|
108
|
-
if override.weight != 1.0:
|
|
109
|
-
return override.weight
|
|
110
|
-
return base.weight
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def blend_rubrics(base: Rubric | None, override: Rubric | None) -> Rubric | None:
|
|
114
|
-
if override is None and base is None:
|
|
115
|
-
return None
|
|
116
|
-
if base is None:
|
|
117
|
-
return override
|
|
118
|
-
if override is None:
|
|
119
|
-
return base
|
|
120
|
-
|
|
121
|
-
base_map = {criterion.id: criterion for criterion in base.criteria}
|
|
122
|
-
merged: list[Criterion] = []
|
|
123
|
-
|
|
124
|
-
for ov in override.criteria:
|
|
125
|
-
if ov.id in base_map:
|
|
126
|
-
existing = base_map.pop(ov.id)
|
|
127
|
-
merged.append(
|
|
128
|
-
Criterion(
|
|
129
|
-
id=ov.id,
|
|
130
|
-
description=ov.description or existing.description,
|
|
131
|
-
weight=_merge_weights(existing, ov),
|
|
132
|
-
required=ov.required if ov.required is not None else existing.required,
|
|
133
|
-
)
|
|
134
|
-
)
|
|
135
|
-
else:
|
|
136
|
-
merged.append(ov)
|
|
137
|
-
|
|
138
|
-
merged.extend(base_map.values())
|
|
139
|
-
|
|
140
|
-
aggregation = override.aggregation
|
|
141
|
-
if aggregation == "inherit":
|
|
142
|
-
aggregation = base.aggregation
|
|
143
|
-
|
|
144
|
-
return Rubric(
|
|
145
|
-
version=override.version or base.version,
|
|
146
|
-
goal_text=override.goal_text or base.goal_text,
|
|
147
|
-
criteria=merged,
|
|
148
|
-
aggregation=aggregation,
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
def _as_float(value: Any) -> float | None:
|
|
153
|
-
try:
|
|
154
|
-
return float(value)
|
|
155
|
-
except Exception:
|
|
156
|
-
return None
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def _score(
    criteria: Iterable[Criterion], values: dict[str, float], aggregation: str
) -> dict[str, Any]:
    """Aggregate per-criterion scores into a single result dictionary.

    Args:
        criteria: Criteria to score; each must expose ``id``, ``weight`` and
            ``required``.
        values: Observed score per criterion id; missing ids count as ``0.0``.
        aggregation: ``"sum"``, ``"weighted_sum"``, ``"custom"`` or
            ``"inherit"`` (treated as ``"weighted_sum"``).

    Returns:
        A dict with the effective ``"aggregation"``, the overall ``"score"``
        and a ``"per_criterion"`` breakdown.  The overall score is the plain
        sum for ``"sum"``, the weight-normalised sum for ``"weighted_sum"``
        (left un-normalised when the total weight is not positive), ``None``
        for ``"custom"``, and ``0.0`` for any other aggregation name.
    """
    mode = "weighted_sum" if aggregation == "inherit" else aggregation

    breakdown: dict[str, dict[str, Any]] = {}
    running = 0.0
    weight_sum = 0.0
    for item in criteria:
        observed = values.get(item.id, 0.0)
        breakdown[item.id] = {
            "score": observed,
            "weight": item.weight,
            "required": item.required,
        }
        if mode == "sum":
            running += observed
        elif mode == "weighted_sum":
            running += observed * item.weight
        weight_sum += item.weight

    if mode == "custom":
        # Custom aggregation is computed elsewhere; no overall score here.
        overall = None
    elif mode == "weighted_sum" and weight_sum > 0:
        overall = running / weight_sum
    else:
        overall = running

    return {"aggregation": mode, "score": overall, "per_criterion": breakdown}
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
def score_events_against_rubric(
    events: list[dict[str, Any]], rubric: Rubric | None
) -> dict[str, Any]:
    """Score a list of event records against *rubric*.

    Each dict event contributes a score for the criterion named by its
    ``"criterion_id"``, ``"id"`` or ``"criterion"`` key (first truthy one
    wins), provided its ``"score"`` converts to a float.  Non-dict events and
    events without a usable id or score are ignored; a later event for the same
    criterion replaces an earlier one.

    Returns the ``_score`` result for the rubric, or a placeholder result with
    aggregation ``"none"`` when no rubric is supplied.
    """
    if rubric is None:
        return {"aggregation": "none", "score": None, "per_criterion": {}}

    observed: dict[str, float] = {}
    for entry in events or []:
        if not isinstance(entry, dict):
            continue
        identifier = entry.get("criterion_id") or entry.get("id") or entry.get("criterion")
        numeric = _as_float(entry.get("score"))
        if not identifier or numeric is None:
            continue
        observed[str(identifier)] = numeric

    return _score(rubric.criteria, observed, rubric.aggregation)
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
def score_outcome_against_rubric(outcome: dict[str, Any], rubric: Rubric | None) -> dict[str, Any]:
    """Score an outcome mapping against *rubric*.

    Scores are read from ``outcome["criteria"]`` when that entry is a dict,
    otherwise from the top-level ``outcome`` mapping itself.  Every entry whose
    value converts to a float is recorded under its stringified key; other
    entries are skipped.  A non-dict ``outcome`` contributes no scores.

    Returns the ``_score`` result for the rubric, or a placeholder result with
    aggregation ``"none"`` when no rubric is supplied.
    """
    if rubric is None:
        return {"aggregation": "none", "score": None, "per_criterion": {}}

    observed: dict[str, float] = {}
    if isinstance(outcome, dict):
        nested = outcome.get("criteria")
        # Prefer an explicit "criteria" sub-mapping; fall back to the outcome.
        source = nested if isinstance(nested, dict) else outcome
        for name, raw in source.items():
            numeric = _as_float(raw)
            if numeric is not None:
                observed[str(name)] = numeric

    return _score(rubric.criteria, observed, rubric.aggregation)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
/examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py
RENAMED
|
File without changes
|
/examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py
RENAMED
|
File without changes
|
/examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py
RENAMED
|
File without changes
|
/examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
/examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py
RENAMED
|
File without changes
|
/examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py
RENAMED
|
File without changes
|
|
File without changes
|
/examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
/examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py
RENAMED
|
File without changes
|
/examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|