synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +186 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
- examples/multi_step/crafter_rl_lora.md +51 -10
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +7 -1
- examples/swe/task_app/grpo_swe_mini.py +55 -26
- examples/swe/task_app/hosted/rollout.py +40 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/task_app/__init__.py +2 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +21 -46
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +67 -49
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +242 -193
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
- examples/task_apps/pokemon_red/task_app.py +606 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
- examples/warming_up_to_rl/run_eval.py +127 -18
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +41 -1
- synth_ai/api/train/builders.py +73 -29
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +44 -0
- synth_ai/api/train/configs/rl.py +134 -0
- synth_ai/api/train/configs/sft.py +95 -0
- synth_ai/api/train/configs/shared.py +24 -0
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +7 -51
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +49 -43
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/rl_demo.py +86 -106
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/task_apps.py +1710 -186
- synth_ai/demos/core/cli.py +121 -159
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
- synth_ai/environments/examples/crafter_classic/environment.py +16 -0
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +30 -4
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/client.py +82 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +127 -0
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +14 -5
- synth_ai/task/contracts.py +124 -38
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +53 -0
- synth_ai/task/rubrics/loaders.py +133 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +113 -0
- synth_ai/task/rubrics/strict.py +149 -0
- synth_ai/task/server.py +8 -7
- synth_ai/task/validators.py +269 -6
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/native_manager.py +3 -3
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +228 -89
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -1
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
# Task App Testing Guide
|
|
2
|
+
|
|
3
|
+
This document describes how to run tests for the task apps in this directory.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Each task app has unit and integration tests following a consistent pattern inspired by the customer environment tests in `customers/`.
|
|
8
|
+
|
|
9
|
+
## Test Structure
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
examples/task_apps/<app_name>/tests/
|
|
13
|
+
├── __init__.py
|
|
14
|
+
├── integration/
|
|
15
|
+
│ ├── __init__.py
|
|
16
|
+
│ └── test_<app>_eval.py # Server startup + eval tests
|
|
17
|
+
└── unit/
|
|
18
|
+
├── __init__.py
|
|
19
|
+
└── test_<app>_*.py # Environment, scoring, dataset tests
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Running Tests
|
|
23
|
+
|
|
24
|
+
### Prerequisites
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Install test dependencies
|
|
28
|
+
uv sync --dev
|
|
29
|
+
|
|
30
|
+
# Set required environment variables
|
|
31
|
+
export GROQ_API_KEY="your-groq-key"
|
|
32
|
+
export OPENAI_API_KEY="your-openai-key" # For Sokoban
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Run All Tests for a Task App
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# Verilog
|
|
39
|
+
pytest examples/task_apps/verilog/tests/ -v
|
|
40
|
+
|
|
41
|
+
# Enron
|
|
42
|
+
pytest examples/task_apps/enron/tests/ -v
|
|
43
|
+
|
|
44
|
+
# Sokoban
|
|
45
|
+
pytest examples/task_apps/sokoban/tests/ -v
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Run Only Unit Tests (Fast)
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Runs quickly, no server startup required
|
|
52
|
+
pytest examples/task_apps/verilog/tests/unit/ -v
|
|
53
|
+
pytest examples/task_apps/enron/tests/unit/ -v
|
|
54
|
+
pytest examples/task_apps/sokoban/tests/unit/ -v
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Run Only Integration Tests
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Slower, starts servers and runs evals
|
|
61
|
+
pytest examples/task_apps/verilog/tests/integration/ -v
|
|
62
|
+
pytest examples/task_apps/enron/tests/integration/ -v
|
|
63
|
+
pytest examples/task_apps/sokoban/tests/integration/ -v
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Run All Task App Tests
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Run everything
|
|
70
|
+
pytest examples/task_apps/*/tests/ -v
|
|
71
|
+
|
|
72
|
+
# Skip slow tests
|
|
73
|
+
pytest examples/task_apps/*/tests/ -v -m "not slow"
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Test Categories
|
|
77
|
+
|
|
78
|
+
### Unit Tests
|
|
79
|
+
|
|
80
|
+
**Purpose**: Test individual components in isolation
|
|
81
|
+
- Environment initialization
|
|
82
|
+
- Reward calculation
|
|
83
|
+
- Tool implementations
|
|
84
|
+
- State management
|
|
85
|
+
|
|
86
|
+
**Characteristics**:
|
|
87
|
+
- Fast (< 1 second each)
|
|
88
|
+
- No external dependencies
|
|
89
|
+
- No server startup
|
|
90
|
+
- No API calls
|
|
91
|
+
|
|
92
|
+
**Examples**:
|
|
93
|
+
- `test_verilog_scoring.py`: Tests reward components (compile, simulate, submit)
|
|
94
|
+
- `test_enron_environment.py`: Tests search, answer, reward calculation
|
|
95
|
+
- `test_sokoban_environment.py`: Tests actions, rewards, truncation
|
|
96
|
+
|
|
97
|
+
### Integration Tests
|
|
98
|
+
|
|
99
|
+
**Purpose**: Test the full system end-to-end
|
|
100
|
+
- Server startup
|
|
101
|
+
- Health/info endpoints
|
|
102
|
+
- Full evaluation runs
|
|
103
|
+
- **Rollout execution** (manual and policy-driven)
|
|
104
|
+
|
|
105
|
+
**Characteristics**:
|
|
106
|
+
- Slower (30-300 seconds)
|
|
107
|
+
- Requires server startup
|
|
108
|
+
- May require API keys
|
|
109
|
+
- Tests real workflows
|
|
110
|
+
|
|
111
|
+
**Examples**:
|
|
112
|
+
- `test_verilog_eval.py`: Starts server, runs Groq eval with Qwen3-32B
|
|
113
|
+
- `test_verilog_rollout.py`: **Manual & policy rollouts via /rollout endpoint**
|
|
114
|
+
- `test_enron_eval.py`: Starts server, runs Groq eval
|
|
115
|
+
- `test_enron_rollout.py`: **Manual & policy rollouts, auth testing**
|
|
116
|
+
- `test_sokoban_eval.py`: Starts server, tests manual rollout
|
|
117
|
+
- `test_sokoban_rollout.py`: **6 rollout tests (manual, policy, difficulties, limits)**
|
|
118
|
+
|
|
119
|
+
## What Each Test Validates
|
|
120
|
+
|
|
121
|
+
### Verilog Tests
|
|
122
|
+
|
|
123
|
+
**Unit Tests** (4 tests):
|
|
124
|
+
- ✅ Compile success gives +0.1 reward
|
|
125
|
+
- ✅ Simulation pass gives +1.0 reward
|
|
126
|
+
- ✅ Submit success gives +10.0 reward
|
|
127
|
+
- ✅ Submit checks last simulation output correctly
|
|
128
|
+
|
|
129
|
+
**Integration Tests** (5 tests):
|
|
130
|
+
- ✅ Server starts and responds to /health
|
|
131
|
+
- ✅ /task_info returns valid Verilog task metadata
|
|
132
|
+
- ✅ Full eval with Qwen3-32B completes successfully
|
|
133
|
+
- ✅ **Manual rollout** with explicit write/compile/simulate/submit
|
|
134
|
+
- ✅ **Policy rollout** using Groq/Qwen3-32B (verifies LLM integration)
|
|
135
|
+
|
|
136
|
+
### Enron Tests
|
|
137
|
+
|
|
138
|
+
**Unit Tests** (3 tests, covering 5 checks):
|
|
139
|
+
- ✅ search_emails tool works correctly
|
|
140
|
+
- ✅ answer_question tool calculates rewards
|
|
141
|
+
- ✅ Exact answer match gives high reward (>0.9)
|
|
142
|
+
- ✅ Partial answer match gives medium reward (>0.5)
|
|
143
|
+
- ✅ Wrong answer gives low reward (<0.5)
|
|
144
|
+
|
|
145
|
+
**Integration Tests** (6 tests):
|
|
146
|
+
- ✅ Server starts and responds to /health
|
|
147
|
+
- ✅ /task_info returns valid Enron task metadata
|
|
148
|
+
- ✅ Full eval with Qwen3-32B completes successfully
|
|
149
|
+
- ✅ **Manual rollout** with explicit search/read/answer actions
|
|
150
|
+
- ✅ **Policy rollout** using Groq/Qwen3-32B
|
|
151
|
+
- ✅ **Authentication** enforcement (rejects requests without auth header)
|
|
152
|
+
|
|
153
|
+
### Sokoban Tests
|
|
154
|
+
|
|
155
|
+
**Unit Tests** (3 tests):
|
|
156
|
+
- ✅ Module imports work correctly
|
|
157
|
+
- ✅ Reward components exist (goal achieved, step penalty)
|
|
158
|
+
- ✅ Engine creation with different difficulty levels
|
|
159
|
+
|
|
160
|
+
**Integration Tests** (9 tests):
|
|
161
|
+
- ✅ Server starts and responds to /health
|
|
162
|
+
- ✅ /task_info returns valid Sokoban task metadata
|
|
163
|
+
- ✅ **Manual rollout** with movement actions (left/right/up/down)
|
|
164
|
+
- ✅ **Policy rollout** with OpenAI GPT-5-mini (may skip if slow)
|
|
165
|
+
- ✅ **All difficulty levels** (easy/medium/hard) work correctly
|
|
166
|
+
- ✅ **Max steps limit** enforcement (stops at configured limit)
|
|
167
|
+
- ✅ **Puzzle completion detection** (terminated=True when solved)
|
|
168
|
+
- ✅ Truncation on max_steps
|
|
169
|
+
- ✅ Response structure validation
|
|
170
|
+
|
|
171
|
+
## Debugging Test Failures
|
|
172
|
+
|
|
173
|
+
### Server Won't Start
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
# Check if port is already in use
|
|
177
|
+
lsof -i :<port>
|
|
178
|
+
|
|
179
|
+
# Check logs manually
|
|
180
|
+
uv run -m synth_ai task-app serve <app_name> --port 8999
|
|
181
|
+
|
|
182
|
+
# Check environment variables
|
|
183
|
+
echo $GROQ_API_KEY
|
|
184
|
+
echo $OPENAI_API_KEY
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Tests Timeout
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
# Run with verbose output and live stdout/stderr (-s disables output capture)
|
|
191
|
+
pytest <test_file> -v -s
|
|
192
|
+
|
|
193
|
+
# Fail any test that runs longer than 60 seconds (requires the pytest-timeout plugin)
|
|
194
|
+
pytest <test_file> -v --timeout=60
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Import Errors
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
# Ensure you're in the right directory
|
|
201
|
+
cd /path/to/synth-ai
|
|
202
|
+
|
|
203
|
+
# Reinstall dependencies
|
|
204
|
+
uv sync --dev
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## CI/CD Integration
|
|
208
|
+
|
|
209
|
+
These tests can be run in CI with:
|
|
210
|
+
|
|
211
|
+
```yaml
|
|
212
|
+
# .github/workflows/test-task-apps.yml
|
|
213
|
+
- name: Run task app tests
|
|
214
|
+
env:
|
|
215
|
+
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
|
|
216
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
217
|
+
run: |
|
|
218
|
+
# Unit tests (fast, always run)
|
|
219
|
+
pytest examples/task_apps/*/tests/unit/ -v
|
|
220
|
+
|
|
221
|
+
# Integration tests (slower, only on main)
|
|
222
|
+
if [ "$GITHUB_REF" = "refs/heads/main" ]; then
|
|
223
|
+
pytest examples/task_apps/*/tests/integration/ -v --timeout=300
|
|
224
|
+
fi
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## Adding Tests for New Task Apps
|
|
228
|
+
|
|
229
|
+
When creating a new task app, follow this pattern:
|
|
230
|
+
|
|
231
|
+
1. **Create test structure**:
|
|
232
|
+
```bash
|
|
233
|
+
mkdir -p examples/task_apps/<new_app>/tests/{unit,integration}
|
|
234
|
+
touch examples/task_apps/<new_app>/tests/__init__.py
|
|
235
|
+
touch examples/task_apps/<new_app>/tests/unit/__init__.py
|
|
236
|
+
touch examples/task_apps/<new_app>/tests/integration/__init__.py
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
2. **Create unit tests** (`tests/unit/test_<app>_*.py`):
|
|
240
|
+
- Test environment initialization
|
|
241
|
+
- Test reward calculation
|
|
242
|
+
- Test tool implementations
|
|
243
|
+
- Test edge cases
|
|
244
|
+
|
|
245
|
+
3. **Create integration tests** (`tests/integration/test_<app>_eval.py`):
|
|
246
|
+
- Copy from an existing integration test
|
|
247
|
+
- Update app name, port, config path
|
|
248
|
+
- Add app-specific endpoint tests
|
|
249
|
+
|
|
250
|
+
4. **Add to CI**:
|
|
251
|
+
- Update CI config to include new tests
|
|
252
|
+
- Ensure required env vars are set
|
|
253
|
+
|
|
254
|
+
## Test Coverage Goals
|
|
255
|
+
|
|
256
|
+
- Unit test coverage: >80%
|
|
257
|
+
- Integration test coverage: 100% of critical paths
|
|
258
|
+
- All public APIs have at least one integration test
|
|
259
|
+
- All reward components have unit tests
|
|
260
|
+
|
|
261
|
+
## Common Issues
|
|
262
|
+
|
|
263
|
+
### "Task app terminated immediately"
|
|
264
|
+
- Check that the app name is correct
|
|
265
|
+
- Verify the app is registered in `synth_ai/task/apps.py`
|
|
266
|
+
- Check recent changes to the app code
|
|
267
|
+
|
|
268
|
+
### "GROQ_API_KEY must be set"
|
|
269
|
+
- Set the environment variable
|
|
270
|
+
- Or skip the test: `pytest -k "not groq"`
|
|
271
|
+
|
|
272
|
+
### "Config file not found"
|
|
273
|
+
- Ensure eval config exists in task app directory
|
|
274
|
+
- Check the path in the test matches actual location
|
|
275
|
+
|
|
File without changes
|
|
File without changes
|
|
@@ -68,7 +68,7 @@ def _resolve_repo_root() -> Path:
|
|
|
68
68
|
def _resolve_task_app_root(repo_root: Path) -> Path:
|
|
69
69
|
"""Locate the task_app directory even when the module is copied to a temp mount."""
|
|
70
70
|
|
|
71
|
-
preferred = (repo_root / "examples" / "
|
|
71
|
+
preferred = (repo_root / "examples" / "task_apps" / "crafter" / "task_app").resolve()
|
|
72
72
|
if preferred.is_dir():
|
|
73
73
|
return preferred
|
|
74
74
|
|
|
@@ -81,7 +81,7 @@ def _resolve_task_app_root(repo_root: Path) -> Path:
|
|
|
81
81
|
if (candidate / "synth_envs_hosted").is_dir():
|
|
82
82
|
return candidate
|
|
83
83
|
|
|
84
|
-
fallback = Path("/opt/synth_ai_repo/examples/
|
|
84
|
+
fallback = Path("/opt/synth_ai_repo/examples/task_apps/crafter/task_app")
|
|
85
85
|
if fallback.is_dir():
|
|
86
86
|
return fallback.resolve()
|
|
87
87
|
|
|
@@ -93,6 +93,7 @@ TASK_APP_ROOT = _resolve_task_app_root(REPO_ROOT)
|
|
|
93
93
|
SYNTH_ENVS_HOSTED_ROOT = (TASK_APP_ROOT / "synth_envs_hosted").resolve()
|
|
94
94
|
|
|
95
95
|
EXAMPLES_ROOT = (REPO_ROOT / "examples").resolve()
|
|
96
|
+
RUBRICS_ROOT = (EXAMPLES_ROOT / "multi_step" / "rubrics").resolve()
|
|
96
97
|
|
|
97
98
|
for path in (REPO_ROOT, TASK_APP_ROOT, SYNTH_ENVS_HOSTED_ROOT, EXAMPLES_ROOT):
|
|
98
99
|
try:
|
|
@@ -305,13 +306,16 @@ def build_dataset() -> tuple[TaskDatasetRegistry, CrafterDataset]:
|
|
|
305
306
|
def _base_task_info(dataset: CrafterDataset) -> TaskInfo:
|
|
306
307
|
return TaskInfo(
|
|
307
308
|
task={"id": "crafter_classic", "name": "Crafter Classic", "version": "1.0.0"},
|
|
308
|
-
|
|
309
|
+
environment="crafter",
|
|
309
310
|
action_space={
|
|
310
311
|
"type": "discrete",
|
|
312
|
+
"description": f"Discrete action space with {len(crafter_constants.actions)} actions including movement, crafting, and interaction",
|
|
311
313
|
"size": len(crafter_constants.actions),
|
|
312
314
|
"actions": list(crafter_constants.actions),
|
|
313
315
|
},
|
|
314
316
|
observation={
|
|
317
|
+
"type": "dict",
|
|
318
|
+
"description": "RGB frame (64x64x3) plus inventory counts, achievements, and semantic map patches",
|
|
315
319
|
"summary": "RGB frame plus inventory, achievements, and semantic map patches.",
|
|
316
320
|
"keys": ["image", "inventory", "achievements", "semantic_map_patch7"],
|
|
317
321
|
"image_shape": [64, 64, 3],
|
|
@@ -335,49 +339,13 @@ def _base_task_info(dataset: CrafterDataset) -> TaskInfo:
|
|
|
335
339
|
},
|
|
336
340
|
"tool": {"name": "interact", "parallel_tool_calls": False},
|
|
337
341
|
},
|
|
338
|
-
capabilities={
|
|
339
|
-
"supports_rollout": True,
|
|
340
|
-
"supports_env_lifecycle": True,
|
|
341
|
-
"requires_api_key_header": True,
|
|
342
|
-
},
|
|
343
342
|
limits={"max_ops": 100000, "max_time_s": 3600},
|
|
344
343
|
)
|
|
345
344
|
|
|
346
345
|
|
|
347
|
-
OUTCOME_RUBRIC = load_rubric(
|
|
348
|
-
{
|
|
349
|
-
"version": "1",
|
|
350
|
-
"goal_text": "Reward unlocking Crafter achievements and survival.",
|
|
351
|
-
"aggregation": "weighted_sum",
|
|
352
|
-
"criteria": [
|
|
353
|
-
{
|
|
354
|
-
"id": "achievements",
|
|
355
|
-
"description": "Unlock achievements or crafting milestones.",
|
|
356
|
-
"weight": 1.0,
|
|
357
|
-
},
|
|
358
|
-
{
|
|
359
|
-
"id": "survival",
|
|
360
|
-
"description": "Maintain health, food, and drink levels.",
|
|
361
|
-
"weight": 1.0,
|
|
362
|
-
},
|
|
363
|
-
],
|
|
364
|
-
}
|
|
365
|
-
)
|
|
346
|
+
OUTCOME_RUBRIC = load_rubric(str(RUBRICS_ROOT / "crafter_outcome_rubric.json"))
|
|
366
347
|
|
|
367
|
-
EVENTS_RUBRIC = load_rubric(
|
|
368
|
-
{
|
|
369
|
-
"version": "1",
|
|
370
|
-
"goal_text": "Encourage purposeful step-wise exploration and crafting.",
|
|
371
|
-
"aggregation": "weighted_sum",
|
|
372
|
-
"criteria": [
|
|
373
|
-
{
|
|
374
|
-
"id": "progress_steps",
|
|
375
|
-
"description": "Actions progress quests, crafting, or exploration.",
|
|
376
|
-
"weight": 1.0,
|
|
377
|
-
}
|
|
378
|
-
],
|
|
379
|
-
}
|
|
380
|
-
)
|
|
348
|
+
EVENTS_RUBRIC = load_rubric(str(RUBRICS_ROOT / "crafter_events_rubric.json"))
|
|
381
349
|
|
|
382
350
|
|
|
383
351
|
def describe_taskset(dataset: CrafterDataset) -> dict[str, Any]:
|
|
@@ -396,29 +364,36 @@ def provide_task_instances(
|
|
|
396
364
|
dataset: CrafterDataset, base_info: TaskInfo, seeds: Sequence[int]
|
|
397
365
|
) -> Iterable[TaskInfo]:
|
|
398
366
|
infos: list[TaskInfo] = []
|
|
367
|
+
base_observation = getattr(base_info, "observation", None)
|
|
368
|
+
if hasattr(base_observation, "model_dump"):
|
|
369
|
+
observation_template = base_observation.model_dump()
|
|
370
|
+
elif isinstance(base_observation, dict):
|
|
371
|
+
observation_template = dict(base_observation)
|
|
372
|
+
else:
|
|
373
|
+
observation_template = {}
|
|
374
|
+
|
|
399
375
|
for seed_value in seeds:
|
|
400
376
|
summary = dataset.describe_seed(seed_value)
|
|
401
377
|
infos.append(
|
|
402
378
|
TaskInfo(
|
|
403
379
|
task=base_info.task,
|
|
404
|
-
|
|
380
|
+
environment=base_info.environment,
|
|
405
381
|
action_space=base_info.action_space,
|
|
406
382
|
observation={
|
|
407
|
-
**base_info.observation,
|
|
383
|
+
**observation_template,
|
|
408
384
|
"seed": seed_value,
|
|
409
385
|
"traits": summary["traits"],
|
|
410
386
|
"inventory": summary["inventory"],
|
|
411
387
|
"player_position": summary["player_position"],
|
|
412
388
|
},
|
|
413
389
|
dataset={
|
|
414
|
-
**base_info.dataset,
|
|
390
|
+
**base_info.dataset.model_dump(),
|
|
415
391
|
"seed": seed_value,
|
|
416
392
|
"difficulty": summary["difficulty"],
|
|
417
393
|
"config": summary["config"],
|
|
418
394
|
},
|
|
419
395
|
rubric=base_info.rubric,
|
|
420
396
|
inference=base_info.inference,
|
|
421
|
-
capabilities=base_info.capabilities,
|
|
422
397
|
limits=base_info.limits,
|
|
423
398
|
)
|
|
424
399
|
)
|
|
@@ -689,7 +664,7 @@ register_task_app(
|
|
|
689
664
|
# Mount repo root so local modules resolve when deployed on Modal
|
|
690
665
|
(str(REPO_ROOT), "/opt/synth_ai_repo"),
|
|
691
666
|
(str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),
|
|
692
|
-
(str(TASK_APP_ROOT), "/opt/synth_ai_repo/examples/warming_up_to_rl/task_app"),
|
|
667
|
+
(str(TASK_APP_ROOT), "/opt/synth_ai_repo/examples/task_apps/crafter/task_app"),
|
|
693
668
|
),
|
|
694
669
|
secret_names=("groq-api-key", "openai-api-key"),
|
|
695
670
|
memory=16384,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Compatibility wrapper for the GRPO Crafter task app.
|
|
2
2
|
|
|
3
3
|
This module now delegates to the TaskAppConfig defined in the colocated example at
|
|
4
|
-
`examples/warming_up_to_rl/task_app/grpo_crafter.py`. It is kept for legacy usage
|
|
4
|
+
`examples/task_apps/crafter/task_app/grpo_crafter.py`. It is kept for legacy usage
|
|
5
5
|
(running the file directly or targeting `fastapi_app` from external tooling). Prefer using
|
|
6
6
|
`uvx synth-ai serve grpo-crafter` for local development and testing.
|
|
7
7
|
"""
|
examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py
RENAMED
|
@@ -44,6 +44,7 @@ class CrafterPolicy(Policy):
|
|
|
44
44
|
self.inference_url = inference_url
|
|
45
45
|
self.model = model
|
|
46
46
|
self.use_tools = True
|
|
47
|
+
self.use_vision = False # Enable vision for VLMs
|
|
47
48
|
# Sampling parameters (populated via initialize(config))
|
|
48
49
|
self.temperature: float | None = None
|
|
49
50
|
self.top_p: float | None = None
|
|
@@ -63,6 +64,11 @@ class CrafterPolicy(Policy):
|
|
|
63
64
|
self.model = config["model"]
|
|
64
65
|
if "use_tools" in config:
|
|
65
66
|
self.use_tools = bool(config["use_tools"])
|
|
67
|
+
if "use_vision" in config:
|
|
68
|
+
self.use_vision = bool(config["use_vision"])
|
|
69
|
+
# Auto-detect vision capability from model name if not explicitly set
|
|
70
|
+
if "use_vision" not in config and self.model:
|
|
71
|
+
self.use_vision = self._is_vision_model(self.model)
|
|
66
72
|
# Adopt sampling params from policy config (trainer passes these through)
|
|
67
73
|
if "temperature" in config:
|
|
68
74
|
self.temperature = float(config["temperature"]) # fail fast on bad types
|
|
@@ -384,6 +390,7 @@ class CrafterPolicy(Policy):
|
|
|
384
390
|
"inference_url": self.inference_url,
|
|
385
391
|
"model": self.model,
|
|
386
392
|
"use_tools": self.use_tools,
|
|
393
|
+
"use_vision": self.use_vision,
|
|
387
394
|
},
|
|
388
395
|
"state": self.state_dict(),
|
|
389
396
|
}
|
|
@@ -396,7 +403,8 @@ class CrafterPolicy(Policy):
|
|
|
396
403
|
inference_url=config["inference_url"],
|
|
397
404
|
model=config.get("model"),
|
|
398
405
|
)
|
|
399
|
-
policy.use_tools = bool(config
|
|
406
|
+
policy.use_tools = bool(config.get("use_tools", True))
|
|
407
|
+
policy.use_vision = bool(config.get("use_vision", False))
|
|
400
408
|
policy.load_state_dict(state)
|
|
401
409
|
return policy
|
|
402
410
|
|
|
@@ -446,12 +454,60 @@ class CrafterPolicy(Policy):
|
|
|
446
454
|
|
|
447
455
|
return format_observation(obs_data, step_count=step_idx, max_steps=max_steps)
|
|
448
456
|
|
|
457
|
+
@staticmethod
|
|
458
|
+
def _is_vision_model(model_name: str) -> bool:
|
|
459
|
+
"""Check if a model supports vision/image inputs based on its name."""
|
|
460
|
+
if not model_name:
|
|
461
|
+
return False
|
|
462
|
+
|
|
463
|
+
model_lower = model_name.lower()
|
|
464
|
+
|
|
465
|
+
# Known vision-capable model patterns
|
|
466
|
+
vision_patterns = [
|
|
467
|
+
"gpt-4o", # GPT-4o series
|
|
468
|
+
"gpt-4-turbo", # GPT-4 Turbo with vision
|
|
469
|
+
"gpt-4-vision", # Explicit vision variant
|
|
470
|
+
"gpt-5", # GPT-5 series (all variants support vision)
|
|
471
|
+
"claude-3", # All Claude 3 models support vision
|
|
472
|
+
"gemini", # Gemini models
|
|
473
|
+
"qwen-vl", # Qwen Vision-Language models
|
|
474
|
+
"qwen2-vl", # Qwen2 VL
|
|
475
|
+
"pixtral", # Mistral's vision model
|
|
476
|
+
"llava", # LLaVA models
|
|
477
|
+
"phi-3-vision", # Microsoft Phi-3 Vision
|
|
478
|
+
"internvl", # InternVL models
|
|
479
|
+
"cogvlm", # CogVLM models
|
|
480
|
+
"vision", # Generic vision indicator
|
|
481
|
+
]
|
|
482
|
+
|
|
483
|
+
return any(pattern in model_lower for pattern in vision_patterns)
|
|
484
|
+
|
|
449
485
|
def _extract_image_parts(
|
|
450
486
|
self, observation: dict[str, Any] | None
|
|
451
487
|
) -> list[dict[str, Any]]:
|
|
452
|
-
"""
|
|
453
|
-
|
|
454
|
-
|
|
488
|
+
"""Extract image parts from crafter observation for vision-capable models.
|
|
489
|
+
|
|
490
|
+
Returns OpenAI-style image_url format if vision is enabled and image data is available.
|
|
491
|
+
"""
|
|
492
|
+
# Only extract images if vision is enabled for this policy
|
|
493
|
+
if not self.use_vision:
|
|
494
|
+
return []
|
|
495
|
+
|
|
496
|
+
if not observation:
|
|
497
|
+
return []
|
|
498
|
+
|
|
499
|
+
# Get the observation data (could be nested)
|
|
500
|
+
obs = observation.get("observation", observation)
|
|
501
|
+
if not isinstance(obs, dict):
|
|
502
|
+
return []
|
|
503
|
+
|
|
504
|
+
# Extract the data URL (includes base64-encoded image)
|
|
505
|
+
data_url = obs.get("observation_image_data_url")
|
|
506
|
+
if not data_url or not isinstance(data_url, str):
|
|
507
|
+
return []
|
|
508
|
+
|
|
509
|
+
# Return OpenAI-style image_url format
|
|
510
|
+
return [{"type": "image_url", "image_url": {"url": data_url}}]
|
|
455
511
|
|
|
456
512
|
def parse_model_response(
|
|
457
513
|
self, response: str, observation: dict[str, Any]
|