synth-ai 0.1.9__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- synth_ai/__init__.py +28 -2
- synth_ai/core/system.py +4 -0
- synth_ai/environments/__init__.py +35 -0
- synth_ai/environments/environment/__init__.py +1 -0
- synth_ai/environments/environment/artifacts/__init__.py +1 -0
- synth_ai/environments/environment/artifacts/base.py +50 -0
- synth_ai/environments/environment/core.py +22 -0
- synth_ai/environments/environment/db/__init__.py +1 -0
- synth_ai/environments/environment/db/sqlite.py +45 -0
- synth_ai/environments/environment/registry.py +24 -0
- synth_ai/environments/environment/resources/sqlite.py +46 -0
- synth_ai/environments/environment/results.py +1 -0
- synth_ai/environments/environment/rewards/__init__.py +1 -0
- synth_ai/environments/environment/rewards/core.py +28 -0
- synth_ai/environments/environment/shared_engine.py +26 -0
- synth_ai/environments/environment/tools/__init__.py +34 -0
- synth_ai/environments/examples/__init__.py +1 -0
- synth_ai/environments/examples/crafter_classic/__init__.py +8 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +872 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/test_crafter_react_agent.py +1110 -0
- synth_ai/environments/examples/crafter_classic/config_logging.py +111 -0
- synth_ai/environments/examples/crafter_classic/engine.py +502 -0
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +63 -0
- synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +5 -0
- synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +74 -0
- synth_ai/environments/examples/crafter_classic/environment.py +255 -0
- synth_ai/environments/examples/crafter_classic/taskset.py +228 -0
- synth_ai/environments/examples/enron/agent_demos/test_synth_react.py +535 -0
- synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +156 -0
- synth_ai/environments/examples/enron/art_helpers/local_email_db.py +280 -0
- synth_ai/environments/examples/enron/art_helpers/types_enron.py +24 -0
- synth_ai/environments/examples/enron/engine.py +291 -0
- synth_ai/environments/examples/enron/environment.py +165 -0
- synth_ai/environments/examples/enron/taskset.py +112 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +111 -0
- synth_ai/environments/examples/enron/units/test_email_index.py +8 -0
- synth_ai/environments/examples/minigrid/__init__.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +47 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +220 -0
- synth_ai/environments/examples/minigrid/agent_demos/test_minigrid_react_agent.py +393 -0
- synth_ai/environments/examples/minigrid/engine.py +589 -0
- synth_ai/environments/examples/minigrid/environment.py +274 -0
- synth_ai/environments/examples/minigrid/environment_mapping.py +242 -0
- synth_ai/environments/examples/minigrid/puzzle_loader.py +416 -0
- synth_ai/environments/examples/minigrid/taskset.py +583 -0
- synth_ai/environments/examples/minigrid/units/test_action_behavior.py +226 -0
- synth_ai/environments/examples/minigrid/units/test_debug_messages.py +83 -0
- synth_ai/environments/examples/minigrid/units/test_exploration.py +120 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_engine.py +214 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_environment.py +238 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_environment_mapping.py +301 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_taskset.py +210 -0
- synth_ai/environments/examples/nethack/__init__.py +7 -0
- synth_ai/environments/examples/nethack/achievements.py +337 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +832 -0
- synth_ai/environments/examples/nethack/agent_demos/test_nethack_react_agent.py +1112 -0
- synth_ai/environments/examples/nethack/engine.py +738 -0
- synth_ai/environments/examples/nethack/environment.py +255 -0
- synth_ai/environments/examples/nethack/helpers/__init__.py +42 -0
- synth_ai/environments/examples/nethack/helpers/action_mapping.py +301 -0
- synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +401 -0
- synth_ai/environments/examples/nethack/helpers/observation_utils.py +433 -0
- synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +201 -0
- synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +268 -0
- synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +308 -0
- synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +430 -0
- synth_ai/environments/examples/nethack/taskset.py +323 -0
- synth_ai/environments/examples/nethack/units/test_nethack_engine.py +277 -0
- synth_ai/environments/examples/nethack/units/test_nethack_environment.py +281 -0
- synth_ai/environments/examples/nethack/units/test_nethack_taskset.py +213 -0
- synth_ai/environments/examples/nethack/units/test_recording.py +307 -0
- synth_ai/environments/examples/red/__init__.py +7 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/agent_demos/test_synth_react.py +1471 -0
- synth_ai/environments/examples/red/config_logging.py +110 -0
- synth_ai/environments/examples/red/engine.py +693 -0
- synth_ai/environments/examples/red/engine_helpers/__init__.py +1 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +28 -0
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +275 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +142 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +56 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +283 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +149 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +137 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +56 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +330 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +120 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +558 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +312 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +147 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +246 -0
- synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +367 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +139 -0
- synth_ai/environments/examples/red/environment.py +235 -0
- synth_ai/environments/examples/red/taskset.py +77 -0
- synth_ai/environments/examples/red/test_fixes.py +125 -0
- synth_ai/environments/examples/red/test_fixes_mock.py +148 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/red/units/test_basic_functionality.py +97 -0
- synth_ai/environments/examples/red/units/test_button_press_requirements.py +217 -0
- synth_ai/environments/examples/red/units/test_engine.py +192 -0
- synth_ai/environments/examples/red/units/test_environment.py +455 -0
- synth_ai/environments/examples/red/units/test_exploration_strategy.py +227 -0
- synth_ai/environments/examples/red/units/test_integration.py +217 -0
- synth_ai/environments/examples/red/units/test_memory_extraction.py +111 -0
- synth_ai/environments/examples/red/units/test_menu_bug_reproduction.py +1100 -0
- synth_ai/environments/examples/red/units/test_movement_debug.py +255 -0
- synth_ai/environments/examples/red/units/test_pokemon_mcts_debug.py +163 -0
- synth_ai/environments/examples/red/units/test_pokemon_mcts_verbose.py +117 -0
- synth_ai/environments/examples/red/units/test_red_basic.py +145 -0
- synth_ai/environments/examples/red/units/test_red_comprehensive.py +323 -0
- synth_ai/environments/examples/red/units/test_retry_movement.py +195 -0
- synth_ai/environments/examples/red/units/test_reward_components.py +186 -0
- synth_ai/environments/examples/red/units/test_rom_integration.py +260 -0
- synth_ai/environments/examples/red/units/test_taskset.py +116 -0
- synth_ai/environments/examples/red/units/test_tree.py +448 -0
- synth_ai/environments/examples/sokoban/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +900 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_dspy_react.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_sokoban_react_agent.py +498 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_synth_lats.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_locally.py +748 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_service.py +296 -0
- synth_ai/environments/examples/sokoban/engine.py +675 -0
- synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +656 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +17 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +3 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +129 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +370 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +331 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +305 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +66 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +114 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +122 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +394 -0
- synth_ai/environments/examples/sokoban/environment.py +228 -0
- synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +438 -0
- synth_ai/environments/examples/sokoban/puzzle_loader.py +311 -0
- synth_ai/environments/examples/sokoban/taskset.py +425 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +94 -0
- synth_ai/environments/examples/sokoban/units/test_building_task_set.py +49 -0
- synth_ai/environments/examples/sokoban/units/test_false_positive.py +120 -0
- synth_ai/environments/examples/sokoban/units/test_simple_run_through_environment.py +119 -0
- synth_ai/environments/examples/sokoban/units/test_sokoban_environment.py +98 -0
- synth_ai/environments/examples/sokoban/units/test_tree.py +364 -0
- synth_ai/environments/examples/tictactoe/__init__.py +1 -0
- synth_ai/environments/examples/tictactoe/agent_demos/test_synth_react.py +266 -0
- synth_ai/environments/examples/tictactoe/agent_demos/test_tictactoe_react_agent.py +470 -0
- synth_ai/environments/examples/tictactoe/engine.py +368 -0
- synth_ai/environments/examples/tictactoe/environment.py +239 -0
- synth_ai/environments/examples/tictactoe/taskset.py +214 -0
- synth_ai/environments/examples/tictactoe/units/test_tictactoe_engine.py +393 -0
- synth_ai/environments/examples/tictactoe/units/test_tictactoe_environment.py +493 -0
- synth_ai/environments/examples/tictactoe/units/test_tictactoe_taskset.py +191 -0
- synth_ai/environments/examples/verilog/__init__.py +10 -0
- synth_ai/environments/examples/verilog/agent_demos/test_synth_react.py +520 -0
- synth_ai/environments/examples/verilog/engine.py +328 -0
- synth_ai/environments/examples/verilog/environment.py +349 -0
- synth_ai/environments/examples/verilog/taskset.py +418 -0
- synth_ai/environments/examples/verilog/units/test_verilog_engine.py +466 -0
- synth_ai/environments/examples/verilog/units/test_verilog_environment.py +585 -0
- synth_ai/environments/examples/verilog/units/test_verilog_integration.py +383 -0
- synth_ai/environments/examples/verilog/units/test_verilog_taskset.py +457 -0
- synth_ai/environments/reproducibility/core.py +42 -0
- synth_ai/environments/reproducibility/tree.py +364 -0
- synth_ai/environments/service/app.py +78 -0
- synth_ai/environments/service/core_routes.py +775 -0
- synth_ai/environments/service/external_registry.py +57 -0
- synth_ai/environments/service/registry.py +9 -0
- synth_ai/environments/stateful/__init__.py +1 -0
- synth_ai/environments/stateful/core.py +28 -0
- synth_ai/environments/stateful/engine.py +21 -0
- synth_ai/environments/stateful/state.py +7 -0
- synth_ai/environments/tasks/api.py +19 -0
- synth_ai/environments/tasks/core.py +78 -0
- synth_ai/environments/tasks/filters.py +39 -0
- synth_ai/environments/tasks/utils.py +89 -0
- synth_ai/environments/v0_observability/history.py +3 -0
- synth_ai/environments/v0_observability/log.py +2 -0
- synth_ai/lm/caching/constants.py +1 -0
- synth_ai/{zyk/lms → lm}/caching/ephemeral.py +4 -8
- synth_ai/{zyk/lms → lm}/caching/handler.py +15 -15
- synth_ai/{zyk/lms → lm}/caching/initialize.py +2 -4
- synth_ai/{zyk/lms → lm}/caching/persistent.py +4 -10
- synth_ai/{zyk/lms → lm}/config.py +2 -1
- synth_ai/{zyk/lms → lm}/constants.py +2 -2
- synth_ai/{zyk/lms → lm}/core/all.py +10 -10
- synth_ai/{zyk/lms → lm}/core/main.py +57 -33
- synth_ai/{zyk/lms → lm}/core/vendor_clients.py +12 -10
- synth_ai/lm/cost/monitor.py +1 -0
- synth_ai/lm/cost/statefulness.py +1 -0
- synth_ai/lm/provider_support/__init__.py +8 -0
- synth_ai/lm/provider_support/anthropic.py +945 -0
- synth_ai/lm/provider_support/openai.py +1115 -0
- synth_ai/lm/provider_support/suppress_logging.py +31 -0
- synth_ai/{zyk/lms → lm}/structured_outputs/handler.py +58 -80
- synth_ai/{zyk/lms → lm}/structured_outputs/inject.py +6 -20
- synth_ai/{zyk/lms → lm}/structured_outputs/rehabilitate.py +6 -12
- synth_ai/{zyk/lms → lm}/vendors/core/anthropic_api.py +21 -30
- synth_ai/{zyk/lms → lm}/vendors/core/gemini_api.py +37 -32
- synth_ai/{zyk/lms → lm}/vendors/core/mistral_api.py +19 -28
- synth_ai/{zyk/lms → lm}/vendors/core/openai_api.py +26 -36
- synth_ai/{zyk/lms → lm}/vendors/openai_standard.py +29 -33
- synth_ai/{zyk/lms → lm}/vendors/retries.py +1 -1
- synth_ai/lm/vendors/supported/__init__.py +0 -0
- synth_ai/{zyk/lms → lm}/vendors/supported/custom_endpoint.py +131 -118
- synth_ai/{zyk/lms → lm}/vendors/supported/deepseek.py +4 -8
- synth_ai/{zyk/lms → lm}/vendors/supported/grok.py +6 -8
- synth_ai/{zyk/lms → lm}/vendors/supported/groq.py +1 -1
- synth_ai/{zyk/lms → lm}/vendors/supported/ollama.py +2 -2
- synth_ai/{zyk/lms → lm}/vendors/supported/openrouter.py +18 -16
- synth_ai/{zyk/lms → lm}/vendors/supported/together.py +1 -1
- synth_ai/tracing/__init__.py +0 -0
- synth_ai/tracing/abstractions.py +224 -0
- synth_ai/tracing/base_client.py +91 -0
- synth_ai/tracing/client_manager.py +131 -0
- synth_ai/tracing/config.py +140 -0
- synth_ai/tracing/context.py +146 -0
- synth_ai/tracing/decorators.py +679 -0
- synth_ai/tracing/events/__init__.py +0 -0
- synth_ai/tracing/events/manage.py +147 -0
- synth_ai/tracing/events/scope.py +86 -0
- synth_ai/tracing/events/store.py +227 -0
- synth_ai/tracing/immediate_client.py +152 -0
- synth_ai/tracing/local.py +18 -0
- synth_ai/tracing/log_client_base.py +74 -0
- synth_ai/tracing/retry_queue.py +187 -0
- synth_ai/tracing/trackers.py +515 -0
- synth_ai/tracing/upload.py +504 -0
- synth_ai/tracing/utils.py +9 -0
- synth_ai/zyk/__init__.py +28 -2
- synth_ai-0.2.1.dev0.dist-info/METADATA +349 -0
- synth_ai-0.2.1.dev0.dist-info/RECORD +261 -0
- synth_ai/zyk/lms/caching/constants.py +0 -1
- synth_ai/zyk/lms/cost/monitor.py +0 -1
- synth_ai/zyk/lms/cost/statefulness.py +0 -1
- synth_ai-0.1.9.dist-info/METADATA +0 -37
- synth_ai-0.1.9.dist-info/RECORD +0 -50
- /synth_ai/{zyk/lms/__init__.py → environments/reproducibility/helpers.py} +0 -0
- /synth_ai/{zyk/lms/caching → lm}/__init__.py +0 -0
- /synth_ai/{zyk/lms/core → lm/caching}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/caching/dbs.py +0 -0
- /synth_ai/{zyk/lms/cost → lm/core}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/core/exceptions.py +0 -0
- /synth_ai/{zyk/lms/structured_outputs → lm/cost}/__init__.py +0 -0
- /synth_ai/{zyk/lms/vendors → lm/structured_outputs}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/tools/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/tools/base.py +0 -0
- /synth_ai/{zyk/lms/vendors/core → lm/vendors}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/vendors/base.py +0 -0
- /synth_ai/{zyk/lms/vendors/local → lm/vendors/core}/__init__.py +0 -0
- /synth_ai/{zyk/lms/vendors/supported → lm/vendors/local}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/vendors/local/ollama.py +0 -0
- {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/WHEEL +0 -0
- {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/top_level.txt +0 -0
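Most of the renames above move synth_ai/zyk/lms/* to synth_ai/lm/*, while synth_ai/zyk/__init__.py is kept (with changes), which suggests both import paths resolve in 0.2.1.dev0. A minimal, hedged sketch of what that migration looks like for downstream code follows; it assumes the new synth_ai.lm package mirrors the old synth_ai.zyk.lms layout one-to-one, which the rename list implies but this diff does not confirm.

# Hedged sketch, not taken from the diff itself.
try:
    from synth_ai.lm.tools.base import BaseTool  # new path introduced in 0.2.1.dev0
except ImportError:
    from synth_ai.zyk.lms.tools.base import BaseTool  # old 0.1.9-style path, still shipped via synth_ai/zyk

from synth_ai.zyk import LM  # import used unchanged by the bundled agent demos below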
@@ -0,0 +1 @@
# TBD
@@ -0,0 +1,498 @@
#!/usr/bin/env python3
"""
Test script to run ReAct agents against Sokoban environment on synth service (port 8901)
Tests gemini-1.5-flash on multiple easy Sokoban instances
"""

import asyncio
import json
import uuid
from datetime import datetime
from typing import Dict, Any, Optional, List
from pydantic import BaseModel, Field
from httpx import AsyncClient
import sys
import os

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))

from synth_ai.zyk import LM
from synth_ai.zyk.lms.tools.base import BaseTool


# --- Service Configuration ---
SERVICE_BASE_URL = "http://localhost:8901"
MODEL_NAME = "gpt-4.1-mini"
NUM_INSTANCES = 10
MAX_TURNS = 15
DIFFICULTY = "ultra_easy"

# ultra easy - gpt-4.1-nano - 0%, gpt-4.1-mini - 16%, o4-mini - 84%
# easy - o4-mini - 10%

# --- Action Mapping ---
ACTION_STRING_TO_INT = {
    "no operation": 0,
    "push up": 1,
    "push down": 2,
    "push left": 3,
    "push right": 4,
    "move up": 5,
    "move down": 6,
    "move left": 7,
    "move right": 8,
}


# --- Tool Definitions ---
class GameActionArgs(BaseModel):
    """Arguments for game actions."""

    action: str = Field(description="The action to take")
    reasoning: str = Field(description="Brief explanation of why this action was chosen")


class TerminateArgs(BaseModel):
    """Arguments for termination."""

    reason: str = Field(description="Reason for termination")


class GameActionTool(BaseTool):
    """Tool for performing an action in the game."""

    name: str = "game_action"
    arguments: type[BaseModel] = GameActionArgs
    description: str = "Perform an action in the game environment."


class TerminateTool(BaseTool):
    """Tool to terminate the episode."""

    name: str = "terminate"
    arguments: type[BaseModel] = TerminateArgs
    description: str = "End the episode when finished or no progress can be made."


# --- Base ReAct Agent ---
class BaseReActAgent:
    """Base ReAct agent for game environments."""

    def __init__(self, llm: LM, max_turns: int = MAX_TURNS, verbose: bool = False):
        self.llm = llm
        self.max_turns = max_turns
        self.verbose = verbose
        self.history = []
        self.system_name = "base-react-agent"
        self.system_instance_id = str(uuid.uuid4())
        self.tools = [GameActionTool(), TerminateTool()]

    async def decide(self, obs: str, system_message: str, turn: int) -> Dict[str, Any]:
        """Get LLM decision for next action."""
        # Build action history (only last 2 for brevity)
        action_history = ""
        if len(self.history) > 0:
            action_history = "\n\nRECENT HISTORY:\n"
            for i, h in enumerate(self.history[-2:], 1):
                action_history += f"{i}. {h}\n"

        user_content = f"Current state:\n{obs}{action_history}\n\nWhat action should I take?"

        # Use the same pattern as Crafter ReAct agent
        response_obj = await self.llm.respond_async(
            system_message=system_message, user_message=user_content, tools=self.tools
        )

        tool_calls = response_obj.tool_calls

        # Handle case where tool_calls is None or empty (graceful fallback)
        if not tool_calls:
            if self.verbose:
                print(f"[WARNING] No tool calls returned by LLM, using default action")
            return {
                "name": "game_action",
                "parameters": {
                    "action": "up",
                    "reasoning": "Default action - no tool call received",
                },
            }

        tool_call_data = tool_calls[0]

        # Handle both dict and object formats (same as Crafter)
        if isinstance(tool_call_data, dict):
            tool_name = tool_call_data["function"]["name"]
            tool_args_str = tool_call_data["function"]["arguments"]
        else:
            tool_name = tool_call_data.function.name
            tool_args_str = tool_call_data.function.arguments

        tool_arguments = json.loads(tool_args_str)

        return {"name": tool_name, "parameters": tool_arguments}


# --- Sokoban ReAct Agent ---
class SokobanReActAgent(BaseReActAgent):
    """ReAct agent for Sokoban environment."""

    def __init__(self, llm: LM, max_turns: int = 15, verbose: bool = False):
        super().__init__(llm, max_turns, verbose)
        self.system_name = "sokoban-react-agent"

    def get_system_message(self) -> str:
        return """You are playing Sokoban. Push all boxes (X) onto targets (O) to win.

RULES: Move/push in 4 directions. Cannot pull boxes or push into walls/boxes.

ACTIONS: "move up", "move down", "move left", "move right", "push up", "push down", "push left", "push right", "no operation"

SYMBOLS: # = wall, _ = empty, O = target, X = box, √ = box on target, P = you

STRATEGY: Analyze layout, plan moves, avoid getting boxes stuck in corners. Use PUSH actions when next to a box to move it.

Be concise and decisive. Always use the exact action names listed above."""

    def format_observation(self, obs: Dict[str, Any]) -> str:
        """Format observation for Sokoban."""
        parts = []

        if "room_text" in obs:
            parts.append(f"Board:\n{obs['room_text']}")

        if "boxes_on_target" in obs and "num_boxes" in obs:
            parts.append(f"Progress: {obs['boxes_on_target']}/{obs['num_boxes']} boxes on target")

        if "steps_taken" in obs and "max_steps" in obs:
            parts.append(f"Steps: {obs['steps_taken']}/{obs['max_steps']}")

        return "\n".join(parts)


# --- Episode Runner ---
async def run_single_episode(
    client: AsyncClient, agent: SokobanReActAgent, config: Dict, instance_num: int
) -> Dict[str, Any]:
    """Run a single Sokoban episode and return episode metrics."""
    try:
        # Create environment
        create_resp = await client.post(f"/env/Sokoban/initialize", json={"initial_state": config})

        if create_resp.status_code != 200:
            print(
                f" Instance {instance_num}: Failed to create environment - {create_resp.status_code}: {create_resp.text}"
            )
            return {"eval_metric": 0.0, "rubric": {}, "error": True}

        env_id = create_resp.json()["env_id"]

        # Get initial observation
        obs = create_resp.json()["observation"]
        formatted_obs = agent.format_observation(obs)

        # DEBUG: Print initial state
        print(f"\n Instance {instance_num}: Starting puzzle")
        print(f" Initial state:")
        print(f" {formatted_obs}")

        # Track episode metrics
        steps_taken = 0
        max_steps = config.get("max_steps", 120)

        # Run episode
        for turn in range(agent.max_turns):
            # Get agent decision
            action = await agent.decide(formatted_obs, agent.get_system_message(), turn)

            # DEBUG: Print agent decision
            print(
                f" Turn {turn + 1}: Agent chose '{action['parameters']['action']}' - {action['parameters'].get('reasoning', 'no reasoning')}"
            )

            # Check for termination
            if action["name"] == "terminate":
                print(
                    f" Agent terminated: {action['parameters'].get('reason', 'no reason given')}"
                )
                break

            # Execute action in environment
            action_name = action["parameters"]["action"]

            # Convert action string to integer (Sokoban expects integers)
            if action_name in ACTION_STRING_TO_INT:
                action_int = ACTION_STRING_TO_INT[action_name]
            else:
                print(f" ❌ Unknown action '{action_name}', using no-op")
                action_int = 0  # Default to "no operation"

            step_resp = await client.post(
                f"/env/Sokoban/step",
                json={
                    "env_id": env_id,
                    "request_id": str(uuid.uuid4()),
                    "action": {
                        "tool_calls": [{"tool": "interact", "args": {"action": action_int}}]
                    },
                },
            )

            if step_resp.status_code != 200:
                print(f" ❌ Step failed: {step_resp.status_code}: {step_resp.text}")
                break

            obs = step_resp.json()["observation"]
            formatted_obs = agent.format_observation(obs)

            # DEBUG: Print state after action
            print(f" After action:")
            print(f" {formatted_obs}")

            # Update history
            agent.history.append(f"{action_name}: {action['parameters'].get('reasoning', '')[:50]}")

            # Track steps
            steps_taken = obs.get("steps_taken", steps_taken + 1)

            # Check if game is won
            boxes_on_target = obs.get("boxes_on_target", 0)
            num_boxes = obs.get("num_boxes", 0)
            terminated = obs.get("terminated", False)

            if terminated and boxes_on_target == num_boxes:
                print(
                    f" ✅ Instance {instance_num}: SUCCESS! All boxes on target in {steps_taken} steps"
                )

                # Calculate eval metric and rubric
                eval_metric = 1.0

                # Create rubric - we'll estimate optimal solution as a fraction of max_steps
                # This is a rough estimate since we don't have actual optimal solutions
                estimated_optimal = max(num_boxes * 3, 10)  # Rough estimate
                step_efficiency = min(1.0, estimated_optimal / max(steps_taken, 1))

                rubric = {
                    "solved": 1.0,
                    "step_efficiency": step_efficiency,
                    "boxes_placed": float(boxes_on_target) / max(num_boxes, 1),
                    "completed_in_time": 1.0 if steps_taken <= max_steps else 0.0,
                }

                await client.post(f"/env/Sokoban/terminate", json={"env_id": env_id})
                return {
                    "eval_metric": eval_metric,
                    "rubric": rubric,
                    "steps_taken": steps_taken,
                    "boxes_on_target": boxes_on_target,
                    "num_boxes": num_boxes,
                    "solved": True,
                    "error": False,
                }

            if terminated:
                print(
                    f" ❌ Instance {instance_num}: Game terminated without success (boxes: {boxes_on_target}/{num_boxes})"
                )
                break

        print(f" ❌ Instance {instance_num}: Failed to solve in {agent.max_turns} turns")

        # Calculate eval metric and rubric for failed episode
        eval_metric = 0.0
        rubric = {
            "solved": 0.0,
            "step_efficiency": 0.0,
            "boxes_placed": float(boxes_on_target) / max(num_boxes, 1),
            "completed_in_time": 0.0,
        }

        # Cleanup
        await client.post(f"/env/Sokoban/terminate", json={"env_id": env_id})
        return {
            "eval_metric": eval_metric,
            "rubric": rubric,
            "steps_taken": steps_taken,
            "boxes_on_target": boxes_on_target,
            "num_boxes": num_boxes,
            "solved": False,
            "error": False,
        }

    except Exception as e:
        print(f" Instance {instance_num}: Error - {e}")
        import traceback

        traceback.print_exc()
        return {"eval_metric": 0.0, "rubric": {}, "error": True}


# --- Batch Evaluation ---
async def evaluate_sokoban_batch() -> Dict[str, Any]:
    """Evaluate Sokoban agent on multiple easy instances."""
    print(f"🎯 Evaluating Sokoban on {NUM_INSTANCES} easy instances...")

    llm = LM(model_name=MODEL_NAME, formatting_model_name=MODEL_NAME, temperature=0.0)

    # Get easy task instances using the taskset system
    from synth_ai.environments.examples.sokoban.taskset import create_task_instance_from_seed

    easy_task_instances = []
    task_debug_info = []

    for seed in range(NUM_INSTANCES):
        try:
            print(f" 🔍 Creating task instance for seed {seed}...")
            task_instance = await create_task_instance_from_seed(DIFFICULTY, seed)
            easy_task_instances.append(task_instance)

            # Extract debug info
            task_id = getattr(task_instance, "id", "unknown")
            metadata = getattr(task_instance, "metadata", {})
            initial_snapshot = getattr(task_instance, "initial_engine_snapshot", {})

            debug_info = {
                "seed": seed,
                "task_id": task_id,
                "metadata": metadata,
                "room_state_hash": hash(str(initial_snapshot.get("room_state", []))),
                "room_fixed_hash": hash(str(initial_snapshot.get("room_fixed", []))),
                "num_boxes": initial_snapshot.get("num_boxes", 0),
                "max_steps": initial_snapshot.get("max_steps", 0),
            }
            task_debug_info.append(debug_info)

            print(
                f" ✅ Seed {seed}: task_id={task_id}, room_state_hash={debug_info['room_state_hash']}"
            )

        except Exception as e:
            print(f" ⚠️ Failed to get task instance for seed {seed}: {e}")
            continue

    print(f" 📝 Generated {len(easy_task_instances)} {DIFFICULTY} task instances from seeds 0,1,2")

    # Print debug summary
    print(f" 🔍 Task Debug Summary:")
    for info in task_debug_info:
        print(
            f" Seed {info['seed']}: ID={info['task_id']}, StateHash={info['room_state_hash']}, FixedHash={info['room_fixed_hash']}"
        )

    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=30.0) as client:
        tasks = []
        for i, task_instance in enumerate(easy_task_instances):
            agent = SokobanReActAgent(llm, max_turns=MAX_TURNS, verbose=False)

            # Extract configuration from task instance
            config = {
                "dim_room": list(task_instance.metadata.dim_room),
                "max_steps": task_instance.metadata.max_steps,
                "num_boxes": task_instance.metadata.num_boxes,
                "room_fixed": task_instance.initial_engine_snapshot["room_fixed"],
                "room_state": task_instance.initial_engine_snapshot["room_state"],
                "boxes_on_target": task_instance.initial_engine_snapshot.get("boxes_on_target", 0),
            }

            tasks.append(run_single_episode(client, agent, config, i + 1))

        results = await asyncio.gather(*tasks)

    # Filter out error results
    valid_results = [r for r in results if not r.get("error", False)]

    if not valid_results:
        return {
            "eval_metrics": [],
            "mean_eval_metric": 0.0,
            "mean_rubric": {},
            "num_episodes": 0,
        }

    # Extract eval metrics and rubrics
    eval_metrics = [r["eval_metric"] for r in valid_results]
    mean_eval_metric = sum(eval_metrics) / len(eval_metrics)

    # Calculate mean rubric values
    all_rubric_keys = set()
    for r in valid_results:
        all_rubric_keys.update(r["rubric"].keys())

    mean_rubric = {}
    for key in all_rubric_keys:
        values = [r["rubric"].get(key, 0.0) for r in valid_results]
        mean_rubric[key] = sum(values) / len(values)

    return {
        "eval_metrics": eval_metrics,
        "mean_eval_metric": mean_eval_metric,
        "mean_rubric": mean_rubric,
        "num_episodes": len(valid_results),
    }


async def main():
    """Run Sokoban evaluation."""
    print(f"🎮 Sokoban ReAct Agent Evaluation")
    print(f"Model: {MODEL_NAME}")
    print(f"Service: {SERVICE_BASE_URL}")
    print(f"Instances: {NUM_INSTANCES}")
    print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 50)

    # Test service health
    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=10.0) as client:
        try:
            health_resp = await client.get("/health")
            health_data = health_resp.json()

            if "Sokoban" not in health_data.get("supported_environments", []):
                print("❌ Sokoban not available on service")
                return

            print("✅ Service health check passed")

        except Exception as e:
            print(f"❌ Service health check failed: {e}")
            return

    # Run evaluation
    try:
        results = await evaluate_sokoban_batch()

        print("\n" + "=" * 80)
        print("🏆 FINAL SOKOBAN EVALUATION RESULTS")
        print("=" * 80)

        # Print eval metrics
        print(f"📊 EVAL METRICS:")
        print(f" Episodes: {results['num_episodes']}")
        print(f" Individual Scores: {[f'{x:.1f}' for x in results['eval_metrics']]}")
        print(f" Mean Eval Metric: {results['mean_eval_metric']:.2f}")

        # Print rubric results
        print(f"\n🎯 RUBRIC RESULTS:")
        if results["mean_rubric"]:
            for metric, score in sorted(results["mean_rubric"].items()):
                print(f" {metric}: {score:.2f}")
        else:
            print(" No rubric data available")

        # Overall assessment
        print(f"\n🔍 ASSESSMENT:")
        if results["mean_eval_metric"] > 0.5:
            print("🎉 Excellent performance!")
        elif results["mean_eval_metric"] > 0.3:
            print("✅ Good performance!")
        elif results["mean_eval_metric"] > 0.1:
            print("⚠️ Moderate performance")
        else:
            print("❌ Poor performance - needs improvement")

    except Exception as e:
        print(f"❌ Evaluation failed: {e}")


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1 @@
# TBD
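The 498-line file above matches the +498 entry for synth_ai/environments/examples/sokoban/agent_demos/test_sokoban_react_agent.py in the listing, and it expects the environment service shipped under synth_ai/environments/service/ to be listening on port 8901. A hedged usage sketch follows; the module path is inferred from the file listing (there is no agent_demos/__init__.py listed for sokoban, so the import relies on namespace packages), while the constant names come directly from the reconstructed file.

# Hedged sketch, assuming the inferred module path is importable.
import asyncio

from synth_ai.environments.examples.sokoban.agent_demos import test_sokoban_react_agent as sokoban_eval

# The constants are read at call time, so overriding them before main() changes the run.
sokoban_eval.MODEL_NAME = "o4-mini"      # per the in-file comment: ~84% on ultra_easy
sokoban_eval.DIFFICULTY = "ultra_easy"
sokoban_eval.NUM_INSTANCES = 3

asyncio.run(sokoban_eval.main())         # requires the service at http://localhost:8901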