synth-ai 0.1.9__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synth_ai/__init__.py +28 -2
- synth_ai/core/system.py +4 -0
- synth_ai/environments/__init__.py +35 -0
- synth_ai/environments/environment/__init__.py +1 -0
- synth_ai/environments/environment/artifacts/__init__.py +1 -0
- synth_ai/environments/environment/artifacts/base.py +50 -0
- synth_ai/environments/environment/core.py +22 -0
- synth_ai/environments/environment/db/__init__.py +1 -0
- synth_ai/environments/environment/db/sqlite.py +45 -0
- synth_ai/environments/environment/registry.py +24 -0
- synth_ai/environments/environment/resources/sqlite.py +46 -0
- synth_ai/environments/environment/results.py +1 -0
- synth_ai/environments/environment/rewards/__init__.py +1 -0
- synth_ai/environments/environment/rewards/core.py +28 -0
- synth_ai/environments/environment/shared_engine.py +26 -0
- synth_ai/environments/environment/tools/__init__.py +34 -0
- synth_ai/environments/examples/__init__.py +1 -0
- synth_ai/environments/examples/crafter_classic/__init__.py +8 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +872 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/test_crafter_react_agent.py +1110 -0
- synth_ai/environments/examples/crafter_classic/config_logging.py +111 -0
- synth_ai/environments/examples/crafter_classic/engine.py +502 -0
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +63 -0
- synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +5 -0
- synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +74 -0
- synth_ai/environments/examples/crafter_classic/environment.py +255 -0
- synth_ai/environments/examples/crafter_classic/taskset.py +228 -0
- synth_ai/environments/examples/enron/agent_demos/test_synth_react.py +535 -0
- synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +156 -0
- synth_ai/environments/examples/enron/art_helpers/local_email_db.py +280 -0
- synth_ai/environments/examples/enron/art_helpers/types_enron.py +24 -0
- synth_ai/environments/examples/enron/engine.py +291 -0
- synth_ai/environments/examples/enron/environment.py +165 -0
- synth_ai/environments/examples/enron/taskset.py +112 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +111 -0
- synth_ai/environments/examples/enron/units/test_email_index.py +8 -0
- synth_ai/environments/examples/minigrid/__init__.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +47 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +220 -0
- synth_ai/environments/examples/minigrid/agent_demos/test_minigrid_react_agent.py +393 -0
- synth_ai/environments/examples/minigrid/engine.py +589 -0
- synth_ai/environments/examples/minigrid/environment.py +274 -0
- synth_ai/environments/examples/minigrid/environment_mapping.py +242 -0
- synth_ai/environments/examples/minigrid/puzzle_loader.py +416 -0
- synth_ai/environments/examples/minigrid/taskset.py +583 -0
- synth_ai/environments/examples/minigrid/units/test_action_behavior.py +226 -0
- synth_ai/environments/examples/minigrid/units/test_debug_messages.py +83 -0
- synth_ai/environments/examples/minigrid/units/test_exploration.py +120 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_engine.py +214 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_environment.py +238 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_environment_mapping.py +301 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_taskset.py +210 -0
- synth_ai/environments/examples/nethack/__init__.py +7 -0
- synth_ai/environments/examples/nethack/achievements.py +337 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +832 -0
- synth_ai/environments/examples/nethack/agent_demos/test_nethack_react_agent.py +1112 -0
- synth_ai/environments/examples/nethack/engine.py +738 -0
- synth_ai/environments/examples/nethack/environment.py +255 -0
- synth_ai/environments/examples/nethack/helpers/__init__.py +42 -0
- synth_ai/environments/examples/nethack/helpers/action_mapping.py +301 -0
- synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +401 -0
- synth_ai/environments/examples/nethack/helpers/observation_utils.py +433 -0
- synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +201 -0
- synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +268 -0
- synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +308 -0
- synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +430 -0
- synth_ai/environments/examples/nethack/taskset.py +323 -0
- synth_ai/environments/examples/nethack/units/test_nethack_engine.py +277 -0
- synth_ai/environments/examples/nethack/units/test_nethack_environment.py +281 -0
- synth_ai/environments/examples/nethack/units/test_nethack_taskset.py +213 -0
- synth_ai/environments/examples/nethack/units/test_recording.py +307 -0
- synth_ai/environments/examples/red/__init__.py +7 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/agent_demos/test_synth_react.py +1471 -0
- synth_ai/environments/examples/red/config_logging.py +110 -0
- synth_ai/environments/examples/red/engine.py +693 -0
- synth_ai/environments/examples/red/engine_helpers/__init__.py +1 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +28 -0
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +275 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +142 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +56 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +283 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +149 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +137 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +56 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +330 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +120 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +558 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +312 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +147 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +246 -0
- synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +367 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +139 -0
- synth_ai/environments/examples/red/environment.py +235 -0
- synth_ai/environments/examples/red/taskset.py +77 -0
- synth_ai/environments/examples/red/test_fixes.py +125 -0
- synth_ai/environments/examples/red/test_fixes_mock.py +148 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/red/units/test_basic_functionality.py +97 -0
- synth_ai/environments/examples/red/units/test_button_press_requirements.py +217 -0
- synth_ai/environments/examples/red/units/test_engine.py +192 -0
- synth_ai/environments/examples/red/units/test_environment.py +455 -0
- synth_ai/environments/examples/red/units/test_exploration_strategy.py +227 -0
- synth_ai/environments/examples/red/units/test_integration.py +217 -0
- synth_ai/environments/examples/red/units/test_memory_extraction.py +111 -0
- synth_ai/environments/examples/red/units/test_menu_bug_reproduction.py +1100 -0
- synth_ai/environments/examples/red/units/test_movement_debug.py +255 -0
- synth_ai/environments/examples/red/units/test_pokemon_mcts_debug.py +163 -0
- synth_ai/environments/examples/red/units/test_pokemon_mcts_verbose.py +117 -0
- synth_ai/environments/examples/red/units/test_red_basic.py +145 -0
- synth_ai/environments/examples/red/units/test_red_comprehensive.py +323 -0
- synth_ai/environments/examples/red/units/test_retry_movement.py +195 -0
- synth_ai/environments/examples/red/units/test_reward_components.py +186 -0
- synth_ai/environments/examples/red/units/test_rom_integration.py +260 -0
- synth_ai/environments/examples/red/units/test_taskset.py +116 -0
- synth_ai/environments/examples/red/units/test_tree.py +448 -0
- synth_ai/environments/examples/sokoban/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +900 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_dspy_react.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_sokoban_react_agent.py +498 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_synth_lats.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_locally.py +748 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_service.py +296 -0
- synth_ai/environments/examples/sokoban/engine.py +675 -0
- synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +656 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +17 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +3 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +129 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +370 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +331 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +305 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +66 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +114 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +122 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +394 -0
- synth_ai/environments/examples/sokoban/environment.py +228 -0
- synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +438 -0
- synth_ai/environments/examples/sokoban/puzzle_loader.py +311 -0
- synth_ai/environments/examples/sokoban/taskset.py +425 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +94 -0
- synth_ai/environments/examples/sokoban/units/test_building_task_set.py +49 -0
- synth_ai/environments/examples/sokoban/units/test_false_positive.py +120 -0
- synth_ai/environments/examples/sokoban/units/test_simple_run_through_environment.py +119 -0
- synth_ai/environments/examples/sokoban/units/test_sokoban_environment.py +98 -0
- synth_ai/environments/examples/sokoban/units/test_tree.py +364 -0
- synth_ai/environments/examples/tictactoe/__init__.py +1 -0
- synth_ai/environments/examples/tictactoe/agent_demos/test_synth_react.py +266 -0
- synth_ai/environments/examples/tictactoe/agent_demos/test_tictactoe_react_agent.py +470 -0
- synth_ai/environments/examples/tictactoe/engine.py +368 -0
- synth_ai/environments/examples/tictactoe/environment.py +239 -0
- synth_ai/environments/examples/tictactoe/taskset.py +214 -0
- synth_ai/environments/examples/tictactoe/units/test_tictactoe_engine.py +393 -0
- synth_ai/environments/examples/tictactoe/units/test_tictactoe_environment.py +493 -0
- synth_ai/environments/examples/tictactoe/units/test_tictactoe_taskset.py +191 -0
- synth_ai/environments/examples/verilog/__init__.py +10 -0
- synth_ai/environments/examples/verilog/agent_demos/test_synth_react.py +520 -0
- synth_ai/environments/examples/verilog/engine.py +328 -0
- synth_ai/environments/examples/verilog/environment.py +349 -0
- synth_ai/environments/examples/verilog/taskset.py +418 -0
- synth_ai/environments/examples/verilog/units/test_verilog_engine.py +466 -0
- synth_ai/environments/examples/verilog/units/test_verilog_environment.py +585 -0
- synth_ai/environments/examples/verilog/units/test_verilog_integration.py +383 -0
- synth_ai/environments/examples/verilog/units/test_verilog_taskset.py +457 -0
- synth_ai/environments/reproducibility/core.py +42 -0
- synth_ai/environments/reproducibility/tree.py +364 -0
- synth_ai/environments/service/app.py +78 -0
- synth_ai/environments/service/core_routes.py +775 -0
- synth_ai/environments/service/external_registry.py +57 -0
- synth_ai/environments/service/registry.py +9 -0
- synth_ai/environments/stateful/__init__.py +1 -0
- synth_ai/environments/stateful/core.py +28 -0
- synth_ai/environments/stateful/engine.py +21 -0
- synth_ai/environments/stateful/state.py +7 -0
- synth_ai/environments/tasks/api.py +19 -0
- synth_ai/environments/tasks/core.py +78 -0
- synth_ai/environments/tasks/filters.py +39 -0
- synth_ai/environments/tasks/utils.py +89 -0
- synth_ai/environments/v0_observability/history.py +3 -0
- synth_ai/environments/v0_observability/log.py +2 -0
- synth_ai/lm/caching/constants.py +1 -0
- synth_ai/{zyk/lms → lm}/caching/ephemeral.py +4 -8
- synth_ai/{zyk/lms → lm}/caching/handler.py +15 -15
- synth_ai/{zyk/lms → lm}/caching/initialize.py +2 -4
- synth_ai/{zyk/lms → lm}/caching/persistent.py +4 -10
- synth_ai/{zyk/lms → lm}/config.py +2 -1
- synth_ai/{zyk/lms → lm}/constants.py +2 -2
- synth_ai/{zyk/lms → lm}/core/all.py +10 -10
- synth_ai/{zyk/lms → lm}/core/main.py +57 -33
- synth_ai/{zyk/lms → lm}/core/vendor_clients.py +12 -10
- synth_ai/lm/cost/monitor.py +1 -0
- synth_ai/lm/cost/statefulness.py +1 -0
- synth_ai/lm/provider_support/__init__.py +8 -0
- synth_ai/lm/provider_support/anthropic.py +945 -0
- synth_ai/lm/provider_support/openai.py +1115 -0
- synth_ai/lm/provider_support/suppress_logging.py +31 -0
- synth_ai/{zyk/lms → lm}/structured_outputs/handler.py +58 -80
- synth_ai/{zyk/lms → lm}/structured_outputs/inject.py +6 -20
- synth_ai/{zyk/lms → lm}/structured_outputs/rehabilitate.py +6 -12
- synth_ai/{zyk/lms → lm}/vendors/core/anthropic_api.py +21 -30
- synth_ai/{zyk/lms → lm}/vendors/core/gemini_api.py +37 -32
- synth_ai/{zyk/lms → lm}/vendors/core/mistral_api.py +19 -28
- synth_ai/{zyk/lms → lm}/vendors/core/openai_api.py +26 -36
- synth_ai/{zyk/lms → lm}/vendors/openai_standard.py +29 -33
- synth_ai/{zyk/lms → lm}/vendors/retries.py +1 -1
- synth_ai/lm/vendors/supported/__init__.py +0 -0
- synth_ai/{zyk/lms → lm}/vendors/supported/custom_endpoint.py +131 -118
- synth_ai/{zyk/lms → lm}/vendors/supported/deepseek.py +4 -8
- synth_ai/{zyk/lms → lm}/vendors/supported/grok.py +6 -8
- synth_ai/{zyk/lms → lm}/vendors/supported/groq.py +1 -1
- synth_ai/{zyk/lms → lm}/vendors/supported/ollama.py +2 -2
- synth_ai/{zyk/lms → lm}/vendors/supported/openrouter.py +18 -16
- synth_ai/{zyk/lms → lm}/vendors/supported/together.py +1 -1
- synth_ai/tracing/__init__.py +0 -0
- synth_ai/tracing/abstractions.py +224 -0
- synth_ai/tracing/base_client.py +91 -0
- synth_ai/tracing/client_manager.py +131 -0
- synth_ai/tracing/config.py +140 -0
- synth_ai/tracing/context.py +146 -0
- synth_ai/tracing/decorators.py +679 -0
- synth_ai/tracing/events/__init__.py +0 -0
- synth_ai/tracing/events/manage.py +147 -0
- synth_ai/tracing/events/scope.py +86 -0
- synth_ai/tracing/events/store.py +227 -0
- synth_ai/tracing/immediate_client.py +152 -0
- synth_ai/tracing/local.py +18 -0
- synth_ai/tracing/log_client_base.py +74 -0
- synth_ai/tracing/retry_queue.py +187 -0
- synth_ai/tracing/trackers.py +515 -0
- synth_ai/tracing/upload.py +504 -0
- synth_ai/tracing/utils.py +9 -0
- synth_ai/zyk/__init__.py +28 -2
- synth_ai-0.2.1.dev0.dist-info/METADATA +349 -0
- synth_ai-0.2.1.dev0.dist-info/RECORD +261 -0
- synth_ai/zyk/lms/caching/constants.py +0 -1
- synth_ai/zyk/lms/cost/monitor.py +0 -1
- synth_ai/zyk/lms/cost/statefulness.py +0 -1
- synth_ai-0.1.9.dist-info/METADATA +0 -37
- synth_ai-0.1.9.dist-info/RECORD +0 -50
- /synth_ai/{zyk/lms/__init__.py → environments/reproducibility/helpers.py} +0 -0
- /synth_ai/{zyk/lms/caching → lm}/__init__.py +0 -0
- /synth_ai/{zyk/lms/core → lm/caching}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/caching/dbs.py +0 -0
- /synth_ai/{zyk/lms/cost → lm/core}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/core/exceptions.py +0 -0
- /synth_ai/{zyk/lms/structured_outputs → lm/cost}/__init__.py +0 -0
- /synth_ai/{zyk/lms/vendors → lm/structured_outputs}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/tools/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/tools/base.py +0 -0
- /synth_ai/{zyk/lms/vendors/core → lm/vendors}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/vendors/base.py +0 -0
- /synth_ai/{zyk/lms/vendors/local → lm/vendors/core}/__init__.py +0 -0
- /synth_ai/{zyk/lms/vendors/supported → lm/vendors/local}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/vendors/local/ollama.py +0 -0
- {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/WHEEL +0 -0
- {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,281 @@
|
|
1
|
+
"""Unit tests for NetHack environment."""
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
import asyncio
|
5
|
+
from uuid import uuid4
|
6
|
+
|
7
|
+
from synth_ai.environments.environment.tools import EnvToolCall
|
8
|
+
from synth_ai.environments.tasks.core import Impetus, Intent
|
9
|
+
|
10
|
+
from synth_ai.environments.examples.nethack.environment import (
|
11
|
+
NetHackEnvironment,
|
12
|
+
NetHackInteractTool,
|
13
|
+
)
|
14
|
+
from synth_ai.environments.examples.nethack.taskset import (
|
15
|
+
NetHackTaskInstanceMetadata,
|
16
|
+
NetHackTaskInstance,
|
17
|
+
)
|
18
|
+
|
19
|
+
|
20
|
+
class TestNetHackEnvironment:
    """Exercise the public surface of ``NetHackEnvironment``.

    Covers construction, initialization, stepping with several input
    shapes, checkpointing, termination, tool-call validation and the
    action catalogue helpers.
    """

    @pytest.fixture
    def mock_task_instance(self):
        """Build a beginner "knight" task instance shared by the tests."""
        impetus = Impetus(instructions="Test knight adventure")
        intent = Intent(
            rubric={"goal": "Reach depth 3"},
            gold_trajectories=None,
            gold_state_diff={},
        )
        task_metadata = NetHackTaskInstanceMetadata(
            character_role="knight",
            starting_level=1,
            target_depth=3,
            time_limit=500,
            difficulty="beginner",
            special_objectives=["Defeat 10 monsters"],
            seed=123,
        )
        return NetHackTaskInstance(
            id=uuid4(),
            impetus=impetus,
            intent=intent,
            metadata=task_metadata,
            is_reproducible=True,
            initial_engine_snapshot=None,
        )

    @pytest.mark.asyncio
    async def test_environment_initialization(self, mock_task_instance):
        """A fresh environment exposes its basics and a well-formed first observation."""
        environment = NetHackEnvironment(mock_task_instance)

        # Attributes available straight after construction.
        assert environment.name == "NetHack"
        assert environment.task_instance == mock_task_instance
        assert environment.engine is not None

        # The first observation must be a dict carrying the core keys.
        observation = await environment.initialize()
        assert isinstance(observation, dict)
        for expected_key in ("ascii_map", "message", "character_stats", "terminated"):
            assert expected_key in observation
        assert observation["terminated"] is False

    @pytest.mark.asyncio
    async def test_step_with_valid_action(self, mock_task_instance):
        """Plain string actions are echoed back and advance the turn counter."""
        environment = NetHackEnvironment(mock_task_instance)
        await environment.initialize()

        # First move: a bare string action.
        after_north = await environment.step("north")
        assert "last_action" in after_north
        assert after_north["last_action"] == "north"
        assert after_north["turn_count"] == 1

        # Second move: the counter keeps increasing.
        after_east = await environment.step("east")
        assert after_east["last_action"] == "east"
        assert after_east["turn_count"] == 2

    @pytest.mark.asyncio
    async def test_step_with_invalid_action(self, mock_task_instance):
        """An unrecognised action name is reported through the ``error`` key."""
        environment = NetHackEnvironment(mock_task_instance)
        await environment.initialize()

        observation = await environment.step("invalid_action_xyz")

        assert "error" in observation
        assert "Unknown action" in observation["error"]

    @pytest.mark.asyncio
    async def test_tool_call_formats(self, mock_task_instance):
        """``step`` accepts dict, ``EnvToolCall``, list and nested ``tool_calls`` inputs."""
        environment = NetHackEnvironment(mock_task_instance)
        await environment.initialize()

        # (payload handed to step, action expected to be echoed back)
        cases = (
            ({"action": "wait"}, "wait"),
            (EnvToolCall(tool="interact", args={"action": "search"}), "search"),
            ([{"action": "inventory"}], "inventory"),
            ({"tool_calls": [{"args": {"action": "look"}}]}, "look"),
        )
        for payload, expected_action in cases:
            observation = await environment.step(payload)
            assert observation["last_action"] == expected_action

    @pytest.mark.asyncio
    async def test_checkpoint(self, mock_task_instance):
        """A checkpoint taken mid-episode carries the summary fields."""
        environment = NetHackEnvironment(mock_task_instance)
        await environment.initialize()

        # Play a couple of moves so there is something to summarise.
        for move in ("north", "east"):
            await environment.step(move)

        snapshot = await environment.checkpoint()

        for summary_key in ("final_score", "max_depth", "turn_count_final", "total_reward"):
            assert summary_key in snapshot

    @pytest.mark.asyncio
    async def test_terminate(self, mock_task_instance):
        """Terminating flags the observation and includes score and reward."""
        environment = NetHackEnvironment(mock_task_instance)
        await environment.initialize()
        await environment.step("wait")

        closing_observation = await environment.terminate()

        assert closing_observation["terminated"] is True
        for summary_key in ("final_score", "total_reward"):
            assert summary_key in closing_observation

    @pytest.mark.asyncio
    async def test_validate_tool_calls_edge_cases(self, mock_task_instance):
        """``validate_tool_calls`` rejects bad input and unwraps nested argument keys."""
        environment = NetHackEnvironment(mock_task_instance)

        # An empty list is rejected.
        with pytest.raises(ValueError, match="Empty tool calls list"):
            environment.validate_tool_calls([])

        # A value that is not a supported container is rejected.
        with pytest.raises(ValueError, match="Invalid tool call format"):
            environment.validate_tool_calls(123)  # type: ignore[arg-type]  # Not a valid format

        # Arguments nested under "args" are unwrapped.
        from_args = environment.validate_tool_calls({"args": {"action": "north"}})
        assert from_args.args["action"] == "north"

        # Arguments nested under "parameters" are unwrapped too.
        from_parameters = environment.validate_tool_calls({"parameters": {"action": "south"}})
        assert from_parameters.args["action"] == "south"

    @pytest.mark.asyncio
    async def test_available_actions(self, mock_task_instance):
        """The action list and the description mapping contain known entries."""
        environment = NetHackEnvironment(mock_task_instance)

        available = environment.get_available_actions()
        assert isinstance(available, list)
        # "a" is a menu action according to the original test's comment.
        for action_name in ("north", "inventory", "a"):
            assert action_name in available

        described = environment.get_action_descriptions()
        assert isinstance(described, dict)
        assert described["north"] == "move north"
        assert described["inventory"] == "check inventory"
|
189
|
+
|
190
|
+
|
191
|
+
class TestNetHackInteractTool:
    """Exercise ``NetHackInteractTool`` directly against a ``NetHackEngine``."""

    @pytest.fixture
    def mock_task_instance(self):
        """Build a beginner "knight" task instance for the engine fixture."""
        impetus = Impetus(instructions="Test knight adventure")
        intent = Intent(
            rubric={"goal": "Test objectives"},
            gold_trajectories=None,
            gold_state_diff={},
        )
        task_metadata = NetHackTaskInstanceMetadata(
            character_role="knight",
            starting_level=1,
            target_depth=3,
            time_limit=500,
            difficulty="beginner",
            special_objectives=["Defeat 10 monsters"],
            seed=123,
        )
        return NetHackTaskInstance(
            id=uuid4(),
            impetus=impetus,
            intent=intent,
            metadata=task_metadata,
            is_reproducible=True,
            initial_engine_snapshot=None,
        )

    @pytest.fixture
    def mock_engine(self, mock_task_instance):
        """Return a ``NetHackEngine`` built from the task-instance fixture."""
        # Imported inside the fixture, as in the original test module.
        from synth_ai.environments.examples.nethack.engine import NetHackEngine

        engine = NetHackEngine(mock_task_instance)
        return engine

    @pytest.mark.asyncio
    async def test_interact_tool_valid_action(self, mock_engine):
        """A known action succeeds and returns both state objects."""
        await mock_engine._reset_engine()
        interact = NetHackInteractTool(mock_engine)

        outcome = await interact(EnvToolCall(tool="interact", args={"action": "wait"}))

        assert outcome.ok is True
        for state_key in ("public_state", "private_state"):
            assert state_key in outcome.payload
        assert outcome.payload["public_state"].last_action == "wait"

    @pytest.mark.asyncio
    async def test_interact_tool_no_action(self, mock_engine):
        """Calling without an ``action`` argument yields a failed result."""
        await mock_engine._reset_engine()
        interact = NetHackInteractTool(mock_engine)

        outcome = await interact(EnvToolCall(tool="interact", args={}))

        assert outcome.ok is False
        # Per the original author, a KeyError is caught and returned as a
        # string, so the quoted key name is expected in the error text.
        assert "'action'" in outcome.error

    @pytest.mark.asyncio
    async def test_interact_tool_invalid_action(self, mock_engine):
        """An unrecognised action name yields a failed result."""
        await mock_engine._reset_engine()
        interact = NetHackInteractTool(mock_engine)

        outcome = await interact(EnvToolCall(tool="interact", args={"action": "fly"}))

        assert outcome.ok is False
        assert "Unknown action" in outcome.error

    @pytest.mark.asyncio
    async def test_interact_tool_game_over_validation(self, mock_engine):
        """Once both states are flagged terminated, a non-quit action is refused."""
        await mock_engine._reset_engine()
        interact = NetHackInteractTool(mock_engine)

        # Force the game-over condition by hand on both state objects.
        mock_engine.public_state.terminated = True
        mock_engine.private_state.terminated = True

        # "north" is an ordinary movement, i.e. not a quit action.
        outcome = await interact(EnvToolCall(tool="interact", args={"action": "north"}))

        assert outcome.ok is False
        assert "Game is over" in outcome.error
|
@@ -0,0 +1,213 @@
|
|
1
|
+
"""Unit tests for NetHack taskset."""
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
import asyncio
|
5
|
+
|
6
|
+
from synth_ai.environments.examples.nethack.taskset import (
|
7
|
+
create_nethack_taskset,
|
8
|
+
NetHackTaskInstance,
|
9
|
+
NetHackTaskInstanceMetadata,
|
10
|
+
CHARACTER_ROLES,
|
11
|
+
SPECIAL_OBJECTIVES,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
class TestNetHackTaskSet:
|
16
|
+
"""Test cases for NetHack taskset generation."""
|
17
|
+
|
18
|
+
@pytest.mark.asyncio
|
19
|
+
async def test_taskset_creation(self):
    """The default taskset has the expected name, size and a defined split."""
    created = await create_nethack_taskset()

    assert created.name == "NetHack TaskSet"
    # 100 is the sum of all per-difficulty instance counts.
    assert len(created.instances) == 100
    assert created.split_info._is_split_defined is True
|
26
|
+
|
27
|
+
@pytest.mark.asyncio
|
28
|
+
async def test_task_instance_properties(self):
    """The first ten generated instances are well-typed and fully populated."""
    created = await create_nethack_taskset()
    known_difficulties = ("tutorial", "beginner", "intermediate", "advanced", "expert")

    # Only a prefix of the taskset is sampled.
    for task in created.instances[:10]:
        assert isinstance(task, NetHackTaskInstance)
        assert isinstance(task.metadata, NetHackTaskInstanceMetadata)

        # Fields every instance must carry.
        assert task.id is not None
        assert task.impetus.instructions != ""
        assert task.intent.rubric is not None
        assert task.is_reproducible is True

        # Metadata sanity checks.
        details = task.metadata
        assert details.character_role in CHARACTER_ROLES
        assert details.starting_level == 1
        assert details.target_depth > 0
        assert details.time_limit > 0
        assert details.difficulty in known_difficulties
        assert isinstance(details.special_objectives, list)
        assert details.seed is not None
|
57
|
+
|
58
|
+
@pytest.mark.asyncio
|
59
|
+
async def test_difficulty_distribution(self):
    """Each difficulty tier contributes its fixed number of instances."""
    created = await create_nethack_taskset()

    expected_counts = {
        "tutorial": 20,
        "beginner": 30,
        "intermediate": 25,
        "advanced": 15,
        "expert": 10,
    }

    # Start every known tier at zero; an unknown tier raises KeyError below.
    observed_counts = dict.fromkeys(expected_counts, 0)
    for task in created.instances:
        observed_counts[task.metadata.difficulty] += 1

    for tier, expected in expected_counts.items():
        assert observed_counts[tier] == expected
|
79
|
+
|
80
|
+
@pytest.mark.asyncio
|
81
|
+
async def test_character_role_assignment(self):
    """Tutorial tasks always use the tourist; expert tasks use several roles."""
    created = await create_nethack_taskset()

    # Every tutorial instance must be a tourist.
    for task in created.instances:
        if task.metadata.difficulty == "tutorial":
            assert task.metadata.character_role == "tourist"

    # Expert instances should span more than one distinct role.
    expert_roles = {
        task.metadata.character_role
        for task in created.instances
        if task.metadata.difficulty == "expert"
    }
    assert len(expert_roles) > 1
|
94
|
+
|
95
|
+
@pytest.mark.asyncio
|
96
|
+
async def test_objective_assignment(self):
    """Verify special objectives are valid and sized per difficulty.

    Each objective on each instance must appear in one of the
    ``SPECIAL_OBJECTIVES`` categories. The first tutorial instance must
    carry exactly one objective and the first expert instance exactly four.
    """
    taskset = await create_nethack_taskset()

    # Flatten every category into a single list of allowed objectives.
    allowed = [
        objective
        for group in SPECIAL_OBJECTIVES.values()
        for objective in group
    ]

    for task in taskset.instances:
        for objective in task.metadata.special_objectives:
            assert objective in allowed

    def first_at(level):
        # First instance in taskset order with the given difficulty.
        return next(
            task for task in taskset.instances
            if task.metadata.difficulty == level
        )

    assert len(first_at("tutorial").metadata.special_objectives) == 1

    assert len(first_at("expert").metadata.special_objectives) == 4
|
115
|
+
|
116
|
+
@pytest.mark.asyncio
|
117
|
+
async def test_instruction_content(self):
    """Verify the instruction text of the first five instances.

    The text must mention the instance's character role, target depth and
    time limit, and must contain each of the fixed section headings.
    """
    taskset = await create_nethack_taskset()

    section_headings = (
        "Additional objectives:",
        "Character strengths:",
        "Character weaknesses:",
        "Tips:",
    )

    # Only a small sample is inspected.
    for task in taskset.instances[:5]:
        text = task.impetus.instructions

        # Instance-specific values, rendered as text where numeric.
        assert task.metadata.character_role in text
        assert str(task.metadata.target_depth) in text
        assert str(task.metadata.time_limit) in text

        for heading in section_headings:
            assert heading in text
|
132
|
+
|
133
|
+
@pytest.mark.asyncio
|
134
|
+
async def test_rubric_structure(self):
    """Verify the layout of the intent rubric on the first five instances.

    The rubric must expose its top-level sections, both success-criteria
    tiers, and evaluation metrics that agree with the instance metadata.
    """
    taskset = await create_nethack_taskset()

    for task in taskset.instances[:5]:
        rubric = task.intent.rubric

        for section in ("goal", "success_criteria", "evaluation_metrics"):
            assert section in rubric

        # Both tiers of success criteria must be present.
        criteria = rubric["success_criteria"]
        for tier in ("primary", "secondary"):
            assert tier in criteria

        # Metrics must mirror the values stored in the metadata.
        metrics = rubric["evaluation_metrics"]
        meta = task.metadata
        assert metrics["depth_reached"] == meta.target_depth
        assert metrics["time_limit"] == meta.time_limit
        assert metrics["objectives_completed"] == len(meta.special_objectives)
|
154
|
+
|
155
|
+
@pytest.mark.asyncio
|
156
|
+
async def test_split_info(self):
    """Verify the validation/test split recorded on the taskset.

    Each split must hold exactly ``len(instances) // 10`` ids, the two
    splits must not share any id, and every id must belong to an instance
    in the taskset.
    """
    taskset = await create_nethack_taskset()

    split = taskset.split_info
    val_ids = split.val_instance_ids
    test_ids = split.test_instance_ids

    # Each split is expected to be one tenth of the taskset (floored).
    expected_size = len(taskset.instances) // 10
    assert len(val_ids) == expected_size
    assert len(test_ids) == expected_size

    # The two splits must be disjoint.
    assert not (val_ids & test_ids)

    # Every split id must refer to a real instance.
    known_ids = {task.id for task in taskset.instances}
    assert val_ids.issubset(known_ids)
    assert test_ids.issubset(known_ids)
|
175
|
+
|
176
|
+
@pytest.mark.asyncio
|
177
|
+
async def test_task_serialization(self):
    """Verify that a task instance survives a serialize/deserialize round trip.

    The first instance is serialized to a dict, the dict's layout and
    metadata values are checked, and the dict is then fed back through
    ``NetHackTaskInstance.deserialize`` to compare the restored metadata.
    """
    taskset = await create_nethack_taskset()
    source = taskset.instances[0]

    payload = await source.serialize()

    assert isinstance(payload, dict)
    for key in ("id", "impetus", "intent", "metadata"):
        assert key in payload

    # Serialized metadata must carry the same values as the live object.
    scalar_fields = ("character_role", "target_depth", "time_limit")
    payload_meta = payload["metadata"]
    for field in scalar_fields:
        assert payload_meta[field] == getattr(source.metadata, field)

    restored = await NetHackTaskInstance.deserialize(payload)

    # The restored instance must match on the scalars and the objectives.
    for field in scalar_fields + ("special_objectives",):
        assert getattr(restored.metadata, field) == getattr(source.metadata, field)
|
204
|
+
|
205
|
+
@pytest.mark.asyncio
|
206
|
+
async def test_reproducibility(self):
    """Verify every instance is flagged reproducible and has a usable seed."""
    taskset = await create_nethack_taskset()

    # Exclusive upper bound for a valid seed.
    seed_limit = 2**31

    for instance in taskset.instances:
        assert instance.is_reproducible is True
        seed = instance.metadata.seed
        assert seed is not None
        assert 0 <= seed < seed_limit
|