synth-ai 0.1.9__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synth_ai/__init__.py +28 -2
- synth_ai/core/system.py +4 -0
- synth_ai/environments/__init__.py +35 -0
- synth_ai/environments/environment/__init__.py +1 -0
- synth_ai/environments/environment/artifacts/__init__.py +1 -0
- synth_ai/environments/environment/artifacts/base.py +50 -0
- synth_ai/environments/environment/core.py +22 -0
- synth_ai/environments/environment/db/__init__.py +1 -0
- synth_ai/environments/environment/db/sqlite.py +45 -0
- synth_ai/environments/environment/registry.py +24 -0
- synth_ai/environments/environment/resources/sqlite.py +46 -0
- synth_ai/environments/environment/results.py +1 -0
- synth_ai/environments/environment/rewards/__init__.py +1 -0
- synth_ai/environments/environment/rewards/core.py +28 -0
- synth_ai/environments/environment/shared_engine.py +26 -0
- synth_ai/environments/environment/tools/__init__.py +34 -0
- synth_ai/environments/examples/__init__.py +1 -0
- synth_ai/environments/examples/crafter_classic/__init__.py +8 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +872 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/test_crafter_react_agent.py +1110 -0
- synth_ai/environments/examples/crafter_classic/config_logging.py +111 -0
- synth_ai/environments/examples/crafter_classic/engine.py +502 -0
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +63 -0
- synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +5 -0
- synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +74 -0
- synth_ai/environments/examples/crafter_classic/environment.py +255 -0
- synth_ai/environments/examples/crafter_classic/taskset.py +228 -0
- synth_ai/environments/examples/enron/agent_demos/test_synth_react.py +535 -0
- synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +156 -0
- synth_ai/environments/examples/enron/art_helpers/local_email_db.py +280 -0
- synth_ai/environments/examples/enron/art_helpers/types_enron.py +24 -0
- synth_ai/environments/examples/enron/engine.py +291 -0
- synth_ai/environments/examples/enron/environment.py +165 -0
- synth_ai/environments/examples/enron/taskset.py +112 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +111 -0
- synth_ai/environments/examples/enron/units/test_email_index.py +8 -0
- synth_ai/environments/examples/minigrid/__init__.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +47 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +220 -0
- synth_ai/environments/examples/minigrid/agent_demos/test_minigrid_react_agent.py +393 -0
- synth_ai/environments/examples/minigrid/engine.py +589 -0
- synth_ai/environments/examples/minigrid/environment.py +274 -0
- synth_ai/environments/examples/minigrid/environment_mapping.py +242 -0
- synth_ai/environments/examples/minigrid/puzzle_loader.py +416 -0
- synth_ai/environments/examples/minigrid/taskset.py +583 -0
- synth_ai/environments/examples/minigrid/units/test_action_behavior.py +226 -0
- synth_ai/environments/examples/minigrid/units/test_debug_messages.py +83 -0
- synth_ai/environments/examples/minigrid/units/test_exploration.py +120 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_engine.py +214 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_environment.py +238 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_environment_mapping.py +301 -0
- synth_ai/environments/examples/minigrid/units/test_minigrid_taskset.py +210 -0
- synth_ai/environments/examples/nethack/__init__.py +7 -0
- synth_ai/environments/examples/nethack/achievements.py +337 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +832 -0
- synth_ai/environments/examples/nethack/agent_demos/test_nethack_react_agent.py +1112 -0
- synth_ai/environments/examples/nethack/engine.py +738 -0
- synth_ai/environments/examples/nethack/environment.py +255 -0
- synth_ai/environments/examples/nethack/helpers/__init__.py +42 -0
- synth_ai/environments/examples/nethack/helpers/action_mapping.py +301 -0
- synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +401 -0
- synth_ai/environments/examples/nethack/helpers/observation_utils.py +433 -0
- synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +201 -0
- synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +268 -0
- synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +308 -0
- synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +430 -0
- synth_ai/environments/examples/nethack/taskset.py +323 -0
- synth_ai/environments/examples/nethack/units/test_nethack_engine.py +277 -0
- synth_ai/environments/examples/nethack/units/test_nethack_environment.py +281 -0
- synth_ai/environments/examples/nethack/units/test_nethack_taskset.py +213 -0
- synth_ai/environments/examples/nethack/units/test_recording.py +307 -0
- synth_ai/environments/examples/red/__init__.py +7 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/agent_demos/test_synth_react.py +1471 -0
- synth_ai/environments/examples/red/config_logging.py +110 -0
- synth_ai/environments/examples/red/engine.py +693 -0
- synth_ai/environments/examples/red/engine_helpers/__init__.py +1 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +28 -0
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +275 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +142 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +56 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +283 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +149 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +137 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +56 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +330 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +120 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +558 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +312 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +147 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +246 -0
- synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +367 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +139 -0
- synth_ai/environments/examples/red/environment.py +235 -0
- synth_ai/environments/examples/red/taskset.py +77 -0
- synth_ai/environments/examples/red/test_fixes.py +125 -0
- synth_ai/environments/examples/red/test_fixes_mock.py +148 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/red/units/test_basic_functionality.py +97 -0
- synth_ai/environments/examples/red/units/test_button_press_requirements.py +217 -0
- synth_ai/environments/examples/red/units/test_engine.py +192 -0
- synth_ai/environments/examples/red/units/test_environment.py +455 -0
- synth_ai/environments/examples/red/units/test_exploration_strategy.py +227 -0
- synth_ai/environments/examples/red/units/test_integration.py +217 -0
- synth_ai/environments/examples/red/units/test_memory_extraction.py +111 -0
- synth_ai/environments/examples/red/units/test_menu_bug_reproduction.py +1100 -0
- synth_ai/environments/examples/red/units/test_movement_debug.py +255 -0
- synth_ai/environments/examples/red/units/test_pokemon_mcts_debug.py +163 -0
- synth_ai/environments/examples/red/units/test_pokemon_mcts_verbose.py +117 -0
- synth_ai/environments/examples/red/units/test_red_basic.py +145 -0
- synth_ai/environments/examples/red/units/test_red_comprehensive.py +323 -0
- synth_ai/environments/examples/red/units/test_retry_movement.py +195 -0
- synth_ai/environments/examples/red/units/test_reward_components.py +186 -0
- synth_ai/environments/examples/red/units/test_rom_integration.py +260 -0
- synth_ai/environments/examples/red/units/test_taskset.py +116 -0
- synth_ai/environments/examples/red/units/test_tree.py +448 -0
- synth_ai/environments/examples/sokoban/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +900 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_dspy_react.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_sokoban_react_agent.py +498 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_synth_lats.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_locally.py +748 -0
- synth_ai/environments/examples/sokoban/agent_demos/test_synth_react_service.py +296 -0
- synth_ai/environments/examples/sokoban/engine.py +675 -0
- synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +656 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +17 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +3 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +129 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +370 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +331 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +305 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +66 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +114 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +122 -0
- synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +394 -0
- synth_ai/environments/examples/sokoban/environment.py +228 -0
- synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +438 -0
- synth_ai/environments/examples/sokoban/puzzle_loader.py +311 -0
- synth_ai/environments/examples/sokoban/taskset.py +425 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +94 -0
- synth_ai/environments/examples/sokoban/units/test_building_task_set.py +49 -0
- synth_ai/environments/examples/sokoban/units/test_false_positive.py +120 -0
- synth_ai/environments/examples/sokoban/units/test_simple_run_through_environment.py +119 -0
- synth_ai/environments/examples/sokoban/units/test_sokoban_environment.py +98 -0
- synth_ai/environments/examples/sokoban/units/test_tree.py +364 -0
- synth_ai/environments/examples/tictactoe/__init__.py +1 -0
- synth_ai/environments/examples/tictactoe/agent_demos/test_synth_react.py +266 -0
- synth_ai/environments/examples/tictactoe/agent_demos/test_tictactoe_react_agent.py +470 -0
- synth_ai/environments/examples/tictactoe/engine.py +368 -0
- synth_ai/environments/examples/tictactoe/environment.py +239 -0
- synth_ai/environments/examples/tictactoe/taskset.py +214 -0
- synth_ai/environments/examples/tictactoe/units/test_tictactoe_engine.py +393 -0
- synth_ai/environments/examples/tictactoe/units/test_tictactoe_environment.py +493 -0
- synth_ai/environments/examples/tictactoe/units/test_tictactoe_taskset.py +191 -0
- synth_ai/environments/examples/verilog/__init__.py +10 -0
- synth_ai/environments/examples/verilog/agent_demos/test_synth_react.py +520 -0
- synth_ai/environments/examples/verilog/engine.py +328 -0
- synth_ai/environments/examples/verilog/environment.py +349 -0
- synth_ai/environments/examples/verilog/taskset.py +418 -0
- synth_ai/environments/examples/verilog/units/test_verilog_engine.py +466 -0
- synth_ai/environments/examples/verilog/units/test_verilog_environment.py +585 -0
- synth_ai/environments/examples/verilog/units/test_verilog_integration.py +383 -0
- synth_ai/environments/examples/verilog/units/test_verilog_taskset.py +457 -0
- synth_ai/environments/reproducibility/core.py +42 -0
- synth_ai/environments/reproducibility/tree.py +364 -0
- synth_ai/environments/service/app.py +78 -0
- synth_ai/environments/service/core_routes.py +775 -0
- synth_ai/environments/service/external_registry.py +57 -0
- synth_ai/environments/service/registry.py +9 -0
- synth_ai/environments/stateful/__init__.py +1 -0
- synth_ai/environments/stateful/core.py +28 -0
- synth_ai/environments/stateful/engine.py +21 -0
- synth_ai/environments/stateful/state.py +7 -0
- synth_ai/environments/tasks/api.py +19 -0
- synth_ai/environments/tasks/core.py +78 -0
- synth_ai/environments/tasks/filters.py +39 -0
- synth_ai/environments/tasks/utils.py +89 -0
- synth_ai/environments/v0_observability/history.py +3 -0
- synth_ai/environments/v0_observability/log.py +2 -0
- synth_ai/lm/caching/constants.py +1 -0
- synth_ai/{zyk/lms → lm}/caching/ephemeral.py +4 -8
- synth_ai/{zyk/lms → lm}/caching/handler.py +15 -15
- synth_ai/{zyk/lms → lm}/caching/initialize.py +2 -4
- synth_ai/{zyk/lms → lm}/caching/persistent.py +4 -10
- synth_ai/{zyk/lms → lm}/config.py +2 -1
- synth_ai/{zyk/lms → lm}/constants.py +2 -2
- synth_ai/{zyk/lms → lm}/core/all.py +10 -10
- synth_ai/{zyk/lms → lm}/core/main.py +57 -33
- synth_ai/{zyk/lms → lm}/core/vendor_clients.py +12 -10
- synth_ai/lm/cost/monitor.py +1 -0
- synth_ai/lm/cost/statefulness.py +1 -0
- synth_ai/lm/provider_support/__init__.py +8 -0
- synth_ai/lm/provider_support/anthropic.py +945 -0
- synth_ai/lm/provider_support/openai.py +1115 -0
- synth_ai/lm/provider_support/suppress_logging.py +31 -0
- synth_ai/{zyk/lms → lm}/structured_outputs/handler.py +58 -80
- synth_ai/{zyk/lms → lm}/structured_outputs/inject.py +6 -20
- synth_ai/{zyk/lms → lm}/structured_outputs/rehabilitate.py +6 -12
- synth_ai/{zyk/lms → lm}/vendors/core/anthropic_api.py +21 -30
- synth_ai/{zyk/lms → lm}/vendors/core/gemini_api.py +37 -32
- synth_ai/{zyk/lms → lm}/vendors/core/mistral_api.py +19 -28
- synth_ai/{zyk/lms → lm}/vendors/core/openai_api.py +26 -36
- synth_ai/{zyk/lms → lm}/vendors/openai_standard.py +29 -33
- synth_ai/{zyk/lms → lm}/vendors/retries.py +1 -1
- synth_ai/lm/vendors/supported/__init__.py +0 -0
- synth_ai/{zyk/lms → lm}/vendors/supported/custom_endpoint.py +131 -118
- synth_ai/{zyk/lms → lm}/vendors/supported/deepseek.py +4 -8
- synth_ai/{zyk/lms → lm}/vendors/supported/grok.py +6 -8
- synth_ai/{zyk/lms → lm}/vendors/supported/groq.py +1 -1
- synth_ai/{zyk/lms → lm}/vendors/supported/ollama.py +2 -2
- synth_ai/{zyk/lms → lm}/vendors/supported/openrouter.py +18 -16
- synth_ai/{zyk/lms → lm}/vendors/supported/together.py +1 -1
- synth_ai/tracing/__init__.py +0 -0
- synth_ai/tracing/abstractions.py +224 -0
- synth_ai/tracing/base_client.py +91 -0
- synth_ai/tracing/client_manager.py +131 -0
- synth_ai/tracing/config.py +140 -0
- synth_ai/tracing/context.py +146 -0
- synth_ai/tracing/decorators.py +679 -0
- synth_ai/tracing/events/__init__.py +0 -0
- synth_ai/tracing/events/manage.py +147 -0
- synth_ai/tracing/events/scope.py +86 -0
- synth_ai/tracing/events/store.py +227 -0
- synth_ai/tracing/immediate_client.py +152 -0
- synth_ai/tracing/local.py +18 -0
- synth_ai/tracing/log_client_base.py +74 -0
- synth_ai/tracing/retry_queue.py +187 -0
- synth_ai/tracing/trackers.py +515 -0
- synth_ai/tracing/upload.py +504 -0
- synth_ai/tracing/utils.py +9 -0
- synth_ai/zyk/__init__.py +28 -2
- synth_ai-0.2.1.dev0.dist-info/METADATA +349 -0
- synth_ai-0.2.1.dev0.dist-info/RECORD +261 -0
- synth_ai/zyk/lms/caching/constants.py +0 -1
- synth_ai/zyk/lms/cost/monitor.py +0 -1
- synth_ai/zyk/lms/cost/statefulness.py +0 -1
- synth_ai-0.1.9.dist-info/METADATA +0 -37
- synth_ai-0.1.9.dist-info/RECORD +0 -50
- /synth_ai/{zyk/lms/__init__.py → environments/reproducibility/helpers.py} +0 -0
- /synth_ai/{zyk/lms/caching → lm}/__init__.py +0 -0
- /synth_ai/{zyk/lms/core → lm/caching}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/caching/dbs.py +0 -0
- /synth_ai/{zyk/lms/cost → lm/core}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/core/exceptions.py +0 -0
- /synth_ai/{zyk/lms/structured_outputs → lm/cost}/__init__.py +0 -0
- /synth_ai/{zyk/lms/vendors → lm/structured_outputs}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/tools/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/tools/base.py +0 -0
- /synth_ai/{zyk/lms/vendors/core → lm/vendors}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/vendors/base.py +0 -0
- /synth_ai/{zyk/lms/vendors/local → lm/vendors/core}/__init__.py +0 -0
- /synth_ai/{zyk/lms/vendors/supported → lm/vendors/local}/__init__.py +0 -0
- /synth_ai/{zyk/lms → lm}/vendors/local/ollama.py +0 -0
- {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/WHEEL +0 -0
- {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.1.9.dist-info → synth_ai-0.2.1.dev0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,220 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Simple MiniGrid evaluation script to generate traces.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import asyncio
|
7
|
+
import json
|
8
|
+
import os
|
9
|
+
import sys
|
10
|
+
import uuid
|
11
|
+
from datetime import datetime
|
12
|
+
from pathlib import Path
|
13
|
+
|
14
|
+
# Add parent directories to path
|
15
|
+
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent.parent))
|
16
|
+
|
17
|
+
import gymnasium as gym
|
18
|
+
import minigrid
|
19
|
+
from minigrid.wrappers import ImgObsWrapper, RGBImgPartialObsWrapper
|
20
|
+
import numpy as np
|
21
|
+
import base64
|
22
|
+
from PIL import Image
|
23
|
+
import io
|
24
|
+
|
25
|
+
|
26
|
+
# Environment setup
|
27
|
+
def create_minigrid_env(env_name="MiniGrid-Empty-6x6-v0"):
    """Build a MiniGrid environment that produces image observations.

    Args:
        env_name: Environment id handed to ``gym.make``.

    Returns:
        The environment wrapped in ``RGBImgPartialObsWrapper`` and then in
        ``ImgObsWrapper``.
    """
    base_env = gym.make(env_name)
    # Wrapping order is significant: RGBImgPartialObsWrapper is applied first
    # (innermost) and ImgObsWrapper second (outermost).
    return ImgObsWrapper(RGBImgPartialObsWrapper(base_env))
|
34
|
+
|
35
|
+
|
36
|
+
def image_to_base64(image_array):
    """Encode a numpy image array as a base64 PNG string.

    Args:
        image_array: Numpy array holding the image; it is cast to ``uint8``
            before being handed to PIL.

    Returns:
        The PNG-encoded image as base64 text (UTF-8 decoded).
    """
    pixels = image_array.astype(np.uint8)
    png_buffer = io.BytesIO()
    Image.fromarray(pixels).save(png_buffer, format="PNG")
    # getvalue() yields the full buffer contents, i.e. the same bytes as
    # rewinding with seek(0) and calling read().
    png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")
|
47
|
+
|
48
|
+
|
49
|
+
def get_action_name(action_idx):
    """Translate an action index into its human-readable name.

    Args:
        action_idx: Action index; 0 through 6 have known names.

    Returns:
        The matching action name, or ``"action_<idx>"`` when the index is not
        one of the known ones.
    """
    known_names = dict(
        enumerate(("left", "right", "forward", "pickup", "drop", "toggle", "done"))
    )
    try:
        return known_names[action_idx]
    except KeyError:
        # Unknown indices get a synthetic placeholder name.
        return f"action_{action_idx}"
|
61
|
+
|
62
|
+
|
63
|
+
def _build_step_partition(env, obs, action, reward, terminated, truncated):
    """Package a single environment step as one trace partition dict."""
    # Observations are expected to be bare image arrays; fall back to the
    # "image" entry when a dict observation is returned instead.
    frame = obs if isinstance(obs, np.ndarray) else obs["image"]
    step_output = {
        "outputs": {
            "observation": {
                "mission": getattr(env.unwrapped, "mission", "Reach the goal"),
                "image_base64": image_to_base64(frame),
            },
            # Coerce to builtin types: a sampled action (and possibly the step
            # flags) can be numpy scalars, which json.dump cannot serialize.
            "action": int(action),
            "reward": float(reward),
            "terminated": bool(terminated),
            "truncated": bool(truncated),
        }
    }
    return {
        "events": [
            {"environment_compute_steps": [{"compute_output": [step_output]}]}
        ]
    }


async def run_simple_minigrid_eval(
    model_name="simple-agent",
    env_name="MiniGrid-Empty-6x6-v0",
    num_episodes=3,
    max_steps=50,
):
    """Run a simple evaluation to generate MiniGrid traces.

    Every episode is played by a random policy biased towards moving forward.
    One JSON trace file is written per episode, plus an
    ``evaluation_summary.json`` for the whole run, under
    ``src/evals/minigrid/run_<unix time>/``.

    The coroutine performs no awaits; it stays ``async`` so existing callers
    can keep driving it with ``asyncio.run``.

    Args:
        model_name: Label stored in the trace metadata and in the summary.
        env_name: Environment id used to create each episode's environment.
        num_episodes: Number of episodes to play.
        max_steps: Upper bound on the steps taken in one episode.

    Returns:
        The summary dict that was written to ``evaluation_summary.json``.
    """
    print("\n🎮 Running MiniGrid Evaluation")
    print(f" Environment: {env_name}")
    print(f" Episodes: {num_episodes}")
    print(f" Max steps: {max_steps}")

    # Read the clock once so the timestamp and the run id always agree.
    started_at = datetime.now()
    timestamp = started_at.strftime("%Y%m%d_%H%M%S")
    run_id = f"run_{int(started_at.timestamp())}"
    output_dir = Path(f"src/evals/minigrid/{run_id}")
    traces_dir = output_dir / "traces"
    traces_dir.mkdir(parents=True, exist_ok=True)

    results = []

    for episode in range(num_episodes):
        print(f"\n📍 Episode {episode + 1}/{num_episodes}")

        trace_id = str(uuid.uuid4())
        trace_data = {
            "trace": {
                "metadata": {
                    "model_name": model_name,
                    "env_name": env_name,
                    "difficulty": "easy",
                    "seed": episode,
                    "max_steps": max_steps,
                },
                "partition": [],
            },
            "dataset": {"reward_signals": []},
        }

        total_reward = 0.0
        # Pre-bind so the post-loop checks work even if no step is taken
        # (e.g. max_steps <= 0).
        reward = 0.0
        done = False
        step = 0

        env = create_minigrid_env(env_name)
        try:
            # Seed the reset so the "seed" recorded in the metadata is the one
            # actually used by the environment. The action policy below is
            # still unseeded.
            obs, info = env.reset(seed=episode)

            while not done and step < max_steps:
                # Simple policy: random actions with bias towards forward
                if np.random.random() < 0.6:
                    action = 2  # forward
                else:
                    action = env.action_space.sample()

                next_obs, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                total_reward += reward

                # The partition pairs the observation seen *before* the step
                # with the action taken and the resulting reward/flags.
                trace_data["trace"]["partition"].append(
                    _build_step_partition(env, obs, action, reward, terminated, truncated)
                )

                obs = next_obs
                step += 1
        finally:
            # Always release the environment, even when a step raises.
            env.close()

        # Success is judged from the reward of the final step taken.
        success = bool(reward > 0)

        if done and success:
            print(f" ✅ Success! Reached goal in {step} steps")

        if not done:
            print(f" ⏰ Timeout after {step} steps")

        # Update trace metadata
        trace_data["trace"]["metadata"]["success"] = success
        trace_data["trace"]["metadata"]["num_steps"] = step
        trace_data["dataset"]["reward_signals"].append({"reward": float(total_reward)})

        # Save trace
        trace_file = traces_dir / f"minigrid_trace_{trace_id}.json"
        with open(trace_file, "w", encoding="utf-8") as f:
            json.dump(trace_data, f, indent=2)

        results.append(
            {
                "trace_id": trace_id,
                "success": success,
                "steps": step,
                "total_reward": float(total_reward),
            }
        )

        print(f" 💾 Saved trace: {trace_file.name}")

    # Guard the averages so an empty run (num_episodes == 0) reports zeros
    # instead of raising ZeroDivisionError.
    episode_count = len(results)
    if episode_count:
        success_rate = sum(1 for r in results if r["success"]) / episode_count
        avg_steps = sum(r["steps"] for r in results) / episode_count
    else:
        success_rate = 0.0
        avg_steps = 0.0

    # Save evaluation summary
    summary = {
        "run_id": run_id,
        "timestamp": timestamp,
        "environment": env_name,
        "model_name": model_name,
        "num_episodes": num_episodes,
        "results": results,
        "success_rate": success_rate,
        "avg_steps": avg_steps,
        "models_evaluated": [model_name],
        "difficulties_evaluated": ["easy"],
    }

    summary_file = output_dir / "evaluation_summary.json"
    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print("\n✅ Evaluation complete!")
    print(f" Success rate: {summary['success_rate']:.1%}")
    print(f" Average steps: {summary['avg_steps']:.1f}")
    print(f" Output directory: {output_dir}")

    return summary
|
214
|
+
|
215
|
+
|
216
|
+
if __name__ == "__main__":
    # Script entry point: play three short episodes (at most 30 steps each)
    # on the empty 6x6 grid.
    demo_settings = {
        "env_name": "MiniGrid-Empty-6x6-v0",
        "num_episodes": 3,
        "max_steps": 30,
    }
    asyncio.run(run_simple_minigrid_eval(**demo_settings))
|
@@ -0,0 +1,393 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Test script to run ReAct agents against MiniGrid environment on synth service (port 8901)
|
4
|
+
Tests on multiple easy MiniGrid instances with enhanced debugging
|
5
|
+
"""
|
6
|
+
|
7
|
+
import asyncio
|
8
|
+
import json
|
9
|
+
import uuid
|
10
|
+
from datetime import datetime
|
11
|
+
from typing import Dict, Any, Optional, List
|
12
|
+
from pydantic import BaseModel, Field
|
13
|
+
from httpx import AsyncClient
|
14
|
+
import sys
|
15
|
+
import os
|
16
|
+
|
17
|
+
# Add the src directory to the path
|
18
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
|
19
|
+
|
20
|
+
from synth_ai.zyk import LM
|
21
|
+
from synth_ai.zyk.lms.tools.base import BaseTool
|
22
|
+
|
23
|
+
|
24
|
+
# --- Service Configuration ---
|
25
|
+
# Base URL of the synth environment service the episodes run against.
SERVICE_BASE_URL: str = "http://localhost:8901"
# Model identifier; presumably passed to the LM constructor (usage is outside
# this view - confirm).
MODEL_NAME: str = "o3"
# Presumably the number of task instances to evaluate - confirm against the
# driver code.
NUM_INSTANCES: int = 1
# Presumably the per-episode turn budget given to the agent - confirm.
MAX_TURNS: int = 20
# Presumably the difficulty label used to select task instances - confirm.
DIFFICULTY: str = "ultra_easy"
|
30
|
+
|
31
|
+
|
32
|
+
# --- Tool Definitions ---
|
33
|
+
class NavigationActionArgs(BaseModel):
    """Arguments for navigation actions."""

    # NOTE(review): the class docstring and the Field descriptions are left
    # verbatim because pydantic may surface them in the generated schema.

    # Required: which action to perform.
    action: str = Field(
        ...,
        description="The action to take: left, right, forward, pickup, drop, toggle, done",
    )
    # Required: the stated rationale for the chosen action.
    reasoning: str = Field(
        ...,
        description="Brief explanation of why this action was chosen",
    )
|
40
|
+
|
41
|
+
|
42
|
+
class TerminateArgs(BaseModel):
    """Arguments for termination."""

    # NOTE(review): the class docstring and the Field description are left
    # verbatim because pydantic may surface them in the generated schema.

    # Required: why the episode is being ended.
    reason: str = Field(
        ...,
        description="Reason for termination",
    )
|
46
|
+
|
47
|
+
|
48
|
+
class NavigationActionTool(BaseTool):
    """Tool for performing an action in the MiniGrid environment."""

    # Tool name; the agent's fallback decision uses this same identifier.
    name: str = "navigation_action"
    # Model class describing the arguments this tool accepts.
    arguments: type[BaseModel] = NavigationActionArgs
    # Text describing the tool's purpose.
    description: str = "Perform a navigation action in the MiniGrid environment."
|
54
|
+
|
55
|
+
|
56
|
+
class TerminateTool(BaseTool):
    """Tool to terminate the episode."""

    # Tool name.
    name: str = "terminate"
    # Model class describing the arguments this tool accepts.
    arguments: type[BaseModel] = TerminateArgs
    # Text describing the tool's purpose.
    description: str = "End the episode when finished or no progress can be made."
|
62
|
+
|
63
|
+
|
64
|
+
# --- Base ReAct Agent ---
|
65
|
+
class BaseReActAgent:
    """Base ReAct agent that asks an LLM for one tool call per turn.

    Attributes:
        llm: Language model used to produce decisions.
        max_turns: Turn budget; shown to the model in each prompt.
        verbose: When true, print a warning if the model returns no tool call.
        history: List initialised empty; not populated by this class.
        system_name: Identifier for the agent; subclasses may override it.
        tools: Tool instances offered to the model on every ``decide`` call.
    """

    def __init__(self, llm: LM, max_turns: int = 15, verbose: bool = False):
        """Store the configuration and build the tool list.

        Args:
            llm: Language model used to produce decisions.
            max_turns: Turn budget reported to the model.
            verbose: Whether to print diagnostic warnings.
        """
        self.llm = llm
        self.max_turns = max_turns
        self.verbose = verbose
        self.history: List[Any] = []
        self.system_name = "base-react-agent"

        # Define tools in OpenAI format (like Enron agent)
        self.tools = [
            NavigationActionTool(),
            TerminateTool(),
        ]

    async def decide(self, obs: str, system_message: str, turn: int) -> Dict[str, Any]:
        """Get agent decision based on observation.

        Args:
            obs: Formatted observation text for the current turn.
            system_message: System prompt sent to the model.
            turn: Zero-based turn index (displayed one-based in the prompt).

        Returns:
            A dict with ``"name"`` (the tool name) and ``"parameters"`` (the
            decoded tool arguments). When the model returns no tool call, a
            default ``navigation_action`` moving ``forward`` is returned.

        Raises:
            json.JSONDecodeError: If the tool arguments are a string that is
                not valid JSON.
        """
        # Create conversation context
        context = f"Turn {turn + 1}/{self.max_turns}\n\n{obs}"

        # Generate response using LLM (same pattern as Crafter)
        response_obj = await self.llm.respond_async(
            system_message=system_message, user_message=context, tools=self.tools
        )

        tool_calls = response_obj.tool_calls

        # Handle case where tool_calls is None or empty (graceful fallback)
        if not tool_calls:
            if self.verbose:
                print("[WARNING] No tool calls returned by LLM, using default action")
            return {
                "name": "navigation_action",
                "parameters": {
                    "action": "forward",
                    "reasoning": "Default action - no tool call received",
                },
            }

        # Only the first tool call is acted on; any further ones are ignored.
        tool_call_data = tool_calls[0]

        # Handle both dict and object formats (same as Crafter)
        if isinstance(tool_call_data, dict):
            tool_name = tool_call_data["function"]["name"]
            raw_arguments = tool_call_data["function"]["arguments"]
        else:
            tool_name = tool_call_data.function.name
            raw_arguments = tool_call_data.function.arguments

        # Arguments normally arrive as a JSON string, but tolerate a provider
        # that hands back an already-decoded dict instead of raising TypeError.
        if isinstance(raw_arguments, dict):
            tool_arguments = raw_arguments
        else:
            tool_arguments = json.loads(raw_arguments)

        return {"name": tool_name, "parameters": tool_arguments}
|
118
|
+
|
119
|
+
|
120
|
+
# --- MiniGrid ReAct Agent ---
|
121
|
+
class MiniGridReActAgent(BaseReActAgent):
    """ReAct agent specialised for the MiniGrid environment."""

    def __init__(self, llm: LM, max_turns: int = 15, verbose: bool = False):
        """Initialise the base agent, then set the MiniGrid-specific name."""
        super().__init__(llm, max_turns=max_turns, verbose=verbose)
        self.system_name = "minigrid-react-agent"

    def get_system_message(self) -> str:
        """Return the fixed system prompt describing actions, symbols and strategy."""
        prompt = """You are navigating a MiniGrid environment. Your goal is to reach the goal (G) to complete the mission successfully.

ACTIONS:
- "left": turn left (counter-clockwise)
- "right": turn right (clockwise)
- "forward": move forward one step
- "pickup": pick up object in front of you
- "drop": drop carried object
- "toggle": open/close door or interact with object
- "done": complete mission when you reach the goal

SYMBOLS:
- # = wall (blocks movement)
- . = empty space (can move through)
- G = goal (your destination)
- K = key (pick up to unlock doors)
- D = door (may need key to open)
- L = lava (avoid - will end mission)
- @ = you (your current position)

STRATEGY:
1. Analyze the grid layout to understand the environment
2. Plan a path to reach the goal (G)
3. Navigate systematically - turn to face the right direction, then move forward
4. Pick up keys (K) before trying to open doors (D)
5. Use "toggle" to open doors when you have the key
6. Avoid lava (L) at all costs
7. Use "done" when you reach the goal

IMPORTANT: You can only see a limited view around you. Move and explore to discover the full environment. Be systematic in your exploration."""
        return prompt

    def format_observation(self, obs: Dict[str, Any]) -> str:
        """Render an observation dict as newline-separated text for the model.

        Args:
            obs: Observation mapping; only the keys recognised below are used.

        Returns:
            One line group per recognised key, joined with newlines, or a
            fixed placeholder sentence when no recognised key is present.
        """
        lines = []

        # The grid rendering takes precedence over a generic "observation".
        if "grid" in obs:
            lines.append(f"Grid view:\n{obs['grid']}")
        elif "observation" in obs:
            lines.append(f"Observation:\n{obs['observation']}")

        # (key, label) pairs, emitted in this order when the key is present.
        labelled_fields = (
            ("direction", "Facing"),
            ("carrying", "Carrying"),
            ("step_count", "Steps"),
            ("mission", "Mission"),
            ("terminated", "Terminated"),
            ("success", "Success"),
            ("reward_last", "Last reward"),
        )
        for key, label in labelled_fields:
            if key not in obs:
                continue
            # "carrying" is only reported when it holds a truthy value.
            if key == "carrying" and not obs[key]:
                continue
            lines.append(f"{label}: {obs[key]}")

        if not lines:
            return "No formatted observation available"
        return "\n".join(lines)
|
192
|
+
|
193
|
+
|
194
|
+
# --- Episode Runner ---
|
195
|
+
async def run_single_episode(
    client: AsyncClient, agent: MiniGridReActAgent, task_instance, instance_num: int
) -> bool:
    """Run a single MiniGrid episode and return success status.

    Args:
        client: HTTP client used to POST to the ``/env/MiniGrid/*`` endpoints.
        agent: Agent that formats observations and decides each action.
        task_instance: Task whose ``serialize()`` result is sent to the
            initialize endpoint; its metadata/impetus are printed for debugging.
        instance_num: Label used in the printed progress messages.

    Returns:
        ``True`` only when a step observation reports ``terminated`` together
        with a ``reward_last`` above 0.1. ``False`` in every other case,
        including HTTP failures and any exception raised along the way.
    """
    # Set once the environment exists so the ``finally`` block knows whether
    # there is anything to tear down on the service.
    env_id = None
    try:
        # Create environment using the task instance
        create_resp = await client.post(
            "/env/MiniGrid/initialize", json={"task_instance": await task_instance.serialize()}
        )

        if create_resp.status_code != 200:
            print(
                f" Instance {instance_num}: Failed to create environment - {create_resp.status_code}: {create_resp.text}"
            )
            return False

        # Parse the response body once and reuse it.
        create_data = create_resp.json()
        env_id = create_data["env_id"]

        # Get initial observation
        obs = create_data["observation"]
        formatted_obs = agent.format_observation(obs)

        # DEBUG: Print initial state
        print(f"\n Instance {instance_num}: Starting MiniGrid mission")
        print(f" Environment: {task_instance.metadata.env_name}")
        print(f" Mission: {task_instance.impetus.instructions[:100]}...")
        print(" Initial observation:")
        print(f" {formatted_obs}")

        # Run episode
        for turn in range(agent.max_turns):
            # Get agent decision
            action = await agent.decide(formatted_obs, agent.get_system_message(), turn)
            params = action["parameters"]

            # DEBUG: Print agent decision. A "terminate" call may carry only a
            # "reason", so fall back to the tool name instead of raising
            # KeyError before the termination check below can run.
            print(
                f" Turn {turn + 1}: Agent chose '{params.get('action', action['name'])}' - {params.get('reasoning', 'no reasoning')}"
            )

            # Check for termination
            if action["name"] == "terminate":
                print(f" Agent terminated: {params.get('reason', 'no reason given')}")
                break

            # Execute action in environment
            action_name = params["action"]

            step_resp = await client.post(
                "/env/MiniGrid/step",
                json={
                    "env_id": env_id,
                    "request_id": str(uuid.uuid4()),
                    "action": {
                        "tool_calls": [{"tool": "minigrid_act", "args": {"action": action_name}}]
                    },
                },
            )

            if step_resp.status_code != 200:
                print(f" ❌ Step failed: {step_resp.status_code}: {step_resp.text}")
                break

            obs = step_resp.json()["observation"]
            formatted_obs = agent.format_observation(obs)

            # DEBUG: Print state after action
            print(f" After action: {formatted_obs}")

            # Update history
            agent.history.append(f"{action_name}: {params.get('reasoning', '')[:50]}")

            # Check if goal is reached
            terminated = obs.get("terminated", False)
            success = obs.get("success", False)
            reward_last = obs.get("reward_last", 0.0)

            # MiniGrid success is typically indicated by positive reward when terminated
            # Success reward is usually close to 1.0 (1.0 - step_penalties)
            actual_success = terminated and reward_last > 0.1  # Threshold for success reward

            if actual_success:
                print(
                    f" ✅ Instance {instance_num}: SUCCESS! Mission completed in {turn + 1} turns (reward: {reward_last:.3f})"
                )
                return True

            if terminated:
                print(
                    f" ❌ Instance {instance_num}: Terminated without success (success field: {success}, reward: {reward_last:.3f})"
                )
                break
        else:
            # Only reached when the turn budget is exhausted; every ``break``
            # above has already printed its own, more specific reason.
            print(
                f" ❌ Instance {instance_num}: Failed to complete mission in {agent.max_turns} turns"
            )

        return False

    except Exception as e:
        print(f" Instance {instance_num}: Error - {e}")
        import traceback

        traceback.print_exc()
        return False

    finally:
        # Cleanup: always release the environment once it was created, on
        # success, failure and error alike, so it is not left running on the
        # service. A cleanup failure must not change the episode result.
        if env_id is not None:
            try:
                await client.post("/env/MiniGrid/terminate", json={"env_id": env_id})
            except Exception as cleanup_err:
                print(f" Instance {instance_num}: Cleanup failed - {cleanup_err}")
|
304
|
+
|
305
|
+
|
306
|
+
# --- Batch Evaluation ---
|
307
|
+
async def evaluate_minigrid_batch() -> float:
    """Evaluate MiniGrid agent on multiple easy instances.

    Builds one task instance per seed in ``range(NUM_INSTANCES)`` (seeds that
    fail to generate are reported and skipped), runs one episode per instance
    concurrently against ``SERVICE_BASE_URL``, and prints a summary.

    Returns:
        Fraction of generated instances that were solved, or ``0.0`` when no
        task instance could be generated at all.
    """
    print(f"🎯 Evaluating MiniGrid on {NUM_INSTANCES} easy instances...")

    llm = LM(model_name=MODEL_NAME, formatting_model_name=MODEL_NAME, temperature=0.0)

    # Get easy task instances using the taskset system
    from synth_ai.environments.examples.minigrid.taskset import create_minigrid_task_from_seed

    easy_task_instances = []
    for seed in range(NUM_INSTANCES):
        try:
            task_instance = await create_minigrid_task_from_seed(DIFFICULTY, seed)
            easy_task_instances.append(task_instance)
        except Exception as e:
            print(f" ⚠️ Failed to get task instance for seed {seed}: {e}")
            continue

    print(
        f" 📝 Generated {len(easy_task_instances)} {DIFFICULTY} task instances from seeds 0-{NUM_INSTANCES - 1}"
    )

    # Without this guard the success-rate division below raises
    # ZeroDivisionError when every seed failed or NUM_INSTANCES is 0.
    if not easy_task_instances:
        print(" ⚠️ No task instances available - nothing to evaluate")
        return 0.0

    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=30.0) as client:
        # Each episode gets its own agent; instance numbers are 1-based.
        tasks = [
            run_single_episode(
                client,
                MiniGridReActAgent(llm, max_turns=MAX_TURNS, verbose=False),
                task_instance,
                instance_num,
            )
            for instance_num, task_instance in enumerate(easy_task_instances, start=1)
        ]

        results = await asyncio.gather(*tasks)
        success_count = sum(results)
        success_rate = success_count / len(easy_task_instances)

        print(
            f" 📊 MiniGrid Results: {success_count}/{len(easy_task_instances)} solved ({success_rate:.1%})"
        )
        return success_rate
|
343
|
+
|
344
|
+
|
345
|
+
async def main():
    """Print the run configuration, check the service, then run the evaluation.

    Returns early (after printing a message) when the health endpoint cannot
    be queried or does not list "MiniGrid" among its supported environments.
    Errors raised by the evaluation itself are printed rather than propagated.
    """
    divider = "=" * 50
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    print("🎮 MiniGrid ReAct Agent Evaluation")
    print(f"Model: {MODEL_NAME}")
    print(f"Service: {SERVICE_BASE_URL}")
    print(f"Instances: {NUM_INSTANCES}")
    print(f"Time: {started_at}")
    print(divider)

    # Test service health
    async with AsyncClient(base_url=SERVICE_BASE_URL, timeout=10.0) as client:
        try:
            health_resp = await client.get("/health")
            supported = health_resp.json().get("supported_environments", [])

            if "MiniGrid" not in supported:
                print("❌ MiniGrid not available on service")
                return

            print("✅ Service health check passed")

        except Exception as e:
            print(f"❌ Service health check failed: {e}")
            return

    # Run evaluation
    try:
        success_rate = await evaluate_minigrid_batch()

        print("\n" + divider)
        print("🏆 FINAL MINIGRID RESULTS")
        print(divider)
        print(f"Success Rate: {success_rate:.1%}")

        # Strict lower bounds, checked from best to worst; first match wins.
        tiers = (
            (0.5, "🎉 Excellent performance!"),
            (0.3, "✅ Good performance!"),
            (0.1, "⚠️ Moderate performance"),
        )
        verdict = next(
            (message for cutoff, message in tiers if success_rate > cutoff),
            "❌ Poor performance - needs improvement",
        )
        print(verdict)

    except Exception as e:
        print(f"❌ Evaluation failed: {e}")
|
390
|
+
|
391
|
+
|
392
|
+
# Script entry point: run the async evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())
|