synth-ai 0.2.4.dev8__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- synth_ai/cli/__init__.py +6 -0
- synth_ai/cli/demo.py +68 -9
- synth_ai/cli/rl_demo.py +137 -0
- synth_ai/cli/root.py +65 -0
- synth_ai/demos/core/__init__.py +1 -0
- synth_ai/demos/core/cli.py +621 -0
- synth_ai/demos/demo_task_apps/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/core.py +374 -0
- synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/math/app.py +37 -0
- synth_ai/demos/demo_task_apps/math/config.toml +44 -0
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
- synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
- synth_ai/environments/examples/bandit/__init__.py +33 -0
- synth_ai/environments/examples/bandit/engine.py +294 -0
- synth_ai/environments/examples/bandit/environment.py +194 -0
- synth_ai/environments/examples/bandit/taskset.py +200 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
- synth_ai/environments/examples/crafter_classic/environment.py +41 -2
- synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
- synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
- synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
- synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
- synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
- synth_ai/environments/service/app.py +8 -0
- synth_ai/install_sqld.sh +40 -0
- synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +110 -11
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
- synth_ai-0.2.4.dev8.dist-info/METADATA +0 -635
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Run Crafter agent evaluation and automatically evaluate traces.

The agent test script is launched as a subprocess with this script's own
command-line arguments. Every ``traces/*.json`` file modified since the launch
is then scored with ``trace_eval`` and summarised on stdout.
"""

import subprocess
import sys
import time
from pathlib import Path
from trace_eval import (
    evaluate_all_traces,
    evaluate_trace,
    print_evaluation_summary,
    print_trace_evaluation,
)

# NOTE(review): this file was recovered from a rendering in which emoji were
# mis-decoded. Glyphs that could be identified from the surviving bytes were
# restored; the "š..." prefixes below are kept exactly as found because the
# original emoji cannot be determined -- restore them from upstream.

# (label, key in each result's 'counts' mapping) pairs, in display order.
_COUNT_ROWS = (
    ("Easy achievements", "easy_achievement"),
    ("Medium achievements", "medium_achievement"),
    ("Hard achievements", "hard_achievement"),
    ("Invalid actions", "invalid_action"),
)


def main():
    """Run the agent evaluation, then score the traces it produced.

    Returns:
        int: 0 when new traces were found and evaluated; the agent
        subprocess's return code when it is non-zero; 1 when the ``traces``
        directory or any new trace file is missing.
    """
    # Run the agent evaluation
    print("š® Running Crafter Agent Evaluation...")
    print("=" * 60)

    # Pass all arguments to the test script
    cmd = [sys.executable, "test_crafter_react_agent_openai.py"] + sys.argv[1:]

    # Record start time so only traces written by this run are evaluated
    start_time = time.time()

    agent_run = subprocess.run(cmd)
    if agent_run.returncode != 0:
        print(f"\n❌ Agent evaluation failed with return code {agent_run.returncode}")
        return agent_run.returncode

    # Wait a moment for files to be written
    time.sleep(1)

    print("\n" + "=" * 80)
    print("š TRACE EVALUATION")
    print("=" * 80)

    trace_dir = Path("traces")
    if not trace_dir.exists():
        print("❌ No traces directory found")
        return 1

    # Find traces created since we started
    recent_traces = [
        trace_file
        for trace_file in trace_dir.glob("*.json")
        if trace_file.stat().st_mtime >= start_time
    ]
    if not recent_traces:
        print("❌ No new trace files found")
        return 1

    print(f"Found {len(recent_traces)} new trace files")

    # Evaluate all recent traces, best score first
    results = [evaluate_trace(trace_file) for trace_file in recent_traces]
    results.sort(key=lambda evaluation: evaluation['total_score'], reverse=True)

    # Show individual evaluations if not too many
    if len(results) <= 5:
        for evaluation in results:
            print_trace_evaluation(evaluation)

    # Always show summary
    print_evaluation_summary(results)

    # Show achievement distribution
    print("\n" + "=" * 80)
    print("š ACHIEVEMENT DISTRIBUTION")
    print("=" * 80)

    # results is non-empty here (guarded above), so the divisions are safe.
    episode_count = len(results)
    for label, key in _COUNT_ROWS:
        total = sum(evaluation['counts'].get(key, 0) for evaluation in results)
        print(f"{label}: {total} total ({total/episode_count:.1f} per episode)")

    # Score interpretation
    avg_score = sum(evaluation['total_score'] for evaluation in results) / episode_count
    print(f"\nAverage Score: {avg_score:.2f}")

    if avg_score >= 2.0:
        print("š Excellent performance!")
    elif avg_score >= 1.0:
        print("✅ Good performance")
    elif avg_score >= 0.0:
        print("š Room for improvement")
    else:
        print("⚠️ Many invalid actions detected")

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Run comparison between OpenAI and LM implementations to verify trace equivalence.

Both agent test scripts are run as subprocesses with the same parameters. The
``results.json`` summaries and the first episode trace that each run leaves on
disk are then printed side by side.
"""

import asyncio
import json
import subprocess
import sys
from pathlib import Path

# NOTE(review): this file was recovered from a rendering in which emoji were
# mis-decoded. Glyphs that could be identified from the surviving bytes were
# restored; the "š..." prefixes below are kept exactly as found because the
# original emoji cannot be determined -- restore them from upstream.

# Directory (relative to the working directory) holding the agent test scripts.
_AGENT_DEMOS_DIR = "synth_ai/environments/examples/crafter_classic/agent_demos"


def _run_agent(label, script_name, model, episodes, max_turns):
    """Run one agent test script; print the outcome and return True on success."""
    cmd = [
        sys.executable,
        f"{_AGENT_DEMOS_DIR}/{script_name}",
        "--episodes", str(episodes),
        "--model", model,
        "--max-turns", str(max_turns),
    ]
    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode != 0:
        print(f"❌ {label} test failed: {completed.stderr}")
        return False
    print(f"✅ {label} test completed")
    return True


def _load_json(path):
    """Return the parsed JSON content of *path*."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _print_run_summary(label, results_path, episodes):
    """Print the 'summary' section of a results file, or a not-found notice."""
    if not results_path.exists():
        print(f"❌ {label} results not found")
        return
    summary = _load_json(results_path)['summary']
    print(f"\n{label} Results:")
    print(f" Episodes: {summary['successful']}/{episodes}")
    print(f" Avg Reward: {summary['avg_reward']:.2f}")
    print(f" Avg Steps: {summary['avg_steps']:.1f}")


def _print_trace_shape(label, trace):
    """Print how many messages, events and timesteps a trace contains."""
    print(f"\n{label} trace:")
    print(f" Messages: {len(trace.get('message_history', []))}")
    print(f" Events: {len(trace.get('event_history', []))}")
    print(f" Timesteps: {len(trace.get('session_time_steps', []))}")


def _ai_events(trace):
    """Return the events whose 'system_state_before' records a request model."""
    return [
        event
        for event in trace.get('event_history', [])
        if 'gen_ai.request.model' in event.get('system_state_before', {})
    ]


async def run_tests():
    """Run both tests and compare results.

    The coroutine contains no awaits; the two subprocess runs are blocking and
    happen one after the other.

    Returns:
        int: 1 when either agent subprocess exits non-zero (the comparison is
        skipped), otherwise 0. Missing result or trace files are reported on
        stdout but do not change the status.
    """
    print("š Running Crafter v2 tracing comparison test")
    print("=" * 80)

    # Test parameters
    model = "gpt-4o-mini"
    episodes = 2
    max_turns = 5

    # Run OpenAI implementation
    print("\nš Running OpenAI implementation...")
    if not _run_agent(
        "OpenAI", "test_crafter_react_agent_openai.py", model, episodes, max_turns
    ):
        return 1

    # Run LM implementation
    print("\nš Running LM implementation...")
    if not _run_agent(
        "LM", "test_crafter_react_agent_lm.py", model, episodes, max_turns
    ):
        return 1

    # Compare results
    print("\nš Comparing results...")
    _print_run_summary("OpenAI", Path("traces/results.json"), episodes)
    _print_run_summary("LM", Path("traces_v2_lm/results.json"), episodes)

    # Compare trace structures
    print("\nš Comparing trace structures...")

    openai_trace = Path("traces/trace_episode_0.json")
    lm_trace = Path("traces_v2_lm/trace_episode_0.json")

    if openai_trace.exists() and lm_trace.exists():
        openai_data = _load_json(openai_trace)
        lm_data = _load_json(lm_trace)

        # Check key structures
        _print_trace_shape("OpenAI", openai_data)
        _print_trace_shape("LM", lm_data)

        # Check for AI events
        openai_ai_events = _ai_events(openai_data)
        lm_ai_events = _ai_events(lm_data)

        print("\nAI Events:")
        print(f" OpenAI: {len(openai_ai_events)}")
        print(f" LM: {len(lm_ai_events)}")

        if len(openai_ai_events) == len(lm_ai_events):
            print("✅ Same number of AI events captured")
        else:
            print("⚠️ Different number of AI events")
    else:
        print("❌ Trace files not found")

    print("\n" + "=" * 80)
    print("✅ Comparison complete!")
    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(run_tests()))
|