synth-ai 0.2.4.dev8__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (112)
  1. synth_ai/__init__.py +1 -1
  2. synth_ai/cli/__init__.py +6 -0
  3. synth_ai/cli/demo.py +68 -9
  4. synth_ai/cli/rl_demo.py +137 -0
  5. synth_ai/cli/root.py +65 -0
  6. synth_ai/demos/core/__init__.py +1 -0
  7. synth_ai/demos/core/cli.py +685 -0
  8. synth_ai/demos/demo_task_apps/__init__.py +1 -0
  9. synth_ai/demos/demo_task_apps/core.py +374 -0
  10. synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
  11. synth_ai/demos/demo_task_apps/math/app.py +37 -0
  12. synth_ai/demos/demo_task_apps/math/config.toml +44 -0
  13. synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
  14. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
  15. synth_ai/environments/examples/bandit/__init__.py +33 -0
  16. synth_ai/environments/examples/bandit/engine.py +294 -0
  17. synth_ai/environments/examples/bandit/environment.py +194 -0
  18. synth_ai/environments/examples/bandit/taskset.py +200 -0
  19. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
  20. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
  26. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
  27. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
  28. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
  29. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
  30. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
  31. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
  32. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
  33. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
  34. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
  35. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
  36. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
  37. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
  38. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
  39. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
  40. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
  41. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
  42. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
  43. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
  44. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
  45. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
  46. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
  47. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
  48. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
  49. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
  50. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
  51. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
  52. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
  53. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
  54. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  55. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
  56. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
  57. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
  58. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
  59. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
  60. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
  61. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
  62. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
  63. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
  64. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
  65. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
  66. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
  67. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
  68. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
  69. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
  70. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
  71. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
  72. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
  73. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
  74. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
  75. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
  76. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
  77. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
  78. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
  79. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
  80. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
  81. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
  82. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
  83. synth_ai/environments/examples/crafter_classic/environment.py +41 -2
  84. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
  85. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
  86. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
  87. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
  88. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
  89. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
  90. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
  91. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
  92. synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
  93. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  94. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
  95. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  96. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
  97. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  98. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  99. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
  100. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  101. synth_ai/environments/examples/red/units/__init__.py +1 -0
  102. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
  103. synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
  104. synth_ai/environments/service/app.py +8 -0
  105. synth_ai/install_sqld.sh +40 -0
  106. synth_ai-0.2.5.dist-info/METADATA +106 -0
  107. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/RECORD +111 -12
  108. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/entry_points.txt +1 -0
  109. synth_ai-0.2.4.dev8.dist-info/METADATA +0 -635
  110. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/WHEEL +0 -0
  111. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/licenses/LICENSE +0 -0
  112. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,102 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Run Crafter agent evaluation and automatically evaluate traces.
4
+ """
5
+
6
+ import subprocess
7
+ import sys
8
+ import time
9
+ from pathlib import Path
10
+ from trace_eval import evaluate_all_traces, print_evaluation_summary, print_trace_evaluation
11
+
12
def main():
    """Run the Crafter agent evaluation, then score the traces it produced.

    Launches ``test_crafter_react_agent_openai.py`` with this script's own
    command-line arguments, then evaluates every ``traces/*.json`` file whose
    modification time is at or after the launch time. Individual evaluations
    are printed when there are five or fewer; the summary, the achievement
    distribution and the average score are always printed.

    Returns:
        int: ``0`` when traces were evaluated, the child process's return
        code when the agent run failed, and ``1`` when the traces directory
        or new trace files are missing.
    """
    print("🎮 Running Crafter Agent Evaluation...")
    print("=" * 60)

    # Forward all of our own arguments to the agent test script.
    cmd = [sys.executable, "test_crafter_react_agent_openai.py", *sys.argv[1:]]

    # Taken before launch so that only traces written by this run are selected.
    start_time = time.time()
    completed = subprocess.run(cmd)

    if completed.returncode != 0:
        print(f"\n❌ Agent evaluation failed with return code {completed.returncode}")
        # Propagate the failure so the script does not exit with status 0.
        return completed.returncode

    # Wait a moment for trace files to be written.
    time.sleep(1)

    print("\n" + "=" * 80)
    print("📊 TRACE EVALUATION")
    print("=" * 80)

    trace_dir = Path("traces")
    if not trace_dir.exists():
        print("❌ No traces directory found")
        return 1

    # Keep only traces modified since the agent run was started.
    recent_traces = [
        trace_file
        for trace_file in trace_dir.glob("*.json")
        if trace_file.stat().st_mtime >= start_time
    ]

    if not recent_traces:
        print("❌ No new trace files found")
        return 1

    print(f"Found {len(recent_traces)} new trace files")

    # Imported once here rather than on every loop iteration; the import stays
    # function-local so the block does not depend on the module's import list.
    from trace_eval import evaluate_trace

    # Evaluate every recent trace, best score first.
    results = sorted(
        (evaluate_trace(trace_file) for trace_file in recent_traces),
        key=lambda evaluation: evaluation['total_score'],
        reverse=True,
    )

    # Per-trace detail is only shown for small runs to keep output readable.
    if len(results) <= 5:
        for evaluation in results:
            print_trace_evaluation(evaluation)

    print_evaluation_summary(results)

    print("\n" + "=" * 80)
    print("📊 ACHIEVEMENT DISTRIBUTION")
    print("=" * 80)

    # ``results`` is non-empty here (guarded above), so the division is safe.
    episode_count = len(results)
    for label, count_key in (
        ("Easy achievements", 'easy_achievement'),
        ("Medium achievements", 'medium_achievement'),
        ("Hard achievements", 'hard_achievement'),
        ("Invalid actions", 'invalid_action'),
    ):
        total = sum(r['counts'].get(count_key, 0) for r in results)
        print(f"{label}: {total} total ({total/episode_count:.1f} per episode)")

    # Score interpretation
    avg_score = sum(r['total_score'] for r in results) / episode_count
    print(f"\nAverage Score: {avg_score:.2f}")

    if avg_score >= 2.0:
        print("🎉 Excellent performance!")
    elif avg_score >= 1.0:
        print("✅ Good performance")
    elif avg_score >= 0.0:
        print("📈 Room for improvement")
    else:
        print("⚠️ Many invalid actions detected")

    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Run comparison between OpenAI and LM implementations to verify trace equivalence.
4
+ """
5
+
6
+ import asyncio
7
+ import subprocess
8
+ import json
9
+ from pathlib import Path
10
+ import sys
11
+
12
def _run_agent_script(script, model, episodes, max_turns):
    """Run one agent test script in a child interpreter, capturing its output."""
    cmd = [
        sys.executable,
        script,
        "--episodes", str(episodes),
        "--model", model,
        "--max-turns", str(max_turns),
    ]
    return subprocess.run(cmd, capture_output=True, text=True)


def _load_json(path):
    """Parse the JSON document stored at *path* (read as UTF-8)."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _print_results_summary(label, results_path, episodes):
    """Print the ``summary`` section of a results file, or report it missing."""
    if not results_path.exists():
        print(f"❌ {label} results not found")
        return
    results = _load_json(results_path)
    print(f"\n{label} Results:")
    summary = results['summary']
    print(f" Episodes: {summary['successful']}/{episodes}")
    print(f" Avg Reward: {summary['avg_reward']:.2f}")
    print(f" Avg Steps: {summary['avg_steps']:.1f}")


def _print_trace_counts(label, trace):
    """Print how many messages, events and timesteps a trace contains."""
    print(f"\n{label} trace:")
    print(f" Messages: {len(trace.get('message_history', []))}")
    print(f" Events: {len(trace.get('event_history', []))}")
    print(f" Timesteps: {len(trace.get('session_time_steps', []))}")


def _count_ai_events(trace):
    """Count events whose ``system_state_before`` has ``gen_ai.request.model``."""
    return sum(
        1
        for event in trace.get('event_history', [])
        # ``or {}`` also covers a key that is present but null.
        if 'gen_ai.request.model' in (event.get('system_state_before') or {})
    )


async def run_tests(model="gpt-4o-mini", episodes=2, max_turns=5):
    """Run the OpenAI and LM agent scripts and compare their results and traces.

    Args:
        model: Model name passed to both scripts via ``--model``.
        episodes: Episode count passed to both scripts via ``--episodes``.
        max_turns: Turn limit passed to both scripts via ``--max-turns``.

    Returns:
        None. Everything is reported on stdout; the function returns early if
        either script exits with a non-zero status.
    """
    # NOTE: nothing in this coroutine is awaited; the child processes are run
    # one after the other through the blocking ``subprocess.run`` call.
    print("🚀 Running Crafter v2 tracing comparison test")
    print("=" * 80)

    # Script paths are relative to the current working directory.
    demos_dir = "synth_ai/environments/examples/crafter_classic/agent_demos"
    for label, script in (
        ("OpenAI", f"{demos_dir}/test_crafter_react_agent_openai.py"),
        ("LM", f"{demos_dir}/test_crafter_react_agent_lm.py"),
    ):
        print(f"\n📝 Running {label} implementation...")
        completed = _run_agent_script(script, model, episodes, max_turns)
        if completed.returncode != 0:
            print(f"❌ {label} test failed: {completed.stderr}")
            return
        print(f"✅ {label} test completed")

    # Compare results
    print("\n📊 Comparing results...")
    _print_results_summary("OpenAI", Path("traces/results.json"), episodes)
    _print_results_summary("LM", Path("traces_v2_lm/results.json"), episodes)

    # Compare trace structures
    print("\n🔍 Comparing trace structures...")

    openai_trace = Path("traces/trace_episode_0.json")
    lm_trace = Path("traces_v2_lm/trace_episode_0.json")

    if openai_trace.exists() and lm_trace.exists():
        openai_data = _load_json(openai_trace)
        lm_data = _load_json(lm_trace)

        # Check key structures
        _print_trace_counts("OpenAI", openai_data)
        _print_trace_counts("LM", lm_data)

        # Check for AI events
        openai_ai_events = _count_ai_events(openai_data)
        lm_ai_events = _count_ai_events(lm_data)

        print("\nAI Events:")
        print(f" OpenAI: {openai_ai_events}")
        print(f" LM: {lm_ai_events}")

        if openai_ai_events == lm_ai_events:
            print("✅ Same number of AI events captured")
        else:
            print("⚠️ Different number of AI events")
    else:
        print("❌ Trace files not found")

    print("\n" + "=" * 80)
    print("✅ Comparison complete!")


if __name__ == "__main__":
    asyncio.run(run_tests())