synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (154) hide show
  1. synth_ai/__init__.py +1 -1
  2. synth_ai/cli/__init__.py +6 -0
  3. synth_ai/cli/balance.py +3 -15
  4. synth_ai/cli/demo.py +68 -9
  5. synth_ai/cli/rl_demo.py +137 -0
  6. synth_ai/cli/root.py +65 -0
  7. synth_ai/config/base_url.py +47 -0
  8. synth_ai/demos/core/__init__.py +1 -0
  9. synth_ai/demos/core/cli.py +621 -0
  10. synth_ai/demos/demo_task_apps/__init__.py +1 -0
  11. synth_ai/demos/demo_task_apps/core.py +374 -0
  12. synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
  13. synth_ai/demos/demo_task_apps/math/app.py +37 -0
  14. synth_ai/demos/demo_task_apps/math/config.toml +44 -0
  15. synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
  16. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
  17. synth_ai/environments/examples/bandit/__init__.py +33 -0
  18. synth_ai/environments/examples/bandit/engine.py +294 -0
  19. synth_ai/environments/examples/bandit/environment.py +194 -0
  20. synth_ai/environments/examples/bandit/taskset.py +200 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  26. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
  27. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
  28. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
  29. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
  30. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
  31. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
  32. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
  33. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
  34. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
  35. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
  36. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
  37. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
  38. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
  39. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
  40. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
  41. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
  42. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
  43. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
  44. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
  45. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
  46. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
  47. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
  48. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
  49. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
  50. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
  51. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
  52. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
  53. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
  54. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
  55. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
  56. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  57. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
  58. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
  59. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
  60. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
  61. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
  62. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
  63. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
  64. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
  65. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
  66. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
  67. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
  68. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
  69. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
  70. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
  71. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
  72. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
  73. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
  74. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
  75. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
  76. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
  77. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
  78. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
  79. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
  80. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
  81. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
  82. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
  83. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
  84. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
  85. synth_ai/environments/examples/crafter_classic/environment.py +41 -2
  86. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
  87. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
  88. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
  89. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
  90. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
  91. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
  92. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
  93. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
  94. synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
  95. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  96. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
  97. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  98. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
  99. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  100. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  101. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
  102. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  103. synth_ai/environments/examples/red/units/__init__.py +1 -0
  104. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
  105. synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
  106. synth_ai/environments/service/app.py +8 -0
  107. synth_ai/http.py +102 -0
  108. synth_ai/inference/__init__.py +7 -0
  109. synth_ai/inference/client.py +20 -0
  110. synth_ai/install_sqld.sh +40 -0
  111. synth_ai/jobs/client.py +246 -0
  112. synth_ai/learning/__init__.py +24 -0
  113. synth_ai/learning/client.py +149 -0
  114. synth_ai/learning/config.py +43 -0
  115. synth_ai/learning/constants.py +29 -0
  116. synth_ai/learning/ft_client.py +59 -0
  117. synth_ai/learning/health.py +43 -0
  118. synth_ai/learning/jobs.py +205 -0
  119. synth_ai/learning/rl_client.py +256 -0
  120. synth_ai/learning/sse.py +58 -0
  121. synth_ai/learning/validators.py +48 -0
  122. synth_ai/lm/core/main_v3.py +13 -0
  123. synth_ai/lm/core/synth_models.py +48 -0
  124. synth_ai/lm/core/vendor_clients.py +9 -6
  125. synth_ai/lm/vendors/core/openai_api.py +31 -3
  126. synth_ai/lm/vendors/openai_standard.py +45 -14
  127. synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
  128. synth_ai/lm/vendors/synth_client.py +372 -28
  129. synth_ai/rl/__init__.py +30 -0
  130. synth_ai/rl/contracts.py +32 -0
  131. synth_ai/rl/env_keys.py +137 -0
  132. synth_ai/rl/secrets.py +19 -0
  133. synth_ai/scripts/verify_rewards.py +100 -0
  134. synth_ai/task/__init__.py +10 -0
  135. synth_ai/task/contracts.py +120 -0
  136. synth_ai/task/health.py +28 -0
  137. synth_ai/task/validators.py +12 -0
  138. synth_ai/tracing_v3/hooks.py +3 -1
  139. synth_ai/tracing_v3/session_tracer.py +123 -2
  140. synth_ai/tracing_v3/turso/manager.py +218 -0
  141. synth_ai/tracing_v3/turso/models.py +53 -0
  142. synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
  143. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +147 -30
  144. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
  145. synth_ai/tui/__init__.py +0 -1
  146. synth_ai/tui/__main__.py +0 -13
  147. synth_ai/tui/cli/__init__.py +0 -1
  148. synth_ai/tui/cli/query_experiments.py +0 -164
  149. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  150. synth_ai/tui/dashboard.py +0 -340
  151. synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
  152. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
  153. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
  154. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Run Crafter agent and analyze semantic map words - output as markdown tables only.
4
+
5
+ This script:
6
+ 1. Runs a Crafter agent for multiple episodes
7
+ 2. Extracts all unique words from the semantic map observations
8
+ 3. Outputs analysis as markdown tables (no plotting dependencies)
9
+
10
+ Usage:
11
+ python analyze_semantic_words_markdown.py --model gemini-1.5-flash --episodes 3
12
+ """
13
+
14
+ import argparse
15
+ import asyncio
16
+ import json
17
+ import re
18
+
19
+ # Import the Crafter agent
20
+ import sys
21
+ from collections import Counter
22
+ from datetime import datetime
23
+ from pathlib import Path
24
+ from typing import Dict, List, Set
25
+
26
+ sys.path.append(str(Path(__file__).parent))
27
+ from test_crafter_react_agent import run_crafter_episodes
28
+
29
+
30
def extract_words_from_semantic_map(observation: str) -> set[str]:
    """Return the Crafter-related words found in a semantic-map observation.

    The observation is lowercased and split into alphabetic tokens of three
    or more letters. A token is kept when it contains any vocabulary word as
    a substring, so inflected or compound forms such as "trees" are kept.
    Unrelated tokens that merely embed a vocabulary word (e.g. "relaxed"
    contains "axe") are kept as well.

    Args:
        observation: Text of a single observation.

    Returns:
        The set of matching lowercase tokens. Empty when the observation is
        empty or does not mention "semantic_map" (case-insensitive).
    """
    if not observation:
        return set()

    # Lowercase once and reuse it for both the marker check and tokenizing.
    lowered = observation.lower()
    if "semantic_map" not in lowered:
        return set()

    # Common Crafter objects/entities
    crafter_words = {
        # Resources
        'wood', 'stone', 'coal', 'iron', 'diamond', 'water',
        # Animals
        'cow', 'pig', 'skeleton', 'zombie',
        # Structures/Objects
        'tree', 'grass', 'furnace', 'table', 'bed', 'chest',
        'house', 'fence', 'door', 'wall',
        # Tools
        'axe', 'pickaxe', 'sword', 'shovel',
        # Food
        'bread', 'meat', 'apple',
        # Environment
        'mountain', 'river', 'forest', 'desert', 'cave',
        'lava', 'sand', 'dirt', 'path',
        # Actions/States
        'crafting', 'mining', 'building', 'farming',
        'health', 'hunger', 'energy',
    }

    # De-duplicate tokens first so each distinct token is scanned only once.
    tokens = set(re.findall(r'\b[a-zA-Z]{3,}\b', lowered))

    # The substring test also covers exact matches, so no separate
    # membership check is needed.
    return {
        token
        for token in tokens
        if any(vocab_word in token for vocab_word in crafter_words)
    }
66
+
67
def analyze_episode_traces(traces_data: list[dict]) -> dict[str, int]:
    """Count, across all episodes, how many observations mention each word.

    Every entry of ``traces_data`` that has an ``'observations'`` key is
    scanned. Dict and string observations are converted to text and passed
    to ``extract_words_from_semantic_map``; observations of any other type
    are ignored. Each observation contributes at most one count per word,
    because the extractor returns a set.

    Args:
        traces_data: Per-episode records.

    Returns:
        A plain dict mapping each word to its observation count.
    """
    tally = Counter()

    for episode in traces_data:
        if 'observations' not in episode:
            continue
        for observation in episode['observations']:
            # Only dict and str observations are analyzed; str() leaves a
            # string unchanged and renders a dict as its repr.
            if not isinstance(observation, (dict, str)):
                continue
            tally.update(extract_words_from_semantic_map(str(observation)))

    return dict(tally)
84
+
85
def generate_markdown_report(word_counts: dict[str, int], model: str, episodes: int) -> str:
    """Render the semantic-map word counts as a markdown report.

    The report contains a summary, the 15 most frequent words with their
    share of all occurrences, per-category tables, a frequency distribution
    and an alphabetical listing of every word.

    Args:
        word_counts: Mapping of word to number of occurrences.
        model: Model name shown in the report header.
        episodes: Episode count shown in the report header.

    Returns:
        The markdown text. A short "no words found" document is returned
        when ``word_counts`` is empty.
    """
    if not word_counts:
        return "# Semantic Map Analysis\n\n**No words found in semantic maps!**\n"

    total_words = sum(word_counts.values())
    unique_words = len(word_counts)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Most frequent first; ties keep the input dict's order (stable sort).
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        "# Semantic Map Word Analysis\n",
        "\n",
        f"**Model:** {model}\n",
        f"**Episodes:** {episodes}\n",
        f"**Generated:** {timestamp}\n",
        "\n",
        "## Summary\n",
        "\n",
        f"- **Total word occurrences:** {total_words}\n",
        f"- **Unique words discovered:** {unique_words}\n",
        f"- **Average occurrences per word:** {total_words/unique_words:.1f}\n",
        "\n",
        "## Top Words by Frequency\n",
        "\n",
        "| Rank | Word | Count | Percentage |\n",
        "|------|------|-------|------------|\n",
    ]

    # Top 15 words table
    for i, (word, count) in enumerate(sorted_words[:15], 1):
        # Guard against a mapping whose counts are all zero.
        percentage = (count / total_words) * 100 if total_words else 0.0
        parts.append(f"| {i:2d} | {word} | {count} | {percentage:.1f}% |\n")

    # Word categories
    categories = {
        "Resources": ['wood', 'stone', 'coal', 'iron', 'diamond', 'water'],
        "Animals": ['cow', 'pig', 'skeleton', 'zombie'],
        "Structures": ['tree', 'furnace', 'table', 'house', 'chest', 'fence', 'door'],
        "Tools": ['axe', 'pickaxe', 'sword', 'shovel'],
        "Environment": ['mountain', 'river', 'forest', 'desert', 'cave', 'lava', 'grass'],
        "Food": ['bread', 'meat', 'apple'],
    }

    parts.append("\n## Words by Category\n\n")

    for category, words in categories.items():
        found_words = [(w, word_counts[w]) for w in words if w in word_counts]
        if not found_words:
            # Categories with no observed words are omitted entirely.
            continue
        parts.append(f"### {category}\n\n")
        parts.append("| Word | Count |\n|------|-------|\n")
        for word, count in sorted(found_words, key=lambda x: x[1], reverse=True):
            parts.append(f"| {word} | {count} |\n")
        parts.append("\n")

    # Frequency distribution: how many words share each occurrence count
    freq_counts = Counter(word_counts.values())
    parts.append("## Frequency Distribution\n\n")
    parts.append("| Frequency | Number of Words |\n|-----------|----------------|\n")
    for freq in sorted(freq_counts, reverse=True):
        parts.append(f"| {freq} | {freq_counts[freq]} |\n")

    # All words alphabetically
    parts.append("\n## All Words (Alphabetical)\n\n")
    parts.append("| Word | Count |\n|------|-------|\n")
    for word in sorted(word_counts):
        parts.append(f"| {word} | {word_counts[word]} |\n")

    return "".join(parts)
156
+
157
async def main():
    """Run Crafter episodes, count semantic-map words, and write the reports.

    Parses the command line, runs the agent via ``run_crafter_episodes``,
    analyzes the returned traces, then writes a markdown report and a JSON
    dump of the raw counts into the output directory. A short summary is
    also printed to the console.

    Raises:
        Exception: Any error from the episode run or the analysis is
            reported on the console and then re-raised.
    """
    parser = argparse.ArgumentParser(description="Analyze semantic map words - markdown output only")
    parser.add_argument("--model", default="gemini-1.5-flash",
                        help="Model to use for agent (default: gemini-1.5-flash)")
    parser.add_argument("--episodes", type=int, default=3,
                        help="Number of episodes to run (default: 3)")
    parser.add_argument("--max-turns", type=int, default=50,
                        help="Maximum turns per episode (default: 50)")
    parser.add_argument("--output-dir", default="semantic_analysis",
                        help="Directory to save analysis results")

    args = parser.parse_args()

    print(f"🚀 Running {args.episodes} episodes with {args.model}")
    print("📊 Will analyze semantic map words and generate markdown report")

    # Create output directory; parents=True so nested paths also work.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Model names may contain path separators (e.g. "provider/model"), which
    # would turn the report file name into a path inside a missing directory.
    safe_model = re.sub(r"[^A-Za-z0-9._-]+", "_", args.model)

    # Run the agent episodes
    try:
        print("\n🎮 Starting Crafter episodes...")
        traces_result = await run_crafter_episodes(
            model_name=args.model,
            num_episodes=args.episodes,
            max_turns=args.max_turns,
            difficulty="easy",
            base_seed=1000,
        )

        print(f"✅ Completed {args.episodes} episodes")

        # Analyze semantic map words
        print("\n🔍 Analyzing semantic map words...")
        word_counts = analyze_episode_traces(traces_result)

        # Generate markdown report
        print("\n📝 Generating markdown report...")
        markdown_report = generate_markdown_report(word_counts, args.model, args.episodes)

        # Save markdown report
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = output_dir / f"semantic_analysis_{safe_model}_{timestamp}.md"

        with open(report_file, 'w', encoding="utf-8") as f:
            f.write(markdown_report)

        print(f"💾 Markdown report saved to: {report_file}")

        # Also save raw data as JSON (the unsanitized model name is kept here)
        analysis_data = {
            "model": args.model,
            "episodes": args.episodes,
            "timestamp": timestamp,
            "word_counts": word_counts,
            "total_unique_words": len(word_counts),
            "total_word_occurrences": sum(word_counts.values()),
        }

        json_file = output_dir / f"word_data_{safe_model}_{timestamp}.json"
        with open(json_file, 'w', encoding="utf-8") as f:
            json.dump(analysis_data, f, indent=2)

        print(f"💾 Raw data saved to: {json_file}")

        # Print summary to console
        print("\n" + "="*60)
        print("SEMANTIC MAP WORD ANALYSIS SUMMARY")
        print("="*60)

        if word_counts:
            total_words = sum(word_counts.values())
            unique_words = len(word_counts)
            print(f"Total word occurrences: {total_words}")
            print(f"Unique words discovered: {unique_words}")

            # Top 10 most common words
            sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
            print("\nTop 10 most frequent words:")
            for i, (word, count) in enumerate(sorted_words[:10], 1):
                print(f"{i:2d}. {word:<12} ({count} times)")
        else:
            print("No semantic map words found!")

        print(f"\n📄 Full analysis available in: {report_file}")
        print("\n🎉 Analysis complete!")

    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate.
        print(f"❌ Error during analysis: {e}")
        raise


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Run script for Full Enchilada Crafter Evaluation
4
+ """
5
+
6
+ import argparse
7
+ import asyncio
8
+
9
+ from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
10
+ run_full_enchilada_eval,
11
+ )
12
+
13
+
14
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the Full Enchilada evaluation."""
    cli = argparse.ArgumentParser(description="Run Full Enchilada Crafter Evaluation")
    cli.add_argument(
        "--models",
        nargs="+",
        default=["gpt-4o-mini"],
        help="Model names to evaluate",
    )
    cli.add_argument(
        "--difficulties",
        nargs="+",
        default=["easy", "hard"],
        help="Difficulty levels to test",
    )
    cli.add_argument(
        "--num-trajectories",
        type=int,
        default=3,
        help="Number of trajectories per condition",
    )
    cli.add_argument(
        "--max-turns",
        type=int,
        default=30,
        help="Maximum turns per trajectory",
    )
    cli.add_argument(
        "--no-images",
        action="store_true",
        help="Disable image capture",
    )
    cli.add_argument(
        "--no-viewer",
        action="store_true",
        help="Don't launch the viewer after evaluation",
    )
    cli.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory (default: src/evals/crafter/run_TIMESTAMP)",
    )
    return cli


async def main():
    """Parse the command line and forward the options to the evaluation.

    The two ``--no-*`` switches are inverted before being passed on, because
    ``run_full_enchilada_eval`` takes positive ``capture_images`` and
    ``launch_viewer`` flags.
    """
    options = _build_arg_parser().parse_args()

    capture_images = not options.no_images
    launch_viewer = not options.no_viewer

    await run_full_enchilada_eval(
        model_names=options.models,
        difficulties=options.difficulties,
        num_trajectories=options.num_trajectories,
        max_turns=options.max_turns,
        capture_images=capture_images,
        launch_viewer=launch_viewer,
        output_dir=options.output_dir,
    )


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Browse existing Crafter evaluations and launch viewer for a selected run.
4
+ """
5
+
6
+ import argparse
7
+ import asyncio
8
+ import json
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+
12
+ import uvicorn
13
+ from fastapi.staticfiles import StaticFiles
14
+ from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
15
+ app,
16
+ set_current_eval_dir,
17
+ )
18
+ from tabulate import tabulate
19
+
20
+
21
def list_evaluations(evals_dir: Path = Path("src/evals/crafter")):
    """List all available evaluations with summary info.

    Scans ``evals_dir`` for ``run_*`` directories that contain an
    ``evaluation_summary.json`` file and extracts a few fields from each.
    Runs are returned in reverse name order.

    Args:
        evals_dir: Directory that holds the ``run_*`` folders.

    Returns:
        A list of dicts with the keys ``run_id``, ``timestamp``, ``models``,
        ``difficulties``, ``num_trajectories`` and ``path``. The list is
        empty when ``evals_dir`` does not exist. Runs whose summary cannot
        be read or lacks the expected fields are skipped with a console
        message, so one damaged run does not hide the others.
    """
    if not evals_dir.exists():
        print(f"No evaluations found at {evals_dir}")
        return []

    evaluations = []
    for run_dir in sorted(evals_dir.glob("run_*"), reverse=True):
        summary_file = run_dir / "evaluation_summary.json"
        if not run_dir.is_dir() or not summary_file.exists():
            continue

        try:
            with open(summary_file, encoding="utf-8") as f:
                summary = json.load(f)

            metadata = summary["evaluation_metadata"]
            eval_info = {
                "run_id": run_dir.name,
                "timestamp": metadata["timestamp"],
                "models": ", ".join(summary["models_evaluated"]),
                "difficulties": ", ".join(summary["difficulties_evaluated"]),
                "num_trajectories": metadata["num_trajectories"],
                "path": run_dir,
            }
        except (OSError, ValueError, KeyError, TypeError) as exc:
            # ValueError covers malformed JSON and undecodable bytes;
            # KeyError/TypeError cover summaries with an unexpected shape.
            print(f"Skipping {run_dir.name}: unreadable evaluation summary ({exc!r})")
            continue

        evaluations.append(eval_info)

    return evaluations
46
+
47
+
48
async def view_evaluation(eval_dir: Path, host: str = "0.0.0.0", port: int = 8000):
    """Launch viewer for a specific evaluation.

    Prints a message and returns without starting a server when
    ``eval_dir`` or its ``viewer`` sub-directory is missing. Otherwise the
    directory is registered via ``set_current_eval_dir``, the viewer's
    static files are mounted at ``/`` on the shared ``app``, and a uvicorn
    server is run until it stops.

    Args:
        eval_dir: Directory of the evaluation run to display.
        host: Interface the server binds to.
        port: TCP port the server listens on.
    """
    if not eval_dir.exists():
        print(f"Evaluation directory not found: {eval_dir}")
        return

    viewer_dir = eval_dir / "viewer"
    if not viewer_dir.exists():
        print(f"Viewer files not found in {eval_dir}")
        return

    print(f"\n📁 Viewing evaluation: {eval_dir}")
    print(f"🌐 Launching viewer at http://localhost:{port}")
    print("   Press Ctrl+C to stop the viewer")

    # Set the current eval directory for the viewer
    set_current_eval_dir(eval_dir)

    # Mount static files from the viewer directory
    app.mount("/", StaticFiles(directory=str(viewer_dir), html=True), name="viewer")

    # NOTE(review): the default host "0.0.0.0" listens on every interface
    # although the message above advertises localhost; pass
    # host="127.0.0.1" to keep the viewer local-only.
    config = uvicorn.Config(app, host=host, port=port, log_level="error")
    server = uvicorn.Server(config)
    await server.serve()
73
+
74
+
75
async def main():
    """Browse Crafter evaluations and open the viewer for one of them.

    With ``--run-id`` the named run is opened; with ``--latest`` the first
    entry returned by ``list_evaluations`` is opened. When neither is given,
    a table of the available runs is printed and the user is asked to pick
    one by number. Nothing is shown when no evaluations are found.
    """
    parser = argparse.ArgumentParser(description="Browse Crafter evaluations")
    parser.add_argument(
        "--eval-dir",
        type=str,
        default="src/evals/crafter",
        help="Base directory for evaluations",
    )
    parser.add_argument(
        "--run-id", type=str, help="Specific run ID to view (e.g., run_20240115_143022)"
    )
    parser.add_argument("--latest", action="store_true", help="View the latest evaluation")

    args = parser.parse_args()
    evals_dir = Path(args.eval_dir)

    # List evaluations
    evaluations = list_evaluations(evals_dir)

    if not evaluations:
        return

    # View specific run (takes precedence over --latest)
    if args.run_id:
        await view_evaluation(evals_dir / args.run_id)
        return

    # View latest: list_evaluations returns runs in reverse name order
    if args.latest:
        await view_evaluation(evaluations[0]["path"])
        return

    # Display table of evaluations
    print("\n📊 Available Crafter Evaluations:")
    table_data = []
    for i, eval_info in enumerate(evaluations, 1):
        # Parse timestamp for cleaner display; fall back to the raw value
        # when it is not an ISO-format string.
        try:
            ts_str = datetime.fromisoformat(eval_info["timestamp"]).strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError):
            ts_str = eval_info["timestamp"]

        table_data.append(
            [
                i,
                eval_info["run_id"],
                ts_str,
                eval_info["models"],
                eval_info["difficulties"],
                eval_info["num_trajectories"],
            ]
        )

    headers = ["#", "Run ID", "Timestamp", "Models", "Difficulties", "Trajectories"]
    print(tabulate(table_data, headers=headers, tablefmt="grid"))

    # Ask user to select; end-of-input is treated like quitting.
    try:
        choice = input("\nEnter the number of the evaluation to view (or 'q' to quit): ").strip()
    except EOFError:
        return

    if choice.lower() == "q":
        return

    # Keep the try body to the conversion only, so a ValueError raised while
    # the viewer is running is not misreported as bad input.
    try:
        idx = int(choice) - 1
    except ValueError:
        print("Invalid input")
        return

    if 0 <= idx < len(evaluations):
        await view_evaluation(evaluations[idx]["path"])
    else:
        print("Invalid selection")


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,24 @@
1
+ [evaluation]
2
+ # Maximum number of turns per agent
3
+ max_turns = 100
4
+
5
+ # Number of trajectories per model-difficulty combination
6
+ trajectories_per_condition = 10
7
+
8
+ # Difficulty modes to test
9
+ difficulties = ["easy"]
10
+
11
+ # Models to evaluate
12
+ models = [
13
+ "gpt-4.1-nano",
14
+ "gpt-4o-mini"
15
+ ]
16
+
17
+ # Parallel execution settings
18
+ parallel_episodes = true
19
+ timeout_seconds = 300
20
+
21
+ # Output settings
22
+ show_progress_bars = true
23
+ show_detailed_logging = false
24
+ show_final_table = true