synth-ai 0.2.4.dev8__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release: this version of synth-ai has been flagged as possibly problematic.
- synth_ai/cli/__init__.py +6 -0
- synth_ai/cli/demo.py +68 -9
- synth_ai/cli/rl_demo.py +137 -0
- synth_ai/cli/root.py +65 -0
- synth_ai/demos/core/__init__.py +1 -0
- synth_ai/demos/core/cli.py +621 -0
- synth_ai/demos/demo_task_apps/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/core.py +374 -0
- synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/math/app.py +37 -0
- synth_ai/demos/demo_task_apps/math/config.toml +44 -0
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
- synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
- synth_ai/environments/examples/bandit/__init__.py +33 -0
- synth_ai/environments/examples/bandit/engine.py +294 -0
- synth_ai/environments/examples/bandit/environment.py +194 -0
- synth_ai/environments/examples/bandit/taskset.py +200 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
- synth_ai/environments/examples/crafter_classic/environment.py +41 -2
- synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
- synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
- synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
- synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
- synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
- synth_ai/environments/service/app.py +8 -0
- synth_ai/install_sqld.sh +40 -0
- synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +110 -11
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
- synth_ai-0.2.4.dev8.dist-info/METADATA +0 -635
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py
ADDED
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""
+Run Crafter agent and analyze semantic map words - output as markdown tables only.
+
+This script:
+1. Runs a Crafter agent for multiple episodes
+2. Extracts all unique words from the semantic map observations
+3. Outputs analysis as markdown tables (no plotting dependencies)
+
+Usage:
+    python analyze_semantic_words_markdown.py --model gemini-1.5-flash --episodes 3
+"""
+
+import argparse
+import asyncio
+import json
+import re
+
+# Import the Crafter agent
+import sys
+from collections import Counter
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Set
+
+sys.path.append(str(Path(__file__).parent))
+from test_crafter_react_agent import run_crafter_episodes
+
+
+def extract_words_from_semantic_map(observation: str) -> set[str]:
+    """Extract meaningful words from a semantic map observation string."""
+    if not observation or "semantic_map" not in observation.lower():
+        return set()
+
+    # Look for patterns like object names in the semantic map
+    # Common Crafter objects/entities
+    crafter_words = {
+        # Resources
+        'wood', 'stone', 'coal', 'iron', 'diamond', 'water',
+        # Animals
+        'cow', 'pig', 'skeleton', 'zombie',
+        # Structures/Objects
+        'tree', 'grass', 'furnace', 'table', 'bed', 'chest',
+        'house', 'fence', 'door', 'wall',
+        # Tools
+        'axe', 'pickaxe', 'sword', 'shovel',
+        # Food
+        'bread', 'meat', 'apple',
+        # Environment
+        'mountain', 'river', 'forest', 'desert', 'cave',
+        'lava', 'sand', 'dirt', 'path',
+        # Actions/States
+        'crafting', 'mining', 'building', 'farming',
+        'health', 'hunger', 'energy'
+    }
+
+    # Extract words using regex - look for alphabetic words
+    words = re.findall(r'\b[a-zA-Z]{3,}\b', observation.lower())
+
+    # Filter to keep only meaningful Crafter-related words
+    found_words = set()
+    for word in words:
+        if word in crafter_words or any(cw in word for cw in crafter_words):
+            found_words.add(word)
+    return found_words
+
+def analyze_episode_traces(traces_data: list[dict]) -> dict[str, int]:
+    """Analyze traces to extract semantic map words."""
+    word_counter = Counter()
+
+    for episode_data in traces_data:
+        if 'observations' in episode_data:
+            for obs in episode_data['observations']:
+                if isinstance(obs, dict):
+                    # Look for semantic map in observation
+                    obs_str = str(obs)
+                    words = extract_words_from_semantic_map(obs_str)
+                    word_counter.update(words)
+                elif isinstance(obs, str):
+                    words = extract_words_from_semantic_map(obs)
+                    word_counter.update(words)
+
+    return dict(word_counter)
+
+def generate_markdown_report(word_counts: dict[str, int], model: str, episodes: int) -> str:
+    """Generate a markdown report of the semantic map analysis."""
+    if not word_counts:
+        return "# Semantic Map Analysis\n\n**No words found in semantic maps!**\n"
+
+    total_words = sum(word_counts.values())
+    unique_words = len(word_counts)
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    # Sort words by frequency
+    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
+
+    # Generate markdown
+    md = f"""# Semantic Map Word Analysis
+
+**Model:** {model}
+**Episodes:** {episodes}
+**Generated:** {timestamp}
+
+## Summary
+
+- **Total word occurrences:** {total_words}
+- **Unique words discovered:** {unique_words}
+- **Average occurrences per word:** {total_words/unique_words:.1f}
+
+## Top Words by Frequency
+
+| Rank | Word | Count | Percentage |
+|------|------|-------|------------|
+"""
+
+    # Top 15 words table
+    for i, (word, count) in enumerate(sorted_words[:15], 1):
+        percentage = (count / total_words) * 100
+        md += f"| {i:2d} | {word} | {count} | {percentage:.1f}% |\n"
+
+    # Word categories
+    categories = {
+        "Resources": ['wood', 'stone', 'coal', 'iron', 'diamond', 'water'],
+        "Animals": ['cow', 'pig', 'skeleton', 'zombie'],
+        "Structures": ['tree', 'furnace', 'table', 'house', 'chest', 'fence', 'door'],
+        "Tools": ['axe', 'pickaxe', 'sword', 'shovel'],
+        "Environment": ['mountain', 'river', 'forest', 'desert', 'cave', 'lava', 'grass'],
+        "Food": ['bread', 'meat', 'apple']
+    }
+
+    md += "\n## Words by Category\n\n"
+
+    for category, words in categories.items():
+        found_words = [(w, word_counts[w]) for w in words if w in word_counts]
+        if found_words:
+            md += f"### {category}\n\n"
+            md += "| Word | Count |\n|------|-------|\n"
+            for word, count in sorted(found_words, key=lambda x: x[1], reverse=True):
+                md += f"| {word} | {count} |\n"
+            md += "\n"
+
+    # Frequency distribution
+    freq_counts = Counter(word_counts.values())
+    md += "## Frequency Distribution\n\n"
+    md += "| Frequency | Number of Words |\n|-----------|----------------|\n"
+    for freq in sorted(freq_counts.keys(), reverse=True):
+        md += f"| {freq} | {freq_counts[freq]} |\n"
+
+    # All words alphabetically
+    md += "\n## All Words (Alphabetical)\n\n"
+    md += "| Word | Count |\n|------|-------|\n"
+    for word in sorted(word_counts.keys()):
+        md += f"| {word} | {word_counts[word]} |\n"
+
+    return md
+
+async def main():
+    parser = argparse.ArgumentParser(description="Analyze semantic map words - markdown output only")
+    parser.add_argument("--model", default="gemini-1.5-flash",
+                        help="Model to use for agent (default: gemini-1.5-flash)")
+    parser.add_argument("--episodes", type=int, default=3,
+                        help="Number of episodes to run (default: 3)")
+    parser.add_argument("--max-turns", type=int, default=50,
+                        help="Maximum turns per episode (default: 50)")
+    parser.add_argument("--output-dir", default="semantic_analysis",
+                        help="Directory to save analysis results")
+
+    args = parser.parse_args()
+
+    print(f"Running {args.episodes} episodes with {args.model}")
+    print("Will analyze semantic map words and generate markdown report")
+
+    # Create output directory
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    # Run the agent episodes
+    try:
+        print("\nStarting Crafter episodes...")
+        traces_result = await run_crafter_episodes(
+            model_name=args.model,
+            num_episodes=args.episodes,
+            max_turns=args.max_turns,
+            difficulty="easy",
+            base_seed=1000
+        )
+
+        print(f"Completed {args.episodes} episodes")
+
+        # Analyze semantic map words
+        print("\nAnalyzing semantic map words...")
+        word_counts = analyze_episode_traces(traces_result)
+
+        # Generate markdown report
+        print("\nGenerating markdown report...")
+        markdown_report = generate_markdown_report(word_counts, args.model, args.episodes)
+
+        # Save markdown report
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        report_file = output_dir / f"semantic_analysis_{args.model}_{timestamp}.md"
+
+        with open(report_file, 'w') as f:
+            f.write(markdown_report)
+
+        print(f"Markdown report saved to: {report_file}")
+
+        # Also save raw data as JSON
+        analysis_data = {
+            "model": args.model,
+            "episodes": args.episodes,
+            "timestamp": timestamp,
+            "word_counts": word_counts,
+            "total_unique_words": len(word_counts),
+            "total_word_occurrences": sum(word_counts.values())
+        }
+
+        json_file = output_dir / f"word_data_{args.model}_{timestamp}.json"
+        with open(json_file, 'w') as f:
+            json.dump(analysis_data, f, indent=2)
+
+        print(f"Raw data saved to: {json_file}")
+
+        # Print summary to console
+        print("\n" + "="*60)
+        print("SEMANTIC MAP WORD ANALYSIS SUMMARY")
+        print("="*60)
+
+        if word_counts:
+            total_words = sum(word_counts.values())
+            unique_words = len(word_counts)
+            print(f"Total word occurrences: {total_words}")
+            print(f"Unique words discovered: {unique_words}")
+
+            # Top 10 most common words
+            sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
+            print("\nTop 10 most frequent words:")
+            for i, (word, count) in enumerate(sorted_words[:10], 1):
+                print(f"{i:2d}. {word:<12} ({count} times)")
+        else:
+            print("No semantic map words found!")
+
+        print(f"\nFull analysis available in: {report_file}")
+        print("\nAnalysis complete!")
+
+    except Exception as e:
+        print(f"Error during analysis: {e}")
+        raise
+
+if __name__ == "__main__":
+    asyncio.run(main())
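The filtering step in extract_words_from_semantic_map above can be exercised in isolation. The standalone sketch below mirrors that logic against a made-up observation string; the abbreviated keyword set and the sample text are illustrative only and are not taken from the package.

import re

# Abbreviated stand-in for the crafter_words set used above (illustrative).
CRAFTER_WORDS = {"wood", "stone", "tree", "zombie", "water"}

def extract_words(observation: str) -> set[str]:
    # Keep 3+ letter words that are (or contain) a known Crafter keyword,
    # mirroring the filter in extract_words_from_semantic_map.
    words = re.findall(r"\b[a-zA-Z]{3,}\b", observation.lower())
    return {w for w in words if w in CRAFTER_WORDS or any(k in w for k in CRAFTER_WORDS)}

sample = "semantic_map: tree tree water zombie grass stone"
print(sorted(extract_words(sample)))  # ['stone', 'tree', 'water', 'zombie']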
synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py
ADDED
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""
+Run script for Full Enchilada Crafter Evaluation
+"""
+
+import argparse
+import asyncio
+
+from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
+    run_full_enchilada_eval,
+)
+
+
+async def main():
+    parser = argparse.ArgumentParser(description="Run Full Enchilada Crafter Evaluation")
+    parser.add_argument(
+        "--models", nargs="+", default=["gpt-4o-mini"], help="Model names to evaluate"
+    )
+    parser.add_argument(
+        "--difficulties",
+        nargs="+",
+        default=["easy", "hard"],
+        help="Difficulty levels to test",
+    )
+    parser.add_argument(
+        "--num-trajectories",
+        type=int,
+        default=3,
+        help="Number of trajectories per condition",
+    )
+    parser.add_argument("--max-turns", type=int, default=30, help="Maximum turns per trajectory")
+    parser.add_argument("--no-images", action="store_true", help="Disable image capture")
+    parser.add_argument(
+        "--no-viewer",
+        action="store_true",
+        help="Don't launch the viewer after evaluation",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Output directory (default: src/evals/crafter/run_TIMESTAMP)",
+    )
+
+    args = parser.parse_args()
+
+    await run_full_enchilada_eval(
+        model_names=args.models,
+        difficulties=args.difficulties,
+        num_trajectories=args.num_trajectories,
+        max_turns=args.max_turns,
+        capture_images=not args.no_images,
+        launch_viewer=not args.no_viewer,
+        output_dir=args.output_dir,
+    )
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
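The wrapper above only parses arguments and forwards them. If one wanted to drive the evaluation programmatically instead of via the CLI, a minimal sketch could look like the following; it assumes the same import path and keyword arguments used in the script above and is untested here.

import asyncio

from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
    run_full_enchilada_eval,
)

# Single small run, mirroring the CLI wrapper's defaults but scaled down.
asyncio.run(
    run_full_enchilada_eval(
        model_names=["gpt-4o-mini"],
        difficulties=["easy"],
        num_trajectories=1,
        max_turns=10,
        capture_images=False,
        launch_viewer=False,
        output_dir=None,
    )
)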
synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py
ADDED
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""
+Browse existing Crafter evaluations and launch viewer for a selected run.
+"""
+
+import argparse
+import asyncio
+import json
+from datetime import datetime
+from pathlib import Path
+
+import uvicorn
+from fastapi.staticfiles import StaticFiles
+from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
+    app,
+    set_current_eval_dir,
+)
+from tabulate import tabulate
+
+
+def list_evaluations(evals_dir: Path = Path("src/evals/crafter")):
+    """List all available evaluations with summary info."""
+    if not evals_dir.exists():
+        print(f"No evaluations found at {evals_dir}")
+        return []
+
+    evaluations = []
+    for run_dir in sorted(evals_dir.glob("run_*"), reverse=True):
+        if run_dir.is_dir():
+            summary_file = run_dir / "evaluation_summary.json"
+            if summary_file.exists():
+                with open(summary_file) as f:
+                    summary = json.load(f)
+
+                eval_info = {
+                    "run_id": run_dir.name,
+                    "timestamp": summary["evaluation_metadata"]["timestamp"],
+                    "models": ", ".join(summary["models_evaluated"]),
+                    "difficulties": ", ".join(summary["difficulties_evaluated"]),
+                    "num_trajectories": summary["evaluation_metadata"]["num_trajectories"],
+                    "path": run_dir,
+                }
+                evaluations.append(eval_info)
+
+    return evaluations
+
+
+async def view_evaluation(eval_dir: Path):
+    """Launch viewer for a specific evaluation."""
+    if not eval_dir.exists():
+        print(f"Evaluation directory not found: {eval_dir}")
+        return
+
+    viewer_dir = eval_dir / "viewer"
+    if not viewer_dir.exists():
+        print(f"Viewer files not found in {eval_dir}")
+        return
+
+    print(f"\nViewing evaluation: {eval_dir}")
+    print("Launching viewer at http://localhost:8000")
+    print("   Press Ctrl+C to stop the viewer")
+
+    # Set the current eval directory for the viewer
+    set_current_eval_dir(eval_dir)
+
+    # Mount static files from the viewer directory
+    app.mount("/", StaticFiles(directory=str(viewer_dir), html=True), name="viewer")
+
+    # Run viewer
+    config = uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="error")
+    server = uvicorn.Server(config)
+    await server.serve()
+
+
+async def main():
+    parser = argparse.ArgumentParser(description="Browse Crafter evaluations")
+    parser.add_argument(
+        "--eval-dir",
+        type=str,
+        default="src/evals/crafter",
+        help="Base directory for evaluations",
+    )
+    parser.add_argument(
+        "--run-id", type=str, help="Specific run ID to view (e.g., run_20240115_143022)"
+    )
+    parser.add_argument("--latest", action="store_true", help="View the latest evaluation")
+
+    args = parser.parse_args()
+    evals_dir = Path(args.eval_dir)
+
+    # List evaluations
+    evaluations = list_evaluations(evals_dir)
+
+    if not evaluations:
+        return
+
+    # Display table of evaluations
+    if not args.run_id and not args.latest:
+        print("\nAvailable Crafter Evaluations:")
+        table_data = []
+        for i, eval_info in enumerate(evaluations):
+            # Parse timestamp for cleaner display
+            try:
+                ts = datetime.fromisoformat(eval_info["timestamp"])
+                ts_str = ts.strftime("%Y-%m-%d %H:%M:%S")
+            except Exception:
+                ts_str = eval_info["timestamp"]
+
+            table_data.append(
+                [
+                    i + 1,
+                    eval_info["run_id"],
+                    ts_str,
+                    eval_info["models"],
+                    eval_info["difficulties"],
+                    eval_info["num_trajectories"],
+                ]
+            )
+
+        headers = ["#", "Run ID", "Timestamp", "Models", "Difficulties", "Trajectories"]
+        print(tabulate(table_data, headers=headers, tablefmt="grid"))
+
+        # Ask user to select
+        print("\nEnter the number of the evaluation to view (or 'q' to quit): ", end="")
+        choice = input().strip()
+
+        if choice.lower() == "q":
+            return
+
+        try:
+            idx = int(choice) - 1
+            if 0 <= idx < len(evaluations):
+                selected_eval = evaluations[idx]
+                await view_evaluation(selected_eval["path"])
+            else:
+                print("Invalid selection")
+        except ValueError:
+            print("Invalid input")
+
+    # View specific run
+    elif args.run_id:
+        eval_path = evals_dir / args.run_id
+        await view_evaluation(eval_path)
+
+    # View latest
+    elif args.latest and evaluations:
+        latest_eval = evaluations[0]
+        await view_evaluation(latest_eval["path"])
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
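The viewer launch above boils down to a common FastAPI/uvicorn pattern: mount a directory of static HTML/JS files on an app and serve it. A generic, self-contained sketch of just that pattern follows; the app and the "viewer" directory here are placeholders, not the package's full_enchilada app.

from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
import uvicorn

app = FastAPI()
# Serve everything under ./viewer at the site root; html=True makes
# index.html the default document.
app.mount("/", StaticFiles(directory="viewer", html=True), name="viewer")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="error")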
synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml
ADDED
@@ -0,0 +1,24 @@
+[evaluation]
+# Maximum number of turns per agent
+max_turns = 100
+
+# Number of trajectories per model-difficulty combination
+trajectories_per_condition = 10
+
+# Difficulty modes to test
+difficulties = ["easy"]
+
+# Models to evaluate
+models = [
+    "gpt-4.1-nano",
+    "gpt-4o-mini"
+]
+
+# Parallel execution settings
+parallel_episodes = true
+timeout_seconds = 300
+
+# Output settings
+show_progress_bars = true
+show_detailed_logging = false
+show_final_table = true