synth-ai 0.2.4.dev8__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- synth_ai/cli/__init__.py +6 -0
- synth_ai/cli/demo.py +68 -9
- synth_ai/cli/rl_demo.py +137 -0
- synth_ai/cli/root.py +65 -0
- synth_ai/demos/core/__init__.py +1 -0
- synth_ai/demos/core/cli.py +621 -0
- synth_ai/demos/demo_task_apps/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/core.py +374 -0
- synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/math/app.py +37 -0
- synth_ai/demos/demo_task_apps/math/config.toml +44 -0
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
- synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
- synth_ai/environments/examples/bandit/__init__.py +33 -0
- synth_ai/environments/examples/bandit/engine.py +294 -0
- synth_ai/environments/examples/bandit/environment.py +194 -0
- synth_ai/environments/examples/bandit/taskset.py +200 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
- synth_ai/environments/examples/crafter_classic/environment.py +41 -2
- synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
- synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
- synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
- synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
- synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
- synth_ai/environments/service/app.py +8 -0
- synth_ai/install_sqld.sh +40 -0
- synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +110 -11
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
- synth_ai-0.2.4.dev8.dist-info/METADATA +0 -635
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Evaluate traces grouped by difficulty level.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from trace_eval import evaluate_trace, WEIGHTS
|
|
10
|
+
|
|
11
|
+
def get_trace_difficulty(trace_path: Path) -> str:
    """Extract the difficulty label from a trace file's metadata.

    The trace is read as JSON. The label is taken from
    ``metadata['difficulty']`` when present, otherwise from
    ``metadata['task_instance']['metadata']['difficulty']``.

    Args:
        trace_path: Path to a JSON trace file.

    Returns:
        The stored difficulty value, or ``'unknown'`` when the file cannot be
        read, is not valid JSON, or does not carry a difficulty in either
        location.
    """
    try:
        with open(trace_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. Anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        return 'unknown'

    # A trace whose top level (or metadata) is not a mapping has no label.
    metadata = data.get('metadata') if isinstance(data, dict) else None
    if not isinstance(metadata, dict):
        return 'unknown'

    # Preferred location: directly in the trace metadata.
    if 'difficulty' in metadata:
        return metadata['difficulty']

    # Fallback location: the metadata of the embedded task instance.
    task_instance = metadata.get('task_instance')
    if isinstance(task_instance, dict):
        task_metadata = task_instance.get('metadata')
        if isinstance(task_metadata, dict) and 'difficulty' in task_metadata:
            return task_metadata['difficulty']

    return 'unknown'
|
|
31
|
+
|
|
32
|
+
def main():
    """Print per-difficulty and overall score statistics for ``./traces``.

    Every ``*.json`` file directly inside the ``traces`` directory (relative
    to the current working directory) is labelled with
    ``get_trace_difficulty`` and scored with ``evaluate_trace``. Each result
    is read through its ``'total_score'``, ``'counts'`` and ``'trajectory'``
    entries. The report lists the known difficulty levels first, followed by
    any other labels found in the traces, and ends with an overall summary
    that prices achievements and invalid actions using ``WEIGHTS``.

    Returns:
        None. All output goes to stdout; if the ``traces`` directory does not
        exist a message is printed and nothing else happens.
    """
    traces_dir = Path("traces")
    if not traces_dir.exists():
        print(f"Traces directory not found: {traces_dir}")
        return

    # Group evaluation results by difficulty. Files are visited in sorted
    # order so that tie-breaking in the "Top 3" lists is deterministic.
    traces_by_difficulty = defaultdict(list)
    for trace_file in sorted(traces_dir.glob("*.json")):
        difficulty = get_trace_difficulty(trace_file)
        result = evaluate_trace(trace_file)
        traces_by_difficulty[difficulty].append(result)

    # Known levels come first in a fixed order; labels outside that list are
    # appended so their traces are reported too (they are already counted in
    # the overall summary below).
    known_order = ['easy', 'medium', 'hard', 'unknown']
    extra_labels = sorted(
        (label for label in traces_by_difficulty if label not in known_order),
        key=str,
    )
    difficulty_order = known_order + extra_labels

    print("=" * 80)
    print("CRAFTER EVALUATION BY DIFFICULTY")
    print("=" * 80)

    for difficulty in difficulty_order:
        # .get() avoids inserting empty lists into the defaultdict.
        traces = traces_by_difficulty.get(difficulty)
        if not traces:
            continue

        trace_count = len(traces)
        print(f"\n{str(difficulty).upper()} ({trace_count} traces)")
        print("-" * 40)

        # Calculate statistics (traces is non-empty here)
        scores = [t['total_score'] for t in traces]
        avg_score = sum(scores) / len(scores)
        max_score = max(scores)
        min_score = min(scores)

        # Count achievements and invalid actions
        total_easy = sum(t['counts'].get('easy_achievement', 0) for t in traces)
        total_medium = sum(t['counts'].get('medium_achievement', 0) for t in traces)
        total_hard = sum(t['counts'].get('hard_achievement', 0) for t in traces)
        total_invalid = sum(t['counts'].get('invalid_action', 0) for t in traces)

        print(f"Average Score: {avg_score:.2f}")
        print(f"Score Range: {min_score:.2f} to {max_score:.2f}")
        print("\nAchievements per trace:")
        print(f" Easy: {total_easy / trace_count:.2f}")
        print(f" Medium: {total_medium / trace_count:.2f}")
        print(f" Hard: {total_hard / trace_count:.2f}")
        print(f"\nInvalid actions per trace: {total_invalid / trace_count:.2f}")

        # Show score distribution
        positive_scores = [s for s in scores if s > 0]
        zero_scores = [s for s in scores if s == 0]
        negative_scores = [s for s in scores if s < 0]

        print("\nScore distribution:")
        print(f" Positive: {len(positive_scores)} ({len(positive_scores)/len(scores)*100:.1f}%)")
        print(f" Zero: {len(zero_scores)} ({len(zero_scores)/len(scores)*100:.1f}%)")
        print(f" Negative: {len(negative_scores)} ({len(negative_scores)/len(scores)*100:.1f}%)")

        # Show top 3 traces (trajectory text is truncated to 50 characters)
        traces_sorted = sorted(traces, key=lambda x: x['total_score'], reverse=True)
        print("\nTop 3 traces:")
        for i, trace in enumerate(traces_sorted[:3], 1):
            print(f" {i}. Score: {trace['total_score']:.2f}, Trajectory: {trace['trajectory'][:50]}")

    # Overall summary
    print("\n" + "=" * 80)
    print("OVERALL SUMMARY")
    print("=" * 80)

    all_traces = []
    for traces in traces_by_difficulty.values():
        all_traces.extend(traces)

    if all_traces:
        all_scores = [t['total_score'] for t in all_traces]
        print(f"Total traces evaluated: {len(all_traces)}")
        print(f"Overall average score: {sum(all_scores) / len(all_scores):.2f}")

        # Achievement type distribution
        total_achievements = defaultdict(int)
        for trace in all_traces:
            for achievement_type in ['easy_achievement', 'medium_achievement', 'hard_achievement']:
                total_achievements[achievement_type] += trace['counts'].get(achievement_type, 0)

        print("\nTotal achievements unlocked:")
        print(f" Easy: {total_achievements['easy_achievement']} (worth {total_achievements['easy_achievement'] * WEIGHTS['easy_achievement']:.1f} points)")
        print(f" Medium: {total_achievements['medium_achievement']} (worth {total_achievements['medium_achievement'] * WEIGHTS['medium_achievement']:.1f} points)")
        print(f" Hard: {total_achievements['hard_achievement']} (worth {total_achievements['hard_achievement'] * WEIGHTS['hard_achievement']:.1f} points)")

        total_invalid = sum(t['counts'].get('invalid_action', 0) for t in all_traces)
        print(f"\nTotal invalid actions: {total_invalid} (penalty: {total_invalid * WEIGHTS['invalid_action']:.1f} points)")
|
|
124
|
+
|
|
125
|
+
if __name__ == "__main__":
|
|
126
|
+
main()
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Example of using the trace evaluation system programmatically.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from trace_eval import evaluate_trace, evaluate_all_traces, print_trace_evaluation, print_evaluation_summary
|
|
8
|
+
|
|
9
|
+
def main():
    """Demonstrate the trace evaluation helpers with three printed examples.

    1. Evaluate one specific trace file with ``evaluate_trace`` and display it
       with ``print_trace_evaluation`` (or report that the file is missing).
    2. Evaluate every trace under ``./traces`` with ``evaluate_all_traces`` and
       show the five highest-scoring results with a per-score-type breakdown.
    3. Bucket all scores into ranges and print a text histogram plus mean,
       median and population standard deviation.

    The ``traces`` directory is evaluated once and the results are shared by
    examples 2 and 3.

    Returns:
        None. All output goes to stdout.
    """
    # Function-scope import keeps the module's top-level imports unchanged.
    import statistics

    # Points per score type used for the breakdown in example 2.
    # NOTE(review): these mirror the scoring weights by hand — confirm they
    # stay in sync with the evaluation module.
    breakdown_weights = {
        'easy_achievement': 1.0,
        'medium_achievement': 2.5,
        'hard_achievement': 5.0,
        'invalid_action': -0.05
    }

    # Example 1: Evaluate a single trace
    print("=" * 60)
    print("Example 1: Evaluating a single trace")
    print("=" * 60)

    # Pick a high-scoring trace
    trace_path = Path("traces/session_crafter_episode_1_f2cea96d-34b6-46a3-9991-fe74ef263462_20250724_162140.json")
    if trace_path.exists():
        result = evaluate_trace(trace_path)
        print_trace_evaluation(result)
    else:
        print(f"Trace file not found: {trace_path}")

    # Example 2: Evaluate all traces and show top 5
    print("\n" + "=" * 60)
    print("Example 2: Top 5 traces by score")
    print("=" * 60)

    traces_dir = Path("traces")
    # Evaluate the directory a single time; None means "no traces directory".
    all_results = evaluate_all_traces(traces_dir) if traces_dir.exists() else None

    if all_results is not None:
        # Rank explicitly so "top 5" holds whatever order the evaluator
        # returned; sorted() is stable, so an already-ranked list is unchanged.
        ranked = sorted(all_results, key=lambda r: r['total_score'], reverse=True)

        # Show only top 5
        print(f"\nFound {len(all_results)} traces. Showing top 5:\n")
        for i, result in enumerate(ranked[:5], 1):
            print(f"{i}. {result['trace_file']}")
            print(f" Score: {result['total_score']:.2f}")
            print(f" Trajectory: {result['trajectory']}")
            if result['counts']:
                print(" Breakdown:")
                for score_type, count in result['counts'].items():
                    weight = breakdown_weights.get(score_type)
                    if weight is None:
                        # Unknown score type: report the count instead of
                        # aborting the whole example with a KeyError.
                        print(f" {score_type}: {count} (no weight defined)")
                        continue
                    print(f" {score_type}: {count} Ć {weight} = {count * weight:.2f}")
            print()

    # Example 3: Score distribution analysis
    print("=" * 60)
    print("Example 3: Score distribution analysis")
    print("=" * 60)

    if all_results is not None:
        scores = [r['total_score'] for r in all_results]

        # Group by score ranges (upper bounds are inclusive)
        score_ranges = {
            "Negative (<0)": 0,
            "Low (0-0.5)": 0,
            "Medium (0.5-1.5)": 0,
            "High (1.5-2.5)": 0,
            "Very High (>2.5)": 0
        }

        for score in scores:
            if score < 0:
                score_ranges["Negative (<0)"] += 1
            elif score <= 0.5:
                score_ranges["Low (0-0.5)"] += 1
            elif score <= 1.5:
                score_ranges["Medium (0.5-1.5)"] += 1
            elif score <= 2.5:
                score_ranges["High (1.5-2.5)"] += 1
            else:
                score_ranges["Very High (>2.5)"] += 1

        print(f"\nScore distribution across {len(scores)} traces:")
        for range_name, count in score_ranges.items():
            percentage = (count / len(scores) * 100) if scores else 0
            bar = "ā" * int(percentage / 2)  # Scale to 50 chars max
            print(f" {range_name:<20} {count:3d} ({percentage:5.1f}%) {bar}")

        # Additional statistics
        if scores:
            # Compute the mean once; the variance is the population variance
            # (divides by N, not N - 1).
            mean_score = sum(scores) / len(scores)
            variance = sum((x - mean_score) ** 2 for x in scores) / len(scores)

            print("\nStatistics:")
            print(f" Mean score: {mean_score:.2f}")
            # statistics.median averages the two middle values for even counts.
            print(f" Median score: {statistics.median(scores):.2f}")
            print(f" Std deviation: {variance ** 0.5:.2f}")
|
|
92
|
+
|
|
93
|
+
if __name__ == "__main__":
|
|
94
|
+
main()
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Explore and visualize saved Crafter states
|
|
4
|
+
==========================================
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import gzip
|
|
8
|
+
import pickle
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
import asyncio
|
|
13
|
+
from uuid import uuid4
|
|
14
|
+
|
|
15
|
+
from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
|
|
16
|
+
from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstance, CrafterTaskInstanceMetadata
|
|
17
|
+
from synth_ai.environments.tasks.core import Impetus, Intent
|
|
18
|
+
|
|
19
|
+
STATES_DIR = Path("synth_ai/environments/examples/crafter_classic/env_states")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def list_sessions():
    """Collect the saved MCTS sessions found under ``STATES_DIR``.

    A session is a sub-directory whose name starts with ``mcts_`` and that
    contains a ``session_metadata.json`` file.

    Returns:
        A list of ``(session_dir, metadata)`` tuples in sorted directory
        order, where ``metadata`` is the parsed JSON content of the session's
        metadata file. An empty list is returned (after printing a notice)
        when ``STATES_DIR`` does not exist.
    """
    if not STATES_DIR.exists():
        print("No env_states directory found!")
        return []

    found = []
    for entry in sorted(STATES_DIR.iterdir()):
        # Only directories created by an MCTS run are of interest.
        if not entry.is_dir():
            continue
        if not entry.name.startswith("mcts_"):
            continue

        # Sessions without a metadata file are skipped silently.
        meta_path = entry / "session_metadata.json"
        if not meta_path.exists():
            continue

        parsed = json.loads(meta_path.read_text())
        found.append((entry, parsed))

    return found
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
async def explore_state_file(state_file: Path):
    """Print a short summary of one saved state snapshot.

    The file is gunzipped and unpickled, then turned back into an environment
    via ``CrafterClassicEnvironment._deserialize_engine`` using a placeholder
    task instance. The player's position, health, step count, unlocked
    achievements and non-empty inventory entries are printed. Any exception
    raised while loading or inspecting the state is caught and reported on a
    single line.

    Args:
        state_file: Path to a gzip-compressed, pickled snapshot file.
    """
    print(f"\nš State file: {state_file.name}")
    size_kb = state_file.stat().st_size / 1024
    print(f" Size: {size_kb:.1f} KB")

    try:
        # Load the state.
        # NOTE(review): pickle.loads runs arbitrary code from the file; only
        # open snapshots that come from a trusted source.
        raw_bytes = gzip.decompress(state_file.read_bytes())
        snapshot = pickle.loads(raw_bytes)

        # Placeholder task: the deserializer takes a task instance alongside
        # the snapshot, so a dummy one is built here.
        placeholder_metadata = CrafterTaskInstanceMetadata(
            difficulty="easy", seed=0, num_trees_radius=0,
            num_cows_radius=0, num_hostiles_radius=0
        )
        placeholder_task = CrafterTaskInstance(
            id=uuid4(),
            impetus=Impetus(instructions="Analysis"),
            intent=Intent(rubric={"goal": "Analysis"}, gold_trajectories=None, gold_state_diff={}),
            metadata=placeholder_metadata,
            is_reproducible=True,
            initial_engine_snapshot=None
        )

        env = await CrafterClassicEnvironment._deserialize_engine(snapshot, placeholder_task)
        public_state = env.engine._get_public_state_from_env()
        private_state = env.engine._get_private_state_from_env(0, False, False)

        print(f" Position: {public_state.player_position}")
        print(f" Health: {private_state.player_internal_stats.get('health', 0)}")
        print(f" Steps: {public_state.num_steps_taken}")

        # Show achievements: at most three names, with an ellipsis when more exist.
        unlocked = [name for name, done in public_state.achievements_status.items() if done]
        if unlocked:
            preview = ', '.join(unlocked[:3])
            ellipsis = '...' if len(unlocked) > 3 else ''
            print(f" Achievements ({len(unlocked)}): {preview}{ellipsis}")

        # Show inventory: positive counts only, excluding the vital-stat keys.
        vital_keys = ['health', 'food', 'drink', 'energy']
        carried = {
            item: amount
            for item, amount in public_state.inventory.items()
            if amount > 0 and item not in vital_keys
        }
        if carried:
            print(f" Inventory: {carried}")

    except Exception as e:
        print(f" ā Error loading state: {e}")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
async def main():
    """List all saved sessions and explore the last one in detail.

    Prints one entry per session returned by ``list_sessions`` (seed,
    timestamp, number of ``*.snapshot.gz`` files). For the last session in
    that list it then shows the root state, up to three randomly chosen
    other states, and the content of ``summary.txt`` when present.

    Returns:
        None. All output goes to stdout.
    """
    # Function-scope import, as in the original script.
    import random

    print("š Exploring Saved Crafter States")
    print("=" * 60)

    sessions = list_sessions()

    if not sessions:
        print("No saved sessions found!")
        return

    print(f"Found {len(sessions)} saved sessions:\n")

    for index, (session_dir, metadata) in enumerate(sessions, 1):
        print(f"{index}. {session_dir.name}")
        # .get() keeps the listing going when a metadata file lacks a field.
        print(f" Seed: {metadata.get('seed', 'unknown')}")
        print(f" Time: {metadata.get('timestamp', 'unknown')}")
        print(f" States: {len(list(session_dir.glob('*.snapshot.gz')))}")

    # Explore the most recent session (sessions is non-empty at this point;
    # "most recent" relies on list_sessions returning them in sorted order).
    latest_session, latest_metadata = sessions[-1]
    print(f"\nš Exploring latest session: {latest_session.name}")
    print("-" * 60)

    # Get all state files
    state_files = sorted(latest_session.glob("*.snapshot.gz"))
    print(f"Total states saved: {len(state_files)}")

    # Show the root state
    root_id = latest_metadata.get('root_node_id')
    root_file = None if root_id is None else latest_session / f"{root_id}.snapshot.gz"
    if root_file is not None and root_file.exists():
        print("\nš± Root state:")
        await explore_state_file(root_file)

    # Show a few random states. The root snapshot itself is excluded, rather
    # than whichever file happens to sort first.
    candidates = [path for path in state_files if path != root_file]
    sample_size = min(3, len(candidates))
    if sample_size > 0:
        print("\nš² Random sample states:")
        for state_file in random.sample(candidates, sample_size):
            await explore_state_file(state_file)

    # Show summary if it exists
    summary_file = latest_session / "summary.txt"
    if summary_file.exists():
        print("\nš Session Summary:")
        print("-" * 40)
        print(summary_file.read_text())

    # The check-mark below was restored: the literal had been split across
    # two lines by a mis-decoded byte, which made it a syntax error.
    print("\n✅ Done exploring saved states!")
    print(f"\nš” Tip: States are saved in: {STATES_DIR}")
    print(" Each .snapshot.gz file contains a complete game state")
    print(" You can analyze these to understand MCTS exploration patterns")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
if __name__ == "__main__":
|
|
142
|
+
asyncio.run(main())
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
DEPRECATED: This script reads individual JSON trace files which are no longer generated.
|
|
4
|
+
Please use filter_traces_sft_duckdb.py instead, which reads from DuckDB.
|
|
5
|
+
|
|
6
|
+
Original functionality:
|
|
7
|
+
- Filter traces to create OpenAI SFT-ready .jsonl files
|
|
8
|
+
- Supports trajectory-level and window-based filtering
|
|
9
|
+
|
|
10
|
+
Migration guide:
|
|
11
|
+
1. Run your agent with DuckDB enabled (this happens automatically now)
|
|
12
|
+
2. Use filter_traces_sft_duckdb.py to filter and extract training data
|
|
13
|
+
3. Example: python filter_traces_sft_duckdb.py -d crafter_traces.duckdb -o training.jsonl
|
|
14
|
+
|
|
15
|
+
This file is kept for reference only and will be removed in a future version.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import sys
|
|
19
|
+
# Deprecation stub: print the notice with a pointer to the replacement script,
# then exit with a non-zero status so callers notice the script did no work.
print(
    "=" * 80,
    "DEPRECATED: This script is no longer supported.",
    "Please use filter_traces_sft_duckdb.py instead.",
    "=" * 80,
    "\nExample usage:",
    " python filter_traces_sft_duckdb.py -d crafter_traces.duckdb -o training.jsonl",
    "\nSee DUCKDB_FILTERING_GUIDE.md for more information.",
    sep="\n",
)
sys.exit(1)
|