PyPI - synth-ai - Versions diffs - 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl - Mend - Supply Chain Defender

synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (154) hide show

synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py ADDED Viewed

@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+"""
+Script to run Crafter evaluation using the standardized eval framework
+"""
+import asyncio
+from pathlib import Path
+import toml
+from src.synth_env.examples.crafter_classic.agent_demos.eval_framework import (
+    CrafterEvalFramework,
+    run_crafter_eval,
+)
+async def main():
+    """Run a Crafter evaluation driven by ``eval_config.toml``.
+
+    Reads ``eval_config.toml`` from the directory containing this script,
+    pulls the run parameters out of its ``evaluation`` table, prints a
+    summary banner, and awaits ``run_crafter_eval`` with those parameters.
+
+    Returns:
+        Whatever ``run_crafter_eval`` returns (its shape is defined by the
+        eval framework, not by this script).
+
+    Raises:
+        FileNotFoundError: If ``eval_config.toml`` does not exist next to
+            this script.
+        KeyError: If the ``evaluation`` table or one of the keys read
+            below is missing from the parsed configuration.
+    """
+    # Load configuration
+    # The config file is resolved relative to this script, not the CWD.
+    config_path = Path(__file__).parent / "eval_config.toml"
+    if not config_path.exists():
+        raise FileNotFoundError(f"Configuration file not found: {config_path}")
+    config = toml.load(config_path)
+    eval_config = config["evaluation"]
+    # `models` and `difficulties` are joined with ", " below, so both must
+    # be iterables of strings.
+    models = eval_config["models"]
+    difficulties = eval_config["difficulties"]
+    max_turns = eval_config["max_turns"]
+    n_trajectories = eval_config["trajectories_per_condition"]
+    # Echo the effective settings before starting the run.
+    print("🎯 Crafter Multi-Action Model Comparison (Eval Framework)")
+    print("=" * 60)
+    print(f"Models: {', '.join(models)}")
+    print(f"Difficulties: {', '.join(difficulties)}")
+    print(f"Max turns: {max_turns}")
+    print(f"Trajectories per condition: {n_trajectories}")
+    print("=" * 60)
+    # Run evaluation using the framework
+    results = await run_crafter_eval(
+        model_names=models,
+        difficulties=difficulties,
+        num_trajectories=n_trajectories,
+        max_turns=max_turns,
+    )
+    # The framework already prints detailed reports
+    print("\n🏆 Evaluation completed!")
+    return results
+# Script entry point: drive the async `main` coroutine to completion.
+if __name__ == "__main__":
+    asyncio.run(main())