synth-ai 0.2.9.dev2__py3-none-any.whl → 0.2.9.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (112)
  1. examples/analyze_semantic_words.sh +17 -0
  2. examples/common_old/backend.py +21 -0
  3. examples/crafter_debug_render.py +180 -0
  4. examples/evals_old/README.md +98 -0
  5. examples/evals_old/__init__.py +6 -0
  6. examples/evals_old/compare_models.py +1037 -0
  7. examples/evals_old/example_log.md +145 -0
  8. examples/evals_old/run_demo.sh +126 -0
  9. examples/evals_old/trace_analysis.py +270 -0
  10. examples/finetuning_old/_backup_synth_qwen/config.toml +29 -0
  11. examples/finetuning_old/_backup_synth_qwen/example_log.md +324 -0
  12. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +60 -0
  13. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +239 -0
  14. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +109 -0
  15. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +1924 -0
  16. examples/finetuning_old/_backup_synth_qwen/readme.md +49 -0
  17. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +114 -0
  18. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +195 -0
  19. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +118 -0
  20. examples/finetuning_old/synth_qwen_v1/README.md +68 -0
  21. examples/finetuning_old/synth_qwen_v1/filter_traces.py +60 -0
  22. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +239 -0
  23. examples/finetuning_old/synth_qwen_v1/finetune.py +46 -0
  24. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +71 -0
  25. examples/finetuning_old/synth_qwen_v1/infer.py +37 -0
  26. examples/finetuning_old/synth_qwen_v1/poll.py +44 -0
  27. examples/finetuning_old/synth_qwen_v1/prepare_data.py +35 -0
  28. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +109 -0
  29. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +1932 -0
  30. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +207 -0
  31. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +232 -0
  32. examples/finetuning_old/synth_qwen_v1/upload_data.py +34 -0
  33. examples/finetuning_old/synth_qwen_v1/util.py +147 -0
  34. examples/rl/README.md +169 -0
  35. examples/rl/configs/eval_base_qwen.toml +15 -0
  36. examples/rl/configs/eval_rl_qwen.toml +11 -0
  37. examples/rl/configs/rl_from_base_qwen.toml +35 -0
  38. examples/rl/configs/rl_from_base_qwen17.toml +74 -0
  39. examples/rl/configs/rl_from_ft_qwen.toml +35 -0
  40. examples/rl/download_dataset.py +64 -0
  41. examples/rl/run_eval.py +435 -0
  42. examples/rl/run_rl_and_save.py +94 -0
  43. examples/rl/task_app/README.md +22 -0
  44. {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +8 -8
  45. examples/rl/task_app/math_task_app.py +107 -0
  46. examples/rl_old/task_app.py +962 -0
  47. examples/run_crafter_demo.sh +10 -0
  48. examples/warming_up_to_rl/analyze_trace_db.py +420 -0
  49. examples/warming_up_to_rl/configs/crafter_fft.toml +48 -0
  50. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
  51. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +20 -0
  52. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +13 -0
  53. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +23 -0
  54. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +73 -0
  55. examples/warming_up_to_rl/configs/rl_from_ft.toml +56 -0
  56. examples/warming_up_to_rl/export_trace_sft.py +541 -0
  57. examples/warming_up_to_rl/groq_test.py +88 -0
  58. examples/warming_up_to_rl/manage_secrets.py +127 -0
  59. examples/warming_up_to_rl/old/event_rewards.md +234 -0
  60. examples/warming_up_to_rl/old/notes.md +73 -0
  61. examples/warming_up_to_rl/readme.md +172 -0
  62. examples/warming_up_to_rl/run_eval.py +434 -0
  63. examples/warming_up_to_rl/run_fft_and_save.py +309 -0
  64. examples/warming_up_to_rl/run_local_rollout.py +188 -0
  65. examples/warming_up_to_rl/run_local_rollout_modal.py +160 -0
  66. examples/warming_up_to_rl/run_local_rollout_parallel.py +342 -0
  67. examples/warming_up_to_rl/run_local_rollout_traced.py +372 -0
  68. examples/warming_up_to_rl/run_rl_and_save.py +101 -0
  69. examples/warming_up_to_rl/run_rollout_remote.py +129 -0
  70. examples/warming_up_to_rl/task_app/README.md +38 -0
  71. {synth_ai/task/apps → examples/warming_up_to_rl/task_app}/grpo_crafter.py +7 -7
  72. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +165 -0
  73. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  74. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  75. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +145 -0
  76. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1271 -0
  77. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  78. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +429 -0
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +442 -0
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +96 -0
  83. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +302 -0
  84. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  85. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +202 -0
  86. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  87. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +512 -0
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +102 -0
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +985 -0
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +197 -0
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1749 -0
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +217 -0
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +160 -0
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +146 -0
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +58 -0
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +61 -0
  98. synth_ai/api/train/config_finder.py +18 -18
  99. synth_ai/api/train/env_resolver.py +28 -1
  100. synth_ai/cli/task_apps.py +264 -55
  101. synth_ai/demo_registry.py +7 -7
  102. synth_ai/demos/demo_task_apps/crafter/__init__.py +1 -0
  103. synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +54 -0
  104. synth_ai/demos/demo_task_apps/crafter/configs/rl_from_base_qwen4b.toml +73 -0
  105. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +165 -0
  106. synth_ai/task/apps/__init__.py +54 -13
  107. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/METADATA +1 -1
  108. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/RECORD +112 -13
  109. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/top_level.txt +1 -0
  110. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/WHEEL +0 -0
  111. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/entry_points.txt +0 -0
  112. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,17 @@
1
#!/bin/bash

# Run Crafter agent and analyze semantic map word distribution
# This script demonstrates semantic analysis of agent observations
# Output: Markdown tables and JSON data (no plotting dependencies)
#
# NOTE: the `cd` below uses a relative path, so the script must be started
# from the directory that contains `synth_ai/`.

echo "🔍 Analyzing semantic map words from Crafter agent..."
echo "Make sure the synth-ai service is running: uvx synth-ai serve"
echo ""

# Abort if the demo directory is missing; otherwise the python command below
# would run from the wrong working directory.
cd synth_ai/environments/examples/crafter_classic/agent_demos/ || {
    echo "❌ Could not enter synth_ai/environments/examples/crafter_classic/agent_demos/" >&2
    exit 1
}

# Run the semantic analysis (markdown output only).
# Only report success when the analysis itself exited successfully.
if ! python analyze_semantic_words_markdown.py --model gemini-1.5-flash --episodes 3 --max-turns 30; then
    echo "❌ Analysis failed." >&2
    exit 1
fi

echo ""
echo "✅ Analysis complete! Check the generated markdown report and JSON files."
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from synth_ai.config.base_url import get_backend_from_env, PROD_BASE_URL_DEFAULT
4
+
5
+
6
# Production base URL with any trailing slash removed and "/api" appended.
DEFAULT_PROD_BACKEND = PROD_BASE_URL_DEFAULT.rstrip("/") + "/api"


def resolve_backend_url() -> str:
    """Return the backend base URL, guaranteed to end with ``/api``.

    The base URL is whatever ``get_backend_from_env()`` reports as its first
    element (the second element is ignored here). Trailing slashes are
    stripped, and ``/api`` is appended only when it is not already the suffix.
    """
    backend, _ignored = get_backend_from_env()
    trimmed = backend.rstrip("/")
    if trimmed.endswith("/api"):
        return trimmed
    return f"{trimmed}/api"


if __name__ == "__main__":
    # Script mode: print the resolved URL.
    print(resolve_backend_url())
21
+
@@ -0,0 +1,180 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Quick local Crafter observation inspector.
4
+
5
+ - Initializes a CrafterClassic env via local service (default http://localhost:8901)
6
+ - Fetches one observation
7
+ - Renders a 7x7 semantic view around the player with best-effort item names
8
+ - Prints status (health/food/energy), inventory, and achievements
9
+
10
+ Run:
11
+ uv run python examples/crafter_debug_render.py --base-url http://localhost:8901 --seed 1
12
+ """
13
+ import argparse
14
+ import math
15
+ import os
16
+ from typing import Any, Dict, List
17
+
18
+ import httpx
19
+
20
+
21
def try_import_crafter_mapping():
    """Best-effort lookup table from semantic-map ids to lowercase names.

    Instantiates ``crafter.Env`` and reads its private ``_world._mat_ids`` and
    ``_sem_view._obj_ids`` tables. The result is a list indexed by id, with
    ``"void"`` in every slot that neither table fills. Object ids are written
    after material ids, so they win on any collision.

    Returns:
        The id -> name list, or ``None`` if anything goes wrong (including the
        ``crafter`` package not being importable).
    """
    try:
        import crafter  # type: ignore

        env = crafter.Env()
        try:
            material_ids = env._world._mat_ids
            object_ids = env._sem_view._obj_ids
            size = max(max(material_ids.values()), max(object_ids.values())) + 1
            labels = ["void"] * size
            for table in (material_ids, object_ids):
                for key, idx in table.items():
                    # Keys may be classes (use their __name__) or plain values.
                    text = key.__name__ if hasattr(key, "__name__") else str(key)
                    labels[idx] = text.lower()
            return labels
        finally:
            # Closing is best-effort; a failure here must not mask the result.
            try:
                env.close()
            except Exception:
                pass
    except Exception:
        return None
45
+
46
+
47
def format_semantic_map_view(obs: Dict[str, Any], view_size: int = 7) -> str:
    """Render a text window of the semantic map centred on the player.

    The map is read from the first truthy value among ``obs["semantic_map"]``,
    ``obs["sem_map"]`` and ``obs["map"]``. It may be a list of lists, or a flat
    list whose length is a perfect square (reshaped into a square grid).

    Args:
        obs: Observation dictionary.
        view_size: Nominal window width; the window spans
            ``max(1, view_size // 2)`` cells on each side of the player.

    Returns:
        One line per window row (cells separated by spaces) followed by a
        legend line, or a short message when the map is missing, empty or in
        an unrecognised format.
    """
    unrecognized = "Semantic map format not recognized"

    # Equivalent to chaining the three lookups with `or`: stop at the first
    # truthy value, otherwise keep whatever the last lookup returned.
    sem = None
    for key in ("semantic_map", "sem_map", "map"):
        sem = obs.get(key)
        if sem:
            break
    if sem is None:
        return "No semantic map available"

    # Normalize to a 2D grid.
    if not isinstance(sem, list):
        return unrecognized
    grid: List[List[int]]
    if sem and isinstance(sem[0], list):
        grid = sem
    else:
        try:
            side = int(math.sqrt(len(sem)))
            if side == 0 or side * side != len(sem):
                return unrecognized
            grid = [sem[start : start + side] for start in range(0, side * side, side)]
        except Exception:
            return unrecognized

    rows = len(grid)
    cols = len(grid[0]) if grid else 0
    if not rows or not cols:
        return "Empty semantic map"

    # Names from the crafter package when available; None otherwise.
    id_to_item = try_import_crafter_mapping()

    # Best-effort names for common ids, used when no mapping covers the value.
    fallback_names = {
        0: "grass",
        1: "stone",
        2: "stone",
        3: "tree",
        4: "coal",
        5: "iron",
        6: "water",
        7: "zombie",
        14: "wood",
    }
    unremarkable = {"grass", "you", "0"}

    # Use the reported player position, falling back to the grid centre.
    center = [rows // 2, cols // 2]
    position = obs.get("player_position") or center
    try:
        px, py = int(position[0]), int(position[1])
    except Exception:
        px, py = center

    half = max(1, view_size // 2)
    offsets = range(-half, half + 1)
    lines: List[str] = []
    visible = set()
    for dy in offsets:
        cells: List[str] = []
        for dx in offsets:
            if dx == 0 and dy == 0:
                cells.append("you")
                continue
            x, y = px + dx, py + dy
            if not (0 <= x < rows and 0 <= y < cols):
                cells.append("void")
                continue
            try:
                val = int(grid[x][y])
            except Exception:
                val = -1
            if id_to_item and 0 <= val < len(id_to_item):
                name = id_to_item[val]
            else:
                name = fallback_names.get(val, str(val))
            cells.append(name)
            if name not in unremarkable:
                visible.add(name)
        lines.append(" ".join(cells))

    if visible:
        legend = f"Visible items: {', '.join(sorted(visible))}"
    else:
        legend = "No notable items visible"
    return "\n".join(lines) + "\n" + legend
122
+
123
+
124
async def main():
    """Initialize a CrafterClassic env, print two observations, then terminate it.

    Command-line options:
        --base-url: Service root (default: ``$CRAFTER_BASE_URL`` or
            ``http://localhost:8901``). A trailing slash is tolerated.
        --seed: Integer seed sent in the initialize request (default: 1).

    Raises:
        httpx.HTTPStatusError: If the initialize or step request returns an
            error status. The environment is still terminated (best-effort)
            once it has been created.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", default=os.getenv("CRAFTER_BASE_URL", "http://localhost:8901"))
    parser.add_argument("--seed", type=int, default=1)
    args = parser.parse_args()

    # Avoid "//env/..." when the caller passes a base URL ending in "/".
    base_url = args.base_url.rstrip("/")

    async with httpx.AsyncClient(timeout=30.0) as client:
        init = await client.post(
            f"{base_url}/env/CrafterClassic/initialize",
            json={"config": {"difficulty": "easy", "seed": args.seed}},
        )
        init.raise_for_status()
        data = init.json()
        env_id = data["env_id"]

        # From here on the remote env exists, so always attempt to terminate
        # it, even if printing or the step request fails.
        try:
            obs = data["observation"]

            print("=== INITIAL OBSERVATION ===")
            print(format_semantic_map_view(obs, view_size=7))
            inv = obs.get("inventory", {})
            ach = obs.get("achievements_status", {})
            print("\n=== STATUS ===")
            print(f"Health: {obs.get('health', 10)}/10")
            print(f"Hunger: {obs.get('food', 10)}/10")
            print(f"Energy: {obs.get('energy', 10)}/10")
            if isinstance(inv, dict):
                # Only list items with a non-zero / truthy count.
                inv_items = ", ".join(f"{k}: {v}" for k, v in inv.items() if v)
            else:
                inv_items = str(inv)
            print(f"Inventory: {inv_items if inv_items else 'empty'}")
            if isinstance(ach, dict):
                unlocked = sum(1 for v in ach.values() if v)
                print(f"Achievements: {unlocked}/{len(ach)} unlocked")

            # Take one step to get a new obs.
            # NOTE(review): action 2 is presumably "move_right" (matches the
            # label printed below) — confirm against the service's action table.
            step = await client.post(
                f"{base_url}/env/CrafterClassic/step",
                json={
                    "env_id": env_id,
                    "action": {"tool_calls": [{"tool": "interact", "args": {"action": 2}}]},
                },
            )
            step.raise_for_status()
            sdata = step.json()
            sobs = sdata["observation"]
            print("\n=== NEXT OBSERVATION (after move_right) ===")
            print(format_semantic_map_view(sobs, view_size=7))
        finally:
            # Cleanup is deliberately best-effort: a failed terminate must not
            # hide the original error or fail an otherwise successful run.
            try:
                await client.post(f"{base_url}/env/CrafterClassic/terminate", json={"env_id": env_id})
            except Exception:
                pass


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
179
+
180
+
@@ -0,0 +1,98 @@
1
+ # Crafter Model Comparison Cookbook
2
+
3
+ This cookbook demonstrates how to run parallel experiments comparing different language models on the Crafter environment, with robust timeout handling and performance analysis.
4
+
5
+ ## Overview
6
+
7
+ This cookbook runs episodes of the Crafter game environment with different language models (e.g., gpt-5-nano and Qwen/Qwen3-32B-Instruct) in parallel, collecting performance metrics and analyzing the results.
8
+
9
+ ## Features
10
+
11
+ - **Parallel episode execution**: Runs multiple episodes simultaneously for faster experimentation
12
+ - **Timeout handling**:
13
+ - Turn-level timeout (20s per LLM call)
14
+ - Episode-level timeout (180s total)
15
+ - Action execution timeout (5s)
16
+ - **Progress tracking**: Real-time progress bars showing steps across all episodes
17
+ - **Performance comparison**: Analyzes achievements, invalid action rates, and model usage statistics
18
+ - **Deterministic seeding**: Uses consecutive seeds for reproducible experiments
19
+
20
+ ## Prerequisites
21
+
22
+ 1. Ensure the Crafter environment service is running:
23
+ ```bash
24
+ cd synth-ai/
25
+ bash serve.sh
26
+ ```
27
+
28
+ 2. Set up your API keys (Synth/OpenAI or provider as needed):
29
+ ```bash
30
+ export OPENAI_API_KEY="your-api-key"
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ Run the comparison script (gpt-5-nano vs Qwen/Qwen3-32B-Instruct):
36
+ ```bash
37
+ uv run python -m examples.evals_old.compare_models --episodes 5 --max-turns 100 --difficulty easy \
38
+ --models "gpt-5-nano" "Qwen/Qwen3-32B-Instruct"
39
+ ```
40
+
41
+ Or with custom parameters:
42
+ ```bash
43
+ python compare_models.py \
44
+ --episodes 10 \
45
+ --max-turns 100 \
46
+ --difficulty easy \
47
+ --models "gpt-5-nano" "Qwen/Qwen3-32B-Instruct" \
48
+ --base-seed 1000 \
49
+ --turn-timeout 30.0 \
50
+ --episode-timeout 300.0
51
+ ```
52
+
53
+ ## Parameters
54
+
55
+ - `--episodes`: Number of episodes per model (default: 5)
56
+ - `--max-turns`: Maximum turns per episode (default: 50)
57
+ - `--difficulty`: Game difficulty - easy, medium, hard (default: easy)
58
+ - `--models`: Models to test (default: gpt-4o-mini gpt-4.1-mini)
59
+ - `--base-seed`: Starting seed for episodes (default: 1000)
60
+ - `--turn-timeout`: Timeout per turn in seconds (default: 20.0)
61
+ - `--episode-timeout`: Total timeout per episode in seconds (default: 180.0)
62
+
63
+ ## Output
64
+
65
+ The script produces:
66
+ 1. Real-time progress bars showing episode execution
67
+ 2. Performance summary table comparing models
68
+ 3. Achievement frequency analysis
69
+ 4. Model usage statistics (filtered to current experiment only)
70
+ 5. JSON file with detailed results
71
+
72
+ ## Example Output (abridged)
73
+
74
+ ```
75
+ 📊 Analysis Results:
76
+ ================================================================================
77
+
78
+ 📈 Model Performance Summary:
79
+ Model Avg Achievements Max Achievements Invalid Rate Success Rate
80
+ --------------------------------------------------------------------------------------
81
+ gpt-5-nano 1.60 ± 1.10 4 1.20% 100.00%
82
+ Qwen/Qwen3-32B-Inst 1.40 ± 1.05 3 1.80% 100.00%
83
+
84
+ 🏆 Achievement Frequencies:
85
+ Achievement gpt-5-na qwen3-32
86
+ -----------------------------------------------
87
+ collect_drink 1/5 ( 20%) 3/5 ( 60%)
88
+ collect_sapling 2/5 ( 40%) 2/5 ( 40%)
89
+ collect_wood 4/5 ( 80%) 2/5 ( 40%)
90
+ ```
91
+
92
+ ## Implementation Details
93
+
94
+ The comparison uses:
95
+ - Async/await for parallel episode execution
96
+ - Session-based tracing with v3 architecture
97
+ - Structured output tools for consistent LLM interactions
98
+ - SQLite database for tracking model usage and costs
@@ -0,0 +1,6 @@
1
+ """
2
+ Crafter Model Comparison Cookbook
3
+
4
+ A self-contained example for comparing language model performance on the Crafter environment
5
+ with parallel execution and robust timeout handling.
6
+ """