synth-ai 0.2.9.dev3__py3-none-any.whl → 0.2.9.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of synth-ai might be problematic.

Files changed (107)
  1. examples/analyze_semantic_words.sh +17 -0
  2. examples/common_old/backend.py +21 -0
  3. examples/crafter_debug_render.py +180 -0
  4. examples/evals_old/README.md +98 -0
  5. examples/evals_old/__init__.py +6 -0
  6. examples/evals_old/compare_models.py +1037 -0
  7. examples/evals_old/example_log.md +145 -0
  8. examples/evals_old/run_demo.sh +126 -0
  9. examples/evals_old/trace_analysis.py +270 -0
  10. examples/finetuning_old/_backup_synth_qwen/config.toml +29 -0
  11. examples/finetuning_old/_backup_synth_qwen/example_log.md +324 -0
  12. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +60 -0
  13. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +239 -0
  14. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +109 -0
  15. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +1924 -0
  16. examples/finetuning_old/_backup_synth_qwen/readme.md +49 -0
  17. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +114 -0
  18. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +195 -0
  19. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +118 -0
  20. examples/finetuning_old/synth_qwen_v1/README.md +68 -0
  21. examples/finetuning_old/synth_qwen_v1/filter_traces.py +60 -0
  22. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +239 -0
  23. examples/finetuning_old/synth_qwen_v1/finetune.py +46 -0
  24. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +71 -0
  25. examples/finetuning_old/synth_qwen_v1/infer.py +37 -0
  26. examples/finetuning_old/synth_qwen_v1/poll.py +44 -0
  27. examples/finetuning_old/synth_qwen_v1/prepare_data.py +35 -0
  28. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +109 -0
  29. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +1932 -0
  30. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +207 -0
  31. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +232 -0
  32. examples/finetuning_old/synth_qwen_v1/upload_data.py +34 -0
  33. examples/finetuning_old/synth_qwen_v1/util.py +147 -0
  34. examples/rl/README.md +169 -0
  35. examples/rl/configs/eval_base_qwen.toml +15 -0
  36. examples/rl/configs/eval_rl_qwen.toml +11 -0
  37. examples/rl/configs/rl_from_base_qwen.toml +35 -0
  38. examples/rl/configs/rl_from_base_qwen17.toml +74 -0
  39. examples/rl/configs/rl_from_ft_qwen.toml +35 -0
  40. examples/rl/download_dataset.py +64 -0
  41. examples/rl/run_eval.py +435 -0
  42. examples/rl/run_rl_and_save.py +94 -0
  43. examples/rl/task_app/README.md +22 -0
  44. {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +8 -8
  45. examples/rl/task_app/math_task_app.py +107 -0
  46. examples/rl_old/task_app.py +962 -0
  47. examples/run_crafter_demo.sh +10 -0
  48. examples/warming_up_to_rl/analyze_trace_db.py +420 -0
  49. examples/warming_up_to_rl/configs/crafter_fft.toml +48 -0
  50. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
  51. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +20 -0
  52. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +13 -0
  53. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +23 -0
  54. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +73 -0
  55. examples/warming_up_to_rl/configs/rl_from_ft.toml +56 -0
  56. examples/warming_up_to_rl/export_trace_sft.py +541 -0
  57. examples/warming_up_to_rl/groq_test.py +88 -0
  58. examples/warming_up_to_rl/manage_secrets.py +127 -0
  59. examples/warming_up_to_rl/old/event_rewards.md +234 -0
  60. examples/warming_up_to_rl/old/notes.md +73 -0
  61. examples/warming_up_to_rl/readme.md +172 -0
  62. examples/warming_up_to_rl/run_eval.py +434 -0
  63. examples/warming_up_to_rl/run_fft_and_save.py +309 -0
  64. examples/warming_up_to_rl/run_local_rollout.py +188 -0
  65. examples/warming_up_to_rl/run_local_rollout_modal.py +160 -0
  66. examples/warming_up_to_rl/run_local_rollout_parallel.py +342 -0
  67. examples/warming_up_to_rl/run_local_rollout_traced.py +372 -0
  68. examples/warming_up_to_rl/run_rl_and_save.py +101 -0
  69. examples/warming_up_to_rl/run_rollout_remote.py +129 -0
  70. examples/warming_up_to_rl/task_app/README.md +38 -0
  71. {synth_ai/task/apps → examples/warming_up_to_rl/task_app}/grpo_crafter.py +7 -7
  72. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +165 -0
  73. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  74. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  75. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +145 -0
  76. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1271 -0
  77. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  78. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +429 -0
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +442 -0
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +96 -0
  83. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +302 -0
  84. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  85. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +202 -0
  86. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  87. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +512 -0
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +102 -0
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +985 -0
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +197 -0
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1749 -0
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +217 -0
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +160 -0
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +146 -0
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +58 -0
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +61 -0
  98. synth_ai/api/train/config_finder.py +18 -18
  99. synth_ai/api/train/env_resolver.py +28 -1
  100. synth_ai/cli/task_apps.py +264 -55
  101. synth_ai/task/apps/__init__.py +54 -13
  102. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev4.dist-info}/METADATA +1 -1
  103. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev4.dist-info}/RECORD +107 -12
  104. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev4.dist-info}/top_level.txt +1 -0
  105. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev4.dist-info}/WHEEL +0 -0
  106. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev4.dist-info}/entry_points.txt +0 -0
  107. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1932 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script that runs ReAct agents against the Crafter environment using the LM class with the Synth backend.
4
+ This demonstrates using the LM class with Synth models through native integration.
5
+
6
+ This version uses the new tracing_v3 system with async Turso/SQLite backend.
7
+ """
8
+ import argparse
9
+ import asyncio
10
+ import contextlib
11
+ from contextlib import asynccontextmanager
12
+ import glob
13
+ import itertools
14
+ import json
15
+ import logging
16
+ import os
17
+ import random
18
+ import sys
19
+ import time
20
+ import uuid
21
+ from collections import defaultdict
22
+ from datetime import datetime
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ import httpx
27
+ import numpy as np
28
+ import toml
29
+ import yaml
30
+ from httpx import AsyncClient
31
+ from tqdm import tqdm
32
+
33
+ from synth_ai.config.base_url import get_backend_from_env
34
+
35
+
36
+ def _resolve_backend_default() -> str:
37
+ base, _ = get_backend_from_env()
38
+ base = base.rstrip("/")
39
+ return base if base.endswith("/api") else f"{base}/api"
40
+
41
+ # Disable httpx logging immediately
42
+ logging.getLogger("httpx").setLevel(logging.ERROR)
43
+ logging.getLogger("httpcore").setLevel(logging.ERROR)
44
+
45
+
46
+ # Configure logging to suppress noisy third-party logs when in quiet mode
47
+ def setup_logging(quiet_mode: bool = False):
48
+ """Setup logging configuration."""
49
+ if quiet_mode:
50
+ # Suppress most third-party logging in quiet mode
51
+ logging.getLogger("httpx").setLevel(logging.ERROR)
52
+ logging.getLogger("synth_ai.tracing_v3").setLevel(logging.ERROR)
53
+ logging.getLogger("synth_ai.tracing_v3.turso").setLevel(logging.ERROR)
54
+ logging.getLogger("sqlalchemy").setLevel(logging.ERROR)
55
+ logging.getLogger("aiosqlite").setLevel(logging.ERROR)
56
+ # Suppress httpcore as well (used by httpx)
57
+ logging.getLogger("httpcore").setLevel(logging.ERROR)
58
+ else:
59
+ # Normal logging levels
60
+ logging.getLogger("httpx").setLevel(logging.ERROR) # Always suppress httpx logs
61
+ logging.getLogger("synth_ai.tracing_v3").setLevel(logging.INFO)
62
+
63
+
64
+ # Set default logging to avoid noisy logs during import
65
+ setup_logging(quiet_mode=True)
66
+
67
+ # Extend sys.path so local project imports resolve
68
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent.parent.parent))
69
+
70
+ # Disable v1 logging to see v3 tracing clearly
71
+ os.environ["LANGFUSE_ENABLED"] = "false"
72
+ os.environ["SYNTH_LOGGING"] = "false"
73
+
74
+ from synth_ai.lm.config import SynthConfig # noqa: E402
75
+
76
+ # Import Synth warmup utilities
77
+ from synth_ai.lm.warmup import warmup_synth_model # noqa: E402
78
+
79
+ # Import session tracer for v3 tracing
80
+ from synth_ai.tracing_v3 import SessionTracer # noqa: E402
81
+ from synth_ai.tracing_v3.abstractions import ( # noqa: E402
82
+ EnvironmentEvent,
83
+ RuntimeEvent,
84
+ SessionEventMarkovBlanketMessage,
85
+ TimeRecord,
86
+ )
87
+
88
+ # Import the v3 tracing hook manager and sqld daemon
89
+ from synth_ai.tracing_v3.hooks import HookManager # noqa: E402
90
+ from synth_ai.tracing_v3.turso.daemon import SqldDaemon # noqa: E402
91
+
92
+ # create_experiment_context will be defined as a helper function below
93
+ from synth_ai.tracing_v3.turso.manager import AsyncSQLTraceManager # noqa: E402
94
+
95
+ # Create a custom hook manager without default print statements
96
+ QUIET_HOOKS = HookManager()
97
+
98
+ # Import LM components (v3 version if available)
99
+ try:
100
+ from synth_ai.lm.core.main_v3 import LM # noqa: E402
101
+ except ImportError:
102
+ from synth_ai.lm.core.main_v2 import LM # noqa: E402
103
+
104
+ # Configuration constants
105
+ HTTP_TIMEOUT = (
106
+ 30.0 # Increased from 10.0 for better handling of concurrent load and LM response times
107
+ )
108
+ MAX_RETRIES = 3
109
+ RETRY_DELAY = 1.0
110
+
111
+ # No-op async context manager used when no session tracer is active
112
+ @asynccontextmanager
113
+ async def _noop_async_context():
114
+ yield
115
+
116
+
117
+
118
+ async def create_experiment_context(
119
+ db_manager: AsyncSQLTraceManager, experiment_name: str, description: str
120
+ ) -> dict[str, Any]:
121
+ """Create an experiment context for v3 tracing."""
122
+ experiment_id = f"exp_{uuid.uuid4().hex[:12]}"
123
+ await db_manager.create_experiment(
124
+ experiment_id=experiment_id, name=experiment_name, description=description, configuration={}
125
+ )
126
+ return {
127
+ "experiment_id": experiment_id,
128
+ "experiment_name": experiment_name,
129
+ "description": description,
130
+ }
131
+
132
+
133
+ def cleanup_old_files():
134
+ """Clean up old trace files and result files to keep directory clean."""
135
+ # Remove old JSON result files (keep only the latest 5)
136
+ result_files = glob.glob("crafter_lm_synth_results_*.json")
137
+ if len(result_files) > 5:
138
+ # Sort by modification time and keep only the latest 5
139
+ result_files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
140
+ for old_file in result_files[5:]:
141
+ try:
142
+ os.remove(old_file)
143
+ print(f"🗑️ Cleaned up old result file: {old_file}")
144
+ except OSError:
145
+ pass
146
+
147
+
148
+ def _load_env_from_monorepo() -> dict:
149
+ """Load environment variables from monorepo/.env.local if present."""
150
+ env_file = (
151
+ Path(__file__).resolve().parent.parent.parent.parent.parent.parent / "monorepo/.env.local"
152
+ )
153
+ env_vars = {}
154
+
155
+ if env_file.exists():
156
+ with open(env_file) as f:
157
+ for line in f:
158
+ line = line.strip()
159
+ if line and not line.startswith("#") and "=" in line:
160
+ key, value = line.split("=", 1)
161
+ # Remove quotes if present
162
+ value = value.strip().strip('"').strip("'")
163
+ env_vars[key] = value
164
+
165
+ return env_vars
166
+
167
+
168
+ def _load_testing_yaml_api_key() -> str | None:
169
+ """Load SYNTH_API_KEY from monorepo/tests/prod/testing_info.yaml if present."""
170
+ # First try the new env vars from monorepo/.env.local
171
+ env_vars = _load_env_from_monorepo()
172
+
173
+ # Try production key first, then test key
174
+ if "SYNTH_API_KEY_PROD" in env_vars:
175
+ return env_vars["SYNTH_API_KEY_PROD"]
176
+ elif "SYNTH_API_KEY_TEST" in env_vars:
177
+ return env_vars["SYNTH_API_KEY_TEST"]
178
+
179
+ # Fallback to the old YAML method
180
+ yaml_path = (
181
+ Path(__file__).resolve().parent.parent.parent.parent.parent.parent
182
+ / "monorepo/tests/prod/testing_info.yaml"
183
+ )
184
+ if yaml_path.exists():
185
+ with open(yaml_path) as f:
186
+ data = yaml.safe_load(f)
187
+ return data.get("SYNTH_API_KEY")
188
+ return None
189
+
190
+
191
+ def setup_synth_environment():
192
+ """Setup environment variables for Synth/Modal endpoints.
193
+
194
+ Resolution order for the base URL:
195
+ 1. Explicit environment variables (SYNTH_BASE_URL or MODAL_BASE_URL)
196
+ 2. PROD_API_URL env var used in production integration tests
197
+ 3. Synth production default (PROD_BASE_URL_DEFAULT)
198
+
199
+ The API key is resolved from the matching *_API_KEY env vars or, if not
200
+ present, from the shared testing_info.yaml used by the prod tests.
201
+ """
202
+ # Load environment variables from monorepo/.env.local
203
+ env_vars = _load_env_from_monorepo()
204
+
205
+ synth_base_url = (
206
+ os.getenv("SYNTH_BASE_URL")
207
+ or os.getenv("MODAL_BASE_URL")
208
+ or os.getenv("PROD_API_URL")
209
+ or env_vars.get("SYNTH_BASE_URL_PROD") # Use production URL from .env.local
210
+ or _resolve_backend_default()
211
+ )
212
+
213
+ synth_api_key = os.getenv("SYNTH_API_KEY") or _load_testing_yaml_api_key()
214
+
215
+ # # --- Validate API key format ---
216
+ # if synth_api_key:
217
+ # VALID_PREFIXES = ("sk-", "sk_live_", "sk_test_")
218
+ # if not any(synth_api_key.startswith(p) for p in VALID_PREFIXES):
219
+ # truncated = synth_api_key[:8] if len(synth_api_key) >= 8 else synth_api_key
220
+ # expected_formats = " or ".join(VALID_PREFIXES)
221
+ # raise ValueError(
222
+ # f"Invalid API key format. Expected prefix {expected_formats}. Provided key begins with '{truncated}'."
223
+ # )
224
+ # else:
225
+ # raise ValueError(
226
+ # "SYNTH_API_KEY or MODAL_API_KEY must be provided via environment variables or testing_info.yaml"
227
+ # )
228
+
229
+ # Ensure trailing /v1 for OpenAI-compatible endpoints
230
+ if not synth_base_url.endswith("/v1"):
231
+ synth_base_url = synth_base_url.rstrip("/") + "/v1"
232
+ synth_base_url = synth_base_url.rstrip("/")
233
+
234
+ # Propagate to OpenAI SDK env vars expected by LM class
235
+ os.environ["OPENAI_API_BASE"] = synth_base_url
236
+ os.environ["OPENAI_BASE_URL"] = synth_base_url
237
+ os.environ["OPENAI_API_KEY"] = synth_api_key
238
+
239
+ return synth_base_url, synth_api_key
240
+
241
+
242
+ async def retry_http_request(client: AsyncClient, method: str, url: str, **kwargs) -> Any:
243
+ """Retry HTTP requests with exponential backoff and jitter."""
244
+ last_exception = None
245
+
246
+ for attempt in range(MAX_RETRIES):
247
+ try:
248
+ if attempt > 0:
249
+ delay = min(RETRY_DELAY * (2 ** (attempt - 1)), RETRY_DELAY * 2) # Exponential backoff capped at 2x RETRY_DELAY
250
+ jitter = random.uniform(0, 0.1 * delay)
251
+ total_delay = delay + jitter
252
+ await asyncio.sleep(total_delay)
253
+
254
+ response = await client.request(method, url, timeout=HTTP_TIMEOUT, **kwargs)
255
+
256
+ if response.status_code < 500:
257
+ return response
258
+
259
+ last_exception = Exception(f"HTTP {response.status_code}: {response.text}")
260
+
261
+ except httpx.ReadError as e:
262
+ last_exception = e
263
+ if attempt < MAX_RETRIES - 1:
264
+ read_error_delay = min(1.0 * (2**attempt), 5.0)
265
+ await asyncio.sleep(read_error_delay)
266
+ except Exception as e:
267
+ last_exception = e
268
+
269
+ print(
270
+ f" ❌ HTTP request failed after {MAX_RETRIES} attempts: {type(last_exception).__name__}: {str(last_exception)[:200]}"
271
+ )
272
+ raise last_exception
273
+
274
+
275
+ def create_message(
276
+ content: Any, message_type: str, origin_system_id: Any, turn: int
277
+ ) -> SessionEventMarkovBlanketMessage:
278
+ """Create a message with origin system ID embedded in content."""
279
+ # Map custom message types to valid v3 message types
280
+ type_mapping = {
281
+ "observation": "system", # Map observation to system message
282
+ "user": "user",
283
+ "assistant": "assistant",
284
+ "system": "system",
285
+ "tool_use": "tool_use",
286
+ "tool_result": "tool_result",
287
+ }
288
+
289
+ return SessionEventMarkovBlanketMessage(
290
+ content=json.dumps({"origin_system_id": str(origin_system_id), "payload": content}),
291
+ message_type=type_mapping.get(message_type, "system"), # Default to system
292
+ time_record=TimeRecord(event_time=time.time(), message_time=turn),
293
+ )
294
+
295
+
296
+ def compress_observation_for_trace(obs: dict[str, Any]) -> dict[str, Any]:
297
+ """Compress observation for trace storage to avoid huge trace files."""
298
+ compressed = obs.copy()
299
+
300
+ # Compress semantic map if present
301
+ if "semantic_map" in compressed:
302
+ del compressed["semantic_map"]
303
+
304
+ # Compress other large fields
305
+ if "rgb" in compressed:
306
+ del compressed["rgb"]
307
+
308
+ return compressed
309
+
310
+
311
+ def format_semantic_map_view_v2(obs: dict[str, Any], view_size: int = 7) -> str:
312
+ """Format a semantic map view around the player with normal names using real Crafter mapping."""
313
+ # Get semantic map
314
+ semantic_map = obs.get("semantic_map")
315
+ if semantic_map is None:
316
+ return "No semantic map available"
317
+
318
+ # Convert to numpy array if needed
319
+ sem_arr = np.asarray(semantic_map)
320
+ if sem_arr.ndim == 1:
321
+ # Assuming square map, reshape
322
+ size = int(np.sqrt(sem_arr.size))
323
+ sem_arr = sem_arr.reshape(size, size)
324
+
325
+ # Get player position
326
+ player_pos = obs.get("player_position", [sem_arr.shape[0] // 2, sem_arr.shape[1] // 2])
327
+ px, py = int(player_pos[0]), int(player_pos[1])
328
+
329
+ # Get real crafter semantic mapping directly from crafter library
330
+ import crafter
331
+
332
+ dummyenv = crafter.Env()
333
+ try:
334
+ max_id = (
335
+ max(max(dummyenv._world._mat_ids.values()), max(dummyenv._sem_view._obj_ids.values()))
336
+ + 1
337
+ )
338
+ id_to_item = ["void"] * max_id
339
+ for name, ind in itertools.chain(
340
+ dummyenv._world._mat_ids.items(), dummyenv._sem_view._obj_ids.items()
341
+ ):
342
+ clean = (
343
+ name.__name__
344
+ if hasattr(name, "__name__")
345
+ else (str(name) if name is not None else "none")
346
+ )
347
+ id_to_item[ind] = clean.lower()
348
+ finally:
349
+ with contextlib.suppress(Exception):
350
+ dummyenv.close()
351
+
352
+ # Create view
353
+ half = view_size // 2
354
+ lines = []
355
+ visible_items = set()
356
+
357
+ for dy in range(-half, half + 1):
358
+ row = []
359
+ for dx in range(-half, half + 1):
360
+ x, y = px + dx, py + dy
361
+
362
+ if dx == 0 and dy == 0:
363
+ row.append("you") # Player
364
+ elif 0 <= x < sem_arr.shape[0] and 0 <= y < sem_arr.shape[1]:
365
+ val = int(sem_arr[x, y])
366
+ # Use the real crafter mapping
367
+ item_name = id_to_item[val] if val < len(id_to_item) else f"unknown_{val}"
368
+ row.append(item_name)
369
+ if item_name not in ["grass", "you", "void"]:
370
+ visible_items.add(item_name)
371
+ else:
372
+ row.append("void") # Out of bounds
373
+
374
+ lines.append(" ".join(row))
375
+
376
+ # Add legend of visible items
377
+ legend = (
378
+ f"Visible items: {', '.join(sorted(visible_items))}"
379
+ if visible_items
380
+ else "No special items visible (mostly grass)"
381
+ )
382
+
383
+ return "\n".join(lines) + "\n" + legend
384
+
385
+
386
+ def get_openai_tools():
387
+ """Get OpenAI-compatible tool definitions for Synth models."""
388
+ return [
389
+ {
390
+ "type": "function",
391
+ "function": {
392
+ "name": "interact",
393
+ "description": "Perform actions in the Crafter environment.",
394
+ "parameters": {
395
+ "type": "object",
396
+ "properties": {
397
+ "actions": {
398
+ "type": "array",
399
+ "items": {"type": "string"},
400
+ "description": "List of actions to perform in sequence (e.g., ['move_right', 'move_right', 'do']). Available actions: move_left, move_right, move_up, move_down, do, sleep, place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword, noop",
401
+ },
402
+ "reasoning": {
403
+ "type": "string",
404
+ "description": "Reasoning for these actions",
405
+ },
406
+ },
407
+ "required": ["actions", "reasoning"],
408
+ },
409
+ },
410
+ },
411
+ {
412
+ "type": "function",
413
+ "function": {
414
+ "name": "terminate",
415
+ "description": "End the episode when finished or no progress can be made.",
416
+ "parameters": {
417
+ "type": "object",
418
+ "properties": {
419
+ "reason": {"type": "string", "description": "Reason for termination"}
420
+ },
421
+ "required": ["reason"],
422
+ },
423
+ },
424
+ },
425
+ ]
426
+
427
+
428
+ # --- Configuration Class ---
429
+ class CrafterConfig:
430
+ """Configuration for Crafter evaluation with Synth backend."""
431
+
432
+ def __init__(self, config_path: str | None = None):
433
+ # Default values
434
+ self.model_name: str | None = None
435
+ self.num_instances = 1
436
+ self.max_turns = 2
437
+ self.difficulty = "easy"
438
+ self.service_base_url = "http://localhost:8901"
439
+ self.service_timeout = 30.0
440
+ self.seed = 42
441
+ self.save_traces = True
442
+ self.save_detailed_results = True
443
+ self.verbose = False
444
+ self.quiet = False # Add quiet mode support
445
+ self.analyze_traces = False
446
+
447
+ # V3 tracing settings
448
+ self.enable_v3_tracing = True
449
+ # Standardize to a single shared v3 DB by default; allow env override
450
+ self.v3_trace_dir = os.getenv("SYNTH_TRACES_ROOT", "./traces/v3")
451
+ # Use shared DB path unless explicitly overridden via env or config
452
+ self.turso_db_path = os.getenv(
453
+ "SQLD_DB_PATH", os.path.join(self.v3_trace_dir, "synth_ai.db")
454
+ )
455
+ self.start_sqld_daemon = True # Whether to start sqld daemon
456
+ self.auto_cleanup = True # Clean up old files automatically
457
+
458
+ # Synth-specific settings
459
+ self.warmup_model = True
460
+ self.warmup_max_attempts = 30
461
+ self.warmup_timeout = 60.0 # Default timeout in seconds
462
+ self.use_synth_backend = True # Flag to indicate Synth backend
463
+
464
+ # Load from TOML if provided
465
+ if config_path and os.path.exists(config_path):
466
+ self.load_from_toml(config_path)
467
+
468
+ def load_from_toml(self, config_path: str):
469
+ """Load configuration from TOML file."""
470
+ config = toml.load(config_path)
471
+
472
+ eval_config = config.get("eval", {})
473
+ self.model_name = eval_config.get("model_name", self.model_name)
474
+ self.num_instances = eval_config.get("episodes", self.num_instances)
475
+ self.max_turns = eval_config.get("max_steps", self.max_turns)
476
+ self.difficulty = eval_config.get("difficulty", self.difficulty)
477
+ self.seed = eval_config.get("seed", self.seed)
478
+
479
+ service_config = config.get("service", {})
480
+ self.service_base_url = service_config.get("base_url", self.service_base_url)
481
+ self.service_timeout = service_config.get("timeout", self.service_timeout)
482
+
483
+ output_config = config.get("output", {})
484
+ self.save_traces = output_config.get("save_traces", self.save_traces)
485
+ self.save_detailed_results = output_config.get(
486
+ "save_detailed_results", self.save_detailed_results
487
+ )
488
+
489
+ # V3 tracing config
490
+ tracing_config = config.get("tracing_v3", {})
491
+ self.enable_v3_tracing = tracing_config.get("enabled", self.enable_v3_tracing)
492
+ self.v3_trace_dir = tracing_config.get("trace_dir", self.v3_trace_dir)
493
+ self.turso_db_path = tracing_config.get("db_path", self.turso_db_path)
494
+ self.start_sqld_daemon = tracing_config.get("start_daemon", self.start_sqld_daemon)
495
+ self.auto_cleanup = tracing_config.get("auto_cleanup", self.auto_cleanup)
496
+
497
+ # Synth config
498
+ synth_config = config.get("synth", {})
499
+ self.warmup_model = synth_config.get("warmup_model", self.warmup_model)
500
+ self.warmup_max_attempts = synth_config.get("warmup_max_attempts", self.warmup_max_attempts)
501
+ self.warmup_timeout = synth_config.get("warmup_timeout", self.warmup_timeout)
502
+ self.use_synth_backend = synth_config.get("use_synth_backend", self.use_synth_backend)
503
+
504
+
505
+ # --- Base ReAct Agent using LM with Synth ---
506
+ class BaseReActAgentWithLMSynth:
507
+ """Base ReAct agent using LM class configured for Synth backend."""
508
+
509
+ def __init__(
510
+ self,
511
+ model_name: str,
512
+ max_turns: int = 20,
513
+ verbose: bool = False,
514
+ tracer: SessionTracer | None = None,
515
+ episode_id: int = 0,
516
+ quiet: bool = False,
517
+ model_params: dict[str, Any] | None = None,
518
+ ):
519
+ self.model_name = model_name
520
+ self.max_turns = max_turns
521
+ self.verbose = verbose
522
+ self.quiet = quiet
523
+ self.history = []
524
+ self.system_name = "base-react-agent-lm-synth"
525
+ self.tools = get_openai_tools()
526
+ self.tracer = tracer
527
+ self.system_id = f"{self.system_name}_{uuid.uuid4()}"
528
+ self.episode_id = episode_id
529
+
530
+ # Default model parameters
531
+ default_model_params = {
532
+ "temperature": 0.7,
533
+ "max_tokens": 512,
534
+ "top_p": 1.0,
535
+ "frequency_penalty": 0.0,
536
+ "presence_penalty": 0.0,
537
+ "tool_choice": "auto",
538
+ }
539
+
540
+ # Merge user-provided parameters with defaults
541
+ self.model_params = {**default_model_params, **(model_params or {})}
542
+
543
+ # Setup Synth environment variables
544
+ setup_synth_environment()
545
+
546
+ # Create LM instance with synth provider and configurable parameters
547
+ self.lm = LM(
548
+ model_name=model_name,
549
+ formatting_model_name=model_name,
550
+ temperature=self.model_params["temperature"],
551
+ synth_logging=False, # Disable v1 tracing
552
+ provider="synth", # Use synth provider
553
+ session_tracer=tracer,
554
+ system_id=self.system_id,
555
+ enable_v3_tracing=True,
556
+ # Pass additional model parameters
557
+ max_tokens=self.model_params["max_tokens"],
558
+ top_p=self.model_params["top_p"],
559
+ frequency_penalty=self.model_params["frequency_penalty"],
560
+ presence_penalty=self.model_params["presence_penalty"],
561
+ # Qwen3 think mode (propagated by vendor to chat_template_kwargs)
562
+ enable_thinking=self.model_params.get("enable_thinking"),
563
+ # Forward arbitrary extra_body to vendor for features like
564
+ # stop_after_tool_calls. The runner sets this to 1.
565
+ extra_body=self.model_params.get("extra_body"),
566
+ )
567
+
568
+ # Agent state tracking
569
+ self.agent_state = {
570
+ "message_history": [],
571
+ "steps_taken": 0,
572
+ "steps_remaining": max_turns,
573
+ "total_tokens_used": 0,
574
+ "tool_calls_made": 0,
575
+ "current_turn": 0,
576
+ "last_failure": None, # Track last failure for prompting
577
+ "recent_tool_calls": [],
578
+ }
579
+
580
+ async def decide(self, obs: str, system_message: str, turn: int) -> dict[str, Any]:
581
+ """Get agent decision based on observation using LM class with Synth backend."""
582
+ # Update agent state
583
+ self.agent_state["current_turn"] = turn
584
+ self.agent_state["steps_taken"] = turn
585
+ self.agent_state["steps_remaining"] = self.max_turns - turn
586
+
587
+ # Include the last 3 tool calls (reasoning and actions) to provide a short action history
588
+ recent_calls = self.agent_state.get("recent_tool_calls", [])
589
+ recent_tail = recent_calls[-3:] if isinstance(recent_calls, list) else []
590
+ if recent_tail:
591
+ lines = ["\nRecent tool calls (last 3):"]
592
+ for entry in recent_tail:
593
+ tnum = entry.get("turn")
594
+ name = entry.get("name")
595
+ reasoning = entry.get("reasoning")
596
+ actions = entry.get("actions")
597
+ actions_str = ", ".join(actions) if isinstance(actions, list) else ""
598
+ lines.append(
599
+ f"- Turn {tnum}: {name} — reasoning: {reasoning}; actions: {actions_str}"
600
+ )
601
+ obs_with_history = f"{obs}\n" + "\n".join(lines)
602
+ else:
603
+ obs_with_history = obs
604
+
605
+ # Create conversation context with unique episode ID to prevent caching
606
+ context = (
607
+ f"Episode {self.episode_id} - Turn {turn + 1}/{self.max_turns}\n\n{obs_with_history}"
608
+ )
609
+
610
+ # Build messages in OpenAI format for tools
611
+ # Augment the system message if the previous turn failed to produce a tool call
612
+ local_system_message = system_message
613
+ last_failure = self.agent_state.get("last_failure")
614
+ if last_failure:
615
+ local_system_message = (
616
+ f"{system_message}\n\nIMPORTANT: In the previous turn, no valid tool call was returned. "
617
+ f"Error: {last_failure}. You MUST respond with a single function tool call in the OpenAI tools format."
618
+ )
619
+ messages = [
620
+ {"role": "system", "content": local_system_message},
621
+ {"role": "user", "content": context},
622
+ ]
623
+
624
+ # Add to message history
625
+ self.agent_state["message_history"].extend(messages)
626
+
627
+ # Truncate history if too long
628
+ max_history_length = 20
629
+ if len(self.agent_state["message_history"]) > max_history_length:
630
+ self.agent_state["message_history"] = [
631
+ self.agent_state["message_history"][0]
632
+ ] + self.agent_state["message_history"][-(max_history_length - 1) :]
633
+
634
+ try:
635
+ llm_start = time.time()
636
+
637
+ # Optionally print full prompt on final turn when verbose
638
+ if self.verbose and turn == self.max_turns - 1:
639
+ print("\n🔍 FINAL TURN PROMPT:")
640
+ print("=" * 80)
641
+ print(f"System: {local_system_message[:200]}...")
642
+ print(f"\nUser message:\n{context}")
643
+ print("=" * 80)
644
+
645
+ # Debug: Print request info only when verbose
646
+ if self.verbose:
647
+ print(f"\n🔍 DEBUG: LM call details (turn {turn})")
648
+ print(f" Model: {self.model_name}")
649
+ print(" Provider: synth")
650
+ print(f" Messages: {len(messages)} messages")
651
+ print(f" Tools: {len(self.tools) if self.tools else 0} tools")
652
+ if self.tools:
653
+ print(
654
+ f" Tool 0 name: {self.tools[0].get('function', {}).get('name', 'unknown')}"
655
+ )
656
+ print(f" Tools structure: {json.dumps(self.tools[0], indent=4)[:300]}...")
657
+
658
+ # Call LM with turn number for v3 tracing
659
+ # The LM class should handle Synth routing internally
660
+ if self.verbose:
661
+ print(
662
+ f"🔍 DEBUG: LM sampling params => max_tokens={self.model_params.get('max_tokens')} temp={self.model_params.get('temperature')} top_p={self.model_params.get('top_p')} tool_choice={self.model_params.get('tool_choice')}"
663
+ )
664
+
665
+ # Optional full input logging (system, user, tools). Enable with CRAFTER_LOG_FULL_INPUTS=1
666
+ _log_full_inputs = os.getenv("CRAFTER_LOG_FULL_INPUTS", "0").lower() in (
667
+ "1",
668
+ "true",
669
+ "yes",
670
+ "on",
671
+ )
672
+ # if _log_full_inputs:
673
+ # print("\n" + "=" * 80)
674
+ # print(f"FULL LM INPUT (turn {turn})")
675
+ # print("-" * 80)
676
+ # print("System message:\n" + local_system_message)
677
+ # print("\nUser message:\n" + context)
678
+ # print("\nMessages JSON:")
679
+ # print(json.dumps(messages, indent=2))
680
+ # print("\nTools definition:")
681
+ # print(json.dumps(self.tools, indent=2))
682
+ # print("\nSampling/tool params:")
683
+ # print(
684
+ # json.dumps(
685
+ # {
686
+ # "tool_choice": self.model_params.get("tool_choice"),
687
+ # "extra_body": self.model_params.get("extra_body"),
688
+ # "temperature": self.model_params.get("temperature"),
689
+ # "max_tokens": self.model_params.get("max_tokens"),
690
+ # "top_p": self.model_params.get("top_p"),
691
+ # "frequency_penalty": self.model_params.get("frequency_penalty"),
692
+ # "presence_penalty": self.model_params.get("presence_penalty"),
693
+ # },
694
+ # indent=2,
695
+ # )
696
+ # )
697
+ # print("=" * 80)
698
+
699
+ response = await self.lm.respond_async(
700
+ messages=messages,
701
+ turn_number=turn,
702
+ # Pass tools in the format expected by LM class
703
+ tools=self.tools,
704
+ max_tokens=self.model_params["max_tokens"],
705
+ tool_choice=self.model_params.get("tool_choice", "auto"),
706
+ # Pass extra_body per call to ensure backend receives stop_after_tool_calls
707
+ extra_body=self.model_params.get("extra_body"),
708
+ )
709
+
710
+ llm_end = time.time()
711
+
712
+ # Minimal output: show only tool_call presence, number of actions, and tokens
713
+ completion_tokens = None
714
+ prompt_tokens = None
715
+ toks_per_sec = None
716
+ if hasattr(response, "usage") and isinstance(response.usage, dict):
717
+ completion_tokens = response.usage.get("completion_tokens")
718
+ prompt_tokens = response.usage.get("prompt_tokens")
719
+ # Compute tokens/sec if we have duration and completion tokens
720
+ try:
721
+ if completion_tokens is not None:
722
+ duration_s = max(1e-6, (llm_end - llm_start))
723
+ toks_per_sec = round(float(completion_tokens) / duration_s, 2)
724
+ except Exception:
725
+ toks_per_sec = None
726
+
727
+ # Parse the response to extract tool calls
728
+ raw_response = response.raw_response
729
+ decision: dict[str, Any]
730
+
731
+ if hasattr(response, "tool_calls") and response.tool_calls:
732
+ tool_call = response.tool_calls[0]
733
+ parsed_decision = None
734
+ fn = tool_call.get("function") if isinstance(tool_call, dict) else None
735
+ if isinstance(fn, dict) and ("name" in fn):
736
+ name = fn.get("name", "interact")
737
+ args_raw = fn.get("arguments", "{}")
738
+ try:
739
+ import json as _json
740
+
741
+ args = (
742
+ _json.loads(args_raw) if isinstance(args_raw, str) else (args_raw or {})
743
+ )
744
+ if isinstance(args, dict):
745
+ parsed_decision = {"name": name, "parameters": args}
746
+ except Exception as _e:
747
+ parsed_decision = {"name": name, "parameters": {"arguments": args_raw}}
748
+ if (
749
+ not parsed_decision
750
+ and isinstance(tool_call, dict)
751
+ and ("name" in tool_call or "parameters" in tool_call)
752
+ ):
753
+ parsed_decision = {
754
+ "name": tool_call.get("name", "interact"),
755
+ "parameters": tool_call.get("parameters", {}),
756
+ }
757
+ if parsed_decision:
758
+ decision = parsed_decision
759
+ try:
760
+ pname = decision.get("name")
761
+ pparams = (
762
+ decision.get("parameters", {}) if isinstance(decision, dict) else {}
763
+ )
764
+ preason = pparams.get("reasoning") if isinstance(pparams, dict) else None
765
+ pacts = pparams.get("actions") if isinstance(pparams, dict) else None
766
+ entry = {
767
+ "turn": turn,
768
+ "name": pname,
769
+ "reasoning": preason,
770
+ "actions": pacts if isinstance(pacts, list) else [],
771
+ }
772
+ self.agent_state["recent_tool_calls"].append(entry)
773
+ if len(self.agent_state["recent_tool_calls"]) > 10:
774
+ self.agent_state["recent_tool_calls"] = self.agent_state[
775
+ "recent_tool_calls"
776
+ ][-10:]
777
+ except Exception:
778
+ pass
779
+ # Clear failure flag on success
780
+ if self.agent_state.get("last_failure"):
781
+ self.agent_state["last_failure"] = None
782
+ params = decision.get("parameters", {}) if isinstance(decision, dict) else {}
783
+ actions = params.get("actions", []) if isinstance(params, dict) else []
784
+ num_actions = len(actions) if isinstance(actions, list) else 0
785
+ # Store metrics for tqdm postfix update in run_episode
786
+ self.agent_state["last_metrics"] = {
787
+ "tc": 1,
788
+ "act": num_actions,
789
+ "tok": completion_tokens,
790
+ "in": prompt_tokens,
791
+ "tps": f"{toks_per_sec}" if toks_per_sec is not None else "-",
792
+ }
793
+ else:
794
+ # Unrecognized tool_calls structure: do nothing, record failure
795
+ failure_msg = "Unrecognized tool_calls structure"
796
+ self.agent_state["last_failure"] = failure_msg
797
+ decision = {
798
+ "name": "interact",
799
+ "parameters": {"actions": [], "reasoning": failure_msg},
800
+ }
801
+ if self.verbose:
802
+ print(f"🔍 DEBUG: {failure_msg}")
803
+ else:
804
+ # No tool calls: do nothing, record failure for next prompt
805
+ failure_msg = "No valid tool_calls in assistant message"
806
+ self.agent_state["last_failure"] = failure_msg
807
+ decision = {
808
+ "name": "interact",
809
+ "parameters": {"actions": [], "reasoning": failure_msg},
810
+ }
811
+ # Store metrics for tqdm postfix update in run_episode
812
+ self.agent_state["last_metrics"] = {
813
+ "tc": 0,
814
+ "act": 0,
815
+ "tok": completion_tokens,
816
+ "in": prompt_tokens,
817
+ "tps": f"{toks_per_sec}" if toks_per_sec is not None else "-",
818
+ }
819
+
820
+ # Update agent state
821
+ self.agent_state["tool_calls_made"] += 1
822
+
823
+ # Add assistant response to history
824
+ assistant_message = {"role": "assistant", "content": raw_response}
825
+ self.agent_state["message_history"].append(assistant_message)
826
+
827
+ if self.verbose:
828
+ print(f"🤖 LM Response (turn {turn}): {json.dumps(decision, indent=2)}")
829
+ print(f"📊 Response time: {llm_end - llm_start:.2f}s")
830
+ except Exception as e:
831
+ print(f"❌ Error in LM decide: {e}")
832
+ import traceback
833
+
834
+ traceback.print_exc()
835
+ # Record failure and do nothing this turn
836
+ failure_msg = f"Exception during decide: {str(e)}"
837
+ self.agent_state["last_failure"] = failure_msg
838
+ decision = {"name": "interact", "parameters": {"actions": [], "reasoning": failure_msg}}
839
+
840
+ return decision
841
+
842
+ def _parse_tool_response(self, raw_response: str) -> dict[str, Any]:
843
+ """Parse raw LM response to extract tool calls."""
844
+ # Try to parse JSON if present
845
+ try:
846
+ # Look for JSON in the response
847
+ import re
848
+
849
+ json_match = re.search(r"\{.*\}", raw_response, re.DOTALL)
850
+ if json_match:
851
+ data = json.loads(json_match.group())
852
+ if "name" in data:
853
+ return data
854
+ elif "function" in data:
855
+ return {
856
+ "name": data["function"].get("name", "interact"),
857
+ "parameters": data["function"].get("arguments", {}),
858
+ }
859
+ except Exception:
860
+ pass
861
+
862
+ # Fallback to text parsing
863
+ if "terminate" in raw_response.lower():
864
+ return {"name": "terminate", "parameters": {"reason": "Agent decided to terminate"}}
865
+
866
+ # Try to extract actions from the response
867
+ actions = []
868
+ action_keywords = [
869
+ "move_up",
870
+ "move_down",
871
+ "move_left",
872
+ "move_right",
873
+ "do",
874
+ "sleep",
875
+ "place_stone",
876
+ "place_table",
877
+ "place_furnace",
878
+ "place_plant",
879
+ "make_wood_pickaxe",
880
+ "make_stone_pickaxe",
881
+ "make_iron_pickaxe",
882
+ "make_wood_sword",
883
+ "make_stone_sword",
884
+ "make_iron_sword",
885
+ ]
886
+
887
+ for keyword in action_keywords:
888
+ if keyword in raw_response.lower():
889
+ actions.append(keyword)
890
+
891
+ if not actions:
892
+ actions = ["do"] # Default action
893
+
894
+ return {
895
+ "name": "interact",
896
+ "parameters": {
897
+ "actions": actions, # Return as array of actions
898
+ "reasoning": "Parsed from response",
899
+ },
900
+ }
901
+
902
+ def get_system_message(self) -> str:
903
+ """Return system message for agent. Override in subclasses."""
904
+ return """You are an AI agent playing Crafter. Use the available tools to interact with the environment.
905
+
906
+ CRITICAL RULE: You MUST provide MULTIPLE actions (2-5) in EVERY interact() tool call!
907
+
908
+ The 'interact' function accepts a LIST of 1-5 actions. ALWAYS provide 2-5 actions for efficiency.
909
+
910
+ GOOD Examples (what you SHOULD do):
911
+ ✓ interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")
912
+ ✓ interact(actions=["move_up", "move_up", "move_right", "do"], reasoning="Navigate to stone and mine it")
913
+ ✓ interact(actions=["place_table", "make_wood_pickaxe", "move_left"], reasoning="Craft and continue exploring")
914
+
915
+ BAD Examples (what you should AVOID):
916
+ ✗ interact(actions=["move_right"], reasoning="Move right") - TOO FEW ACTIONS!
917
+ ✗ interact(actions=["do"], reasoning="Collect") - TOO FEW ACTIONS!
918
+
919
+ REMEMBER: Single actions waste time. Always plan 2-5 actions ahead and execute them together!"""
920
+
921
+ def format_observation(self, obs: dict[str, Any]) -> str:
922
+ """Format observation for agent. Override in subclasses."""
923
+ return str(obs)
924
+
925
+
926
+ # --- Crafter-specific ReAct Agent ---
927
+ class CrafterReActAgentWithLMSynth(BaseReActAgentWithLMSynth):
928
+ """Crafter-specific ReAct agent with enhanced prompting for Synth models."""
929
+
930
+ def get_system_message(self) -> str:
931
+ """Return Crafter-specific system message optimized for Synth models."""
932
+ override = os.getenv("CRAFTER_SYSTEM_PROMPT")
933
+ if override:
934
+ return override
935
+ return """You are CrafterAgent playing Crafter survival environment. Your goal is to unlock as many achievements as possible while staying alive.
936
+
937
+ You will see a semantic map view showing your surroundings. Use this to navigate toward resources.
938
+
939
+ Key mechanics:
940
+ • 'do' action: collect wood from trees, stone from deposits, food from cows/plants
941
+ • 'do' does nothing on grass/water - move to find resources first
942
+ • Craft progression: wood → table → wood_pickaxe → stone → stone_pickaxe → iron tools
943
+ • Sleep when energy low to restore and unlock wake_up achievement
944
+ • Use semantic map view to navigate toward resources you can see
945
+
946
+ Available actions: move_left, move_right, move_up, move_down, do, sleep, place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword, noop
947
+
948
+ KEY ACHIEVEMENTS TO UNLOCK:
949
+ Basic Resource Collection (PRIORITY #1):
950
+ - collect_wood: Move NEXT TO a tree, then use action="do" to collect wood
951
+ - collect_stone: Move NEXT TO stone, then use action="do" (requires wood_pickaxe in inventory)
952
+ - collect_coal: Move NEXT TO coal, then use action="do" (requires stone_pickaxe)
953
+ - collect_iron: Move NEXT TO iron, then use action="do" (requires stone_pickaxe)
954
+ - collect_diamond: Move NEXT TO diamond, then use action="do" (requires iron_pickaxe)
955
+
956
+ Tool Crafting (enables resource collection):
957
+ - make_wood_pickaxe: Use action="make_wood_pickaxe" when you have wood (unlocks ability to mine stone)
958
+ - make_stone_pickaxe: Use action="make_stone_pickaxe" when you have wood and stone (unlocks coal/iron mining)
959
+ - make_iron_pickaxe: Use action="make_iron_pickaxe" when you have wood, coal, and iron (unlocks diamond mining)
960
+
961
+ Weapon Crafting (for defense):
962
+ - make_wood_sword: Use action="make_wood_sword" when you have wood
963
+ - make_stone_sword: Use action="make_stone_sword" when you have wood and stone
964
+ - make_iron_sword: Use action="make_iron_sword" when you have wood, coal, and iron
965
+
966
+ Survival Actions:
967
+ - eat_plant: Use action="eat_plant" when food < 9 and you see a plant nearby
968
+ - eat_cow: Move NEXT TO cow, use action="do" to kill it, then action="eat_cow"
969
+ - collect_drink: Move NEXT TO water, then use action="drink" when drink < 9
970
+ - sleep: Use action="sleep" when energy < 5 (restores energy to 9)
971
+
972
+ Building/Placing:
973
+ - place_table: Use action="place_table" when you have wood (enables advanced crafting)
974
+ - place_furnace: Use action="place_furnace" when you have stone (for smelting)
975
+ - place_plant: Use action="place_plant" when you have sapling (grows into tree)
976
+ - place_stone: Use action="place_stone" when you have stone (creates barrier)
977
+
978
+ Combat:
979
+ - defeat_zombie: Move NEXT TO zombie, then use action="do" repeatedly to attack
980
+ - defeat_skeleton: Move NEXT TO skeleton, then use action="do" repeatedly to attack
981
+
982
+ CRITICAL: The action="do" is your INTERACTION button! Use it when adjacent to:
983
+ - Trees → get wood
984
+ - Stone/Coal/Iron/Diamond → mine resources (need appropriate pickaxe)
985
+ - Enemies → attack them
986
+ - Cows → kill for food
987
+
988
+ Simple Strategy:
989
+ 1. Look for resources (trees, stones) in the semantic map
990
+ 2. Move toward the nearest resource
991
+ 3. When adjacent to a resource, use action="do" to collect it
992
+ 4. If you have wood, try action="make_wood_pickaxe"
993
+ 5. Repeat: find resources, move to them, use "do"
994
+
995
+ Critical Gameplay Tips:
996
+ - You must be ADJACENT (one tile away) to objects to interact with them
997
+ - Use "do" when next to: trees (for wood), stone (for stone), coal, iron, diamond
998
+ - Use "do" to attack zombies/skeletons when adjacent
999
+ - First priority: Find a tree, move next to it, then use "do" to collect wood
1000
+ - Wood is essential for crafting your first pickaxe
1001
+ - With wood_pickaxe you can mine stone, with stone_pickaxe you can mine iron, etc.
1002
+
1003
+ CRITICAL INSTRUCTION: You MUST ALWAYS provide MULTIPLE actions (2-5) in EVERY interact() tool call!
1004
+
1005
+ The 'interact' function accepts a LIST of 1-5 actions. NEVER use single actions - always plan 2-5 actions ahead!
1006
+
1007
+ MANDATORY action sequences (ALWAYS use multiple):
1008
+ ✓ interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")
1009
+ ✓ interact(actions=["move_up", "move_up", "move_right", "do"], reasoning="Navigate and collect")
1010
+ ✓ interact(actions=["place_table", "make_wood_pickaxe", "move_left", "move_left"], reasoning="Craft and explore")
1011
+ ✓ interact(actions=["do", "move_right", "do", "move_right", "do"], reasoning="Collect multiple resources")
1012
+
1013
+ FORBIDDEN (NEVER do this):
1014
+ ✗ interact(actions=["move_right"], ...) - WRONG! Too few actions!
1015
+ ✗ interact(actions=["do"], ...) - WRONG! Too few actions!
1016
+
1017
+ RULE: If you use less than 2 actions, you are playing inefficiently. Always think 2-5 steps ahead!
1018
+
1019
+ Key Strategy:
1020
+ 1. Plan a sequence of moves to reach resources
1021
+ 2. Execute multiple moves in one tool call (e.g., ["move_right", "move_right", "move_up"])
1022
+ 3. When adjacent to a resource, use "do" to collect it
1023
+ 4. Chain crafting actions together (e.g., ["place_table", "make_wood_pickaxe"])
1024
+
1025
+ Remember:
1026
+ - Use "do" when ADJACENT to trees (for wood), stones, or other resources
1027
+ - Collect wood FIRST before trying to craft anything
1028
+ - Be efficient - use multiple actions per tool call!
1029
+ - Focus on unlocking achievements by collecting resources and crafting items."""
1030
+
1031
+ def format_observation(self, obs: dict[str, Any]) -> str:
1032
+ """Format Crafter observation with semantic map view."""
1033
+ # Get semantic map view
1034
+ semantic_view = format_semantic_map_view_v2(obs, view_size=7)
1035
+
1036
+ # Extract key information
1037
+ inventory = obs.get("inventory", {})
1038
+ # Try both possible keys for achievements
1039
+ achievements = obs.get("achievements_status", obs.get("achievements_info", {}))
1040
+ health = obs.get("health", 10)
1041
+ food = obs.get("food", 10)
1042
+ drink = obs.get("drink", 10)
1043
+ energy = obs.get("energy", 10)
1044
+
1045
+ # Count achievements
1046
+ achieved = sum(1 for v in achievements.values() if v)
1047
+ total_achievements = len(achievements)
1048
+
1049
+ # Format inventory (only show non-zero items)
1050
+ inv_items = []
1051
+ for item, count in inventory.items():
1052
+ if count > 0:
1053
+ inv_items.append(f"{item}: {count}")
1054
+ inv_str = ", ".join(inv_items) if inv_items else "empty"
1055
+
1056
+ # List unlocked achievements
1057
+ unlocked = [k for k, v in achievements.items() if v]
1058
+ unlocked_str = ", ".join(unlocked) if unlocked else "none"
1059
+
1060
+ # Recent achievements (from info if available)
1061
+ recent_str = ""
1062
+
1063
+ suppress_reminder = os.getenv("CRAFTER_SUPPRESS_OBS_REMINDER")
1064
+ base = (
1065
+ f"=== SEMANTIC MAP VIEW (7x7) ===\n"
1066
+ f"{semantic_view}\n\n"
1067
+ f"=== STATUS ===\n"
1068
+ f"Health: {health}/10 | Food: {food}/10 | Drink: {drink}/10 | Energy: {energy}/10\n"
1069
+ f"Inventory: {inv_str}\n"
1070
+ f"Achievements: {achieved}/{total_achievements} unlocked\n"
1071
+ f"Unlocked: {unlocked_str}\n"
1072
+ f"{recent_str}\n\n"
1073
+ # f"What do you see in the map? What actions should you take? "
1074
+ )
1075
+ if suppress_reminder:
1076
+ return base
1077
+ return (
1078
+ base
1079
+ # + "\n\nREMINDER: You MUST provide 2-5 actions in your interact() tool call. Plan multiple steps ahead!\n"
1080
+ # + 'Example: interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")'
1081
+ )
1082
+
1083
+
1084
+ async def run_episode(
1085
+ episode_id: int,
1086
+ config: CrafterConfig,
1087
+ session_tracer: SessionTracer | None = None,
1088
+ progress_bar: tqdm | None = None,
1089
+ quiet: bool = False,
1090
+ model_params: dict[str, Any] | None = None,
1091
+ ):
1092
+ """Run a single episode."""
1093
+ episode_start_time = time.time()
1094
+
1095
+ # Create agent - always disable verbose for cleaner output
1096
+ agent = CrafterReActAgentWithLMSynth(
1097
+ model_name=config.model_name,
1098
+ max_turns=config.max_turns,
1099
+ verbose=False, # Always disable verbose logging in agent
1100
+ tracer=session_tracer,
1101
+ episode_id=episode_id,
1102
+ quiet=True, # Always use quiet mode for agent
1103
+ model_params=model_params,
1104
+ )
1105
+
1106
+ # Initialize environment
1107
+ async with AsyncClient(base_url=config.service_base_url) as client:
1108
+ try:
1109
+ # Initialize environment with unique seed for each episode
1110
+ # Use simple sequential seeds: 1, 2, 3, 4, etc.
1111
+ episode_seed = episode_id + 1 # Start from 1 instead of 0
1112
+
1113
+ init_response = await retry_http_request(
1114
+ client,
1115
+ "POST",
1116
+ "/env/CrafterClassic/initialize",
1117
+ json={"config": {"difficulty": config.difficulty, "seed": episode_seed}},
1118
+ )
1119
+
1120
+ init_data = init_response.json()
1121
+ instance_id = init_data["env_id"]
1122
+ obs = init_data["observation"]
1123
+
1124
+ # Start initial timestep and send initial observation as message
1125
+ if session_tracer:
1126
+ async with session_tracer.timestep("init", turn_number=0):
1127
+ obs_msg = create_message(
1128
+ compress_observation_for_trace(obs),
1129
+ "observation",
1130
+ f"crafter_env_{instance_id}",
1131
+ 0,
1132
+ )
1133
+ await session_tracer.record_message(
1134
+ content=obs_msg.content, message_type=obs_msg.message_type
1135
+ )
1136
+
1137
+ # Run episode
1138
+ episode_reward = 0
1139
+ termination_reason = None
1140
+ step_results = []
1141
+ consecutive_no_tool_calls = 0
1142
+
1143
+ # Create progress bar for this episode
1144
+ episode_progress = tqdm(
1145
+ total=config.max_turns,
1146
+ desc=f"Episode {episode_id}",
1147
+ position=episode_id,
1148
+ leave=True,
1149
+ ncols=100,
1150
+ )
1151
+
1152
+ for turn in range(config.max_turns):
1153
+ episode_progress.update(1)
1154
+
1155
+ # Use timestep context for this turn
1156
+ timestep_name = f"turn_{turn + 1}"
1157
+ async with (
1158
+ session_tracer.timestep(timestep_name, turn_number=turn + 1)
1159
+ if session_tracer
1160
+ else _noop_async_context()
1161
+ ):
1162
+ # Get agent decision
1163
+ obs_formatted = agent.format_observation(obs)
1164
+ system_msg = agent.get_system_message()
1165
+
1166
+ decision = await agent.decide(obs_formatted, system_msg, turn)
1167
+ # Update tqdm postfix with latest metrics from agent
1168
+ try:
1169
+ metrics = agent.agent_state.get("last_metrics")
1170
+ if isinstance(metrics, dict):
1171
+ episode_progress.set_postfix(metrics, refresh=False)
1172
+ except Exception:
1173
+ pass
1174
+
1175
+ # Handle termination
1176
+ if decision["name"] == "terminate":
1177
+ termination_reason = decision["parameters"]["reason"]
1178
+ break
1179
+
1180
+ # Detect consecutive no-tool-call responses and abort after 3
1181
+ decision_params = (
1182
+ decision.get("parameters") if isinstance(decision, dict) else None
1183
+ )
1184
+ decision_actions = (
1185
+ decision_params.get("actions", [])
1186
+ if isinstance(decision_params, dict)
1187
+ else []
1188
+ )
1189
+ if (
1190
+ decision.get("name") == "interact"
1191
+ and isinstance(decision_actions, list)
1192
+ and len(decision_actions) == 0
1193
+ ):
1194
+ consecutive_no_tool_calls += 1
1195
+ print(f"🔍 DEBUG: consecutive_no_tool_calls={consecutive_no_tool_calls}")
1196
+ else:
1197
+ consecutive_no_tool_calls = 0
1198
+ if consecutive_no_tool_calls >= 3:
1199
+ # Gracefully end the episode without recording this problematic turn
1200
+ termination_reason = "no_tool_calls_abort"
1201
+ break
1202
+
1203
+ # Execute actions in sequence
1204
+ actions = (
1205
+ decision["parameters"].get("actions", [])
1206
+ if isinstance(decision.get("parameters"), dict)
1207
+ else []
1208
+ )
1209
+
1210
+ # Ensure control variables are defined even if no actions are taken this turn
1211
+ done = False
1212
+ reward = 0.0
1213
+ info = {}
1214
+
1215
+ # Define action mapping
1216
+ crafter_action_map = {
1217
+ "noop": 0,
1218
+ "move_left": 1,
1219
+ "move_right": 2,
1220
+ "move_up": 3,
1221
+ "move_down": 4,
1222
+ "do": 5,
1223
+ "sleep": 6,
1224
+ "place_stone": 7,
1225
+ "place_table": 8,
1226
+ "place_furnace": 9,
1227
+ "place_plant": 10,
1228
+ "make_wood_pickaxe": 11,
1229
+ "make_stone_pickaxe": 12,
1230
+ "make_iron_pickaxe": 13,
1231
+ "make_wood_sword": 14,
1232
+ "make_stone_sword": 15,
1233
+ "make_iron_sword": 16,
1234
+ }
1235
+
1236
+ # Execute each action in the sequence (may be empty)
1237
+ for action in actions:
1238
+ # Convert action name to integer
1239
+ action_int = crafter_action_map.get(action, 0) # Default to noop
1240
+
1241
+ # Get state before action
1242
+ state_before = {"observation": obs} if "obs" in locals() else {}
1243
+ prev_obs = obs.copy()
1244
+
1245
+ # Step environment
1246
+ step_response = await retry_http_request(
1247
+ client,
1248
+ "POST",
1249
+ "/env/CrafterClassic/step",
1250
+ json={
1251
+ "env_id": instance_id,
1252
+ "action": {
1253
+ "tool_calls": [
1254
+ {"tool": "interact", "args": {"action": action_int}}
1255
+ ]
1256
+ },
1257
+ },
1258
+ )
1259
+ step_data = step_response.json()
1260
+
1261
+ # Check if response has expected structure
1262
+ if "observation" not in step_data:
1263
+ print(
1264
+ f"\n❌ Error: Missing observation in step response. Keys: {list(step_data.keys())}"
1265
+ )
1266
+ if "error" in step_data:
1267
+ print(f" Error message: {step_data['error']}")
1268
+ # Try to recover or break
1269
+ break
1270
+
1271
+ obs = step_data["observation"]
1272
+ reward = step_data.get("reward", 0) # Default to 0 if missing
1273
+ done = step_data.get("done", False) # Default to False if missing
1274
+ info = step_data.get("info", {})
1275
+
1276
+ # Calculate achievement reward if not provided by service
1277
+ if (
1278
+ (reward == 0 or reward is None)
1279
+ and ("achievements_status" in obs and "achievements_status" in prev_obs)
1280
+ ):
1281
+ prev_achievements = prev_obs["achievements_status"]
1282
+ curr_achievements = obs["achievements_status"]
1283
+ new_unlocks = sum(
1284
+ 1
1285
+ for k in curr_achievements
1286
+ if curr_achievements.get(k) and not prev_achievements.get(k)
1287
+ )
1288
+ if new_unlocks > 0:
1289
+ reward = float(new_unlocks) # +1 for each new achievement
1290
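When the service returns no reward, the loop falls back to counting achievements that flipped from locked to unlocked between consecutive observations. The same delta as a standalone sketch (assuming each observation exposes an `achievements_status` mapping of name to bool):

```python
# Illustrative sketch: +1 reward per newly unlocked achievement.
def achievement_delta(prev_obs: dict, obs: dict) -> float:
    prev = prev_obs.get("achievements_status", {})
    curr = obs.get("achievements_status", {})
    return float(sum(1 for k, v in curr.items() if v and not prev.get(k)))

# achievement_delta({"achievements_status": {"collect_wood": False}},
#                   {"achievements_status": {"collect_wood": True}})  # -> 1.0
```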
+
1291
+ if reward is not None:
1292
+ episode_reward += reward
1293
+
1294
+ # Record step result
1295
+ step_results.append(
1296
+ {
1297
+ "turn": turn,
1298
+ "action": action,
1299
+ "reward": reward,
1300
+ "done": done,
1301
+ "info": info,
1302
+ }
1303
+ )
1304
+
1305
+ # Record environment event for hooks to catch
1306
+ if session_tracer:
1307
+ # Create environment event with state transition
1308
+ env_event = EnvironmentEvent(
1309
+ time_record=TimeRecord(event_time=time.time(), message_time=turn),
1310
+ system_instance_id=f"crafter_env_{instance_id}",
1311
+ system_state_before={"public_state": prev_obs},
1312
+ system_state_after={"public_state": obs},
1313
+ reward=reward, # This now includes calculated achievement rewards
1314
+ terminated=done,
1315
+ metadata={"action": action, "action_int": action_int, "info": info},
1316
+ )
1317
+ await session_tracer.record_event(env_event)
1318
+
1319
+ # Also record runtime event for invalid action detection
1320
+ runtime_event = RuntimeEvent(
1321
+ time_record=TimeRecord(event_time=time.time(), message_time=turn),
1322
+ system_instance_id=f"crafter_runtime_{instance_id}",
1323
+ actions=[action_int],
1324
+ metadata={
1325
+ "action_name": action,
1326
+ "action_int": action_int,
1327
+ "reward": reward,
1328
+ "state_before": state_before,
1329
+ "state_after": {"observation": obs},
1330
+ },
1331
+ )
1332
+ await session_tracer.record_event(runtime_event)
1333
+
1334
+ if done:
1335
+ break
1336
+
1337
+ # After all actions (or none), send final observation message
1338
+ if session_tracer:
1339
+ obs_msg = create_message(
1340
+ compress_observation_for_trace(obs),
1341
+ "observation",
1342
+ f"crafter_env_{instance_id}",
1343
+ turn + 1,
1344
+ )
1345
+ await session_tracer.record_message(
1346
+ content=obs_msg.content, message_type=obs_msg.message_type
1347
+ )
1348
+
1349
+ if done:
1350
+ break
1351
+
1352
+ # Close progress bar
1353
+ episode_progress.close()
1354
+
1355
+ # Terminate instance
1356
+ terminate_response = await retry_http_request(
1357
+ client, "POST", "/env/CrafterClassic/terminate", json={"env_id": instance_id}
1358
+ )
1359
+
1360
+ except Exception as e:
1361
+ if "episode_progress" in locals():
1362
+ episode_progress.close()
1363
+ print(f"\n❌ Episode {episode_id} failed: {e}")
1364
+ if config.verbose:
1365
+ import traceback
1366
+
1367
+ traceback.print_exc()
1368
+ return {
1369
+ "episode_id": episode_id,
1370
+ "error": str(e),
1371
+ "duration": time.time() - episode_start_time,
1372
+ }
1373
+
1374
+ # Extract final achievements
1375
+ final_achievements = []
1376
+ if obs and "achievements_status" in obs:
1377
+ final_achievements = [k for k, v in obs["achievements_status"].items() if v]
1378
+
1379
+ # Return results
1380
+ return {
1381
+ "episode_id": episode_id,
1382
+ "total_reward": episode_reward,
1383
+ "steps": len(step_results),
1384
+ "termination_reason": termination_reason,
1385
+ "duration": time.time() - episode_start_time,
1386
+ "step_results": step_results,
1387
+ "achievements_unlocked": final_achievements,
1388
+ }
1389
+
1390
+
1391
+ # --- Main ---
1392
+ async def main():
1393
+ """Main entry point with v3 tracing."""
1394
+ parser = argparse.ArgumentParser(description="Run Crafter evaluation with LM Synth backend")
1395
+ parser.add_argument("--config", type=str, help="Path to TOML config file")
1396
+ parser.add_argument("--model", type=str, help="Model name (overrides config)")
1397
+ parser.add_argument("--episodes", type=int, help="Number of episodes (overrides config)")
1398
+ parser.add_argument("--max-steps", type=int, help="Max steps per episode (overrides config)")
1399
+ parser.add_argument(
1400
+ "--difficulty", type=str, choices=["easy", "normal", "hard"], help="Difficulty override"
1401
+ )
1402
+ parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
1403
+ parser.add_argument("--quiet", action="store_true", help="Suppress most output except results")
1404
+ parser.add_argument("--no-traces", action="store_true", help="Disable trace saving")
1405
+ parser.add_argument("--analyze", action="store_true", help="Analyze traces after running")
1406
+ parser.add_argument("--skip-warmup", action="store_true", help="Skip model warmup")
1407
+ parser.add_argument(
1408
+ "--no-daemon",
1409
+ action="store_true",
1410
+ help="Don't start sqld daemon (assumes it's already running)",
1411
+ )
1412
+
1413
+ # Qwen3 thinking mode flags (mutually exclusive)
1414
+ think_group = parser.add_mutually_exclusive_group()
1415
+ think_group.add_argument(
1416
+ "--think",
1417
+ dest="enable_thinking",
1418
+ action="store_true",
1419
+ help="Enable Qwen3 thinking mode (chat_template_kwargs.enable_thinking=True)",
1420
+ )
1421
+ think_group.add_argument(
1422
+ "--no-think",
1423
+ dest="enable_thinking",
1424
+ action="store_false",
1425
+ help="Disable Qwen3 thinking mode (chat_template_kwargs.enable_thinking=False)",
1426
+ )
1427
+ parser.set_defaults(enable_thinking=None)
1428
+
1429
+ # Model parameter arguments
1430
+ parser.add_argument(
1431
+ "--temperature",
1432
+ type=float,
1433
+ default=0.7,
1434
+ help="Temperature for model responses (default: 0.7)",
1435
+ )
1436
+ parser.add_argument(
1437
+ "--max-tokens", type=int, default=512, help="Maximum tokens to generate (default: 512)"
1438
+ )
1439
+ parser.add_argument(
1440
+ "--top-p", type=float, default=1.0, help="Top-p sampling parameter (default: 1.0)"
1441
+ )
1442
+ parser.add_argument(
1443
+ "--frequency-penalty", type=float, default=0.0, help="Frequency penalty (default: 0.0)"
1444
+ )
1445
+ parser.add_argument(
1446
+ "--presence-penalty", type=float, default=0.0, help="Presence penalty (default: 0.0)"
1447
+ )
1448
+ parser.add_argument(
1449
+ "--tool-choice",
1450
+ type=str,
1451
+ choices=["auto", "required", "none"],
1452
+ default="auto",
1453
+ help="Tool choice mode (default: auto)",
1454
+ )
1455
+
1456
+ args = parser.parse_args()
1457
+
1458
+ # Load configuration
1459
+ config = CrafterConfig(args.config)
1460
+
1461
+ # Setup Synth environment variables
1462
+ setup_synth_environment()
1463
+
1464
+ # Clean up old files to keep directory clean
1465
+ if config.auto_cleanup:
1466
+ cleanup_old_files()
1467
+
1468
+ # Apply command-line overrides
1469
+ if args.model:
1470
+ config.model_name = args.model
1471
+ if args.episodes:
1472
+ config.num_instances = args.episodes
1473
+ if args.max_steps:
1474
+ config.max_turns = args.max_steps
1475
+ if args.difficulty:
1476
+ config.difficulty = args.difficulty
1477
+ if args.verbose:
1478
+ config.verbose = True
1479
+ if args.quiet:
1480
+ config.quiet = True
1481
+ if not args.verbose: # Don't show this if verbose is also on
1482
+ print("🔇 Quiet mode enabled - suppressing verbose logs")
1483
+ else:
1484
+ config.quiet = False
1485
+ if args.no_daemon:
1486
+ config.start_sqld_daemon = False
1487
+
1488
+ # Environment overrides for model parameters (fail-fast on bad values)
1489
+ env_temp = os.getenv("CRAFTER_TEMPERATURE")
1490
+ if env_temp is not None:
1491
+ args.temperature = float(env_temp)
1492
+ env_max_tok = os.getenv("CRAFTER_MAX_TOKENS")
1493
+ if env_max_tok is not None:
1494
+ args.max_tokens = int(env_max_tok)
1495
+ env_tool_choice = os.getenv("CRAFTER_TOOL_CHOICE")
1496
+ if env_tool_choice is not None:
1497
+ if env_tool_choice not in {"auto", "required", "none"}:
1498
+ raise ValueError(f"Invalid CRAFTER_TOOL_CHOICE: {env_tool_choice}")
1499
+ args.tool_choice = env_tool_choice
1500
+ env_top_p = os.getenv("CRAFTER_TOP_P")
1501
+ if env_top_p is not None:
1502
+ args.top_p = float(env_top_p)
1503
+ env_freq_pen = os.getenv("CRAFTER_FREQUENCY_PENALTY")
1504
+ if env_freq_pen is not None:
1505
+ args.frequency_penalty = float(env_freq_pen)
1506
+ env_pres_pen = os.getenv("CRAFTER_PRESENCE_PENALTY")
1507
+ if env_pres_pen is not None:
1508
+ args.presence_penalty = float(env_pres_pen)
1509
+
1510
+ # Resolve stop-after-tool-calls from environment (wrapper sets this)
1511
+ try:
1512
+ _satc = int(os.getenv("CRAFTER_STOP_AFTER_TOOL_CALLS", "1"))
1513
+ except Exception:
1514
+ _satc = 1
1515
+ _extra_body = {"stop_after_tool_calls": _satc} if _satc and _satc > 0 else {}
1516
+
1517
+ # Create model parameters dictionary from command line arguments
1518
+ model_params = {
1519
+ "temperature": args.temperature,
1520
+ "max_tokens": args.max_tokens,
1521
+ "top_p": args.top_p,
1522
+ "frequency_penalty": args.frequency_penalty,
1523
+ "presence_penalty": args.presence_penalty,
1524
+ "tool_choice": args.tool_choice,
1525
+ # Request early stop after N tool call blocks to avoid spillover
1526
+ "extra_body": _extra_body,
1527
+ }
1528
+ # Optionally carry thinking mode through to LM config
1529
+ if args.enable_thinking is not None:
1530
+ model_params["enable_thinking"] = args.enable_thinking
1531
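How `model_params` reaches the backend is handled by the LM wrapper, but against an OpenAI-compatible endpoint a dict like this is typically splatted into the chat-completions call, with non-standard fields routed through `extra_body`. This is a hedged sketch only; the client construction, the routing of `enable_thinking`, and the function name are assumptions, not the package's actual wiring:

```python
# Illustrative sketch: forwarding model_params to an OpenAI-compatible endpoint.
from openai import AsyncOpenAI  # assumes the standard `openai` v1 client is available

async def call_chat(messages, tools, model_params, model_name, base_url, api_key):
    client = AsyncOpenAI(base_url=base_url, api_key=api_key)
    extra_body = dict(model_params.get("extra_body") or {})
    if model_params.get("enable_thinking") is not None:
        # Qwen3 thinking toggle passed as a chat-template kwarg (per the --think/--no-think help).
        extra_body["chat_template_kwargs"] = {"enable_thinking": model_params["enable_thinking"]}
    return await client.chat.completions.create(
        model=model_name,
        messages=messages,
        tools=tools,
        tool_choice=model_params["tool_choice"],
        temperature=model_params["temperature"],
        max_tokens=model_params["max_tokens"],
        top_p=model_params["top_p"],
        frequency_penalty=model_params["frequency_penalty"],
        presence_penalty=model_params["presence_penalty"],
        extra_body=extra_body or None,
    )
```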
+
1532
+ # Configure logging based on quiet mode
1533
+ setup_logging(quiet_mode=config.quiet)
1534
+
1535
+ # Display configuration (only if not in quiet mode)
1536
+ if not config.quiet:
1537
+ print("🎮 Crafter ReAct Agent Evaluation (LM with Synth Backend - v3)")
1538
+ print(f"Model: {config.model_name}")
1539
+ print("Model Parameters:")
1540
+ print(f" Temperature: {model_params['temperature']}")
1541
+ print(f" Max Tokens: {model_params['max_tokens']}")
1542
+ print(f" Top-p: {model_params['top_p']}")
1543
+ print(f" Frequency Penalty: {model_params['frequency_penalty']}")
1544
+ print(f" Presence Penalty: {model_params['presence_penalty']}")
1545
+ print(f"Service: {config.service_base_url}")
1546
+ print(f"Instances: {config.num_instances}")
1547
+ print(f"Max Turns: {config.max_turns}")
1548
+ print(f"Difficulty: {config.difficulty}")
1549
+ print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
1550
+ print("=" * 50)
1551
+
1552
+ if args.no_traces:
1553
+ config.save_traces = False
1554
+ config.enable_v3_tracing = False
1555
+ if args.analyze:
1556
+ config.analyze_traces = True
1557
+ if args.skip_warmup:
1558
+ config.warmup_model = False
1559
+
1560
+ # Ensure model is specified
1561
+ if not config.model_name:
1562
+ parser.error("Model name must be specified via --model or config file")
1563
+
1564
+ # Test service health
1565
+ async with AsyncClient(base_url=config.service_base_url) as client:
1566
+ try:
1567
+ health_resp = await retry_http_request(client, "GET", "/health")
1568
+ health_data = health_resp.json()
1569
+ print(f"✅ Crafter service is healthy: {health_data}")
1570
+ except Exception as e:
1571
+ print(f"❌ Failed to connect to Crafter service: {e}")
1572
+ return
1573
+
1574
+ # Warm up the model if requested
1575
+ if config.warmup_model and not args.skip_warmup:
1576
+ print(f"\n🔥 Warming up {config.model_name} on Synth backend...")
1577
+ try:
1578
+ synth_base_url = os.getenv("SYNTH_BASE_URL") # or os.getenv('MODAL_BASE_URL')
1579
+ synth_api_key = os.getenv("SYNTH_API_KEY") # or os.getenv('MODAL_API_KEY')
1580
+ if synth_base_url and synth_api_key:
1581
+ synth_config = SynthConfig(
1582
+ base_url=synth_base_url,
1583
+ api_key=synth_api_key,
1584
+ timeout=config.warmup_timeout, # Use configurable timeout
1585
+ )
1586
+ warmed = await warmup_synth_model(config.model_name, synth_config)
1587
+ if warmed:
1588
+ print("✅ Model warmed up successfully!")
1589
+ else:
1590
+ print("⚠️ Warmup did not complete; continuing anyway...")
1591
+ else:
1592
+ print("⚠️ Missing SYNTH_BASE_URL or SYNTH_API_KEY, skipping warmup")
1593
+ except Exception as e:
1594
+ print(f"⚠️ Warmup failed: {e}")
1595
+ print("Continuing anyway...")
1596
+
1597
+ # Set up v3 tracing if enabled
1598
+ trace_manager = None
1599
+ experiment_ctx = None
1600
+ sqld_daemon = None
1601
+
1602
+ if config.enable_v3_tracing:
1603
+ # Create trace directory first
1604
+ os.makedirs(config.v3_trace_dir, exist_ok=True)
1605
+
1606
+ # Start sqld daemon if requested
1607
+ if config.start_sqld_daemon:
1608
+ print("\n🚀 Starting sqld daemon for v3 tracing...")
1609
+ sqld_daemon = SqldDaemon(db_path=config.turso_db_path)
1610
+ sqld_daemon.__enter__() # Start the daemon
1611
+ await asyncio.sleep(2) # Give it time to start
1612
+ print("✅ sqld daemon started")
1613
+
1614
+ # Initialize trace manager with proper URL format
1615
+ # If SQLD_DB_PATH is a directory managed by sqld, use its data file
1616
+ _db_path = config.turso_db_path
1617
+ if os.path.isdir(_db_path):
1618
+ _candidate = os.path.join(_db_path, "dbs", "default", "data")
1619
+ if os.path.exists(_candidate):
1620
+ _db_path = _candidate
1621
+ db_url = f"sqlite+aiosqlite:///{os.path.abspath(_db_path)}"
1622
+ trace_manager = AsyncSQLTraceManager(db_url=db_url)
1623
+ await trace_manager.initialize()
1624
+
1625
+ # Create experiment context
1626
+ experiment_ctx = await create_experiment_context(
1627
+ db_manager=trace_manager,
1628
+ experiment_name=f"crafter_lm_synth_{config.model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
1629
+ description=f"Crafter LM Synth experiment with {config.model_name} on {config.difficulty} difficulty, using LM class with v3 tracing",
1630
+ )
1631
+
1632
+ print(f"\n📊 V3 Tracing enabled. Traces will be saved to: {config.turso_db_path}")
1633
+ print(f" Experiment: {experiment_ctx['experiment_name']}")
1634
+
1635
+ # Run episodes with bounded concurrency using asyncio.Semaphore
1636
+ # Control concurrency with env var CRAFTER_CONCURRENCY (default 5)
1637
+ try:
1638
+ _conc_str = os.getenv("CRAFTER_CONCURRENCY")
1639
+ max_concurrency = int(_conc_str) if _conc_str else 5
1640
+ except Exception:
1641
+ max_concurrency = 5
1642
+ concurrency_limiter = asyncio.Semaphore(max_concurrency)
1643
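The semaphore caps how many episodes are in flight while `asyncio.gather` still schedules all of them. The same pattern in isolation:

```python
# Minimal, self-contained illustration of bounded concurrency (not package code).
import asyncio

async def bounded_gather(coros, limit: int = 5):
    sem = asyncio.Semaphore(limit)

    async def run_one(coro):
        async with sem:  # at most `limit` coroutines execute this block concurrently
            return await coro

    return await asyncio.gather(*(run_one(c) for c in coros))
```

For example, `await bounded_gather([make_episode_coro(i) for i in range(20)], limit=5)` (with a hypothetical coroutine factory) would keep at most five episodes active at once.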
+
1644
+ print(f"\n🚀 Running {config.num_instances} episodes (concurrency={max_concurrency})...")
1645
+
1646
+ episode_seeds = [] # Track seeds used for each episode
1647
+
1648
+ # Prepare episode tasks
1649
+ episode_tasks = []
1650
+ session_ids = []
1651
+
1652
+ for i in range(config.num_instances):
1653
+ # Calculate episode seed for logging (simple sequential: 1, 2, 3, etc)
1654
+ episode_seed = i + 1
1655
+ episode_seeds.append(episode_seed)
1656
+
1657
+ # Create session tracer for this episode if v3 tracing is enabled
1658
+ session_tracer = None
1659
+ if config.enable_v3_tracing and trace_manager:
1660
+ session_tracer = SessionTracer(hooks=QUIET_HOOKS) # Use quiet hooks
1661
+ session_tracer.db = trace_manager # Use existing manager
1662
+ session_tracer._initialized = True
1663
+
1664
+ # Generate session ID
1665
+ session_id = f"crafter_episode_{i}_{uuid.uuid4().hex[:8]}"
1666
+ session_ids.append(session_id)
1667
+
1668
+ # Create episode task with proper session context
1669
+ async def run_episode_with_session(ep_id, cfg, tracer, pb, quiet, sess_id, model_params):
1670
+ if tracer:
1671
+ async with tracer.session(
1672
+ session_id=sess_id,
1673
+ metadata={
1674
+ "episode_id": ep_id,
1675
+ "experiment_id": experiment_ctx["experiment_id"]
1676
+ if experiment_ctx
1677
+ else None,
1678
+ },
1679
+ ):
1680
+ return await run_episode(ep_id, cfg, tracer, pb, quiet, model_params)
1681
+ else:
1682
+ return await run_episode(ep_id, cfg, tracer, pb, quiet, model_params)
1683
+
1684
+ # Freeze per-iteration values to avoid late-binding bugs in closures
1685
+ this_tracer = session_tracer
1686
+ this_session_id = session_ids[i] if session_ids else None
1687
+
1688
+ async def _limited_episode(ep_idx=i, tracer=this_tracer, sess_id=this_session_id):
1689
+ async with concurrency_limiter:
1690
+ return await run_episode_with_session(
1691
+ ep_idx, config, tracer, None, args.quiet, sess_id, model_params
1692
+ )
1693
+
1694
+ episode_task = _limited_episode()
1695
+ episode_tasks.append(episode_task)
1696
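Freezing the loop variables as default arguments matters because Python closures capture names, not values; without the defaults every episode task would see the last iteration's index, tracer, and session id. A tiny illustration of the pitfall and the fix:

```python
# Late-binding pitfall: every closure sees the final value of i.
fns = [lambda: i for i in range(3)]
print([f() for f in fns])      # [2, 2, 2]

# Fix: bind the current value as a default argument, as the loop above does.
fns = [lambda i=i: i for i in range(3)]
print([f() for f in fns])      # [0, 1, 2]
```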
+
1697
+ print("\n📤 Starting episodes...")
1698
+ start_time = time.time()
1699
+
1700
+ # Run all episodes in parallel and fail fast on first error
1701
+ try:
1702
+ results = await asyncio.gather(*episode_tasks, return_exceptions=False)
1703
+ except Exception as e:
1704
+ print(f"\n❌ Run aborted due to error: {e}")
1705
+ # Ensure resources are cleaned up before exiting
1706
+ if trace_manager:
1707
+ await trace_manager.close()
1708
+ if sqld_daemon:
1709
+ sqld_daemon.__exit__(None, None, None)
1710
+ print("\n✅ Stopped sqld daemon")
1711
+ raise
1712
+
1713
+ end_time = time.time()
1714
+ parallel_time = end_time - start_time
1715
+
1716
+ print(f"\n✅ Completed {len(episode_tasks)} episodes in {parallel_time:.2f} seconds")
1717
+
1718
+ # Process results and handle any exceptions
1719
+ successful_results = []
1720
+ failed_results = []
1721
+
1722
+ for i, result in enumerate(results):
1723
+ if isinstance(result, Exception):
1724
+ print(f"❌ Episode {i} failed: {result}")
1725
+ failed_results.append({"episode_id": i, "error": str(result)})
1726
+ else:
1727
+ successful_results.append(result)
1728
+
1729
+ # Link session to experiment if tracing enabled
1730
+ if (
1731
+ config.enable_v3_tracing
1732
+ and trace_manager
1733
+ and experiment_ctx
1734
+ and i < len(session_ids)
1735
+ ):
1736
+ await trace_manager.link_session_to_experiment(
1737
+ session_ids[i], experiment_ctx["experiment_id"]
1738
+ )
1739
+
1740
+ # Use successful results for analysis
1741
+ results = successful_results + failed_results
1742
+
1743
+ # Analyze results
1744
+ print("\n" + "=" * 50)
1745
+ print("📊 EVALUATION RESULTS")
1746
+ print("=" * 50)
1747
+
1748
+ successful_episodes = [r for r in results if "error" not in r]
1749
+ failed_episodes = [r for r in results if "error" in r]
1750
+
1751
+ if successful_episodes:
1752
+ total_reward = sum(r["total_reward"] for r in successful_episodes)
1753
+ total_steps = sum(r["steps"] for r in successful_episodes)
1754
+ avg_reward = total_reward / len(successful_episodes)
1755
+ avg_steps = total_steps / len(successful_episodes)
1756
+
1757
+ print(f"Episodes completed: {len(successful_episodes)}/{config.num_instances}")
1758
+ print(f"Failed episodes: {len(failed_episodes)}")
1759
+ print(f"Total reward: {total_reward:.2f}")
1760
+ print(f"Average reward per episode: {avg_reward:.2f}")
1761
+ print(f"Total steps: {total_steps}")
1762
+ print(f"Average steps per episode: {avg_steps:.2f}")
1763
+
1764
+ # Show seeds used
1765
+ if episode_seeds:
1766
+ print("\nSeeds used:")
1767
+ for i, seed in enumerate(episode_seeds[: len(successful_episodes)]):
1768
+ print(f" Episode {i}: seed {seed}")
1769
+
1770
+ # Extract unique achievements
1771
+ all_achievements = set()
1772
+ achievement_counts = defaultdict(int)
1773
+
1774
+ for result in successful_episodes:
1775
+ # Use the achievements_unlocked field we added
1776
+ if "achievements_unlocked" in result:
1777
+ for achievement in result["achievements_unlocked"]:
1778
+ all_achievements.add(achievement)
1779
+ achievement_counts[achievement] += 1
1780
+
1781
+ # Extract and count all actions from successful episodes
1782
+ action_counts = defaultdict(int)
1783
+ total_actions = 0
1784
+
1785
+ for result in successful_episodes:
1786
+ if "step_results" in result:
1787
+ for step in result["step_results"]:
1788
+ if "action" in step:
1789
+ action_counts[step["action"]] += 1
1790
+ total_actions += 1
1791
+
1792
+ print(f"Unique achievements unlocked: {len(all_achievements)}")
1793
+ if all_achievements:
1794
+ print("\nAchievements unlocked:")
1795
+ for achievement, count in sorted(achievement_counts.items()):
1796
+ print(
1797
+ f" - {achievement}: {count} episodes ({count / len(successful_episodes) * 100:.1f}%)"
1798
+ )
1799
+
1800
+ # Display action counts
1801
+ if action_counts:
1802
+ print(f"\nAction counts (total: {total_actions}):")
1803
+ for action, count in sorted(action_counts.items(), key=lambda x: x[1], reverse=True):
1804
+ percentage = count / total_actions * 100 if total_actions > 0 else 0
1805
+ print(f" - {action}: {count} ({percentage:.1f}%)")
1806
+ else:
1807
+ print("No successful episodes completed.")
1808
+
1809
+ # Save detailed results
1810
+ if config.save_detailed_results and config.enable_v3_tracing and trace_manager:
1811
+ # For v3, results are automatically saved in the database
1812
+ print(f"\n💾 Results available in Turso database: {config.turso_db_path}")
1813
+ print(f" Experiment ID: {experiment_ctx['experiment_id']}")
1814
+ print(" Use the filter_traces_sft_turso.py script to extract fine-tuning data")
1815
+ elif config.save_detailed_results:
1816
+ # Fallback to JSON if no tracing - write under temp/ (git-ignored)
1817
+ from pathlib import Path
1818
+
1819
+ out_dir = Path(os.getenv("SYNTH_OUTPUT_DIR", "temp")).resolve()
1820
+ out_dir.mkdir(parents=True, exist_ok=True)
1821
+ results_path = (
1822
+ out_dir / f"crafter_lm_synth_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
1823
+ )
1824
+ with open(results_path, "w") as f:
1825
+ json.dump(
1826
+ {
1827
+ "config": {
1828
+ "model": config.model_name,
1829
+ "episodes": config.num_instances,
1830
+ "max_steps": config.max_turns,
1831
+ "difficulty": config.difficulty,
1832
+ "backend": "synth",
1833
+ "tracing": "v3",
1834
+ },
1835
+ "results": results,
1836
+ "summary": {
1837
+ "successful_episodes": len(successful_episodes),
1838
+ "failed_episodes": len(failed_episodes),
1839
+ "total_reward": total_reward if successful_episodes else 0,
1840
+ "avg_reward": avg_reward if successful_episodes else 0,
1841
+ "unique_achievements": list(all_achievements)
1842
+ if successful_episodes
1843
+ else [],
1844
+ },
1845
+ },
1846
+ f,
1847
+ indent=2,
1848
+ )
1849
+ print(f"\n💾 Detailed results saved to: {results_path}")
1850
+
1851
+ # Print a markdown row compatible with Environments/crafter.md tables
1852
+ if successful_episodes:
1853
+ # Columns: | model | trajectories | avg achievements | shaped reward (avg) | K-score (avg) | steps sum | avg steps |
1854
+ model_label = config.model_name
1855
+ trajectories = len(successful_episodes)
1856
+ avg_ach = avg_reward # our reward == achievements unlocked per episode
1857
+
1858
+ # Compute weighted scores (shaped and K-Score) from final achievements across episodes
1859
+ # K coefficients taken from crafter.md (representative weights)
1860
+ k_weights = {
1861
+ "collect_drink": 0.1,
1862
+ "collect_sapling": 0.1,
1863
+ "wake_up": 0.1,
1864
+ "collect_wood": 1.0,
1865
+ "collect_stone": 1.0,
1866
+ "eat_cow": 1.0,
1867
+ "defeat_zombie": 1.0,
1868
+ "defeat_skeleton": 1.0,
1869
+ "make_wood_pickaxe": 3.0,
1870
+ "place_table": 3.0,
1871
+ "collect_coal": 3.0,
1872
+ "make_stone_pickaxe": 10.0,
1873
+ "place_furnace": 10.0,
1874
+ "collect_iron": 10.0,
1875
+ "make_stone_sword": 10.0,
1876
+ "make_wood_sword": 3.0,
1877
+ "place_plant": 0.1,
1878
+ }
1879
+
1880
+ # Aggregate final achievements across successful episodes
1881
+ from collections import Counter
1882
+
1883
+ ach_counter: Counter[str] = Counter()
1884
+ for ep in successful_episodes:
1885
+ for name in ep.get("achievements_unlocked", []):
1886
+ ach_counter[name] += 1
1887
+
1888
+ shaped_total = 0.0
1889
+ for name, count in ach_counter.items():
1890
+ k = k_weights.get(name, 1.0)
1891
+ shaped_total += k * count
1892
+
1893
+ # Shaped reward per episode average
1894
+ shaped_reward_avg = shaped_total / trajectories if trajectories > 0 else 0.0
1895
+ k_score_avg = shaped_reward_avg / 20.0 # normalize roughly to match table scale
1896
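As a worked example of the weighting (counts are made up, not measured results): if two successful episodes together unlock `collect_wood` twice (k=1.0), `place_table` once (k=3.0), and `make_stone_pickaxe` once (k=10.0), the shaped total is 2*1.0 + 3.0 + 10.0 = 15.0, the per-episode shaped average is 7.5, and the K-score is 7.5 / 20 = 0.375.

```python
# Worked example with hypothetical counts (not actual results).
k_weights = {"collect_wood": 1.0, "place_table": 3.0, "make_stone_pickaxe": 10.0}
ach_counter = {"collect_wood": 2, "place_table": 1, "make_stone_pickaxe": 1}
trajectories = 2
shaped_total = sum(k_weights.get(n, 1.0) * c for n, c in ach_counter.items())  # 15.0
shaped_reward_avg = shaped_total / trajectories                                # 7.5
k_score_avg = shaped_reward_avg / 20.0                                         # 0.375
```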
+
1897
+ # unique = len(all_achievements) # unused
1898
+ steps_sum = total_steps
1899
+ avg_steps_md = avg_steps
1900
+ print("\nMarkdown row:")
1901
+ print(
1902
+ f"| {model_label:<15} | {trajectories:7d} | {avg_ach:8.2f} | {shaped_reward_avg:13.3f} | {k_score_avg:12.3f} | {steps_sum:12.3f} | {avg_steps_md:8.3f} |"
1903
+ )
1904
+
1905
+ # Cleanup
1906
+ if trace_manager:
1907
+ await trace_manager.close()
1908
+
1909
+ if sqld_daemon:
1910
+ sqld_daemon.__exit__(None, None, None)
1911
+ print("\n✅ Stopped sqld daemon")
1912
+
1913
+
1914
+ if __name__ == "__main__":
1915
+ asyncio.run(main())
1916
+
1917
+
1918
+ # === SEMANTIC MAP VIEW (15x15) ===
1919
+ # stone coal iron coal coal coal coal
1920
+ # stone stone iron coal coal coal coal
1921
+ # stone stone zombie coal coal iron iron
1922
+ # stone stone stone you stone iron iron
1923
+ # stone stone stone stone stone stone stone
1924
+ # stone stone stone stone stone stone stone
1925
+ # stone stone stone stone stone stone stone
1926
+ # Visible items: coal, iron, stone, zombie
1927
+
1928
+ # === STATUS ===
1929
+ # Health: 10/10 | Food: 10/10 | Drink: 10/10 | Energy: 10/10
1930
+ # Inventory: health: 9, food: 7, drink: 7, energy: 9, wood: 1, wood_pickaxe: 1
1931
+ # Achievements: 4/22 unlocked
1932
+ # Unlocked: collect_wood, make_wood_pickaxe, place_table, wake_up