synth-ai 0.2.9.dev3__py3-none-any.whl → 0.2.9.dev5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (107) hide show
  1. examples/analyze_semantic_words.sh +17 -0
  2. examples/common_old/backend.py +21 -0
  3. examples/crafter_debug_render.py +180 -0
  4. examples/evals_old/README.md +98 -0
  5. examples/evals_old/__init__.py +6 -0
  6. examples/evals_old/compare_models.py +1037 -0
  7. examples/evals_old/example_log.md +145 -0
  8. examples/evals_old/run_demo.sh +126 -0
  9. examples/evals_old/trace_analysis.py +270 -0
  10. examples/finetuning_old/_backup_synth_qwen/config.toml +29 -0
  11. examples/finetuning_old/_backup_synth_qwen/example_log.md +324 -0
  12. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +60 -0
  13. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +239 -0
  14. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +109 -0
  15. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +1924 -0
  16. examples/finetuning_old/_backup_synth_qwen/readme.md +49 -0
  17. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +114 -0
  18. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +195 -0
  19. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +118 -0
  20. examples/finetuning_old/synth_qwen_v1/README.md +68 -0
  21. examples/finetuning_old/synth_qwen_v1/filter_traces.py +60 -0
  22. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +239 -0
  23. examples/finetuning_old/synth_qwen_v1/finetune.py +46 -0
  24. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +71 -0
  25. examples/finetuning_old/synth_qwen_v1/infer.py +37 -0
  26. examples/finetuning_old/synth_qwen_v1/poll.py +44 -0
  27. examples/finetuning_old/synth_qwen_v1/prepare_data.py +35 -0
  28. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +109 -0
  29. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +1932 -0
  30. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +207 -0
  31. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +232 -0
  32. examples/finetuning_old/synth_qwen_v1/upload_data.py +34 -0
  33. examples/finetuning_old/synth_qwen_v1/util.py +147 -0
  34. examples/rl/README.md +169 -0
  35. examples/rl/configs/eval_base_qwen.toml +15 -0
  36. examples/rl/configs/eval_rl_qwen.toml +11 -0
  37. examples/rl/configs/rl_from_base_qwen.toml +35 -0
  38. examples/rl/configs/rl_from_base_qwen17.toml +74 -0
  39. examples/rl/configs/rl_from_ft_qwen.toml +35 -0
  40. examples/rl/download_dataset.py +64 -0
  41. examples/rl/run_eval.py +435 -0
  42. examples/rl/run_rl_and_save.py +94 -0
  43. examples/rl/task_app/README.md +22 -0
  44. {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +8 -8
  45. examples/rl/task_app/math_task_app.py +107 -0
  46. examples/rl_old/task_app.py +962 -0
  47. examples/run_crafter_demo.sh +10 -0
  48. examples/warming_up_to_rl/analyze_trace_db.py +420 -0
  49. examples/warming_up_to_rl/configs/crafter_fft.toml +48 -0
  50. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
  51. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +20 -0
  52. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +13 -0
  53. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +23 -0
  54. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +73 -0
  55. examples/warming_up_to_rl/configs/rl_from_ft.toml +56 -0
  56. examples/warming_up_to_rl/export_trace_sft.py +541 -0
  57. examples/warming_up_to_rl/groq_test.py +88 -0
  58. examples/warming_up_to_rl/manage_secrets.py +127 -0
  59. examples/warming_up_to_rl/old/event_rewards.md +234 -0
  60. examples/warming_up_to_rl/old/notes.md +73 -0
  61. examples/warming_up_to_rl/readme.md +172 -0
  62. examples/warming_up_to_rl/run_eval.py +434 -0
  63. examples/warming_up_to_rl/run_fft_and_save.py +309 -0
  64. examples/warming_up_to_rl/run_local_rollout.py +188 -0
  65. examples/warming_up_to_rl/run_local_rollout_modal.py +160 -0
  66. examples/warming_up_to_rl/run_local_rollout_parallel.py +342 -0
  67. examples/warming_up_to_rl/run_local_rollout_traced.py +372 -0
  68. examples/warming_up_to_rl/run_rl_and_save.py +101 -0
  69. examples/warming_up_to_rl/run_rollout_remote.py +129 -0
  70. examples/warming_up_to_rl/task_app/README.md +38 -0
  71. {synth_ai/task/apps → examples/warming_up_to_rl/task_app}/grpo_crafter.py +7 -7
  72. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +165 -0
  73. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  74. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  75. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +145 -0
  76. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1271 -0
  77. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  78. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +429 -0
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +442 -0
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +96 -0
  83. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +302 -0
  84. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  85. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +202 -0
  86. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  87. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +512 -0
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +102 -0
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +985 -0
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +197 -0
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1749 -0
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +217 -0
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +160 -0
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +146 -0
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +61 -0
  97. synth_ai/api/train/config_finder.py +18 -18
  98. synth_ai/api/train/env_resolver.py +28 -1
  99. synth_ai/cli/task_apps.py +291 -56
  100. synth_ai/task/apps/__init__.py +54 -13
  101. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/METADATA +1 -1
  102. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/RECORD +106 -13
  103. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/top_level.txt +1 -0
  104. synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
  105. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/WHEEL +0 -0
  106. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/entry_points.txt +0 -0
  107. {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,442 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+ from abc import ABC, abstractmethod
5
+ from .react_agent import CrafterReActAgent
6
+ from .tools import TOOLS_SCHEMA
7
+
8
+ # Define Policy base class here to avoid circular import
9
class Policy(ABC):
    """Base class for environment-specific policies.

    Subclasses must implement both abstract methods below before they can be
    instantiated.
    """

    @abstractmethod
    def prepare_inference_request(
        self,
        observation: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None,
    ) -> Tuple[List[Dict[str, Any]], Optional[List[Dict[str, Any]]]]:
        """Prepare an inference request.

        Args:
            observation: Environment observation to present to the model.
            history: Optional prior chat messages; ``None`` means no history.

        Returns:
            A ``(messages, tools)`` pair where ``tools`` may be ``None``.
        """

    @abstractmethod
    def parse_model_response(
        self, response: str, observation: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """Parse model response into tool calls.

        Args:
            response: The model output to interpret.
            observation: The observation the response was generated for.

        Returns:
            A list of tool-call dictionaries.
        """
25
+
26
+ # (imports moved to top of file to satisfy linter)
27
+
28
+
29
class CrafterPolicy(Policy):
    """Thin policy scaffold for Crafter using the ReAct agent prompts.

    This class does not run inference itself. It prepares an inference request
    (messages and optional tools schema) that the Task App can send to the
    inference service, and provides helpers to parse the model response into
    environment tool calls.
    """

    name: str = "crafter-react"

    # Fallback thinking budget applied when none is configured and the
    # inference URL does not contain "openai.com".
    # NOTE(review): 1028 may be a typo for 1024 -- confirm before changing.
    _DEFAULT_THINKING_BUDGET: int = 1028

    # Optional settings carried through serialize()/deserialize() in addition
    # to inference_url, model and use_tools.
    _OPTIONAL_CONFIG_FIELDS: Tuple[str, ...] = (
        "temperature",
        "top_p",
        "max_tokens",
        "thinking_mode",
        "thinking_budget",
    )

    def __init__(self, inference_url: str, model: Optional[str] = None) -> None:
        """Create a policy with tools enabled and empty history.

        Args:
            inference_url: URL of the inference service; echoed back in the
                metadata returned by ``step``.
            model: Optional model identifier added to request payloads.
        """
        self.inference_url = inference_url
        self.model = model
        self.use_tools = True
        # Sampling parameters (populated via initialize(config))
        self.temperature: Optional[float] = None
        self.top_p: Optional[float] = None
        self.max_tokens: Optional[int] = None
        # Thinking controls (populated via initialize(config))
        self.thinking_mode: Optional[str] = None
        self.thinking_budget: Optional[int] = None
        # Rolling conversation and action history for non-Markov policies
        self.history_messages: List[Dict[str, str]] = []  # chat-style without system
        self.turn_index: int = 0
        self.trajectory_history: List[Dict[str, Any]] = []  # env/policy step records

    def _targets_openai(self) -> bool:
        """Return True when the inference URL contains "openai.com".

        ``str()`` keeps the check from raising on a missing or non-string URL.
        """
        return "openai.com" in str(self.inference_url or "").lower()

    async def initialize(self, config: Dict[str, Any]) -> None:
        """Apply ``config`` overrides and reset conversation state.

        Only keys present in ``config`` are applied. Numeric settings are
        coerced with ``float``/``int`` so bad types fail fast.

        Args:
            config: Mapping that may contain ``inference_url``, ``model``,
                ``use_tools``, ``temperature``, ``top_p``, ``max_tokens``,
                ``thinking_mode`` and ``thinking_budget``.
        """
        if "inference_url" in config:
            self.inference_url = config["inference_url"]
        if "model" in config:
            self.model = config["model"]
        if "use_tools" in config:
            self.use_tools = bool(config["use_tools"])
        # Adopt sampling params from policy config (trainer passes these through)
        if "temperature" in config:
            self.temperature = float(config["temperature"])
        if "top_p" in config:
            self.top_p = float(config["top_p"])
        if "max_tokens" in config:
            self.max_tokens = int(config["max_tokens"])
        # Thinking mode/budget forwarded into the inference request
        if "thinking_mode" in config:
            self.thinking_mode = str(config["thinking_mode"])  # expect "think" or "no_think"
        if config.get("thinking_budget") is not None:
            self.thinking_budget = int(config["thinking_budget"])
        if self.thinking_budget is None and not self._targets_openai():
            self.thinking_budget = self._DEFAULT_THINKING_BUDGET
        # Reset state on (re)initialize
        self.history_messages = []
        self.turn_index = 0
        self.trajectory_history = []

    def _append_user_observation(self, observation_text: str) -> None:
        """Record an observation as a user message and advance the turn counter."""
        self.history_messages.append({"role": "user", "content": observation_text})
        self.turn_index += 1

    def _append_assistant_turn(
        self,
        assistant_text: Optional[str],
        tool_calls: Optional[List[Dict[str, Any]]],
        env_result: Optional[Dict[str, Any]],
    ) -> None:
        """Record the outcome of the previous turn.

        Assistant text (when given) is appended to ``history_messages``; a
        structured record keyed by the current ``turn_index`` is always
        appended to ``trajectory_history``.
        """
        if assistant_text is not None:
            self.history_messages.append({"role": "assistant", "content": assistant_text})
        # Keep structured step record for training/analysis
        record: Dict[str, Any] = {"turn": self.turn_index}
        if tool_calls is not None:
            record["tool_calls"] = tool_calls
        if env_result is not None:
            record["env_result"] = env_result
        self.trajectory_history.append(record)

    def build_inference_request(
        self,
        observation_text: str,
        history: Optional[List[Dict[str, str]]] = None,
        turn: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Assemble the request payload for the inference service.

        Side effect: when the URL does not contain "openai.com" and no
        thinking mode/budget is set, this method stores the defaults
        (``"think"`` and the default budget) on the instance.

        Args:
            observation_text: Text placed in the final user message.
            history: Optional prior chat messages inserted before it.
            turn: Forwarded to ``CrafterReActAgent.build_messages``.

        Returns:
            A dict with ``messages`` plus whichever optional keys are set.
        """
        payload: Dict[str, Any] = {
            "messages": CrafterReActAgent.build_messages(
                observation=observation_text, history=history, turn=turn
            ),
        }
        if self.model is not None:
            payload["model"] = self.model

        # Thinking controls
        non_openai = not self._targets_openai()
        if self.thinking_mode is None and non_openai:
            self.thinking_mode = "think"
        if self.thinking_mode is not None:
            payload["thinking_mode"] = self.thinking_mode
        if self.thinking_budget is None and non_openai:
            self.thinking_budget = self._DEFAULT_THINKING_BUDGET
        if self.thinking_budget is not None:
            payload["thinking_budget"] = self.thinking_budget

        # Inject sampling parameters if set via initialize(config)
        if self.max_tokens is not None:
            # Use max_completion_tokens for newer models, max_tokens for older ones
            token_key = (
                "max_completion_tokens"
                if self.model and "gpt-5" in self.model
                else "max_tokens"
            )
            payload[token_key] = self.max_tokens
        if self.temperature is not None:
            payload["temperature"] = self.temperature
        if self.top_p is not None:
            payload["top_p"] = self.top_p
        if self.use_tools:
            payload["tools"] = TOOLS_SCHEMA
            payload["tool_choice"] = "required"
            # Ensure the inference server injects family-specific stop sequences
            # to terminate immediately after the first tool call for compliance.
            payload["stop_after_tool_calls"] = 1
        return payload

    @staticmethod
    def parse_response_to_tool_calls(
        response: Dict[str, Any],
        use_tools: bool = True,
    ) -> List[Dict[str, Any]]:
        """Turn an inference response into environment tool calls.

        - If the response carries tool calls, they are returned in the simple
          format ``{"tool_name": str, "arguments": ...}``. ``interact_many``
          calls are normalized so ``arguments`` is ``{"actions": [...]}``.
        - Otherwise the first non-empty message content is parsed as plain
          text and any actions are wrapped in one ``interact_many`` call.

        Malformed entries (``None`` messages, non-dict tool calls, calls with
        no name) are skipped rather than raising.

        Args:
            response: Chat-completions style response dict.
            use_tools: Currently unused; kept for interface compatibility.

        Returns:
            The list of tool calls; empty when nothing could be parsed.
        """
        import json

        # Collect message dicts, tolerating None/non-dict choices and messages.
        messages: List[Dict[str, Any]] = []
        for choice in response.get("choices") or []:
            if not isinstance(choice, dict):
                continue
            message = choice.get("message")
            messages.append(message if isinstance(message, dict) else {})

        tool_calls: List[Dict[str, Any]] = []
        for message in messages:
            for tc in message.get("tool_calls") or []:
                if not isinstance(tc, dict):
                    continue
                # Standard OpenAI format nests name/arguments under "function";
                # the simplified format carries them at the top level.
                source = tc["function"] if "function" in tc else tc
                if not isinstance(source, dict) or "name" not in source:
                    continue
                tool_calls.append(
                    {
                        "tool_name": source["name"],
                        "arguments": source.get("arguments", {}),
                    }
                )

        if tool_calls:
            normalized: List[Dict[str, Any]] = []
            for call in tool_calls:
                if call.get("tool_name") != "interact_many":
                    normalized.append(call)
                    continue
                args = call.get("arguments")
                if isinstance(args, str):
                    try:
                        args = json.loads(args)
                    except ValueError:  # includes json.JSONDecodeError
                        args = {}
                actions: List[Any] = []
                if isinstance(args, dict) and isinstance(args.get("actions"), list):
                    actions = args["actions"]
                # Normalize the degenerate pair ["move_right", "do"] by
                # dropping the trailing 'do'.
                if len(actions) == 2 and actions[0] == "move_right" and actions[1] == "do":
                    actions = ["move_right"]
                normalized.append(
                    {"tool_name": "interact_many", "arguments": {"actions": actions or []}}
                )
            return normalized

        # Otherwise, parse plain text content for actions
        text = next((m.get("content") for m in messages if m.get("content")), "")
        if text:
            from .shared import parse_actions

            actions = parse_actions(text)
            if actions:
                return [{"tool_name": "interact_many", "arguments": {"actions": actions}}]

        # No actions found
        return []

    @staticmethod
    def _format_tool_call_line(tool_name: str, arguments: Any, max_chars: int = 500) -> str:
        """Render one tool call as ``"- name: args"`` with args clipped to ``max_chars``."""
        import json

        if isinstance(arguments, (dict, list)):
            try:
                rendered = json.dumps(arguments, ensure_ascii=False, separators=(",", ":"))
            except (TypeError, ValueError, RecursionError):
                # Context rendering is best-effort; fall back to str().
                rendered = str(arguments)
        else:
            rendered = str(arguments)
        return f"- {tool_name}: {rendered[:max_chars]}"

    def _recent_tool_call_lines(
        self, metadata: Optional[Dict[str, Any]], limit: int = 3
    ) -> List[str]:
        """Return up to ``limit`` formatted tool-call lines, most recent first."""
        lines: List[str] = []
        # Prefer pulling from trajectory_history (accumulates over turns)
        for record in reversed(self.trajectory_history):
            if len(lines) >= limit:
                break
            tc_list = record.get("tool_calls")
            # Use the first tool call for that turn if multiple exist
            first = tc_list[0] if isinstance(tc_list, list) and tc_list else None
            if not isinstance(first, dict):
                continue
            call_name = first.get("tool_name") or first.get("name") or "unknown"
            lines.append(self._format_tool_call_line(call_name, first.get("arguments")))

        # If the trajectory yielded nothing, fall back to the metadata calls
        if not lines and metadata is not None and metadata.get("prev_tool_calls"):
            for call in reversed(metadata["prev_tool_calls"]):
                if len(lines) >= limit:
                    break
                if not isinstance(call, dict):
                    continue
                call_name = call.get("tool_name") or call.get("name") or "unknown"
                lines.append(self._format_tool_call_line(call_name, call.get("arguments")))
        return lines

    async def step(
        self,
        observation_text: str,
        state: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """Stateful step: update policy history and prepare inference request.

        Inputs (via metadata, optional):
          - "prev_assistant_text": str — assistant text from prior turn
          - "prev_tool_calls": List[Dict] — tool calls executed last turn
          - "prev_env_result": Dict — env step result for prior tool calls

        Args:
            observation_text: Formatted observation for the current turn.
            state: Not read by this implementation.
            metadata: Optional results from the previous cycle (see above).

        Returns (tool_calls, meta):
          - tool_calls: empty list; coordinator should call inference and then use
            parse_response_to_tool_calls() to derive tool_calls
          - meta: { inference_url, inference_request, turn_index, history_len }
        """
        # If caller provided results from previous cycle, record them first
        if metadata is not None:
            prev_text = metadata.get("prev_assistant_text")
            prev_calls = metadata.get("prev_tool_calls")
            prev_result = metadata.get("prev_env_result")
            if not (prev_text is None and prev_calls is None and prev_result is None):
                self._append_assistant_turn(prev_text, prev_calls, prev_result)

        # Append current observation as the next user message (internal history only)
        self._append_user_observation(observation_text)

        # Combine observation with recent tool calls so the model always sees
        # surroundings/inventory alongside what it just did.
        lines = self._recent_tool_call_lines(metadata)
        context_text = "Previous tool calls (most recent first):\n" + (
            "\n".join(lines) if lines else "- none"
        )
        combined_text = f"{observation_text}\n\n{context_text}"

        payload = self.build_inference_request(
            combined_text,
            history=[],  # no prior user/assistant history
            turn=self.turn_index,
        )
        meta_out = {
            "inference_url": self.inference_url,
            "inference_request": payload,
            "turn_index": self.turn_index,
            "history_len": len(self.history_messages),
        }
        return [], meta_out

    def state_dict(self) -> Dict[str, Any]:
        """Return the mutable state; the lists are shared, not copied."""
        return {
            "turn_index": self.turn_index,
            "history_messages": self.history_messages,
            "trajectory_history": self.trajectory_history,
        }

    def load_state_dict(self, state: Dict[str, Any]) -> None:
        """Restore state produced by ``state_dict``; all three keys are required."""
        self.turn_index = int(state["turn_index"])
        self.history_messages = state["history_messages"]
        self.trajectory_history = state["trajectory_history"]

    async def serialize(self) -> Dict[str, Any]:
        """Return a payload with ``name``, ``config`` and ``state``.

        ``config`` includes the sampling and thinking settings so that a
        ``deserialize`` round trip does not silently drop them.
        """
        config: Dict[str, Any] = {
            "inference_url": self.inference_url,
            "model": self.model,
            "use_tools": self.use_tools,
        }
        for field in self._OPTIONAL_CONFIG_FIELDS:
            config[field] = getattr(self, field)
        return {
            "name": self.name,
            "config": config,
            "state": self.state_dict(),
        }

    @classmethod
    async def deserialize(cls, payload: Dict[str, Any]) -> "CrafterPolicy":
        """Rebuild a policy from a ``serialize`` payload.

        Sampling/thinking settings are restored when present, so payloads
        written without them still load.

        Raises:
            KeyError: If ``config``, ``state``, ``inference_url`` or
                ``use_tools`` is missing.
        """
        config = payload["config"]
        state = payload["state"]
        policy = cls(
            inference_url=config["inference_url"],
            model=config.get("model"),
        )
        policy.use_tools = bool(config["use_tools"])
        for field in cls._OPTIONAL_CONFIG_FIELDS:
            if field in config:
                setattr(policy, field, config[field])
        policy.load_state_dict(state)
        return policy

    async def terminate(self) -> None:
        """No-op; this policy holds no resources to release."""
        return None

    def prepare_inference_request(
        self,
        observation: Dict[str, Any],
        history: Optional[List[Dict[str, Any]]] = None,
    ) -> Tuple[List[Dict[str, Any]], Optional[List[Dict[str, Any]]]]:
        """Prepare an inference request (implementing abstract method).

        Returns:
            ``(messages, tools)`` where ``tools`` is ``TOOLS_SCHEMA`` when
            ``use_tools`` is set and ``None`` otherwise.
        """
        # observation_text is already formatted; no raw matrices
        observation_text = self._format_observation_for_llm(observation)
        messages = CrafterReActAgent.build_messages(
            observation=observation_text,
            history=history,
            turn=self.turn_index,
        )
        tools = TOOLS_SCHEMA if self.use_tools else None
        return messages, tools

    def _format_observation_for_llm(self, observation: Dict[str, Any]) -> str:
        """Format an observation for the LLM using the shared formatter.

        The payload is read from ``observation["observation"]`` when present,
        otherwise ``observation`` itself is used. A non-dict payload is
        returned as a plain ``"Observation: ..."`` string.
        """
        from .shared import format_observation

        # Get the observation data (could be nested)
        obs_data = observation.get("observation", observation)
        if not isinstance(obs_data, dict):
            return f"Observation: {str(observation)}"

        step_idx = observation.get("step_idx", 0)
        max_steps = 100  # Default max steps, could be made configurable

        # Merge health from the wrapper's info into a copy for the formatter
        info = observation.get("info", {})
        if isinstance(info, dict) and "health" in info and "health" not in obs_data:
            obs_data = dict(obs_data)
            obs_data["health"] = info["health"]

        return format_observation(obs_data, step_count=step_idx, max_steps=max_steps)

    def parse_model_response(
        self, response: str, observation: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """Parse model response into tool calls (implementing abstract method).

        Note: Despite the type hint, the response may arrive as a dict rather
        than a string. Both cases are handled; anything else yields ``[]``.
        """
        # Dict response: delegate to the structured parser
        if isinstance(response, dict):
            return self.parse_response_to_tool_calls(response, self.use_tools)

        # String response (fallback case for raw text)
        if isinstance(response, str):
            actions = CrafterReActAgent.parse_actions_from_response(response)
            if actions:
                return [{"tool_name": "interact_many", "arguments": {"actions": actions}}]

        # Default empty response
        return []
440
+
441
+
442
+ __all__ = ["CrafterPolicy"]
@@ -0,0 +1,96 @@
1
+ """Crafter ReAct agent: system prompt and message assembly.
2
+
3
+ This agent encapsulates the Crafter-specific system prompt and helpers to
4
+ construct OpenAI-style message lists. Response parsing delegates to shared
5
+ utilities to keep a single parser.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Dict, List, Optional
11
+
12
+ from .shared import parse_actions
13
+
14
+
15
class CrafterReActAgent:
    """Prompt and message helpers for a ReAct-style Crafter agent."""

    # Prompt for plain interaction; returned verbatim by get_system_prompt().
    _PLAIN_PROMPT: str = (
        "You are playing Crafter, a survival game by Danijar Hafner. Your goal is to collect resources, "
        "craft tools, survive, and unlock achievements.\n\n"
        "Core rules:\n"
        "- The world contains trees (wood), stone, coal, iron, plants, cows, zombies, and water.\n"
        "- Movement constraints: you cannot walk onto blocking tiles: tree, stone, water, lava, coal, iron. Navigate around obstacles.\n"
        "- You start with empty hands and low health/hunger.\n"
        "- Interact ('do') only when adjacent to a resource (tree, stone, cow, zombie, etc.).\n"
        "- Movement is essential: you can and should move multiple steps in one turn to explore effectively.\n"
        "- Achievements are unlocked by collecting resources, crafting tools, placing objects, fighting, and surviving longer.\n\n"
        "Key strategies:\n"
        "1. Begin by moving around to find trees. Use 'do' to collect wood when adjacent.\n"
        "2. Craft a wood pickaxe as soon as you have enough wood ('make_wood_pickaxe').\n"
        "3. Use the pickaxe to gather stone, then craft a stone pickaxe. Progress to iron tools as you find iron.\n"
        "4. Build a table ('place_table') to unlock more crafting options (furnace, sword, etc.).\n"
        "5. Manage hunger by collecting and eating plants or interacting with cows.\n"
        "6. Fight zombies with a sword for achievements and resources.\n"
        "7. Survive by balancing exploration, combat, and resource gathering.\n\n"
        "8. Keep moving to discover new resources and stay alive. If you're in the middle of nowhere, take 5-8 consecutive move-related actions to explore and see what's outside your field of view. Don't delay exploration when it's the right move.\n\n"
        "Achievements to aim for:\n"
        "- Collecting resources (wood, stone, coal, iron, plants).\n"
        "- Crafting tools (wood/stone/iron pickaxe, wood/stone/iron sword).\n"
        "- Placing structures (table, furnace, plant).\n"
        "- Combat (killing a cow or zombie).\n"
        "- Survival milestones (staying alive over time).\n\n"
        "Action policy:\n"
        "- Always return a single tool call: interact_many({actions: [...]})\n"
        "- Use 2–5 actions per call; prefer long movement sequences to explore.\n"
        "- Mix in 'do' only when it makes sense (tree, stone, animal, enemy nearby).\n"
        "- Do not spam the same exact sequence twice in a row—explore in varied directions.\n\n"
        "Available actions: noop, move_up, move_down, move_left, move_right, do (interact), sleep, "
        "place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, "
        "make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword\n"
    )

    # Prompt for tool-based interaction; returned verbatim by
    # get_system_prompt_with_tools().
    _TOOLS_PROMPT: str = (
        "You are playing Crafter, a survival game by Danijar Hafner. Your goal is to collect resources, "
        "craft tools, survive, and unlock achievements.\n\n"
        "Rules & world:\n"
        "- Explore by chaining multiple movement actions in one turn.\n"
        "- You cannot walk onto blocking tiles: tree, stone, water, lava, coal, iron. Plan routes around obstacles.\n"
        "- Use 'do' intentionally when standing next to resources (trees, stone, cows, zombies, etc.).\n"
        "- Achievements come from collecting, crafting, building, fighting, and surviving.\n\n"
        "Strategy path:\n"
        "1. Move around to find trees → 'do' to collect wood.\n"
        "2. Craft a wood pickaxe.\n"
        "3. Gather stone → craft stone pickaxe.\n"
        "4. Place a table → unlock furnace and swords.\n"
        "5. Fight enemies (cow/zombie) with swords for achievements.\n"
        "6. Keep moving to discover new resources and stay alive. If you're in the middle of nowhere, take 5-8 consecutive move-related actions to explore and see what's outside your field of view. Don't delay exploration when it's the right move.\n\n"
        "You must use the 'interact_many' tool to perform actions in the game. "
        "This tool accepts an array of 1–5 actions to execute sequentially. Prefer sequences like "
        "[move_up, move_up, move_left, do] instead of single steps.\n\n"
        "Available actions: noop, move_up, move_down, move_left, move_right, do (interact), sleep, "
        "place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, "
        "make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword\n\n"
        "Always call the interact_many tool with your chosen actions. Do not write plain text actions.\n"
    )

    @staticmethod
    def get_system_prompt() -> str:
        """Return the system prompt that build_messages() places first."""
        return CrafterReActAgent._PLAIN_PROMPT

    @staticmethod
    def get_system_prompt_with_tools() -> str:
        """Return the system prompt for tool-based interaction (e.g., Qwen3 models)."""
        return CrafterReActAgent._TOOLS_PROMPT

    @staticmethod
    def build_messages(observation: str, history: Optional[List[Dict[str, str]]] = None, turn: Optional[int] = None) -> List[Dict[str, str]]:
        """Build an OpenAI-style message list: system, history, then user.

        The system message always comes from get_system_prompt(). ``turn`` is
        accepted but not used when assembling the list.
        """
        system_msg: Dict[str, str] = {
            "role": "system",
            "content": CrafterReActAgent.get_system_prompt(),
        }
        user_msg: Dict[str, str] = {"role": "user", "content": observation}
        # An empty or missing history contributes nothing between the two.
        prior = history if history else ()
        return [system_msg, *prior, user_msg]

    @staticmethod
    def parse_actions_from_response(response_text: str) -> List[str]:
        """Delegate to the shared ``parse_actions`` helper and return its result."""
        return parse_actions(response_text)
+
95
+
96
+ __all__ = ["CrafterReActAgent"]