synth-ai 0.2.10__py3-none-any.whl → 0.2.13.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (73)
  1. examples/agora_ex/README_MoE.md +224 -0
  2. examples/agora_ex/__init__.py +7 -0
  3. examples/agora_ex/agora_ex.py +65 -0
  4. examples/agora_ex/agora_ex_task_app.py +590 -0
  5. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +121 -0
  6. examples/agora_ex/reward_fn_grpo-human.py +129 -0
  7. examples/agora_ex/system_prompt_CURRENT.md +63 -0
  8. examples/agora_ex/task_app/agora_ex_task_app.py +590 -0
  9. examples/agora_ex/task_app/reward_fn_grpo-human.py +129 -0
  10. examples/agora_ex/task_app/system_prompt_CURRENT.md +63 -0
  11. examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
  12. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +175 -0
  13. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
  14. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
  15. examples/multi_step/crafter_rl_lora.md +51 -10
  16. examples/multi_step/sse_metrics_streaming_notes.md +357 -0
  17. examples/multi_step/task_app_config_notes.md +494 -0
  18. examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +35 -0
  19. examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
  20. examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
  21. examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +32 -0
  22. examples/warming_up_to_rl/run_eval.py +267 -41
  23. examples/warming_up_to_rl/task_app/grpo_crafter.py +3 -33
  24. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
  25. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +42 -46
  26. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +376 -193
  27. synth_ai/__init__.py +41 -1
  28. synth_ai/api/train/builders.py +74 -33
  29. synth_ai/api/train/cli.py +29 -6
  30. synth_ai/api/train/configs/__init__.py +44 -0
  31. synth_ai/api/train/configs/rl.py +133 -0
  32. synth_ai/api/train/configs/sft.py +94 -0
  33. synth_ai/api/train/configs/shared.py +24 -0
  34. synth_ai/api/train/env_resolver.py +18 -19
  35. synth_ai/api/train/supported_algos.py +8 -5
  36. synth_ai/api/train/utils.py +6 -1
  37. synth_ai/cli/__init__.py +4 -2
  38. synth_ai/cli/_storage.py +19 -0
  39. synth_ai/cli/balance.py +14 -2
  40. synth_ai/cli/calc.py +37 -22
  41. synth_ai/cli/demo.py +38 -39
  42. synth_ai/cli/legacy_root_backup.py +12 -14
  43. synth_ai/cli/recent.py +12 -7
  44. synth_ai/cli/rl_demo.py +81 -102
  45. synth_ai/cli/status.py +4 -3
  46. synth_ai/cli/task_apps.py +146 -137
  47. synth_ai/cli/traces.py +4 -3
  48. synth_ai/cli/watch.py +3 -2
  49. synth_ai/demos/core/cli.py +121 -159
  50. synth_ai/environments/examples/crafter_classic/environment.py +16 -0
  51. synth_ai/evals/__init__.py +15 -0
  52. synth_ai/evals/client.py +85 -0
  53. synth_ai/evals/types.py +42 -0
  54. synth_ai/jobs/client.py +15 -3
  55. synth_ai/judge_schemas.py +127 -0
  56. synth_ai/rubrics/__init__.py +22 -0
  57. synth_ai/rubrics/validators.py +126 -0
  58. synth_ai/task/server.py +14 -7
  59. synth_ai/tracing_v3/decorators.py +51 -26
  60. synth_ai/tracing_v3/examples/basic_usage.py +12 -7
  61. synth_ai/tracing_v3/llm_call_record_helpers.py +107 -53
  62. synth_ai/tracing_v3/replica_sync.py +8 -4
  63. synth_ai/tracing_v3/serialization.py +130 -0
  64. synth_ai/tracing_v3/storage/utils.py +11 -9
  65. synth_ai/tracing_v3/turso/__init__.py +12 -0
  66. synth_ai/tracing_v3/turso/daemon.py +2 -1
  67. synth_ai/tracing_v3/turso/native_manager.py +28 -15
  68. {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/METADATA +4 -2
  69. {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/RECORD +73 -40
  70. {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/entry_points.txt +0 -1
  71. {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/WHEEL +0 -0
  72. {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/licenses/LICENSE +0 -0
  73. {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/top_level.txt +0 -0
@@ -6,10 +6,10 @@ import logging
  import os
  import time as _time
  from datetime import datetime
- from typing import Any
+ from typing import Any, Mapping

  from fastapi import APIRouter, HTTPException, Request, status
- from pydantic import BaseModel
+ from pydantic import BaseModel, Field
  from synth_ai.lm.vendors.base import BaseLMResponse
  from synth_ai.task.tracing_utils import unique_sft_path
  from synth_ai.tracing_v3.abstractions import EnvironmentEvent, LMCAISEvent, TimeRecord
@@ -142,12 +142,178 @@ class RolloutTrajectory(BaseModel):
      decision_samples: list[dict[str, Any]] | None = None


+ def _normalize_step_strategy(raw_strategy: Any) -> str:
+     if not isinstance(raw_strategy, str):
+         return "consistent"
+     candidate = raw_strategy.strip().lower()
+     if not candidate:
+         return "consistent"
+     mapping = {
+         "simple": "consistent",
+         "consistent": "consistent",
+         "consistent_stepwise": "consistent",
+         "decision_consistent": "consistent",
+         "per_achievement": "per_achievement",
+         "per-achievement": "per_achievement",
+         "perachievement": "per_achievement",
+         "achievement_weighted": "per_achievement",
+         "complex": "per_achievement",
+     }
+     return mapping.get(candidate, "consistent")
+
+
+ def _coerce_weights(raw_weights: Any) -> dict[str, float]:
+     weights: dict[str, float] = {}
+     if isinstance(raw_weights, dict):
+         for key, value in raw_weights.items():
+             try:
+                 weights[str(key)] = float(value)
+             except Exception:
+                 continue
+     return weights
+
+
+ def _coerce_k_limits(raw_limits: Any) -> dict[str, int]:
+     limits: dict[str, int] = {}
+     if isinstance(raw_limits, dict):
+         for key, value in raw_limits.items():
+             try:
+                 limits[str(key)] = int(value)
+             except Exception:
+                 continue
+     return limits
+
+
+ def _coerce_int_value(value: Any) -> int | None:
+     if isinstance(value, bool):
+         return int(value)
+     try:
+         return int(value)  # type: ignore[arg-type]
+     except Exception:
+         try:
+             return int(float(value))  # type: ignore[arg-type]
+         except Exception:
+             return None
+
+
+ def _compute_resource_reward(
+     prev_inventory: Mapping[str, Any] | None,
+     new_inventory: Mapping[str, Any] | None,
+     prev_counts: Mapping[str, Any] | None,
+     new_counts: Mapping[str, Any] | None,
+ ) -> tuple[float, list[dict[str, Any]], dict[str, int], dict[str, int]]:
+     reward_total = 0.0
+     components: list[dict[str, Any]] = []
+     inventory_deltas: dict[str, int] = {}
+     achievement_deltas: dict[str, int] = {}
+
+     resource_weights = {
+         "wood": 0.10,
+         "sapling": 0.08,
+         "stone": 0.15,
+         "coal": 0.18,
+         "iron": 0.22,
+         "plant": 0.06,
+         "meat": 0.12,
+         "drink": 0.07,
+         "food": 0.07,
+         "water": 0.07,
+         "energy": 0.04,
+     }
+     tool_weights = {
+         "wood_pickaxe": 0.40,
+         "stone_pickaxe": 0.55,
+         "iron_pickaxe": 0.75,
+         "wood_sword": 0.35,
+         "stone_sword": 0.50,
+         "iron_sword": 0.70,
+         "furnace": 0.45,
+         "table": 0.30,
+         "bow": 0.45,
+     }
+     achievement_weights = {
+         "collect_wood": 0.08,
+         "collect_sapling": 0.06,
+         "collect_stone": 0.10,
+         "collect_coal": 0.12,
+         "collect_iron": 0.14,
+         "collect_drink": 0.06,
+         "collect_food": 0.06,
+         "collect_plant": 0.06,
+     }
+     default_resource_weight = 0.05
+     default_achievement_weight = 0.05
+
+     prev_inv = prev_inventory or {}
+     new_inv = new_inventory or {}
+     for key, raw_value in new_inv.items():
+         new_val = _coerce_int_value(raw_value)
+         if new_val is None:
+             continue
+         prev_val = _coerce_int_value(prev_inv.get(key, 0)) or 0
+         delta = new_val - prev_val
+         if delta <= 0:
+             continue
+         weight = resource_weights.get(key)
+         if weight is None and key in tool_weights:
+             weight = tool_weights[key]
+         if weight is None:
+             weight = default_resource_weight
+         gain = weight * delta
+         reward_total += gain
+         inventory_deltas[str(key)] = delta
+         components.append(
+             {
+                 "type": "inventory",
+                 "item": str(key),
+                 "delta": delta,
+                 "weight": weight,
+                 "reward": gain,
+             }
+         )
+
+     prev_ct = prev_counts or {}
+     new_ct = new_counts or {}
+     for key, raw_value in new_ct.items():
+         new_val = _coerce_int_value(raw_value)
+         if new_val is None:
+             continue
+         prev_val = _coerce_int_value(prev_ct.get(key, 0)) or 0
+         delta = new_val - prev_val
+         if delta <= 0:
+             continue
+         weight = achievement_weights.get(key, default_achievement_weight)
+         gain = weight * delta
+         reward_total += gain
+         achievement_deltas[str(key)] = delta
+         components.append(
+             {
+                 "type": "achievement_count",
+                 "name": str(key),
+                 "delta": delta,
+                 "weight": weight,
+                 "reward": gain,
+             }
+         )
+
+     return reward_total, components, inventory_deltas, achievement_deltas
+
+
  def compute_stepwise_reward(
      prev_achievements: dict[str, bool],
      new_achievements: dict[str, bool],
      decision_index: int,
      actions_summary: list[dict[str, Any]],
      indicator_lambda: float,
+     *,
+     strategy: str | None = None,
+     weights: dict[str, float] | None = None,
+     k_limits: dict[str, int] | None = None,
+     episode_counts: dict[str, int] | None = None,
+     prev_inventory: dict[str, int] | None = None,
+     new_inventory: dict[str, int] | None = None,
+     prev_counts: dict[str, int] | None = None,
+     new_counts: dict[str, int] | None = None,
  ) -> tuple[dict[str, Any], dict[str, Any], dict[str, float]]:
      """Compute stepwise reward metadata given achievement states before/after a decision."""

@@ -155,26 +321,121 @@ def compute_stepwise_reward(
      next_map = new_achievements or {}

      unlocked = [name for name, value in next_map.items() if value and not prev_map.get(name, False)]
-     indicator = 1 if unlocked else 0
-     reward_value = float(indicator_lambda) * indicator
+     indicator_from_achievements = 1 if unlocked else 0
+     normalized_strategy = _normalize_step_strategy(strategy)
+     base_reward = 0.0
+     reward_components: list[dict[str, Any]] = []
+     credited: list[str] = []
+
+     if indicator_from_achievements:
+         if normalized_strategy == "per_achievement":
+             weight_map = weights or {}
+             limit_map = k_limits or {}
+             counts = episode_counts if isinstance(episode_counts, dict) else {}
+             for name in unlocked:
+                 try:
+                     limit_val = int(limit_map.get(name, 1))
+                 except Exception:
+                     limit_val = 1
+                 # limit_val <= 0 implies unlimited rewards
+                 unlimited = limit_val <= 0
+                 try:
+                     prev_count = int(counts.get(name, 0))
+                 except Exception:
+                     prev_count = 0
+                 should_credit = unlimited or (prev_count < max(limit_val, 0))
+                 if should_credit:
+                     try:
+                         weight_val = float(weight_map.get(name, 1.0))
+                     except Exception:
+                         weight_val = 1.0
+                     base_reward += weight_val
+                     reward_components.append(
+                         {
+                             "achievement": name,
+                             "weight": weight_val,
+                             "count_prior": prev_count,
+                             "count_limit": limit_val,
+                         }
+                     )
+                     credited.append(name)
+                     if episode_counts is not None:
+                         episode_counts[name] = prev_count + 1
+         else:
+             base_reward = 1.0
+             reward_components.append(
+                 {
+                     "achievement": "__indicator__",
+                     "weight": 1.0,
+                     "count_prior": 0,
+                     "count_limit": 1,
+                 }
+             )
+
+     resource_reward = 0.0
+     resource_components: list[dict[str, Any]] = []
+     inventory_deltas: dict[str, int] = {}
+     achievement_deltas: dict[str, int] = {}
+     if normalized_strategy == "per_achievement":
+         (
+             resource_reward,
+             resource_components,
+             inventory_deltas,
+             achievement_deltas,
+         ) = _compute_resource_reward(prev_inventory, new_inventory, prev_counts, new_counts)
+         if resource_components:
+             reward_components.extend(resource_components)
+         base_reward += resource_reward
+
+     indicator = 1 if base_reward > 0 else 0
+     if indicator == 0 and indicator_from_achievements:
+         indicator = indicator_from_achievements
+     lambda_effective = indicator_lambda if indicator_lambda not in (None, 0) else 1.0
+     reward_value = float(lambda_effective) * float(base_reward)

      stepwise_info = {
          "decision_index": decision_index,
          "indicator": indicator,
          "new_achievements": unlocked,
          "reward": reward_value,
+         "strategy": normalized_strategy,
+         "base_reward": float(base_reward),
      }
+     if indicator_from_achievements and not unlocked:
+         stepwise_info["indicator_from_achievements"] = indicator_from_achievements
+     if reward_components:
+         stepwise_info["components"] = reward_components
+     if credited:
+         stepwise_info["credited_achievements"] = credited
+     if resource_reward:
+         stepwise_info["resource_reward"] = float(resource_reward)
+     if inventory_deltas:
+         stepwise_info["inventory_deltas"] = inventory_deltas
+     if achievement_deltas:
+         stepwise_info["achievement_count_deltas"] = achievement_deltas
+
      decision_sample = {
          "decision_index": decision_index,
          "indicator": indicator,
          "r_i": reward_value,
+         "base": float(base_reward),
+         "strategy": normalized_strategy,
          "actions": actions_summary,
      }
+     if reward_components:
+         decision_sample["components"] = reward_components
+     if resource_reward:
+         decision_sample["resource_reward"] = float(resource_reward)
+
      stats = {
          "indicator": float(indicator),
          "reward": reward_value,
          "new_achievements_count": float(len(unlocked)),
+         "base_reward": float(base_reward),
+         "credited_achievements_count": float(len(credited)),
      }
+     if resource_reward:
+         stats["resource_reward"] = float(resource_reward)
      return stepwise_info, decision_sample, stats

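For orientation, here is a minimal sketch of how the extended `compute_stepwise_reward` signature above might be exercised. The call shape and return tuple follow the hunk; the achievement names, weights, and tool-call summary are illustrative assumptions, and the helper is assumed importable from the task app's rollout module.

```python
# Illustrative only; names and values below are assumptions, not package defaults.
episode_counts: dict[str, int] = {}

stepwise_info, decision_sample, stats = compute_stepwise_reward(
    prev_achievements={"collect_wood": False},
    new_achievements={"collect_wood": True},
    decision_index=3,
    actions_summary=[{"tool": "interact"}],   # shape produced by _summarize_tool_calls (assumed)
    indicator_lambda=1.0,
    strategy="per_achievement",
    weights={"collect_wood": 0.5},
    k_limits={"collect_wood": 1},             # credit this unlock at most once per episode
    episode_counts=episode_counts,            # mutated in place to track credited unlocks
    prev_inventory={"wood": 0},
    new_inventory={"wood": 2},                # +2 wood also earns a small resource bonus
    prev_counts={},
    new_counts={"collect_wood": 1},
)
# stats["reward"] combines the 0.5 achievement weight with the resource bonus, and
# episode_counts == {"collect_wood": 1}, so a repeat unlock would not be re-credited.
```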
@@ -183,6 +444,9 @@ class RolloutMetrics(BaseModel):
      mean_return: float
      num_steps: int
      num_episodes: int = 0
+     outcome_score: float | None = None
+     events_score: float | None = None
+     details: dict[str, Any] = Field(default_factory=dict)


  class RolloutResponse(BaseModel):
@@ -254,7 +518,7 @@ class RolloutTracingContext:
                  session_id=self.run_id, metadata=dict(self.metadata_base)
              )
          except Exception as exc:
-             logger.warning("TRACING_START_FAIL: %s", exc)
+             logger.info("TRACING_START_FAIL: %s", exc)
              self.enabled = False
              self.tracer = None

@@ -1053,6 +1317,9 @@ async def execute_rollout(

      step_rewards_enabled = bool(step_rewards_cfg_raw.get("enabled", False))
      step_rewards_mode = str(step_rewards_cfg_raw.get("mode") or "off").lower()
+     step_rewards_strategy = _normalize_step_strategy(step_rewards_cfg_raw.get("strategy"))
+     step_rewards_weights = _coerce_weights(step_rewards_cfg_raw.get("weights"))
+     step_rewards_k_limits = _coerce_k_limits(step_rewards_cfg_raw.get("k_limits"))
      try:
          step_rewards_indicator_lambda = float(
              step_rewards_cfg_raw.get("indicator_lambda") or 0.0
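The three new getters above imply a step-rewards config block carrying `strategy`, `weights`, and `k_limits` alongside the existing `enabled`, `mode`, and `indicator_lambda` keys. A hedged sketch of one plausible shape (the key names come from the hunk; the values and the mode string are assumptions):

```python
# Assumed example of the dict that step_rewards_cfg_raw may hold at this point.
step_rewards_cfg_raw = {
    "enabled": True,
    "mode": "decision_stepwise",    # any value other than "off"; valid modes are defined elsewhere
    "strategy": "per_achievement",  # normalized by _normalize_step_strategy
    "indicator_lambda": 1.0,
    "weights": {"collect_wood": 0.5, "collect_stone": 1.0},  # coerced via _coerce_weights
    "k_limits": {"collect_wood": 1},                         # coerced via _coerce_k_limits
}
```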
@@ -1073,6 +1340,34 @@ async def execute_rollout(
              return {str(k): bool(v) for k, v in ach.items()}
          return {}

+     def _extract_inventory(obs: Any) -> dict[str, int]:
+         if not isinstance(obs, dict):
+             return {}
+         inv = obs.get("inventory")
+         if not isinstance(inv, dict):
+             return {}
+         cleaned: dict[str, int] = {}
+         for key, value in inv.items():
+             coerced = _coerce_int_value(value)
+             if coerced is None:
+                 continue
+             cleaned[str(key)] = coerced
+         return cleaned
+
+     def _extract_achievement_counts(obs: Any) -> dict[str, int]:
+         if not isinstance(obs, dict):
+             return {}
+         counts = obs.get("achievements_counts")
+         if not isinstance(counts, dict):
+             return {}
+         cleaned: dict[str, int] = {}
+         for key, value in counts.items():
+             coerced = _coerce_int_value(value)
+             if coerced is None:
+                 continue
+             cleaned[str(key)] = coerced
+         return cleaned
+
      def _summarize_tool_calls(tool_calls: Any) -> list[dict[str, Any]]:
          if not tool_calls:
              return []
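A small sketch of the observation fragment these two new helpers read, assuming the Crafter observation exposes `inventory` and `achievements_counts` dicts as the code above expects (the concrete values are made up):

```python
# Hypothetical observation fragment; non-numeric values are dropped by _coerce_int_value.
obs = {
    "inventory": {"wood": 3, "stone": "2", "status": "n/a"},
    "achievements_counts": {"collect_wood": 1},
}
# Inside execute_rollout this would yield:
#   _extract_inventory(obs)          -> {"wood": 3, "stone": 2}
#   _extract_achievement_counts(obs) -> {"collect_wood": 1}
```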
@@ -1109,12 +1404,16 @@ async def execute_rollout(
      session_trace = None
      finalized = False
      prev_achievements = _extract_achievements(current_obs)
+     prev_inventory_state = _extract_inventory(current_obs)
+     prev_achievement_counts_state = _extract_achievement_counts(current_obs)
      # Track episode-level achievements that have been seen as true at any point so far
      episode_seen_achievements: set[str] = {
          k for k, v in (prev_achievements or {}).items() if bool(v)
      }
+     episode_achievement_counts: dict[str, int] = {}
      stepwise_indicator_sum = 0.0
      stepwise_reward_sum = 0.0
+     stepwise_resource_reward_sum = 0.0
      stepwise_new_achievements_total = 0
      final_achievement_count = sum(1 for v in prev_achievements.values() if v)

@@ -1228,58 +1527,14 @@ async def execute_rollout(
                      req,
                  )
              except Exception as _pe:
-                 # Do not 500 the rollout; finalize with partial trajectory
-                 with contextlib.suppress(Exception):
-                     logger.warning(
-                         "POLICY_STEP_FAIL: terminating episode early run_id=%s op_idx=%s err=%s",
-                         request.run_id,
-                         str(op_idx),
-                         str(_pe),
-                     )
-
-                 # Build partial trajectory and return HTTP 200
-                 trajectory = RolloutTrajectory(
-                     env_id=env_id,
-                     policy_id=policy_id,
-                     steps=trajectory_steps,
-                     final={
-                         "observation": current_obs,
-                         "rollout_status": "partial_policy_error",
-                         "error": str(_pe),
-                         "at_op": op,
-                     },
-                     length=len(trajectory_steps),
-                     decision_samples=decision_samples if step_rewards_active else None,
-                 )
-                 metrics = RolloutMetrics(
-                     episode_returns=[total_reward],
-                     mean_return=total_reward,
-                     num_steps=len(trajectory_steps),
-                     num_episodes=1,
-                 )
-                 aborted = registry.is_run_aborted(request.run_id)
-                 if not aborted:
-                     registry.complete_run(request.run_id)
-                 if decision_open:
-                     await tracing_context.end_decision()
-                     decision_open = False
-                 if not finalized:
-                     session_trace = await tracing_context.finalize(
-                         total_reward=total_reward,
-                         achievement_state=prev_achievements,
-                         total_steps=len(trajectory_steps),
-                     )
-                     finalized = True
-                 trace_payload = tracing_context.build_trace_payload(session_trace)
-                 return RolloutResponse(
-                     run_id=request.run_id,
-                     trajectories=[trajectory],
-                     branches={},
-                     metrics=metrics,
-                     aborted=aborted,
-                     ops_executed=ops_executed,
-                     trace=trace_payload,
+                 # Hard fail the rollout on policy step error (e.g., inference auth 4xx)
+                 logger.error(
+                     "POLICY_STEP_HARD_FAIL: run_id=%s op_idx=%s err=%s",
+                     request.run_id,
+                     str(op_idx),
+                     str(_pe),
                  )
+                 raise HTTPException(status_code=500, detail=f"policy_step_failed: {str(_pe)}")

              agent_response_ts = _time.perf_counter()
              if isinstance(policy_response.meta, dict):
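The behavioral consequence of this hunk: a policy-step failure no longer comes back as HTTP 200 with a `partial_policy_error` trajectory, it surfaces as HTTP 500. A hedged client-side sketch (the endpoint path and request payload here are assumptions, not the package's documented API):

```python
import httpx

# Hypothetical call against a hosted task app; URL and payload shape are illustrative.
resp = httpx.post(f"{task_app_url}/rollout", json=rollout_request, timeout=600)
if resp.status_code == 500 and "policy_step_failed" in resp.text:
    # With 0.2.13.dev1 there is no partial trajectory to salvage; retry or fail the run.
    ...
```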
@@ -1346,69 +1601,15 @@ async def execute_rollout(

          elif op == "env":
              if not pending_tool_calls:
-                 # Treat absence of tool calls as a soft terminal condition; yield partial trajectory
                  with contextlib.suppress(Exception):
                      logger.warning(
-                         "NO_TOOL_CALLS: terminating episode early run_id=%s op_idx=%s",
+                         "POLICY_STEP_FAIL: missing tool_calls; failing rollout run_id=%s op_idx=%s",
                          request.run_id,
                          str(op_idx),
                      )
-                 print(
-                     f"[rollout] no tool_calls; terminating early run_id={request.run_id} op_idx={op_idx}",
-                     flush=True,
-                 )
-                 term_step = RolloutStep(
-                     obs=current_obs,
-                     tool_calls=[],
-                     reward=None,
-                     done=True,
-                     truncated=False,
-                     info={
-                         "terminated": True,
-                         "reason": "no_tool_calls",
-                     },
-                 )
-                 trajectory_steps.append(term_step)
-                 trajectory = RolloutTrajectory(
-                     env_id=env_id,
-                     policy_id=policy_id,
-                     steps=trajectory_steps,
-                     final={
-                         "observation": current_obs,
-                         "rollout_status": "partial_no_tool_calls",
-                         "at_op": op,
-                     },
-                     length=len(trajectory_steps),
-                     decision_samples=decision_samples if step_rewards_active else None,
-                 )
-                 metrics = RolloutMetrics(
-                     episode_returns=[total_reward],
-                     mean_return=total_reward,
-                     num_steps=len(trajectory_steps),
-                     num_episodes=1,
-                 )
-                 aborted = registry.is_run_aborted(request.run_id)
-                 if not aborted:
-                     registry.complete_run(request.run_id)
-                 if decision_open:
-                     await tracing_context.end_decision()
-                     decision_open = False
-                 if not finalized:
-                     session_trace = await tracing_context.finalize(
-                         total_reward=total_reward,
-                         achievement_state=prev_achievements,
-                         total_steps=len(trajectory_steps),
-                     )
-                     finalized = True
-                 trace_payload = tracing_context.build_trace_payload(session_trace)
-                 return RolloutResponse(
-                     run_id=request.run_id,
-                     trajectories=[trajectory],
-                     branches={},
-                     metrics=metrics,
-                     aborted=aborted,
-                     ops_executed=ops_executed,
-                     trace=trace_payload,
+                 raise HTTPException(
+                     status_code=500,
+                     detail="policy_step_failed: missing tool_calls (no_tool_calls)",
                  )

              # Environment step
@@ -1437,85 +1638,16 @@ async def execute_rollout(
              timing_env["env_step_end_s"] = env_step_end

              if env_step_error is not None:
-                 # Invalid action or environment rejection — terminate episode early with partial trajectory
                  with contextlib.suppress(Exception):
                      logger.warning(
-                         "ENV_STEP_FAIL: terminating episode early run_id=%s op_idx=%s err=%s",
+                         "ENV_STEP_FAIL: failing rollout run_id=%s op_idx=%s err=%s",
                          request.run_id,
                          str(op_idx),
                          str(env_step_error),
                      )
-
-                 term_step = RolloutStep(
-                     obs=current_obs,
-                     tool_calls=pending_tool_calls,
-                     reward=None,
-                     done=True,
-                     truncated=False,
-                     info={
-                         "terminated": True,
-                         "reason": "invalid_action",
-                         "error": str(env_step_error),
-                     },
-                 )
-                 trajectory_steps.append(term_step)
-                 # Build partial response
-                 trajectory = RolloutTrajectory(
-                     env_id=env_id,
-                     policy_id=policy_id,
-                     steps=trajectory_steps,
-                     final={
-                         "observation": current_obs,
-                         "rollout_status": "partial_invalid_action",
-                         "error": str(env_step_error),
-                         "at_op": op,
-                     },
-                     length=len(trajectory_steps),
-                     decision_samples=decision_samples if step_rewards_active else None,
-                 )
-                 metrics = RolloutMetrics(
-                     episode_returns=[total_reward],
-                     mean_return=total_reward,
-                     num_steps=len(trajectory_steps),
-                     num_episodes=1,
-                 )
-                 aborted = registry.is_run_aborted(request.run_id)
-                 if not aborted:
-                     registry.complete_run(request.run_id)
-                 if (
-                     last_policy_meta is not None
-                     and last_agent_response_ts is not None
-                     and "decision_ms" not in last_policy_meta.get("timing", {})
-                 ):
-                     with contextlib.suppress(Exception):
-                         timing_last = last_policy_meta.setdefault("timing", {})
-                         decision_ms = max(
-                             0.0,
-                             (env_step_end - float(last_agent_response_ts)) * 1000.0,
-                         )
-                         timing_last["decision_ms"] = decision_ms
-                         timing_last.setdefault(
-                             "overhead_ms", max(0.0, decision_ms - env_step_duration_ms)
-                         )
-                 if decision_open:
-                     await tracing_context.end_decision()
-                     decision_open = False
-                 if not finalized:
-                     session_trace = await tracing_context.finalize(
-                         total_reward=total_reward,
-                         achievement_state=prev_achievements,
-                         total_steps=len(trajectory_steps),
-                     )
-                     finalized = True
-                 trace_payload = tracing_context.build_trace_payload(session_trace)
-                 return RolloutResponse(
-                     run_id=request.run_id,
-                     trajectories=[trajectory],
-                     branches={},
-                     metrics=metrics,
-                     aborted=aborted,
-                     ops_executed=ops_executed,
-                     trace=trace_payload,
+                 raise HTTPException(
+                     status_code=500,
+                     detail=f"env_step_failed: {str(env_step_error)}",
                  )

              # Reaching here means env step succeeded
@@ -1546,12 +1678,16 @@ async def execute_rollout(
              decision_index += 1
              next_obs = env_response.observation
              new_achievement_state = _extract_achievements(next_obs)
+             new_inventory_state = _extract_inventory(next_obs)
+             new_achievement_counts_state = _extract_achievement_counts(next_obs)
              final_achievement_count = sum(
                  1 for _, unlocked in new_achievement_state.items() if unlocked
              )
              indicator_val = 0
              reward_stepwise = 0.0
              decision_rewards_meta: dict[str, Any] | None = None
+             decision_record = None
+             _info = {} if not isinstance(_info, dict) else dict(_info)
              if step_rewards_active:
                  decision_actions = _summarize_tool_calls(pending_tool_calls)
                  stepwise_info, decision_record, stats = compute_stepwise_reward(
@@ -1560,13 +1696,24 @@ async def execute_rollout(
                      decision_index,
                      decision_actions,
                      step_rewards_indicator_lambda,
+                     strategy=step_rewards_strategy,
+                     weights=step_rewards_weights,
+                     k_limits=step_rewards_k_limits,
+                     episode_counts=episode_achievement_counts,
+                     prev_inventory=prev_inventory_state,
+                     new_inventory=new_inventory_state,
+                     prev_counts=prev_achievement_counts_state,
+                     new_counts=new_achievement_counts_state,
                  )
                  indicator_val = int(stats.get("indicator", 0.0))
                  reward_stepwise = float(stats.get("reward", 0.0))
                  stepwise_indicator_sum += float(stats.get("indicator", 0.0))
                  stepwise_reward_sum += reward_stepwise
                  stepwise_new_achievements_total += int(stats.get("new_achievements_count", 0.0))
-                 _info = {} if not isinstance(_info, dict) else dict(_info)
+                 with contextlib.suppress(Exception):
+                     resource_component = stats.get("resource_reward")
+                     if resource_component is not None:
+                         stepwise_resource_reward_sum += float(resource_component)
                  _info["stepwise"] = stepwise_info
                  # Compute decision-level rewards (absolute vs unique) and attach to metadata
                  with contextlib.suppress(Exception):
@@ -1588,13 +1735,16 @@ async def execute_rollout(
                          "all": all_list,
                          "unique": new_unique,
                      }
-                     decision_rewards_meta = decision_rewards
-                     meta_block["decision_rewards"] = decision_rewards
-                     _info["meta"] = meta_block
-                     # Update episode-level seen set after attributing uniqueness to this decision
-                     episode_seen_achievements.update(turned_true)
+                     decision_rewards_meta = decision_rewards
+                     meta_block["decision_rewards"] = decision_rewards
+                     _info["meta"] = meta_block
+                     # Update episode-level seen set after attributing uniqueness to this decision
+                     episode_seen_achievements.update(turned_true)
+                 if decision_record is not None:
                      decision_samples.append(decision_record)
                  prev_achievements = new_achievement_state
+                 prev_inventory_state = new_inventory_state
+                 prev_achievement_counts_state = new_achievement_counts_state

                  await tracing_context.record_decision_reward(
                      event_id=event_id,
@@ -1656,6 +1806,11 @@ async def execute_rollout(

                      reset_response = await reset_environment(EnvResetRequest(env_id=env_id))
                      current_obs = reset_response.observation
+                     prev_achievements = _extract_achievements(current_obs)
+                     episode_seen_achievements = {
+                         k for k, v in (prev_achievements or {}).items() if bool(v)
+                     }
+                     episode_achievement_counts.clear()
                  elif request.on_done == "terminate":
                      break

@@ -1704,6 +1859,30 @@ async def execute_rollout(
          num_steps=len(trajectory_steps),
          num_episodes=1,
      )
+     if step_rewards_active:
+         stepwise_summary: dict[str, Any] = {
+             "indicator_sum": float(stepwise_indicator_sum),
+             "reward_sum": float(stepwise_reward_sum),
+             "resource_reward": float(stepwise_resource_reward_sum),
+             "new_achievements_total": int(stepwise_new_achievements_total),
+             "mode": step_rewards_mode,
+             "strategy": step_rewards_strategy,
+             "indicator_lambda": float(step_rewards_indicator_lambda),
+         }
+         if step_rewards_beta:
+             stepwise_summary["step_beta"] = float(step_rewards_beta)
+         if step_rewards_strategy == "per_achievement":
+             if step_rewards_weights:
+                 stepwise_summary["weights"] = dict(step_rewards_weights)
+             if step_rewards_k_limits:
+                 stepwise_summary["k_limits"] = dict(step_rewards_k_limits)
+         final_achievements_list = sorted(
+             key for key, val in (prev_achievements or {}).items() if bool(val)
+         )
+         stepwise_summary["unique_achievements_total"] = int(len(episode_seen_achievements))
+         stepwise_summary["unique_achievements"] = sorted(episode_seen_achievements)
+         stepwise_summary["final_achievements"] = final_achievements_list
+         metrics.details["stepwise"] = stepwise_summary

      # Environment-specific: Log summary if available
      try:
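Because `RolloutMetrics` now carries a `details` dict (see the earlier hunk adding `details: dict[str, Any] = Field(default_factory=dict)`), trainers and eval scripts can read the per-episode stepwise summary off the response. A hedged consumer-side sketch, assuming a parsed `RolloutResponse` object named `rollout_response`:

```python
# Assumed consumer-side access; the key names match the stepwise_summary built above.
summary = rollout_response.metrics.details.get("stepwise", {})
print("stepwise reward:", summary.get("reward_sum"))
print("unique achievements:", summary.get("unique_achievements_total"))
for name in summary.get("final_achievements", []):
    print("unlocked at episode end:", name)
```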
@@ -1760,6 +1939,10 @@ async def execute_rollout(
          finalized = True
      trace_payload = tracing_context.build_trace_payload(session_trace)

+     # Hard-fail if no steps executed (avg_turns == 0 scenario)
+     if metrics.num_steps <= 0:
+         raise HTTPException(status_code=500, detail="no_steps_executed: avg_turns == 0")
+
      return RolloutResponse(
          run_id=request.run_id,
          trajectories=[trajectory]