synth-ai 0.2.10__py3-none-any.whl → 0.2.13.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai might be problematic.
- examples/agora_ex/README_MoE.md +224 -0
- examples/agora_ex/__init__.py +7 -0
- examples/agora_ex/agora_ex.py +65 -0
- examples/agora_ex/agora_ex_task_app.py +590 -0
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +121 -0
- examples/agora_ex/reward_fn_grpo-human.py +129 -0
- examples/agora_ex/system_prompt_CURRENT.md +63 -0
- examples/agora_ex/task_app/agora_ex_task_app.py +590 -0
- examples/agora_ex/task_app/reward_fn_grpo-human.py +129 -0
- examples/agora_ex/task_app/system_prompt_CURRENT.md +63 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +175 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
- examples/multi_step/crafter_rl_lora.md +51 -10
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +494 -0
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +35 -0
- examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
- examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +32 -0
- examples/warming_up_to_rl/run_eval.py +267 -41
- examples/warming_up_to_rl/task_app/grpo_crafter.py +3 -33
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +42 -46
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +376 -193
- synth_ai/__init__.py +41 -1
- synth_ai/api/train/builders.py +74 -33
- synth_ai/api/train/cli.py +29 -6
- synth_ai/api/train/configs/__init__.py +44 -0
- synth_ai/api/train/configs/rl.py +133 -0
- synth_ai/api/train/configs/sft.py +94 -0
- synth_ai/api/train/configs/shared.py +24 -0
- synth_ai/api/train/env_resolver.py +18 -19
- synth_ai/api/train/supported_algos.py +8 -5
- synth_ai/api/train/utils.py +6 -1
- synth_ai/cli/__init__.py +4 -2
- synth_ai/cli/_storage.py +19 -0
- synth_ai/cli/balance.py +14 -2
- synth_ai/cli/calc.py +37 -22
- synth_ai/cli/demo.py +38 -39
- synth_ai/cli/legacy_root_backup.py +12 -14
- synth_ai/cli/recent.py +12 -7
- synth_ai/cli/rl_demo.py +81 -102
- synth_ai/cli/status.py +4 -3
- synth_ai/cli/task_apps.py +146 -137
- synth_ai/cli/traces.py +4 -3
- synth_ai/cli/watch.py +3 -2
- synth_ai/demos/core/cli.py +121 -159
- synth_ai/environments/examples/crafter_classic/environment.py +16 -0
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/client.py +85 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/jobs/client.py +15 -3
- synth_ai/judge_schemas.py +127 -0
- synth_ai/rubrics/__init__.py +22 -0
- synth_ai/rubrics/validators.py +126 -0
- synth_ai/task/server.py +14 -7
- synth_ai/tracing_v3/decorators.py +51 -26
- synth_ai/tracing_v3/examples/basic_usage.py +12 -7
- synth_ai/tracing_v3/llm_call_record_helpers.py +107 -53
- synth_ai/tracing_v3/replica_sync.py +8 -4
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/storage/utils.py +11 -9
- synth_ai/tracing_v3/turso/__init__.py +12 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -1
- synth_ai/tracing_v3/turso/native_manager.py +28 -15
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/METADATA +4 -2
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/RECORD +73 -40
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/entry_points.txt +0 -1
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/top_level.txt +0 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py (+376 -193)

@@ -6,10 +6,10 @@ import logging
 import os
 import time as _time
 from datetime import datetime
-from typing import Any
+from typing import Any, Mapping

 from fastapi import APIRouter, HTTPException, Request, status
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from synth_ai.lm.vendors.base import BaseLMResponse
 from synth_ai.task.tracing_utils import unique_sft_path
 from synth_ai.tracing_v3.abstractions import EnvironmentEvent, LMCAISEvent, TimeRecord
@@ -142,12 +142,178 @@ class RolloutTrajectory(BaseModel):
     decision_samples: list[dict[str, Any]] | None = None


+def _normalize_step_strategy(raw_strategy: Any) -> str:
+    if not isinstance(raw_strategy, str):
+        return "consistent"
+    candidate = raw_strategy.strip().lower()
+    if not candidate:
+        return "consistent"
+    mapping = {
+        "simple": "consistent",
+        "consistent": "consistent",
+        "consistent_stepwise": "consistent",
+        "decision_consistent": "consistent",
+        "per_achievement": "per_achievement",
+        "per-achievement": "per_achievement",
+        "perachievement": "per_achievement",
+        "achievement_weighted": "per_achievement",
+        "complex": "per_achievement",
+    }
+    return mapping.get(candidate, "consistent")
+
+
+def _coerce_weights(raw_weights: Any) -> dict[str, float]:
+    weights: dict[str, float] = {}
+    if isinstance(raw_weights, dict):
+        for key, value in raw_weights.items():
+            try:
+                weights[str(key)] = float(value)
+            except Exception:
+                continue
+    return weights
+
+
+def _coerce_k_limits(raw_limits: Any) -> dict[str, int]:
+    limits: dict[str, int] = {}
+    if isinstance(raw_limits, dict):
+        for key, value in raw_limits.items():
+            try:
+                limits[str(key)] = int(value)
+            except Exception:
+                continue
+    return limits
+
+
+def _coerce_int_value(value: Any) -> int | None:
+    if isinstance(value, bool):
+        return int(value)
+    try:
+        return int(value)  # type: ignore[arg-type]
+    except Exception:
+        try:
+            return int(float(value))  # type: ignore[arg-type]
+        except Exception:
+            return None
+
+
+def _compute_resource_reward(
+    prev_inventory: Mapping[str, Any] | None,
+    new_inventory: Mapping[str, Any] | None,
+    prev_counts: Mapping[str, Any] | None,
+    new_counts: Mapping[str, Any] | None,
+) -> tuple[float, list[dict[str, Any]], dict[str, int], dict[str, int]]:
+    reward_total = 0.0
+    components: list[dict[str, Any]] = []
+    inventory_deltas: dict[str, int] = {}
+    achievement_deltas: dict[str, int] = {}
+
+    resource_weights = {
+        "wood": 0.10,
+        "sapling": 0.08,
+        "stone": 0.15,
+        "coal": 0.18,
+        "iron": 0.22,
+        "plant": 0.06,
+        "meat": 0.12,
+        "drink": 0.07,
+        "food": 0.07,
+        "water": 0.07,
+        "energy": 0.04,
+    }
+    tool_weights = {
+        "wood_pickaxe": 0.40,
+        "stone_pickaxe": 0.55,
+        "iron_pickaxe": 0.75,
+        "wood_sword": 0.35,
+        "stone_sword": 0.50,
+        "iron_sword": 0.70,
+        "furnace": 0.45,
+        "table": 0.30,
+        "bow": 0.45,
+    }
+    achievement_weights = {
+        "collect_wood": 0.08,
+        "collect_sapling": 0.06,
+        "collect_stone": 0.10,
+        "collect_coal": 0.12,
+        "collect_iron": 0.14,
+        "collect_drink": 0.06,
+        "collect_food": 0.06,
+        "collect_plant": 0.06,
+    }
+    default_resource_weight = 0.05
+    default_achievement_weight = 0.05
+
+    prev_inv = prev_inventory or {}
+    new_inv = new_inventory or {}
+    for key, raw_value in new_inv.items():
+        new_val = _coerce_int_value(raw_value)
+        if new_val is None:
+            continue
+        prev_val = _coerce_int_value(prev_inv.get(key, 0)) or 0
+        delta = new_val - prev_val
+        if delta <= 0:
+            continue
+        weight = resource_weights.get(key)
+        if weight is None and key in tool_weights:
+            weight = tool_weights[key]
+        if weight is None:
+            weight = default_resource_weight
+        gain = weight * delta
+        reward_total += gain
+        inventory_deltas[str(key)] = delta
+        components.append(
+            {
+                "type": "inventory",
+                "item": str(key),
+                "delta": delta,
+                "weight": weight,
+                "reward": gain,
+            }
+        )
+
+    prev_ct = prev_counts or {}
+    new_ct = new_counts or {}
+    for key, raw_value in new_ct.items():
+        new_val = _coerce_int_value(raw_value)
+        if new_val is None:
+            continue
+        prev_val = _coerce_int_value(prev_ct.get(key, 0)) or 0
+        delta = new_val - prev_val
+        if delta <= 0:
+            continue
+        weight = achievement_weights.get(key, default_achievement_weight)
+        gain = weight * delta
+        reward_total += gain
+        achievement_deltas[str(key)] = delta
+        components.append(
+            {
+                "type": "achievement_count",
+                "name": str(key),
+                "delta": delta,
+                "weight": weight,
+                "reward": gain,
+            }
+        )
+
+    return reward_total, components, inventory_deltas, achievement_deltas
+
+
 def compute_stepwise_reward(
     prev_achievements: dict[str, bool],
     new_achievements: dict[str, bool],
     decision_index: int,
     actions_summary: list[dict[str, Any]],
     indicator_lambda: float,
+    *,
+    strategy: str | None = None,
+    weights: dict[str, float] | None = None,
+    k_limits: dict[str, int] | None = None,
+    episode_counts: dict[str, int] | None = None,
+    prev_inventory: dict[str, int] | None = None,
+    new_inventory: dict[str, int] | None = None,
+    prev_counts: dict[str, int] | None = None,
+    new_counts: dict[str, int] | None = None,
 ) -> tuple[dict[str, Any], dict[str, Any], dict[str, float]]:
     """Compute stepwise reward metadata given achievement states before/after a decision."""

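The new module-level helpers normalize the configured step-reward strategy name and coerce user-supplied weight and limit tables into typed dicts. A minimal sketch of their behaviour on a hand-written config fragment (the dictionary values below are invented for illustration, not taken from a shipped config):

# Illustrative only: exercising the new helpers on a made-up config fragment.
cfg = {
    "strategy": "per-achievement",  # alias form; normalizes to "per_achievement"
    "weights": {"collect_wood": "0.5", "defeat_zombie": 2},
    "k_limits": {"collect_wood": "3", "defeat_zombie": 0},
}
assert _normalize_step_strategy(cfg["strategy"]) == "per_achievement"
assert _normalize_step_strategy(None) == "consistent"  # non-strings fall back to "consistent"
assert _coerce_weights(cfg["weights"]) == {"collect_wood": 0.5, "defeat_zombie": 2.0}
assert _coerce_k_limits(cfg["k_limits"]) == {"collect_wood": 3, "defeat_zombie": 0}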
@@ -155,26 +321,121 @@ def compute_stepwise_reward(
     next_map = new_achievements or {}

     unlocked = [name for name, value in next_map.items() if value and not prev_map.get(name, False)]
-
-
+    indicator_from_achievements = 1 if unlocked else 0
+    normalized_strategy = _normalize_step_strategy(strategy)
+    base_reward = 0.0
+    reward_components: list[dict[str, Any]] = []
+    credited: list[str] = []
+
+    if indicator_from_achievements:
+        if normalized_strategy == "per_achievement":
+            weight_map = weights or {}
+            limit_map = k_limits or {}
+            counts = episode_counts if isinstance(episode_counts, dict) else {}
+            for name in unlocked:
+                try:
+                    limit_val = int(limit_map.get(name, 1))
+                except Exception:
+                    limit_val = 1
+                # limit_val <= 0 implies unlimited rewards
+                unlimited = limit_val <= 0
+                try:
+                    prev_count = int(counts.get(name, 0))
+                except Exception:
+                    prev_count = 0
+                should_credit = unlimited or (prev_count < max(limit_val, 0))
+                if should_credit:
+                    try:
+                        weight_val = float(weight_map.get(name, 1.0))
+                    except Exception:
+                        weight_val = 1.0
+                    base_reward += weight_val
+                    reward_components.append(
+                        {
+                            "achievement": name,
+                            "weight": weight_val,
+                            "count_prior": prev_count,
+                            "count_limit": limit_val,
+                        }
+                    )
+                    credited.append(name)
+                    if episode_counts is not None:
+                        episode_counts[name] = prev_count + 1
+        else:
+            base_reward = 1.0
+            reward_components.append(
+                {
+                    "achievement": "__indicator__",
+                    "weight": 1.0,
+                    "count_prior": 0,
+                    "count_limit": 1,
+                }
+            )
+
+    resource_reward = 0.0
+    resource_components: list[dict[str, Any]] = []
+    inventory_deltas: dict[str, int] = {}
+    achievement_deltas: dict[str, int] = {}
+    if normalized_strategy == "per_achievement":
+        (
+            resource_reward,
+            resource_components,
+            inventory_deltas,
+            achievement_deltas,
+        ) = _compute_resource_reward(prev_inventory, new_inventory, prev_counts, new_counts)
+        if resource_components:
+            reward_components.extend(resource_components)
+            base_reward += resource_reward
+
+    indicator = 1 if base_reward > 0 else 0
+    if indicator == 0 and indicator_from_achievements:
+        indicator = indicator_from_achievements
+    lambda_effective = indicator_lambda if indicator_lambda not in (None, 0) else 1.0
+    reward_value = float(lambda_effective) * float(base_reward)

     stepwise_info = {
         "decision_index": decision_index,
         "indicator": indicator,
         "new_achievements": unlocked,
         "reward": reward_value,
+        "strategy": normalized_strategy,
+        "base_reward": float(base_reward),
     }
+    if indicator_from_achievements and not unlocked:
+        stepwise_info["indicator_from_achievements"] = indicator_from_achievements
+    if reward_components:
+        stepwise_info["components"] = reward_components
+    if credited:
+        stepwise_info["credited_achievements"] = credited
+    if resource_reward:
+        stepwise_info["resource_reward"] = float(resource_reward)
+    if inventory_deltas:
+        stepwise_info["inventory_deltas"] = inventory_deltas
+    if achievement_deltas:
+        stepwise_info["achievement_count_deltas"] = achievement_deltas
+
     decision_sample = {
         "decision_index": decision_index,
         "indicator": indicator,
         "r_i": reward_value,
+        "base": float(base_reward),
+        "strategy": normalized_strategy,
         "actions": actions_summary,
     }
+    if reward_components:
+        decision_sample["components"] = reward_components
+    if resource_reward:
+        decision_sample["resource_reward"] = float(resource_reward)
+
     stats = {
         "indicator": float(indicator),
         "reward": reward_value,
         "new_achievements_count": float(len(unlocked)),
+        "base_reward": float(base_reward),
+        "credited_achievements_count": float(len(credited)),
     }
+    if resource_reward:
+        stats["resource_reward"] = float(resource_reward)
     return stepwise_info, decision_sample, stats

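A sketch of how the extended compute_stepwise_reward might be called under the per_achievement strategy, showing the role of the new keyword-only arguments (all concrete values below are invented for illustration):

# Illustrative call; argument values are made up.
episode_counts: dict[str, int] = {}  # mutated in place to enforce k_limits across the episode
stepwise_info, decision_sample, stats = compute_stepwise_reward(
    prev_achievements={"collect_wood": False},
    new_achievements={"collect_wood": True},
    decision_index=3,
    actions_summary=[{"tool": "interact"}],
    indicator_lambda=1.0,
    strategy="per_achievement",
    weights={"collect_wood": 0.5},
    k_limits={"collect_wood": 1},  # credit this achievement at most once per episode
    episode_counts=episode_counts,
    prev_inventory={"wood": 0},
    new_inventory={"wood": 2},  # +2 wood adds 2 * 0.10 via _compute_resource_reward
    prev_counts={},
    new_counts={},
)
# Here stats["reward"] combines the 0.5 achievement credit with the 0.2 resource bonus.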
@@ -183,6 +444,9 @@ class RolloutMetrics(BaseModel):
     mean_return: float
     num_steps: int
     num_episodes: int = 0
+    outcome_score: float | None = None
+    events_score: float | None = None
+    details: dict[str, Any] = Field(default_factory=dict)


 class RolloutResponse(BaseModel):
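RolloutMetrics gains optional outcome/events scores and a free-form details dict. A minimal construction sketch (the field values are invented; the field names come from this diff):

# Illustrative only.
metrics = RolloutMetrics(
    episode_returns=[1.5],
    mean_return=1.5,
    num_steps=12,
    num_episodes=1,
    outcome_score=None,   # optional, defaults to None
    events_score=None,
    details={},           # defaults to {} via Field(default_factory=dict)
)
metrics.details["stepwise"] = {"reward_sum": 0.7}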
@@ -254,7 +518,7 @@ class RolloutTracingContext:
                 session_id=self.run_id, metadata=dict(self.metadata_base)
             )
         except Exception as exc:
-            logger.
+            logger.info("TRACING_START_FAIL: %s", exc)
             self.enabled = False
             self.tracer = None

@@ -1053,6 +1317,9 @@ async def execute_rollout(

     step_rewards_enabled = bool(step_rewards_cfg_raw.get("enabled", False))
     step_rewards_mode = str(step_rewards_cfg_raw.get("mode") or "off").lower()
+    step_rewards_strategy = _normalize_step_strategy(step_rewards_cfg_raw.get("strategy"))
+    step_rewards_weights = _coerce_weights(step_rewards_cfg_raw.get("weights"))
+    step_rewards_k_limits = _coerce_k_limits(step_rewards_cfg_raw.get("k_limits"))
     try:
         step_rewards_indicator_lambda = float(
             step_rewards_cfg_raw.get("indicator_lambda") or 0.0
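The rollout's step-rewards block is read with the keys shown above. A hypothetical raw config dict and the values the parsing would produce (the key names are from the diff; the concrete values, including the mode string, are assumptions):

# Hypothetical step_rewards block as it might arrive in the rollout request config.
step_rewards_cfg_raw = {
    "enabled": True,
    "mode": "decision_stepwise",  # accepted mode strings are not shown in this diff
    "strategy": "per_achievement",
    "weights": {"collect_wood": 0.5, "eat_cow": 1.0},
    "k_limits": {"collect_wood": 3},
    "indicator_lambda": 1.0,
}
step_rewards_enabled = bool(step_rewards_cfg_raw.get("enabled", False))            # True
step_rewards_mode = str(step_rewards_cfg_raw.get("mode") or "off").lower()         # "decision_stepwise"
step_rewards_strategy = _normalize_step_strategy(step_rewards_cfg_raw.get("strategy"))  # "per_achievement"
step_rewards_weights = _coerce_weights(step_rewards_cfg_raw.get("weights"))        # {"collect_wood": 0.5, "eat_cow": 1.0}
step_rewards_k_limits = _coerce_k_limits(step_rewards_cfg_raw.get("k_limits"))     # {"collect_wood": 3}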
@@ -1073,6 +1340,34 @@ async def execute_rollout(
                 return {str(k): bool(v) for k, v in ach.items()}
         return {}

+    def _extract_inventory(obs: Any) -> dict[str, int]:
+        if not isinstance(obs, dict):
+            return {}
+        inv = obs.get("inventory")
+        if not isinstance(inv, dict):
+            return {}
+        cleaned: dict[str, int] = {}
+        for key, value in inv.items():
+            coerced = _coerce_int_value(value)
+            if coerced is None:
+                continue
+            cleaned[str(key)] = coerced
+        return cleaned
+
+    def _extract_achievement_counts(obs: Any) -> dict[str, int]:
+        if not isinstance(obs, dict):
+            return {}
+        counts = obs.get("achievements_counts")
+        if not isinstance(counts, dict):
+            return {}
+        cleaned: dict[str, int] = {}
+        for key, value in counts.items():
+            coerced = _coerce_int_value(value)
+            if coerced is None:
+                continue
+            cleaned[str(key)] = coerced
+        return cleaned
+
     def _summarize_tool_calls(tool_calls: Any) -> list[dict[str, Any]]:
         if not tool_calls:
             return []
@@ -1109,12 +1404,16 @@ async def execute_rollout(
     session_trace = None
     finalized = False
     prev_achievements = _extract_achievements(current_obs)
+    prev_inventory_state = _extract_inventory(current_obs)
+    prev_achievement_counts_state = _extract_achievement_counts(current_obs)
     # Track episode-level achievements that have been seen as true at any point so far
     episode_seen_achievements: set[str] = {
         k for k, v in (prev_achievements or {}).items() if bool(v)
     }
+    episode_achievement_counts: dict[str, int] = {}
     stepwise_indicator_sum = 0.0
     stepwise_reward_sum = 0.0
+    stepwise_resource_reward_sum = 0.0
     stepwise_new_achievements_total = 0
     final_achievement_count = sum(1 for v in prev_achievements.values() if v)

|
|
|
1228
1527
|
req,
|
|
1229
1528
|
)
|
|
1230
1529
|
except Exception as _pe:
|
|
1231
|
-
#
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
str(_pe),
|
|
1238
|
-
)
|
|
1239
|
-
|
|
1240
|
-
# Build partial trajectory and return HTTP 200
|
|
1241
|
-
trajectory = RolloutTrajectory(
|
|
1242
|
-
env_id=env_id,
|
|
1243
|
-
policy_id=policy_id,
|
|
1244
|
-
steps=trajectory_steps,
|
|
1245
|
-
final={
|
|
1246
|
-
"observation": current_obs,
|
|
1247
|
-
"rollout_status": "partial_policy_error",
|
|
1248
|
-
"error": str(_pe),
|
|
1249
|
-
"at_op": op,
|
|
1250
|
-
},
|
|
1251
|
-
length=len(trajectory_steps),
|
|
1252
|
-
decision_samples=decision_samples if step_rewards_active else None,
|
|
1253
|
-
)
|
|
1254
|
-
metrics = RolloutMetrics(
|
|
1255
|
-
episode_returns=[total_reward],
|
|
1256
|
-
mean_return=total_reward,
|
|
1257
|
-
num_steps=len(trajectory_steps),
|
|
1258
|
-
num_episodes=1,
|
|
1259
|
-
)
|
|
1260
|
-
aborted = registry.is_run_aborted(request.run_id)
|
|
1261
|
-
if not aborted:
|
|
1262
|
-
registry.complete_run(request.run_id)
|
|
1263
|
-
if decision_open:
|
|
1264
|
-
await tracing_context.end_decision()
|
|
1265
|
-
decision_open = False
|
|
1266
|
-
if not finalized:
|
|
1267
|
-
session_trace = await tracing_context.finalize(
|
|
1268
|
-
total_reward=total_reward,
|
|
1269
|
-
achievement_state=prev_achievements,
|
|
1270
|
-
total_steps=len(trajectory_steps),
|
|
1271
|
-
)
|
|
1272
|
-
finalized = True
|
|
1273
|
-
trace_payload = tracing_context.build_trace_payload(session_trace)
|
|
1274
|
-
return RolloutResponse(
|
|
1275
|
-
run_id=request.run_id,
|
|
1276
|
-
trajectories=[trajectory],
|
|
1277
|
-
branches={},
|
|
1278
|
-
metrics=metrics,
|
|
1279
|
-
aborted=aborted,
|
|
1280
|
-
ops_executed=ops_executed,
|
|
1281
|
-
trace=trace_payload,
|
|
1530
|
+
# Hard fail the rollout on policy step error (e.g., inference auth 4xx)
|
|
1531
|
+
logger.error(
|
|
1532
|
+
"POLICY_STEP_HARD_FAIL: run_id=%s op_idx=%s err=%s",
|
|
1533
|
+
request.run_id,
|
|
1534
|
+
str(op_idx),
|
|
1535
|
+
str(_pe),
|
|
1282
1536
|
)
|
|
1537
|
+
raise HTTPException(status_code=500, detail=f"policy_step_failed: {str(_pe)}")
|
|
1283
1538
|
|
|
1284
1539
|
agent_response_ts = _time.perf_counter()
|
|
1285
1540
|
if isinstance(policy_response.meta, dict):
|
|
@@ -1346,69 +1601,15 @@ async def execute_rollout(

             elif op == "env":
                 if not pending_tool_calls:
-                    # Treat absence of tool calls as a soft terminal condition; yield partial trajectory
                     with contextlib.suppress(Exception):
                         logger.warning(
-                            "
+                            "POLICY_STEP_FAIL: missing tool_calls; failing rollout run_id=%s op_idx=%s",
                             request.run_id,
                             str(op_idx),
                         )
-
-
-
-                    )
-                    term_step = RolloutStep(
-                        obs=current_obs,
-                        tool_calls=[],
-                        reward=None,
-                        done=True,
-                        truncated=False,
-                        info={
-                            "terminated": True,
-                            "reason": "no_tool_calls",
-                        },
-                    )
-                    trajectory_steps.append(term_step)
-                    trajectory = RolloutTrajectory(
-                        env_id=env_id,
-                        policy_id=policy_id,
-                        steps=trajectory_steps,
-                        final={
-                            "observation": current_obs,
-                            "rollout_status": "partial_no_tool_calls",
-                            "at_op": op,
-                        },
-                        length=len(trajectory_steps),
-                        decision_samples=decision_samples if step_rewards_active else None,
-                    )
-                    metrics = RolloutMetrics(
-                        episode_returns=[total_reward],
-                        mean_return=total_reward,
-                        num_steps=len(trajectory_steps),
-                        num_episodes=1,
-                    )
-                    aborted = registry.is_run_aborted(request.run_id)
-                    if not aborted:
-                        registry.complete_run(request.run_id)
-                    if decision_open:
-                        await tracing_context.end_decision()
-                        decision_open = False
-                    if not finalized:
-                        session_trace = await tracing_context.finalize(
-                            total_reward=total_reward,
-                            achievement_state=prev_achievements,
-                            total_steps=len(trajectory_steps),
-                        )
-                        finalized = True
-                    trace_payload = tracing_context.build_trace_payload(session_trace)
-                    return RolloutResponse(
-                        run_id=request.run_id,
-                        trajectories=[trajectory],
-                        branches={},
-                        metrics=metrics,
-                        aborted=aborted,
-                        ops_executed=ops_executed,
-                        trace=trace_payload,
+                    raise HTTPException(
+                        status_code=500,
+                        detail="policy_step_failed: missing tool_calls (no_tool_calls)",
                     )

                 # Environment step
@@ -1437,85 +1638,16 @@ async def execute_rollout(
                 timing_env["env_step_end_s"] = env_step_end

                 if env_step_error is not None:
-                    # Invalid action or environment rejection — terminate episode early with partial trajectory
                     with contextlib.suppress(Exception):
                         logger.warning(
-                            "ENV_STEP_FAIL:
+                            "ENV_STEP_FAIL: failing rollout run_id=%s op_idx=%s err=%s",
                             request.run_id,
                             str(op_idx),
                             str(env_step_error),
                         )
-
-
-
-                        tool_calls=pending_tool_calls,
-                        reward=None,
-                        done=True,
-                        truncated=False,
-                        info={
-                            "terminated": True,
-                            "reason": "invalid_action",
-                            "error": str(env_step_error),
-                        },
-                    )
-                    trajectory_steps.append(term_step)
-                    # Build partial response
-                    trajectory = RolloutTrajectory(
-                        env_id=env_id,
-                        policy_id=policy_id,
-                        steps=trajectory_steps,
-                        final={
-                            "observation": current_obs,
-                            "rollout_status": "partial_invalid_action",
-                            "error": str(env_step_error),
-                            "at_op": op,
-                        },
-                        length=len(trajectory_steps),
-                        decision_samples=decision_samples if step_rewards_active else None,
-                    )
-                    metrics = RolloutMetrics(
-                        episode_returns=[total_reward],
-                        mean_return=total_reward,
-                        num_steps=len(trajectory_steps),
-                        num_episodes=1,
-                    )
-                    aborted = registry.is_run_aborted(request.run_id)
-                    if not aborted:
-                        registry.complete_run(request.run_id)
-                    if (
-                        last_policy_meta is not None
-                        and last_agent_response_ts is not None
-                        and "decision_ms" not in last_policy_meta.get("timing", {})
-                    ):
-                        with contextlib.suppress(Exception):
-                            timing_last = last_policy_meta.setdefault("timing", {})
-                            decision_ms = max(
-                                0.0,
-                                (env_step_end - float(last_agent_response_ts)) * 1000.0,
-                            )
-                            timing_last["decision_ms"] = decision_ms
-                            timing_last.setdefault(
-                                "overhead_ms", max(0.0, decision_ms - env_step_duration_ms)
-                            )
-                    if decision_open:
-                        await tracing_context.end_decision()
-                        decision_open = False
-                    if not finalized:
-                        session_trace = await tracing_context.finalize(
-                            total_reward=total_reward,
-                            achievement_state=prev_achievements,
-                            total_steps=len(trajectory_steps),
-                        )
-                        finalized = True
-                    trace_payload = tracing_context.build_trace_payload(session_trace)
-                    return RolloutResponse(
-                        run_id=request.run_id,
-                        trajectories=[trajectory],
-                        branches={},
-                        metrics=metrics,
-                        aborted=aborted,
-                        ops_executed=ops_executed,
-                        trace=trace_payload,
+                    raise HTTPException(
+                        status_code=500,
+                        detail=f"env_step_failed: {str(env_step_error)}",
                     )

                 # Reaching here means env step succeeded
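With these changes, policy-step errors, missing tool calls, and environment-step errors no longer return partial trajectories with HTTP 200; they surface as HTTP 500 with a detail string. A hedged client-side sketch of handling the new failure mode (the /rollout path, payload shape, and error envelope are assumptions, not confirmed by this diff):

# Sketch of how a caller might handle the hard-fail behaviour.
# Assumptions: the task app exposes POST /rollout and FastAPI returns {"detail": "..."} on HTTPException.
import httpx

def run_rollout(base_url: str, payload: dict) -> dict:
    resp = httpx.post(f"{base_url}/rollout", json=payload, timeout=600.0)
    if resp.status_code == 500:
        detail = resp.json().get("detail", "")
        # e.g. "policy_step_failed: ...", "env_step_failed: ...", or "no_steps_executed: avg_turns == 0"
        raise RuntimeError(f"rollout hard-failed: {detail}")
    resp.raise_for_status()
    return resp.json()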
@@ -1546,12 +1678,16 @@ async def execute_rollout(
                 decision_index += 1
                 next_obs = env_response.observation
                 new_achievement_state = _extract_achievements(next_obs)
+                new_inventory_state = _extract_inventory(next_obs)
+                new_achievement_counts_state = _extract_achievement_counts(next_obs)
                 final_achievement_count = sum(
                     1 for _, unlocked in new_achievement_state.items() if unlocked
                 )
                 indicator_val = 0
                 reward_stepwise = 0.0
                 decision_rewards_meta: dict[str, Any] | None = None
+                decision_record = None
+                _info = {} if not isinstance(_info, dict) else dict(_info)
                 if step_rewards_active:
                     decision_actions = _summarize_tool_calls(pending_tool_calls)
                     stepwise_info, decision_record, stats = compute_stepwise_reward(
@@ -1560,13 +1696,24 @@ async def execute_rollout(
                         decision_index,
                         decision_actions,
                         step_rewards_indicator_lambda,
+                        strategy=step_rewards_strategy,
+                        weights=step_rewards_weights,
+                        k_limits=step_rewards_k_limits,
+                        episode_counts=episode_achievement_counts,
+                        prev_inventory=prev_inventory_state,
+                        new_inventory=new_inventory_state,
+                        prev_counts=prev_achievement_counts_state,
+                        new_counts=new_achievement_counts_state,
                     )
                     indicator_val = int(stats.get("indicator", 0.0))
                     reward_stepwise = float(stats.get("reward", 0.0))
                     stepwise_indicator_sum += float(stats.get("indicator", 0.0))
                     stepwise_reward_sum += reward_stepwise
                     stepwise_new_achievements_total += int(stats.get("new_achievements_count", 0.0))
-
+                    with contextlib.suppress(Exception):
+                        resource_component = stats.get("resource_reward")
+                        if resource_component is not None:
+                            stepwise_resource_reward_sum += float(resource_component)
                     _info["stepwise"] = stepwise_info
                     # Compute decision-level rewards (absolute vs unique) and attach to metadata
                     with contextlib.suppress(Exception):
@@ -1588,13 +1735,16 @@ async def execute_rollout(
                             "all": all_list,
                             "unique": new_unique,
                         }
-
-
-
-
-
+                        decision_rewards_meta = decision_rewards
+                        meta_block["decision_rewards"] = decision_rewards
+                        _info["meta"] = meta_block
+                        # Update episode-level seen set after attributing uniqueness to this decision
+                        episode_seen_achievements.update(turned_true)
+                    if decision_record is not None:
                         decision_samples.append(decision_record)
                     prev_achievements = new_achievement_state
+                    prev_inventory_state = new_inventory_state
+                    prev_achievement_counts_state = new_achievement_counts_state

                     await tracing_context.record_decision_reward(
                         event_id=event_id,
@@ -1656,6 +1806,11 @@ async def execute_rollout(

                 reset_response = await reset_environment(EnvResetRequest(env_id=env_id))
                 current_obs = reset_response.observation
+                prev_achievements = _extract_achievements(current_obs)
+                episode_seen_achievements = {
+                    k for k, v in (prev_achievements or {}).items() if bool(v)
+                }
+                episode_achievement_counts.clear()
             elif request.on_done == "terminate":
                 break

@@ -1704,6 +1859,30 @@ async def execute_rollout(
         num_steps=len(trajectory_steps),
         num_episodes=1,
     )
+    if step_rewards_active:
+        stepwise_summary: dict[str, Any] = {
+            "indicator_sum": float(stepwise_indicator_sum),
+            "reward_sum": float(stepwise_reward_sum),
+            "resource_reward": float(stepwise_resource_reward_sum),
+            "new_achievements_total": int(stepwise_new_achievements_total),
+            "mode": step_rewards_mode,
+            "strategy": step_rewards_strategy,
+            "indicator_lambda": float(step_rewards_indicator_lambda),
+        }
+        if step_rewards_beta:
+            stepwise_summary["step_beta"] = float(step_rewards_beta)
+        if step_rewards_strategy == "per_achievement":
+            if step_rewards_weights:
+                stepwise_summary["weights"] = dict(step_rewards_weights)
+            if step_rewards_k_limits:
+                stepwise_summary["k_limits"] = dict(step_rewards_k_limits)
+        final_achievements_list = sorted(
+            key for key, val in (prev_achievements or {}).items() if bool(val)
+        )
+        stepwise_summary["unique_achievements_total"] = int(len(episode_seen_achievements))
+        stepwise_summary["unique_achievements"] = sorted(episode_seen_achievements)
+        stepwise_summary["final_achievements"] = final_achievements_list
+        metrics.details["stepwise"] = stepwise_summary

     # Environment-specific: Log summary if available
     try:
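After a rollout, the stepwise summary lands under metrics.details["stepwise"] in the response. A sketch of reading it from a returned payload (the surrounding response shape is assumed from the fields in this diff):

# Illustrative: inspect the stepwise summary attached to rollout metrics.
def summarize_stepwise(rollout_response: dict) -> None:
    details = (rollout_response.get("metrics") or {}).get("details") or {}
    stepwise = details.get("stepwise")
    if not stepwise:
        return
    print("strategy:", stepwise.get("strategy"))
    print("reward_sum:", stepwise.get("reward_sum"))
    print("resource_reward:", stepwise.get("resource_reward"))
    print("unique achievements:", stepwise.get("unique_achievements", []))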
@@ -1760,6 +1939,10 @@ async def execute_rollout(
         finalized = True
     trace_payload = tracing_context.build_trace_payload(session_trace)

+    # Hard-fail if no steps executed (avg_turns == 0 scenario)
+    if metrics.num_steps <= 0:
+        raise HTTPException(status_code=500, detail="no_steps_executed: avg_turns == 0")
+
     return RolloutResponse(
         run_id=request.run_id,
         trajectories=[trajectory],