PyPI - synth-ai - Versions diffs - 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl - Mend

synth-ai 0.2.13.dev2 (py3-none-any.whl) → 0.2.16 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (293)

examples/README.md +1 -0
examples/multi_step/SFT_README.md +147 -0
examples/multi_step/configs/README_verilog_rl.md +77 -0
examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -11
examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
examples/multi_step/configs/crafter_synth_backend.md +40 -0
examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
examples/multi_step/configs/verilog_rl_lora.toml +190 -0
examples/multi_step/convert_traces_to_sft.py +84 -0
examples/multi_step/judges/crafter_backend_judge.py +220 -0
examples/multi_step/judges/verilog_backend_judge.py +234 -0
examples/multi_step/readme.md +48 -0
examples/multi_step/run_sft_qwen30b.sh +45 -0
examples/multi_step/verilog_rl_lora.md +218 -0
examples/qwen_coder/configs/coder_lora_30b.toml +3 -2
examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
examples/qwen_coder/configs/coder_lora_small.toml +2 -1
examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
examples/qwen_vl/QUICKSTART.md +327 -0
examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
examples/qwen_vl/README.md +154 -0
examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
examples/qwen_vl/RL_VISION_TESTING.md +333 -0
examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
examples/qwen_vl/SETUP_COMPLETE.md +275 -0
examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
examples/qwen_vl/__init__.py +2 -0
examples/qwen_vl/collect_data_via_cli.md +423 -0
examples/qwen_vl/collect_vision_traces.py +368 -0
examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
examples/qwen_vl/configs/filter_vision_test.toml +8 -0
examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
examples/qwen_vl/run_vision_comparison.sh +62 -0
examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
examples/qwen_vl/test_image_validation.py +201 -0
examples/qwen_vl/test_sft_vision_data.py +110 -0
examples/rl/README.md +1 -1
examples/rl/configs/eval_base_qwen.toml +17 -0
examples/rl/configs/eval_rl_qwen.toml +13 -0
examples/rl/configs/rl_from_base_qwen.toml +37 -0
examples/rl/configs/rl_from_base_qwen17.toml +76 -0
examples/rl/configs/rl_from_ft_qwen.toml +37 -0
examples/rl/run_eval.py +436 -0
examples/rl/run_rl_and_save.py +111 -0
examples/rl/task_app/README.md +22 -0
examples/rl/task_app/math_single_step.py +990 -0
examples/rl/task_app/math_task_app.py +111 -0
examples/sft/README.md +5 -5
examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
examples/sft/evaluate.py +4 -4
examples/sft/export_dataset.py +7 -4
examples/sft/generate_traces.py +2 -0
examples/swe/task_app/README.md +1 -1
examples/swe/task_app/grpo_swe_mini.py +1 -1
examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
examples/swe/task_app/hosted/policy_routes.py +0 -2
examples/swe/task_app/hosted/rollout.py +2 -8
examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
examples/task_apps/crafter/task_app/__init__.py +3 -0
examples/task_apps/crafter/task_app/grpo_crafter.py +309 -14
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +75 -4
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +55 -3
examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +114 -32
examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +127 -27
examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
examples/task_apps/enron/__init__.py +1 -0
examples/task_apps/enron/filter_sft.toml +5 -0
examples/task_apps/enron/tests/__init__.py +2 -0
examples/task_apps/enron/tests/integration/__init__.py +2 -0
examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
examples/task_apps/enron/tests/unit/__init__.py +2 -0
examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
examples/task_apps/pokemon_red/task_app.py +199 -6
examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
examples/task_apps/sokoban/filter_sft.toml +5 -0
examples/task_apps/sokoban/tests/__init__.py +2 -0
examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
examples/task_apps/verilog/filter_sft.toml +5 -0
examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
examples/task_apps/verilog/tests/__init__.py +2 -0
examples/task_apps/verilog/tests/integration/__init__.py +2 -0
examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
examples/task_apps/verilog/tests/unit/__init__.py +2 -0
examples/vlm/README.md +3 -3
examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
examples/vlm/crafter_openai_vlm_agent.py +3 -5
examples/vlm/filter_image_rows.py +1 -1
examples/vlm/run_crafter_vlm_benchmark.py +2 -2
examples/warming_up_to_rl/_utils.py +92 -0
examples/warming_up_to_rl/analyze_trace_db.py +1 -1
examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
examples/warming_up_to_rl/export_trace_sft.py +174 -60
examples/warming_up_to_rl/groq_test.py +2 -0
examples/warming_up_to_rl/readme.md +63 -132
examples/warming_up_to_rl/run_fft_and_save.py +1 -1
examples/warming_up_to_rl/run_local_rollout.py +2 -0
examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
examples/warming_up_to_rl/run_rl_and_save.py +1 -1
examples/warming_up_to_rl/run_rollout_remote.py +2 -0
examples/warming_up_to_rl/task_app/README.md +42 -0
examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
synth_ai/__init__.py +44 -30
synth_ai/_utils/__init__.py +47 -0
synth_ai/_utils/base_url.py +10 -0
synth_ai/_utils/http.py +10 -0
synth_ai/_utils/prompts.py +10 -0
synth_ai/_utils/task_app_state.py +12 -0
synth_ai/_utils/user_config.py +10 -0
synth_ai/api/models/supported.py +145 -7
synth_ai/api/train/__init__.py +13 -1
synth_ai/api/train/cli.py +30 -7
synth_ai/api/train/config_finder.py +18 -11
synth_ai/api/train/env_resolver.py +13 -10
synth_ai/cli/__init__.py +66 -49
synth_ai/cli/_modal_wrapper.py +9 -6
synth_ai/cli/_typer_patch.py +0 -2
synth_ai/cli/_validate_task_app.py +22 -4
synth_ai/cli/legacy_root_backup.py +3 -1
synth_ai/cli/lib/__init__.py +10 -0
synth_ai/cli/lib/task_app_discovery.py +7 -0
synth_ai/cli/lib/task_app_env.py +518 -0
synth_ai/cli/recent.py +1 -0
synth_ai/cli/setup.py +266 -0
synth_ai/cli/task_app_deploy.py +16 -0
synth_ai/cli/task_app_list.py +25 -0
synth_ai/cli/task_app_modal_serve.py +16 -0
synth_ai/cli/task_app_serve.py +18 -0
synth_ai/cli/task_apps.py +392 -141
synth_ai/cli/train.py +18 -0
synth_ai/cli/tui.py +62 -0
synth_ai/demos/__init__.py +10 -0
synth_ai/demos/core/__init__.py +28 -1
synth_ai/demos/crafter/__init__.py +1 -0
synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
synth_ai/demos/demo_registry.py +176 -0
synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
synth_ai/demos/math/__init__.py +1 -0
synth_ai/demos/math/_common.py +16 -0
synth_ai/demos/math/app.py +38 -0
synth_ai/demos/math/config.toml +76 -0
synth_ai/demos/math/deploy_modal.py +54 -0
synth_ai/demos/math/modal_task_app.py +702 -0
synth_ai/demos/math/task_app_entry.py +51 -0
synth_ai/environments/environment/core.py +7 -1
synth_ai/environments/examples/bandit/engine.py +0 -1
synth_ai/environments/examples/bandit/environment.py +0 -1
synth_ai/environments/examples/crafter_classic/environment.py +1 -1
synth_ai/environments/examples/verilog/engine.py +76 -10
synth_ai/environments/examples/wordle/environment.py +0 -1
synth_ai/evals/base.py +16 -5
synth_ai/evals/client.py +1 -1
synth_ai/inference/client.py +1 -1
synth_ai/learning/client.py +1 -1
synth_ai/learning/health.py +1 -1
synth_ai/learning/jobs.py +1 -1
synth_ai/learning/rl/client.py +1 -1
synth_ai/learning/rl/env_keys.py +1 -1
synth_ai/learning/rl/secrets.py +1 -1
synth_ai/learning/sft/client.py +1 -1
synth_ai/learning/sft/data.py +407 -4
synth_ai/learning/validators.py +4 -1
synth_ai/task/__init__.py +11 -1
synth_ai/task/apps/__init__.py +5 -2
synth_ai/task/config.py +259 -0
synth_ai/task/contracts.py +15 -2
synth_ai/task/rubrics/__init__.py +4 -2
synth_ai/task/rubrics/loaders.py +27 -4
synth_ai/task/rubrics/scoring.py +3 -0
synth_ai/task/rubrics.py +219 -0
synth_ai/task/trace_correlation_helpers.py +328 -0
synth_ai/task/tracing_utils.py +14 -3
synth_ai/task/validators.py +145 -2
synth_ai/tracing_v3/config.py +15 -13
synth_ai/tracing_v3/constants.py +21 -0
synth_ai/tracing_v3/db_config.py +3 -1
synth_ai/tracing_v3/decorators.py +10 -7
synth_ai/tracing_v3/session_tracer.py +10 -0
synth_ai/tracing_v3/turso/daemon.py +2 -2
synth_ai/tracing_v3/turso/native_manager.py +108 -77
synth_ai/tracing_v3/utils.py +1 -1
synth_ai/tui/__init__.py +5 -0
synth_ai/tui/__main__.py +13 -0
synth_ai/tui/cli/__init__.py +1 -0
synth_ai/tui/cli/query_experiments.py +164 -0
synth_ai/tui/cli/query_experiments_v3.py +164 -0
synth_ai/tui/dashboard.py +911 -0
synth_ai/utils/__init__.py +101 -0
synth_ai/utils/base_url.py +94 -0
synth_ai/utils/cli.py +131 -0
synth_ai/utils/env.py +287 -0
synth_ai/utils/http.py +169 -0
synth_ai/utils/modal.py +308 -0
synth_ai/utils/process.py +212 -0
synth_ai/utils/prompts.py +39 -0
synth_ai/utils/sqld.py +122 -0
synth_ai/utils/task_app_discovery.py +882 -0
synth_ai/utils/task_app_env.py +186 -0
synth_ai/utils/task_app_state.py +318 -0
synth_ai/utils/user_config.py +137 -0
synth_ai/v0/config/__init__.py +1 -5
synth_ai/v0/config/base_url.py +1 -7
synth_ai/v0/tracing/config.py +1 -1
synth_ai/v0/tracing/decorators.py +1 -1
synth_ai/v0/tracing/upload.py +1 -1
synth_ai/v0/tracing_v1/config.py +1 -1
synth_ai/v0/tracing_v1/decorators.py +1 -1
synth_ai/v0/tracing_v1/upload.py +1 -1
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/RECORD +286 -135
synth_ai/cli/man.py +0 -106
synth_ai/compound/cais.py +0 -0
synth_ai/core/experiment.py +0 -13
synth_ai/core/system.py +0 -15
synth_ai/demo_registry.py +0 -295
synth_ai/handshake.py +0 -109
synth_ai/http.py +0 -26
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0

synth_ai/cli/task_apps.py CHANGED Viewed

@@ -36,19 +36,29 @@ except Exception:  # pragma: no cover - fallback
 import click
 from click.exceptions import Abort
-from synth_ai.tracing_v3 import (  # type: ignore[import-untyped]
-    BaseEvent,
-    EnvironmentEvent,
-    RuntimeEvent,
-    SessionEventMarkovBlanketMessage,
-    SessionMessageContent,
-    SessionTimeStep,
-    SessionTracer,
-    TimeRecord,
-)
-from synth_ai.tracing_v3 import (  # type: ignore[import-untyped]
-    SessionTrace as V3SessionTrace,
-)
+# Tracing imports - make conditional for optional dependencies
+try:
+    from synth_ai.tracing_v3 import (  # type: ignore[import-untyped]
+        BaseEvent,
+        EnvironmentEvent,
+        RuntimeEvent,
+        SessionEventMarkovBlanketMessage,
+        SessionMessageContent,
+        SessionTimeStep,
+        SessionTracer,
+        TimeRecord,
+    )
+    from synth_ai.tracing_v3 import (  # type: ignore[import-untyped]
+        SessionTrace as V3SessionTrace,
+    )
+    _TRACING_AVAILABLE = True
+except (ImportError, ModuleNotFoundError, TypeError):
+    # Tracing system not available (missing optional dependencies)
+    BaseEvent = EnvironmentEvent = RuntimeEvent = None  # type: ignore
+    SessionEventMarkovBlanketMessage = SessionMessageContent = None  # type: ignore
+    SessionTimeStep = SessionTracer = TimeRecord = None  # type: ignore
+    V3SessionTrace = None  # type: ignore
+    _TRACING_AVAILABLE = False
 # ---------------------------------------------------------------------------
 # Dynamic imports to avoid hard dependencies during type checking.
@@ -231,6 +241,24 @@ def _event_from_dict(payload: dict[str, Any]) -> BaseEvent:
             system_state_after=payload.get("system_state_after"),
             **base_kwargs,
         )
+    # Check for LM CAIS event fields
+    if any(key in payload for key in ("model_name", "provider", "call_records")):
+        from synth_ai.tracing_v3.abstractions import LMCAISEvent
+        # Note: call_records are left as dicts - the storage layer will handle serialization
+        call_records = payload.get("call_records") or []
+        return LMCAISEvent(
+            model_name=payload.get("model_name", ""),
+            provider=payload.get("provider", ""),
+            input_tokens=payload.get("input_tokens"),
+            output_tokens=payload.get("output_tokens"),
+            total_tokens=payload.get("total_tokens"),
+            cost_usd=payload.get("cost_usd"),
+            latency_ms=payload.get("latency_ms"),
+            span_id=payload.get("span_id"),
+            trace_id=payload.get("trace_id"),
+            call_records=call_records,
+            **base_kwargs,
+        )
     return BaseEvent(**base_kwargs)
@@ -320,21 +348,51 @@ async def _store_trace(
     trace_namespace: dict[str, Any] | None,
     extra_metadata: dict[str, Any] | None = None,
 ):
+    import logging
+    _logger = logging.getLogger(__name__)
+    _logger.info(f"[STORE_TRACE_DEBUG] Called with tracer={tracer is not None}, trace_namespace={trace_namespace is not None}")
     if tracer is None or not isinstance(trace_namespace, dict):
+        _logger.warning(f"[STORE_TRACE_DEBUG] Early return: tracer={tracer is not None}, trace_namespace type={type(trace_namespace)}")
         return
+    _logger.info(f"[STORE_TRACE_DEBUG] trace_namespace keys: {list(trace_namespace.keys())}")
+    # Handle both formats:
+    # - With session_trace key: {"session_trace": {...}}
+    # - Without session_trace key (trace itself is the session): {"session_id": ..., "markov_blanket_message_history": ...}
     session_payload = trace_namespace.get("session_trace")
     if not isinstance(session_payload, dict):
-        return
+        # If no session_trace key, assume "full" format where trace itself is the session_trace
+        if "session_id" in trace_namespace:
+            session_payload = trace_namespace
+            _logger.info("[STORE_TRACE_DEBUG] Using trace_namespace directly as session_payload (no session_trace key)")
+        else:
+            _logger.warning(f"[STORE_TRACE_DEBUG] No session_trace found or wrong type: {type(session_payload)}")
+            return
+    _logger.info(f"[STORE_TRACE_DEBUG] session_payload keys: {list(session_payload.keys())}")
+    msg_count = len(session_payload.get("markov_blanket_message_history", []))
+    _logger.info(f"[STORE_TRACE_DEBUG] Found {msg_count} messages in session_payload")
     trace_obj = _session_trace_from_dict(session_payload)
     if trace_obj is None:
+        _logger.warning("[STORE_TRACE_DEBUG] _session_trace_from_dict returned None")
         return
+    _logger.info(f"[STORE_TRACE_DEBUG] Created SessionTrace object with {len(trace_obj.markov_blanket_message_history)} messages")
     if tracer.db is None:
         await tracer.initialize()
     meta = dict(trace_obj.metadata or {})
     if extra_metadata:
         meta.update(extra_metadata)
     trace_obj.metadata = meta
+    _logger.info(f"[STORE_TRACE_DEBUG] Calling insert_session_trace for session_id={trace_obj.session_id}")
     await tracer.db.insert_session_trace(trace_obj)
+    _logger.info("[STORE_TRACE_DEBUG] Successfully inserted trace")
 def _temporary_sys_path(paths: Sequence[Path]):
     """Context manager to prepend entries to sys.path temporarily."""
@@ -881,43 +939,43 @@ def _build_modal_config_from_ast(modal_call: ast.Call) -> ModalDeploymentConfigT
         for kw in modal_call.keywords:
             if kw.arg and isinstance(kw.value, ast.Constant):
                 kwargs[kw.arg] = kw.value.value
-            elif kw.arg == "pip_packages" and isinstance(kw.value, (ast.List, ast.Tuple)):
+            elif kw.arg == "pip_packages" and isinstance(kw.value, ast.List | ast.Tuple):
                 # Handle pip_packages list/tuple
                 packages: list[str] = []
                 value_node = kw.value
-                if isinstance(value_node, (ast.List, ast.Tuple)):
+                if isinstance(value_node, ast.List | ast.Tuple):
                     for elt in value_node.elts:
                         if isinstance(elt, ast.Constant):
                             packages.append(elt.value)
                 kwargs[kw.arg] = tuple(packages)
-            elif kw.arg == "extra_local_dirs" and isinstance(kw.value, (ast.List, ast.Tuple)):
+            elif kw.arg == "extra_local_dirs" and isinstance(kw.value, ast.List | ast.Tuple):
                 # Handle extra_local_dirs list/tuple of tuples
                 dirs = []
                 value_node = kw.value
-                if isinstance(value_node, (ast.List, ast.Tuple)):
+                if isinstance(value_node, ast.List | ast.Tuple):
                     for elt in value_node.elts:
-                        if isinstance(elt, (ast.List, ast.Tuple)) and len(elt.elts) == 2:
+                        if isinstance(elt, ast.List | ast.Tuple) and len(elt.elts) == 2:
                             src = elt.elts[0].value if isinstance(elt.elts[0], ast.Constant) else None
                             dst = elt.elts[1].value if isinstance(elt.elts[1], ast.Constant) else None
                             if src and dst:
                                 dirs.append((src, dst))
                 kwargs[kw.arg] = tuple(dirs)
-            elif kw.arg == "secret_names" and isinstance(kw.value, (ast.List, ast.Tuple)):
+            elif kw.arg == "secret_names" and isinstance(kw.value, ast.List | ast.Tuple):
                 # Handle secret_names list/tuple
                 secrets = []
                 value_node = kw.value
-                if isinstance(value_node, (ast.List, ast.Tuple)):
+                if isinstance(value_node, ast.List | ast.Tuple):
                     for elt in value_node.elts:
                         if isinstance(elt, ast.Constant):
                             secrets.append(elt.value)
                 kwargs[kw.arg] = tuple(secrets)
-            elif kw.arg == "volume_mounts" and isinstance(kw.value, (ast.List, ast.Tuple)):
+            elif kw.arg == "volume_mounts" and isinstance(kw.value, ast.List | ast.Tuple):
                 # Handle volume_mounts list/tuple of tuples
                 mounts = []
                 value_node = kw.value
-                if isinstance(value_node, (ast.List, ast.Tuple)):
+                if isinstance(value_node, ast.List | ast.Tuple):
                     for elt in value_node.elts:
-                        if isinstance(elt, (ast.List, ast.Tuple)) and len(elt.elts) == 2:
+                        if isinstance(elt, ast.List | ast.Tuple) and len(elt.elts) == 2:
                             name = elt.elts[0].value if isinstance(elt.elts[0], ast.Constant) else None
                             mount = elt.elts[1].value if isinstance(elt.elts[1], ast.Constant) else None
                             if name and mount:
@@ -2213,7 +2271,7 @@ def validate_task_app_cmd(
     import time
     # Import the validate_task_app function defined in this module
-    from synth_ai.cli._validate_task_app import validate_task_app  # type: ignore[attr-defined]
+    from ._validate_task_app import validate_task_app  # type: ignore[attr-defined]
     proc = None
     task_app_url = url
@@ -3044,6 +3102,11 @@ def _write_modal_entrypoint(
     if not any(str(p).startswith("synth-ai") for p in pip_packages):
         pip_packages.insert(0, synth_pkg)
+    apt_packages = list(modal_cfg.apt_packages)
+    click.echo(f"[DEBUG] modal_cfg.apt_packages type: {type(modal_cfg.apt_packages)}")
+    click.echo(f"[DEBUG] modal_cfg.apt_packages value: {modal_cfg.apt_packages}")
+    click.echo(f"[DEBUG] apt_packages after list(): {apt_packages}")
     local_dirs = [(str(Path(src)), dst) for src, dst in modal_cfg.extra_local_dirs]
     # Also mount the host synth_ai source if available to ensure latest code is used
     if host_synth is not None:
@@ -3090,6 +3153,15 @@ INLINE_SECRET_VALUES = {inline_secret_values!r}
 image = Image.debian_slim(python_version={modal_cfg.python_version!r})
+# CRITICAL: Install iverilog for Verilog task app (hardcoded to prevent config issues)
+if {entry.app_id!r} == "grpo-verilog":
+    image = image.apt_install("iverilog")
+# Install apt packages first (before pip)
+apt_packages = {apt_packages!r}
+if apt_packages:
+    image = image.apt_install(*apt_packages)
 pip_packages = {pip_packages!r}
 if pip_packages:
     image = image.pip_install(*pip_packages)
@@ -3251,7 +3323,7 @@ def register(cli: click.Group) -> None:
 )
 @click.option(
     "--trace-db",
-    default="traces/v3/eval_traces.db",
+    default="traces/v3/synth_ai.db",
     show_default=True,
     help="SQLite/Turso URL for storing rollout traces set to 'none' to disable persistence.",
 )
@@ -3284,8 +3356,13 @@ def eval_command(
     pointing at a remote `--url`, supply matching `--env-file` values so the CLI can
     forward authentication headers to the running service.
     """
+    # Parse and validate TOML config
+    from synth_ai.task.config import EvalConfig
     cfg: dict[str, Any] = {}
+    eval_cfg: EvalConfig | None = None
     config_path: Path | None = None
     if config:
         config_path = Path(config)
     else:
@@ -3307,21 +3384,37 @@ def eval_command(
             if isinstance(parsed, dict):
                 section = parsed.get("eval")
                 cfg = dict(section) if isinstance(section, dict) else dict(parsed)
+            # Validate config with dataclass
+            try:
+                eval_cfg = EvalConfig.from_dict(cfg)
+                click.echo(f"✓ Config validated: {len(eval_cfg.seeds)} seeds, model={eval_cfg.model}")
+            except (ValueError, TypeError) as validation_error:
+                raise click.ClickException(f"Invalid eval config: {validation_error}") from validation_error
+        except click.ClickException:
+            raise
         except Exception as exc:
             raise click.ClickException(f"Failed to parse TOML '{config_path}': {exc}") from exc
-    app_id = app_id or (cfg.get("app_id") if isinstance(cfg.get("app_id"), str) else None)  # type: ignore
+    # CLI args override config
+    if eval_cfg:
+        app_id = app_id or eval_cfg.app_id
+    else:
+        app_id = app_id or (cfg.get("app_id") if isinstance(cfg.get("app_id"), str) else None)  # type: ignore
     metadata_filters: dict[str, str] = {}
-    cfg_metadata = cfg.get("metadata")
-    if isinstance(cfg_metadata, dict):
-        for key, value in cfg_metadata.items():
-            metadata_filters[str(key)] = str(value)
-    elif isinstance(cfg_metadata, list):
-        for item in cfg_metadata:
-            if isinstance(item, str) and "=" in item:
-                key, value = item.split("=", 1)
-                metadata_filters[key.strip()] = value.strip()
+    if eval_cfg:
+        metadata_filters.update(eval_cfg.metadata)
+    else:
+        cfg_metadata = cfg.get("metadata")
+        if isinstance(cfg_metadata, dict):
+            for key, value in cfg_metadata.items():
+                metadata_filters[str(key)] = str(value)
+        elif isinstance(cfg_metadata, list):
+            for item in cfg_metadata:
+                if isinstance(item, str) and "=" in item:
+                    key, value = item.split("=", 1)
+                    metadata_filters[key.strip()] = value.strip()
     for item in metadata or ():
         if "=" not in item:
@@ -3334,11 +3427,14 @@ def eval_command(
         metadata_filters[key] = value
     metadata_sql_query: str | None = None
-    cfg_metadata_sql = cfg.get("metadata_sql")
-    if isinstance(cfg_metadata_sql, dict):
-        metadata_sql_query = cfg_metadata_sql.get("query") or cfg_metadata_sql.get("sql")
-    elif isinstance(cfg_metadata_sql, str):
-        metadata_sql_query = cfg_metadata_sql
+    if eval_cfg and eval_cfg.metadata_sql:
+        metadata_sql_query = eval_cfg.metadata_sql
+    else:
+        cfg_metadata_sql = cfg.get("metadata_sql")
+        if isinstance(cfg_metadata_sql, dict):
+            metadata_sql_query = cfg_metadata_sql.get("query") or cfg_metadata_sql.get("sql")
+        elif isinstance(cfg_metadata_sql, str):
+            metadata_sql_query = cfg_metadata_sql
     if metadata_sql:
         metadata_sql_query = metadata_sql
@@ -3780,18 +3876,52 @@ def eval_command(
             async def _run_seed(seed_val: int) -> None:
                 nonlocal successes, failures, outcome_sum, outcome_count, outcome_correct, records
+                # Read env_name and policy_name from config if available
+                env_name = cfg.get("env_name") or (cfg.get("env", {}).get("env_name") if isinstance(cfg.get("env"), dict) else None)
+                policy_name = cfg.get("policy_name") or (cfg.get("policy", {}).get("policy_name") if isinstance(cfg.get("policy"), dict) else None)
+                env_config_overrides = cfg.get("env_config", {}) if isinstance(cfg.get("env_config"), dict) else {}
+                policy_config_overrides = cfg.get("policy_config", {}) if isinstance(cfg.get("policy_config"), dict) else {}
+                # Debug: print config parsing
+                if seed_val == 0:
+                    click.echo(f"[DEBUG] env_name from config: {env_name}")
+                    click.echo(f"[DEBUG] policy_name from config: {policy_name}")
+                # Generate default ops sequence if not provided
+                max_llm_calls = policy_config_overrides.get("max_llm_calls", 10)
+                ops_list = cfg.get("ops", [])
+                if not ops_list:
+                    # Generate default "agent, env" pairs for max_llm_calls
+                    ops_list = ["agent", "env"] * int(max_llm_calls)
                 body = {
                     "run_id": str(uuid.uuid4()),
-                    "env": {"config": {"split": split, "index": seed_val}, "seed": seed_val},
+                    "env": {"config": {"split": split, "index": seed_val, **env_config_overrides}, "seed": seed_val},
                     "policy": {
-                        "policy_name": selected_model,
-                        "config": {"model": selected_model, **policy_overrides},
+                        "policy_name": policy_name or selected_model,
+                        "config": {"model": selected_model, **policy_overrides, **policy_config_overrides},
+                    },
+                    "ops": ops_list,
+                    "record": {
+                        "return_trace": cfg.get("return_trace", True),
+                        "trace_format": cfg.get("trace_format", "structured"),
                     },
-                    "ops": [],
+                    "mode": "eval",  # RolloutMode.EVAL: use inference URLs as-is, no transformations
                 }
+                if env_name:
+                    body["env"]["env_name"] = env_name
+                # Debug: print the body being sent
+                if seed_val == 0:
+                    click.echo(f"[DEBUG] rollout body env: {body['env']}")
+                    click.echo(f"[DEBUG] rollout body policy: {body['policy']}")
+                    click.echo(f"[DEBUG] rollout body mode: {body.get('mode', 'NOT SET')}")
                 rollout_elapsed: float | None = None
                 rollout_start = time.perf_counter()
                 try:
+                    import logging
+                    _log = logging.getLogger(__name__)
+                    _log.info(f"[EVAL_BODY_DEBUG] Sending body with mode={body.get('mode')}")
                     async with semaphore:
                         response = await async_client.post("/rollout", json=body)
                     rollout_elapsed = time.perf_counter() - rollout_start
@@ -3812,6 +3942,10 @@ def eval_command(
                     data = response.json()
                 except Exception:
                     data = None
+                # Debug: print validation errors
+                if response.status_code == 422 and data:
+                    click.echo(f"[DEBUG] 422 Validation Error: {data}")
                 metrics: dict[str, Any] | None = None
                 completion: str | None = None
@@ -3825,16 +3959,33 @@ def eval_command(
                 session_trace_dict: dict[str, Any] | None = None
                 if isinstance(data, dict):
+                    import logging
+                    _logger = logging.getLogger(__name__)
+                    _logger.info(f"[EVAL_DEBUG] Response data keys: {list(data.keys())}")
+                    if "detail" in data:
+                        _logger.error(f"[EVAL_DEBUG] Task app returned error: {data['detail']}")
                     trace_namespace = data.get("trace")
+                    _logger.info(f"[EVAL_DEBUG] trace_namespace type: {type(trace_namespace)}, value: {trace_namespace if not isinstance(trace_namespace, dict) else 'dict with keys: ' + str(list(trace_namespace.keys()) if trace_namespace else 'None')}")
                     if not isinstance(trace_namespace, dict):
                         raise RuntimeError(
-                            "rollout response missing trace payload; task app must return tracing_v3 data"
+                            "The 'synth-ai eval' command requires trace payloads in rollout responses. "
+                            "Ensure the rollout request includes 'trace_format': 'structured' and 'return_trace': true, "
+                            "and that task app tracing is enabled (TASKAPP_TRACING_ENABLED=1). "
+                            "Note: This is specific to the eval command - general rollout endpoints don't require traces."
                         )
+                    # Handle both "compact" and "full" trace formats:
+                    # - compact: trace_namespace contains {session_id, metadata, ...}
+                    # - full: trace_namespace IS the full session_trace dict
                     session_trace_dict = trace_namespace.get("session_trace")
                     if not isinstance(session_trace_dict, dict):
-                        raise RuntimeError(
-                            "rollout response trace missing 'session_trace'; ensure the task app is serving the tracing_v3 build"
-                        )
+                        # If no session_trace key, assume "full" format where trace itself is the session_trace
+                        if "session_id" in trace_namespace:
+                            session_trace_dict = trace_namespace
+                        else:
+                            raise RuntimeError(
+                                "The 'synth-ai eval' command requires 'session_trace' in the trace payload or a valid full trace format. "
+                                "Ensure the task app is using tracing_v3 and returning structured trace data."
+                            )
                     metrics = data.get("metrics") if isinstance(data.get("metrics"), dict) else None
                     if metrics:
                         mean_return = metrics.get("mean_return") or metrics.get("total_reward")
@@ -3956,26 +4107,27 @@ def eval_command(
                     for spec in judge_specs:
                         score_value: float | None = None
                         judge_elapsed: float | None = None
-                        if completion is not None:
-                            judge_payload = {
-                                "seed": seed_val,
-                                "prompt_index": prompt_index,
-                                "prompt": prompt_text,
-                                "completion": completion,
-                                "metrics": metrics,
-                                "response": data,
-                                "trace": trace_namespace,
-                            }
-                            try:
-                                judge_start = time.perf_counter()
-                                result = spec.fn(judge_payload, **spec.kwargs)
+                        # Run judges for all tasks (text-based and trajectory-based)
+                        # Text-based tasks have completion, trajectory-based tasks use response
+                        judge_payload = {
+                            "seed": seed_val,
+                            "prompt_index": prompt_index,
+                            "prompt": prompt_text,
+                            "completion": completion,
+                            "metrics": metrics,
+                            "response": data,
+                            "trace": trace_namespace,
+                        }
+                        try:
+                            judge_start = time.perf_counter()
+                            result = spec.fn(judge_payload, **spec.kwargs)
+                            judge_elapsed = time.perf_counter() - judge_start
+                            if isinstance(result, int | float):
+                                score_value = float(result)
+                        except Exception as exc:
+                            if judge_elapsed is None:
                                 judge_elapsed = time.perf_counter() - judge_start
-                                if isinstance(result, int | float):
-                                    score_value = float(result)
-                            except Exception as exc:
-                                if judge_elapsed is None:
-                                    judge_elapsed = time.perf_counter() - judge_start
-                                click.echo(f"seed={seed_val} judge[{spec.name}]_error={exc}")
+                            click.echo(f"seed={seed_val} judge[{spec.name}]_error={exc}")
                         judges_timings[spec.name] = judge_elapsed
                         judge_scores[spec.name] = score_value
@@ -4129,6 +4281,9 @@ def filter_command(config_path: str) -> None:
     high-quality traces. See `customers/agora_single_file/configs/filter_local.toml`
     for a working example.
     """
+    # Parse and validate TOML config
+    from synth_ai.task.config import FilterConfig
     if _toml is None:
         raise click.ClickException("TOML parser not available; install tomli or use Python 3.11+")
@@ -4141,58 +4296,37 @@ def filter_command(config_path: str) -> None:
     except Exception as exc:
         raise click.ClickException(f"Failed to parse TOML '{cfg_path}': {exc}") from exc
-    filter_cfg = config_data.get("filter") if isinstance(config_data, dict) else None
-    if not isinstance(filter_cfg, dict):
+    filter_cfg_dict = config_data.get("filter") if isinstance(config_data, dict) else None
+    if not isinstance(filter_cfg_dict, dict):
         raise click.ClickException("Config must contain a [filter] table")
-    db_value = str(filter_cfg.get("db", "traces/v3/eval_traces.db")).strip()
-    if not db_value:
-        raise click.ClickException("filter.db must be provided")
-    if "://" in db_value:
-        db_url = db_value
-    else:
-        db_path = Path(db_value).expanduser()
-        db_path.parent.mkdir(parents=True, exist_ok=True)
-        db_url = f"sqlite+aiosqlite:///{db_path}"
-    output_value = filter_cfg.get("output")
-    if not output_value:
-        raise click.ClickException("filter.output must be provided")
-    output_path = Path(str(output_value)).expanduser()
-    splits = set(filter_cfg.get("splits", []) or [])
-    task_ids = set(filter_cfg.get("task_ids", []) or [])
-    models = set(filter_cfg.get("models", []) or [])
-    min_official = filter_cfg.get("min_official_score")
-    max_official = filter_cfg.get("max_official_score")
-    if min_official is not None:
-        try:
-            min_official = float(min_official)
-        except Exception as err:
-            raise click.ClickException("filter.min_official_score must be numeric") from err
-    if max_official is not None:
-        try:
-            max_official = float(max_official)
-        except Exception as err:
-            raise click.ClickException("filter.max_official_score must be numeric") from err
-    min_judge_scores = filter_cfg.get("min_judge_scores", {}) or {}
-    max_judge_scores = filter_cfg.get("max_judge_scores", {}) or {}
+    # Validate config with dataclass
     try:
-        min_judge_scores = {k: float(v) for k, v in min_judge_scores.items()}
-    except Exception as err:
-        raise click.ClickException("filter.min_judge_scores values must be numeric") from err
-    try:
-        max_judge_scores = {k: float(v) for k, v in max_judge_scores.items()}
-    except Exception as err:
-        raise click.ClickException("filter.max_judge_scores values must be numeric") from err
-    min_created = _parse_datetime_for_trace(filter_cfg.get("min_created_at"))
-    max_created = _parse_datetime_for_trace(filter_cfg.get("max_created_at"))
-    limit = filter_cfg.get("limit")
-    if limit is not None:
-        try:
-            limit = int(limit)
-        except Exception as err:
-            raise click.ClickException("filter.limit must be an integer") from err
+        filter_cfg = FilterConfig.from_dict(filter_cfg_dict)
+        click.echo(f"✓ Config validated: db={filter_cfg.db}, output={filter_cfg.output}")
+        if filter_cfg.min_official_score is not None:
+            click.echo(f"  → Filtering for official score >= {filter_cfg.min_official_score}")
+        if filter_cfg.limit:
+            click.echo(f"  → Limiting to {filter_cfg.limit} examples")
+    except (ValueError, TypeError) as validation_error:
+        raise click.ClickException(f"Invalid filter config: {validation_error}") from validation_error
+    # Use validated config
+    db_url = filter_cfg.get_db_url()
+    output_path = filter_cfg.get_output_path()
+    # Extract validated fields from dataclass
+    splits = set(filter_cfg.splits)
+    task_ids = set(filter_cfg.task_ids)
+    models = set(filter_cfg.models)
+    min_official = filter_cfg.min_official_score
+    max_official = filter_cfg.max_official_score
+    min_judge_scores = filter_cfg.min_judge_scores
+    max_judge_scores = filter_cfg.max_judge_scores
+    # Note: min_created_at and max_created_at not yet in FilterConfig dataclass
+    min_created = _parse_datetime_for_trace(filter_cfg_dict.get("min_created_at"))
+    max_created = _parse_datetime_for_trace(filter_cfg_dict.get("max_created_at"))
+    limit = filter_cfg.limit
     def _score_ok(value: Any, min_val: Any, max_val: Any) -> bool:
         try:
@@ -4247,8 +4381,21 @@ def filter_command(config_path: str) -> None:
             if max_created and (created_at_dt is None or created_at_dt > max_created):
                 continue
-            if not _score_ok(metadata.get("official_score"), min_official, max_official):
-                continue
+            # Check against outcome_rewards if score filter is set
+            total_reward = None
+            achievements_count = None
+            if min_official is not None or max_official is not None:
+                reward_query = "SELECT total_reward, achievements_count FROM outcome_rewards WHERE session_id = :session_id"
+                reward_rows = await tracer.db.query_traces(reward_query, {"session_id": session_id})
+                reward_records = reward_rows.to_dict("records") if hasattr(reward_rows, "to_dict") else []
+                if reward_records:
+                    total_reward = reward_records[0].get("total_reward")
+                    achievements_count = reward_records[0].get("achievements_count")
+                    if not _score_ok(total_reward, min_official, max_official):
+                        continue
+                elif min_official is not None:
+                    # No reward found, but score filter requires it
+                    continue
             judge_scores = metadata.get("judge_scores") or {}
             include = True
@@ -4265,30 +4412,134 @@ def filter_command(config_path: str) -> None:
             if not include:
                 continue
-            prompt = metadata.get("prompt") or ""
-            completion = metadata.get("completion") or ""
-            if not prompt or not completion:
+            # Query messages for this session
+            messages_query = """
+                SELECT message_type, content, timestamp
+                FROM messages
+                WHERE session_id = :session_id
+                ORDER BY timestamp ASC, id ASC
+            """
+            msg_df = await tracer.db.query_traces(messages_query, {"session_id": session_id})
+            message_rows = msg_df.to_dict("records") if hasattr(msg_df, "to_dict") else []
+            if not message_rows:
+                # Fallback: check if prompt/completion in metadata (old format)
+                prompt = metadata.get("prompt") or ""
+                completion = metadata.get("completion") or ""
+                if prompt and completion:
+                    record = {
+                        "messages": [
+                            {"role": "user", "content": str(prompt)},
+                            {"role": "assistant", "content": str(completion)},
+                        ],
+                        "metadata": {
+                            "session_id": session_id,
+                            "env_name": metadata.get("env_name"),
+                            "policy_name": metadata.get("policy_name"),
+                            "seed": metadata.get("seed"),
+                            "total_reward": total_reward,
+                            "achievements_count": achievements_count,
+                            "model": metadata.get("model"),
+                            "created_at": created_at_dt.isoformat() if created_at_dt else created_at_raw,
+                        },
+                    }
+                    accepted.append(record)
                 continue
-            record = {
-                "messages": [
-                    {"role": "user", "content": str(prompt)},
-                    {"role": "assistant", "content": str(completion)},
-                ],
-                "metadata": {
-                    "session_id": session_id,
-                    "task_id": metadata.get("task_id"),
-                    "task_split": metadata.get("task_split"),
-                    "task_rubric_id": metadata.get("task_rubric_id"),
-                    "official_score": metadata.get("official_score"),
-                    "judge_scores": judge_scores,
-                    "model": metadata.get("model"),
-                    "created_at": created_at_dt.isoformat() if created_at_dt else created_at_raw,
-                    "prompt": prompt,
-                    "completion": completion,
-                },
-            }
-            accepted.append(record)
+            # Extract user/assistant pairs from messages
+            for i, msg_row in enumerate(message_rows):
+                msg_type = msg_row.get("message_type")
+                content_raw = msg_row.get("content")
+                # Look for user message
+                if msg_type in ("user", "policy_user_prompt"):
+                    # Find next policy_system_prompt or assistant
+                    assistant_msg = None
+                    for j in range(i + 1, len(message_rows)):
+                        next_type = message_rows[j].get("message_type")
+                        if next_type in ("assistant", "policy_system_prompt"):
+                            if next_type == "assistant":
+                                assistant_msg = message_rows[j]
+                            break
+                    # Parse content
+                    try:
+                        user_content = json.loads(content_raw) if isinstance(content_raw, str) else content_raw
+                    except Exception:
+                        user_content = content_raw
+                    # If user_content is a message dict with a 'content' key, extract it
+                    if isinstance(user_content, dict) and "content" in user_content:
+                        user_content = user_content["content"]
+                    # Extract text from structured content
+                    def extract_text(content: Any) -> str:
+                        if isinstance(content, str):
+                            return content
+                        if isinstance(content, dict):
+                            # Try payload.content for user prompts
+                            if "payload" in content and isinstance(content["payload"], dict):
+                                payload = content["payload"]
+                                if "content" in payload:
+                                    return extract_text(payload["content"])
+                            # Try common keys
+                            for key in ["text", "content", "content_text"]:
+                                if key in content:
+                                    val = content[key]
+                                    if isinstance(val, str):
+                                        return val
+                            return json.dumps(content)
+                        if isinstance(content, list):
+                            # Multimodal content - concatenate text parts
+                            parts = []
+                            for item in content:
+                                if isinstance(item, dict) and item.get("type") == "text":
+                                    parts.append(item.get("text", ""))
+                            return " ".join(parts) if parts else str(content)
+                        return str(content)
+                    user_text = extract_text(user_content)
+                    # For assistant, we might not have it recorded, so use tool calls as completion
+                    assistant_text = ""
+                    assistant_content = None
+                    if assistant_msg:
+                        assistant_content_raw = assistant_msg.get("content")
+                        try:
+                            assistant_content = json.loads(assistant_content_raw) if isinstance(assistant_content_raw, str) else assistant_content_raw
+                        except Exception:
+                            assistant_content = assistant_content_raw
+                        # If assistant_content is a message dict with a 'content' key, extract it
+                        if isinstance(assistant_content, dict) and "content" in assistant_content:
+                            assistant_content = assistant_content["content"]
+                        assistant_text = extract_text(assistant_content)
+                    if not user_text:
+                        continue
+                    # Use full multimodal content if it's a list (contains images), otherwise use text
+                    user_content_for_message = user_content if isinstance(user_content, list) else user_text
+                    assistant_content_for_message = assistant_content if isinstance(assistant_content, list) else (assistant_text if assistant_text else "[no response recorded]")
+                    record = {
+                        "messages": [
+                            {"role": "user", "content": user_content_for_message},
+                            {"role": "assistant", "content": assistant_content_for_message},
+                        ],
+                        "metadata": {
+                            "session_id": session_id,
+                            "env_name": metadata.get("env_name"),
+                            "policy_name": metadata.get("policy_name"),
+                            "seed": metadata.get("seed"),
+                            "total_reward": total_reward,
+                            "achievements_count": achievements_count,
+                            "model": metadata.get("model"),
+                            "created_at": created_at_dt.isoformat() if created_at_dt else created_at_raw,
+                        },
+                    }
+                    accepted.append(record)
         if not accepted:
             raise click.ClickException("No sessions matched the provided filters")

synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl