PyPI - synth-ai - Versions diffs - 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl - Mend

synth-ai 0.2.16 (py3-none-any.whl) → 0.2.19 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (299) hide show

examples/analyze_semantic_words.sh +2 -2
examples/baseline/banking77_baseline.py +204 -0
examples/baseline/crafter_baseline.py +407 -0
examples/baseline/pokemon_red_baseline.py +326 -0
examples/baseline/simple_baseline.py +56 -0
examples/baseline/warming_up_to_rl_baseline.py +239 -0
examples/blog_posts/gepa/README.md +355 -0
examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
examples/blog_posts/gepa/gepa_baseline.py +204 -0
examples/blog_posts/gepa/query_prompts_example.py +97 -0
examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
examples/blog_posts/gepa/task_apps.py +105 -0
examples/blog_posts/gepa/test_gepa_local.sh +67 -0
examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
examples/blog_posts/pokemon_vl/README.md +98 -0
examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
examples/blog_posts/pokemon_vl/extract_images.py +239 -0
examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
examples/blog_posts/warming_up_to_rl/README.md +158 -0
examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
examples/multi_step/configs/verilog_rl_lora.toml +80 -123
examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
examples/qwen_coder/configs/coder_lora_small.toml +1 -3
examples/qwen_vl/README.md +10 -12
examples/qwen_vl/SETUP_COMPLETE.md +7 -8
examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
examples/qwen_vl/collect_data_via_cli.md +76 -84
examples/qwen_vl/collect_vision_traces.py +4 -4
examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
examples/qwen_vl/run_vision_comparison.sh +6 -7
examples/rl/README.md +5 -5
examples/rl/configs/rl_from_base_qwen.toml +26 -1
examples/rl/configs/rl_from_base_qwen17.toml +6 -2
examples/rl/task_app/README.md +1 -2
examples/rl/task_app/math_single_step.py +2 -2
examples/run_crafter_demo.sh +2 -2
examples/sft/README.md +1 -1
examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
examples/swe/task_app/README.md +32 -2
examples/swe/task_app/grpo_swe_mini.py +4 -0
examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
examples/swe/task_app/hosted/inference/openai_client.py +4 -38
examples/swe/task_app/hosted/policy_routes.py +17 -0
examples/swe/task_app/hosted/rollout.py +4 -2
examples/swe/task_app/morph_backend.py +178 -0
examples/task_apps/banking77/__init__.py +6 -0
examples/task_apps/banking77/banking77_task_app.py +841 -0
examples/task_apps/banking77/deploy_wrapper.py +46 -0
examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
examples/task_apps/crafter/task_app/README.md +1 -1
examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
examples/task_apps/gepa_benchmarks/__init__.py +7 -0
examples/task_apps/gepa_benchmarks/common.py +260 -0
examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
examples/task_apps/math/README.md +1 -2
examples/task_apps/pokemon_red/README.md +3 -4
examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
examples/task_apps/pokemon_red/task_app.py +288 -39
examples/task_apps/sokoban/README.md +2 -3
examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
examples/warming_up_to_rl/task_app/README.md +1 -1
examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
synth_ai/api/train/builders.py +99 -4
synth_ai/api/train/cli.py +516 -26
synth_ai/api/train/config_finder.py +13 -2
synth_ai/api/train/configs/__init__.py +23 -2
synth_ai/api/train/configs/prompt_learning.py +442 -0
synth_ai/api/train/configs/rl.py +61 -7
synth_ai/api/train/configs/sft.py +6 -2
synth_ai/api/train/configs/shared.py +59 -2
synth_ai/api/train/task_app.py +1 -1
synth_ai/api/train/validators.py +277 -0
synth_ai/auth/credentials.py +119 -0
synth_ai/baseline/__init__.py +25 -0
synth_ai/baseline/config.py +209 -0
synth_ai/baseline/discovery.py +214 -0
synth_ai/baseline/execution.py +146 -0
synth_ai/cli/__init__.py +94 -18
synth_ai/cli/__main__.py +0 -0
synth_ai/cli/claude.py +70 -0
synth_ai/cli/codex.py +84 -0
synth_ai/cli/commands/__init__.py +18 -0
synth_ai/cli/commands/baseline/__init__.py +12 -0
synth_ai/cli/commands/baseline/core.py +637 -0
synth_ai/cli/commands/baseline/list.py +93 -0
synth_ai/cli/commands/demo/__init__.py +6 -0
synth_ai/cli/commands/demo/core.py +163 -0
synth_ai/cli/commands/eval/__init__.py +19 -0
synth_ai/cli/commands/eval/core.py +1112 -0
synth_ai/cli/commands/eval/errors.py +81 -0
synth_ai/cli/commands/eval/validation.py +133 -0
synth_ai/cli/commands/filter/__init__.py +12 -0
synth_ai/cli/commands/filter/core.py +424 -0
synth_ai/cli/commands/filter/errors.py +55 -0
synth_ai/cli/commands/filter/validation.py +77 -0
synth_ai/cli/commands/help/__init__.py +177 -0
synth_ai/cli/commands/help/core.py +72 -0
synth_ai/cli/commands/smoke/__init__.py +7 -0
synth_ai/cli/commands/smoke/core.py +1436 -0
synth_ai/cli/commands/status/__init__.py +64 -0
synth_ai/cli/commands/status/client.py +192 -0
synth_ai/cli/commands/status/config.py +92 -0
synth_ai/cli/commands/status/errors.py +20 -0
synth_ai/cli/commands/status/formatters.py +164 -0
synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
synth_ai/cli/commands/status/subcommands/files.py +79 -0
synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
synth_ai/cli/commands/status/subcommands/models.py +79 -0
synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
synth_ai/cli/commands/status/subcommands/runs.py +81 -0
synth_ai/cli/commands/status/subcommands/summary.py +47 -0
synth_ai/cli/commands/status/subcommands/usage.py +203 -0
synth_ai/cli/commands/status/utils.py +114 -0
synth_ai/cli/commands/train/__init__.py +53 -0
synth_ai/cli/commands/train/core.py +21 -0
synth_ai/cli/commands/train/errors.py +117 -0
synth_ai/cli/commands/train/judge_schemas.py +200 -0
synth_ai/cli/commands/train/judge_validation.py +305 -0
synth_ai/cli/commands/train/validation.py +386 -0
synth_ai/cli/demo.py +30 -158
synth_ai/cli/deploy/__init__.py +43 -0
synth_ai/cli/deploy.py +162 -0
synth_ai/cli/eval/__init__.py +36 -0
synth_ai/cli/eval/core.py +5 -0
synth_ai/cli/eval/errors.py +31 -0
synth_ai/cli/eval/validation.py +5 -0
synth_ai/cli/filter/__init__.py +28 -0
synth_ai/cli/filter/core.py +5 -0
synth_ai/cli/filter/errors.py +23 -0
synth_ai/cli/filter/validation.py +5 -0
synth_ai/cli/legacy_root_backup.py +14 -8
synth_ai/cli/modal_serve/__init__.py +12 -0
synth_ai/cli/modal_serve/core.py +14 -0
synth_ai/cli/modal_serve/errors.py +8 -0
synth_ai/cli/modal_serve/validation.py +11 -0
synth_ai/cli/opencode.py +107 -0
synth_ai/cli/root.py +9 -5
synth_ai/cli/serve/__init__.py +12 -0
synth_ai/cli/serve/core.py +14 -0
synth_ai/cli/serve/errors.py +8 -0
synth_ai/cli/serve/validation.py +11 -0
synth_ai/cli/setup.py +20 -265
synth_ai/cli/status.py +7 -126
synth_ai/cli/task_app_deploy.py +1 -10
synth_ai/cli/task_app_modal_serve.py +4 -9
synth_ai/cli/task_app_serve.py +4 -11
synth_ai/cli/task_apps.py +51 -1480
synth_ai/cli/train/__init__.py +12 -0
synth_ai/cli/train/core.py +21 -0
synth_ai/cli/train/errors.py +8 -0
synth_ai/cli/train/validation.py +24 -0
synth_ai/cli/train.py +1 -14
synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
synth_ai/environments/examples/red/engine.py +33 -12
synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
synth_ai/environments/examples/red/environment.py +26 -0
synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
synth_ai/http.py +12 -0
synth_ai/judge_schemas.py +10 -10
synth_ai/learning/__init__.py +10 -0
synth_ai/learning/prompt_learning_client.py +276 -0
synth_ai/learning/prompt_learning_types.py +184 -0
synth_ai/learning/rl/client.py +3 -1
synth_ai/pricing/__init__.py +2 -0
synth_ai/pricing/model_pricing.py +57 -0
synth_ai/streaming/__init__.py +29 -0
synth_ai/streaming/config.py +94 -0
synth_ai/streaming/handlers.py +518 -0
synth_ai/streaming/streamer.py +320 -0
synth_ai/streaming/types.py +95 -0
synth_ai/task/apps/__init__.py +1 -0
synth_ai/task/config.py +2 -0
synth_ai/task/tracing_utils.py +25 -25
synth_ai/task/validators.py +45 -9
synth_ai/task_app_cfgs.py +21 -0
synth_ai/tracing_v3/config.py +162 -19
synth_ai/tracing_v3/constants.py +1 -1
synth_ai/tracing_v3/db_config.py +24 -38
synth_ai/tracing_v3/migration_helper.py +1 -2
synth_ai/tracing_v3/storage/config.py +47 -13
synth_ai/tracing_v3/storage/factory.py +3 -3
synth_ai/tracing_v3/turso/daemon.py +113 -11
synth_ai/tracing_v3/turso/native_manager.py +92 -16
synth_ai/types.py +8 -0
synth_ai/urls.py +11 -0
synth_ai/utils/__init__.py +30 -1
synth_ai/utils/agents.py +74 -0
synth_ai/utils/bin.py +39 -0
synth_ai/utils/cli.py +149 -5
synth_ai/utils/env.py +40 -33
synth_ai/utils/http.py +4 -1
synth_ai/utils/json.py +72 -0
synth_ai/utils/modal.py +285 -3
synth_ai/utils/paths.py +48 -0
synth_ai/utils/uvicorn.py +113 -0
{synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
{synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
synth_ai/cli/tui.py +0 -62
synth_ai/tui/__init__.py +0 -5
synth_ai/tui/__main__.py +0 -13
synth_ai/tui/cli/__init__.py +0 -1
synth_ai/tui/cli/query_experiments.py +0 -164
synth_ai/tui/cli/query_experiments_v3.py +0 -164
synth_ai/tui/dashboard.py +0 -911
{synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
{synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
{synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
{synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0

examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py CHANGED Viewed

@@ -4,6 +4,7 @@ import contextlib
 import logging
 import os
 from datetime import datetime
+import asyncio
 from typing import Any
 from fastapi import APIRouter, HTTPException, Request
@@ -35,6 +36,13 @@ logger = logging.getLogger(__name__)
 router = APIRouter()
+# Global concurrency limit for outbound inference to avoid backend overload/timeouts
+try:
+    _INFERENCE_CONCURRENCY = int(os.getenv("INFERENCE_CONCURRENCY", "2") or "2")
+except Exception:  # pragma: no cover
+    _INFERENCE_CONCURRENCY = 2
+_inference_sem = asyncio.Semaphore(max(1, _INFERENCE_CONCURRENCY))
 class PolicyCreateRequest(BaseModel):
     policy_name: str
@@ -250,6 +258,11 @@ async def step_policy(
         task_app = req.app.state.task_app
         policy = handle.policy
         tracing_context = getattr(req.state, "rollout_tracing", None)
+        if tracing_context is None:
+            print(
+                f"[TRACE_DEBUG] Missing tracing context on policy step; policy_id={request.policy_id}",
+                flush=True,
+            )
         obs_text = request.observation
         if isinstance(request.observation, dict):
@@ -462,6 +475,8 @@ async def step_policy(
                 )
             # Emit full system/user prompts for observability (no secrets included)
+            system_prompt_records: list[dict[str, Any]] = []
+            user_prompt_records: list[dict[str, Any]] = []
             try:
                 def _as_text(content: object) -> str:
@@ -481,8 +496,6 @@ async def step_policy(
                         return "".join(parts)
                     return str(content)
-                system_prompt_records: list[dict[str, Any]] = []
-                user_prompt_records: list[dict[str, Any]] = []
                 for message in msgs:
                     role = message.get("role")
                     raw_content = message.get("content")
@@ -525,6 +538,11 @@ async def step_policy(
             if tracing_context is not None:
                 try:
+                    logger.info(
+                        "[TRACE_DEBUG] record_policy_prompts sys=%s user=%s",
+                        len(system_prompt_records),
+                        len(user_prompt_records),
+                    )
                     await tracing_context.record_policy_prompts(
                         system_prompt_records, user_prompt_records
                     )
@@ -541,6 +559,14 @@ async def step_policy(
             # Ensure meta carries the final target URL for downstream logging/clients
             with contextlib.suppress(Exception):
+                # Bulletproof normalizer at the call site (in addition to client-side)
+                try:
+                    from examples.task_apps.crafter.task_app.synth_envs_hosted.utils import (
+                        force_normalize_chat_completions_url,
+                    )
+                    target_url = force_normalize_chat_completions_url(target_url)
+                except Exception:
+                    pass
                 sanitized_target = ensure_chat_completions_url(target_url)
                 if sanitized_target and sanitized_target != target_url:
                     logger.warning(
@@ -589,6 +615,28 @@ async def step_policy(
             except Exception:
                 api_key_override = None
+            # Fallback: If target is OpenAI but OPENAI_API_KEY is missing, route to Synth API
+            try:
+                import os as _os2
+                _low = str(target_url or "").lower()
+                if ("api.openai.com" in _low) and not (_os2.getenv("OPENAI_API_KEY")):
+                    # Prefer task_app.synth_base_url if available; else default
+                    synth_base = getattr(task_app, "synth_base_url", None)
+                    if isinstance(synth_base, str) and synth_base.strip():
+                        base = synth_base.rstrip("/")
+                        fallback = base + "/inference/v1/chat/completions"
+                    else:
+                        fallback = "https://api.synth.run/api/inference/v1/chat/completions"
+                    fixed = ensure_chat_completions_url(fallback)
+                    logger.warning(
+                        "POLICY_STEP: OPENAI key missing; falling back to Synth route %s",
+                        fixed,
+                    )
+                    meta["inference_url"] = fixed
+                    target_url = fixed
+            except Exception:
+                pass
             if api_key_override:
                 try:
                     masked = f"{api_key_override[:6]}…{api_key_override[-4:]}"
@@ -780,9 +828,10 @@ async def step_policy(
                 "sokoban-react",
                 "crafter-react",
             ) and getattr(policy, "use_tools", True):
-                req_tools = meta["inference_request"]["tools"]
-                req_tool_choice = meta["inference_request"]["tool_choice"]
-                req_stop_after = meta["inference_request"]["stop_after_tool_calls"]
+                inf_req = meta.get("inference_request", {})
+                req_tools = inf_req.get("tools")
+                req_tool_choice = inf_req.get("tool_choice")
+                req_stop_after = inf_req.get("stop_after_tool_calls")
                 logger.info(
                     f"TOOLCALL_CONFIG: policy={policy_name} tools_present={bool(req_tools)} tool_choice={req_tool_choice} stop_after={req_stop_after}"
                 )
@@ -791,6 +840,8 @@ async def step_policy(
                         status_code=500,
                         detail=f"TOOLCALL_ASSERTION_FAIL: Missing tools or tool_choice!=required for policy {policy_name}",
                     )
+                if req_stop_after is None:
+                    inf_req["stop_after_tool_calls"] = 1
             # Call inference service with retries for Flash cold-start (503)
             import time as _t
@@ -967,13 +1018,14 @@ async def step_policy(
             _t_start = _t.time()
             call_started_at = datetime.utcnow()
-            inference_response = await client.generate_with_retries(
-                request=meta["inference_request"],
-                base_url=meta["inference_url"],
-                max_retries=12,
-                backoff_factor=2.0,
-                extra_headers=extra_headers,
-            )
+            async with _inference_sem:
+                inference_response = await client.generate_with_retries(
+                    request=meta["inference_request"],
+                    base_url=meta["inference_url"],
+                    max_retries=12,
+                    backoff_factor=2.0,
+                    extra_headers=extra_headers,
+                )
             meta["inference_ms"] = int((_t.time() - _t_start) * 1000)
             call_completed_at = datetime.utcnow()
@@ -1053,6 +1105,23 @@ async def step_policy(
                 except Exception as exc:
                     logger.debug(f"TRACING_LLM_FAIL: {exc}")
+        if not tool_calls:
+            preview = ""
+            try:
+                preview = str(meta.get("raw_response") or "")[:400]
+            except Exception:
+                preview = "<unavailable>"
+            logger.error(
+                {
+                    "rollout.policy_step": True,
+                    "policy_id": request.policy_id,
+                    "error": "no_tool_calls",
+                    "inference_url": meta.get("inference_url"),
+                    "raw_preview": preview,
+                }
+            )
+            raise RuntimeError("Policy step produced no tool calls; inference response unusable.")
         return PolicyStepResponse(
             tool_calls=tool_calls,
             meta=meta,

examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py CHANGED Viewed

@@ -491,6 +491,10 @@ class RolloutTracingContext:
             getattr(request.record, "trace_format", "compact") or "compact"
         ).lower()
         self.return_trace = bool(getattr(request.record, "return_trace", False))
+        print(
+            f"[TRACE_DEBUG] RolloutTracingContext init: trace_format={self.trace_format} return_trace={self.return_trace}",
+            flush=True,
+        )
         self.sft_output_dir = getattr(fastapi_request.app.state, "sft_output_dir", None)
         self.session_trace = None
         self.metadata_updates: dict[str, Any] = {}
@@ -513,19 +517,24 @@ class RolloutTracingContext:
     async def start_session(self) -> None:
         if not self.enabled or self.tracer is None:
+            print("[TRACE_DEBUG] start_session skipped: tracer disabled", flush=True)
             return
         try:
             await self.tracer.initialize()
+            print("[TRACE_DEBUG] tracer initialized", flush=True)
         except Exception as exc:
             logger.debug("TRACING_INIT_FAIL: %s", exc)
+            # Hard fail: tracing requested but cannot initialize
+            raise
         try:
             await self.tracer.start_session(
                 session_id=self.run_id, metadata=dict(self.metadata_base)
             )
+            print(f"[TRACE_DEBUG] start_session succeeded for run_id={self.run_id}", flush=True)
         except Exception as exc:
             logger.info("TRACING_START_FAIL: %s", exc)
-            self.enabled = False
-            self.tracer = None
+            # Hard fail: tracing requested but cannot start session
+            raise
     async def start_decision(self, turn_number: int) -> None:
         self.current_turn = turn_number
@@ -590,7 +599,7 @@ class RolloutTracingContext:
         # Debug: Check message count
         if self.tracer and self.tracer._current_trace:
             msg_count = len(self.tracer._current_trace.markov_blanket_message_history)
-            logger.info(f"[TRACE_DEBUG] After record_policy_prompts: {msg_count} messages in trace")
+            print(f"[TRACE_DEBUG] After record_policy_prompts: {msg_count} messages", flush=True)
     def _content_to_text(self, content: Any) -> str:
         if isinstance(content, str):
@@ -664,11 +673,20 @@ class RolloutTracingContext:
             return
         if self.enabled and self.tracer is not None:
             try:
+                payload = {
+                    "role": "assistant",
+                    "tool_calls": tool_calls,
+                }
                 await self.tracer.record_message(
-                    content=self._safe_json(tool_calls),
-                    message_type="assistant",  # Map to standard assistant message type
+                    content=payload,
+                    message_type="assistant",
                     metadata={**self._message_metadata(), "is_tool_call": True},
                 )
+                if self.tracer._current_trace:
+                    print(
+                        f"[TRACE_DEBUG] After tool invocation: messages={len(self.tracer._current_trace.markov_blanket_message_history)}",
+                        flush=True,
+                    )
             except Exception as exc:
                 logger.debug("TRACING_TOOL_MSG_FAIL: %s", exc)
@@ -774,9 +792,33 @@ class RolloutTracingContext:
             }
         )
+        assistant_structured = assistant_content if assistant_content is not None else ""
+        assistant_text = self._content_to_text(assistant_content)
+        if self.enabled and self.tracer is not None:
+            assistant_payload: dict[str, Any] = {
+                "role": "assistant",
+                "content": assistant_structured,
+                "text": assistant_text,
+            }
+            if isinstance(assistant_message, dict):
+                if assistant_message.get("tool_calls"):
+                    assistant_payload["tool_calls"] = assistant_message.get("tool_calls")
+                if assistant_message.get("reasoning"):
+                    assistant_payload["reasoning"] = assistant_message.get("reasoning")
+                if assistant_message.get("thinking"):
+                    assistant_payload["thinking"] = assistant_message.get("thinking")
+            try:
+                await self.tracer.record_message(
+                    content=assistant_payload,
+                    message_type="assistant",
+                    metadata=self._message_metadata(),
+                )
+            except Exception as exc:
+                logger.debug("TRACING_ASSISTANT_MSG_FAIL: %s", exc)
         if self.sft_output_dir is not None:
             assistant_structured = assistant_content if assistant_content is not None else ""
-            assistant_text = self._content_to_text(assistant_content)
             dialogue_structured: list[dict[str, Any]] = []
             for content in self.latest_system_prompt_content:
                 if content is None:
@@ -941,17 +983,23 @@ class RolloutTracingContext:
                 # Debug: Check message count before end_session
                 if self.tracer._current_trace:
                     msg_count = len(self.tracer._current_trace.markov_blanket_message_history)
-                    logger.info(f"[TRACE_DEBUG] Before end_session: {msg_count} messages in trace")
+                    print(f"[TRACE_DEBUG] Before end_session: {msg_count} messages in trace", flush=True)
                 self.session_trace = await self.tracer.end_session()
                 # Debug: Check if session was saved
                 if self.session_trace:
-                    logger.info(f"[TRACE_DEBUG] Session ended successfully, session_id={self.session_trace.session_id}")
+                    print(
+                        f"[TRACE_DEBUG] Session ended successfully, session_id={self.session_trace.session_id}",
+                        flush=True,
+                    )
                     self.session_trace.metadata.update(self.metadata_updates)
-                    logger.info(f"[TRACE_DEBUG] session_trace.metadata keys: {list(self.session_trace.metadata.keys())}")
+                    print(
+                        f"[TRACE_DEBUG] session_trace.metadata keys: {list(self.session_trace.metadata.keys())}",
+                        flush=True,
+                    )
                 else:
-                    logger.warning("[TRACE_DEBUG] end_session returned None!")
+                    print("[TRACE_DEBUG] end_session returned None!", flush=True)
             except Exception as exc:
                 logger.warning(f"TRACING_END_SESSION_FAIL: {exc}", exc_info=True)
                 self.session_trace = None
@@ -991,6 +1039,10 @@ class RolloutTracingContext:
         if self.trace_format in ("full", "structured"):
             payload = session_trace.to_dict()
             payload.setdefault("metadata", {}).update(self.metadata_updates)
+            print(
+                f"[TRACE_DEBUG] build_trace_payload returning structured trace with messages={len(payload.get('markov_blanket_message_history') or [])}",
+                flush=True,
+            )
             return payload
         # For "compact" format, return only summary stats
@@ -1929,6 +1981,15 @@ async def execute_rollout(
         if 'policy_config_snapshot' not in locals():
             policy_config_snapshot = {}
+        # Normalize inference URL for trajectory (and ensure no path in query)
+        try:
+            from .utils import force_normalize_chat_completions_url, ensure_chat_completions_url
+            inference_url = force_normalize_chat_completions_url(inference_url)
+            # apply mode-aware normalization too (keeps cid, appends path if missing)
+            inference_url = ensure_chat_completions_url(inference_url, mode=request.mode)
+        except Exception:
+            pass
         logger.info(
             "ROLLOUT_TRAJECTORY: run_id=%s policy_id=%s inference_url=%s trace_id=%s",
             request.run_id,
@@ -2043,6 +2104,16 @@ async def execute_rollout(
         if metrics.num_steps <= 0:
             raise HTTPException(status_code=500, detail="no_steps_executed: avg_turns == 0")
+        # Ensure at least one tool call executed successfully
+        tool_call_executed = any(
+            isinstance(step.tool_calls, list) and len(step.tool_calls) > 0 for step in trajectory_steps
+        )
+        if not tool_call_executed:
+            raise HTTPException(
+                status_code=502,
+                detail="no_tool_calls_executed: model failed to produce actionable tool calls.",
+            )
         response = RolloutResponse(
             run_id=request.run_id,
             trajectories=[trajectory],

examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py CHANGED Viewed

@@ -11,6 +11,129 @@ logger = logging.getLogger(__name__)
 _CHAT_COMPLETIONS_SUFFIX = "/v1/chat/completions"
+def force_normalize_chat_completions_url(raw_url: Any) -> str:
+    """
+    Bulletproof normalizer: converts ANY malformed inference URL into the
+    correct chat-completions URL form.
+    Rules:
+    - Final path MUST end with /v1/chat/completions
+    - Query MUST NOT contain any '/' characters (no path segments in query)
+    - If the original query contained a path (e.g., '?cid=.../v1/chat/completions'),
+      extract that path and move it to the URL path; keep remaining query params
+    - Preserve scheme, host, port and existing query params order as much as possible
+    Examples:
+      https://host?cid=trace_123/v1/chat/completions
+        -> https://host/v1/chat/completions?cid=trace_123
+      https://host:8000?cid=trace_abc/v1/chat/completions&foo=bar
+        -> https://host:8000/v1/chat/completions?cid=trace_abc&foo=bar
+      https://host?cid=trace_123/v1/chat/completions?other=param
+        -> https://host/v1/chat/completions?cid=trace_123&other=param
+    Args:
+        raw_url: Candidate inference URL. Non-string values and blank strings
+            are returned unchanged, so the result is not always a ``str``
+            despite the return annotation.
+    Returns:
+        The repaired URL string, or ``raw_url`` itself when it is not a
+        non-blank string.
+    """
+    # Pass non-string inputs straight through rather than raising.
+    if not isinstance(raw_url, str):
+        return raw_url
+    url = raw_url.strip()
+    # Blank input: hand back the original (unstripped) value.
+    if not url:
+        return raw_url
+    parsed = urlparse(url)
+    path = (parsed.path or "").rstrip("/")
+    query = parsed.query or ""
+    # If query contains a path (has '/'), extract and repair
+    if query and "/" in query:
+        # Split query at the first '/' (everything before is real query params)
+        before_slash, after_slash = query.split("/", 1)
+        # after_slash may contain path and then more query params separated by '&' or '?' (malformed)
+        # Whichever of '&' / '?' appears first marks the end of the misplaced path.
+        sep_indices = [i for i in [after_slash.find("&"), after_slash.find("?")] if i >= 0]
+        cut_idx = min(sep_indices) if sep_indices else len(after_slash)
+        path_from_query = "/" + after_slash[:cut_idx]  # restore leading '/'
+        # Skip the separator character itself; empty when no separator was found.
+        extra_query = after_slash[cut_idx + 1 :] if cut_idx < len(after_slash) else ""
+        # Merge query params: base (before_slash) + extra_query
+        merged_query = before_slash
+        if extra_query:
+            merged_query = f"{merged_query}&{extra_query}" if merged_query else extra_query
+        # Decide final path
+        if path_from_query.startswith(_CHAT_COMPLETIONS_SUFFIX):
+            final_path = path_from_query
+        else:
+            final_path = f"{path_from_query.rstrip('/')}{_CHAT_COMPLETIONS_SUFFIX}"
+        # NOTE(review): any path already present on the URL is replaced here by the
+        # path lifted out of the query, not merged with it — confirm this is intended.
+        parsed = parsed._replace(path=final_path, query=merged_query)
+        url = urlunparse(parsed)
+        # Re-parse so path/query below reflect the repaired URL.
+        parsed = urlparse(url)
+        # NOTE(review): unlike the initial parse, path is not right-stripped here, so a
+        # lifted path with a trailing '/' fails the endswith check below and gets the
+        # suffix appended a second time — verify whether that input can occur.
+        path = parsed.path or ""
+        query = parsed.query or ""
+    # Ensure path ends with chat completions suffix
+    if not path.endswith(_CHAT_COMPLETIONS_SUFFIX):
+        new_path = f"{path}{_CHAT_COMPLETIONS_SUFFIX}" if path else _CHAT_COMPLETIONS_SUFFIX
+        parsed = parsed._replace(path=new_path)
+        url = urlunparse(parsed)
+        parsed = urlparse(url)
+        path = parsed.path or ""
+        query = parsed.query or ""
+    # Final validation: no '/' in query
+    # (extra params merged above may themselves still contain a '/')
+    if query and "/" in query:
+        # As a last resort, drop anything after the first '/'
+        safe_query = query.split("/")[0]
+        parsed = parsed._replace(query=safe_query)
+        url = urlunparse(parsed)
+    return url
+def _validate_url_structure(url: str, context: str = "") -> None:
+    """
+    Validate that a URL has correct structure (path before query, not vice versa).
+    Raises ValueError if URL is malformed.
+    Only the query string is inspected: any '/' in it is treated as a misplaced
+    path segment. Non-string or blank input is accepted silently.
+    Args:
+        url: The URL to validate
+        context: Optional context for error messages
+    Raises:
+        ValueError: If URL is malformed (path-like segments in query string)
+    """
+    # Nothing to validate for non-strings or blank strings.
+    if not isinstance(url, str) or not url.strip():
+        return
+    try:
+        parsed = urlparse(url)
+        query = parsed.query or ""
+        # CRITICAL CHECK: If query contains path-like segments (contains /), it's malformed
+        if query and "/" in query:
+            # Text after the first '/' is reported back as the offending segment.
+            # The inner conditional is always true here (the outer check guarantees '/').
+            path_segment = query.split("/", 1)[1] if "/" in query else ""
+            error_msg = (
+                f"FATAL [TASK_APP_URL_VALIDATION]: Malformed inference URL detected!\n"
+                f"\n"
+                f"URL: {url}\n"
+                f"Context: {context}\n"
+                f"\n"
+                f"The URL has a path-like segment ('/{path_segment}') in the query string.\n"
+                f"This indicates incorrect URL construction upstream.\n"
+                f"\n"
+                f"Expected: https://host/v1/chat/completions?cid=trace_123\n"
+                f"Malformed: https://host?cid=trace_123/v1/chat/completions\n"
+                f"\n"
+                f"This should be caught by the trainer, but if you see this,\n"
+                f"the trainer's URL validation may have failed.\n"
+            )
+            # Logged before raising so the failure is recorded even if a caller swallows it.
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+    except ValueError:
+        # Propagate the validation failure raised above unchanged.
+        # NOTE(review): a ValueError raised by urlparse itself (e.g. unmatched brackets
+        # in the netloc) would also propagate through this clause — confirm intended.
+        raise
+    except Exception as e:
+        # Any other failure is treated as best-effort: warn and let the caller continue.
+        logger.warning(f"[URL_VALIDATION] Failed to parse URL: {url} (context: {context}, error: {e})")
 def ensure_chat_completions_url(raw_url: Any, mode: str | None = None) -> Any:
     """
     Ensure inference URLs point at the chat completions endpoint.
@@ -43,9 +166,75 @@ def ensure_chat_completions_url(raw_url: Any, mode: str | None = None) -> Any:
     parsed = urlparse(url)
     path = (parsed.path or "").rstrip("/")
+    query = parsed.query
+    logger.debug(
+        "ensure_chat_completions_url: parsing url=%s -> path=%r query=%r",
+        url,
+        path,
+        query,
+    )
+    # CRITICAL: Check for malformed URLs (path in query) and fix them FIRST
+    # Example: https://host?cid=trace_123/v1/chat/completions
+    # Should be: https://host/v1/chat/completions?cid=trace_123
+    if query and "/" in query:
+        logger.error(
+            f"[URL_FIX] Detected malformed URL in ensure_chat_completions_url: {url}\n"
+            f"Path-like segment found in query string. Attempting to fix..."
+        )
+        # Split query at first "/" to separate query params from path
+        query_parts = query.split("/", 1)
+        if len(query_parts) == 2:
+            # query_parts[0] is the actual query (e.g., "cid=trace_123")
+            # query_parts[1] is the path that was incorrectly put in query
+            actual_query = query_parts[0]
+            path_and_more = query_parts[1]  # Could be "v1/chat/completions" or "v1/chat/completions&foo=bar"
+            # Extract the path part (everything before "&" or "?" if present)
+            # Handle both "&" (query param separator) and "?" (another malformed query separator)
+            if "&" in path_and_more:
+                # Path is followed by more query params (separated by &)
+                path_segment, extra_query = path_and_more.split("&", 1)
+                path_in_query = "/" + path_segment  # Restore leading slash
+                # Merge extra query params with actual_query
+                actual_query = f"{actual_query}&{extra_query}"
+            elif "?" in path_and_more:
+                # Path is followed by more query params (separated by ?, which is malformed)
+                path_segment, extra_query = path_and_more.split("?", 1)
+                path_in_query = "/" + path_segment  # Restore leading slash
+                # Merge extra query params with actual_query (use & as separator)
+                actual_query = f"{actual_query}&{extra_query}"
+            else:
+                # No extra query params, just the path
+                path_in_query = "/" + path_and_more  # Restore leading slash
+            # If the path_in_query already contains /v1/chat/completions, use it
+            # Otherwise, append /v1/chat/completions
+            if path_in_query.startswith("/v1/chat/completions"):
+                final_path = path_in_query
+            else:
+                # Append /v1/chat/completions to whatever path we found
+                final_path = path_in_query.rstrip("/") + "/v1/chat/completions"
+            # Reconstruct URL correctly: path comes before query
+            parsed = parsed._replace(path=final_path, query=actual_query)
+            fixed_url = urlunparse(parsed)
+            logger.warning(f"[URL_FIX] Fixed malformed URL:\n  FROM: {url}\n  TO:   {fixed_url}")
+            url = fixed_url
+            # Re-parse after fix
+            parsed = urlparse(url)
+            path = parsed.path.rstrip("/")
+            query = parsed.query
+        else:
+            # Can't parse - this shouldn't happen but validate will catch it
+            logger.error(f"[URL_FIX] Could not parse malformed query: {query}")
+            _validate_url_structure(url, context="ensure_chat_completions_url input - cannot fix")
     if path.endswith("/v1/chat/completions"):
         logger.debug("ensure_chat_completions_url: URL already normalized %s", url)
-        # Already targeting the desired endpoint; keep original to preserve trailing slash.
+        # Validate final URL
+        _validate_url_structure(url, context="ensure_chat_completions_url output")
         return url
     if not path:
@@ -55,6 +244,10 @@ def ensure_chat_completions_url(raw_url: Any, mode: str | None = None) -> Any:
     rebuilt = parsed._replace(path=new_path)
     normalized = urlunparse(rebuilt)
+    # CRITICAL: Validate the normalized URL
+    _validate_url_structure(normalized, context="ensure_chat_completions_url output")
     logger.info(
         "ensure_chat_completions_url: RL mode - normalized inference URL from %s to %s",
         url,

examples/task_apps/enron/task_app/grpo_enron_task_app.py CHANGED Viewed

@@ -2,7 +2,7 @@
 This mirrors the structure of the Crafter task app wrapper while delegating
 all configuration to the colocated `grpo_enron.py` module. Normal usage should
-prefer invoking `uvx synth-ai serve grpo-enron`, but this module remains for
+prefer invoking `uvx synth-ai deploy --runtime uvicorn grpo-enron`, but this module remains for
 direct execution or importing the FastAPI app object.
 """

examples/task_apps/gepa_benchmarks/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""GEPA benchmark task apps (HotpotQA, IFBench, HoVer, PUPA)."""
+# Import modules for side effects (task app registration) when package is imported.
+from . import hotpotqa_task_app  # noqa: F401
+from . import hover_task_app  # noqa: F401
+from . import ifbench_task_app  # noqa: F401
+from . import pupa_task_app  # noqa: F401

synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.16py3-none-any.whl → 0.2.19py3-none-any.whl