synth-ai 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (169)
  1. examples/baseline/banking77_baseline.py +204 -0
  2. examples/baseline/crafter_baseline.py +407 -0
  3. examples/baseline/pokemon_red_baseline.py +326 -0
  4. examples/baseline/simple_baseline.py +56 -0
  5. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  6. examples/blog_posts/gepa/README.md +355 -0
  7. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  9. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  10. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  13. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  15. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  16. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  18. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  19. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  20. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  21. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  22. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  23. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  24. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  25. examples/blog_posts/gepa/task_apps.py +105 -0
  26. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  27. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  28. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  29. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +12 -10
  30. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +1 -0
  31. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  32. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  33. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  34. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  35. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  36. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  37. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  38. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  39. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  40. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  41. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  42. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  43. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +1 -1
  44. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  45. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +60 -10
  46. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +1 -1
  47. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  48. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  49. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  50. examples/multi_step/configs/crafter_rl_outcome.toml +1 -0
  51. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -0
  52. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -0
  53. examples/rl/configs/rl_from_base_qwen17.toml +1 -0
  54. examples/swe/task_app/hosted/inference/openai_client.py +0 -34
  55. examples/swe/task_app/hosted/policy_routes.py +17 -0
  56. examples/swe/task_app/hosted/rollout.py +4 -2
  57. examples/task_apps/banking77/__init__.py +6 -0
  58. examples/task_apps/banking77/banking77_task_app.py +841 -0
  59. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  60. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  61. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  62. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  63. examples/task_apps/crafter/task_app/grpo_crafter.py +24 -2
  64. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  65. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +355 -58
  66. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +68 -7
  67. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +78 -21
  68. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  69. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  70. examples/task_apps/gepa_benchmarks/common.py +260 -0
  71. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  72. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  73. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  74. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  75. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  76. examples/task_apps/pokemon_red/task_app.py +254 -36
  77. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +1 -0
  78. examples/warming_up_to_rl/task_app/grpo_crafter.py +53 -4
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +152 -41
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +31 -1
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  83. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  84. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +1 -0
  85. synth_ai/api/train/builders.py +90 -1
  86. synth_ai/api/train/cli.py +396 -21
  87. synth_ai/api/train/config_finder.py +13 -2
  88. synth_ai/api/train/configs/__init__.py +15 -1
  89. synth_ai/api/train/configs/prompt_learning.py +442 -0
  90. synth_ai/api/train/configs/rl.py +29 -0
  91. synth_ai/api/train/task_app.py +1 -1
  92. synth_ai/api/train/validators.py +277 -0
  93. synth_ai/baseline/__init__.py +25 -0
  94. synth_ai/baseline/config.py +209 -0
  95. synth_ai/baseline/discovery.py +214 -0
  96. synth_ai/baseline/execution.py +146 -0
  97. synth_ai/cli/__init__.py +85 -17
  98. synth_ai/cli/__main__.py +0 -0
  99. synth_ai/cli/claude.py +70 -0
  100. synth_ai/cli/codex.py +84 -0
  101. synth_ai/cli/commands/__init__.py +1 -0
  102. synth_ai/cli/commands/baseline/__init__.py +12 -0
  103. synth_ai/cli/commands/baseline/core.py +637 -0
  104. synth_ai/cli/commands/baseline/list.py +93 -0
  105. synth_ai/cli/commands/eval/core.py +13 -10
  106. synth_ai/cli/commands/filter/core.py +53 -17
  107. synth_ai/cli/commands/help/core.py +0 -1
  108. synth_ai/cli/commands/smoke/__init__.py +7 -0
  109. synth_ai/cli/commands/smoke/core.py +1436 -0
  110. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  111. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  112. synth_ai/cli/commands/train/judge_schemas.py +1 -0
  113. synth_ai/cli/commands/train/judge_validation.py +1 -0
  114. synth_ai/cli/commands/train/validation.py +0 -57
  115. synth_ai/cli/demo.py +35 -3
  116. synth_ai/cli/deploy/__init__.py +40 -25
  117. synth_ai/cli/deploy.py +162 -0
  118. synth_ai/cli/legacy_root_backup.py +14 -8
  119. synth_ai/cli/opencode.py +107 -0
  120. synth_ai/cli/root.py +9 -5
  121. synth_ai/cli/task_app_deploy.py +1 -1
  122. synth_ai/cli/task_apps.py +53 -53
  123. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  124. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  125. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  126. synth_ai/judge_schemas.py +1 -0
  127. synth_ai/learning/__init__.py +10 -0
  128. synth_ai/learning/prompt_learning_client.py +276 -0
  129. synth_ai/learning/prompt_learning_types.py +184 -0
  130. synth_ai/pricing/__init__.py +2 -0
  131. synth_ai/pricing/model_pricing.py +57 -0
  132. synth_ai/streaming/handlers.py +53 -4
  133. synth_ai/streaming/streamer.py +19 -0
  134. synth_ai/task/apps/__init__.py +1 -0
  135. synth_ai/task/config.py +2 -0
  136. synth_ai/task/tracing_utils.py +25 -25
  137. synth_ai/task/validators.py +44 -8
  138. synth_ai/task_app_cfgs.py +21 -0
  139. synth_ai/tracing_v3/config.py +162 -19
  140. synth_ai/tracing_v3/constants.py +1 -1
  141. synth_ai/tracing_v3/db_config.py +24 -38
  142. synth_ai/tracing_v3/storage/config.py +47 -13
  143. synth_ai/tracing_v3/storage/factory.py +3 -3
  144. synth_ai/tracing_v3/turso/daemon.py +113 -11
  145. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  146. synth_ai/types.py +8 -0
  147. synth_ai/urls.py +11 -0
  148. synth_ai/utils/__init__.py +30 -1
  149. synth_ai/utils/agents.py +74 -0
  150. synth_ai/utils/bin.py +39 -0
  151. synth_ai/utils/cli.py +149 -5
  152. synth_ai/utils/env.py +17 -17
  153. synth_ai/utils/json.py +72 -0
  154. synth_ai/utils/modal.py +283 -1
  155. synth_ai/utils/paths.py +48 -0
  156. synth_ai/utils/uvicorn.py +113 -0
  157. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/METADATA +102 -4
  158. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/RECORD +162 -88
  159. synth_ai/cli/commands/deploy/__init__.py +0 -23
  160. synth_ai/cli/commands/deploy/core.py +0 -614
  161. synth_ai/cli/commands/deploy/errors.py +0 -72
  162. synth_ai/cli/commands/deploy/validation.py +0 -11
  163. synth_ai/cli/deploy/core.py +0 -5
  164. synth_ai/cli/deploy/errors.py +0 -23
  165. synth_ai/cli/deploy/validation.py +0 -5
  166. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  167. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  168. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  169. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
@@ -491,10 +491,9 @@ class RolloutTracingContext:
491
491
  getattr(request.record, "trace_format", "compact") or "compact"
492
492
  ).lower()
493
493
  self.return_trace = bool(getattr(request.record, "return_trace", False))
494
- logger.warning(
495
- "[TRACE_DEBUG] RolloutTracingContext init: trace_format=%s return_trace=%s",
496
- self.trace_format,
497
- self.return_trace,
494
+ print(
495
+ f"[TRACE_DEBUG] RolloutTracingContext init: trace_format={self.trace_format} return_trace={self.return_trace}",
496
+ flush=True,
498
497
  )
499
498
  self.sft_output_dir = getattr(fastapi_request.app.state, "sft_output_dir", None)
500
499
  self.session_trace = None
@@ -518,19 +517,24 @@ class RolloutTracingContext:
518
517
 
519
518
  async def start_session(self) -> None:
520
519
  if not self.enabled or self.tracer is None:
520
+ print("[TRACE_DEBUG] start_session skipped: tracer disabled", flush=True)
521
521
  return
522
522
  try:
523
523
  await self.tracer.initialize()
524
+ print("[TRACE_DEBUG] tracer initialized", flush=True)
524
525
  except Exception as exc:
525
526
  logger.debug("TRACING_INIT_FAIL: %s", exc)
527
+ # Hard fail: tracing requested but cannot initialize
528
+ raise
526
529
  try:
527
530
  await self.tracer.start_session(
528
531
  session_id=self.run_id, metadata=dict(self.metadata_base)
529
532
  )
533
+ print(f"[TRACE_DEBUG] start_session succeeded for run_id={self.run_id}", flush=True)
530
534
  except Exception as exc:
531
535
  logger.info("TRACING_START_FAIL: %s", exc)
532
- self.enabled = False
533
- self.tracer = None
536
+ # Hard fail: tracing requested but cannot start session
537
+ raise
534
538
 
535
539
  async def start_decision(self, turn_number: int) -> None:
536
540
  self.current_turn = turn_number
@@ -595,7 +599,7 @@ class RolloutTracingContext:
595
599
  # Debug: Check message count
596
600
  if self.tracer and self.tracer._current_trace:
597
601
  msg_count = len(self.tracer._current_trace.markov_blanket_message_history)
598
- logger.warning("[TRACE_DEBUG] After record_policy_prompts: %s messages", msg_count)
602
+ print(f"[TRACE_DEBUG] After record_policy_prompts: {msg_count} messages", flush=True)
599
603
 
600
604
  def _content_to_text(self, content: Any) -> str:
601
605
  if isinstance(content, str):
@@ -669,15 +673,19 @@ class RolloutTracingContext:
669
673
  return
670
674
  if self.enabled and self.tracer is not None:
671
675
  try:
676
+ payload = {
677
+ "role": "assistant",
678
+ "tool_calls": tool_calls,
679
+ }
672
680
  await self.tracer.record_message(
673
- content=self._safe_json(tool_calls),
674
- message_type="assistant", # Map to standard assistant message type
681
+ content=payload,
682
+ message_type="assistant",
675
683
  metadata={**self._message_metadata(), "is_tool_call": True},
676
684
  )
677
685
  if self.tracer._current_trace:
678
- logger.warning(
679
- "[TRACE_DEBUG] After tool invocation: messages=%s",
680
- len(self.tracer._current_trace.markov_blanket_message_history),
686
+ print(
687
+ f"[TRACE_DEBUG] After tool invocation: messages={len(self.tracer._current_trace.markov_blanket_message_history)}",
688
+ flush=True,
681
689
  )
682
690
  except Exception as exc:
683
691
  logger.debug("TRACING_TOOL_MSG_FAIL: %s", exc)
@@ -784,9 +792,33 @@ class RolloutTracingContext:
784
792
  }
785
793
  )
786
794
 
795
+ assistant_structured = assistant_content if assistant_content is not None else ""
796
+ assistant_text = self._content_to_text(assistant_content)
797
+
798
+ if self.enabled and self.tracer is not None:
799
+ assistant_payload: dict[str, Any] = {
800
+ "role": "assistant",
801
+ "content": assistant_structured,
802
+ "text": assistant_text,
803
+ }
804
+ if isinstance(assistant_message, dict):
805
+ if assistant_message.get("tool_calls"):
806
+ assistant_payload["tool_calls"] = assistant_message.get("tool_calls")
807
+ if assistant_message.get("reasoning"):
808
+ assistant_payload["reasoning"] = assistant_message.get("reasoning")
809
+ if assistant_message.get("thinking"):
810
+ assistant_payload["thinking"] = assistant_message.get("thinking")
811
+ try:
812
+ await self.tracer.record_message(
813
+ content=assistant_payload,
814
+ message_type="assistant",
815
+ metadata=self._message_metadata(),
816
+ )
817
+ except Exception as exc:
818
+ logger.debug("TRACING_ASSISTANT_MSG_FAIL: %s", exc)
819
+
787
820
  if self.sft_output_dir is not None:
788
821
  assistant_structured = assistant_content if assistant_content is not None else ""
789
- assistant_text = self._content_to_text(assistant_content)
790
822
  dialogue_structured: list[dict[str, Any]] = []
791
823
  for content in self.latest_system_prompt_content:
792
824
  if content is None:
@@ -951,17 +983,23 @@ class RolloutTracingContext:
951
983
  # Debug: Check message count before end_session
952
984
  if self.tracer._current_trace:
953
985
  msg_count = len(self.tracer._current_trace.markov_blanket_message_history)
954
- logger.info(f"[TRACE_DEBUG] Before end_session: {msg_count} messages in trace")
955
-
986
+ print(f"[TRACE_DEBUG] Before end_session: {msg_count} messages in trace", flush=True)
987
+
956
988
  self.session_trace = await self.tracer.end_session()
957
989
 
958
990
  # Debug: Check if session was saved
959
991
  if self.session_trace:
960
- logger.info(f"[TRACE_DEBUG] Session ended successfully, session_id={self.session_trace.session_id}")
992
+ print(
993
+ f"[TRACE_DEBUG] Session ended successfully, session_id={self.session_trace.session_id}",
994
+ flush=True,
995
+ )
961
996
  self.session_trace.metadata.update(self.metadata_updates)
962
- logger.info(f"[TRACE_DEBUG] session_trace.metadata keys: {list(self.session_trace.metadata.keys())}")
997
+ print(
998
+ f"[TRACE_DEBUG] session_trace.metadata keys: {list(self.session_trace.metadata.keys())}",
999
+ flush=True,
1000
+ )
963
1001
  else:
964
- logger.warning("[TRACE_DEBUG] end_session returned None!")
1002
+ print("[TRACE_DEBUG] end_session returned None!", flush=True)
965
1003
  except Exception as exc:
966
1004
  logger.warning(f"TRACING_END_SESSION_FAIL: {exc}", exc_info=True)
967
1005
  self.session_trace = None
@@ -1001,9 +1039,9 @@ class RolloutTracingContext:
1001
1039
  if self.trace_format in ("full", "structured"):
1002
1040
  payload = session_trace.to_dict()
1003
1041
  payload.setdefault("metadata", {}).update(self.metadata_updates)
1004
- logger.warning(
1005
- "[TRACE_DEBUG] build_trace_payload returning structured trace with messages=%s",
1006
- len(payload.get("markov_blanket_message_history") or []),
1042
+ print(
1043
+ f"[TRACE_DEBUG] build_trace_payload returning structured trace with messages={len(payload.get('markov_blanket_message_history') or [])}",
1044
+ flush=True,
1007
1045
  )
1008
1046
  return payload
1009
1047
 
@@ -1943,6 +1981,15 @@ async def execute_rollout(
1943
1981
  if 'policy_config_snapshot' not in locals():
1944
1982
  policy_config_snapshot = {}
1945
1983
 
1984
+ # Normalize inference URL for trajectory (and ensure no path in query)
1985
+ try:
1986
+ from .utils import force_normalize_chat_completions_url, ensure_chat_completions_url
1987
+ inference_url = force_normalize_chat_completions_url(inference_url)
1988
+ # apply mode-aware normalization too (keeps cid, appends path if missing)
1989
+ inference_url = ensure_chat_completions_url(inference_url, mode=request.mode)
1990
+ except Exception:
1991
+ pass
1992
+
1946
1993
  logger.info(
1947
1994
  "ROLLOUT_TRAJECTORY: run_id=%s policy_id=%s inference_url=%s trace_id=%s",
1948
1995
  request.run_id,
@@ -2057,6 +2104,16 @@ async def execute_rollout(
2057
2104
  if metrics.num_steps <= 0:
2058
2105
  raise HTTPException(status_code=500, detail="no_steps_executed: avg_turns == 0")
2059
2106
 
2107
+ # Ensure at least one tool call executed successfully
2108
+ tool_call_executed = any(
2109
+ isinstance(step.tool_calls, list) and len(step.tool_calls) > 0 for step in trajectory_steps
2110
+ )
2111
+ if not tool_call_executed:
2112
+ raise HTTPException(
2113
+ status_code=502,
2114
+ detail="no_tool_calls_executed: model failed to produce actionable tool calls.",
2115
+ )
2116
+
2060
2117
  response = RolloutResponse(
2061
2118
  run_id=request.run_id,
2062
2119
  trajectories=[trajectory],
@@ -11,6 +11,129 @@ logger = logging.getLogger(__name__)
11
11
  _CHAT_COMPLETIONS_SUFFIX = "/v1/chat/completions"
12
12
 
13
13
 
14
+ def force_normalize_chat_completions_url(raw_url: Any) -> str:
15
+ """
16
+ Bulletproof normalizer: converts ANY malformed inference URL into the
17
+ correct chat-completions URL form.
18
+
19
+ Rules:
20
+ - Final path MUST end with /v1/chat/completions
21
+ - Query MUST NOT contain any '/' characters (no path segments in query)
22
+ - If the original query contained a path (e.g., '?cid=.../v1/chat/completions'),
23
+ extract that path and move it to the URL path; keep remaining query params
24
+ - Preserve scheme, host, port and existing query params order as much as possible
25
+
26
+ Examples:
27
+ https://host?cid=trace_123/v1/chat/completions
28
+ -> https://host/v1/chat/completions?cid=trace_123
29
+ https://host:8000?cid=trace_abc/v1/chat/completions&foo=bar
30
+ -> https://host:8000/v1/chat/completions?cid=trace_abc&foo=bar
31
+ https://host?cid=trace_123/v1/chat/completions?other=param
32
+ -> https://host/v1/chat/completions?cid=trace_123&other=param
33
+ """
34
+ if not isinstance(raw_url, str):
35
+ return raw_url
36
+ url = raw_url.strip()
37
+ if not url:
38
+ return raw_url
39
+
40
+ parsed = urlparse(url)
41
+ path = (parsed.path or "").rstrip("/")
42
+ query = parsed.query or ""
43
+
44
+ # If query contains a path (has '/'), extract and repair
45
+ if query and "/" in query:
46
+ # Split query at the first '/' (everything before is real query params)
47
+ before_slash, after_slash = query.split("/", 1)
48
+
49
+ # after_slash may contain path and then more query params separated by '&' or '?' (malformed)
50
+ sep_indices = [i for i in [after_slash.find("&"), after_slash.find("?")] if i >= 0]
51
+ cut_idx = min(sep_indices) if sep_indices else len(after_slash)
52
+ path_from_query = "/" + after_slash[:cut_idx] # restore leading '/'
53
+ extra_query = after_slash[cut_idx + 1 :] if cut_idx < len(after_slash) else ""
54
+
55
+ # Merge query params: base (before_slash) + extra_query
56
+ merged_query = before_slash
57
+ if extra_query:
58
+ merged_query = f"{merged_query}&{extra_query}" if merged_query else extra_query
59
+
60
+ # Decide final path
61
+ if path_from_query.startswith(_CHAT_COMPLETIONS_SUFFIX):
62
+ final_path = path_from_query
63
+ else:
64
+ final_path = f"{path_from_query.rstrip('/')}{_CHAT_COMPLETIONS_SUFFIX}"
65
+
66
+ parsed = parsed._replace(path=final_path, query=merged_query)
67
+ url = urlunparse(parsed)
68
+ parsed = urlparse(url)
69
+ path = parsed.path or ""
70
+ query = parsed.query or ""
71
+
72
+ # Ensure path ends with chat completions suffix
73
+ if not path.endswith(_CHAT_COMPLETIONS_SUFFIX):
74
+ new_path = f"{path}{_CHAT_COMPLETIONS_SUFFIX}" if path else _CHAT_COMPLETIONS_SUFFIX
75
+ parsed = parsed._replace(path=new_path)
76
+ url = urlunparse(parsed)
77
+ parsed = urlparse(url)
78
+ path = parsed.path or ""
79
+ query = parsed.query or ""
80
+
81
+ # Final validation: no '/' in query
82
+ if query and "/" in query:
83
+ # As a last resort, drop anything after the first '/'
84
+ safe_query = query.split("/")[0]
85
+ parsed = parsed._replace(query=safe_query)
86
+ url = urlunparse(parsed)
87
+
88
+ return url
89
+
90
+
91
+ def _validate_url_structure(url: str, context: str = "") -> None:
92
+ """
93
+ Validate that a URL has correct structure (path before query, not vice versa).
94
+
95
+ Raises ValueError if URL is malformed.
96
+
97
+ Args:
98
+ url: The URL to validate
99
+ context: Optional context for error messages
100
+
101
+ Raises:
102
+ ValueError: If URL is malformed (path-like segments in query string)
103
+ """
104
+ if not isinstance(url, str) or not url.strip():
105
+ return
106
+
107
+ try:
108
+ parsed = urlparse(url)
109
+ query = parsed.query or ""
110
+
111
+ # CRITICAL CHECK: If query contains path-like segments (contains /), it's malformed
112
+ if query and "/" in query:
113
+ path_segment = query.split("/", 1)[1] if "/" in query else ""
114
+ error_msg = (
115
+ f"FATAL [TASK_APP_URL_VALIDATION]: Malformed inference URL detected!\n"
116
+ f"\n"
117
+ f"URL: {url}\n"
118
+ f"Context: {context}\n"
119
+ f"\n"
120
+ f"The URL has a path-like segment ('/{path_segment}') in the query string.\n"
121
+ f"This indicates incorrect URL construction upstream.\n"
122
+ f"\n"
123
+ f"Expected: https://host/v1/chat/completions?cid=trace_123\n"
124
+ f"Malformed: https://host?cid=trace_123/v1/chat/completions\n"
125
+ f"\n"
126
+ f"This should be caught by the trainer, but if you see this,\n"
127
+ f"the trainer's URL validation may have failed.\n"
128
+ )
129
+ logger.error(error_msg)
130
+ raise ValueError(error_msg)
131
+ except ValueError:
132
+ raise
133
+ except Exception as e:
134
+ logger.warning(f"[URL_VALIDATION] Failed to parse URL: {url} (context: {context}, error: {e})")
135
+
136
+
14
137
  def ensure_chat_completions_url(raw_url: Any, mode: str | None = None) -> Any:
15
138
  """
16
139
  Ensure inference URLs point at the chat completions endpoint.
@@ -43,9 +166,75 @@ def ensure_chat_completions_url(raw_url: Any, mode: str | None = None) -> Any:
43
166
 
44
167
  parsed = urlparse(url)
45
168
  path = (parsed.path or "").rstrip("/")
169
+ query = parsed.query
170
+
171
+ logger.debug(
172
+ "ensure_chat_completions_url: parsing url=%s -> path=%r query=%r",
173
+ url,
174
+ path,
175
+ query,
176
+ )
177
+
178
+ # CRITICAL: Check for malformed URLs (path in query) and fix them FIRST
179
+ # Example: https://host?cid=trace_123/v1/chat/completions
180
+ # Should be: https://host/v1/chat/completions?cid=trace_123
181
+ if query and "/" in query:
182
+ logger.error(
183
+ f"[URL_FIX] Detected malformed URL in ensure_chat_completions_url: {url}\n"
184
+ f"Path-like segment found in query string. Attempting to fix..."
185
+ )
186
+ # Split query at first "/" to separate query params from path
187
+ query_parts = query.split("/", 1)
188
+ if len(query_parts) == 2:
189
+ # query_parts[0] is the actual query (e.g., "cid=trace_123")
190
+ # query_parts[1] is the path that was incorrectly put in query
191
+ actual_query = query_parts[0]
192
+ path_and_more = query_parts[1] # Could be "v1/chat/completions" or "v1/chat/completions&foo=bar"
193
+
194
+ # Extract the path part (everything before "&" or "?" if present)
195
+ # Handle both "&" (query param separator) and "?" (another malformed query separator)
196
+ if "&" in path_and_more:
197
+ # Path is followed by more query params (separated by &)
198
+ path_segment, extra_query = path_and_more.split("&", 1)
199
+ path_in_query = "/" + path_segment # Restore leading slash
200
+ # Merge extra query params with actual_query
201
+ actual_query = f"{actual_query}&{extra_query}"
202
+ elif "?" in path_and_more:
203
+ # Path is followed by more query params (separated by ?, which is malformed)
204
+ path_segment, extra_query = path_and_more.split("?", 1)
205
+ path_in_query = "/" + path_segment # Restore leading slash
206
+ # Merge extra query params with actual_query (use & as separator)
207
+ actual_query = f"{actual_query}&{extra_query}"
208
+ else:
209
+ # No extra query params, just the path
210
+ path_in_query = "/" + path_and_more # Restore leading slash
211
+
212
+ # If the path_in_query already contains /v1/chat/completions, use it
213
+ # Otherwise, append /v1/chat/completions
214
+ if path_in_query.startswith("/v1/chat/completions"):
215
+ final_path = path_in_query
216
+ else:
217
+ # Append /v1/chat/completions to whatever path we found
218
+ final_path = path_in_query.rstrip("/") + "/v1/chat/completions"
219
+
220
+ # Reconstruct URL correctly: path comes before query
221
+ parsed = parsed._replace(path=final_path, query=actual_query)
222
+ fixed_url = urlunparse(parsed)
223
+ logger.warning(f"[URL_FIX] Fixed malformed URL:\n FROM: {url}\n TO: {fixed_url}")
224
+ url = fixed_url
225
+ # Re-parse after fix
226
+ parsed = urlparse(url)
227
+ path = parsed.path.rstrip("/")
228
+ query = parsed.query
229
+ else:
230
+ # Can't parse - this shouldn't happen but validate will catch it
231
+ logger.error(f"[URL_FIX] Could not parse malformed query: {query}")
232
+ _validate_url_structure(url, context="ensure_chat_completions_url input - cannot fix")
233
+
46
234
  if path.endswith("/v1/chat/completions"):
47
235
  logger.debug("ensure_chat_completions_url: URL already normalized %s", url)
48
- # Already targeting the desired endpoint; keep original to preserve trailing slash.
236
+ # Validate final URL
237
+ _validate_url_structure(url, context="ensure_chat_completions_url output")
49
238
  return url
50
239
 
51
240
  if not path:
@@ -55,6 +244,10 @@ def ensure_chat_completions_url(raw_url: Any, mode: str | None = None) -> Any:
55
244
 
56
245
  rebuilt = parsed._replace(path=new_path)
57
246
  normalized = urlunparse(rebuilt)
247
+
248
+ # CRITICAL: Validate the normalized URL
249
+ _validate_url_structure(normalized, context="ensure_chat_completions_url output")
250
+
58
251
  logger.info(
59
252
  "ensure_chat_completions_url: RL mode - normalized inference URL from %s to %s",
60
253
  url,
@@ -0,0 +1,7 @@
1
+ """GEPA benchmark task apps (HotpotQA, IFBench, HoVer, PUPA)."""
2
+
3
+ # Import modules for side effects (task app registration) when package is imported.
4
+ from . import hotpotqa_task_app # noqa: F401
5
+ from . import hover_task_app # noqa: F401
6
+ from . import ifbench_task_app # noqa: F401
7
+ from . import pupa_task_app # noqa: F401
@@ -0,0 +1,260 @@
1
+ """Shared helpers for GEPA benchmark task apps (HotpotQA, IFBench, HoVer, PUPA)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import re
8
+ from typing import Any, Iterable, Mapping, Sequence
9
+
10
+ import httpx
11
+ from fastapi import HTTPException
12
+
13
+
14
+ def _resolve_inference_url(base_url: str) -> str:
15
+ """Normalise a base inference URL to the chat completions endpoint."""
16
+
17
+ normalised = (base_url or "").rstrip("/")
18
+ if not normalised:
19
+ raise RuntimeError("policy.config.inference_url required")
20
+ if normalised.endswith("/v1/chat/completions"):
21
+ return normalised
22
+ if normalised.endswith("/chat/completions"):
23
+ return normalised
24
+ if normalised.endswith("/v1"):
25
+ return f"{normalised}/chat/completions"
26
+ return f"{normalised}/v1/chat/completions"
27
+
28
+
29
# Matches a single ``{name}`` token; nested braces are deliberately excluded.
_PLACEHOLDER_PATTERN = re.compile(r"\{([^{}]+)\}")


def _substitute_placeholders(text: str, values: Mapping[str, Any]) -> str:
    """Replace `{placeholder}` tokens in `text` with entries from `values`."""

    def _fill(match: re.Match[str]) -> str:
        value = values.get(match.group(1))
        # Unknown placeholders are kept verbatim rather than dropped.
        if value is None:
            return match.group(0)
        return str(value)

    return _PLACEHOLDER_PATTERN.sub(_fill, text)


def render_messages(
    policy_config: Mapping[str, Any],
    placeholders: Mapping[str, Any],
    default_messages: Sequence[Mapping[str, str]],
) -> list[dict[str, str]]:
    """Render chat messages either from policy prompt patterns or defaults."""

    def _render_entry(entry: Mapping[str, Any]) -> dict[str, str]:
        # ``pattern`` wins over ``content``; both fall back to empty string.
        role = str(entry.get("role") or "user")
        template = entry.get("pattern") or entry.get("content") or ""
        return {
            "role": role,
            "content": _substitute_placeholders(str(template), placeholders),
        }

    rendered: list[dict[str, str]] = []
    prompt_config = policy_config.get("prompt") if isinstance(policy_config, Mapping) else None
    if isinstance(prompt_config, Mapping) and prompt_config:
        messages = prompt_config.get("messages")
        if isinstance(messages, Sequence):
            rendered = [
                _render_entry(entry) for entry in messages if isinstance(entry, Mapping)
            ]
    # Fall back to the caller-supplied defaults when the policy provided
    # no usable prompt messages.
    if not rendered:
        rendered = [_render_entry(entry) for entry in default_messages]
    return rendered
69
+
70
+
71
async def call_chat_completion(
    policy_config: Mapping[str, Any],
    placeholders: Mapping[str, Any],
    default_messages: Sequence[Mapping[str, str]],
    *,
    tool_spec: Sequence[Mapping[str, Any]] | None = None,
    tool_choice: Mapping[str, Any] | None = None,
    timeout: float = 60.0,
) -> tuple[str, dict[str, Any], list[dict[str, Any]]]:
    """Invoke an OpenAI-compatible chat/completions endpoint.

    Args:
        policy_config: Mapping with at least ``model`` and ``inference_url``;
            optional ``temperature``, ``max_tokens``/``max_completion_tokens``
            and a ``prompt`` section consumed by :func:`render_messages`.
        placeholders: Values substituted into ``{placeholder}`` tokens.
        default_messages: Fallback messages when the policy has no prompt.
        tool_spec: Optional OpenAI-style tool definitions to forward.
        tool_choice: Optional tool-choice directive to forward.
        timeout: Whole-request timeout in seconds for the HTTP client.

    Returns:
        response_text: The assistant message text (empty string if missing).
        raw_response: The JSON payload from the provider.
        messages: The messages sent to the model (after placeholder substitution).

    Raises:
        RuntimeError: when ``policy_config`` is not a mapping, has no model,
            or has no usable inference URL.
        HTTPException: 502 for provider-side failures (5xx or non-JSON body),
            400 for request errors the provider reported as 4xx.
    """

    if not isinstance(policy_config, Mapping):
        raise RuntimeError("policy.config must be a mapping for chat completion calls")

    messages = render_messages(policy_config, placeholders, default_messages)

    model = policy_config.get("model")
    if not model:
        raise RuntimeError("policy.config.model required for rollout")

    temperature = policy_config.get("temperature", 0.0)
    # ``max_completion_tokens`` wins; legacy ``max_tokens`` is only a fallback,
    # and 512 is the last-resort default when neither is set.
    max_tokens = policy_config.get("max_tokens")
    max_completion_tokens = policy_config.get("max_completion_tokens", max_tokens or 512)

    inference_url = policy_config.get("inference_url") or ""
    final_url = _resolve_inference_url(str(inference_url))

    payload: dict[str, Any] = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_completion_tokens": max_completion_tokens,
    }
    if tool_spec:
        payload["tools"] = list(tool_spec)
    if tool_choice:
        payload["tool_choice"] = tool_choice

    # Prefer provider-specific keys, fall back to SYNTH/OPENAI.
    # NOTE(review): dict order makes GROQ win over OPENAI over SYNTH when
    # several env vars are set — confirm that precedence is intended.
    proxy_keys = {
        "GROQ_API_KEY": os.getenv("GROQ_API_KEY"),
        "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        "SYNTH_API_KEY": os.getenv("SYNTH_API_KEY"),
    }
    api_key = next((value for value in proxy_keys.values() if value), None)

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    # One-shot request; the client (and its connection pool) is torn down
    # before the response body is inspected.
    async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
        response = await client.post(final_url, json=payload, headers=headers)

    # Parse the body before checking status so error details can include the
    # provider's JSON error object; a non-JSON body is mapped to 502.
    try:
        data = response.json()
    except json.JSONDecodeError as exc:  # pragma: no cover - defensive
        raise HTTPException(
            status_code=502,
            detail=f"Inference provider returned invalid JSON: {response.text[:800]}",
        ) from exc

    if response.status_code >= 500:
        raise HTTPException(
            status_code=502,
            detail=f"Inference provider returned an error: {data}",
        )
    if response.status_code >= 400:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid inference request: {data}",
        )

    # Extract the first choice's message content; anything missing or
    # malformed degrades to an empty string rather than raising.
    response_text = ""
    choices = data.get("choices") if isinstance(data, Mapping) else None
    if isinstance(choices, Sequence) and choices:
        message = choices[0].get("message")
        if isinstance(message, Mapping):
            response_text = str(message.get("content") or "")

    return response_text, data, messages
157
+
158
+
159
def normalise_answer(text: str) -> str:
    """Normalise free-form text answers (HotpotQA style).

    Lowercases, strips punctuation, drops English articles, and collapses
    runs of whitespace into single spaces.
    """

    no_punct = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    no_articles = re.sub(r"\b(a|an|the)\b", " ", no_punct)
    return " ".join(no_articles.split())
168
+
169
+
170
# Codepoint ranges covering the common emoji blocks this app cares about.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    "\U00002700-\U000027BF",  # dingbats
    "\U0001F900-\U0001F9FF",  # supplemental symbols & pictographs
    "\U00002600-\U000026FF",  # miscellaneous symbols
    "\U00002B00-\U00002BFF",  # misc symbols and arrows
)
_EMOJI_PATTERN = re.compile("[%s]" % "".join(_EMOJI_RANGES), flags=re.UNICODE)


def count_emojis(text: str) -> int:
    """Return rough count of emoji characters."""

    return sum(1 for _ in _EMOJI_PATTERN.finditer(text))
189
+
190
+
191
def tokenize(text: str) -> list[str]:
    """Simple whitespace/token splitter with punctuation stripping."""

    # Replace anything that is neither a word character nor whitespace with
    # a space, then let str.split discard the resulting empties.
    return re.sub(r"[^\w\s]", " ", text.lower()).split()
196
+
197
+
198
def sentence_split(text: str) -> list[str]:
    """Split text into sentences using punctuation heuristics."""

    # Split after terminal punctuation followed by whitespace, then drop
    # empty fragments.
    fragments = (piece.strip() for piece in re.split(r"(?<=[.!?])\s+", text.strip()))
    return [piece for piece in fragments if piece]
203
+
204
+
205
+ def count_numbers(text: str) -> int:
206
+ """Count occurrences of numeric tokens."""
207
+
208
+ return len(re.findall(r"\b\d+(?:\.\d+)?\b", text))
209
+
210
+
211
def unique_word_count(tokens: Iterable[str]) -> int:
    """Return number of unique tokens."""

    distinct: set[str] = set()
    distinct.update(tokens)
    return len(distinct)
215
+
216
+
217
# Closed list of English personal/possessive pronouns used by count_pronouns.
PRONOUNS = set(
    "i me you he him she her it we us they them "
    "my mine your yours his hers its our ours their theirs".split()
)


def count_pronouns(tokens: Iterable[str]) -> int:
    """Count pronoun tokens from a predefined list."""

    total = 0
    for token in tokens:
        if token in PRONOUNS:
            total += 1
    return total
248
+
249
+
250
# Public surface of this helpers module (kept alphabetical).
# NOTE(review): module-level PRONOUNS is public by naming convention but is
# not exported here — confirm whether star-import consumers need it.
__all__ = [
    "call_chat_completion",
    "count_emojis",
    "count_numbers",
    "count_pronouns",
    "normalise_answer",
    "render_messages",
    "sentence_split",
    "tokenize",
    "unique_word_count",
]