PyPI - synth-ai - Versions diffs - 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev7__py3-none-any.whl - Mend

synth-ai 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (157)

examples/common_old/backend.py +0 -1
examples/crafter_debug_render.py +15 -6
examples/evals_old/compare_models.py +1 -0
examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +6 -2
examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +4 -4
examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +4 -3
examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +6 -2
examples/finetuning_old/synth_qwen_v1/finetune.py +1 -1
examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +4 -4
examples/finetuning_old/synth_qwen_v1/infer.py +1 -2
examples/finetuning_old/synth_qwen_v1/poll.py +4 -2
examples/finetuning_old/synth_qwen_v1/prepare_data.py +8 -8
examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +5 -4
examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +11 -8
examples/finetuning_old/synth_qwen_v1/run_ft_job.py +17 -12
examples/finetuning_old/synth_qwen_v1/upload_data.py +1 -1
examples/finetuning_old/synth_qwen_v1/util.py +7 -2
examples/rl/configs/eval_base_qwen.toml +1 -1
examples/rl/configs/rl_from_base_qwen17.toml +1 -1
examples/rl/download_dataset.py +26 -10
examples/rl/run_eval.py +17 -15
examples/rl/run_rl_and_save.py +24 -7
examples/rl/task_app/math_single_step.py +128 -11
examples/rl/task_app/math_task_app.py +11 -3
examples/rl_old/task_app.py +222 -53
examples/warming_up_to_rl/analyze_trace_db.py +7 -5
examples/warming_up_to_rl/export_trace_sft.py +141 -16
examples/warming_up_to_rl/groq_test.py +11 -4
examples/warming_up_to_rl/manage_secrets.py +15 -6
examples/warming_up_to_rl/readme.md +9 -2
examples/warming_up_to_rl/run_eval.py +108 -30
examples/warming_up_to_rl/run_fft_and_save.py +128 -52
examples/warming_up_to_rl/run_local_rollout.py +87 -36
examples/warming_up_to_rl/run_local_rollout_modal.py +113 -25
examples/warming_up_to_rl/run_local_rollout_parallel.py +80 -16
examples/warming_up_to_rl/run_local_rollout_traced.py +125 -20
examples/warming_up_to_rl/run_rl_and_save.py +31 -7
examples/warming_up_to_rl/run_rollout_remote.py +37 -10
examples/warming_up_to_rl/task_app/grpo_crafter.py +90 -27
examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +9 -27
examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +46 -108
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -1
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +1 -1
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -1
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +50 -17
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +35 -21
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +8 -4
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +29 -26
examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +1 -1
examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +17 -13
examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +1 -1
examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +106 -63
examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +82 -84
examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +76 -59
examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +1 -1
examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +43 -49
examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +5 -15
synth_ai/__init__.py +1 -0
synth_ai/api/train/builders.py +34 -10
synth_ai/api/train/cli.py +172 -32
synth_ai/api/train/config_finder.py +59 -4
synth_ai/api/train/env_resolver.py +32 -14
synth_ai/api/train/pollers.py +11 -3
synth_ai/api/train/task_app.py +4 -1
synth_ai/api/train/utils.py +20 -4
synth_ai/cli/__init__.py +11 -4
synth_ai/cli/balance.py +1 -1
synth_ai/cli/demo.py +19 -5
synth_ai/cli/rl_demo.py +75 -16
synth_ai/cli/root.py +116 -37
synth_ai/cli/task_apps.py +1286 -170
synth_ai/cli/traces.py +1 -0
synth_ai/cli/turso.py +73 -0
synth_ai/core/experiment.py +0 -2
synth_ai/demo_registry.py +67 -30
synth_ai/demos/core/cli.py +493 -164
synth_ai/demos/demo_task_apps/core.py +50 -6
synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +2 -3
synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +36 -28
synth_ai/demos/demo_task_apps/math/_common.py +1 -2
synth_ai/demos/demo_task_apps/math/deploy_modal.py +0 -2
synth_ai/demos/demo_task_apps/math/modal_task_app.py +168 -65
synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -1
synth_ai/environments/examples/bandit/engine.py +12 -4
synth_ai/environments/examples/bandit/taskset.py +4 -4
synth_ai/environments/reproducibility/tree.py +3 -1
synth_ai/environments/service/core_routes.py +6 -2
synth_ai/evals/base.py +0 -2
synth_ai/experimental/synth_oss.py +11 -12
synth_ai/handshake.py +3 -1
synth_ai/http_client.py +31 -7
synth_ai/inference/__init__.py +0 -2
synth_ai/inference/client.py +8 -4
synth_ai/jobs/client.py +40 -10
synth_ai/learning/client.py +33 -8
synth_ai/learning/config.py +0 -2
synth_ai/learning/constants.py +0 -2
synth_ai/learning/ft_client.py +6 -3
synth_ai/learning/health.py +9 -2
synth_ai/learning/jobs.py +17 -5
synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +1 -3
synth_ai/learning/prompts/random_search.py +4 -1
synth_ai/learning/prompts/run_random_search_banking77.py +6 -1
synth_ai/learning/rl_client.py +42 -14
synth_ai/learning/sse.py +0 -2
synth_ai/learning/validators.py +6 -2
synth_ai/lm/caching/ephemeral.py +1 -3
synth_ai/lm/core/exceptions.py +0 -2
synth_ai/lm/core/main.py +13 -1
synth_ai/lm/core/synth_models.py +0 -1
synth_ai/lm/core/vendor_clients.py +4 -2
synth_ai/lm/overrides.py +2 -2
synth_ai/lm/vendors/core/anthropic_api.py +7 -7
synth_ai/lm/vendors/core/openai_api.py +2 -0
synth_ai/lm/vendors/openai_standard.py +3 -1
synth_ai/lm/vendors/openai_standard_responses.py +6 -3
synth_ai/lm/vendors/supported/custom_endpoint.py +1 -3
synth_ai/lm/vendors/synth_client.py +37 -10
synth_ai/rl/__init__.py +0 -1
synth_ai/rl/contracts.py +0 -2
synth_ai/rl/env_keys.py +6 -1
synth_ai/task/__init__.py +1 -0
synth_ai/task/apps/__init__.py +11 -11
synth_ai/task/auth.py +29 -17
synth_ai/task/client.py +3 -1
synth_ai/task/contracts.py +1 -0
synth_ai/task/datasets.py +3 -1
synth_ai/task/errors.py +3 -2
synth_ai/task/health.py +0 -2
synth_ai/task/json.py +0 -1
synth_ai/task/proxy.py +2 -5
synth_ai/task/rubrics.py +9 -3
synth_ai/task/server.py +31 -5
synth_ai/task/tracing_utils.py +8 -3
synth_ai/task/validators.py +0 -1
synth_ai/task/vendors.py +0 -1
synth_ai/tracing_v3/db_config.py +26 -1
synth_ai/tracing_v3/decorators.py +1 -0
synth_ai/tracing_v3/examples/basic_usage.py +3 -2
synth_ai/tracing_v3/hooks.py +2 -0
synth_ai/tracing_v3/replica_sync.py +1 -0
synth_ai/tracing_v3/session_tracer.py +24 -3
synth_ai/tracing_v3/storage/base.py +4 -1
synth_ai/tracing_v3/storage/factory.py +0 -1
synth_ai/tracing_v3/turso/manager.py +102 -38
synth_ai/tracing_v3/turso/models.py +4 -1
synth_ai/tracing_v3/utils.py +1 -0
synth_ai/v0/tracing/upload.py +32 -135
{synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/METADATA +1 -1
{synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/RECORD +154 -156
examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +0 -58
synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
synth_ai/install_sqld.sh +0 -40
{synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/WHEEL +0 -0
{synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/entry_points.txt +0 -0
{synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/licenses/LICENSE +0 -0
{synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev7.dist-info}/top_level.txt +0 -0

synth_ai/cli/task_apps.py CHANGED

@@ -8,6 +8,7 @@ import importlib
 import importlib.util
 import inspect
 import os
+import json
 import signal
 import shutil
 import subprocess
@@ -15,11 +16,19 @@ import sys
 import tempfile
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Callable, Iterable, Sequence
+import types
+from typing import Any, Callable, Iterable, Sequence, Iterator, cast
+try:  # Python 3.11+
+    import tomllib as _toml
+except Exception:  # pragma: no cover - fallback
+    _toml = None  # type: ignore
+import uuid
 import click
 from synth_ai.task.apps import ModalDeploymentConfig, TaskAppConfig, TaskAppEntry, registry
-from synth_ai.task.server import run_task_app
+from synth_ai.task.server import run_task_app, create_task_app
+from synth_ai.config.base_url import PROD_BASE_URL_DEFAULT
 REPO_ROOT = Path(__file__).resolve().parents[2]
@@ -37,6 +46,8 @@ DEFAULT_IGNORE_DIRS = {
 DEFAULT_SEARCH_RELATIVE = (
     Path("."),
+    Path("examples"),
+    Path("synth_ai"),
 )
@@ -63,6 +74,73 @@ class AppChoice:
         return entry
+def _temporary_sys_path(paths: Sequence[Path]):
+    """Context manager to prepend entries to sys.path temporarily."""
+    @contextlib.contextmanager
+    def _manager() -> Iterator[None]:
+        added: list[str] = []
+        for p in paths:
+            try:
+                resolved = str(p.resolve())
+            except Exception:
+                continue
+            if resolved in sys.path:
+                continue
+            sys.path.insert(0, resolved)
+            added.append(resolved)
+        try:
+            yield None
+        finally:
+            for entry in added:
+                with contextlib.suppress(ValueError):
+                    sys.path.remove(entry)
+    return _manager()
+def _possible_module_names(
+    path: Path, module_search_roots: Sequence[Path]
+) -> list[tuple[str, Path]]:
+    """Return potential module names based on candidate roots."""
+    candidates: list[tuple[str, Path]] = []
+    for root in module_search_roots:
+        try:
+            resolved_root = root.resolve()
+        except Exception:
+            continue
+        if not resolved_root.exists() or not path.is_relative_to(resolved_root):
+            continue
+        relative = path.relative_to(resolved_root)
+        stem = relative.with_suffix("")
+        parts = list(stem.parts)
+        if not parts:
+            continue
+        module_name = ".".join(parts)
+        if module_name:
+            candidates.append((module_name, resolved_root))
+    return candidates
+def _ensure_parent_namespace(module_name: str, search_root: Path) -> None:
+    """Ensure namespace packages exist for dotted module names."""
+    parts = module_name.split(".")
+    for depth in range(1, len(parts)):
+        parent_name = ".".join(parts[:depth])
+        if parent_name in sys.modules:
+            continue
+        parent_module = types.ModuleType(parent_name)
+        candidate_dir = search_root.joinpath(*parts[:depth])
+        try:
+            resolved = candidate_dir.resolve()
+        except Exception:
+            resolved = search_root.resolve()
+        parent_module.__path__ = [str(resolved)]  # type: ignore[attr-defined]
+        sys.modules[parent_name] = parent_module
 def _should_ignore_path(path: Path) -> bool:
     return any(part in DEFAULT_IGNORE_DIRS for part in path.parts)
@@ -70,7 +148,19 @@ def _should_ignore_path(path: Path) -> bool:
 def _candidate_search_roots() -> list[Path]:
     """Only search for task apps in the current working directory and subdirectories."""
     roots: list[Path] = []
+    # Prioritize demo directory if it exists
+    try:
+        from synth_ai.demos.demo_task_apps.core import load_demo_dir
+        demo_dir = load_demo_dir()
+        if demo_dir:
+            demo_path = Path(demo_dir)
+            if demo_path.exists() and demo_path.is_dir():
+                roots.append(demo_path.resolve())
+    except Exception:
+        pass
     # Allow explicit search paths via environment variable
     env_paths = os.environ.get("SYNTH_TASK_APP_SEARCH_PATH")
     if env_paths:
@@ -82,6 +172,13 @@ def _candidate_search_roots() -> list[Path]:
     cwd = Path.cwd().resolve()
     roots.append(cwd)
+    for rel in DEFAULT_SEARCH_RELATIVE:
+        try:
+            candidate = (cwd / rel).resolve()
+        except Exception:
+            continue
+        roots.append(candidate)
     # Remove duplicates while preserving order
     seen: set[Path] = set()
     ordered: list[Path] = []
@@ -97,6 +194,49 @@ def _candidate_search_roots() -> list[Path]:
     return ordered
+def _eval_config_sort_key(path: Path) -> tuple[int, int, int, str]:
+    name = path.name.lower()
+    parent_names = {p.name.lower() for p in path.parents}
+    in_configs = 0 if "configs" in parent_names else 1
+    in_examples = 0 if "examples" in parent_names else 1
+    starts_eval = 0 if name.startswith("eval") else 1
+    return (in_configs, in_examples, starts_eval, str(path))
+def _discover_eval_config_paths() -> list[Path]:
+    """Find candidate eval TOML files near the current working directory."""
+    candidates: list[Path] = []
+    seen: set[Path] = set()
+    search_roots = _candidate_search_roots()
+    for root in search_roots:
+        if not root.exists() or not root.is_dir():
+            continue
+        try:
+            root_resolved = root.resolve()
+        except Exception:
+            continue
+        for path in root.rglob("*.toml"):
+            if not path.is_file():
+                continue
+            if _should_ignore_path(path):
+                continue
+            name_lower = path.name.lower()
+            if "eval" not in name_lower and "evaluation" not in name_lower:
+                continue
+            try:
+                resolved = path.resolve()
+            except Exception:
+                continue
+            if resolved in seen:
+                continue
+            seen.add(resolved)
+            candidates.append(resolved)
+    candidates.sort(key=_eval_config_sort_key)
+    return candidates
 class _TaskAppConfigVisitor(ast.NodeVisitor):
     def __init__(self) -> None:
         self.matches: list[tuple[str, int]] = []
@@ -124,7 +264,11 @@ def _is_task_app_config_call(node: ast.Call) -> bool:
 def _extract_app_id(node: ast.Call) -> str | None:
     for kw in node.keywords:
-        if kw.arg == "app_id" and isinstance(kw.value, ast.Constant) and isinstance(kw.value.value, str):
+        if (
+            kw.arg == "app_id"
+            and isinstance(kw.value, ast.Constant)
+            and isinstance(kw.value.value, str)
+        ):
             return kw.value.value
     if node.args:
         first = node.args[0]
@@ -149,7 +293,11 @@ def _extract_register_app_id(node: ast.Call) -> str | None:
             entry_call = kw.value
             if isinstance(entry_call.func, ast.Name) and entry_call.func.id == "TaskAppEntry":
                 for entry_kw in entry_call.keywords:
-                    if entry_kw.arg == "app_id" and isinstance(entry_kw.value, ast.Constant) and isinstance(entry_kw.value.value, str):
+                    if (
+                        entry_kw.arg == "app_id"
+                        and isinstance(entry_kw.value, ast.Constant)
+                        and isinstance(entry_kw.value.value, str)
+                    ):
                         return entry_kw.value.value
     return None
@@ -180,7 +328,11 @@ class _ModalAppVisitor(ast.NodeVisitor):
             if name:
                 self.matches.append((name, getattr(node, "lineno", 0)))
         elif isinstance(func, ast.Attribute):
-            if isinstance(func.value, ast.Name) and func.value.id in self.modal_aliases and func.attr == "App":
+            if (
+                isinstance(func.value, ast.Name)
+                and func.value.id in self.modal_aliases
+                and func.attr == "App"
+            ):
                 name = _extract_modal_app_name(node)
                 if name:
                     self.matches.append((name, getattr(node, "lineno", 0)))
@@ -189,7 +341,11 @@ class _ModalAppVisitor(ast.NodeVisitor):
 def _extract_modal_app_name(node: ast.Call) -> str | None:
     for kw in node.keywords:
-        if kw.arg in {"name", "app_name"} and isinstance(kw.value, ast.Constant) and isinstance(kw.value.value, str):
+        if (
+            kw.arg in {"name", "app_name"}
+            and isinstance(kw.value, ast.Constant)
+            and isinstance(kw.value.value, str)
+        ):
             return kw.value.value
     if node.args:
         first = node.args[0]
@@ -201,7 +357,7 @@ def _extract_modal_app_name(node: ast.Call) -> str | None:
 def _collect_task_app_choices() -> list[AppChoice]:
     # Clear registry to avoid duplicate registration errors
     registry.clear()
     choices: list[AppChoice] = []
     with contextlib.suppress(Exception):
         import synth_ai.demos.demo_task_apps  # noqa: F401
@@ -224,6 +380,7 @@ def _collect_task_app_choices() -> list[AppChoice]:
             continue
         unique[key] = choice
         ordered.append(choice)
+    ordered.sort(key=_app_choice_sort_key)
     return ordered
@@ -254,6 +411,10 @@ def _collect_scanned_task_configs() -> list[AppChoice]:
     results: list[AppChoice] = []
     seen: set[tuple[str, Path]] = set()
     for root in _candidate_search_roots():
+        try:
+            root_resolved = root.resolve()
+        except Exception:
+            continue
         if not root.exists() or not root.is_dir():
             continue
         for path in root.rglob("*.py"):
@@ -283,7 +444,11 @@ def _collect_scanned_task_configs() -> list[AppChoice]:
                         path=path.resolve(),
                         source="discovered",
                         description=f"TaskAppConfig in {path.name} (line {lineno})",
-                        entry_loader=lambda p=path.resolve(), a=app_id: _load_entry_from_path(p, a),
+                        entry_loader=lambda p=path.resolve(),
+                        a=app_id,
+                        roots=(root_resolved,): _load_entry_from_path(
+                            p, a, module_search_roots=roots
+                        ),
                         lineno=lineno,
                     )
                 )
@@ -330,6 +495,62 @@ def _collect_modal_scripts() -> list[AppChoice]:
     return results
+def _app_choice_sort_key(choice: AppChoice) -> tuple[int, int, int, int, int, str, str]:
+    """Ranking heuristic so wrapper-style task apps surface first."""
+    # Prioritize apps in the current working directory (demo or otherwise)
+    cwd_rank = 1
+    try:
+        cwd = Path.cwd().resolve()
+        if choice.path.is_relative_to(cwd):
+            # Check if this is directly in CWD (not in subdirectories like examples/)
+            try:
+                rel_path = choice.path.relative_to(cwd)
+                # If it's in the immediate directory or one level deep, prioritize it
+                if len(rel_path.parts) <= 2:
+                    cwd_rank = 0
+            except Exception:
+                pass
+    except Exception:
+        pass
+    # Further prioritize apps in the demo directory if one is set
+    demo_rank = 1
+    try:
+        from synth_ai.demos.demo_task_apps.core import load_demo_dir
+        demo_dir = load_demo_dir()
+        if demo_dir:
+            demo_path = Path(demo_dir).resolve()
+            if choice.path.is_relative_to(demo_path):
+                demo_rank = 0
+    except Exception:
+        pass
+    modal_rank = 1 if choice.modal_script else 0
+    name = choice.path.name.lower()
+    file_rank = 3
+    if name.endswith("_task_app.py") or name.endswith("task_app.py"):
+        file_rank = 0
+    elif name.endswith("_app.py") or "task_app" in name:
+        file_rank = 1
+    elif name.endswith(".py"):
+        file_rank = 2
+    directory_rank = 0 if choice.path.parent.name.lower() in {"task_app", "task_apps"} else 1
+    return (
+        demo_rank,
+        cwd_rank,
+        modal_rank,
+        file_rank,
+        directory_rank,
+        choice.app_id,
+        str(choice.path),
+    )
 def _choice_matches_identifier(choice: AppChoice, identifier: str) -> bool:
     ident = identifier.strip()
     if not ident:
@@ -357,7 +578,7 @@ def _has_modal_support_in_file(path: Path) -> bool:
     try:
         source = path.read_text(encoding="utf-8")
         tree = ast.parse(source, filename=str(path))
         # Look for ModalDeploymentConfig in register_task_app calls
         for node in ast.walk(tree):
             if isinstance(node, ast.Call):
@@ -366,11 +587,19 @@ def _has_modal_support_in_file(path: Path) -> bool:
                     for kw in node.keywords:
                         if kw.arg == "entry" and isinstance(kw.value, ast.Call):
                             entry_call = kw.value
-                            if isinstance(entry_call.func, ast.Name) and entry_call.func.id == "TaskAppEntry":
+                            if (
+                                isinstance(entry_call.func, ast.Name)
+                                and entry_call.func.id == "TaskAppEntry"
+                            ):
                                 for entry_kw in entry_call.keywords:
-                                    if entry_kw.arg == "modal" and isinstance(entry_kw.value, ast.Call):
+                                    if entry_kw.arg == "modal" and isinstance(
+                                        entry_kw.value, ast.Call
+                                    ):
                                         modal_call = entry_kw.value
-                                        if isinstance(modal_call.func, ast.Name) and modal_call.func.id == "ModalDeploymentConfig":
+                                        if (
+                                            isinstance(modal_call.func, ast.Name)
+                                            and modal_call.func.id == "ModalDeploymentConfig"
+                                        ):
                                             return True
     except Exception:
         pass
@@ -382,7 +611,7 @@ def _extract_modal_config_from_file(path: Path) -> ModalDeploymentConfig | None:
     try:
         source = path.read_text(encoding="utf-8")
         tree = ast.parse(source, filename=str(path))
         # Look for ModalDeploymentConfig in register_task_app calls
         for node in ast.walk(tree):
             if isinstance(node, ast.Call):
@@ -391,11 +620,19 @@ def _extract_modal_config_from_file(path: Path) -> ModalDeploymentConfig | None:
                     for kw in node.keywords:
                         if kw.arg == "entry" and isinstance(kw.value, ast.Call):
                             entry_call = kw.value
-                            if isinstance(entry_call.func, ast.Name) and entry_call.func.id == "TaskAppEntry":
+                            if (
+                                isinstance(entry_call.func, ast.Name)
+                                and entry_call.func.id == "TaskAppEntry"
+                            ):
                                 for entry_kw in entry_call.keywords:
-                                    if entry_kw.arg == "modal" and isinstance(entry_kw.value, ast.Call):
+                                    if entry_kw.arg == "modal" and isinstance(
+                                        entry_kw.value, ast.Call
+                                    ):
                                         modal_call = entry_kw.value
-                                        if isinstance(modal_call.func, ast.Name) and modal_call.func.id == "ModalDeploymentConfig":
+                                        if (
+                                            isinstance(modal_call.func, ast.Name)
+                                            and modal_call.func.id == "ModalDeploymentConfig"
+                                        ):
                                             # Extract the arguments to ModalDeploymentConfig
                                             return _build_modal_config_from_ast(modal_call)
     except Exception:
@@ -445,9 +682,10 @@ def _build_modal_config_from_ast(modal_call: ast.Call) -> ModalDeploymentConfig
                         if name and mount:
                             mounts.append((name, mount))
                 kwargs[kw.arg] = tuple(mounts)
         # Create ModalDeploymentConfig with extracted arguments
         from synth_ai.task.apps import ModalDeploymentConfig
         return ModalDeploymentConfig(**kwargs)
     except Exception:
         return None
@@ -465,20 +703,29 @@ def _choice_has_local_support(choice: AppChoice) -> bool:
 def _format_choice(choice: AppChoice, index: int | None = None) -> str:
     prefix = f"[{index}] " if index is not None else ""
-    rel_path: str
+    # Get file modification timestamp
     try:
-        rel_path = str(choice.path.relative_to(REPO_ROOT))
+        from datetime import datetime
+        mtime = choice.path.stat().st_mtime
+        modified_str = datetime.fromtimestamp(mtime).strftime("%Y-%m-%d %H:%M:%S")
+        details = f"Modified: {modified_str}"
     except Exception:
-        rel_path = str(choice.path)
-    details = choice.description or f"Located at {rel_path}"
-    return f"{prefix}{choice.app_id} ({choice.source}) – {details}"
+        # Fallback if timestamp unavailable
+        details = choice.description or "No timestamp available"
+    # Format: single line with timestamp
+    main_line = f"{prefix}{choice.app_id} ({choice.source}) – {details}"
+    return main_line
 def _prompt_user_for_choice(choices: list[AppChoice]) -> AppChoice:
     click.echo("Select a task app:")
     for idx, choice in enumerate(choices, start=1):
         click.echo(_format_choice(choice, idx))
-    response = click.prompt("Enter choice", default="1", type=str).strip() or "1"
+    try:
+        response = click.prompt("Enter choice", default="1", type=str).strip() or "1"
+    except (click.exceptions.Abort, EOFError, KeyboardInterrupt):
+        raise click.ClickException("Task app selection cancelled by user")
     if not response.isdigit():
         raise click.ClickException("Selection must be a number")
     index = int(response)
@@ -489,7 +736,7 @@ def _prompt_user_for_choice(choices: list[AppChoice]) -> AppChoice:
 def _select_app_choice(app_id: str | None, purpose: str) -> AppChoice:
     choices = _collect_task_app_choices()
-    if purpose == "serve":
+    if purpose in {"serve", "eval"}:
         filtered = [c for c in choices if not c.modal_script]
     elif purpose in {"deploy", "modal-serve"}:
         filtered = []
@@ -499,6 +746,8 @@ def _select_app_choice(app_id: str | None, purpose: str) -> AppChoice:
     else:
         filtered = choices
+    filtered.sort(key=_app_choice_sort_key)
     if not filtered:
         raise click.ClickException("No task apps discovered for this command.")
@@ -526,22 +775,90 @@ def _select_app_choice(app_id: str | None, purpose: str) -> AppChoice:
     return _prompt_user_for_choice(filtered)
-def _load_entry_from_path(path: Path, app_id: str) -> TaskAppEntry:
-    resolved = path.resolve()
-    module_name = f"_synth_task_app_{hashlib.md5(str(resolved).encode(), usedforsecurity=False).hexdigest()}"
+def _import_task_app_module(
+    resolved: Path,
+    module_name: str,
+    *,
+    namespace_root: Path | None,
+    sys_path_roots: Sequence[Path],
+    ensure_namespace: bool = True,
+) -> types.ModuleType:
     spec = importlib.util.spec_from_file_location(module_name, str(resolved))
     if spec is None or spec.loader is None:
         raise click.ClickException(f"Unable to load Python module from {resolved}")
     module = importlib.util.module_from_spec(spec)
     sys.modules[module_name] = module
-    # Clear registry before importing to avoid duplicate registration errors
-    registry.clear()
-    try:
-        spec.loader.exec_module(module)
-    except Exception as exc:
-        raise click.ClickException(f"Failed to import {resolved}: {exc}") from exc
+    with _temporary_sys_path(sys_path_roots):
+        if ensure_namespace and namespace_root is not None and "." in module_name:
+            _ensure_parent_namespace(module_name, namespace_root)
+        # Clear registry before importing to avoid duplicate registration errors
+        registry.clear()
+        try:
+            spec.loader.exec_module(module)
+        except Exception:
+            # Remove partially-imported module to avoid reuse
+            sys.modules.pop(module_name, None)
+            raise
+    return module
+def _load_entry_from_path(
+    path: Path, app_id: str, module_search_roots: Sequence[Path] | None = None
+) -> TaskAppEntry:
+    resolved = path.resolve()
+    search_roots: list[Path] = []
+    seen_roots: set[Path] = set()
+    def _append_root(candidate: Path) -> None:
+        try:
+            resolved_root = candidate.resolve()
+        except Exception:
+            return
+        if resolved_root in seen_roots:
+            return
+        seen_roots.add(resolved_root)
+        search_roots.append(resolved_root)
+    for root in module_search_roots or []:
+        _append_root(root)
+    _append_root(resolved.parent)
+    _append_root(REPO_ROOT)
+    last_error: Exception | None = None
+    module: types.ModuleType | None = None
+    for module_name, namespace_root in _possible_module_names(resolved, search_roots):
+        try:
+            module = _import_task_app_module(
+                resolved,
+                module_name,
+                namespace_root=namespace_root,
+                sys_path_roots=search_roots,
+                ensure_namespace=True,
+            )
+            break
+        except Exception as exc:  # pragma: no cover - best-effort fallbacks
+            last_error = exc
+            continue
+    if module is None:
+        hashed_name = f"_synth_task_app_{hashlib.md5(str(resolved).encode(), usedforsecurity=False).hexdigest()}"
+        try:
+            module = _import_task_app_module(
+                resolved,
+                hashed_name,
+                namespace_root=None,
+                sys_path_roots=search_roots,
+                ensure_namespace=False,
+            )
+        except Exception as exc:  # pragma: no cover - propagate meaningful error
+            detail = last_error or exc
+            raise click.ClickException(f"Failed to import {resolved}: {detail}") from detail
     config_obj: TaskAppConfig | None = None
     factory_callable: Callable[[], TaskAppConfig] | None = None
@@ -572,7 +889,11 @@ def _load_entry_from_path(path: Path, app_id: str) -> TaskAppEntry:
                 continue
             has_required = False
             for param in sig.parameters.values():
-                if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD) and param.default is inspect._empty:
+                if (
+                    param.kind
+                    in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
+                    and param.default is inspect._empty
+                ):
                     has_required = True
                     break
             if has_required:
@@ -582,9 +903,13 @@ def _load_entry_from_path(path: Path, app_id: str) -> TaskAppEntry:
             except Exception:
                 continue
             if isinstance(result, TaskAppConfig) and result.app_id == app_id:
-                def _factory() -> TaskAppConfig:
-                    return attr()  # type: ignore[call-arg]
-                factory_callable = _factory
+                # Bind attr to a local and close over it without exposing parameters
+                _bound_func: Callable[[], TaskAppConfig] = cast(Callable[[], TaskAppConfig], attr)  # type: ignore[assignment]
+                def _factory_noargs() -> TaskAppConfig:
+                    return _bound_func()
+                factory_callable = _factory_noargs
                 config_obj = result
                 break
@@ -608,7 +933,7 @@ def _load_entry_from_path(path: Path, app_id: str) -> TaskAppEntry:
         if isinstance(attr, ModalDeploymentConfig):
             modal_cfg = attr
             break
     # If no ModalDeploymentConfig found, try to detect it via AST parsing
     if modal_cfg is None:
         modal_cfg = _extract_modal_config_from_file(resolved)
@@ -640,31 +965,31 @@ def _resolve_env_paths_for_script(script_path: Path, explicit: Sequence[str]) ->
     # Always prompt for env file selection instead of auto-loading defaults
     script_dir = script_path.parent.resolve()
     cwd = Path.cwd()
     # Look for env files in current working directory first, then repo root
     env_candidates = []
     # Add CWD env files first (prioritized)
-    cwd_env_files = sorted(cwd.glob('**/*.env'))
+    cwd_env_files = sorted(cwd.glob("**/*.env"))
     env_candidates.extend(cwd_env_files)
     # Add repo root env files
-    repo_env_files = sorted(REPO_ROOT.glob('**/*.env'))
+    repo_env_files = sorted(REPO_ROOT.glob("**/*.env"))
     # Avoid duplicates
     for repo_file in repo_env_files:
         if repo_file not in env_candidates:
             env_candidates.append(repo_file)
     if not env_candidates:
         created = _interactive_create_env(script_dir)
         if created is None:
             raise click.ClickException("Env file required (--env-file) for this task app")
         return [created]
-    click.echo('Select env file to load:')
+    click.echo("Select env file to load:")
     for idx, path in enumerate(env_candidates, start=1):
-        click.echo(f"  {idx}) {path}")
-    choice = click.prompt('Enter choice', type=click.IntRange(1, len(env_candidates)))
+        click.echo(f"  {idx}) {path.resolve()}")
+    choice = click.prompt("Enter choice", type=click.IntRange(1, len(env_candidates)), default=1)
     return [env_candidates[choice - 1]]
@@ -694,17 +1019,47 @@ def _run_modal_script(
         click.echo("Dry run: " + " ".join(cmd))
         return
     try:
-        subprocess.run(cmd, check=True)
+        # Capture output to extract URL
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+        # Print output as it would normally appear
+        if result.stdout:
+            click.echo(result.stdout, nl=False)
+        if result.stderr:
+            click.echo(result.stderr, nl=False, err=True)
+        # Extract and save task app URL from output
+        task_app_url = None
+        for line in result.stdout.splitlines():
+            # Look for lines containing modal.run URLs
+            if "modal.run" in line and "=>" in line:
+                # Extract URL from lines like: "└── 🔨 Created web function fastapi_app => https://...modal.run"
+                parts = line.split("=>")
+                if len(parts) >= 2:
+                    task_app_url = parts[-1].strip()
+                    break
+        # Save URL to .env file if found
+        if task_app_url and env_paths_list:
+            env_file = env_paths_list[0]  # Use the first .env file
+            _save_to_env_file(env_file, "TASK_APP_BASE_URL", task_app_url)
+            click.echo(f"\n✓ Task app URL: {task_app_url}")
     except subprocess.CalledProcessError as exc:
-        raise click.ClickException(f"modal {command} failed with exit code {exc.returncode}") from exc
+        raise click.ClickException(
+            f"modal {command} failed with exit code {exc.returncode}"
+        ) from exc
 def _preflight_env_key(crash_on_failure: bool = False) -> None:
     try:
-        raw_backend = os.environ.get("BACKEND_BASE_URL") or os.environ.get("SYNTH_BASE_URL") or "http://localhost:8000/api"
-        backend_base = raw_backend.rstrip('/')
-        if not backend_base.endswith('/api'):
-            backend_base = backend_base + '/api'
+        raw_backend = (
+            os.environ.get("BACKEND_BASE_URL")
+            or os.environ.get("SYNTH_BASE_URL")
+            or f"{PROD_BASE_URL_DEFAULT}/api"
+        )
+        backend_base = raw_backend.rstrip("/")
+        if not backend_base.endswith("/api"):
+            backend_base = backend_base + "/api"
         synth_key = os.environ.get("SYNTH_API_KEY") or ""
         env_api_key = (
             os.environ.get("ENVIRONMENT_API_KEY")
@@ -727,20 +1082,49 @@ def _preflight_env_key(crash_on_failure: bool = False) -> None:
                     pub = PublicKey(base64.b64decode(pk, validate=True))
                     sb = SealedBox(pub)
-                    ct_b64 = base64.b64encode(sb.encrypt(env_api_key.encode('utf-8'))).decode()
+                    ct_b64 = base64.b64encode(sb.encrypt(env_api_key.encode("utf-8"))).decode()
                     payload = {"name": "ENVIRONMENT_API_KEY", "ciphertext_b64": ct_b64}
-                    with httpx.Client(timeout=15.0, headers={"Authorization": f"Bearer {synth_key}", "Content-Type": "application/json"}) as c:
+                    with httpx.Client(
+                        timeout=15.0,
+                        headers={
+                            "Authorization": f"Bearer {synth_key}",
+                            "Content-Type": "application/json",
+                        },
+                    ) as c:
                         click.echo("[preflight] upserting env key…")
                         up = c.post(f"{backend_base.rstrip('/')}/v1/env-keys", json=payload)
                         click.echo(f"[preflight] upsert status={up.status_code}")
-                        click.echo("[preflight] verifying env key presence…")
-                        ver = c.get(f"{backend_base.rstrip('/')}/v1/env-keys/verify")
-                        if ver.status_code == 200 and (ver.json() or {}).get("present"):
-                            # Show first and last 5 chars of the API key for verification
-                            key_preview = f"{env_api_key[:5]}...{env_api_key[-5:]}" if len(env_api_key) > 10 else env_api_key
-                            click.echo(f"✅ ENVIRONMENT_API_KEY upserted and verified in backend ({key_preview})")
+                        # If upload succeeded (2xx), consider it successful even if verification fails
+                        # This handles cases where verification endpoint has issues
+                        if 200 <= up.status_code < 300:
+                            key_preview = (
+                                f"{env_api_key[:5]}...{env_api_key[-5:]}"
+                                if len(env_api_key) > 10
+                                else env_api_key
+                            )
+                            click.echo(
+                                f"✅ ENVIRONMENT_API_KEY uploaded successfully ({key_preview})"
+                            )
+                            # Try verification, but don't fail if it doesn't work
+                            click.echo("[preflight] verifying env key presence…")
+                            try:
+                                ver = c.get(f"{backend_base.rstrip('/')}/v1/env-keys/verify")
+                                if ver.status_code == 200 and (ver.json() or {}).get("present"):
+                                    click.echo("✅ Key verified in backend")
+                                else:
+                                    click.echo(
+                                        f"⚠️  Verification returned {ver.status_code}, but upload succeeded - proceeding"
+                                    )
+                            except Exception as verify_err:
+                                click.echo(
+                                    f"⚠️  Verification check failed ({verify_err}), but upload succeeded - proceeding"
+                                )
                         else:
-                            error_msg = "ENVIRONMENT_API_KEY verification failed"
+                            error_msg = (
+                                f"ENVIRONMENT_API_KEY upload failed with status {up.status_code}"
+                            )
                             if crash_on_failure:
                                 raise click.ClickException(f"[CRITICAL] {error_msg}")
                             click.echo(f"[WARN] {error_msg}; proceeding anyway")
@@ -794,15 +1178,39 @@ def _run_modal_with_entry(
         return
     try:
-        subprocess.run(cmd, check=True)
+        # Capture output to extract URL
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+        # Print output as it would normally appear
+        if result.stdout:
+            click.echo(result.stdout, nl=False)
+        if result.stderr:
+            click.echo(result.stderr, nl=False, err=True)
+        # Extract and save task app URL from output
+        task_app_url = None
+        for line in result.stdout.splitlines():
+            # Look for lines containing modal.run URLs
+            if "modal.run" in line and "=>" in line:
+                # Extract URL from lines like: "└── 🔨 Created web function fastapi_app => https://...modal.run"
+                parts = line.split("=>")
+                if len(parts) >= 2:
+                    task_app_url = parts[-1].strip()
+                    break
+        # Save URL to .env file if found
+        if task_app_url and env_paths_list:
+            env_file = env_paths_list[0]  # Use the first .env file
+            _save_to_env_file(env_file, "TASK_APP_BASE_URL", task_app_url)
+            click.echo(f"\n✓ Task app URL: {task_app_url}")
     except subprocess.CalledProcessError as exc:
-        raise click.ClickException(f"modal {command} failed with exit code {exc.returncode}") from exc
+        raise click.ClickException(
+            f"modal {command} failed with exit code {exc.returncode}"
+        ) from exc
     finally:
         script_path.unlink(missing_ok=True)
 def _load_env_values(paths: list[Path], *, allow_empty: bool = False) -> dict[str, str]:
     values: dict[str, str] = {}
     for p in paths:
@@ -811,15 +1219,17 @@ def _load_env_values(paths: list[Path], *, allow_empty: bool = False) -> dict[st
         except FileNotFoundError:
             continue
         for line in content.splitlines():
-            if not line or line.lstrip().startswith('#') or '=' not in line:
+            if not line or line.lstrip().startswith("#") or "=" not in line:
                 continue
-            key, value = line.split('=', 1)
+            key, value = line.split("=", 1)
             if key and key not in values:
                 values[key.strip()] = value.strip()
     if not allow_empty and not values:
         raise click.ClickException("No environment values found")
     os.environ.update({k: v for k, v in values.items() if k and v})
     return values
 def _interactive_create_env(target_dir: Path) -> Path | None:
     env_path = (target_dir / ".env").resolve()
     if env_path.exists():
@@ -838,9 +1248,9 @@ def _parse_env_file(path: Path) -> dict[str, str]:
     data: dict[str, str] = {}
     try:
         for line in path.read_text(encoding="utf-8").splitlines():
-            if not line or line.lstrip().startswith('#') or '=' not in line:
+            if not line or line.lstrip().startswith("#") or "=" not in line:
                 continue
-            key, value = line.split('=', 1)
+            key, value = line.split("=", 1)
             data[key.strip()] = value.strip()
     except FileNotFoundError:
         pass
@@ -853,7 +1263,9 @@ def _interactive_fill_env(env_path: Path) -> Path | None:
     def _prompt(label: str, *, default: str = "", required: bool) -> str | None:
         while True:
             try:
-                value = click.prompt(label, default=default, show_default=bool(default) or not required).strip()
+                value = click.prompt(
+                    label, default=default, show_default=bool(default) or not required
+                ).strip()
             except (click.exceptions.Abort, EOFError, KeyboardInterrupt):
                 click.echo("Aborted env creation.")
                 return None
@@ -904,11 +1316,22 @@ def _deploy_entry(
 ) -> None:
     modal_cfg = entry.modal
     if modal_cfg is None:
-        raise click.ClickException(f"Task app '{entry.app_id}' does not define Modal deployment settings")
+        raise click.ClickException(
+            f"Task app '{entry.app_id}' does not define Modal deployment settings"
+        )
     env_paths = _determine_env_files(entry, env_file)
-    click.echo('Using env file(s): ' + ', '.join(str(p) for p in env_paths))
-    _run_modal_with_entry(entry, modal_cfg, modal_cli, modal_name, env_paths, command="deploy", dry_run=dry_run, original_path=original_path)
+    click.echo("Using env file(s): " + ", ".join(str(p.resolve()) for p in env_paths))
+    _run_modal_with_entry(
+        entry,
+        modal_cfg,
+        modal_cli,
+        modal_name,
+        env_paths,
+        command="deploy",
+        dry_run=dry_run,
+        original_path=original_path,
+    )
 def _modal_serve_entry(
@@ -920,21 +1343,29 @@ def _modal_serve_entry(
 ) -> None:
     modal_cfg = entry.modal
     if modal_cfg is None:
-        raise click.ClickException(f"Task app '{entry.app_id}' does not define Modal deployment settings")
+        raise click.ClickException(
+            f"Task app '{entry.app_id}' does not define Modal deployment settings"
+        )
     env_paths = _determine_env_files(entry, env_file)
-    click.echo('Using env file(s): ' + ', '.join(str(p) for p in env_paths))
-    _run_modal_with_entry(entry, modal_cfg, modal_cli, modal_name, env_paths, command="serve", original_path=original_path)
+    click.echo("Using env file(s): " + ", ".join(str(p.resolve()) for p in env_paths))
+    _run_modal_with_entry(
+        entry,
+        modal_cfg,
+        modal_cli,
+        modal_name,
+        env_paths,
+        command="serve",
+        original_path=original_path,
+    )
-@click.group(
-    name='task-app',
-    help='Utilities for serving and deploying Synth task apps.'
-)
+@click.group(name="task-app", help="Utilities for serving and deploying Synth task apps.")
 def task_app_group() -> None:
     pass
-@task_app_group.command('list')
+@task_app_group.command("list")
 def list_apps() -> None:
     """List registered task apps."""
@@ -945,6 +1376,8 @@ def list_apps() -> None:
     for entry in entries:
         aliases = f" (aliases: {', '.join(entry.aliases)})" if entry.aliases else ""
         click.echo(f"- {entry.app_id}{aliases}: {entry.description}")
 def _load_env_files_into_process(paths: Sequence[str]) -> None:
     for p in paths:
         try:
@@ -952,9 +1385,9 @@ def _load_env_files_into_process(paths: Sequence[str]) -> None:
         except Exception:
             continue
         for line in txt.splitlines():
-            if not line or line.startswith('#') or '=' not in line:
+            if not line or line.startswith("#") or "=" not in line:
                 continue
-            k, v = line.split('=', 1)
+            k, v = line.split("=", 1)
             key = k.strip()
             val = v.strip().strip('"').strip("'")
             # Load into process, but allow overriding if the current value is empty
@@ -964,53 +1397,181 @@ def _load_env_files_into_process(paths: Sequence[str]) -> None:
                     os.environ[key] = val
-@click.command('serve')
-@click.argument('app_id', type=str, required=False)
-@click.option('--host', default='0.0.0.0', show_default=True)
-@click.option('--port', default=8001, show_default=True, type=int)
-@click.option('--env-file', multiple=True, type=click.Path(), help='Extra .env files to load')
-@click.option('--reload/--no-reload', 'reload_flag', default=False, help='Enable uvicorn auto-reload')
-@click.option('--force/--no-force', 'force', default=False, help='Kill any process already bound to the selected port before starting')
-@click.option('--trace', 'trace_dir', type=click.Path(), default=None, help='Enable tracing and write SFT JSONL files to this directory')
-@click.option('--trace-db', 'trace_db', type=click.Path(), default=None, help='Override local trace DB path (maps to SQLD_DB_PATH)')
+@click.command("serve")
+@click.argument("app_id", type=str, required=False)
+@click.option("--host", default="0.0.0.0", show_default=True)
+@click.option("--port", default=None, type=int, help="Port to serve on (default: 8001)")
+@click.option("--env-file", multiple=True, type=click.Path(), help="Extra .env files to load")
+@click.option(
+    "--reload/--no-reload", "reload_flag", default=False, help="Enable uvicorn auto-reload"
+)
+@click.option(
+    "--force/--no-force",
+    "force",
+    default=False,
+    help="Kill any process already bound to the selected port before starting",
+)
+@click.option(
+    "--trace",
+    "trace_dir",
+    type=click.Path(),
+    default=None,
+    help="Enable tracing and write SFT JSONL files to this directory (default: traces/v3)",
+)
+@click.option(
+    "--trace-db",
+    "trace_db",
+    type=click.Path(),
+    default=None,
+    help="Override local trace DB path (default: traces/v3/synth_ai.db)",
+)
 def serve_command(
     app_id: str | None,
     host: str,
-    port: int,
+    port: int | None,
     env_file: Sequence[str],
     reload_flag: bool,
     force: bool,
     trace_dir: str | None,
     trace_db: str | None,
 ) -> None:
+    # Change to demo directory if stored (REQUIRED for demo isolation)
+    from synth_ai.demos.demo_task_apps.core import load_demo_dir
+    demo_dir = load_demo_dir()
+    if demo_dir:
+        demo_path = Path(demo_dir)
+        if not demo_path.is_dir():
+            raise click.ClickException(
+                f"Demo directory not found: {demo_dir}\nRun 'synth-ai setup' to create a demo."
+            )
+        os.chdir(demo_dir)
+        click.echo(f"Using demo directory: {demo_dir}\n")
+        # Store demo directory for path resolution
+        os.environ["SYNTH_DEMO_DIR"] = str(demo_path.resolve())
+    # Prompt for port if not provided
+    if port is None:
+        port = click.prompt("Port to serve on", type=int, default=8001)
+    # Prompt for trace directory if not provided
+    if trace_dir is None:
+        click.echo(
+            "\nTracing captures rollout data (actions, rewards, model outputs) to a local SQLite DB."
+        )
+        click.echo("This data can be exported to JSONL for supervised fine-tuning (SFT).")
+        enable_tracing = click.confirm("Enable tracing?", default=True)
+        if enable_tracing:
+            demo_base = Path(os.environ.get("SYNTH_DEMO_DIR") or Path.cwd())
+            default_trace_dir = str((demo_base / "traces/v3").resolve())
+            trace_dir = click.prompt(
+                "Trace directory", type=str, default=default_trace_dir, show_default=True
+            )
+        else:
+            trace_dir = None
+    # Prompt for trace DB if not provided and tracing is enabled
+    if trace_dir and trace_db is None:
+        demo_base = Path(os.environ.get("SYNTH_DEMO_DIR") or Path.cwd())
+        default_trace_db = str((demo_base / "traces/v3/synth_ai.db").resolve())
+        trace_db = click.prompt(
+            "Trace DB path", type=str, default=default_trace_db, show_default=True
+        )
     choice = _select_app_choice(app_id, purpose="serve")
     entry = choice.ensure_entry()
-    _serve_entry(entry, host, port, env_file, reload_flag, force, trace_dir=trace_dir, trace_db=trace_db)
-@task_app_group.command('serve')
-@click.argument('app_id', type=str, required=False)
-@click.option('--host', default='0.0.0.0', show_default=True)
-@click.option('--port', default=8001, show_default=True, type=int)
-@click.option('--env-file', multiple=True, type=click.Path(), help='Extra .env files to load')
-@click.option('--reload/--no-reload', 'reload_flag', default=False, help='Enable uvicorn auto-reload')
-@click.option('--force/--no-force', 'force', default=False, help='Kill any process already bound to the selected port before starting')
-@click.option('--trace', 'trace_dir', type=click.Path(), default=None, help='Enable tracing and write SFT JSONL files to this directory')
-@click.option('--trace-db', 'trace_db', type=click.Path(), default=None, help='Override local trace DB path (maps to SQLD_DB_PATH)')
+    _serve_entry(
+        entry, host, port, env_file, reload_flag, force, trace_dir=trace_dir, trace_db=trace_db
+    )
+@task_app_group.command("serve")
+@click.argument("app_id", type=str, required=False)
+@click.option("--host", default="0.0.0.0", show_default=True)
+@click.option("--port", default=None, type=int, help="Port to serve on (default: 8001)")
+@click.option("--env-file", multiple=True, type=click.Path(), help="Extra .env files to load")
+@click.option(
+    "--reload/--no-reload", "reload_flag", default=False, help="Enable uvicorn auto-reload"
+)
+@click.option(
+    "--force/--no-force",
+    "force",
+    default=False,
+    help="Kill any process already bound to the selected port before starting",
+)
+@click.option(
+    "--trace",
+    "trace_dir",
+    type=click.Path(),
+    default=None,
+    help="Enable tracing and write SFT JSONL files to this directory (default: traces/v3)",
+)
+@click.option(
+    "--trace-db",
+    "trace_db",
+    type=click.Path(),
+    default=None,
+    help="Override local trace DB path (default: traces/v3/synth_ai.db)",
+)
 def serve_task_group(
     app_id: str | None,
     host: str,
-    port: int,
+    port: int | None,
     env_file: Sequence[str],
     reload_flag: bool,
     force: bool,
     trace_dir: str | None,
     trace_db: str | None,
 ) -> None:
+    # Change to demo directory if stored (REQUIRED for demo isolation)
+    from synth_ai.demos.demo_task_apps.core import load_demo_dir
+    demo_dir = load_demo_dir()
+    if demo_dir:
+        demo_path = Path(demo_dir)
+        if not demo_path.is_dir():
+            raise click.ClickException(
+                f"Demo directory not found: {demo_dir}\nRun 'synth-ai setup' to create a demo."
+            )
+        os.chdir(demo_dir)
+        click.echo(f"Using demo directory: {demo_dir}\n")
+        # Store demo directory for path resolution
+        os.environ["SYNTH_DEMO_DIR"] = str(demo_path.resolve())
+    # Prompt for port if not provided
+    if port is None:
+        port = click.prompt("Port to serve on", type=int, default=8001)
+    # Prompt for trace directory if not provided
+    if trace_dir is None:
+        click.echo(
+            "\nTracing captures rollout data (actions, rewards, model outputs) to a local SQLite DB."
+        )
+        click.echo("This data can be exported to JSONL for supervised fine-tuning (SFT).")
+        enable_tracing = click.confirm("Enable tracing?", default=True)
+        if enable_tracing:
+            demo_base = Path(os.environ.get("SYNTH_DEMO_DIR") or Path.cwd())
+            default_trace_dir = str((demo_base / "traces/v3").resolve())
+            trace_dir = click.prompt(
+                "Trace directory", type=str, default=default_trace_dir, show_default=True
+            )
+        else:
+            trace_dir = None
+    # Prompt for trace DB if not provided and tracing is enabled
+    if trace_dir and trace_db is None:
+        demo_base = Path(os.environ.get("SYNTH_DEMO_DIR") or Path.cwd())
+        default_trace_db = str((demo_base / "traces/v3/synth_ai.db").resolve())
+        trace_db = click.prompt(
+            "Trace DB path", type=str, default=default_trace_db, show_default=True
+        )
     choice = _select_app_choice(app_id, purpose="serve")
     entry = choice.ensure_entry()
-    _serve_entry(entry, host, port, env_file, reload_flag, force, trace_dir=trace_dir, trace_db=trace_db)
+    _serve_entry(
+        entry, host, port, env_file, reload_flag, force, trace_dir=trace_dir, trace_db=trace_db
+    )
 def _determine_env_files(entry: TaskAppEntry, user_env_files: Sequence[str]) -> list[Path]:
     resolved: list[Path] = []
@@ -1026,25 +1587,25 @@ def _determine_env_files(entry: TaskAppEntry, user_env_files: Sequence[str]) ->
     # Look for env files in current working directory first, then repo root
     cwd = Path.cwd()
     env_candidates = []
     # Add CWD env files first (prioritized)
-    cwd_env_files = sorted(cwd.glob('**/*.env'))
+    cwd_env_files = sorted(cwd.glob("**/*.env"))
     env_candidates.extend(cwd_env_files)
     # Add repo root env files
-    repo_env_files = sorted(REPO_ROOT.glob('**/*.env'))
+    repo_env_files = sorted(REPO_ROOT.glob("**/*.env"))
     # Avoid duplicates
     for repo_file in repo_env_files:
         if repo_file not in env_candidates:
             env_candidates.append(repo_file)
     if not env_candidates:
-        raise click.ClickException('No env file found. Pass --env-file explicitly.')
+        raise click.ClickException("No env file found. Pass --env-file explicitly.")
-    click.echo('Select env file to load:')
+    click.echo("Select env file to load:")
     for idx, path in enumerate(env_candidates, start=1):
-        click.echo(f"  {idx}) {path}")
-    choice = click.prompt('Enter choice', type=click.IntRange(1, len(env_candidates)))
+        click.echo(f"  {idx}) {path.resolve()}")
+    choice = click.prompt("Enter choice", type=click.IntRange(1, len(env_candidates)), default=1)
     return [env_candidates[choice - 1]]
@@ -1060,7 +1621,9 @@ def _ensure_port_free(port: int, host: str, *, force: bool) -> None:
         return
     try:
-        out = subprocess.run(["lsof", "-ti", f"TCP:{port}"], capture_output=True, text=True, check=False)
+        out = subprocess.run(
+            ["lsof", "-ti", f"TCP:{port}"], capture_output=True, text=True, check=False
+        )
         pids = [pid for pid in out.stdout.strip().splitlines() if pid]
     except FileNotFoundError:
         pids = []
@@ -1075,7 +1638,7 @@ def _ensure_port_free(port: int, host: str, *, force: bool) -> None:
         try:
             os.kill(int(pid), signal.SIGTERM)
         except Exception as exc:
-            raise click.ClickException(f'Failed to terminate PID {pid}: {exc}')
+            raise click.ClickException(f"Failed to terminate PID {pid}: {exc}")
     time.sleep(0.5)
@@ -1087,13 +1650,113 @@ def _ensure_port_free(port: int, host: str, *, force: bool) -> None:
             try:
                 os.kill(int(pid), signal.SIGKILL)
             except Exception as exc:
-                raise click.ClickException(f'Failed to force terminate PID {pid}: {exc}')
+                raise click.ClickException(f"Failed to force terminate PID {pid}: {exc}")
         time.sleep(0.5)
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         in_use_after = s.connect_ex((host, port)) == 0
     if in_use_after:
-        raise click.ClickException(f'Port {port} is still in use after attempting to terminate processes.')
+        raise click.ClickException(
+            f"Port {port} is still in use after attempting to terminate processes."
+        )
+def _save_to_env_file(env_path: Path, key: str, value: str) -> None:
+    """Save or update a key-value pair in the .env file.
+    Every existing line that starts with ``<key>=`` (ignoring surrounding whitespace)
+    is rewritten as ``<key>=<value>``; when no such line exists the pair is appended.
+    Failures are reported as a warning on stderr instead of being raised.
+    """
+    try:
+        # Read and write as UTF-8 explicitly so the result does not depend on the locale.
+        existing_text = env_path.read_text(encoding="utf-8") if env_path.exists() else ""
+        existing_lines = existing_text.splitlines()
+        assignment_prefix = f"{key}="
+        key_updated = any(
+            line.strip().startswith(assignment_prefix) for line in existing_lines
+        )
+        if key_updated:
+            new_lines = [
+                f"{key}={value}" if line.strip().startswith(assignment_prefix) else line
+                for line in existing_lines
+            ]
+            env_path.write_text("\n".join(new_lines) + "\n", encoding="utf-8")
+            click.echo(f"Updated {key} in {env_path}")
+        else:
+            with open(env_path, "a", encoding="utf-8") as f:
+                # splitlines() hides a trailing newline, so inspect the raw text: a
+                # separator is only needed when the file ends in the middle of a line.
+                # (Checking the last split line instead adds a blank line on every append.)
+                if existing_text and not existing_text.endswith("\n"):
+                    f.write("\n")
+                f.write(f"{key}={value}\n")
+            click.echo(f"Saved {key} to {env_path}")
+    except Exception as e:
+        # Best effort: failing to persist the value must not abort the calling command.
+        click.echo(f"Warning: Could not save {key} to .env: {e}", err=True)
+def _validate_required_env_keys() -> None:
+    """Ensure the API keys needed by the server are present, asking on stdin if not.
+    ENVIRONMENT_API_KEY is mandatory: an empty answer raises ``click.ClickException``.
+    GROQ_API_KEY is optional: an empty answer skips it. Any key typed in is exported
+    to ``os.environ`` and persisted to the ``.env`` file via ``_save_to_env_file``.
+    """
+    # The .env file lives in SYNTH_DEMO_DIR when that is set, else in the working directory
+    dotenv_path = Path(os.environ.get("SYNTH_DEMO_DIR") or Path.cwd()) / ".env"
+    if dotenv_path.exists():
+        try:
+            from dotenv import load_dotenv
+            # override=False: values already in the process environment win
+            load_dotenv(dotenv_path, override=False)
+        except Exception:
+            pass  # Best effort
+    if not os.environ.get("ENVIRONMENT_API_KEY", "").strip():
+        entered_env_key = input("Please enter your RL Environment API key:\n> ").strip()
+        if not entered_env_key:
+            raise click.ClickException("RL Environment API key is required to start the server")
+        os.environ["ENVIRONMENT_API_KEY"] = entered_env_key
+        _save_to_env_file(dotenv_path, "ENVIRONMENT_API_KEY", entered_env_key)
+    # Groq key: nothing more to do when it is already configured
+    if os.environ.get("GROQ_API_KEY", "").strip():
+        return
+    click.echo("\nInference API key configuration:")
+    click.echo("This workflow requires a Groq API key.")
+    entered_groq_key = input("Groq API key (or press Enter to skip): ").strip()
+    if not entered_groq_key:
+        return
+    os.environ["GROQ_API_KEY"] = entered_groq_key
+    _save_to_env_file(dotenv_path, "GROQ_API_KEY", entered_groq_key)
+def _print_demo_next_steps_if_applicable() -> None:
+    """Echo a "collect traced rollouts" hint when run from the stored demo directory.
+    The hint is printed only if the current working directory equals the directory
+    returned by ``load_demo_dir()`` and contains ``run_local_rollout_traced.py``.
+    Any exception is swallowed, since the printout is purely informational.
+    """
+    try:
+        from synth_ai.demos.demo_task_apps.core import load_demo_dir
+        here = Path.cwd().resolve()
+        stored_demo_dir = load_demo_dir()
+        # Guard: only act when we are sitting in the stored demo directory
+        if not stored_demo_dir or Path(stored_demo_dir).resolve() != here:
+            return
+        # Guard: the rollout script is the marker used to recognise the crafter demo
+        if not (here / "run_local_rollout_traced.py").exists():
+            return
+        rule = "=" * 60
+        messages = (
+            "\n" + rule,
+            "Next step: Collect traced rollouts",
+            rule,
+            "\nIn another terminal, run:",
+            f"  cd {here}",
+            "  uv run python run_local_rollout_traced.py",
+            "\nRun this 5-10 times to collect diverse traces.",
+            rule + "\n",
+        )
+        for message in messages:
+            click.echo(message)
+    except Exception:
+        # Silently fail - this is just a helpful printout
+        pass
 def _serve_entry(
     entry: TaskAppEntry,
@@ -1111,34 +1774,51 @@ def _serve_entry(
     trace_enabled = trace_dir is not None or trace_db is not None
     if trace_enabled:
-        os.environ['TASKAPP_TRACING_ENABLED'] = '1'
+        os.environ["TASKAPP_TRACING_ENABLED"] = "1"
+        # Ensure paths are absolute relative to demo directory
+        demo_base = Path(os.environ.get("SYNTH_DEMO_DIR") or Path.cwd())
         if trace_dir is not None:
             dir_path = Path(trace_dir).expanduser()
+            if not dir_path.is_absolute():
+                dir_path = (demo_base / dir_path).resolve()
             try:
                 dir_path.mkdir(parents=True, exist_ok=True)
             except Exception as exc:
-                raise click.ClickException(f"Failed to create trace directory {dir_path}: {exc}") from exc
-            os.environ['TASKAPP_SFT_OUTPUT_DIR'] = str(dir_path)
+                raise click.ClickException(
+                    f"Failed to create trace directory {dir_path}: {exc}"
+                ) from exc
+            os.environ["TASKAPP_SFT_OUTPUT_DIR"] = str(dir_path)
             click.echo(f"Tracing enabled. SFT JSONL will be written to {dir_path}")
         if trace_db is not None:
             db_path = Path(trace_db).expanduser()
-            os.environ['SQLD_DB_PATH'] = str(db_path)
-            os.environ.pop('TURSO_LOCAL_DB_URL', None)
+            if not db_path.is_absolute():
+                db_path = (demo_base / db_path).resolve()
+            # Construct the sqlite URL from the absolute path
+            db_url = f"sqlite+aiosqlite:///{db_path}"
+            os.environ["SQLD_DB_PATH"] = str(db_path)
+            os.environ["TURSO_LOCAL_DB_URL"] = db_url
             click.echo(f"Tracing DB path set to {db_path}")
         from synth_ai.tracing_v3.config import CONFIG as TRACE_CONFIG
-        # recompute db_url based on current environment
-        new_db_url = os.getenv('TURSO_LOCAL_DB_URL') or TRACE_CONFIG.db_url
+        # Use the explicitly set URL if available
+        new_db_url = os.getenv("TURSO_LOCAL_DB_URL") or TRACE_CONFIG.db_url
         TRACE_CONFIG.db_url = new_db_url
         if new_db_url:
-            os.environ['TURSO_LOCAL_DB_URL'] = new_db_url
             click.echo(f"Tracing DB URL resolved to {new_db_url}")
-    elif os.getenv('TASKAPP_TRACING_ENABLED'):
+    elif os.getenv("TASKAPP_TRACING_ENABLED"):
         click.echo("Tracing enabled via environment variables")
     _ensure_port_free(port, host, force=force)
+    _validate_required_env_keys()
     _preflight_env_key()
+    # Print next steps if in demo context
+    if trace_enabled:
+        _print_demo_next_steps_if_applicable()
     run_task_app(
         entry.config_factory,
         host=host,
@@ -1148,37 +1828,76 @@ def _serve_entry(
     )
-@task_app_group.command('deploy')
+@task_app_group.command("deploy")
 @click.argument("app_id", type=str, required=False)
 @click.option("--name", "modal_name", default=None, help="Override Modal app name")
 @click.option("--dry-run", is_flag=True, help="Print modal deploy command without executing")
 @click.option("--modal-cli", default="modal", help="Path to modal CLI executable")
-@click.option('--env-file', multiple=True, type=click.Path(), help='Env file to load into the container (can be repeated)')
-def deploy_app(app_id: str | None, modal_name: str | None, dry_run: bool, modal_cli: str, env_file: Sequence[str]) -> None:
+@click.option(
+    "--env-file",
+    multiple=True,
+    type=click.Path(),
+    help="Env file to load into the container (can be repeated)",
+)
+def deploy_app(
+    app_id: str | None,
+    modal_name: str | None,
+    dry_run: bool,
+    modal_cli: str,
+    env_file: Sequence[str],
+) -> None:
     """Deploy a task app to Modal."""
+    # Change to demo directory if stored (for consistent discovery)
+    from synth_ai.demos.demo_task_apps.core import load_demo_dir
+    demo_dir = load_demo_dir()
+    if demo_dir:
+        demo_path = Path(demo_dir)
+        # Fail early, with a hint, when the stored demo directory no longer exists.
+        if not demo_path.is_dir():
+            raise click.ClickException(
+                f"Demo directory not found: {demo_dir}\nRun 'synth-ai demo' to create a demo."
+            )
+        # NOTE: os.chdir changes the working directory for the rest of the process,
+        # so every relative path resolved below is relative to the demo directory.
+        os.chdir(demo_dir)
+        click.echo(f"Using demo directory: {demo_dir}\n")
     choice = _select_app_choice(app_id, purpose="deploy")
+    # Script-backed choice: resolve env files, hand off to _run_modal_script, then stop.
     if choice.modal_script:
         env_paths = _resolve_env_paths_for_script(choice.modal_script, env_file)
-        click.echo('Using env file(s): ' + ', '.join(str(p) for p in env_paths))
-        _run_modal_script(choice.modal_script, modal_cli, "deploy", env_paths, modal_name=modal_name, dry_run=dry_run)
+        click.echo("Using env file(s): " + ", ".join(str(p.resolve()) for p in env_paths))
+        _run_modal_script(
+            choice.modal_script,
+            modal_cli,
+            "deploy",
+            env_paths,
+            modal_name=modal_name,
+            dry_run=dry_run,
+        )
         return
+    # Otherwise obtain the entry from the choice and deploy it via _deploy_entry.
     entry = choice.ensure_entry()
     _deploy_entry(entry, modal_name, dry_run, modal_cli, env_file, original_path=choice.path)
-@task_app_group.command('modal-serve')
-@click.argument('app_id', type=str, required=False)
-@click.option('--modal-cli', default='modal', help='Path to modal CLI executable')
-@click.option('--name', 'modal_name', default=None, help='Override Modal app name (optional)')
-@click.option('--env-file', multiple=True, type=click.Path(), help='Env file to load into the container (can be repeated)')
-def modal_serve_app(app_id: str | None, modal_cli: str, modal_name: str | None, env_file: Sequence[str]) -> None:
+@task_app_group.command("modal-serve")
+@click.argument("app_id", type=str, required=False)
+@click.option("--modal-cli", default="modal", help="Path to modal CLI executable")
+@click.option("--name", "modal_name", default=None, help="Override Modal app name (optional)")
+@click.option(
+    "--env-file",
+    multiple=True,
+    type=click.Path(),
+    help="Env file to load into the container (can be repeated)",
+)
+def modal_serve_app(
+    app_id: str | None, modal_cli: str, modal_name: str | None, env_file: Sequence[str]
+) -> None:
     choice = _select_app_choice(app_id, purpose="modal-serve")
     if choice.modal_script:
         env_paths = _resolve_env_paths_for_script(choice.modal_script, env_file)
-        click.echo('Using env file(s): ' + ', '.join(str(p) for p in env_paths))
+        click.echo("Using env file(s): " + ", ".join(str(p.resolve()) for p in env_paths))
         _run_modal_script(choice.modal_script, modal_cli, "serve", env_paths, modal_name=modal_name)
         return
@@ -1203,7 +1922,8 @@ def _write_modal_entrypoint(
         try:
             # Build lookup of local->remote mounts
             mount_map: list[tuple[Path, Path]] = [
-                (Path(local).resolve(), Path(remote)) for (local, remote) in modal_cfg.extra_local_dirs
+                (Path(local).resolve(), Path(remote))
+                for (local, remote) in modal_cfg.extra_local_dirs
             ]
             orig = Path(original_path).resolve()
             for local_src, remote_dst in mount_map:
@@ -1220,12 +1940,51 @@ def _write_modal_entrypoint(
         except Exception:
             remote_file_str = None
     module_name = entry.config_factory.__module__
+    # Prefer a guaranteed mount for the discovered file to avoid package import issues
+    guaranteed_file_str: str | None = None
+    if original_path:
+        guaranteed_file_str = str(
+            (Path("/opt/synth_ai_repo/__local_task_app__") / Path(original_path).stem).with_suffix(
+                ".py"
+            )
+        )
     dotenv_paths = [str(Path(path)) for path in (dotenv_paths or [])]
     pip_packages = list(modal_cfg.pip_packages)
+    # Ensure synth-ai (matching host version if available) is installed in the container
+    synth_pkg = "synth-ai"
+    try:
+        import synth_ai as _host_synth
+        host_ver = getattr(_host_synth, "__version__", None)
+        if host_ver:
+            synth_pkg = f"synth-ai=={host_ver}"
+    except Exception:
+        pass
+    if not any(str(p).startswith("synth-ai") for p in pip_packages):
+        pip_packages.insert(0, synth_pkg)
     local_dirs = [(str(Path(src)), dst) for src, dst in modal_cfg.extra_local_dirs]
+    # Also mount the host synth_ai source if available to ensure latest code is used
+    try:
+        import synth_ai as _host_synth
+        host_synth_dir = Path(_host_synth.__file__).resolve().parent
+        # host_synth_dir points to .../synth_ai; mount that directory
+        sy_dst = "/opt/synth_ai_repo/synth_ai"
+        candidate = (str(host_synth_dir), sy_dst)
+        if candidate not in local_dirs:
+            local_dirs.insert(0, candidate)
+    except Exception:
+        pass
+    # Ensure the discovered app directory is mounted, regardless of modal_cfg
+    if original_path:
+        discovered_dir = str(Path(original_path).resolve().parent)
+        mount_dst = "/opt/synth_ai_repo/__local_task_app__"
+        if (discovered_dir, mount_dst) not in local_dirs:
+            local_dirs.append((discovered_dir, mount_dst))
     secret_names = list(modal_cfg.secret_names)
     volume_mounts = [(name, mount) for name, mount in modal_cfg.volume_mounts]
@@ -1234,17 +1993,21 @@ def _write_modal_entrypoint(
 import importlib
 import importlib.util
 import sys
+import os
+import shutil
+import tempfile
+from pathlib import Path as _Path
+import fnmatch
 sys.path.insert(0, '/opt/synth_ai_repo')
 from modal import App, Image, Secret, Volume, asgi_app
-from synth_ai.task.apps import registry
-from synth_ai.task.server import create_task_app
+ # Defer importing synth_ai until inside fastapi_app to avoid local import errors
 ENTRY_ID = {entry.app_id!r}
 MODAL_APP_NAME = {modal_name!r}
 MODULE_NAME = {module_name!r}
-MODULE_FILE = {remote_file_str!r}
+MODULE_FILE = {guaranteed_file_str or remote_file_str!r}
 DOTENV_PATHS = {dotenv_paths!r}
 image = Image.debian_slim(python_version={modal_cfg.python_version!r})
@@ -1254,8 +2017,37 @@ if pip_packages:
     image = image.pip_install(*pip_packages)
 local_dirs = {local_dirs!r}
+def _copy_tree_filtered(src_dir: str) -> str:
+    src = _Path(src_dir)
+    temp_dir = _Path(tempfile.mkdtemp(prefix='synth_mount_'))
+    exclude_dirs = {".cache", ".git", "__pycache__"}
+    exclude_globs = ['*.db', '*.db-journal', '*-wal', '*-shm']
+    for root, dirs, files in os.walk(src):
+        rel_root = _Path(root).relative_to(src)
+        # filter dirs in-place
+        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+        # ensure target directory exists
+        target_dir = (temp_dir / rel_root)
+        target_dir.mkdir(parents=True, exist_ok=True)
+        # copy files with filtering
+        for name in files:
+            if any(fnmatch.fnmatch(name, pat) for pat in exclude_globs):
+                continue
+            src_file = _Path(root) / name
+            dst_file = target_dir / name
+            try:
+                shutil.copy2(src_file, dst_file)
+            except Exception:
+                # ignore problematic files
+                continue
+    return str(temp_dir)
 for local_src, remote_dst in local_dirs:
-    image = image.add_local_dir(local_src, remote_dst)
+    safe_src = _copy_tree_filtered(local_src)
+    image = image.add_local_dir(safe_src, remote_dst)
 secrets = {secret_names!r}
 secret_objs = [Secret.from_name(name) for name in secrets]
@@ -1268,24 +2060,6 @@ volume_map = {{}}
 for vol_name, mount_path in volume_mounts:
     volume_map[mount_path] = Volume.from_name(vol_name, create_if_missing=True)
-# Import the module to trigger registration
-if MODULE_FILE:
-    spec = importlib.util.spec_from_file_location(MODULE_NAME or 'task_app_module', MODULE_FILE)
-    if spec and spec.loader:
-        mod = importlib.util.module_from_spec(spec)
-        sys.modules[MODULE_NAME or 'task_app_module'] = mod
-        spec.loader.exec_module(mod)
-    else:
-        raise RuntimeError("Failed to import task app from file: " + str(MODULE_FILE))
-else:
-    importlib.import_module(MODULE_NAME)
-# Get the entry from registry (now that it's registered)
-entry = registry.get(ENTRY_ID)
-modal_cfg = entry.modal
-if modal_cfg is None:
-    raise RuntimeError("Modal configuration missing for task app {entry.app_id}")
 app = App(MODAL_APP_NAME)
 @app.function(
@@ -1300,6 +2074,47 @@ app = App(MODAL_APP_NAME)
 )
 @asgi_app()
 def fastapi_app():
+    # Import the module to trigger registration (inside container)
+    import os
+    # Prefer mounted source over any preinstalled site-packages version
+    import sys as _sys
+    for k in list(_sys.modules.keys()):
+        if k == 'synth_ai' or k.startswith('synth_ai.'):
+            _sys.modules.pop(k, None)
+    import importlib as _importlib
+    _importlib.invalidate_caches()
+    try:
+        if MODULE_FILE and os.path.exists(MODULE_FILE):
+            spec = importlib.util.spec_from_file_location(MODULE_NAME or 'task_app_module', MODULE_FILE)
+            if not spec or not spec.loader:
+                raise RuntimeError("Failed to prepare spec for: " + str(MODULE_FILE))
+            mod = importlib.util.module_from_spec(spec)
+            sys.modules[MODULE_NAME or 'task_app_module'] = mod
+            spec.loader.exec_module(mod)
+        else:
+            try:
+                importlib.import_module(MODULE_NAME)
+            except Exception:
+                fallback_file = '/opt/synth_ai_repo/__local_task_app__/' + (MODULE_NAME.split('.')[-1] if MODULE_NAME else 'task_app') + '.py'
+                if os.path.exists(fallback_file):
+                    spec = importlib.util.spec_from_file_location(MODULE_NAME or 'task_app_module', fallback_file)
+                    if not spec or not spec.loader:
+                        raise RuntimeError("Failed to prepare fallback spec for: " + str(fallback_file))
+                    mod = importlib.util.module_from_spec(spec)
+                    sys.modules[MODULE_NAME or 'task_app_module'] = mod
+                    spec.loader.exec_module(mod)
+                else:
+                    raise
+    except Exception as e:
+        raise RuntimeError("Task app import failed: " + str(e))
+    # Get the entry from registry (now that it's registered)
+    from synth_ai.task.apps import registry
+    from synth_ai.task.server import create_task_app
+    entry = registry.get(ENTRY_ID)
+    cfg = entry.modal
+    if cfg is None:
+        raise RuntimeError("Modal configuration missing for task app " + ENTRY_ID)
     config = entry.config_factory()
     return create_task_app(config)
 """
@@ -1314,3 +2129,304 @@ def fastapi_app():
 def register(cli: click.Group) -> None:
+    # Attach serve_command, task_app_group and eval_command to the given click group.
     cli.add_command(serve_command)
     cli.add_command(task_app_group)
+    cli.add_command(eval_command)
+@click.command("eval")
+@click.argument("app_id", type=str, required=False)
+@click.option("--config", type=click.Path(), default=None, help="Path to eval TOML (short schema)")
+@click.option(
+    "--url",
+    "task_app_url",
+    type=str,
+    default=None,
+    help="Base URL of a running task app (skip in-process server)",
+)
+@click.option("--seeds", default="0,1,2,3,4", help="Comma-separated seeds/indices to evaluate")
+@click.option("--split", default="train", show_default=True, help="Dataset split to use")
+@click.option("--model", default=None, help="Model identifier (prompted if omitted)")
+@click.option("--env-file", multiple=True, type=click.Path(), help="Env file(s) for keys")
+def eval_command(
+    app_id: str | None,
+    config: str | None,
+    task_app_url: str | None,
+    seeds: str,
+    split: str,
+    model: str | None,
+    env_file: Sequence[str],
+) -> None:
+    """Run one rollout per seed against a task app and summarize the results.
+
+    The task app is either built in-process and reached through an ASGI
+    transport (default) or contacted over HTTP when ``--url`` is given.
+
+    Args:
+        app_id: Task app identifier; falls back to ``app_id`` from the TOML.
+        config: Path to an eval TOML file. When omitted, the first path
+            returned by ``_discover_eval_config_paths()`` is used, if any.
+        task_app_url: Base URL of an already running task app.
+        seeds: Comma-separated integers; each one is sent as seed and index.
+        split: Dataset split placed in the rollout request.
+        model: Model identifier; prompted for interactively when missing.
+        env_file: Env file paths; required when ``task_app_url`` is set.
+
+    Raises:
+        click.ClickException: On a missing/unparsable config, missing env
+            files, an invalid task app config, or non-integer seeds.
+    """
+    cfg: dict[str, Any] = {}
+    config_path: Path | None = None
+    if config:
+        config_path = Path(config)
+    else:
+        auto_configs = _discover_eval_config_paths()
+        if auto_configs:
+            config_path = auto_configs[0]
+            click.echo(f"Using eval config: {config_path}")
+    if config_path:
+        if _toml is None:
+            raise click.ClickException(
+                "TOML parser not available; use Python 3.11+ or install tomli"
+            )
+        if not config_path.exists():
+            raise click.ClickException(f"Eval config not found: {config_path}")
+        try:
+            data = config_path.read_bytes()
+            parsed = _toml.loads(data.decode("utf-8"))
+            if isinstance(parsed, dict):
+                # Prefer the [eval] table; otherwise treat the whole file as the config.
+                section = parsed.get("eval")
+                if isinstance(section, dict):
+                    cfg = dict(section)
+                else:
+                    cfg = dict(parsed)
+        except Exception as exc:
+            # Chain the cause so the original parser error stays in the traceback.
+            raise click.ClickException(f"Failed to parse TOML '{config_path}': {exc}") from exc
+    app_id = app_id or (cfg.get("app_id") if isinstance(cfg.get("app_id"), str) else None)  # type: ignore
+    # Determine selection params (CLI takes precedence; TOML only fills unset model/seeds/env)
+    if cfg.get("model") and not model:
+        model = str(cfg["model"])  # type: ignore[index]
+    # The CLI default string is used as the "not set on the command line" marker.
+    if cfg.get("seeds") and seeds == "0,1,2,3,4":
+        val = cfg["seeds"]
+        if isinstance(val, list):
+            try:
+                seeds = ",".join(str(int(x)) for x in val)
+            except (TypeError, ValueError):
+                # Keep the default seeds, but tell the user the TOML value was ignored.
+                click.echo(f"Ignoring invalid 'seeds' in eval config: {val!r}", err=True)
+        elif isinstance(val, str):
+            seeds = val
+        elif isinstance(val, int):
+            seeds = str(val)
+    if cfg.get("env_file") and not env_file:
+        ef = cfg["env_file"]
+        if isinstance(ef, str):
+            env_file = (ef,)  # type: ignore[assignment]
+        elif isinstance(ef, list):
+            env_file = tuple(str(x) for x in ef)  # type: ignore[assignment]
+    entry: TaskAppEntry | None = None
+    if task_app_url is None:
+        choice = _select_app_choice(app_id, purpose="eval")
+        entry = choice.ensure_entry()
+    env_paths: list[Path] = []
+    if entry is not None:
+        env_paths = _determine_env_files(entry, env_file)
+    else:
+        if not env_file:
+            raise click.ClickException("--env-file is required when using --url")
+        for candidate in env_file:
+            p = Path(candidate).expanduser()
+            if not p.exists():
+                raise click.ClickException(f"Env file not found: {p}")
+            env_paths.append(p)
+    click.echo("Using env file(s): " + ", ".join(str(p) for p in env_paths))
+    _load_env_files_into_process([str(Path(p)) for p in env_paths])
+    # Build the auth header once, after the env files were loaded into os.environ.
+    headers: dict[str, str] = {}
+    api_key = (os.environ.get("ENVIRONMENT_API_KEY") or "").strip()
+    if api_key:
+        headers["X-API-Key"] = api_key
+    # Distinct local name: ``config`` is the TOML path parameter.
+    task_config: TaskAppConfig | None = None
+    app: Any = None
+    if task_app_url is None:
+        task_config = entry.config_factory()  # type: ignore[union-attr]
+        # Help the type checker; runtime check also enforced in server.run_task_app
+        if not isinstance(task_config, TaskAppConfig):
+            raise click.ClickException(
+                "Invalid task app: config_factory did not return TaskAppConfig"
+            )
+        app = create_task_app(task_config)
+    # Determine supported models
+    supported: list[str] = []
+    if task_app_url is None:
+        try:
+            supported = list((task_config.base_task_info.inference or {}).get("models") or [])  # type: ignore[union-attr]
+        except Exception:
+            supported = []
+    else:
+        try:
+            import httpx as _hx
+            with _hx.Client(base_url=task_app_url, headers=headers, timeout=15.0) as c:
+                info = c.get("/info").json()
+            inf = info.get("inference") if isinstance(info, dict) else None
+            if isinstance(inf, dict):
+                m = inf.get("models")
+                if isinstance(m, list):
+                    supported = [str(x) for x in m]
+                if not supported:
+                    providers = inf.get("providers")
+                    if isinstance(providers, list):
+                        if "openai" in providers:
+                            supported.append("gpt-5")
+                        if "groq" in providers:
+                            supported.append("groq:llama-3.1-70b-versatile")
+                        supported.append("synth:qwen-0.6b")
+        except Exception:
+            # The remote probe is best-effort; fall through to the defaults below.
+            supported = []
+    if not supported:
+        # Only fall back to local config-derived providers when running in-process
+        if task_app_url is None:
+            try:
+                providers = list((task_config.base_task_info.inference or {}).get("providers") or [])  # type: ignore[union-attr]
+            except Exception:
+                providers = []
+            if "openai" in providers:
+                supported.append("gpt-5")
+            if "groq" in providers:
+                supported.append("groq:llama-3.1-70b-versatile")
+        # Always include a local synth model option for smoke tests
+        supported.append("synth:qwen-0.6b")
+    selected_model = model
+    if not selected_model:
+        if not supported:
+            raise click.ClickException(
+                "No supported models; supply --model or add base_task_info.inference.models"
+            )
+        click.echo("Select model to evaluate:")
+        for idx, m in enumerate(supported, start=1):
+            click.echo(f"  {idx}) {m}")
+        choice_idx = click.prompt("Enter choice", type=click.IntRange(1, len(supported)))
+        selected_model = supported[choice_idx - 1]
+    try:
+        seed_values = [int(s.strip()) for s in seeds.split(",") if s.strip()]
+    except ValueError as exc:
+        raise click.ClickException("Invalid --seeds; expected comma-separated integers") from exc
+    import httpx
+    successes = 0
+    failures = 0
+    # Aggregate outcome stats across every response that reports an outcome score
+    outcome_sum: float = 0.0
+    outcome_count: int = 0
+    outcome_correct: int = 0
+    if task_app_url is None:
+        transport = httpx.ASGITransport(app=app)
+        # Newer httpx types consider ASGITransport under httpx._transports; cast to satisfy type checker
+        client = httpx.Client(
+            transport=cast(Any, transport),
+            base_url="http://eval.local",
+            timeout=60.0,
+            headers=headers,
+        )
+    else:
+        client = httpx.Client(base_url=task_app_url, timeout=60.0, headers=headers)
+    with client:
+        # Best-effort probe of /task_info; the response and any error are ignored.
+        try:
+            client.get("/task_info")
+        except Exception:
+            pass
+        # Precompute optional policy overrides from TOML
+        policy_overrides: dict[str, Any] = {}
+        try:
+            # Accept [eval.policy] table or top-level keys for convenience
+            if isinstance(cfg.get("policy"), dict):
+                policy_overrides.update(dict(cfg["policy"]))
+            # Back-compat: allow temperature/max_tokens at top level
+            for k in (
+                "temperature",
+                "max_tokens",
+                "reasoning_effort",
+                "system_hint",
+                "tool_choice",
+            ):
+                if k in cfg and k not in policy_overrides:
+                    policy_overrides[k] = cfg.get(k)
+        except Exception:
+            policy_overrides = {}
+        for seed_val in seed_values:
+            body = {
+                "run_id": str(uuid.uuid4()),
+                "env": {"config": {"split": split, "index": seed_val}, "seed": seed_val},
+                "policy": {
+                    "policy_name": selected_model,
+                    "config": {"model": selected_model, **policy_overrides},
+                },
+                "ops": [],
+            }
+            # Tracks whether this seed already hit a counter, so a later error
+            # while printing the summary cannot count the same seed twice.
+            counted = False
+            try:
+                resp = client.post("/rollout", json=body)
+                if 200 <= resp.status_code < 300:
+                    successes += 1
+                else:
+                    failures += 1
+                counted = True
+                # Print summary with any available metrics/tool calls
+                summary = [f"seed={seed_val}", f"status={resp.status_code}"]
+                try:
+                    data = resp.json()
+                except Exception:
+                    data = None
+                if isinstance(data, dict):
+                    metrics = data.get("metrics") if isinstance(data.get("metrics"), dict) else None
+                    if metrics:
+                        # Explicit None check: a mean return of 0 is a real value
+                        # and must not fall through to total_reward.
+                        mean_return = metrics.get("mean_return")
+                        if mean_return is None:
+                            mean_return = metrics.get("total_reward")
+                        outcome = metrics.get("outcome_score")
+                        if mean_return is not None:
+                            summary.append(f"mean_return={mean_return}")
+                        if outcome is not None:
+                            summary.append(f"outcome={outcome}")
+                            # Aggregate outcome stats
+                            try:
+                                val = float(outcome)
+                            except (TypeError, ValueError):
+                                pass
+                            else:
+                                outcome_sum += val
+                                outcome_count += 1
+                                if val >= 0.5:
+                                    outcome_correct += 1
+                    # Try to infer tool call count from first trajectory step
+                    trajs = data.get("trajectories")
+                    if isinstance(trajs, list) and trajs:
+                        first = trajs[0]
+                        steps = first.get("steps") if isinstance(first, dict) else None
+                        # Only dict-shaped steps can be inspected for tool calls.
+                        if isinstance(steps, list) and steps and isinstance(steps[0], dict):
+                            step0 = steps[0]
+                            tool_calls = step0.get("tool_calls") or step0.get("tools") or []
+                            if isinstance(tool_calls, list):
+                                summary.append(f"tool_calls={len(tool_calls)}")
+                    click.echo(" ".join(summary))
+                    # Print the full response JSON (trace, trajectories, metrics)
+                    try:
+                        click.echo(json.dumps(data, indent=2))
+                    except (TypeError, ValueError):
+                        pass
+                else:
+                    click.echo(" ".join(summary))
+            except Exception as exc:
+                if not counted:
+                    failures += 1
+                click.echo(f"seed={seed_val} error={exc}")
+    click.echo(
+        f"Eval complete: {successes} ok, {failures} failed; model={selected_model}, split={split}"
+    )
+    # Print outcome summary if any outcome scores were collected
+    if outcome_count > 0:
+        mean_outcome = outcome_sum / float(outcome_count)
+        frac_right = outcome_correct / float(outcome_count)
+        click.echo(
+            f"Outcome summary: correct={outcome_correct}/{outcome_count} ({frac_right:.2%}), mean_outcome={mean_outcome:.3f}"
+        )
+def register_eval(cli: click.Group) -> None:
+    """Attach only the ``eval`` command to ``cli``."""
+    cli.add_command(eval_command)

synth-ai 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev7__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev7__py3-none-any.whl