PyPI - synth-ai - Versions diffs - 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl - Mend

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (226) hide show

examples/task_apps/enron/tests/unit/test_enron_environment.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""Unit tests for Enron environment tools and rewards."""
+import pytest
+@pytest.mark.asyncio
+@pytest.mark.fast
+async def test_enron_search_tool():
+    """Test that the search_emails tool works correctly."""
+    from synth_ai.environments.examples.enron.environment import SearchEmailsTool
+    from synth_ai.environments.examples.enron.engine import EnronEngine
+    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent
+    # Create a minimal task instance
+    task = TaskInstance(
+        id="test",
+        impetus=Impetus(instructions="Test question"),
+        intent=Intent(
+            rubric={"goal": "test"},
+            gold_trajectories=None,
+            gold_state_diff={},
+            deterministic_eval_functions=[],
+        ),
+        metadata={
+            "question": "Test?",
+            "gold_answer": "Test answer",
+            "inbox_address": "test@enron.com",
+        },
+        is_reproducible=False,
+        initial_engine_snapshot=None,
+    )
+    engine = EnronEngine(task)
+    tool = SearchEmailsTool(engine)
+    # Test that tool has correct name
+    assert tool.name == "search_emails"
+    # Test that tool requires keywords
+    from synth_ai.environments.environment.tools import EnvToolCall
+    # Call with minimal args should work (or fail gracefully)
+    result = await tool(EnvToolCall(tool="search_emails", args={"keywords": ["test"]}))
+    assert result.ok in (True, False)  # Either succeeds or fails gracefully
+    # Result should have search_results field
+    if result.ok:
+        assert "search_results" in result.payload
+@pytest.mark.asyncio
+async def test_enron_answer_tool():
+    """Test that the answer_question tool calculates rewards correctly."""
+    from synth_ai.environments.examples.enron.environment import AnswerQuestionTool
+    from synth_ai.environments.examples.enron.engine import EnronEngine
+    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent
+    task = TaskInstance(
+        id="test",
+        impetus=Impetus(instructions="Test question"),
+        intent=Intent(
+            rubric={"goal": "test"},
+            gold_trajectories=None,
+            gold_state_diff={},
+            deterministic_eval_functions=[],
+        ),
+        metadata={
+            "question": "What is the answer?",
+            "gold_answer": "The answer is 42",
+            "inbox_address": "test@enron.com",
+        },
+        is_reproducible=False,
+        initial_engine_snapshot=None,
+    )
+    engine = EnronEngine(task)
+    tool = AnswerQuestionTool(engine)
+    # Test exact match
+    from synth_ai.environments.environment.tools import EnvToolCall
+    result_exact = await tool(EnvToolCall(tool="answer_question", args={"answer": "The answer is 42"}))
+    assert result_exact.ok is True
+    assert "status" in result_exact.payload
+    # Test partial match (should still give some reward)
+    result_partial = await tool(EnvToolCall(tool="answer_question", args={"answer": "answer is 42"}))
+    assert result_partial.ok is True
+@pytest.mark.asyncio
+async def test_enron_reward_calculation():
+    """Test that Enron rewards are calculated correctly."""
+    from synth_ai.environments.examples.enron.engine import EnronEngine
+    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent
+    task = TaskInstance(
+        id="test",
+        impetus=Impetus(instructions="Test question"),
+        intent=Intent(
+            rubric={"goal": "test"},
+            gold_trajectories=None,
+            gold_state_diff={},
+            deterministic_eval_functions=[],
+        ),
+        metadata={
+            "question": "What is the answer?",
+            "gold_answer": "forty two",
+            "inbox_address": "test@enron.com",
+        },
+        is_reproducible=False,
+        initial_engine_snapshot=None,
+    )
+    engine = EnronEngine(task)
+    # Test exact match gives high reward
+    reward_exact = await engine._judge_answer("forty two")
+    assert reward_exact > 0.9, f"Expected high reward for exact match, got {reward_exact}"
+    # Test partial match gives medium reward
+    reward_partial = await engine._judge_answer("the answer is forty two")
+    assert reward_partial > 0.5, f"Expected medium reward for partial match, got {reward_partial}"
+    # Test wrong answer gives low/zero reward
+    reward_wrong = await engine._judge_answer("completely wrong answer")
+    assert reward_wrong < 0.5, f"Expected low reward for wrong answer, got {reward_wrong}"

examples/task_apps/math/__init__.py ADDED Viewed

File without changes

examples/{rl/task_app → task_apps/math}/math_single_step.py RENAMED Viewed

@@ -723,6 +723,9 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
         },
     )
+    # Extract inference_url from policy config
+    inference_url = (request.policy.config or {}).get("inference_url")
     trajectory = RolloutTrajectory(
         env_id=f"math::{sample['split']}::{sample['index']}",
         policy_id=request.policy.policy_id or "policy",
@@ -732,6 +735,7 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
             "reward": reward,
         },
         length=1,
+        inference_url=inference_url,  # NEW: Required for trace correlation
     )
     metrics = RolloutMetrics(
         episode_returns=[reward],
@@ -800,7 +804,7 @@ def build_dataset() -> tuple[TaskDatasetRegistry, MathDataset]:
 def _base_task_info() -> TaskInfo:
     return TaskInfo(
         task={"id": "math_single_step", "name": "Math Single Step", "version": "1.0.0"},
-        environments=["math"],
+        environment="math",
         action_space={
             "type": "tool_call",
             "tools": [
@@ -830,11 +834,6 @@ def _base_task_info() -> TaskInfo:
             "supports_proxy": True,
             "tool": {"name": TOOL_NAME, "parallel_tool_calls": False},
         },
-        capabilities={
-            "supports_rollout": True,
-            "supports_env_lifecycle": True,
-            "requires_api_key_header": True,
-        },
         limits={"max_turns": 1},
     )
@@ -887,21 +886,31 @@ def describe_taskset(dataset: MathDataset) -> dict[str, Any]:
 def provide_task_instances(dataset: MathDataset, seeds: Sequence[int]) -> Iterable[TaskInfo]:
     info = _base_task_info()
+    base_observation = getattr(info, "observation", None)
+    if hasattr(base_observation, "model_dump"):
+        observation_template = base_observation.model_dump()
+    elif isinstance(base_observation, dict):
+        observation_template = dict(base_observation)
+    else:
+        observation_template = {}
     for seed in seeds:
         sample = dataset.sample(split=DEFAULT_SPLIT, index=seed)
         yield TaskInfo(
             task=info.task,
-            environments=info.environments,
+            environment=info.environment,
             action_space=info.action_space,
-            observation={**info.observation, "sample_index": sample["index"]},
+            observation={
+                **observation_template,
+                "sample_index": sample["index"],
+            },
             dataset={
-                **info.dataset,
+                **info.dataset.model_dump(),
                 "split": sample["split"],
                 "index": sample["index"],
             },
             rubric=info.rubric,
             inference=info.inference,
-            capabilities=info.capabilities,
             limits=info.limits,
         )

examples/task_apps/pokemon_battle/__init__.py ADDED Viewed

@@ -0,0 +1,2 @@
+"""Pokemon competitive battle task app examples."""
+

examples/task_apps/pokemon_battle/modal_app.py ADDED Viewed

@@ -0,0 +1,104 @@
+"""Modal deployment helper for the Pokémon Showdown task app example.
+This file mirrors the manual setup steps documented in the README:
+- Clone `pokechamp` and install its Python dependencies.
+- Clone the reference Pokémon Showdown server and install Node dependencies.
+- Mount the local `synth-ai` repository so the task app code is available.
+Deploy with:
+```
+modal deploy examples/task_apps/pokemon_battle/modal_app.py
+```
+After deployment the FastAPI service will be reachable at a URL similar to
+`https://<org>--pokemon-showdown-task-app-example.modal.run`.
+"""
+from __future__ import annotations
+import subprocess
+import sys
+from pathlib import Path
+import modal
+REPO_ROOT = Path(__file__).resolve().parents[3]
+POKECHAMP_REPO = "https://github.com/sethkarten/pokechamp.git"
+SHOWDOWN_REPO = "https://github.com/jakegrigsby/pokemon-showdown.git"
+app = modal.App("pokemon-showdown-task-app-example")
+BASE_IMAGE = (
+    modal.Image.debian_slim(python_version="3.11")
+    .apt_install("git", "nodejs", "npm")
+    .pip_install(["uvicorn[standard]", "fastapi", "httpx", "horizons-ai"])
+    .run_commands(
+        [
+            "mkdir -p /external",
+            f"git clone --depth 1 {POKECHAMP_REPO} /external/pokechamp || true",
+            "pip install --no-cache-dir -r /external/pokechamp/requirements.txt",
+            f"git clone --depth 1 {SHOWDOWN_REPO} /external/pokemon-showdown || true",
+            "cd /external/pokemon-showdown && npm ci --no-optional",
+        ]
+    )
+)
+REPO_MOUNT = modal.Mount.from_local_dir(REPO_ROOT, remote_path="/workspace/synth-ai")
+@app.function(
+    image=BASE_IMAGE,
+    mounts=[REPO_MOUNT],
+    timeout=900,
+    memory=8192,
+    cpu=4.0,
+    secrets=[modal.Secret.from_name("environment-api-key")],
+    keep_warm=1,
+)
+@modal.asgi_app()
+def fastapi_app():
+    """Serve the Synth task app via Modal."""
+    import os
+    from fastapi import APIRouter
+    repo_path = Path("/workspace/synth-ai").resolve()
+    if str(repo_path) not in sys.path:
+        sys.path.insert(0, str(repo_path))
+    marker = Path("/tmp/.synth_ai_editable")
+    if not marker.exists():
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", str(repo_path)])
+        marker.touch()
+    os.environ.setdefault("PYTHONHASHSEED", "0")
+    os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
+    os.environ.setdefault("OMP_NUM_THREADS", "1")
+    os.environ.setdefault("POKECHAMP_ROOT", "/external/pokechamp")
+    os.environ.setdefault("POKEMON_SHOWDOWN_ROOT", "/external/pokemon-showdown")
+    from examples.task_apps.pokemon_battle.task_app.pokemon_showdown import build_config
+    from synth_ai.task.server import create_task_app
+    app = create_task_app(build_config())
+    health_router = APIRouter()
+    @health_router.get("/healthz")
+    def healthz():
+        return {"status": "ok"}
+    app.include_router(health_router)
+    return app
+@app.local_entrypoint()
+def main():
+    """Print handy commands for local testing."""
+    print("Pokémon Showdown task app Modal helper")
+    print("Deploy with: modal deploy examples/task_apps/pokemon_battle/modal_app.py")
+    print("Test locally: modal serve examples/task_apps/pokemon_battle/modal_app.py")
+    print("Once deployed, set TASK_APP_URL to the issued modal.run domain.")

examples/task_apps/pokemon_battle/task_app/README.md ADDED Viewed

@@ -0,0 +1,68 @@
+# Pokemon Battle Task App
+This example shows how to expose a Horizons-compatible Pokémon Showdown battle
+environment through the Synth AI task app scaffolding. The adapter runs fully
+locally by driving pokechamp’s deterministic `LocalSim`, so battles can be
+snapshotted and restored without a live Showdown server.
+## Local setup (Track 1)
+1. Clone and install **PokeChamp** together with its `poke-env` fork:
+   ```bash
+   git clone https://github.com/sethkarten/pokechamp.git
+   cd pokechamp
+   pip install -r requirements.txt
+   ```
+2. Export environment variables so the task app can locate the cloned repo:
+   ```bash
+   export POKECHAMP_ROOT=/path/to/pokechamp
+   export POKEMON_SHOWDOWN_ROOT=/path/to/pokemon-showdown
+   ```
+3. Run a rollout to sanity-check the wiring:
+   ```bash
+   uv run python -m synth_ai.task.describe pokemon_showdown
+   uv run python -m synth_ai.task.rollout pokemon_showdown --seed 1001
+   ```
+The adapter uses the pokechamp dataset teams bundled with the repository to
+instantiate deterministic Gen 9 OU battles. You can point `POKECHAMP_ROOT` at a
+fork with custom teams to experiment with other formats.
+## Modal deployment
+A ready-to-use deployment helper is available at
+`examples/task_apps/pokemon_battle/modal_app.py`. It mirrors the above manual
+steps (cloning `pokechamp`, installing requirements, and mounting the Synth AI
+repo). Deploy with:
+```bash
+modal deploy examples/task_apps/pokemon_battle/modal_app.py
+```
+The resulting URL can be plugged into Synth AI workflows via `TASK_APP_URL`.
+## Notes
+- The dataset catalog resolves team files from the PokeChamp repo when available
+  (`POKECHAMP_ROOT`). If the assets are missing, `/info` marks the scenario as
+  unavailable.
+- Snapshots serialise the entire deterministic battle state, allowing training
+  algorithms to branch or reset mid-match.
+- Deterministic RNG seeding (Python, NumPy, PyTorch) keeps rollouts reproducible
+  across Modal replicas and local runs.
+- The opponent policy now favours super-effective moves to provide a stronger
+  baseline; swap it out with a pokechamp minimax bot for ladder-level play.
+- A `/healthz` endpoint is exposed in the Modal service for liveness probes.
+## Status & Next Steps
+- **Observation polish**: expose richer per-turn summaries (hazards, stat boosts, tera states) and compact text strings tailored for language agents.
+- **Action helpers**: surface explicit target slots/tera/mega toggles so higher formats (doubles, VGC) can plug in with minimal code.
+- **Benchmark opponent**: replace the heuristic opponent with a pokechamp bot (e.g. minimax) or hook into the official PokéAgent ladder for eval parity.
+- **Integration tests**: add pytest smoke tests covering `/snapshot` → `/restore` loops and multi-step rollouts.
+- **Agent wiring**: ship a reference RL/LLM policy config (Synth CLI or Modal job) that exercises the adapter end-to-end and logs battle traces.

examples/task_apps/pokemon_battle/task_app/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Pokemon Showdown task app configuration."""
+from .pokemon_showdown import build_config
+__all__ = ["build_config"]

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl