PyPI - rnow - Versions diffs - 0.2.4__py3-none-any.whl → 0.3.9__py3-none-any.whl - Mend

rnow 0.2.4 (py3-none-any.whl) → 0.3.9 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

rnow/cli/commands.py +226 -84
rnow/cli/test.py +536 -441
rnow/core/__init__.py +4 -1
rnow/core/reward.py +34 -3
rnow/core/tool.py +29 -7
rnow/models.py +88 -6
rnow/templates/deepseek-aha/config.yml +1 -1
rnow/templates/mcp-tavily/config.yml +1 -1
rnow/templates/rl-single/config.yml +7 -7
rnow/templates/rl-single/train.jsonl +0 -908
rnow/templates/rl-tools/config.yml +1 -1
rnow/templates/tutorial-reward/config.yml +7 -7
rnow/templates/tutorial-reward/train.jsonl +0 -908
rnow/templates/tutorial-tool/config.yml +1 -1
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/METADATA +23 -9
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/RECORD +22 -22
/rnow/templates/rl-tools/{env.py → tools.py} +0 -0
/rnow/templates/tutorial-tool/{env.py → tools.py} +0 -0
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/WHEEL +0 -0
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/entry_points.txt +0 -0
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/licenses/LICENSE +0 -0
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/top_level.txt +0 -0

rnow/cli/test.py CHANGED Viewed

@@ -1,22 +1,26 @@
 # rnow/cli/test.py
 """
-Test command for running RL rollouts locally.
+Test command for running RL rollouts via API.
-Requires authentication for billing.
+Uses the /api/rnow/rollout endpoint which runs rollouts on Cloud Run.
+Modes:
+- Default: Uses tinker models (requires auth)
+- --smoke-test: Uses OpenAI gpt-5-nano (requires OPENAI_API_KEY)
 """
+from __future__ import annotations
 import asyncio
 import itertools
 import json
+import os
 import random
-import re
 import signal
 import sys
 import threading
 import time
-from collections.abc import Callable
 from pathlib import Path
-from string import Template
 import click
 import httpx
@@ -74,66 +78,86 @@ class Spinner:
 from rnow.cli.common import require_auth
-from rnow.core.reward import REWARD_REGISTRY, clear_reward_registry, compute_total_reward
-from rnow.core.tool import TOOL_REGISTRY, clear_tool_registry
-from rnow.models import ProjectConfig, RewardArgs
+from rnow.models import ProjectConfig
 DEFAULT_API_URL = "https://www.reinforcenow.ai"
-class ModelCompleter:
+class RolloutClient:
     """
-    Completer that handles tokenization and calls Next.js API.
-    Requires authentication for billing.
+    Client for running rollouts via the /api/rnow/rollout endpoint.
+    Uses async polling: POST starts job, GET polls for results.
     """
-    def __init__(self, api_base: str, model: str, max_tokens: int = 2048, temperature: float = 1.0):
+    def __init__(
+        self,
+        api_base: str,
+        model: str,
+        max_tokens: int = 2048,
+        temperature: float = 1.0,
+        max_turns: int = 1,
+        termination_policy: str = "last_tool",
+        debug: bool = False,
+        smoke_test: bool = False,
+        openai_api_key: str | None = None,
+        mcp_url: str | list[str] | None = None,
+    ):
         self.api_base = api_base.rstrip("/")
         self.model = model
         self.max_tokens = max_tokens
         self.temperature = temperature
+        self.max_turns = max_turns
+        self.termination_policy = termination_policy
+        self.debug = debug
+        self.smoke_test = smoke_test
+        self.openai_api_key = openai_api_key
+        self.mcp_url = mcp_url
         self.auth_headers = get_auth_headers()
-        self.client = httpx.AsyncClient(timeout=120.0)
-        self.session_id: str | None = None  # Cached session ID for reuse
-        self.total_latency_ms = 0
-        self.request_count = 0
-        # Initialize tokenizer and renderer
-        from tinker_cookbook import renderers
-        from tinker_cookbook.model_info import get_recommended_renderer_name
-        from tinker_cookbook.tokenizer_utils import get_tokenizer
-        self.tokenizer = get_tokenizer(model)
-        renderer_name = get_recommended_renderer_name(model)
-        self.renderer = renderers.get_renderer(renderer_name, self.tokenizer)
-    async def __call__(self, messages: list[dict], stop: list[str] | None = None) -> dict:
+        self.client = httpx.AsyncClient(timeout=60.0)
+        self.total_charged_dollars = 0.0
+    async def start_rollout(
+        self,
+        samples: list[dict],
+        tools_py_code: str | None = None,
+        rewards_py_code: str | None = None,
+        dockerfiles: dict[str, str] | None = None,
+        secrets: dict[str, str] | None = None,
+    ) -> str:
         """
-        Tokenize messages, call Next.js API, decode response.
+        Start rollouts and return rollout ID immediately.
+        Use poll_rollout() to check for results.
         """
-        # Build model input using renderer
-        model_input = self.renderer.build_generation_prompt(messages)
-        tokens = model_input.to_ints()
-        # Get stop sequences from renderer if not provided
-        if stop is None:
-            stop = self.renderer.get_stop_sequences()
-        # Build request payload
         payload = {
+            "samples": samples,
             "model": self.model,
-            "tokens": tokens,
-            "stop": stop,
             "max_tokens": self.max_tokens,
             "temperature": self.temperature,
+            "max_turns": self.max_turns,
+            "termination_policy": self.termination_policy,
+            "tools_py_code": tools_py_code,
+            "rewards_py_code": rewards_py_code,
+            "debug": self.debug,
         }
-        # Include session_id if we have one cached
-        if self.session_id:
-            payload["session_id"] = self.session_id
-        # Call Next.js API with tokens
+        if self.mcp_url:
+            payload["mcp_url"] = self.mcp_url
+        # Send Dockerfiles for local/ images
+        if dockerfiles:
+            payload["dockerfiles"] = dockerfiles
+        # Send project secrets (from .env file)
+        if secrets:
+            payload["secrets"] = secrets
+        if self.smoke_test:
+            payload["smoke_test"] = True
+            payload["openai_api_key"] = self.openai_api_key
         resp = await self.client.post(
-            f"{self.api_base}/api/rnow/sample",
+            f"{self.api_base}/api/rnow/rollout",
             json=payload,
             headers=self.auth_headers,
         )
@@ -143,94 +167,79 @@ class ModelCompleter:
         if "error" in data:
             raise Exception(f"API error: {data.get('detail', data.get('error'))}")
-        # Cache the session_id for future requests
-        if "session_id" in data and data["session_id"]:
-            self.session_id = data["session_id"]
-        # Track latency
-        if "latency_ms" in data:
-            self.total_latency_ms += data["latency_ms"]
-            self.request_count += 1
+        return data["rollout_id"]
-        # Decode tokens back to text
-        output_tokens = data.get("tokens", [])
-        parsed_message, _success = self.renderer.parse_response(output_tokens)
+    async def poll_rollout(self, rollout_id: str) -> dict:
+        """Poll for rollout status. Returns dict with 'status' field."""
+        resp = await self.client.get(
+            f"{self.api_base}/api/rnow/rollout",
+            params={"id": rollout_id},
+            headers=self.auth_headers,
+        )
+        resp.raise_for_status()
+        return resp.json()
+    async def run_batch_rollouts(
+        self,
+        samples: list[dict],
+        tools_py_code: str | None = None,
+        rewards_py_code: str | None = None,
+        dockerfiles: dict[str, str] | None = None,
+        secrets: dict[str, str] | None = None,
+        spinner: Spinner | None = None,
+        timeout_minutes: int = 30,
+    ) -> tuple[str, list[dict]]:
+        """
+        Run rollouts with exponential backoff polling.
+        Returns (rollout_id, results).
+        """
+        # Start the rollout
+        rollout_id = await self.start_rollout(
+            samples, tools_py_code, rewards_py_code, dockerfiles, secrets
+        )
-        return {
-            "content": parsed_message.get("content", ""),
-            "latency_ms": data.get("latency_ms", 0),
-        }
+        if spinner:
+            spinner.update(f"Running rollouts... (ID: {rollout_id[:8]})")
-    async def close(self):
-        await self.client.aclose()
+        # Poll with exponential backoff
+        poll_interval = 2.0  # Start at 2 seconds
+        max_interval = 10.0  # Cap at 10 seconds
+        timeout = timeout_minutes * 60
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            if _shutdown_requested:
+                raise asyncio.CancelledError()
-async def flush_pending_charges(api_url: str) -> dict | None:
-    """
-    Flush any pending ROLLOUT charges at the end of the test.
-    Returns the flush result or None if it failed.
-    """
-    try:
-        headers = get_auth_headers()
-        async with httpx.AsyncClient(timeout=30.0) as client:
-            resp = await client.post(
-                f"{api_url.rstrip('/')}/api/billing/flush-rollout",
-                headers=headers,
-            )
-            resp.raise_for_status()
-            return resp.json()
-    except Exception as e:
-        click.echo(click.style(f"Warning: Failed to flush pending charges: {e}", fg="yellow"))
-        return None
-def _exec_file(path: Path, module_name: str) -> None:
-    """Execute a Python file to populate registries."""
-    import importlib.util
-    spec = importlib.util.spec_from_file_location(module_name, path)
-    if spec is None or spec.loader is None:
-        raise ImportError(f"Could not load module from {path}")
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-def _build_tools_block(tool_registry: dict[str, Callable]) -> str:
-    """Build the tools description block from registered tool functions."""
-    if not tool_registry:
-        return ""
-    tools_json = []
-    for name, fn in tool_registry.items():
-        schema = getattr(fn, "_schema", {"type": "object", "properties": {}})
-        description = getattr(fn, "_description", "No description available.")
-        tools_json.append(
-            {
-                "name": name,
-                "description": description,
-                "parameters": schema,
-            }
-        )
+            # Add jitter (±20%)
+            jitter = poll_interval * 0.2 * (random.random() * 2 - 1)
+            await asyncio.sleep(poll_interval + jitter)
-    tools_block = f"""# Tools
+            result = await self.poll_rollout(rollout_id)
+            status = result.get("status")
-You may call one or more functions to assist with the user query.
+            if status == "completed":
+                # Track billing
+                if "billing" in result:
+                    billing = result["billing"]
+                    tokens = billing.get("prompt_tokens", 0) + billing.get("completion_tokens", 0)
+                    self.total_charged_dollars += tokens * 0.000001
+                return rollout_id, result.get("results", [])
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{json.dumps(tools_json, indent=2)}
-</tools>
+            if status == "failed":
+                raise Exception(f"Rollout failed: {result.get('error', 'Unknown error')}")
-For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
-<tool_call>
-{{"name": "<function-name>", "arguments": {{"<arg-name>": "<value>"}}}}
-</tool_call>
-"""
+            # Exponential backoff
+            poll_interval = min(poll_interval * 1.5, max_interval)
-    return tools_block
+            if spinner:
+                elapsed = int(time.time() - start_time)
+                spinner.update(f"Running rollouts... ({elapsed}s, ID: {rollout_id[:8]})")
+        raise TimeoutError(f"Rollout timed out after {timeout_minutes} minutes")
-TOOL_CALL_RE = re.compile(r"<tool_call>\s*(.*?)\s*</tool_call>", re.DOTALL)
+    async def close(self):
+        await self.client.aclose()
 def _format_message(msg: dict, max_len: int = 300) -> str:
@@ -247,154 +256,27 @@ def _format_message(msg: dict, max_len: int = 300) -> str:
 async def _run_single_rollout(
-    completer: ModelCompleter,
+    client: RolloutClient,
     sample: dict,
-    reward_registry: dict[str, Callable],
-    tool_registry: dict[str, Callable],
-    max_turns: int,
-    termination_policy: str,
+    tools_py_code: str | None,
+    rewards_py_code: str | None,
     verbose: bool = False,
 ) -> dict:
-    """Run a single rollout for an RL sample."""
-    import inspect
-    messages_templates = sample["messages"]
-    reward_names = sample["rewards"]
-    variables = sample.get("variables", {})
-    metadata = sample.get("metadata", {})
-    reward_fns = []
-    for name in reward_names:
-        if name not in reward_registry:
-            raise ValueError(f"Reward function '{name}' not found in registry")
-        reward_fns.append(reward_registry[name])
-    ctx = {**metadata, **variables}
-    messages = [
-        {"role": msg["role"], "content": Template(msg["content"]).safe_substitute(ctx)}
-        for msg in messages_templates
-    ]
-    if tool_registry:
-        tools_block = _build_tools_block(tool_registry)
-        system_found = False
-        for msg in messages:
-            if msg["role"] == "system":
-                msg["content"] = tools_block + "\n\n" + msg["content"]
-                system_found = True
-                break
-        if not system_found:
-            messages.insert(0, {"role": "system", "content": tools_block})
-    conversation = messages.copy()
-    turn_count = 0
-    total_tool_calls = 0
-    # Show initial messages in verbose mode
+    """Run a single rollout via the API."""
+    result = await client.run_rollout(
+        sample=sample,
+        tools_py_code=tools_py_code,
+        rewards_py_code=rewards_py_code,
+    )
+    # Show conversation in verbose mode
     if verbose:
-        click.echo("  --- Initial Messages ---")
-        for msg in messages:
+        click.echo("  --- Conversation ---")
+        for msg in result.get("conversation", []):
             click.echo(f"    {_format_message(msg)}")
-        click.echo("  -------------------------")
-    while turn_count < max_turns:
-        turn_count += 1
+        click.echo("  ---------------------")
-        result = await completer(conversation, stop=None)
-        response_content = result.get("content", "")
-        conversation.append({"role": "assistant", "content": response_content})
-        if verbose:
-            click.echo(
-                f"  [Turn {turn_count}] {_format_message({'role': 'assistant', 'content': response_content}, max_len=500)}"
-            )
-        tool_matches = TOOL_CALL_RE.findall(response_content)
-        tool_call_count = len(tool_matches)
-        total_tool_calls += tool_call_count
-        for raw_call in tool_matches:
-            if not tool_registry:
-                break
-            try:
-                tool_data = json.loads(raw_call)
-                tool_name = tool_data.get("name")
-                args = tool_data.get("arguments", {})
-                if tool_name not in tool_registry:
-                    tool_response = f"<tool_error>Tool '{tool_name}' not found</tool_error>"
-                    conversation.append({"role": "tool", "content": tool_response})
-                    if verbose:
-                        click.echo(
-                            f"    {_format_message({'role': 'tool', 'content': tool_response})}"
-                        )
-                    continue
-                tool_fn = tool_registry[tool_name]
-                tool_result = (
-                    await tool_fn(**args)
-                    if inspect.iscoroutinefunction(tool_fn)
-                    else tool_fn(**args)
-                )
-                tool_response = f"<tool_result>{json.dumps(tool_result)}</tool_result>"
-                conversation.append({"role": "tool", "content": tool_response})
-                if verbose:
-                    click.echo(
-                        f"    Tool {click.style(tool_name, fg=TEAL_RGB)}: {str(tool_result)[:200]}"
-                    )
-            except json.JSONDecodeError as e:
-                tool_response = f"<tool_error>Invalid JSON: {str(e)}</tool_error>"
-                conversation.append({"role": "tool", "content": tool_response})
-                if verbose:
-                    click.echo(f"    {_format_message({'role': 'tool', 'content': tool_response})}")
-            except Exception as e:
-                tool_response = f"<tool_error>{str(e)}</tool_error>"
-                conversation.append({"role": "tool", "content": tool_response})
-                if verbose:
-                    click.echo(f"    {_format_message({'role': 'tool', 'content': tool_response})}")
-        if termination_policy == "last_tool" and tool_call_count == 0:
-            break
-    # Show final conversation summary in verbose mode
-    if verbose:
-        click.echo(f"  --- Rollout Complete: {turn_count} turns, {total_tool_calls} tool calls ---")
-    reward_args = RewardArgs(metadata=metadata, variables=variables)
-    rewards = {}
-    for fn, name in zip(reward_fns, reward_names, strict=False):
-        value = await fn(reward_args, conversation)
-        rewards[name] = value
-    total_reward = compute_total_reward(rewards) if rewards else 0.0
-    return {
-        "total_reward": total_reward,
-        "rewards": rewards,
-        "turns": turn_count,
-        "tools_used": total_tool_calls,
-        "conversation": conversation,
-    }
-def _check_test_dependencies():
-    """Check if optional test dependencies are installed."""
-    try:
-        import tinker_cookbook  # noqa: F401
-    except ImportError:
-        click.echo()
-        click.echo(
-            click.style("Error: ", fg="red", bold=True)
-            + "The 'rnow test' command requires additional dependencies."
-        )
-        pip_cmd = "pip install 'rnow[test]'"
-        click.echo(f"Install them with: {click.style(pip_cmd, fg=TEAL_RGB)}")
-        click.echo()
-        raise SystemExit(1)
+    return result
 @click.command(name="test")
@@ -404,12 +286,12 @@ def _check_test_dependencies():
     "project_dir",
     type=click.Path(file_okay=False, dir_okay=True, path_type=Path),
     default=".",
-    help="Project directory containing config.yml, rewards.py, env.py, train.jsonl",
+    help="Project directory containing config.yml, rewards.py, tools.py, train.jsonl",
 )
 @click.option(
     "--num-rollouts",
     "-n",
-    default=3,
+    default=1,
     show_default=True,
     help="Number of rollouts to run",
 )
@@ -449,51 +331,290 @@ def _check_test_dependencies():
     type=int,
     help="Truncate message content to N characters (default: no truncation)",
 )
+@click.option(
+    "--debug",
+    is_flag=True,
+    help="Use debug trainer image from Docker Hub (for testing trainer changes)",
+)
+@click.option(
+    "--output-dir",
+    "-o",
+    "output_dir",
+    type=click.Path(file_okay=False, dir_okay=True, path_type=Path),
+    default=None,
+    help="Save rollout results as JSON files in this directory",
+)
+@click.option(
+    "--smoke-test",
+    is_flag=True,
+    help="Use OpenAI gpt-5-nano instead of tinker (requires OPENAI_API_KEY env var)",
+)
+@click.option(
+    "--id",
+    "rollout_id",
+    default=None,
+    help="Fetch results for an existing rollout ID (skip running new rollout)",
+)
+@click.option(
+    "--store",
+    is_flag=True,
+    help="Store rollout ID in ./rollouts/<id>.txt for later retrieval",
+)
+@click.option(
+    "--timeout",
+    default=60,
+    show_default=True,
+    help="Timeout in minutes for polling results",
+)
+@click.option(
+    "--entry",
+    "-e",
+    "entries",
+    default=None,
+    help="Entry indices from train.jsonl (0-indexed). Examples: -e 5, -e 0,2,5, -e 0 -e 2 -e 5",
+    multiple=True,
+)
 @click.pass_context
-def test(ctx, project_dir, num_rollouts, multi_turn, with_tools, model, api_url, verbose, truncate):
-    """Test RL rollouts locally before submitting.
+def test(
+    ctx,
+    project_dir,
+    num_rollouts,
+    multi_turn,
+    with_tools,
+    model,
+    api_url,
+    verbose,
+    truncate,
+    debug,
+    output_dir,
+    smoke_test,
+    rollout_id,
+    store,
+    timeout,
+    entries,
+):
+    """Test RL rollouts before submitting.
+    Runs rollouts via the /api/rnow/rollout endpoint on Cloud Run.
+    Use --smoke-test to use OpenAI gpt-5-nano instead of tinker models
+    (requires OPENAI_API_KEY environment variable).
-    This command runs local RL rollouts by calling the Next.js API
-    for model sampling.
+    Use --id to fetch results for an existing rollout.
     Only works with RL projects (dataset_type: rl).
     """
     global _shutdown_requested
     _shutdown_requested = False
-    def handle_sigint(signum, frame):
-        global _shutdown_requested
-        if _shutdown_requested:
-            # Second Ctrl+C, force exit
-            sys.exit(1)
-        _shutdown_requested = True
-        click.echo("\n" + click.style("Interrupted. Shutting down gracefully...", fg="yellow"))
-    # Set up signal handler
-    original_handler = signal.signal(signal.SIGINT, handle_sigint)
+    resolved_api_url = api_url or ctx.obj.get("api_url", "").replace("/api", "") or DEFAULT_API_URL
-    require_auth()
-    _check_test_dependencies()
-    try:
+    # Handle --id flag: just fetch existing rollout results
+    if rollout_id:
         asyncio.run(
-            _test_async(
+            _fetch_rollout_results(
+                rollout_id=rollout_id,
+                api_url=resolved_api_url,
+                store=store,
+                truncate=truncate,
+                output_dir=output_dir,
+            )
+        )
+        return
+    # Check for OpenAI API key in smoke test mode
+    openai_api_key = None
+    if smoke_test:
+        openai_api_key = os.environ.get("OPENAI_API_KEY")
+        if not openai_api_key:
+            raise click.ClickException(
+                "OPENAI_API_KEY environment variable is required for smoke test mode.\n"
+                "Set it with: export OPENAI_API_KEY=sk-..."
+            )
+    else:
+        require_auth()
+    async def run_with_cancellation():
+        """Run test with proper cancellation support."""
+        loop = asyncio.get_running_loop()
+        task = asyncio.current_task()
+        def handle_sigint():
+            global _shutdown_requested
+            if _shutdown_requested:
+                sys.exit(1)
+            _shutdown_requested = True
+            click.echo("\n" + click.style("Interrupted. Cancelling...", fg="yellow"))
+            task.cancel()
+        loop.add_signal_handler(signal.SIGINT, handle_sigint)
+        try:
+            await _test_async(
                 project_dir=project_dir,
                 num_rollouts=num_rollouts,
                 multi_turn=multi_turn,
                 with_tools=with_tools,
                 model_override=model,
-                api_url=api_url
-                or ctx.obj.get("api_url", "").replace("/api", "")
-                or DEFAULT_API_URL,
+                api_url=resolved_api_url,
                 verbose=verbose,
                 truncate=truncate,
+                debug=debug,
+                output_dir=output_dir,
+                smoke_test=smoke_test,
+                openai_api_key=openai_api_key,
+                store=store,
+                timeout_minutes=timeout,
+                entries=entries,
             )
-        )
+        except asyncio.CancelledError:
+            click.echo(click.style("Aborted.", fg="yellow"))
+        finally:
+            loop.remove_signal_handler(signal.SIGINT)
+    try:
+        asyncio.run(run_with_cancellation())
     except KeyboardInterrupt:
         click.echo(click.style("Aborted.", fg="yellow"))
+async def _fetch_rollout_results(
+    rollout_id: str,
+    api_url: str,
+    store: bool = False,
+    truncate: int | None = None,
+    output_dir: Path | None = None,
+):
+    """Fetch results for an existing rollout ID."""
+    click.echo(f"Fetching results for rollout: {click.style(rollout_id, fg=TEAL_RGB)}")
+    client = httpx.AsyncClient(timeout=30.0)
+    auth_headers = get_auth_headers()
+    try:
+        resp = await client.get(
+            f"{api_url}/api/rnow/rollout",
+            params={"id": rollout_id},
+            headers=auth_headers,
+        )
+        resp.raise_for_status()
+        data = resp.json()
     finally:
-        # Restore original signal handler
-        signal.signal(signal.SIGINT, original_handler)
+        await client.aclose()
+    status = data.get("status")
+    if status == "pending":
+        click.echo(click.style("Rollout still running...", fg="yellow"))
+        click.echo(f"Poll again with: rnow test --id {rollout_id}")
+        return
+    if status == "failed":
+        click.echo(click.style(f"Rollout failed: {data.get('error', 'Unknown')}", fg="red"))
+        return
+    # Store rollout ID if requested
+    if store:
+        _store_rollout_id(rollout_id, data)
+    # Display results
+    results = data.get("results", [])
+    _display_results(results, truncate, output_dir, rollout_id)
+    # Show billing
+    billing = data.get("billing", {})
+    tokens = billing.get("prompt_tokens", 0) + billing.get("completion_tokens", 0)
+    if tokens > 0:
+        click.echo(f"Tokens: {tokens}")
+def _store_rollout_id(rollout_id: str, data: dict):
+    """Store rollout ID and results in ./rollouts/<id>.txt"""
+    rollouts_dir = Path("rollouts")
+    rollouts_dir.mkdir(exist_ok=True)
+    filepath = rollouts_dir / f"{rollout_id}.txt"
+    with open(filepath, "w") as f:
+        f.write(f"Rollout ID: {rollout_id}\n")
+        f.write(f"Status: {data.get('status', 'unknown')}\n")
+        f.write(f"S3 Path: rollouts/{rollout_id}/result.json\n")
+        f.write("\n")
+        # Write summary
+        results = data.get("results", [])
+        successful = [r for r in results if r.get("success")]
+        if successful:
+            rewards = [r.get("total_reward", 0) for r in successful]
+            f.write(f"Successful: {len(successful)}/{len(results)}\n")
+            f.write(f"Mean Reward: {sum(rewards) / len(rewards):.3f}\n")
+        # Write billing
+        billing = data.get("billing", {})
+        tokens = billing.get("prompt_tokens", 0) + billing.get("completion_tokens", 0)
+        if tokens > 0:
+            f.write(f"Tokens: {tokens}\n")
+        f.write("\n--- Full Results ---\n")
+        f.write(json.dumps(data, indent=2))
+    click.echo(f"Stored: {click.style(str(filepath), fg=TEAL_RGB)}")
+def _display_results(
+    results: list[dict],
+    truncate: int | None,
+    output_dir: Path | None,
+    rollout_id: str | None = None,
+):
+    """Display rollout results."""
+    rewards = []
+    for idx, result in enumerate(results):
+        click.echo(f"Rollout {idx + 1}/{len(results)}")
+        if not result.get("success"):
+            click.echo(click.style(f"  ✗ {result.get('error', 'Unknown error')}", fg="red"))
+            click.echo()
+            continue
+        total_reward = result.get("total_reward", 0.0)
+        rewards.append(total_reward)
+        # Show conversation
+        for msg in result.get("conversation", []):
+            role = msg.get("role", "unknown")
+            content = msg.get("content", "")
+            if truncate and len(content) > truncate:
+                content = content[:truncate] + "..."
+            tag = click.style(f"[{role}]", fg="red")
+            click.echo(f"  {tag} {content}")
+        reward_breakdown = result.get("rewards", {})
+        reward_str = ", ".join(f"{k}={v:.3f}" for k, v in reward_breakdown.items())
+        turns = result.get("turns", 0)
+        click.echo(
+            f"  {click.style('reward', fg=TEAL_RGB)}={total_reward:.3f} "
+            f"| turns={turns} "
+            f"| [{reward_str}]"
+        )
+        click.echo()
+    # Save to files if requested
+    if output_dir and results:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        for idx, result in enumerate(results):
+            if result.get("success"):
+                filename = output_dir / f"rollout_{timestamp}_{idx + 1}.json"
+                filename.write_text(json.dumps(result, indent=2))
+        click.echo(f"Results saved to {click.style(str(output_dir), fg=TEAL_RGB)}")
+    # Summary
+    if rewards:
+        mean_reward = sum(rewards) / len(rewards)
+        click.echo()
+        click.echo(f"Mean reward: {click.style(f'{mean_reward:.3f}', fg=TEAL_RGB)}")
+        if rollout_id:
+            click.echo(f"Rollout ID: {click.style(rollout_id, fg=TEAL_RGB)}")
 async def _test_async(
@@ -505,6 +626,13 @@ async def _test_async(
     api_url: str,
     verbose: bool,
     truncate: int | None,
+    debug: bool = False,
+    output_dir: Path | None = None,
+    smoke_test: bool = False,
+    openai_api_key: str | None = None,
+    store: bool = False,
+    timeout_minutes: int = 60,
+    entries: tuple[int, ...] = (),
 ):
     project_dir = Path(project_dir)
@@ -529,7 +657,7 @@ async def _test_async(
         )
     rewards_path = project_dir / "rewards.py"
-    env_path = project_dir / "env.py"
+    tools_path = project_dir / "tools.py"
     train_path = project_dir / "train.jsonl"
     if not rewards_path.exists():
@@ -537,192 +665,159 @@ async def _test_async(
     if not train_path.exists():
         raise click.ClickException("train.jsonl not found in project directory")
-    # Validate max_tokens vs prompt size
-    from rnow.cli.commands import get_max_prompt_tokens, validate_max_tokens_for_context
-    from rnow.models import MAX_CONTEXT_WINDOW
-    if config.rollout:
-        max_prompt_tokens = get_max_prompt_tokens(train_path)
-        if max_prompt_tokens > 0:
-            context_error, recommended = validate_max_tokens_for_context(
-                config.rollout.max_tokens, max_prompt_tokens
-            )
-            if context_error:
-                click.echo()
-                click.echo(click.style("✗ Context window exceeded", fg="red", bold=True))
-                click.echo()
-                click.echo(
-                    f"  Your longest prompt in train.jsonl is ~{max_prompt_tokens:,} tokens."
-                )
-                click.echo(f"  With max_tokens={config.rollout.max_tokens:,}, the total exceeds")
-                click.echo(f"  the {MAX_CONTEXT_WINDOW:,} token context window.")
-                click.echo()
-                click.echo(
-                    click.style("  Fix:", bold=True)
-                    + f" Set rollout.max_tokens to {recommended:,} or less"
-                )
-                click.echo()
-                raise click.ClickException("max_tokens + prompt length exceeds context window")
-    clear_reward_registry()
-    clear_tool_registry()
-    _exec_file(rewards_path, "rewards")
-    if with_tools and env_path.exists():
-        _exec_file(env_path, "env")
+    # Read user code files to send to the API
+    rewards_py_code = rewards_path.read_text()
+    tools_py_code = tools_path.read_text() if with_tools and tools_path.exists() else None
+    # Load samples
     samples = [json.loads(line) for line in train_path.read_text().splitlines() if line.strip()]
+    # Read Dockerfile.* files for local/ docker images
+    dockerfiles: dict[str, str] = {}
+    for dockerfile_path in project_dir.glob("Dockerfile.*"):
+        dockerfiles[dockerfile_path.name] = dockerfile_path.read_text()
+        click.echo(f"  Found {dockerfile_path.name}")
+    # Read .env file for project secrets
+    project_secrets: dict[str, str] = {}
+    env_path = project_dir / ".env"
+    if env_path.exists():
+        for line in env_path.read_text().splitlines():
+            line = line.strip()
+            if line and not line.startswith("#") and "=" in line:
+                key, _, value = line.partition("=")
+                # Remove quotes if present
+                value = value.strip().strip("'\"")
+                project_secrets[key.strip()] = value
+        if project_secrets:
+            click.echo(f"  Loaded secrets: {list(project_secrets.keys())}")
     if not samples:
         raise click.ClickException("train.jsonl is empty")
-    model_name = model_override or config.model.path
+    # For smoke test, always use gpt-5-nano
+    model_name = "gpt-5-nano" if smoke_test else model_override or config.model.path
     max_tokens = config.rollout.max_tokens if config.rollout else 2048
     max_turns_config = config.rollout.max_turns if config.rollout else 1
     termination_policy = config.rollout.termination_policy if config.rollout else "last_tool"
+    mcp_url = config.rollout.mcp_url if config.rollout else None
     max_turns = 1 if not multi_turn else max_turns_config
-    # Check for gpt-oss with tools - not supported in rnow test
-    is_gpt_oss = "gpt-oss" in model_name.lower() or "gptoss" in model_name.lower()
-    has_tools = with_tools and (env_path.exists() or (config.rollout and config.rollout.mcp_url))
-    if is_gpt_oss and has_tools:
-        click.echo(
-            click.style("Warning: ", fg="yellow")
-            + "Tool calling with gpt-oss models is not supported in 'rnow test'. Running without tools."
-        )
-        with_tools = False
-    rewards = []
-    tool_registry_to_use = TOOL_REGISTRY if with_tools else {}
+    # Display mode and model info
+    if smoke_test:
+        click.echo(f"Mode: {click.style('SMOKE TEST', fg=TEAL_RGB)} (OpenAI gpt-5-nano)")
+    else:
+        thinking_display = get_thinking_mode_display(config)
+        click.echo(f"Model: {model_name} ({click.style(thinking_display, fg=TEAL_RGB)})")
-    # Display model info with reasoning mode (same format as rnow run)
-    thinking_display = get_thinking_mode_display(config)
-    click.echo(f"Model: {model_name} ({click.style(thinking_display, fg=TEAL_RGB)})")
     click.echo()
     try:
-        # Create one completer per concurrent rollout to avoid session conflicts
-        completers = [
-            ModelCompleter(
-                api_base=api_url,
-                model=model_name,
-                max_tokens=max_tokens,
-            )
-            for _ in range(num_rollouts)
-        ]
-        # Select samples for each rollout upfront
-        selected_samples = [random.choice(samples) for _ in range(num_rollouts)]
+        # Create one RolloutClient for all rollouts
+        client = RolloutClient(
+            api_base=api_url,
+            model=model_name,
+            max_tokens=max_tokens,
+            temperature=1.0,
+            max_turns=max_turns,
+            termination_policy=termination_policy,
+            debug=debug,
+            smoke_test=smoke_test,
+            openai_api_key=openai_api_key,
+            mcp_url=mcp_url,
+        )
-        # Start spinner for concurrent rollouts
-        spinner = Spinner(f"Running {num_rollouts} rollouts concurrently...")
+        # Select samples for batch rollout
+        if entries:
+            # Parse entries - support both "-e 0 -e 2" and "-e 0,2,5"
+            entry_indices = []
+            for entry in entries:
+                # Handle comma-separated values
+                for part in str(entry).split(","):
+                    part = part.strip()
+                    if part:
+                        try:
+                            idx = int(part)
+                        except ValueError:
+                            raise click.ClickException(f"Invalid entry index: {part}")
+                        if idx < 0 or idx >= len(samples):
+                            raise click.ClickException(
+                                f"Entry index {idx} out of range. train.jsonl has {len(samples)} entries (0-{len(samples) - 1})"
+                            )
+                        entry_indices.append(idx)
+            if not entry_indices:
+                raise click.ClickException("No valid entry indices provided")
+            selected_samples = [samples[idx] for idx in entry_indices]
+            click.echo(f"Testing entries: {entry_indices}")
+        else:
+            # Random selection
+            selected_samples = [random.choice(samples) for _ in range(num_rollouts)]
+        # Start spinner for batch rollout
+        spinner = Spinner(f"Starting {len(selected_samples)} rollouts...")
         spinner.start()
-        async def run_rollout_with_index(idx: int) -> tuple[int, dict | Exception]:
-            """Run a single rollout and return (index, result or exception)."""
-            if _shutdown_requested:
-                return (idx, asyncio.CancelledError("Shutdown requested"))
-            try:
-                result = await _run_single_rollout(
-                    completer=completers[idx],
-                    sample=selected_samples[idx],
-                    reward_registry=REWARD_REGISTRY,
-                    tool_registry=tool_registry_to_use,
-                    max_turns=max_turns,
-                    termination_policy=termination_policy,
-                    verbose=False,
-                )
-                return (idx, result)
-            except asyncio.CancelledError:
-                return (idx, asyncio.CancelledError("Cancelled"))
-            except Exception as e:
-                return (idx, e)
-        # Run all rollouts concurrently
         start_time = time.time()
-        tasks = [asyncio.create_task(run_rollout_with_index(i)) for i in range(num_rollouts)]
+        rollout_id = None
         try:
-            results = await asyncio.gather(*tasks, return_exceptions=True)
+            # Start rollout and poll for results with exponential backoff
+            rollout_id, batch_results = await client.run_batch_rollouts(
+                samples=selected_samples,
+                tools_py_code=tools_py_code,
+                rewards_py_code=rewards_py_code,
+                dockerfiles=dockerfiles if dockerfiles else None,
+                secrets=project_secrets if project_secrets else None,
+                spinner=spinner,
+                timeout_minutes=timeout_minutes,
+            )
         except asyncio.CancelledError:
-            # Cancel all tasks if we get interrupted
-            for task in tasks:
-                task.cancel()
-            results = []
+            batch_results = []
+        except Exception as e:
+            spinner.stop()
+            raise e
         total_time = time.time() - start_time
         spinner.stop()
+        # Show rollout ID
+        if rollout_id:
+            click.echo(f"Rollout ID: {click.style(rollout_id, fg=TEAL_RGB)}")
+            click.echo()
         # Check if shutdown was requested
         if _shutdown_requested:
-            # Close completers and exit early
-            for c in completers:
-                await c.close()
+            await client.close()
             return
-        # Display results in order
-        for idx, result in sorted(results, key=lambda x: x[0]):
-            click.echo(f"Rollout {idx+1}/{num_rollouts}")
-            if isinstance(result, Exception):
-                if isinstance(result, httpx.HTTPStatusError):
-                    click.echo(
-                        click.style(f"  ✗ HTTP Error: {result.response.status_code}", fg="red")
-                    )
-                else:
-                    click.echo(click.style(f"  ✗ {result}", fg="red"))
-                click.echo()
-                continue
-            total_reward = result["total_reward"]
-            rewards.append(total_reward)
-            # Get conversation
-            conversation = result["conversation"]
-            # Show all messages with red tags
-            for msg in conversation:
-                role = msg.get("role", "unknown")
-                content = msg.get("content", "")
-                # Truncate if flag is set
-                if truncate and len(content) > truncate:
-                    content = content[:truncate] + "..."
-                tag = click.style(f"[{role}]", fg="red")
-                click.echo(f"  {tag} {content}")
-            reward_str = ", ".join(f"{k}={v:.3f}" for k, v in result["rewards"].items())
-            click.echo(
-                f"  {click.style('reward', fg=TEAL_RGB)}={total_reward:.3f} "
-                f"| turns={result['turns']} "
-                f"| tools_used={result['tools_used']} "
-                f"| [{reward_str}]"
+        # Store results if requested
+        if store and rollout_id:
+            _store_rollout_id(
+                rollout_id,
+                {
+                    "status": "completed",
+                    "results": batch_results,
+                    "billing": {"prompt_tokens": 0, "completion_tokens": 0},
+                },
             )
-            click.echo()
-        # Close all completers
-        for c in completers:
-            await c.close()
+        # Display results using shared function
+        _display_results(batch_results, truncate, output_dir, rollout_id)
-    except Exception:
-        raise
+        # Get total billing
+        total_charged = client.total_charged_dollars
-    # Flush any pending billing charges
-    flush_result = await flush_pending_charges(api_url)
+        # Close client
+        await client.close()
-    if rewards:
-        mean_reward = sum(rewards) / len(rewards)
-        click.echo()
-        click.echo(f"Mean reward: {click.style(f'{mean_reward:.3f}', fg=TEAL_RGB)}")
-        click.echo(f"Latency: {click.style(f'{total_time:.1f}s', fg=TEAL_RGB)}")
-    else:
-        click.echo(click.style("\nNo successful rollouts completed.", fg="yellow"))
+    except Exception:
+        raise
-    # Show billing summary if charges were flushed
-    if flush_result and flush_result.get("flushed"):
-        amount_cents = flush_result.get("amountCents", 0)
-        total_tokens = flush_result.get("totalTokens", 0)
-        click.echo(
-            f"Billing: {click.style(f'${amount_cents/100:.2f}', fg=TEAL_RGB)} ({total_tokens:,} tokens)"
-        )
+    # Show timing and cost
+    click.echo(f"Latency: {click.style(f'{total_time:.1f}s', fg=TEAL_RGB)}")
+    if total_charged > 0:
+        click.echo(f"Cost: {click.style(f'${total_charged:.4f}', fg=TEAL_RGB)}")

rnow 0.2.4__py3-none-any.whl → 0.3.9__py3-none-any.whl

rnow 0.2.4__py3-none-any.whl → 0.3.9__py3-none-any.whl