PyPI - rnow - Versions diffs - 0.2.4__py3-none-any.whl → 0.3.9__py3-none-any.whl - Mend

rnow 0.2.4__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

rnow/cli/commands.py +226 -84
rnow/cli/test.py +536 -441
rnow/core/__init__.py +4 -1
rnow/core/reward.py +34 -3
rnow/core/tool.py +29 -7
rnow/models.py +88 -6
rnow/templates/deepseek-aha/config.yml +1 -1
rnow/templates/mcp-tavily/config.yml +1 -1
rnow/templates/rl-single/config.yml +7 -7
rnow/templates/rl-single/train.jsonl +0 -908
rnow/templates/rl-tools/config.yml +1 -1
rnow/templates/tutorial-reward/config.yml +7 -7
rnow/templates/tutorial-reward/train.jsonl +0 -908
rnow/templates/tutorial-tool/config.yml +1 -1
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/METADATA +23 -9
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/RECORD +22 -22
/rnow/templates/rl-tools/{env.py → tools.py} +0 -0
/rnow/templates/tutorial-tool/{env.py → tools.py} +0 -0
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/WHEEL +0 -0
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/entry_points.txt +0 -0
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/licenses/LICENSE +0 -0
{rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/top_level.txt +0 -0

rnow/core/__init__.py CHANGED Viewed

@@ -14,9 +14,10 @@ from .reward import (
     clear_reward_registry,
     compute_total_reward,
     is_precondition,
+    is_sandbox_reward,
     reward,
 )
-from .tool import TOOL_REGISTRY, clear_tool_registry, tool
+from .tool import TOOL_REGISTRY, clear_tool_registry, is_sandbox_tool, tool
 __all__ = [
     # User-facing API
@@ -29,5 +30,7 @@ __all__ = [
     "clear_reward_registry",
     "clear_tool_registry",
     "is_precondition",
+    "is_sandbox_reward",
+    "is_sandbox_tool",
     "compute_total_reward",
 ]

rnow/core/reward.py CHANGED Viewed

@@ -32,6 +32,14 @@ def is_precondition(name: str) -> bool:
     return getattr(fn, "_is_precondition", False)
+def is_sandbox_reward(name: str) -> bool:
+    """Check if a reward function should run inside the Docker sandbox."""
+    fn = REWARD_REGISTRY.get(name)
+    if fn is None:
+        return False
+    return getattr(fn, "_is_sandbox", False)
 def compute_total_reward(reward_results: dict[str, float]) -> float:
     """
     Compute total reward with precondition logic.
@@ -105,8 +113,7 @@ def _validate_reward_signature(func: Callable) -> None:
     second_param = params[1]
     if second_param.name not in hints:
         raise TypeError(
-            f"Reward '{func.__name__}': parameter '{second_param.name}' must have "
-            "type hint 'list'."
+            f"Reward '{func.__name__}': parameter '{second_param.name}' must have type hint 'list'."
         )
     second_type = hints[second_param.name]
     # Allow list or List (from typing)
@@ -127,7 +134,13 @@ def _validate_reward_signature(func: Callable) -> None:
         raise TypeError(f"Reward '{func.__name__}' must return 'float', got '{return_type}'.")
-def reward(fn: Callable = None, *, precondition: bool = False) -> Callable:
+def reward(
+    fn: Callable = None,
+    *,
+    precondition: bool = False,
+    sandbox: bool = False,
+    timeout: int = 60,
+) -> Callable:
     """
     Decorator to register reward functions with validation.
@@ -150,10 +163,26 @@ def reward(fn: Callable = None, *, precondition: bool = False) -> Callable:
             # If this returns 1, total reward is 1 + sum(other rewards)
             return 1.0 if valid_format else 0.0
+        @reward(sandbox=True, timeout=120)  # Run inside Docker sandbox with 2min timeout
+        def test_code(args: RewardArgs, messages: list) -> float:
+            # This executes inside the sandbox container
+            # Has access to files created by LLM, can run pytest, etc.
+            import subprocess
+            result = subprocess.run(["pytest", "-q"])
+            return 1.0 if result.returncode == 0 else 0.0
     Args:
         precondition: If True, this reward acts as a gate:
             - If precondition reward is 0, total reward is 0
             - If precondition reward is 1, total reward is 1 + sum(other rewards)
+        sandbox: If True, this reward runs inside the Docker sandbox container
+            instead of the trainer. Useful for rewards that need to:
+            - Access files created during LLM interaction
+            - Run tests (pytest, etc.)
+            - Execute code in the same environment as tools
+        timeout: Timeout in seconds for this reward function (default: 60).
+            If the reward times out, it returns a special "timeout" status
+            instead of a numeric value.
     """
     def decorator(func):
@@ -177,6 +206,8 @@ def reward(fn: Callable = None, *, precondition: bool = False) -> Callable:
         func._is_reward = True
         func._reward_name = func.__name__
         func._is_precondition = precondition
+        func._is_sandbox = sandbox
+        func._timeout = timeout
         # Register the function
         REWARD_REGISTRY[func.__name__] = func

rnow/core/tool.py CHANGED Viewed

@@ -34,6 +34,14 @@ def clear_tool_registry() -> None:
     TOOL_REGISTRY.clear()
+def is_sandbox_tool(name: str) -> bool:
+    """Check if a tool should run inside the Docker sandbox."""
+    fn = TOOL_REGISTRY.get(name)
+    if fn is None:
+        return False
+    return getattr(fn, "_is_sandbox", False)
 def _map_type_to_json_schema(py_type: Any) -> dict[str, Any]:
     """
     Map a Python type annotation to a JSON Schema fragment.
@@ -332,7 +340,7 @@ def _try_coerce(value: Any, expected_types: list[str]) -> tuple[bool, Any]:
     return False, value
-def tool(fn: Callable = None) -> Callable:
+def tool(fn: Callable = None, *, sandbox: bool = False, timeout: int = 60) -> Callable:
     """
     Decorator to register tool functions with robust validation.
@@ -351,10 +359,12 @@ def tool(fn: Callable = None) -> Callable:
             '''Search the web.'''
             return requests.get(...).json()
-        @tool
-        def calculator(expr: str) -> float:
-            '''Evaluate math expression.'''
-            return eval(expr)
+        @tool(sandbox=True, timeout=120)  # Run inside Docker sandbox with 2min timeout
+        def run_python(code: str) -> str:
+            '''Execute Python code in isolated environment.'''
+            import subprocess
+            result = subprocess.run(["python", "-c", code], capture_output=True)
+            return result.stdout.decode()
     Supported parameter types:
         - Primitives: str, int, float, bool
@@ -362,6 +372,16 @@ def tool(fn: Callable = None) -> Callable:
         - Optional: Optional[T], T | None
         - Literal: Literal["option1", "option2"]
         - Union: Union[str, int]
+    Args:
+        sandbox: If True, this tool runs inside the Docker sandbox container.
+            Required when the train.jsonl entry has a "docker" field.
+            Tools with sandbox=True can:
+            - Execute code in an isolated environment
+            - Create/modify files that sandbox rewards can check
+            - Access custom dependencies installed in the Docker image
+        timeout: Timeout in seconds for this tool function (default: 60).
+            If the tool times out, it returns a timeout error message.
     """
     def decorator(func: Callable) -> Callable:
@@ -400,18 +420,20 @@ def tool(fn: Callable = None) -> Callable:
         func._tool_name = func.__name__
         func._schema = schema
         func._description = doc  # Already validated and stripped above
+        func._is_sandbox = sandbox
+        func._timeout = timeout
         TOOL_REGISTRY[func._tool_name] = func
         return func
-    # Support both @tool and @tool()
+    # Support both @tool and @tool(sandbox=True)
     return decorator(fn) if fn else decorator
 def validate_tools_file(filepath) -> list:
     """
-    Validate an env.py file without executing it.
+    Validate a tools.py file without executing it.
     Parses the AST to find @tool decorated functions and checks:
     - Function has a non-empty docstring

rnow/models.py CHANGED Viewed

@@ -10,6 +10,8 @@ Trainer-internal types (Env, StepResult, Observation) live in docker/trainer/
 where tinker is available.
 """
+from __future__ import annotations
 from enum import Enum
 from typing import Literal
@@ -37,7 +39,7 @@ class OrgRole(str, Enum):
 class DatasetType(str, Enum):
-    SFT = "sft"  # Supervised Fine-Tuning
+    SFT = "sft"  # Supervised Finetuning
     RL = "rl"  # Reinforcement Learning
@@ -62,11 +64,51 @@ class RewardArgs(BaseModel):
     metadata: dict = Field(default_factory=dict)
     variables: dict = Field(default_factory=dict)
+    secrets: dict = Field(
+        default_factory=dict
+    )  # User-defined secrets from .env file or project settings
     class Config:
         arbitrary_types_allowed = True
+# --- train.jsonl validation models ---
+class Message(BaseModel):
+    """A single message in a conversation."""
+    model_config = ConfigDict(extra="allow")  # Allow extra fields like tool_calls
+    role: Literal["system", "user", "assistant", "tool"]
+    content: str
+class TrainEntry(BaseModel):
+    """A single entry in train.jsonl."""
+    model_config = ConfigDict(extra="allow")  # Allow extra fields like variables, metadata
+    messages: list[Message] = Field(..., min_length=1)
+    rewards: list[str] | None = None  # Required for RL, optional for SFT
+    tools: list[str] | None = None  # Optional: filter which tools are available
+    docker: str | None = None  # Optional: Docker image for isolated sandbox
+    metadata: dict | None = None
+    variables: dict | None = None
+    @model_validator(mode="after")
+    def validate_messages_not_empty(self):
+        if not self.messages:
+            raise ValueError("messages list cannot be empty")
+        return self
+class TrainEntryRL(TrainEntry):
+    """Train entry for RL datasets - rewards field is required."""
+    rewards: list[str] = Field(..., min_length=1)
 class DeviceCode(BaseModel):
     device_code: str
     user_code: str
@@ -97,7 +139,7 @@ class Organizations(BaseModel):
 # Supported model IDs
 SUPPORTED_MODELS = Literal[
-    # Qwen models
+    # Qwen models (text)
     "Qwen/Qwen3-235B-A22B-Instruct-2507",
     "Qwen/Qwen3-30B-A3B-Instruct-2507",
     "Qwen/Qwen3-30B-A3B",
@@ -106,7 +148,10 @@ SUPPORTED_MODELS = Literal[
     "Qwen/Qwen3-8B",
     "Qwen/Qwen3-8B-Base",
     "Qwen/Qwen3-4B-Instruct-2507",
-    # OpenAI models
+    # Qwen models (vision)
+    "Qwen/Qwen3-VL-235B-A22B-Instruct",
+    "Qwen/Qwen3-VL-30B-A3B-Instruct",
+    # OpenAI models (reasoning)
     "openai/gpt-oss-120b",
     "openai/gpt-oss-20b",
     # DeepSeek models
@@ -119,6 +164,8 @@ SUPPORTED_MODELS = Literal[
     "meta-llama/Llama-3.1-8B-Instruct",
     "meta-llama/Llama-3.2-3B",
     "meta-llama/Llama-3.2-1B",
+    # Moonshot models (reasoning)
+    "moonshotai/Kimi-K2-Thinking",
 ]
 # Maximum context window for all supported models
@@ -127,15 +174,41 @@ MAX_CONTEXT_WINDOW = 32768
 # Conservative max_tokens limit (leaves room for prompts)
 MAX_GENERATION_TOKENS = 30000
+# Models that do NOT support tool calling
+# - gpt-oss models use GptOssRenderer which doesn't support tools
+# - Base/non-instruct models use RoleColonRenderer which doesn't support tools
+MODELS_WITHOUT_TOOL_SUPPORT: set[str] = {
+    # OpenAI reasoning models (GptOssRenderer)
+    "openai/gpt-oss-120b",
+    "openai/gpt-oss-20b",
+    # Base models (RoleColonRenderer)
+    "Qwen/Qwen3-30B-A3B-Base",
+    "Qwen/Qwen3-8B-Base",
+    "deepseek-ai/DeepSeek-V3.1-Base",
+    "meta-llama/Llama-3.1-70B",
+    "meta-llama/Llama-3.1-8B",
+    "meta-llama/Llama-3.2-3B",
+    "meta-llama/Llama-3.2-1B",
+}
+def supports_tool_calling(model_path: str) -> bool:
+    """Check if a model supports tool calling."""
+    return model_path not in MODELS_WITHOUT_TOOL_SUPPORT
 # Maximum LoRA rank per model
 # Models not listed here default to 128
 MODEL_MAX_LORA_RANK: dict[str, int] = {
-    # Max 32
+    # Max 32 (reasoning models)
     "openai/gpt-oss-120b": 32,
     "openai/gpt-oss-20b": 32,
-    # Max 64
+    "moonshotai/Kimi-K2-Thinking": 32,
+    # Max 64 (large MoE models)
     "Qwen/Qwen3-235B-A22B-Instruct-2507": 64,
+    "Qwen/Qwen3-VL-235B-A22B-Instruct": 64,
     "Qwen/Qwen3-30B-A3B-Instruct-2507": 64,
+    "Qwen/Qwen3-VL-30B-A3B-Instruct": 64,
     "Qwen/Qwen3-30B-A3B": 64,
     "Qwen/Qwen3-30B-A3B-Base": 64,
     "deepseek-ai/DeepSeek-V3.1": 64,
@@ -229,13 +302,22 @@ class RolloutConfig(BaseModel):
     )
     mcp_url: str | list[str] | None = Field(
         default=None,
-        description="MCP server URL(s) for tools. Can be a single URL or a list of URLs. Can be used alongside env.py to combine both tool sources.",
+        description="MCP server URL(s) for tools. Can be a single URL or a list of URLs. Can be used alongside tools.py to combine both tool sources.",
+    )
+    tool_timeout: int = Field(
+        default=60,
+        gt=0,
+        description="Timeout in seconds for tool calls. Browser automation may need longer timeouts (default: 60s).",
     )
     max_tool_response_chars: int | None = Field(
         default=4000,
         gt=0,
         description="Maximum characters for tool responses. Longer responses are truncated. Set to null/None to disable truncation.",
     )
+    include_thinking: bool = Field(
+        default=False,
+        description="Whether to include <think>...</think> blocks in messages passed to reward functions. Default is False (thinking is stripped).",
+    )
 class TrainerConfig(BaseModel):

rnow/templates/deepseek-aha/config.yml CHANGED Viewed

@@ -9,7 +9,7 @@ data:
   batch_size: 32
   group_size: 16
 model:
-  path: openai/gpt-oss-20b
+  path: Qwen/Qwen3-8B
   qlora_rank: 32
   name: "Countdown Reasoning Model"
   description: "Reproduces DeepSeek R1 aha moment using GRPO on the Countdown game"

rnow/templates/mcp-tavily/config.yml CHANGED Viewed

@@ -9,7 +9,7 @@ data:
   batch_size: 32
   group_size: 16
 model:
-  path: Qwen/Qwen3-8B
+  path: Qwen/Qwen3-30B-A3B-Instruct-2507
   qlora_rank: 32
   name: "SimpleQA Agent"
   description: "Multi-turn RL model trained on SimpleQA factual questions using Tavily MCP"

rnow/templates/rl-single/config.yml CHANGED Viewed

@@ -1,13 +1,13 @@
 project_id: ""
-project_name: "OpenMathReasoning"
+project_name: "rl-project"
 dataset_id: ""
-dataset_name: "math-problems"
+dataset_name: "train"
 dataset_type: rl
 organization_id: ""
 data:
   train_file: train.jsonl
-  batch_size: 32
-  group_size: 16
+  batch_size: 16
+  group_size: 8
 model:
   path: Qwen/Qwen3-8B
   qlora_rank: 32
@@ -19,9 +19,9 @@ algorithm:
   kl_penalty_coef: 0.01
 rollout:
   max_turns: 1
-  max_tokens: 16384
+  max_tokens: 4096
   termination_policy: last_tool
 trainer:
-  num_epochs: 4
+  num_epochs: 6
   learning_rate: 0.0001
-  save_step: 333
+  save_step: 8

rnow 0.2.4__py3-none-any.whl → 0.3.9__py3-none-any.whl

rnow 0.2.4__py3-none-any.whl → 0.3.9__py3-none-any.whl