rnow 0.2.4__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnow/cli/commands.py +226 -84
- rnow/cli/test.py +536 -441
- rnow/core/__init__.py +4 -1
- rnow/core/reward.py +34 -3
- rnow/core/tool.py +29 -7
- rnow/models.py +88 -6
- rnow/templates/deepseek-aha/config.yml +1 -1
- rnow/templates/mcp-tavily/config.yml +1 -1
- rnow/templates/rl-single/config.yml +7 -7
- rnow/templates/rl-single/train.jsonl +0 -908
- rnow/templates/rl-tools/config.yml +1 -1
- rnow/templates/tutorial-reward/config.yml +7 -7
- rnow/templates/tutorial-reward/train.jsonl +0 -908
- rnow/templates/tutorial-tool/config.yml +1 -1
- {rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/METADATA +23 -9
- {rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/RECORD +22 -22
- /rnow/templates/rl-tools/{env.py → tools.py} +0 -0
- /rnow/templates/tutorial-tool/{env.py → tools.py} +0 -0
- {rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/WHEEL +0 -0
- {rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/entry_points.txt +0 -0
- {rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/licenses/LICENSE +0 -0
- {rnow-0.2.4.dist-info → rnow-0.3.9.dist-info}/top_level.txt +0 -0
rnow/core/__init__.py
CHANGED
@@ -14,9 +14,10 @@ from .reward import (
     clear_reward_registry,
     compute_total_reward,
     is_precondition,
+    is_sandbox_reward,
     reward,
 )
-from .tool import TOOL_REGISTRY, clear_tool_registry, tool
+from .tool import TOOL_REGISTRY, clear_tool_registry, is_sandbox_tool, tool
 
 __all__ = [
     # User-facing API
@@ -29,5 +30,7 @@ __all__ = [
     "clear_reward_registry",
     "clear_tool_registry",
     "is_precondition",
+    "is_sandbox_reward",
+    "is_sandbox_tool",
     "compute_total_reward",
 ]
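A short usage sketch (not from the package) of the newly exported helpers; the reward and tool names are hypothetical placeholders:

# Hypothetical sketch: checking where registered rewards/tools should execute.
from rnow.core import is_sandbox_reward, is_sandbox_tool

for reward_name in ("check_format", "test_code"):
    where = "sandbox" if is_sandbox_reward(reward_name) else "trainer"
    print(f"reward {reward_name!r} runs in the {where}")

for tool_name in ("search", "run_python"):
    where = "sandbox" if is_sandbox_tool(tool_name) else "trainer"
    print(f"tool {tool_name!r} runs in the {where}")

Both helpers return False for names that are not in the registry, so they can be called before any user code has been imported.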
rnow/core/reward.py
CHANGED
@@ -32,6 +32,14 @@ def is_precondition(name: str) -> bool:
     return getattr(fn, "_is_precondition", False)
 
 
+def is_sandbox_reward(name: str) -> bool:
+    """Check if a reward function should run inside the Docker sandbox."""
+    fn = REWARD_REGISTRY.get(name)
+    if fn is None:
+        return False
+    return getattr(fn, "_is_sandbox", False)
+
+
 def compute_total_reward(reward_results: dict[str, float]) -> float:
     """
     Compute total reward with precondition logic.
@@ -105,8 +113,7 @@ def _validate_reward_signature(func: Callable) -> None:
     second_param = params[1]
     if second_param.name not in hints:
         raise TypeError(
-            f"Reward '{func.__name__}': parameter '{second_param.name}' must have "
-            "type hint 'list'."
+            f"Reward '{func.__name__}': parameter '{second_param.name}' must have type hint 'list'."
         )
     second_type = hints[second_param.name]
     # Allow list or List (from typing)
@@ -127,7 +134,13 @@ def _validate_reward_signature(func: Callable) -> None:
         raise TypeError(f"Reward '{func.__name__}' must return 'float', got '{return_type}'.")
 
 
-def reward(fn: Callable = None, *, precondition: bool = False) -> Callable:
+def reward(
+    fn: Callable = None,
+    *,
+    precondition: bool = False,
+    sandbox: bool = False,
+    timeout: int = 60,
+) -> Callable:
     """
     Decorator to register reward functions with validation.
 
@@ -150,10 +163,26 @@ def reward(fn: Callable = None, *, precondition: bool = False) -> Callable:
         # If this returns 1, total reward is 1 + sum(other rewards)
         return 1.0 if valid_format else 0.0
 
+    @reward(sandbox=True, timeout=120)  # Run inside Docker sandbox with 2min timeout
+    def test_code(args: RewardArgs, messages: list) -> float:
+        # This executes inside the sandbox container
+        # Has access to files created by LLM, can run pytest, etc.
+        import subprocess
+        result = subprocess.run(["pytest", "-q"])
+        return 1.0 if result.returncode == 0 else 0.0
+
     Args:
         precondition: If True, this reward acts as a gate:
             - If precondition reward is 0, total reward is 0
            - If precondition reward is 1, total reward is 1 + sum(other rewards)
+        sandbox: If True, this reward runs inside the Docker sandbox container
+            instead of the trainer. Useful for rewards that need to:
+            - Access files created during LLM interaction
+            - Run tests (pytest, etc.)
+            - Execute code in the same environment as tools
+        timeout: Timeout in seconds for this reward function (default: 60).
+            If the reward times out, it returns a special "timeout" status
+            instead of a numeric value.
     """
 
     def decorator(func):
@@ -177,6 +206,8 @@ def reward(fn: Callable = None, *, precondition: bool = False) -> Callable:
         func._is_reward = True
         func._reward_name = func.__name__
         func._is_precondition = precondition
+        func._is_sandbox = sandbox
+        func._timeout = timeout
 
         # Register the function
         REWARD_REGISTRY[func.__name__] = func
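A minimal sketch of the extended decorator, mirroring the docstring example above (RewardArgs is assumed importable from rnow.models per the models.py diff below; the function bodies are illustrative only):

# Hypothetical reward definitions using the new sandbox/timeout options.
from rnow.core import reward, is_sandbox_reward
from rnow.core.reward import REWARD_REGISTRY
from rnow.models import RewardArgs

@reward  # default: runs in the trainer with the 60s timeout
def check_format(args: RewardArgs, messages: list) -> float:
    return 1.0 if messages else 0.0

@reward(sandbox=True, timeout=120)  # runs inside the Docker sandbox
def test_code(args: RewardArgs, messages: list) -> float:
    import subprocess
    result = subprocess.run(["pytest", "-q"])
    return 1.0 if result.returncode == 0 else 0.0

assert is_sandbox_reward("test_code")
assert not is_sandbox_reward("check_format")
assert REWARD_REGISTRY["test_code"]._timeout == 120  # stored by the decorator

Both functions keep the (RewardArgs, list) -> float shape that _validate_reward_signature enforces.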
rnow/core/tool.py
CHANGED
@@ -34,6 +34,14 @@ def clear_tool_registry() -> None:
     TOOL_REGISTRY.clear()
 
 
+def is_sandbox_tool(name: str) -> bool:
+    """Check if a tool should run inside the Docker sandbox."""
+    fn = TOOL_REGISTRY.get(name)
+    if fn is None:
+        return False
+    return getattr(fn, "_is_sandbox", False)
+
+
 def _map_type_to_json_schema(py_type: Any) -> dict[str, Any]:
     """
     Map a Python type annotation to a JSON Schema fragment.
@@ -332,7 +340,7 @@ def _try_coerce(value: Any, expected_types: list[str]) -> tuple[bool, Any]:
         return False, value
 
 
-def tool(fn: Callable = None) -> Callable:
+def tool(fn: Callable = None, *, sandbox: bool = False, timeout: int = 60) -> Callable:
     """
     Decorator to register tool functions with robust validation.
 
@@ -351,10 +359,12 @@ def tool(fn: Callable = None) -> Callable:
         '''Search the web.'''
         return requests.get(...).json()
 
-    @tool
-    def
-    '''
-
+    @tool(sandbox=True, timeout=120)  # Run inside Docker sandbox with 2min timeout
+    def run_python(code: str) -> str:
+        '''Execute Python code in isolated environment.'''
+        import subprocess
+        result = subprocess.run(["python", "-c", code], capture_output=True)
+        return result.stdout.decode()
 
     Supported parameter types:
     - Primitives: str, int, float, bool
@@ -362,6 +372,16 @@ def tool(fn: Callable = None) -> Callable:
     - Optional: Optional[T], T | None
     - Literal: Literal["option1", "option2"]
     - Union: Union[str, int]
+
+    Args:
+        sandbox: If True, this tool runs inside the Docker sandbox container.
+            Required when the train.jsonl entry has a "docker" field.
+            Tools with sandbox=True can:
+            - Execute code in an isolated environment
+            - Create/modify files that sandbox rewards can check
+            - Access custom dependencies installed in the Docker image
+        timeout: Timeout in seconds for this tool function (default: 60).
+            If the tool times out, it returns a timeout error message.
     """
 
     def decorator(func: Callable) -> Callable:
@@ -400,18 +420,20 @@ def tool(fn: Callable = None) -> Callable:
         func._tool_name = func.__name__
         func._schema = schema
         func._description = doc  # Already validated and stripped above
+        func._is_sandbox = sandbox
+        func._timeout = timeout
 
         TOOL_REGISTRY[func._tool_name] = func
 
         return func
 
-    # Support both @tool and @tool()
+    # Support both @tool and @tool(sandbox=True)
     return decorator(fn) if fn else decorator
 
 
 def validate_tools_file(filepath) -> list:
     """
-    Validate
+    Validate a tools.py file without executing it.
 
     Parses the AST to find @tool decorated functions and checks:
     - Function has a non-empty docstring
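For reference, a hypothetical tools.py sketch based on the decorator docstring above (the tool names and bodies are illustrative, not part of the package):

# Hypothetical tools.py: one plain tool and one sandbox tool.
from rnow.core import tool

@tool
def add_numbers(a: int, b: int) -> str:
    '''Add two integers and return the result as text.'''
    return str(a + b)

@tool(sandbox=True, timeout=120)
def run_python(code: str) -> str:
    '''Execute Python code in an isolated environment.'''
    import subprocess
    result = subprocess.run(["python", "-c", code], capture_output=True)
    return result.stdout.decode()

Such a file can then be checked statically with validate_tools_file, which parses the AST and verifies, among other things, that every @tool function has a non-empty docstring.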
rnow/models.py
CHANGED
@@ -10,6 +10,8 @@ Trainer-internal types (Env, StepResult, Observation) live in docker/trainer/
 where tinker is available.
 """
 
+from __future__ import annotations
+
 from enum import Enum
 from typing import Literal
 
@@ -37,7 +39,7 @@ class OrgRole(str, Enum):
 
 
 class DatasetType(str, Enum):
-    SFT = "sft"  # Supervised
+    SFT = "sft"  # Supervised Finetuning
     RL = "rl"  # Reinforcement Learning
 
 
@@ -62,11 +64,51 @@ class RewardArgs(BaseModel):
 
     metadata: dict = Field(default_factory=dict)
     variables: dict = Field(default_factory=dict)
+    secrets: dict = Field(
+        default_factory=dict
+    )  # User-defined secrets from .env file or project settings
 
     class Config:
         arbitrary_types_allowed = True
 
 
+# --- train.jsonl validation models ---
+
+
+class Message(BaseModel):
+    """A single message in a conversation."""
+
+    model_config = ConfigDict(extra="allow")  # Allow extra fields like tool_calls
+
+    role: Literal["system", "user", "assistant", "tool"]
+    content: str
+
+
+class TrainEntry(BaseModel):
+    """A single entry in train.jsonl."""
+
+    model_config = ConfigDict(extra="allow")  # Allow extra fields like variables, metadata
+
+    messages: list[Message] = Field(..., min_length=1)
+    rewards: list[str] | None = None  # Required for RL, optional for SFT
+    tools: list[str] | None = None  # Optional: filter which tools are available
+    docker: str | None = None  # Optional: Docker image for isolated sandbox
+    metadata: dict | None = None
+    variables: dict | None = None
+
+    @model_validator(mode="after")
+    def validate_messages_not_empty(self):
+        if not self.messages:
+            raise ValueError("messages list cannot be empty")
+        return self
+
+
+class TrainEntryRL(TrainEntry):
+    """Train entry for RL datasets - rewards field is required."""
+
+    rewards: list[str] = Field(..., min_length=1)
+
+
 class DeviceCode(BaseModel):
     device_code: str
     user_code: str
@@ -97,7 +139,7 @@ class Organizations(BaseModel):
 
 # Supported model IDs
 SUPPORTED_MODELS = Literal[
-    # Qwen models
+    # Qwen models (text)
     "Qwen/Qwen3-235B-A22B-Instruct-2507",
     "Qwen/Qwen3-30B-A3B-Instruct-2507",
     "Qwen/Qwen3-30B-A3B",
@@ -106,7 +148,10 @@ SUPPORTED_MODELS = Literal[
     "Qwen/Qwen3-8B",
     "Qwen/Qwen3-8B-Base",
     "Qwen/Qwen3-4B-Instruct-2507",
-    #
+    # Qwen models (vision)
+    "Qwen/Qwen3-VL-235B-A22B-Instruct",
+    "Qwen/Qwen3-VL-30B-A3B-Instruct",
+    # OpenAI models (reasoning)
     "openai/gpt-oss-120b",
     "openai/gpt-oss-20b",
     # DeepSeek models
@@ -119,6 +164,8 @@ SUPPORTED_MODELS = Literal[
     "meta-llama/Llama-3.1-8B-Instruct",
     "meta-llama/Llama-3.2-3B",
     "meta-llama/Llama-3.2-1B",
+    # Moonshot models (reasoning)
+    "moonshotai/Kimi-K2-Thinking",
 ]
 
 # Maximum context window for all supported models
@@ -127,15 +174,41 @@ MAX_CONTEXT_WINDOW = 32768
 # Conservative max_tokens limit (leaves room for prompts)
 MAX_GENERATION_TOKENS = 30000
 
+# Models that do NOT support tool calling
+# - gpt-oss models use GptOssRenderer which doesn't support tools
+# - Base/non-instruct models use RoleColonRenderer which doesn't support tools
+MODELS_WITHOUT_TOOL_SUPPORT: set[str] = {
+    # OpenAI reasoning models (GptOssRenderer)
+    "openai/gpt-oss-120b",
+    "openai/gpt-oss-20b",
+    # Base models (RoleColonRenderer)
+    "Qwen/Qwen3-30B-A3B-Base",
+    "Qwen/Qwen3-8B-Base",
+    "deepseek-ai/DeepSeek-V3.1-Base",
+    "meta-llama/Llama-3.1-70B",
+    "meta-llama/Llama-3.1-8B",
+    "meta-llama/Llama-3.2-3B",
+    "meta-llama/Llama-3.2-1B",
+}
+
+
+def supports_tool_calling(model_path: str) -> bool:
+    """Check if a model supports tool calling."""
+    return model_path not in MODELS_WITHOUT_TOOL_SUPPORT
+
+
 # Maximum LoRA rank per model
 # Models not listed here default to 128
 MODEL_MAX_LORA_RANK: dict[str, int] = {
-    # Max 32
+    # Max 32 (reasoning models)
     "openai/gpt-oss-120b": 32,
     "openai/gpt-oss-20b": 32,
-
+    "moonshotai/Kimi-K2-Thinking": 32,
+    # Max 64 (large MoE models)
     "Qwen/Qwen3-235B-A22B-Instruct-2507": 64,
+    "Qwen/Qwen3-VL-235B-A22B-Instruct": 64,
     "Qwen/Qwen3-30B-A3B-Instruct-2507": 64,
+    "Qwen/Qwen3-VL-30B-A3B-Instruct": 64,
     "Qwen/Qwen3-30B-A3B": 64,
     "Qwen/Qwen3-30B-A3B-Base": 64,
     "deepseek-ai/DeepSeek-V3.1": 64,
@@ -229,13 +302,22 @@ class RolloutConfig(BaseModel):
     )
     mcp_url: str | list[str] | None = Field(
         default=None,
-        description="MCP server URL(s) for tools. Can be a single URL or a list of URLs. Can be used alongside
+        description="MCP server URL(s) for tools. Can be a single URL or a list of URLs. Can be used alongside tools.py to combine both tool sources.",
+    )
+    tool_timeout: int = Field(
+        default=60,
+        gt=0,
+        description="Timeout in seconds for tool calls. Browser automation may need longer timeouts (default: 60s).",
     )
     max_tool_response_chars: int | None = Field(
         default=4000,
        gt=0,
         description="Maximum characters for tool responses. Longer responses are truncated. Set to null/None to disable truncation.",
     )
+    include_thinking: bool = Field(
+        default=False,
+        description="Whether to include <think>...</think> blocks in messages passed to reward functions. Default is False (thinking is stripped).",
+    )
 
 
 class TrainerConfig(BaseModel):
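A small sketch (assuming pydantic v2, which the ConfigDict/model_validator usage above implies) of how the new validation models and helpers could be exercised; the JSON line is an invented example:

# Hypothetical sketch: validating one train.jsonl line and checking model capabilities.
import json
from rnow.models import MODEL_MAX_LORA_RANK, TrainEntryRL, supports_tool_calling

line = (
    '{"messages": [{"role": "user", "content": "Write tests for utils.py"}], '
    '"rewards": ["test_code"], "tools": ["run_python"], "docker": "python:3.11-slim"}'
)
entry = TrainEntryRL.model_validate(json.loads(line))
print(entry.docker)  # "python:3.11-slim" -> rollout runs in an isolated sandbox

model = "Qwen/Qwen3-8B"
print(supports_tool_calling(model))         # True: not listed in MODELS_WITHOUT_TOOL_SUPPORT
print(MODEL_MAX_LORA_RANK.get(model, 128))  # models not listed default to 128

TrainEntryRL requires a non-empty rewards list, while the base TrainEntry leaves it optional (for SFT datasets).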
@@ -1,13 +1,13 @@
 project_id: ""
-project_name: "
+project_name: "rl-project"
 dataset_id: ""
-dataset_name: "
+dataset_name: "train"
 dataset_type: rl
 organization_id: ""
 data:
   train_file: train.jsonl
-  batch_size:
-  group_size:
+  batch_size: 16
+  group_size: 8
 model:
   path: Qwen/Qwen3-8B
   qlora_rank: 32
@@ -19,9 +19,9 @@ algorithm:
   kl_penalty_coef: 0.01
 rollout:
   max_turns: 1
-  max_tokens:
+  max_tokens: 4096
   termination_policy: last_tool
 trainer:
-  num_epochs:
+  num_epochs: 6
   learning_rate: 0.0001
-  save_step:
+  save_step: 8