rnow-0.2.4-py3-none-any.whl → rnow-0.3.9-py3-none-any.whl

This diff shows the differences in content between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
rnow/core/__init__.py CHANGED
@@ -14,9 +14,10 @@ from .reward import (
14
14
  clear_reward_registry,
15
15
  compute_total_reward,
16
16
  is_precondition,
17
+ is_sandbox_reward,
17
18
  reward,
18
19
  )
19
- from .tool import TOOL_REGISTRY, clear_tool_registry, tool
20
+ from .tool import TOOL_REGISTRY, clear_tool_registry, is_sandbox_tool, tool
20
21
 
21
22
  __all__ = [
22
23
  # User-facing API
@@ -29,5 +30,7 @@ __all__ = [
29
30
  "clear_reward_registry",
30
31
  "clear_tool_registry",
31
32
  "is_precondition",
33
+ "is_sandbox_reward",
34
+ "is_sandbox_tool",
32
35
  "compute_total_reward",
33
36
  ]
rnow/core/reward.py CHANGED
@@ -32,6 +32,14 @@ def is_precondition(name: str) -> bool:
32
32
  return getattr(fn, "_is_precondition", False)
33
33
 
34
34
 
35
+ def is_sandbox_reward(name: str) -> bool:
36
+ """Check if a reward function should run inside the Docker sandbox."""
37
+ fn = REWARD_REGISTRY.get(name)
38
+ if fn is None:
39
+ return False
40
+ return getattr(fn, "_is_sandbox", False)
41
+
42
+
35
43
  def compute_total_reward(reward_results: dict[str, float]) -> float:
36
44
  """
37
45
  Compute total reward with precondition logic.
@@ -105,8 +113,7 @@ def _validate_reward_signature(func: Callable) -> None:
105
113
  second_param = params[1]
106
114
  if second_param.name not in hints:
107
115
  raise TypeError(
108
- f"Reward '{func.__name__}': parameter '{second_param.name}' must have "
109
- "type hint 'list'."
116
+ f"Reward '{func.__name__}': parameter '{second_param.name}' must have type hint 'list'."
110
117
  )
111
118
  second_type = hints[second_param.name]
112
119
  # Allow list or List (from typing)
@@ -127,7 +134,13 @@ def _validate_reward_signature(func: Callable) -> None:
127
134
  raise TypeError(f"Reward '{func.__name__}' must return 'float', got '{return_type}'.")
128
135
 
129
136
 
130
- def reward(fn: Callable = None, *, precondition: bool = False) -> Callable:
137
+ def reward(
138
+ fn: Callable = None,
139
+ *,
140
+ precondition: bool = False,
141
+ sandbox: bool = False,
142
+ timeout: int = 60,
143
+ ) -> Callable:
131
144
  """
132
145
  Decorator to register reward functions with validation.
133
146
 
@@ -150,10 +163,26 @@ def reward(fn: Callable = None, *, precondition: bool = False) -> Callable:
150
163
  # If this returns 1, total reward is 1 + sum(other rewards)
151
164
  return 1.0 if valid_format else 0.0
152
165
 
166
+ @reward(sandbox=True, timeout=120) # Run inside Docker sandbox with 2min timeout
167
+ def test_code(args: RewardArgs, messages: list) -> float:
168
+ # This executes inside the sandbox container
169
+ # Has access to files created by LLM, can run pytest, etc.
170
+ import subprocess
171
+ result = subprocess.run(["pytest", "-q"])
172
+ return 1.0 if result.returncode == 0 else 0.0
173
+
153
174
  Args:
154
175
  precondition: If True, this reward acts as a gate:
155
176
  - If precondition reward is 0, total reward is 0
156
177
  - If precondition reward is 1, total reward is 1 + sum(other rewards)
178
+ sandbox: If True, this reward runs inside the Docker sandbox container
179
+ instead of the trainer. Useful for rewards that need to:
180
+ - Access files created during LLM interaction
181
+ - Run tests (pytest, etc.)
182
+ - Execute code in the same environment as tools
183
+ timeout: Timeout in seconds for this reward function (default: 60).
184
+ If the reward times out, it returns a special "timeout" status
185
+ instead of a numeric value.
157
186
  """
158
187
 
159
188
  def decorator(func):
@@ -177,6 +206,8 @@ def reward(fn: Callable = None, *, precondition: bool = False) -> Callable:
177
206
  func._is_reward = True
178
207
  func._reward_name = func.__name__
179
208
  func._is_precondition = precondition
209
+ func._is_sandbox = sandbox
210
+ func._timeout = timeout
180
211
 
181
212
  # Register the function
182
213
  REWARD_REGISTRY[func.__name__] = func
rnow/core/tool.py CHANGED
@@ -34,6 +34,14 @@ def clear_tool_registry() -> None:
34
34
  TOOL_REGISTRY.clear()
35
35
 
36
36
 
37
+ def is_sandbox_tool(name: str) -> bool:
38
+ """Check if a tool should run inside the Docker sandbox."""
39
+ fn = TOOL_REGISTRY.get(name)
40
+ if fn is None:
41
+ return False
42
+ return getattr(fn, "_is_sandbox", False)
43
+
44
+
37
45
  def _map_type_to_json_schema(py_type: Any) -> dict[str, Any]:
38
46
  """
39
47
  Map a Python type annotation to a JSON Schema fragment.
@@ -332,7 +340,7 @@ def _try_coerce(value: Any, expected_types: list[str]) -> tuple[bool, Any]:
332
340
  return False, value
333
341
 
334
342
 
335
- def tool(fn: Callable = None) -> Callable:
343
+ def tool(fn: Callable = None, *, sandbox: bool = False, timeout: int = 60) -> Callable:
336
344
  """
337
345
  Decorator to register tool functions with robust validation.
338
346
 
@@ -351,10 +359,12 @@ def tool(fn: Callable = None) -> Callable:
351
359
  '''Search the web.'''
352
360
  return requests.get(...).json()
353
361
 
354
- @tool
355
- def calculator(expr: str) -> float:
356
- '''Evaluate math expression.'''
357
- return eval(expr)
362
+ @tool(sandbox=True, timeout=120) # Run inside Docker sandbox with 2min timeout
363
+ def run_python(code: str) -> str:
364
+ '''Execute Python code in isolated environment.'''
365
+ import subprocess
366
+ result = subprocess.run(["python", "-c", code], capture_output=True)
367
+ return result.stdout.decode()
358
368
 
359
369
  Supported parameter types:
360
370
  - Primitives: str, int, float, bool
@@ -362,6 +372,16 @@ def tool(fn: Callable = None) -> Callable:
362
372
  - Optional: Optional[T], T | None
363
373
  - Literal: Literal["option1", "option2"]
364
374
  - Union: Union[str, int]
375
+
376
+ Args:
377
+ sandbox: If True, this tool runs inside the Docker sandbox container.
378
+ Required when the train.jsonl entry has a "docker" field.
379
+ Tools with sandbox=True can:
380
+ - Execute code in an isolated environment
381
+ - Create/modify files that sandbox rewards can check
382
+ - Access custom dependencies installed in the Docker image
383
+ timeout: Timeout in seconds for this tool function (default: 60).
384
+ If the tool times out, it returns a timeout error message.
365
385
  """
366
386
 
367
387
  def decorator(func: Callable) -> Callable:
@@ -400,18 +420,20 @@ def tool(fn: Callable = None) -> Callable:
400
420
  func._tool_name = func.__name__
401
421
  func._schema = schema
402
422
  func._description = doc # Already validated and stripped above
423
+ func._is_sandbox = sandbox
424
+ func._timeout = timeout
403
425
 
404
426
  TOOL_REGISTRY[func._tool_name] = func
405
427
 
406
428
  return func
407
429
 
408
- # Support both @tool and @tool()
430
+ # Support both @tool and @tool(sandbox=True)
409
431
  return decorator(fn) if fn else decorator
410
432
 
411
433
 
412
434
  def validate_tools_file(filepath) -> list:
413
435
  """
414
- Validate an env.py file without executing it.
436
+ Validate a tools.py file without executing it.
415
437
 
416
438
  Parses the AST to find @tool decorated functions and checks:
417
439
  - Function has a non-empty docstring
rnow/models.py CHANGED
@@ -10,6 +10,8 @@ Trainer-internal types (Env, StepResult, Observation) live in docker/trainer/
10
10
  where tinker is available.
11
11
  """
12
12
 
13
+ from __future__ import annotations
14
+
13
15
  from enum import Enum
14
16
  from typing import Literal
15
17
 
@@ -37,7 +39,7 @@ class OrgRole(str, Enum):
37
39
 
38
40
 
39
41
  class DatasetType(str, Enum):
40
- SFT = "sft" # Supervised Fine-Tuning
42
+ SFT = "sft" # Supervised Finetuning
41
43
  RL = "rl" # Reinforcement Learning
42
44
 
43
45
 
@@ -62,11 +64,51 @@ class RewardArgs(BaseModel):
62
64
 
63
65
  metadata: dict = Field(default_factory=dict)
64
66
  variables: dict = Field(default_factory=dict)
67
+ secrets: dict = Field(
68
+ default_factory=dict
69
+ ) # User-defined secrets from .env file or project settings
65
70
 
66
71
  class Config:
67
72
  arbitrary_types_allowed = True
68
73
 
69
74
 
75
+ # --- train.jsonl validation models ---
76
+
77
+
78
+ class Message(BaseModel):
79
+ """A single message in a conversation."""
80
+
81
+ model_config = ConfigDict(extra="allow") # Allow extra fields like tool_calls
82
+
83
+ role: Literal["system", "user", "assistant", "tool"]
84
+ content: str
85
+
86
+
87
+ class TrainEntry(BaseModel):
88
+ """A single entry in train.jsonl."""
89
+
90
+ model_config = ConfigDict(extra="allow") # Allow extra fields like variables, metadata
91
+
92
+ messages: list[Message] = Field(..., min_length=1)
93
+ rewards: list[str] | None = None # Required for RL, optional for SFT
94
+ tools: list[str] | None = None # Optional: filter which tools are available
95
+ docker: str | None = None # Optional: Docker image for isolated sandbox
96
+ metadata: dict | None = None
97
+ variables: dict | None = None
98
+
99
+ @model_validator(mode="after")
100
+ def validate_messages_not_empty(self):
101
+ if not self.messages:
102
+ raise ValueError("messages list cannot be empty")
103
+ return self
104
+
105
+
106
+ class TrainEntryRL(TrainEntry):
107
+ """Train entry for RL datasets - rewards field is required."""
108
+
109
+ rewards: list[str] = Field(..., min_length=1)
110
+
111
+
70
112
  class DeviceCode(BaseModel):
71
113
  device_code: str
72
114
  user_code: str
@@ -97,7 +139,7 @@ class Organizations(BaseModel):
97
139
 
98
140
  # Supported model IDs
99
141
  SUPPORTED_MODELS = Literal[
100
- # Qwen models
142
+ # Qwen models (text)
101
143
  "Qwen/Qwen3-235B-A22B-Instruct-2507",
102
144
  "Qwen/Qwen3-30B-A3B-Instruct-2507",
103
145
  "Qwen/Qwen3-30B-A3B",
@@ -106,7 +148,10 @@ SUPPORTED_MODELS = Literal[
106
148
  "Qwen/Qwen3-8B",
107
149
  "Qwen/Qwen3-8B-Base",
108
150
  "Qwen/Qwen3-4B-Instruct-2507",
109
- # OpenAI models
151
+ # Qwen models (vision)
152
+ "Qwen/Qwen3-VL-235B-A22B-Instruct",
153
+ "Qwen/Qwen3-VL-30B-A3B-Instruct",
154
+ # OpenAI models (reasoning)
110
155
  "openai/gpt-oss-120b",
111
156
  "openai/gpt-oss-20b",
112
157
  # DeepSeek models
@@ -119,6 +164,8 @@ SUPPORTED_MODELS = Literal[
119
164
  "meta-llama/Llama-3.1-8B-Instruct",
120
165
  "meta-llama/Llama-3.2-3B",
121
166
  "meta-llama/Llama-3.2-1B",
167
+ # Moonshot models (reasoning)
168
+ "moonshotai/Kimi-K2-Thinking",
122
169
  ]
123
170
 
124
171
  # Maximum context window for all supported models
@@ -127,15 +174,41 @@ MAX_CONTEXT_WINDOW = 32768
127
174
  # Conservative max_tokens limit (leaves room for prompts)
128
175
  MAX_GENERATION_TOKENS = 30000
129
176
 
177
+ # Models that do NOT support tool calling
178
+ # - gpt-oss models use GptOssRenderer which doesn't support tools
179
+ # - Base/non-instruct models use RoleColonRenderer which doesn't support tools
180
+ MODELS_WITHOUT_TOOL_SUPPORT: set[str] = {
181
+ # OpenAI reasoning models (GptOssRenderer)
182
+ "openai/gpt-oss-120b",
183
+ "openai/gpt-oss-20b",
184
+ # Base models (RoleColonRenderer)
185
+ "Qwen/Qwen3-30B-A3B-Base",
186
+ "Qwen/Qwen3-8B-Base",
187
+ "deepseek-ai/DeepSeek-V3.1-Base",
188
+ "meta-llama/Llama-3.1-70B",
189
+ "meta-llama/Llama-3.1-8B",
190
+ "meta-llama/Llama-3.2-3B",
191
+ "meta-llama/Llama-3.2-1B",
192
+ }
193
+
194
+
195
+ def supports_tool_calling(model_path: str) -> bool:
196
+ """Check if a model supports tool calling."""
197
+ return model_path not in MODELS_WITHOUT_TOOL_SUPPORT
198
+
199
+
130
200
  # Maximum LoRA rank per model
131
201
  # Models not listed here default to 128
132
202
  MODEL_MAX_LORA_RANK: dict[str, int] = {
133
- # Max 32
203
+ # Max 32 (reasoning models)
134
204
  "openai/gpt-oss-120b": 32,
135
205
  "openai/gpt-oss-20b": 32,
136
- # Max 64
206
+ "moonshotai/Kimi-K2-Thinking": 32,
207
+ # Max 64 (large MoE models)
137
208
  "Qwen/Qwen3-235B-A22B-Instruct-2507": 64,
209
+ "Qwen/Qwen3-VL-235B-A22B-Instruct": 64,
138
210
  "Qwen/Qwen3-30B-A3B-Instruct-2507": 64,
211
+ "Qwen/Qwen3-VL-30B-A3B-Instruct": 64,
139
212
  "Qwen/Qwen3-30B-A3B": 64,
140
213
  "Qwen/Qwen3-30B-A3B-Base": 64,
141
214
  "deepseek-ai/DeepSeek-V3.1": 64,
@@ -229,13 +302,22 @@ class RolloutConfig(BaseModel):
229
302
  )
230
303
  mcp_url: str | list[str] | None = Field(
231
304
  default=None,
232
- description="MCP server URL(s) for tools. Can be a single URL or a list of URLs. Can be used alongside env.py to combine both tool sources.",
305
+ description="MCP server URL(s) for tools. Can be a single URL or a list of URLs. Can be used alongside tools.py to combine both tool sources.",
306
+ )
307
+ tool_timeout: int = Field(
308
+ default=60,
309
+ gt=0,
310
+ description="Timeout in seconds for tool calls. Browser automation may need longer timeouts (default: 60s).",
233
311
  )
234
312
  max_tool_response_chars: int | None = Field(
235
313
  default=4000,
236
314
  gt=0,
237
315
  description="Maximum characters for tool responses. Longer responses are truncated. Set to null/None to disable truncation.",
238
316
  )
317
+ include_thinking: bool = Field(
318
+ default=False,
319
+ description="Whether to include <think>...</think> blocks in messages passed to reward functions. Default is False (thinking is stripped).",
320
+ )
239
321
 
240
322
 
241
323
  class TrainerConfig(BaseModel):
@@ -9,7 +9,7 @@ data:
9
9
  batch_size: 32
10
10
  group_size: 16
11
11
  model:
12
- path: openai/gpt-oss-20b
12
+ path: Qwen/Qwen3-8B
13
13
  qlora_rank: 32
14
14
  name: "Countdown Reasoning Model"
15
15
  description: "Reproduces DeepSeek R1 aha moment using GRPO on the Countdown game"
@@ -9,7 +9,7 @@ data:
9
9
  batch_size: 32
10
10
  group_size: 16
11
11
  model:
12
- path: Qwen/Qwen3-8B
12
+ path: Qwen/Qwen3-30B-A3B-Instruct-2507
13
13
  qlora_rank: 32
14
14
  name: "SimpleQA Agent"
15
15
  description: "Multi-turn RL model trained on SimpleQA factual questions using Tavily MCP"
@@ -1,13 +1,13 @@
1
1
  project_id: ""
2
- project_name: "OpenMathReasoning"
2
+ project_name: "rl-project"
3
3
  dataset_id: ""
4
- dataset_name: "math-problems"
4
+ dataset_name: "train"
5
5
  dataset_type: rl
6
6
  organization_id: ""
7
7
  data:
8
8
  train_file: train.jsonl
9
- batch_size: 32
10
- group_size: 16
9
+ batch_size: 16
10
+ group_size: 8
11
11
  model:
12
12
  path: Qwen/Qwen3-8B
13
13
  qlora_rank: 32
@@ -19,9 +19,9 @@ algorithm:
19
19
  kl_penalty_coef: 0.01
20
20
  rollout:
21
21
  max_turns: 1
22
- max_tokens: 16384
22
+ max_tokens: 4096
23
23
  termination_policy: last_tool
24
24
  trainer:
25
- num_epochs: 4
25
+ num_epochs: 6
26
26
  learning_rate: 0.0001
27
- save_step: 333
27
+ save_step: 8