synth-ai 0.2.10__py3-none-any.whl → 0.2.13.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai might be problematic.
- examples/agora_ex/README_MoE.md +224 -0
- examples/agora_ex/__init__.py +7 -0
- examples/agora_ex/agora_ex.py +65 -0
- examples/agora_ex/agora_ex_task_app.py +590 -0
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +121 -0
- examples/agora_ex/reward_fn_grpo-human.py +129 -0
- examples/agora_ex/system_prompt_CURRENT.md +63 -0
- examples/agora_ex/task_app/agora_ex_task_app.py +590 -0
- examples/agora_ex/task_app/reward_fn_grpo-human.py +129 -0
- examples/agora_ex/task_app/system_prompt_CURRENT.md +63 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +175 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
- examples/multi_step/crafter_rl_lora.md +51 -10
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +494 -0
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +35 -0
- examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
- examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +32 -0
- examples/warming_up_to_rl/run_eval.py +267 -41
- examples/warming_up_to_rl/task_app/grpo_crafter.py +3 -33
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +42 -46
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +376 -193
- synth_ai/__init__.py +41 -1
- synth_ai/api/train/builders.py +74 -33
- synth_ai/api/train/cli.py +29 -6
- synth_ai/api/train/configs/__init__.py +44 -0
- synth_ai/api/train/configs/rl.py +133 -0
- synth_ai/api/train/configs/sft.py +94 -0
- synth_ai/api/train/configs/shared.py +24 -0
- synth_ai/api/train/env_resolver.py +18 -19
- synth_ai/api/train/supported_algos.py +8 -5
- synth_ai/api/train/utils.py +6 -1
- synth_ai/cli/__init__.py +4 -2
- synth_ai/cli/_storage.py +19 -0
- synth_ai/cli/balance.py +14 -2
- synth_ai/cli/calc.py +37 -22
- synth_ai/cli/demo.py +38 -39
- synth_ai/cli/legacy_root_backup.py +12 -14
- synth_ai/cli/recent.py +12 -7
- synth_ai/cli/rl_demo.py +81 -102
- synth_ai/cli/status.py +4 -3
- synth_ai/cli/task_apps.py +146 -137
- synth_ai/cli/traces.py +4 -3
- synth_ai/cli/watch.py +3 -2
- synth_ai/demos/core/cli.py +121 -159
- synth_ai/environments/examples/crafter_classic/environment.py +16 -0
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/client.py +85 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/jobs/client.py +15 -3
- synth_ai/judge_schemas.py +127 -0
- synth_ai/rubrics/__init__.py +22 -0
- synth_ai/rubrics/validators.py +126 -0
- synth_ai/task/server.py +14 -7
- synth_ai/tracing_v3/decorators.py +51 -26
- synth_ai/tracing_v3/examples/basic_usage.py +12 -7
- synth_ai/tracing_v3/llm_call_record_helpers.py +107 -53
- synth_ai/tracing_v3/replica_sync.py +8 -4
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/storage/utils.py +11 -9
- synth_ai/tracing_v3/turso/__init__.py +12 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -1
- synth_ai/tracing_v3/turso/native_manager.py +28 -15
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/METADATA +4 -2
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/RECORD +73 -40
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/entry_points.txt +0 -1
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/top_level.txt +0 -0
`@@ -0,0 +1,129 @@` (new file, 129 added lines; matches `reward_fn_grpo-human.py` in the file list above):

```python
import logging
import requests
from typing import Any, Dict, Union

# ---------------------------------------------------------------------------
# Run configuration defaults (override via kwargs when invoking reward_fn)
# ---------------------------------------------------------------------------
RUN_TYPE: str = "rl_training_human"
RUN_VERSION: float = 3.5
MODEL_NAME: str = "Qwen3-30B-A3B-Instruct"
EXPERIMENT_NAME: str = "qwen3_30b_human"
USER_PROMPT_VERSION: str = "5.0"
SYSTEM_PROMPT_VERSION: str = "4.0"

logger = logging.getLogger(__name__)


def _coerce_step_value(value: Any) -> Union[int, None]:
    try:
        if value is None:
            return None
        return int(value)
    except (TypeError, ValueError):
        return None


def _build_metadata(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Compose the metadata payload sent with the evaluation request."""
    base_metadata: Dict[str, Any] = {
        "model": kwargs.get("model", MODEL_NAME),
        "experiment": kwargs.get("experiment", EXPERIMENT_NAME),
        "step_number": kwargs.get("step_number"),
        "user_prompt": kwargs.get("user_prompt", USER_PROMPT_VERSION),
        "batch_number": kwargs.get("batch_number"),
        "prompt_index": kwargs.get("prompt_index"),
        "rollout_group": kwargs.get("rollout_group"),
        "system_prompt": kwargs.get("system_prompt", SYSTEM_PROMPT_VERSION),
    }

    step_metadata = kwargs.get("metadata") or {}
    if isinstance(step_metadata, dict):
        # Map common harness metadata onto our schema when present
        step_value = _coerce_step_value(
            step_metadata.get("step") or step_metadata.get("step_number")
        )
        if step_value is not None:
            base_metadata["step_number"] = step_value

        rollout_value = step_metadata.get("rollout_group")
        if rollout_value is not None:
            base_metadata["rollout_group"] = rollout_value

        extras = step_metadata.get("extras")
        if extras:
            base_metadata["extras"] = extras

        # Preserve any additional custom metadata fields
        for key, value in step_metadata.items():
            if key in {"step", "step_number", "rollout_group", "extras"}:
                continue
            if key not in base_metadata or base_metadata.get(key) is None:
                base_metadata[key] = value

    # Strip keys that remain None so the JSON is clean
    return {key: value for key, value in base_metadata.items() if value is not None}


def reward_fn(
    completion: str,
    **kwargs,
) -> float:
    """Evaluate the model response and return a reward score (0.0-1.0)."""
    run_type = kwargs.get("run_type", RUN_TYPE)
    run_version = kwargs.get("run_version", RUN_VERSION)
    metadata = _build_metadata(kwargs)

    payload = {
        "code": completion,
        "run_type": run_type,
        "run_version": run_version,
        "metadata": metadata,
    }

    try:
        response = requests.post(
            "https://eames-judge-api-769874896543.us-central1.run.app/evaluations-human",
            json=payload,
            timeout=1800,  # 30 minute timeout for screenshot generation
        )
        response.raise_for_status()
        result = response.json()

        logger.info("Evaluation complete:")
        logger.info(" Score: %s", result.get("score", 0.0))
        logger.info(" Feedback: %s", result.get("explanation", "N/A"))
        logger.info(" Processing Time (ms): %s", result.get("processing_time_ms", "N/A"))
        logger.info(" Worker ID: %s", result.get("worker_id", "N/A"))
        logger.info(" Success: %s", result.get("success", False))
        if metadata:
            logger.info(" Metadata sent: %s", metadata)

        screenshot_urls = result.get("screenshot_urls", {}) or {}
        if screenshot_urls:
            logger.info(" Screenshot URLs:")
            for key, url in screenshot_urls.items():
                logger.info(" %s: %s", key.capitalize(), url)

        score = result.get("score", 0.0)
        if not isinstance(score, (int, float)):
            logger.warning("Invalid score type: %s. Defaulting to 0.0", type(score))
            return 0.0

        return max(0.0, min(1.0, float(score)))

    except requests.exceptions.Timeout:
        logger.error("Request to evaluation server timed out")
        return 0.0

    except requests.exceptions.RequestException as exc:
        logger.error("Request to evaluation server failed: %s", exc)
        return 0.0

    except (KeyError, ValueError, TypeError) as exc:
        logger.error("Error parsing evaluation server response: %s", exc)
        return 0.0

    except Exception as exc:  # pylint: disable=broad-except
        logger.error("Unexpected error in reward_fn: %s", exc)
        return 0.0
```
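For context, a minimal sketch of how a trainer harness might invoke this reward function. The keyword arguments mirror the fields `_build_metadata` reads; the call site itself is hypothetical and not taken from the package.

```python
# Hypothetical call site: exercises the kwargs that _build_metadata understands.
# Note that reward_fn contacts the judge endpoint for real, so local experiments
# may want to monkeypatch requests.post or point the URL at a stub server.
import logging

logging.basicConfig(level=logging.INFO)

completion = "```tsx\nexport default function Page() { return <main /> }\n```"

score = reward_fn(
    completion,
    step_number=12,      # default metadata field
    batch_number=3,
    prompt_index=7,
    metadata={           # harness metadata; "step" here overrides step_number above
        "step": 13,
        "rollout_group": "grpo-group-0",
        "extras": {"seed": 42},
        "judge_note": "custom field preserved by _build_metadata",
    },
)
print(f"reward in [0.0, 1.0]: {score}")
```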
`@@ -0,0 +1,63 @@` (new file, 63 added lines; matches `system_prompt_CURRENT.md` in the file list above):

````markdown
You are a powerful agentic AI coding assistant called Eames working with a Next.js + Shadcn/UI TypeScript project to generate exactly one complete landing page file for SaaS/Software products.

## OUTPUT FORMAT

Return ONLY a single TypeScript React component file.
- Wrap in one code fence: ```tsx ... ```
- No explanations, no additional text before or after the fence
- If you cannot generate valid code, return nothing

## FILE SPECIFICATION

File: app/page.tsx
Length: Target 800-1200 lines (flexible based on content richness)
Tech: Next.js 14 App Router, TypeScript, React, Tailwind CSS only
**IMPORTANT:** DO NOT use images, image imports, or next/image. Use SVG icons, Tailwind patterns, gradients, or CSS shapes instead.

## REQUIRED STRUCTURE (in this exact order)

1. Imports: NONE (do not import next/image or any other libraries)

2. Metadata export:
```tsx
export const metadata = {
  title: "Page Title (max 80 chars)",
  description: "Page description (max 160 chars)"
}
```
3. Helper components (if needed): Define small inline components AFTER the Page export
   - Examples: FeatureCard, PricingCard, TestimonialCard, LogoItem
   - Keep minimal, no deep nesting

## DESIGN GUIDELINES
Ship something interesting rather than boring, but never ugly.
Include images and SVGs that are relevant to the category of business.

## TECHNICAL CONSTRAINTS

✓ Server component by default (no "use client" unless interactive state/events needed)
✓ Tailwind utility classes for ALL styling
✓ Semantic HTML5: `<main>`, `<section>`, `<header>`, `<footer>`, `<h1>`-`<h6>` hierarchy
✓ Use inline SVG for icons (simple shapes: circles, squares, arrows, checkmarks, etc.)
✓ Use Tailwind gradients, borders, and shadows for visual elements
✓ Use CSS shapes and patterns instead of images

✗ NO images - do not use `<img>`, `<Image>`, or any image imports
✗ NO next/image imports
✗ No data fetching (fetch, axios, server actions)
✗ No lorem ipsum - write real, specific copy

## VALIDATION

Your output must:
1. Be a single valid .tsx file with NO imports whatsoever
2. Include `export const metadata`
3. Include `export default function Page()`
4. Include all 8 required sections in order: Navbar, Hero, Logos, Features, Testimonials, Pricing, Final CTA, Footer
5. Use only Tailwind for styling
6. Be deployable in Next.js 14 App Router without errors
7. Use proper TypeScript syntax
8. Follow the specific product category requested in the user prompt
9. **NO images** - use SVG icons and Tailwind styling only

Given the user's prompt describing the website theme/product, generate the code immediately.
````
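As a rough illustration only (not code from the package), the VALIDATION list above implies a handful of static checks that could be applied to a generated completion before it is sent to the hosted judge; the helper below is hypothetical.

```python
# Hypothetical pre-checks mirroring the prompt's VALIDATION rules.
# Real scoring happens server-side via the judge API used by reward_fn.
import re

REQUIRED_SECTIONS = ["Navbar", "Hero", "Logos", "Features",
                     "Testimonials", "Pricing", "Final CTA", "Footer"]

def rough_prompt_checks(completion: str) -> dict:
    """Return pass/fail flags for a few of the prompt's structural rules."""
    fence = re.search(r"```tsx\n(.*?)```", completion, re.DOTALL)
    code = fence.group(1) if fence else ""
    return {
        "single_tsx_fence": fence is not None,
        "has_metadata_export": "export const metadata" in code,
        "has_page_export": "export default function Page" in code,
        "no_imports": "import " not in code,
        "no_next_image": "next/image" not in code and "<Image" not in code,
        "mentions_all_sections": all(s.lower() in code.lower() for s in REQUIRED_SECTIONS),
    }
```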
`@@ -0,0 +1,74 @@` (new file, 74 added lines; matches `examples/multi_step/configs/crafter_rl_outcome.toml` in the file list above):

```toml
# Crafter RL experiment – outcome rewards only (step rewards disabled)

[algorithm]
type = "online"
method = "policy_gradient"
variety = "gspo"

[services]
# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
task_url = "https://YOUR-MODAL-TASK-APP.modal.run"

[compute]
gpu_type = "H200"
gpu_count = 2

[topology]
type = "single_node_split"
gpus_for_vllm = 1
gpus_for_training = 1
gpus_for_ref = 0
tensor_parallel = 1

[vllm]
tensor_parallel_size = 1
max_model_len = 8192

[reference]
placement = "none"

[model]
base = "Qwen/Qwen3-4B"
trainer_mode = "lora"
label = "crafter-rl-outcome"

[lora]
r = 16
alpha = 32
dropout = 0.05
target_modules = ["all-linear"]

[rollout]
env_name = "crafter"
max_turns = 10
episodes_per_batch = 4
policy_name = "crafter-react"
max_concurrent_rollouts = 12
batches_per_step = 2
ops = ["agent", "env"]

[evaluation]
instances = 10
every_n_iters = 5
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

[training]
num_epochs = 1
iterations_per_epoch = 20
gradient_accumulation_steps = 1
max_accumulated_minibatch = 1
max_turns = 8
batch_size = 3
group_size = 4
learning_rate = 5e-5
log_interval = 1
weight_sync_interval = 1
step_rewards_enabled = false
event_rewards_kind = "unique"

[training.weight_sync]
enable = true
targets = ["policy"]
mode = "direct"
direct = true
verify_every_k = 0
```
`@@ -0,0 +1,175 @@` (new file, 175 added lines; matches `examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml` in the file list above):

```toml
# Crafter RL experiment – stepwise shaping with hosted judge rubrics
#
# This configuration extends the stepwise LoRA baseline by wiring the Synth judge
# service so evaluation rolls combine dense step rewards with hosted rubric scoring.

[algorithm]
type = "online"
method = "policy_gradient"
variety = "gspo"

[services]
# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
# Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
judge_url = "https://YOUR-BACKEND-ENDPOINT/api"

[compute]
gpu_type = "H200"
gpu_count = 2

[topology]
type = "single_node_split"
gpus_for_vllm = 1
gpus_for_training = 1
gpus_for_ref = 0
tensor_parallel = 1

[vllm]
tensor_parallel_size = 1
max_model_len = 8192

[reference]
placement = "none"

[model]
base = "Qwen/Qwen3-4B"
trainer_mode = "lora"
label = "crafter-rl-stepwise-hosted-judge"

[lora]
r = 16
alpha = 32
dropout = 0.05
target_modules = ["all-linear"]

[rollout]
env_name = "crafter"
max_turns = 10
episodes_per_batch = 4
policy_name = "crafter-react"
max_concurrent_rollouts = 8
batches_per_step = 2
ops = ["agent", "env"]

[rollout.env_config]
difficulty = "easy"

[rollout.env_config.step_rewards]
enabled = true
mode = "decision_stepwise"
strategy = "consistent"  # +1 for each decision that unlocks a new achievement
indicator_lambda = 1.0
step_beta = 0.0

[rollout.policy_config]
temperature = 0.2
top_p = 0.95
max_tokens = 512

[evaluation]
instances = 16
every_n_iters = 8
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

[training]
num_epochs = 1
iterations_per_epoch = 16
gradient_accumulation_steps = 1
max_accumulated_minibatch = 1
max_turns = 10
batch_size = 4
group_size = 4
learning_rate = 5e-5
log_interval = 1
weight_sync_interval = 1
event_rewards_kind = "unique"

# Enable dense decision rewards in the trainer to mirror env_config step rewards.
step_rewards_enabled = true
step_rewards_mode = "decision_stepwise"
step_rewards_indicator_lambda = 1.0
step_rewards_beta = 0.0
step_rewards_strategy = "consistent"

[training.weight_sync]
enable = true
targets = ["policy"]
mode = "direct"
direct = true
verify_every_k = 0

[rubric]
enabled = true
# Blend the hosted judge scores with environment returns inside the trainer.
[rubric.weights]
env = 0.2
event = 0.4
outcome = 0.4

[rubric.event]
# Hosted judge rubric for per-decision progress scoring.
rubric_id = "crafter/event@v1"

[rubric.outcome]
# Hosted judge rubric for final trajectory scoring.
rubric_id = "crafter/outcome@v1"

[judge]
type = "gemini"  # or "groq" when routing to Groq-hosted judges
timeout_s = 45

[judge.options]
event = true
outcome = true
provider = "openai"
model = "openai/gpt-oss-120b"
rubric_id = "crafter/bundle@v1"
max_concurrency = 6
tracks = ["process", "reasoning", "progress", "outcome"]

[judge.options.rubric_overrides]

[judge.options.rubric_overrides.event]
goal_text = """
Treat each decision as a check for new Crafter achievements.
Award the top score only when the log shows a fresh achievement unlock or an immediately verifiable deterministic completion.
Keep otherwise useful setup actions in a narrow low band so non-achievement turns stay near zero."""
aggregation = "weighted_sum"

[[judge.options.rubric_overrides.event.criteria]]
id = "progress.unique_achievements"
weight = 0.9
scale = "binary"
description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0."

[[judge.options.rubric_overrides.event.criteria]]
id = "process.intent_alignment"
weight = 0.1
scale = "bounded"
description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock."

[judge.options.rubric_overrides.outcome]
goal_text = """
Summarise the episode outcome in relation to Crafter's win condition:
survive, accumulate resources, and craft advanced tools or structures.
Highlight notable achievements, safety failures, and preparedness for future exploration."""
aggregation = "weighted_sum"

[[judge.options.rubric_overrides.outcome.criteria]]
id = "outcome.goal_completion"
weight = 0.6
scale = "binary"
description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace)."

[[judge.options.rubric_overrides.outcome.criteria]]
id = "outcome.achievement_depth"
weight = 0.4
scale = "bounded"
description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success."

[judge.options.weights]
process = 0.05
reasoning = 0.15
progress = 0.30
outcome = 0.50
```
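To make the two weight tables concrete, here is a small sketch of the arithmetic they suggest. It assumes a plain weighted sum, in line with `aggregation = "weighted_sum"` and the config comments; the trainer's actual blending code is not shown in this diff, and all numeric inputs below are made up.

```python
# Sketch only: the arithmetic implied by [judge.options.weights] and [rubric.weights].

# Per-track judge scores in [0, 1] (example values).
judge_tracks = {"process": 0.5, "reasoning": 0.7, "progress": 1.0, "outcome": 0.6}
judge_track_weights = {"process": 0.05, "reasoning": 0.15, "progress": 0.30, "outcome": 0.50}

# Weighted sum across tracks -> a single judge "event" score (0.73 here).
judge_event_score = sum(judge_tracks[t] * w for t, w in judge_track_weights.items())

# Blend judge scores with the environment return using [rubric.weights].
rubric_weights = {"env": 0.2, "event": 0.4, "outcome": 0.4}
env_return = 0.25          # e.g. a normalized Crafter achievement return
judge_outcome_score = 0.6  # e.g. a score from the crafter/outcome@v1 rubric

blended = (
    rubric_weights["env"] * env_return
    + rubric_weights["event"] * judge_event_score
    + rubric_weights["outcome"] * judge_outcome_score
)
print(round(blended, 4))  # 0.2*0.25 + 0.4*0.73 + 0.4*0.6 = 0.582
```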
`@@ -0,0 +1,83 @@` (new file, 83 added lines; matches `examples/multi_step/configs/crafter_rl_stepwise_shaped.toml` in the file list above):

```toml
# Crafter RL experiment – shaped stepwise rewards (achievement + resource shaping)

[algorithm]
type = "online"
method = "policy_gradient"
variety = "gspo"

[services]
# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
task_url = "https://YOUR-MODAL-TASK-APP.modal.run"

[compute]
gpu_type = "H200"
gpu_count = 2

[topology]
type = "single_node_split"
gpus_for_vllm = 1
gpus_for_training = 1
gpus_for_ref = 0
tensor_parallel = 1

[vllm]
tensor_parallel_size = 1
max_model_len = 8192

[reference]
placement = "none"

[model]
base = "Qwen/Qwen3-4B"
trainer_mode = "lora"
label = "crafter-rl-stepwise-shaped"

[lora]
r = 16
alpha = 32
dropout = 0.05
target_modules = ["all-linear"]

[rollout]
env_name = "crafter"
max_turns = 10
episodes_per_batch = 4
policy_name = "crafter-react"
max_concurrent_rollouts = 8
batches_per_step = 2
ops = ["agent", "env"]

[evaluation]
instances = 10
every_n_iters = 10
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

[training]
num_epochs = 1
iterations_per_epoch = 10
gradient_accumulation_steps = 1
max_accumulated_minibatch = 1
max_turns = 10
batch_size = 4
group_size = 4
learning_rate = 5e-5
log_interval = 1
weight_sync_interval = 1
step_rewards_enabled = true
step_rewards_mode = "decision_stepwise"
step_rewards_indicator_lambda = 0.5
step_rewards_beta = 0.0
event_rewards_kind = "unique"
step_rewards_strategy = "per_achievement"

# Reward each achievement up to a cap inside `compute_stepwise_reward`
step_rewards_weights = { collect_sapling = 0.6, collect_wood = 0.8, collect_stone = 1.0, collect_iron = 1.2, collect_drink = 0.4, collect_food = 0.4 }
step_rewards_k_limits = { collect_sapling = 2, collect_wood = 4, collect_stone = 3, collect_iron = 3, collect_drink = 3, collect_food = 3 }

[training.weight_sync]
enable = true
targets = ["policy"]
mode = "direct"
direct = true
verify_every_k = 0
```
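The `per_achievement` strategy above pairs a weight with a per-episode cap for each achievement. Below is a minimal sketch of that capped-credit idea using a hypothetical helper; the package's own `compute_stepwise_reward` is referenced in the config comment but is not part of this diff, so this is an illustration only.

```python
from collections import Counter

# Values copied from step_rewards_weights / step_rewards_k_limits above.
WEIGHTS = {"collect_sapling": 0.6, "collect_wood": 0.8, "collect_stone": 1.0,
           "collect_iron": 1.2, "collect_drink": 0.4, "collect_food": 0.4}
K_LIMITS = {"collect_sapling": 2, "collect_wood": 4, "collect_stone": 3,
            "collect_iron": 3, "collect_drink": 3, "collect_food": 3}

def shaped_step_reward(new_events: list, counts: Counter) -> float:
    """Hypothetical per_achievement shaping: credit each event until its cap.

    `new_events` are achievements logged for this decision; `counts` tracks how
    many times each achievement has already been rewarded this episode.
    """
    reward = 0.0
    for event in new_events:
        if event in WEIGHTS and counts[event] < K_LIMITS[event]:
            reward += WEIGHTS[event]
            counts[event] += 1
    return reward

counts = Counter()
print(shaped_step_reward(["collect_wood", "collect_wood"], counts))  # 1.6
print(shaped_step_reward(["collect_wood"] * 5, counts))              # 1.6 (cap of 4 reached)
```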
`@@ -0,0 +1,78 @@` (new file, 78 added lines; matches `examples/multi_step/configs/crafter_rl_stepwise_simple.toml` in the file list above):

```toml
# Crafter RL experiment – simple stepwise rewards (1 point per new achievement)

[algorithm]
type = "online"
method = "policy_gradient"
variety = "gspo"

[services]
# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
task_url = "https://YOUR-MODAL-TASK-APP.modal.run"

[compute]
gpu_type = "H200"
gpu_count = 2

[topology]
type = "single_node_split"
gpus_for_vllm = 1
gpus_for_training = 1
gpus_for_ref = 0
tensor_parallel = 1

[vllm]
tensor_parallel_size = 1
max_model_len = 8192

[reference]
placement = "none"

[model]
base = "Qwen/Qwen3-4B"
trainer_mode = "lora"
label = "crafter-rl-stepwise-simple"

[lora]
r = 16
alpha = 32
dropout = 0.05
target_modules = ["all-linear"]

[rollout]
env_name = "crafter"
max_turns = 10
episodes_per_batch = 4
policy_name = "crafter-react"
max_concurrent_rollouts = 8
batches_per_step = 2
ops = ["agent", "env"]

[evaluation]
instances = 10
every_n_iters = 10
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

[training]
num_epochs = 1
iterations_per_epoch = 10
gradient_accumulation_steps = 1
max_accumulated_minibatch = 1
max_turns = 10
batch_size = 4
group_size = 4
learning_rate = 5e-5
log_interval = 1
weight_sync_interval = 1
step_rewards_enabled = true
step_rewards_mode = "decision_stepwise"
step_rewards_indicator_lambda = 1.0
step_rewards_beta = 0.0
event_rewards_kind = "unique"
step_rewards_strategy = "consistent"

[training.weight_sync]
enable = true
targets = ["policy"]
mode = "direct"
direct = true
verify_every_k = 0
```
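For comparison with the shaped variant, the `consistent` strategy configured here appears to reduce to an indicator (the hosted-judge config describes it as "+1 for each decision that unlocks a new achievement"): a decision earns `step_rewards_indicator_lambda` when it unlocks something new and zero otherwise. A tiny sketch of that rule with a hypothetical helper, not the package's trainer code:

```python
def consistent_step_reward(newly_unlocked: set, indicator_lambda: float = 1.0) -> float:
    """+lambda when this decision unlocked any brand-new achievement, else 0."""
    return indicator_lambda if newly_unlocked else 0.0

print(consistent_step_reward({"collect_wood"}))  # 1.0
print(consistent_step_reward(set()))             # 0.0
```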
`@@ -2,28 +2,69 @@` (hunk from the walkthrough; matches the `examples/multi_step/crafter_rl_lora.md` entry, +51 -10, in the file list above):

````diff
 
 This walkthrough shows how to fine-tune the Crafter task app with our 10-step RL LoRA config.
 
-1. **
+1. **Deploy the Crafter task app on Modal**
 
    ```bash
-
+   # assumes .env contains SYNTH_API_KEY, ENVIRONMENT_API_KEY, GROQ_API_KEY, etc.
    uvx synth-ai modal-serve grpo-crafter \
      --env-file examples/warming_up_to_rl/.env \
      --name grpo-crafter-task-app
    ```
 
-   *
+   *The command prints the public `https://…modal.run` URL; copy it for the RL configs below.*
 
-2. **
+2. **Wire up the three RL experiment configs**
+
+   Update the `task_url` placeholder in each config with the Modal URL from step 1:
+
+   - `examples/multi_step/configs/crafter_rl_outcome.toml`
+   - `examples/multi_step/configs/crafter_rl_stepwise_simple.toml`
+   - `examples/multi_step/configs/crafter_rl_stepwise_shaped.toml`
+
+   The difference between them (all run with LoRA on 2×H100 split 1/1 for vLLM vs. trainer):
+
+   | Config | Reward signal |
+   | ------ | ------------- |
+   | `crafter_rl_outcome.toml` | Outcome-only — step rewards disabled. |
+   | `crafter_rl_stepwise_simple.toml` | Stepwise (“consistent”) — +1 for every newly unlocked achievement. |
+   | `crafter_rl_stepwise_shaped.toml` | Stepwise (“per_achievement”) — combines achievement credit with inventory/achievement-count shaping from the rollout hook. |
+
+3. **Launch the three RL runs in parallel**
 
    ```bash
+   export SYNTH_API_KEY=...  # already sourced if examples/.env was loaded
+   export TASK_APP_URL=https://your-modal-task-app.modal.run
+
+   uvx synth-ai train --type rl \
+     --config examples/multi_step/configs/crafter_rl_outcome.toml \
+     --run-name crafter-rl-outcome \
+     --no-poll &
+
    uvx synth-ai train --type rl \
-     --config
-     --
-     --
-
+     --config examples/multi_step/configs/crafter_rl_stepwise_simple.toml \
+     --run-name crafter-rl-stepwise-simple \
+     --no-poll &
+
+   uvx synth-ai train --type rl \
+     --config examples/multi_step/configs/crafter_rl_stepwise_shaped.toml \
+     --run-name crafter-rl-stepwise-shaped \
+     --no-poll &
+
+   wait
    ```
 
-
+   *`--no-poll` returns immediately so each run can stream logs in its own terminal; `wait` blocks until all jobs finish.*
+
+4. **Track results**
+
+   Tail each job’s logs with `uvx synth-ai train logs --run-name <name>` or open the Modal dashboard. Compare:
+
+   - Avg outcome reward (modal dashboard)
+   - Stepwise reward components (`resource_reward`, `unique_achievements_total`) in the task app logs
+   - Trace JSONL dumps under `traces/v3` if tracing is enabled
+
+
+   *This config forces 10 agent turns per rollout, reduces batch size to avoid OOMs, and enforces Crafter-specific defaults.*
 
    INFO - 🎉 Training completed successfully!
-   INFO - All batch rewards: [0.0625, 0.0625, 0.125, 0.0625, 0.0625, 0.3125, 0.375, 0.4375, 0.5, 0.9375]
+   INFO - All batch rewards: [0.0625, 0.0625, 0.125, 0.0625, 0.0625, 0.3125, 0.375, 0.4375, 0.5, 0.9375]
````