synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of synth-ai might be problematic.

Files changed (110)
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +5 -4
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +1 -0
  19. examples/swe/task_app/hosted/rollout.py +2 -0
  20. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  21. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  22. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  23. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  24. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  25. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  26. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  27. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  28. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  29. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  30. examples/task_apps/crafter/task_app/__init__.py +3 -0
  31. examples/task_apps/crafter/task_app/grpo_crafter.py +306 -8
  32. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  33. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +16 -3
  34. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  35. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  36. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +52 -1
  37. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +111 -13
  38. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
  39. examples/task_apps/enron/filter_sft.toml +5 -0
  40. examples/task_apps/enron/tests/__init__.py +2 -0
  41. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  42. examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
  43. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  44. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  45. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  46. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  47. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  48. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
  49. examples/task_apps/pokemon_red/task_app.py +199 -6
  50. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
  51. examples/task_apps/sokoban/filter_sft.toml +5 -0
  52. examples/task_apps/sokoban/tests/__init__.py +2 -0
  53. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  54. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  55. examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
  56. examples/task_apps/verilog/filter_sft.toml +5 -0
  57. examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
  58. examples/task_apps/verilog/tests/__init__.py +2 -0
  59. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  60. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
  61. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  62. examples/warming_up_to_rl/groq_test.py +2 -0
  63. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  64. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  65. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  66. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  67. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  68. synth_ai/api/models/supported.py +1 -0
  69. synth_ai/cli/__init__.py +46 -13
  70. synth_ai/cli/_modal_wrapper.py +3 -2
  71. synth_ai/cli/recent.py +1 -1
  72. synth_ai/cli/status.py +1 -1
  73. synth_ai/cli/task_apps.py +354 -143
  74. synth_ai/cli/traces.py +1 -1
  75. synth_ai/cli/tui.py +57 -0
  76. synth_ai/cli/turso.py +1 -1
  77. synth_ai/cli/watch.py +1 -1
  78. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  79. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  80. synth_ai/environments/examples/verilog/engine.py +76 -10
  81. synth_ai/judge_schemas.py +8 -8
  82. synth_ai/task/__init__.py +11 -1
  83. synth_ai/task/apps/__init__.py +1 -0
  84. synth_ai/task/config.py +257 -0
  85. synth_ai/task/contracts.py +15 -2
  86. synth_ai/task/rubrics/__init__.py +3 -0
  87. synth_ai/task/rubrics/loaders.py +22 -3
  88. synth_ai/task/rubrics/scoring.py +3 -0
  89. synth_ai/task/trace_correlation_helpers.py +315 -0
  90. synth_ai/task/validators.py +144 -0
  91. synth_ai/tracing_v3/abstractions.py +3 -3
  92. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  93. synth_ai/tracing_v3/session_tracer.py +16 -6
  94. synth_ai/tracing_v3/storage/base.py +29 -29
  95. synth_ai/tracing_v3/storage/config.py +3 -3
  96. synth_ai/tracing_v3/turso/daemon.py +8 -7
  97. synth_ai/tracing_v3/turso/native_manager.py +63 -40
  98. synth_ai/tracing_v3/utils.py +3 -3
  99. synth_ai/tui/__init__.py +5 -0
  100. synth_ai/tui/__main__.py +13 -0
  101. synth_ai/tui/cli/__init__.py +1 -0
  102. synth_ai/tui/cli/query_experiments.py +164 -0
  103. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  104. synth_ai/tui/dashboard.py +906 -0
  105. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/METADATA +1 -1
  106. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/RECORD +110 -71
  107. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  108. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  109. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  110. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,190 @@
+ # Verilog RL experiment – LoRA training on Qwen3-8B
+ #
+ # This configuration adapts the Crafter RL setup for Verilog spec-to-RTL tasks.
+ # Uses the same proven pipeline, tuned for the 8B model and the Verilog domain.
+
+ [algorithm]
+ type = "online"
+ method = "policy_gradient"
+ variety = "gspo"
+
+ [services]
+ # Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-verilog`
+ task_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
+ # Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
+ judge_url = "https://synth-backend-dev-docker.onrender.com/api"
+
+ [compute]
+ gpu_type = "H200"  # ✅ 8B model needs H200 for the larger context window
+ gpu_count = 2      # ✅ Minimum 2 GPUs (1 for vLLM inference + 1 for training)
+ nodes = 1
+
+ [topology]
+ type = "single_node_split"
+ gpus_for_vllm = 1      # ✅ vLLM for inference
+ gpus_for_training = 1  # ✅ Training GPU (8B LoRA fits well)
+ gpus_for_ref = 0
+ tensor_parallel = 1
+
+ [vllm]
+ tensor_parallel_size = 1
+ max_model_len = 24576  # ✅ Increased to 24K to accommodate long Verilog prompts (16K + 8K buffer for testbenches + history)
+
+ [reference]
+ placement = "none"
+
+ [model]
+ base = "Qwen/Qwen3-8B"  # ✅ 8B model offers a good balance of speed and capability for RL training
+ trainer_mode = "lora"
+ label = "verilog-rl-lora-qwen8b"
+
+ [lora]
+ r = 16
+ alpha = 32
+ dropout = 0.05
+ target_modules = ["all-linear"]
+
+ [rollout]
+ env_name = "verilog"    # ✅ Changed from "crafter" to "verilog"
+ max_turns = 6           # ✅ 6 turns per episode (vs Crafter's 10); enough for write→compile→simulate chains
+ episodes_per_batch = 4  # ✅ Good batch size for 8B model
+ policy_name = "verilog-designer"
+ max_concurrent_rollouts = 8
+ batches_per_step = 2
+ ops = ["agent", "env"]
+
+ [rollout.env_config]
+ # Verilog-specific environment settings
+ difficulty = "medium"  # Can be "easy", "medium", or "hard"
+
+ [rollout.env_config.step_rewards]
+ enabled = true
+ mode = "decision_stepwise"
+ strategy = "consistent"
+ indicator_lambda = 0.5  # ✅ Reduced from Crafter (sparser rewards)
+ step_beta = 0.0
+
+ [rollout.policy_config]
+ provider = "openai"
+ model = "Qwen/Qwen3-8B"  # ✅ Use the model being trained (8B) for rollouts
+ temperature = 0.2
+ max_tokens = 4096  # ✅ Balanced for Verilog generation while leaving room for long input prompts (testbenches + history)
+
+ [evaluation]
+ instances = 16
+ every_n_iters = 10
+ seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+
+ [training]
+ num_epochs = 1
+ iterations_per_epoch = 5
+ gradient_accumulation_steps = 1
+ max_accumulated_minibatch = 1
+ max_turns = 15
+ batch_size = 4        # ✅ Same as Crafter (works well for 8B LoRA)
+ group_size = 4
+ learning_rate = 5e-5  # ✅ Same as Crafter
+ log_interval = 1
+ weight_sync_interval = 1
+ event_rewards_kind = "unique"
+ async_semaphore_max = 20  # Max concurrent rollouts in streaming pipeline
+
+ # Enable dense decision rewards in the trainer
+ step_rewards_enabled = true
+ step_rewards_mode = "decision_stepwise"
+ step_rewards_indicator_lambda = 0.5  # ✅ Reduced for Verilog's sparser rewards
+ step_rewards_beta = 0.0
+ step_rewards_strategy = "consistent"
+
+ [training.weight_sync]
+ enable = true
+ targets = ["policy"]
+ mode = "direct"
+ direct = true
+ verify_every_k = 0
+
+ [rubric]
+ enabled = true
+ model = "openai/gpt-oss-120b"
+ api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
+ api_key_env = "OPENAI_API_KEY"
+
+ # Blend the hosted judge scores with environment returns
+ [rubric.weights]
+ env = 0.3    # ✅ Higher weight on env rewards for Verilog (vs Crafter's 0.2)
+ event = 0.3  # ✅ Adjusted for Verilog's different reward structure
+ outcome = 0.4
+
+ [rubric.event]
+ # Verilog-specific event rubric for process efficiency
+ rubric_id = "verilog/event@v1"
+ criteria = [
+   { key = "process.compilation_success", weight = 0.7, description = "Return 1.0 when compilation succeeds, 0.5 for partial success, 0.0 for failure", aggregation = "weighted_sum" },
+   { key = "process.design_iterations", weight = 0.3, description = "Reward efficient design iterations without unnecessary recompilation", aggregation = "weighted_sum" },
+ ]
+
+ [rubric.outcome]
+ # Verilog-specific outcome rubric for final results
+ rubric_id = "verilog/outcome@v1"
+ criteria = [
+   { key = "outcome.tests_passed", weight = 0.8, description = "Full credit when all tests pass, partial for some tests", aggregation = "weighted_sum" },
+   { key = "outcome.design_quality", weight = 0.2, description = "Code quality, documentation, and design efficiency", aggregation = "weighted_sum" },
+ ]
+
+ [judge]
+ type = "groq"
+ timeout_s = 45
+
+ [judge.options]
+ event = true
+ outcome = true
+ provider = "openai"
+ model = "openai/gpt-oss-120b"
+ rubric_id = "verilog/bundle@v1"
+ max_concurrency = 6
+ tracks = ["process", "reasoning", "progress", "outcome"]
+
+ [judge.options.rubric_overrides]
+
+ [judge.options.rubric_overrides.event]
+ goal_text = """
+ Evaluate each Verilog design decision for compilation success and process efficiency.
+ High scores for successful compilation and strategic tool usage.
+ Penalize unnecessary operations and compilation failures."""
+ aggregation = "weighted_sum"
+
+ [[judge.options.rubric_overrides.event.criteria]]
+ id = "process.compilation_success"
+ weight = 0.7
+ scale = "bounded"
+ description = "Return 1.0 when compilation succeeds cleanly, 0.5 for warnings, 0.0 for errors"
+
+ [[judge.options.rubric_overrides.event.criteria]]
+ id = "process.design_iterations"
+ weight = 0.3
+ scale = "bounded"
+ description = "Reward efficient write→compile→simulate workflow, penalize redundant operations"
+
+ [judge.options.rubric_overrides.outcome]
+ goal_text = """
+ Evaluate the final Verilog implementation for correctness and quality.
+ High scores for working designs that pass all tests with good code quality."""
+ aggregation = "weighted_sum"
+
+ [[judge.options.rubric_overrides.outcome.criteria]]
+ id = "outcome.tests_passed"
+ weight = 0.8
+ scale = "binary"
+ description = "Full credit when all tests pass, partial credit for some tests passing"
+
+ [[judge.options.rubric_overrides.outcome.criteria]]
+ id = "outcome.design_quality"
+ weight = 0.2
+ scale = "bounded"
+ description = "Code clarity, proper documentation, and efficient design patterns"
+
+ [judge.options.weights]
+ process = 0.1
+ reasoning = 0.2
+ progress = 0.3
+ outcome = 0.4
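
The [rubric.weights] table blends the hosted judge's event and outcome scores with the environment's own returns. As a minimal sketch of what that blend amounts to (the actual combination happens trainer-side and is not part of this diff; a plain convex combination is assumed here for illustration):

    # Hypothetical sketch of the blend implied by [rubric.weights]; the real
    # trainer-side logic is not shown in this diff.
    def blend_reward(env_return: float, event_score: float, outcome_score: float) -> float:
        """Combine env returns with judge scores using the TOML weights."""
        weights = {"env": 0.3, "event": 0.3, "outcome": 0.4}  # from [rubric.weights]
        return (
            weights["env"] * env_return
            + weights["event"] * event_score
            + weights["outcome"] * outcome_score
        )

    # Example: env return 0.5, judge event score 0.8, judge outcome score 1.0
    # blends to 0.3*0.5 + 0.3*0.8 + 0.4*1.0 = 0.79.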
@@ -0,0 +1,220 @@
+ """Crafter backend judge that calls the Synth judge API with inline rubric."""
+
+ from __future__ import annotations
+
+ import json
+ import os
+ from pathlib import Path
+ from typing import Any, Dict, List, TypedDict
+
+ import httpx
+
+
+ class TraceMetadata(TypedDict, total=False):
+     """Metadata for the trace."""
+     env_id: str
+     policy_id: str
+     length: int
+
+
+ class JudgeTracePayload(TypedDict):
+     """Trace payload sent to backend judge."""
+     event_history: List[Dict[str, Any]]
+     markov_blanket_message_history: List[Dict[str, Any]]
+     metadata: TraceMetadata
+
+
+ class JudgeOptions(TypedDict, total=False):
+     """Options for judge scoring."""
+     model: str
+     timeout_s: int
+     event: bool
+     outcome: bool
+
+
+ class TaskApp(TypedDict):
+     """Task application metadata."""
+     id: str
+
+
+ class JudgeScoreRequest(TypedDict):
+     """Request to backend judge API."""
+     policy_name: str
+     task_app: TaskApp
+     trace: JudgeTracePayload
+     rubric: Dict[str, Any]
+     options: JudgeOptions
+
+
+ # Load rubric from file (cached at module level)
+ _RUBRIC_PATH = Path(__file__).parent.parent / "rubrics" / "crafter_backend_judge.json"
+ _RUBRIC: Dict[str, Any] | None = None
+
+
+ def _load_rubric() -> Dict[str, Any]:
+     """Load rubric from file with fallback to inline default."""
+     global _RUBRIC
+     if _RUBRIC is None:
+         try:
+             with open(_RUBRIC_PATH, 'r') as f:
+                 _RUBRIC = json.load(f)
+             assert isinstance(_RUBRIC, dict), "Rubric must be a dict"
+             assert "outcome" in _RUBRIC, "Rubric must have 'outcome' key"
+             assert isinstance(_RUBRIC["outcome"], list), "Rubric 'outcome' must be a list"
+         except Exception as e:
+             print(f"[crafter_backend_judge] Warning: Failed to load rubric from {_RUBRIC_PATH}: {e}")
+             # Fallback inline rubric (matching RubricCriteriaBlock format)
+             _RUBRIC = {
+                 "event": [],
+                 "outcome": [
+                     {"id": "achievement_progression", "description": "Achievement progression", "weight": 0.35, "scale": "bounded"},
+                     {"id": "resource_stockpile", "description": "Resource stockpile", "weight": 0.2, "scale": "bounded"},
+                     {"id": "survival_state", "description": "Survival state", "weight": 0.2, "scale": "bounded"},
+                     {"id": "failure_analysis", "description": "Failure analysis", "weight": 0.15, "scale": "bounded"},
+                     {"id": "future_readiness", "description": "Future readiness", "weight": 0.1, "scale": "bounded"}
+                 ]
+             }
+     return _RUBRIC
+
+
+ def judge(payload: Dict[str, Any], **kwargs: Any) -> float:
+     """
+     Call the Synth backend judge API to score a Crafter rollout.
+
+     Args:
+         payload: Dict with keys: seed, prompt, completion, metrics, response, trace
+         **kwargs: Additional config (backend_url, model, timeout_s, etc.)
+
+     Returns:
+         float: Aggregate score from 0.0 to 1.0
+     """
+     try:
+         # Extract configuration
+         backend_url = kwargs.get("backend_url", "http://localhost:8000/api")
+         model = kwargs.get("model", "openai/gpt-oss-120b")
+         timeout = kwargs.get("timeout_s", 45)
+
+         assert isinstance(backend_url, str), "backend_url must be a string"
+         assert isinstance(model, str), "model must be a string"
+         assert isinstance(timeout, (int, float)), "timeout_s must be numeric"
+
+         # Extract trajectory from response
+         response_data = payload.get("response", {})
+         assert isinstance(response_data, dict), "response must be a dict"
+
+         trajectories = response_data.get("trajectories", [])
+         assert isinstance(trajectories, list), "trajectories must be a list"
+
+         if not trajectories:
+             print("[crafter_backend_judge] No trajectories in response")
+             return 0.0
+
+         trajectory = trajectories[0]  # First trajectory
+         assert isinstance(trajectory, dict), "trajectory must be a dict"
+
+         # Load rubric
+         rubric = _load_rubric()
+
+         # Transform trajectory into JudgeTracePayload format
+         steps = trajectory.get("steps", [])
+         assert isinstance(steps, list), "trajectory steps must be a list"
+
+         event_history: List[Dict[str, Any]] = []
+         for idx, step in enumerate(steps):
+             assert isinstance(step, dict), f"step {idx} must be a dict"
+             # Each step becomes an event
+             event_history.append({
+                 "observation": step.get("obs", {}),
+                 "tool_calls": step.get("tool_calls", []),
+                 "reward": step.get("reward", 0.0),
+                 "done": step.get("done", False),
+                 "truncated": step.get("truncated", False),
+                 "info": step.get("info", {}),
+             })
+
+         # Add final observation - backend will extract this as outcome context
+         final_data = trajectory.get("final", {})
+         if final_data:
+             assert isinstance(final_data, dict), "final data must be a dict"
+             final_obs = final_data.get("observation", {})
+             assert isinstance(final_obs, dict), "final observation must be a dict"
+
+             event_history.append({
+                 "observation": final_obs,
+                 "reward": final_data.get("reward", 0.0),
+                 "done": final_data.get("done", True),
+                 "truncated": final_data.get("truncated", False),
+                 "info": final_data.get("info", {}),
+             })
+
+         # Build trace metadata
+         metadata: TraceMetadata = {
+             "env_id": trajectory.get("env_id", "crafter"),
+             "policy_id": trajectory.get("policy_id", "crafter-react"),
+             "length": trajectory.get("length", len(steps)),
+         }
+
+         # Build judge request with rubric included
+         judge_request: JudgeScoreRequest = {
+             "policy_name": "crafter-react",
+             "task_app": {"id": "grpo-crafter-task-app"},
+             "trace": {
+                 "event_history": event_history,
+                 "markov_blanket_message_history": [],
+                 "metadata": metadata,
+             },
+             "rubric": rubric,
+             "options": {
+                 "model": model,
+                 "timeout_s": timeout,
+                 "event": False,   # Not scoring per-event
+                 "outcome": True,  # Score the final outcome
+             }
+         }
+
+         # Call backend judge API
+         with httpx.Client(timeout=timeout) as client:
+             # Get API key from env
+             api_key = os.environ.get("SYNTH_API_KEY") or os.environ.get("OPENAI_API_KEY")
+             headers = {}
+             if api_key:
+                 headers["Authorization"] = f"Bearer {api_key}"
+
+             url = f"{backend_url.rstrip('/')}/judge/v1/score"
+
+             # Debug: print request summary
+             print(f"\n[crafter_backend_judge] Scoring trajectory with {len(event_history)} events")
+             if event_history:
+                 last_obs = event_history[-1].get('observation', {})
+                 print(f"  Final observation keys: {list(last_obs.keys())[:5]}...")
+
+             response = client.post(url, json=judge_request, headers=headers)
+
+             response.raise_for_status()
+             result = response.json()
+             assert isinstance(result, dict), "Response must be a dict"
+
+             # Extract aggregate score
+             aggregate_score = result.get("aggregate_score", 0.0)
+
+             # Try outcome_review.total if aggregate_score not found
+             if aggregate_score == 0.0 and "outcome_review" in result:
+                 outcome_review = result["outcome_review"]
+                 if isinstance(outcome_review, dict):
+                     aggregate_score = outcome_review.get("total", 0.0)
+
+             print(f"  Backend judge score: {aggregate_score:.3f}\n")
+             return float(aggregate_score)
+
+     except httpx.HTTPStatusError as e:
+         print("\n[crafter_backend_judge] HTTP ERROR:")
+         print(f"  Status: {e.response.status_code}")
+         print(f"  Response: {e.response.text[:300]}\n")
+         return 0.0
+     except AssertionError as e:
+         print(f"[crafter_backend_judge] Assertion error: {e}")
+         return 0.0
+     except Exception as e:
+         print(f"[crafter_backend_judge] Unexpected error: {e}")
+         return 0.0
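
For a quick local smoke test, judge() can be called directly with a hand-built payload; it only reads response -> trajectories[0] -> steps/final, so a one-step trajectory suffices. A minimal sketch (the import path, backend URL, and field values are illustrative assumptions):

    # Assumes the judges directory is importable as a module; adjust the path as needed.
    from crafter_backend_judge import judge

    payload = {
        "response": {
            "trajectories": [{
                "env_id": "crafter",
                "policy_id": "crafter-react",
                "steps": [
                    {"obs": {"inventory": {}}, "tool_calls": [], "reward": 0.0,
                     "done": False, "truncated": False, "info": {}},
                ],
                "final": {"observation": {"achievements": {}}, "reward": 1.0,
                          "done": True, "truncated": False, "info": {}},
                "length": 1,
            }]
        }
    }

    # Returns 0.0 on any transport or validation error, by design.
    score = judge(payload, backend_url="http://localhost:8000/api", timeout_s=30)
    print(f"score={score:.3f}")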
@@ -0,0 +1,234 @@
+ """Verilog backend judge that calls the Synth judge API with inline rubric."""
+
+ from __future__ import annotations
+
+ import json
+ import os
+ from pathlib import Path
+ from typing import Any, Dict, List, TypedDict
+
+ import httpx
+
+
+ class TraceMetadata(TypedDict, total=False):
+     """Metadata for the trace."""
+     env_id: str
+     policy_id: str
+     length: int
+
+
+ class JudgeTracePayload(TypedDict):
+     """Trace payload sent to backend judge."""
+     event_history: List[Dict[str, Any]]
+     markov_blanket_message_history: List[Dict[str, Any]]
+     metadata: TraceMetadata
+
+
+ class JudgeOptions(TypedDict, total=False):
+     """Options for judge scoring."""
+     model: str
+     timeout_s: int
+     event: bool
+     outcome: bool
+
+
+ class TaskApp(TypedDict):
+     """Task application metadata."""
+     id: str
+
+
+ class JudgeScoreRequest(TypedDict):
+     """Request to backend judge API."""
+     policy_name: str
+     task_app: TaskApp
+     trace: JudgeTracePayload
+     rubric: Dict[str, Any]
+     options: JudgeOptions
+
+
+ # Load rubric from file (cached at module level)
+ _RUBRIC_PATH = Path(__file__).parent.parent / "rubrics" / "verilog_backend_judge.json"
+ _RUBRIC: Dict[str, Any] | None = None
+
+
+ def _load_rubric() -> Dict[str, Any]:
+     """Load rubric from file with fallback to inline default."""
+     global _RUBRIC
+     if _RUBRIC is None:
+         try:
+             with open(_RUBRIC_PATH, 'r') as f:
+                 _RUBRIC = json.load(f)
+             assert isinstance(_RUBRIC, dict), "Rubric must be a dict"
+             assert "outcome" in _RUBRIC, "Rubric must have 'outcome' key"
+             assert isinstance(_RUBRIC["outcome"], list), "Rubric 'outcome' must be a list"
+         except Exception as e:
+             print(f"[verilog_backend_judge] Warning: Failed to load rubric from {_RUBRIC_PATH}: {e}")
+             # Fallback inline rubric (matching RubricCriteriaBlock format)
+             _RUBRIC = {
+                 "event": [],
+                 "outcome": [
+                     {"id": "correctness.tests_pass", "description": "Tests pass", "weight": 0.5, "scale": "bounded"},
+                     {"id": "efficiency.code_quality", "description": "Code quality", "weight": 0.3, "scale": "bounded"},
+                     {"id": "efficiency.solution_steps", "description": "Solution efficiency", "weight": 0.2, "scale": "bounded"}
+                 ]
+             }
+     return _RUBRIC
+
+
+ def judge(payload: Dict[str, Any], **kwargs: Any) -> float:
+     """
+     Call the Synth backend judge API to score a Verilog rollout.
+
+     Args:
+         payload: Dict with keys: seed, prompt, completion, metrics, response, trace
+         **kwargs: Additional config (backend_url, model, timeout_s, etc.)
+
+     Returns:
+         float: Aggregate score from 0.0 to 1.0
+     """
+     try:
+         # Extract configuration
+         backend_url = kwargs.get("backend_url", "http://localhost:8000/api")
+         model = kwargs.get("model", "openai/gpt-oss-120b")
+         timeout = kwargs.get("timeout_s", 45)
+
+         assert isinstance(backend_url, str), "backend_url must be a string"
+         assert isinstance(model, str), "model must be a string"
+         assert isinstance(timeout, (int, float)), "timeout_s must be numeric"
+
+         # Extract trajectory from response
+         response_data = payload.get("response", {})
+         assert isinstance(response_data, dict), "response must be a dict"
+
+         trajectories = response_data.get("trajectories", [])
+         assert isinstance(trajectories, list), "trajectories must be a list"
+
+         if not trajectories:
+             print("[verilog_backend_judge] No trajectories in response")
+             return 0.0
+
+         trajectory = trajectories[0]  # First trajectory
+         assert isinstance(trajectory, dict), "trajectory must be a dict"
+
+         # Load rubric
+         rubric = _load_rubric()
+
+         # Transform trajectory into JudgeTracePayload format
+         # The backend expects: event_history, markov_blanket_message_history, metadata
+         steps = trajectory.get("steps", [])
+         assert isinstance(steps, list), "trajectory steps must be a list"
+
+         event_history: List[Dict[str, Any]] = []
+         for idx, step in enumerate(steps):
+             assert isinstance(step, dict), f"step {idx} must be a dict"
+             # Each step becomes an event with obs, tool_calls, reward, done, info
+             event_history.append({
+                 "observation": step.get("obs", {}),
+                 "tool_calls": step.get("tool_calls", []),
+                 "reward": step.get("reward", 0.0),
+                 "done": step.get("done", False),
+                 "truncated": step.get("truncated", False),
+                 "info": step.get("info", {}),
+             })
+
+         # Add final observation - backend will extract this as outcome context
+         final_data = trajectory.get("final", {})
+         if final_data:
+             assert isinstance(final_data, dict), "final data must be a dict"
+             final_obs = final_data.get("observation", {})
+             assert isinstance(final_obs, dict), "final observation must be a dict"
+
+             event_history.append({
+                 "observation": final_obs,
+                 "reward": final_data.get("reward", 0.0),
+                 "done": final_data.get("done", True),
+                 "truncated": final_data.get("truncated", False),
+                 "info": final_data.get("info", {}),
+             })
+
+         # Build trace metadata
+         metadata: TraceMetadata = {
+             "env_id": trajectory.get("env_id", "verilog"),
+             "policy_id": trajectory.get("policy_id", "verilog-designer"),
+             "length": trajectory.get("length", len(steps)),
+         }
+
+         # Build judge request with rubric included
+         judge_request: JudgeScoreRequest = {
+             "policy_name": "verilog-designer",
+             "task_app": {"id": "grpo-verilog"},
+             "trace": {
+                 "event_history": event_history,
+                 "markov_blanket_message_history": [],
+                 "metadata": metadata,
+             },
+             "rubric": rubric,
+             "options": {
+                 "model": model,
+                 "timeout_s": timeout,
+                 "event": False,   # Not scoring per-event
+                 "outcome": True,  # Score the final outcome
+             }
+         }
+
+         # Call backend judge API
+         with httpx.Client(timeout=timeout) as client:
+             # Get API key from env
+             api_key = os.environ.get("SYNTH_API_KEY") or os.environ.get("OPENAI_API_KEY")
+             headers = {}
+             if api_key:
+                 headers["Authorization"] = f"Bearer {api_key}"
+
+             url = f"{backend_url.rstrip('/')}/judge/v1/score"
+
+             # Debug: print request details
+             print("\n[verilog_backend_judge] REQUEST DEBUG:")
+             print(f"  URL: {url}")
+             print(f"  Request body keys: {list(judge_request.keys())}")
+             rubric_data = judge_request.get('rubric', {})
+             print(f"  Rubric event criteria: {len(rubric_data.get('event', []))}")
+             print(f"  Rubric outcome criteria: {len(rubric_data.get('outcome', []))}")
+             trace_data = judge_request.get('trace', {})
+             event_hist = trace_data.get('event_history', [])
+             print(f"  Trace event_history count: {len(event_hist)}")
+             if event_hist:
+                 last_event = event_hist[-1]
+                 last_obs = last_event.get('observation', {})
+                 print(f"  Last event done: {last_event.get('done', False)}")
+                 print(f"  Last obs keys: {list(last_obs.keys())}")
+                 print(f"  Task completed: {last_obs.get('task_completed', 'N/A')}")
+
+             response = client.post(url, json=judge_request, headers=headers)
+
+             # Debug: print response details
+             print("\n[verilog_backend_judge] RESPONSE DEBUG:")
+             print(f"  Status: {response.status_code}")
+             print(f"  Response body: {response.text[:500]}")  # First 500 chars
+
+             response.raise_for_status()
+             result = response.json()
+             assert isinstance(result, dict), "Response must be a dict"
+
+             # Extract aggregate score
+             aggregate_score = result.get("aggregate_score", 0.0)
+
+             # Try outcome_review.total if aggregate_score not found
+             if aggregate_score == 0.0 and "outcome_review" in result:
+                 outcome_review = result["outcome_review"]
+                 if isinstance(outcome_review, dict):
+                     aggregate_score = outcome_review.get("total", 0.0)
+
+             print(f"  Aggregate score: {aggregate_score}\n")
+             return float(aggregate_score)
+
+     except httpx.HTTPStatusError as e:
+         print("\n[verilog_backend_judge] HTTP ERROR:")
+         print(f"  Status: {e.response.status_code}")
+         print(f"  Response body: {e.response.text}\n")
+         return 0.0
+     except AssertionError as e:
+         print(f"[verilog_backend_judge] Assertion error: {e}")
+         return 0.0
+     except Exception as e:
+         print(f"[verilog_backend_judge] Unexpected error: {e}")
+         return 0.0
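
Both judges cache their rubric from a JSON file resolved relative to the module (here examples/multi_step/rubrics/verilog_backend_judge.json) and fall back to the inline default when loading fails. A sketch of writing a compatible rubric file, mirroring the fallback's RubricCriteriaBlock-style entries (the criteria and weights shown are the fallback values, reproduced for illustration):

    # _load_rubric() expects a dict with "event" and "outcome" lists of
    # {id, description, weight, scale} entries; this reproduces the fallback.
    import json
    from pathlib import Path

    rubric = {
        "event": [],
        "outcome": [
            {"id": "correctness.tests_pass", "description": "Tests pass", "weight": 0.5, "scale": "bounded"},
            {"id": "efficiency.code_quality", "description": "Code quality", "weight": 0.3, "scale": "bounded"},
            {"id": "efficiency.solution_steps", "description": "Solution efficiency", "weight": 0.2, "scale": "bounded"},
        ],
    }

    path = Path("examples/multi_step/rubrics/verilog_backend_judge.json")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(rubric, indent=2))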