skillopt 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scripts/__init__.py +0 -0
- scripts/eval_only.py +451 -0
- scripts/train.py +548 -0
- skillopt/__init__.py +28 -0
- skillopt/config.py +286 -0
- skillopt/datasets/__init__.py +7 -0
- skillopt/datasets/base.py +512 -0
- skillopt/engine/__init__.py +9 -0
- skillopt/engine/trainer.py +2083 -0
- skillopt/envs/__init__.py +1 -0
- skillopt/envs/_template/env_template.py +196 -0
- skillopt/envs/_template/loader_template.py +87 -0
- skillopt/envs/alfworld/__init__.py +5 -0
- skillopt/envs/alfworld/adapter.py +459 -0
- skillopt/envs/alfworld/dataloader.py +123 -0
- skillopt/envs/alfworld/reflect.py +4 -0
- skillopt/envs/alfworld/rollout.py +347 -0
- skillopt/envs/alfworld/vendor/__init__.py +9 -0
- skillopt/envs/alfworld/vendor/alfworld_envs.py +221 -0
- skillopt/envs/alfworld/vendor/alfworld_projection.py +60 -0
- skillopt/envs/alfworld/vendor/alfworld_prompts.py +8 -0
- skillopt/envs/alfworld/vendor/env_base.py +84 -0
- skillopt/envs/alfworld/vendor/env_manager.py +139 -0
- skillopt/envs/alfworld/vendor/memory.py +87 -0
- skillopt/envs/base.py +309 -0
- skillopt/envs/docvqa/__init__.py +1 -0
- skillopt/envs/docvqa/adapter.py +115 -0
- skillopt/envs/docvqa/dataloader.py +61 -0
- skillopt/envs/docvqa/evaluator.py +113 -0
- skillopt/envs/docvqa/rollout.py +391 -0
- skillopt/envs/livemathematicianbench/__init__.py +1 -0
- skillopt/envs/livemathematicianbench/adapter.py +162 -0
- skillopt/envs/livemathematicianbench/dataloader.py +308 -0
- skillopt/envs/livemathematicianbench/evaluator.py +62 -0
- skillopt/envs/livemathematicianbench/reflect.py +4 -0
- skillopt/envs/livemathematicianbench/rollout.py +434 -0
- skillopt/envs/officeqa/__init__.py +1 -0
- skillopt/envs/officeqa/adapter.py +135 -0
- skillopt/envs/officeqa/dataloader.py +71 -0
- skillopt/envs/officeqa/evaluator.py +46 -0
- skillopt/envs/officeqa/rollout.py +799 -0
- skillopt/envs/officeqa/tool_runtime.py +552 -0
- skillopt/envs/searchqa/__init__.py +1 -0
- skillopt/envs/searchqa/adapter.py +129 -0
- skillopt/envs/searchqa/dataloader.py +42 -0
- skillopt/envs/searchqa/evaluator.py +100 -0
- skillopt/envs/searchqa/reflect.py +4 -0
- skillopt/envs/searchqa/rollout.py +481 -0
- skillopt/envs/spreadsheetbench/__init__.py +5 -0
- skillopt/envs/spreadsheetbench/adapter.py +192 -0
- skillopt/envs/spreadsheetbench/codegen_agent.py +726 -0
- skillopt/envs/spreadsheetbench/dataloader.py +37 -0
- skillopt/envs/spreadsheetbench/evaluator.py +158 -0
- skillopt/envs/spreadsheetbench/executor.py +67 -0
- skillopt/envs/spreadsheetbench/react_agent.py +395 -0
- skillopt/envs/spreadsheetbench/reflect.py +4 -0
- skillopt/envs/spreadsheetbench/rollout.py +934 -0
- skillopt/evaluation/__init__.py +13 -0
- skillopt/evaluation/gate.py +148 -0
- skillopt/gradient/__init__.py +15 -0
- skillopt/gradient/aggregate.py +253 -0
- skillopt/gradient/reflect.py +588 -0
- skillopt/model/__init__.py +512 -0
- skillopt/model/azure_openai.py +915 -0
- skillopt/model/backend_config.py +185 -0
- skillopt/model/claude_backend.py +359 -0
- skillopt/model/codex_backend.py +664 -0
- skillopt/model/codex_harness.py +1057 -0
- skillopt/model/common.py +229 -0
- skillopt/model/minimax_backend.py +277 -0
- skillopt/model/qwen_backend.py +455 -0
- skillopt/model/router.py +236 -0
- skillopt/optimizer/__init__.py +15 -0
- skillopt/optimizer/clip.py +109 -0
- skillopt/optimizer/lr_autonomous.py +108 -0
- skillopt/optimizer/meta_skill.py +79 -0
- skillopt/optimizer/rewrite.py +59 -0
- skillopt/optimizer/scheduler.py +127 -0
- skillopt/optimizer/select.py +4 -0
- skillopt/optimizer/skill.py +164 -0
- skillopt/optimizer/slow_update.py +396 -0
- skillopt/optimizer/update_modes.py +136 -0
- skillopt/prompts/__init__.py +63 -0
- skillopt/scheduler/__init__.py +8 -0
- skillopt/types.py +306 -0
- skillopt/utils/__init__.py +4 -0
- skillopt/utils/json_utils.py +42 -0
- skillopt/utils/scoring.py +28 -0
- skillopt-0.1.0.dist-info/LICENSE +21 -0
- skillopt-0.1.0.dist-info/METADATA +444 -0
- skillopt-0.1.0.dist-info/RECORD +97 -0
- skillopt-0.1.0.dist-info/WHEEL +5 -0
- skillopt-0.1.0.dist-info/entry_points.txt +3 -0
- skillopt-0.1.0.dist-info/top_level.txt +3 -0
- skillopt_webui/__init__.py +0 -0
- skillopt_webui/__main__.py +3 -0
- skillopt_webui/app.py +550 -0
scripts/__init__.py
ADDED
|
File without changes
|
scripts/eval_only.py
ADDED
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""SkillOpt eval-only: run a single skill on a dataset without training.
|
|
3
|
+
|
|
4
|
+
Usage
|
|
5
|
+
-----
|
|
6
|
+
python scripts/eval_only.py \
|
|
7
|
+
--config configs/spreadsheetbench/default.yaml \
|
|
8
|
+
--skill skillopt/envs/spreadsheetbench/skills/initial.md \
|
|
9
|
+
--split_dir /path/to/split \
|
|
10
|
+
--out_root outputs/eval_skill0
|
|
11
|
+
|
|
12
|
+
All YAML keys can be overridden from the CLI, same as train.py.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import datetime
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
|
|
22
|
+
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
23
|
+
_PROJECT_ROOT = os.path.dirname(_SCRIPT_DIR)
|
|
24
|
+
if _PROJECT_ROOT not in sys.path:
|
|
25
|
+
sys.path.insert(0, _PROJECT_ROOT)
|
|
26
|
+
|
|
27
|
+
from skillopt.model import (
|
|
28
|
+
configure_azure_openai,
|
|
29
|
+
configure_claude_code_exec,
|
|
30
|
+
configure_codex_exec,
|
|
31
|
+
set_reasoning_effort,
|
|
32
|
+
set_target_backend,
|
|
33
|
+
set_target_deployment,
|
|
34
|
+
set_optimizer_backend,
|
|
35
|
+
set_optimizer_deployment,
|
|
36
|
+
)
|
|
37
|
+
from skillopt.model.common import default_model_for_backend, normalize_backend_name
|
|
38
|
+
|
|
39
|
+
_OPENAI_DEFAULT_MODEL_SENTINELS = {"gpt-5.4", "gpt-5.5"}
|
|
40
|
+
from skillopt.utils import compute_score
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ── Reuse registry from train.py ───────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
_ENV_REGISTRY: dict[str, type] = {}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _register_builtins() -> None:
    """Populate ``_ENV_REGISTRY`` with every built-in adapter that imports.

    Each environment lives in its own sub-package,
    ``skillopt.envs.<env>.adapter``.  An adapter whose module raises
    ``ImportError`` (package not shipped, or one of its dependencies is
    missing) is skipped silently so the remaining environments stay usable.
    Calling this function repeatedly is harmless: entries are simply
    re-assigned.
    """
    import importlib

    # (registry key / sub-package name, adapter class name) in registration order.
    builtin_adapters = (
        ("alfworld", "ALFWorldAdapter"),
        ("searchqa", "SearchQAAdapter"),
        ("livemathematicianbench", "LiveMathematicianBenchAdapter"),
        ("babyvision", "BabyVisionAdapter"),
        ("spreadsheetbench", "SpreadsheetBenchAdapter"),
        ("mmrb", "MMRBAdapter"),
        ("docvqa", "DocVQAAdapter"),
        ("mathverse", "MathVerseAdapter"),
        ("officeqa", "OfficeQAAdapter"),
        ("sealqa", "SealQAAdapter"),
        ("swebench", "SWEBenchAdapter"),
    )
    for env_name, class_name in builtin_adapters:
        try:
            module = importlib.import_module(f"skillopt.envs.{env_name}.adapter")
        except ImportError:
            continue
        adapter_cls = getattr(module, class_name, None)
        if adapter_cls is None:
            # ``from module import Name`` would have raised ImportError here;
            # keep the same "skip this environment" outcome.
            continue
        _ENV_REGISTRY[env_name] = adapter_cls
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_adapter(cfg: dict):
    """Instantiate the environment adapter selected by ``cfg["env"]``.

    The adapter class is looked up in ``_ENV_REGISTRY`` (refreshed through
    ``_register_builtins`` first).  Only config entries whose key matches a
    named parameter of the adapter's ``__init__`` are forwarded as keyword
    arguments; everything else in ``cfg`` is ignored here.

    Args:
        cfg: Flat configuration mapping.  ``cfg["env"]`` defaults to
            ``"alfworld"`` when absent.

    Returns:
        A new instance of the registered adapter class.

    Raises:
        ValueError: If the environment name is not registered.
    """
    import inspect

    _register_builtins()
    env_name = cfg.get("env", "alfworld")
    if env_name not in _ENV_REGISTRY:
        raise ValueError(
            f"Unknown environment '{env_name}'. "
            f"Available: {list(_ENV_REGISTRY.keys())}"
        )
    adapter_cls = _ENV_REGISTRY[env_name]

    # Only parameters that can legitimately be passed by keyword count as
    # config keys.  ``*args`` / ``**kwargs`` names and positional-only
    # parameters are skipped: forwarding e.g. ``cfg["kwargs"]`` would either
    # raise TypeError or smuggle an unrelated entry into ``**kwargs``.
    keyword_kinds = (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
    init_params = inspect.signature(adapter_cls.__init__).parameters
    adapter_kwargs = {
        name: cfg[name]
        for name, param in init_params.items()
        if name != "self" and param.kind in keyword_kinds and name in cfg
    }
    return adapter_cls(**adapter_kwargs)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# ── CLI ────────────────────────────────────────────────────────────────────
|
|
124
|
+
|
|
125
|
+
# argparse ``type=`` converter for boolean flags: yields True only for
# "true", "1" or "yes" (case-insensitive); any other text yields False
# instead of raising, so a typo is silently read as False.
_BOOL = lambda x: str(x).lower() in ("true", "1", "yes") # noqa: E731
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the eval-only command line.

    Args:
        argv: Argument list to parse.  ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which is what the script entry point uses;
            passing a list allows programmatic use and testing.

    Returns:
        The parsed namespace.  ``config`` and ``skill`` are required,
        ``split`` defaults to ``"all"`` and ``cfg_options`` to ``[]``.  Every
        legacy flat override defaults to ``None`` so the caller can tell
        "not given" apart from an explicit value.
    """
    parser = argparse.ArgumentParser(description="SkillOpt eval-only")
    parser.add_argument("--config", type=str, required=True)
    parser.add_argument("--skill", type=str, required=True,
                        help="Path to skill .md file to evaluate")
    parser.add_argument("--split", type=str, default="all",
                        help="Which split to eval: train/valid_seen/valid_unseen/all (default: all)")
    parser.add_argument("--cfg-options", nargs="+", default=[],
                        help="Override config: section.key=value")

    # Legacy flat overrides, declared as data and registered in this exact
    # order so that ``--help`` output is unchanged.
    text = {"type": str}
    integer = {"type": int}
    boolean = {"type": _BOOL}
    azure_fields = (
        "endpoint",
        "api_version",
        "api_key",
        "auth_mode",
        "ad_scope",
        "managed_identity_client_id",
    )

    legacy_options = [
        ("env", text),
        ("backend", {
            "type": str,
            "choices": ["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec"],
        }),
        ("optimizer_model", text),
        ("target_model", text),
        ("optimizer_backend", text),
        ("target_backend", text),
        ("reasoning_effort", {
            "type": str,
            "choices": ["", "low", "medium", "high", "xhigh", "max"],
        }),
        ("azure_endpoint", text),
        ("azure_api_version", text),
        ("azure_api_key", text),
    ]
    # Shared, optimizer-specific and target-specific Azure OpenAI settings.
    legacy_options += [
        (f"{prefix}azure_openai_{field}", text)
        for prefix in ("", "optimizer_", "target_")
        for field in azure_fields
    ]
    legacy_options += [
        ("codex_exec_path", text),
        ("codex_exec_sandbox", text),
        ("codex_exec_profile", text),
        ("codex_exec_full_auto", boolean),
        ("codex_exec_reasoning_effort", text),
        ("codex_exec_use_sdk", text),
        ("codex_exec_network_access", boolean),
        ("codex_exec_web_search", boolean),
        ("codex_exec_approval_policy", text),
        ("claude_code_exec_path", text),
        ("claude_code_exec_profile", text),
        ("claude_code_exec_use_sdk", text),
        ("claude_code_exec_effort", text),
        ("claude_code_exec_max_thinking_tokens", integer),
        ("out_root", text),
        ("data_path", text),
        ("split_mode", {"type": str, "choices": ["ratio", "split_dir"]}),
        ("split_ratio", text),
        ("split_seed", integer),
        ("split_dir", text),
        ("split_output_dir", text),
        ("data_root", text),
        ("max_turns", integer),
        ("workers", integer),
        ("max_api_workers", integer),
        ("seed", integer),
        ("test_env_num", integer),
        ("mode", {
            "type": str,
            "help": "SpreadsheetBench: single/multi/react (default comes from config)",
        }),
    ]
    for option_name, option_kwargs in legacy_options:
        parser.add_argument(f"--{option_name}", **option_kwargs)

    return parser.parse_args(argv)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _legacy_override_strings(cli: dict) -> list:
    """Translate flat legacy CLI overrides into ``section.key=value`` strings.

    Model-related keys go to the ``model`` section, a handful of keys have a
    dedicated location, and any key without a mapping falls back to the
    ``env`` section.  The output order follows the iteration order of ``cli``.
    """
    azure_fields = (
        "endpoint",
        "api_version",
        "api_key",
        "auth_mode",
        "ad_scope",
        "managed_identity_client_id",
    )
    model_keys = [
        "backend",
        "optimizer_backend",
        "target_backend",
        "reasoning_effort",
        "azure_endpoint",
        "azure_api_version",
        "azure_api_key",
    ]
    model_keys += [
        f"{prefix}azure_openai_{field}"
        for prefix in ("", "optimizer_", "target_")
        for field in azure_fields
    ]
    model_keys += [
        f"codex_exec_{suffix}"
        for suffix in (
            "path", "sandbox", "profile", "full_auto", "reasoning_effort",
            "use_sdk", "network_access", "web_search", "approval_policy",
        )
    ]
    model_keys += [
        f"claude_code_exec_{suffix}"
        for suffix in ("path", "profile", "use_sdk", "effort", "max_thinking_tokens")
    ]

    dotted_for = {key: f"model.{key}" for key in model_keys}
    dotted_for.update({
        "optimizer_model": "model.optimizer",
        "target_model": "model.target",
        "seed": "train.seed",
        "test_env_num": "evaluation.test_env_num",
        "env": "env.name",
        "out_root": "env.out_root",
    })
    return [f"{dotted_for.get(key, f'env.{key}')}={value}" for key, value in cli.items()]


def main() -> None:
    """Evaluate a single skill file on a dataset without training.

    Pipeline: parse the CLI, load the YAML config and merge legacy
    ``--key value`` overrides into it, resolve backend/model defaults,
    configure the model clients, build the environment adapter, roll the
    skill out on the requested split(s), print the hard/soft scores and
    write ``eval_summary.json`` into the output root.
    """
    args = parse_args()

    from skillopt.config import load_config as _load, flatten_config, is_structured

    cfg = _load(args.config, overrides=args.cfg_options)
    structured = is_structured(cfg)
    cfg_options = args.cfg_options or []

    # Apply legacy --key value overrides (only those actually given).
    cli = {
        key: value
        for key, value in vars(args).items()
        if value is not None and key not in ("config", "skill", "split", "cfg_options")
    }
    if cli:
        if structured:
            from skillopt.config import apply_overrides

            apply_overrides(cfg, _legacy_override_strings(cli))
        else:
            cfg.update(cli)

    cfg = flatten_config(cfg) if structured else cfg

    # Back-fill the newer ``azure_openai_*`` keys from their legacy aliases.
    for new_key, old_key in (
        ("azure_openai_endpoint", "azure_endpoint"),
        ("azure_openai_api_version", "azure_api_version"),
        ("azure_openai_api_key", "azure_api_key"),
    ):
        if cfg.get(new_key) in (None, "") and cfg.get(old_key) not in (None, ""):
            cfg[new_key] = cfg[old_key]

    # A backend counts as "explicit" when given via --backend or via a
    # ``model.backend=...`` entry in --cfg-options.
    explicit_backend = getattr(args, "backend", None)
    if explicit_backend is None:
        for option in cfg_options:
            key, sep, value = str(option).partition("=")
            # ``sep`` guards against a malformed option without "=", which
            # previously raised IndexError here.
            if sep and key.strip() == "model.backend":
                explicit_backend = value.strip()
                break

    backend = normalize_backend_name(
        cfg.get("model_backend") or cfg.get("target_backend") or "azure_openai"
    )

    def _has_model_override(dotted_key: str, legacy_key: str) -> bool:
        """Return True when the user set this model via a flag or --cfg-options."""
        if getattr(args, legacy_key, None) is not None:
            return True
        return any(
            str(option).split("=", 1)[0].strip() == dotted_key
            for option in cfg_options
        )

    # Default (optimizer_backend, target_backend) pair per explicit backend;
    # ``setdefault`` keeps anything the config already specifies.
    optimizer_default, target_default = "openai_chat", "openai_chat"
    if explicit_backend is not None:
        backend = normalize_backend_name(explicit_backend)
        cfg["model_backend"] = backend
        if backend in {"claude", "claude_chat"}:
            optimizer_default, target_default = "claude_chat", "claude_chat"
        elif backend in {"codex", "codex_exec"}:
            target_default = "codex_exec"
        elif backend == "claude_code_exec":
            target_default = "claude_code_exec"
    cfg.setdefault("optimizer_backend", optimizer_default)
    cfg.setdefault("target_backend", target_default)

    # If a Claude backend is paired with an untouched OpenAI default model
    # name, swap in the Claude default model.  The target side also covers
    # ``claude_code_exec`` (same replacement model as in the original code).
    for role, dotted_key, claude_backends in (
        ("optimizer", "model.optimizer", ("claude_chat",)),
        ("target", "model.target", ("claude_chat", "claude_code_exec")),
    ):
        model_key = f"{role}_model"
        if (
            cfg.get(f"{role}_backend") in claude_backends
            and str(cfg.get(model_key, "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
            and not _has_model_override(dotted_key, model_key)
        ):
            cfg[model_key] = default_model_for_backend("claude_chat")

    if not cfg.get("out_root"):
        env = cfg.get("env", "unknown")
        # ``or "unknown"`` also covers a key that is present but None/empty,
        # which previously crashed on ``None.replace``.
        model = str(cfg.get("target_model") or "unknown").replace("/", "-")
        ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        cfg["out_root"] = os.path.join("outputs", f"eval_{env}_{model}_{ts}")

    cfg["out_root"] = os.path.abspath(cfg["out_root"])

    out_root = cfg["out_root"]
    os.makedirs(out_root, exist_ok=True)

    # Load skill (explicit UTF-8: do not depend on the platform locale).
    skill_path = os.path.abspath(args.skill)
    with open(skill_path, encoding="utf-8") as f:
        skill_content = f.read()
    print(f" [skill] {skill_path} ({len(skill_content)} chars)")

    # Configure models; empty strings are normalised to None.
    configure_azure_openai(
        endpoint=(cfg.get("azure_openai_endpoint") or cfg.get("azure_endpoint") or None),
        api_version=(cfg.get("azure_openai_api_version") or cfg.get("azure_api_version") or None),
        api_key=(cfg.get("azure_openai_api_key") or cfg.get("azure_api_key") or None),
        auth_mode=cfg.get("azure_openai_auth_mode") or None,
        ad_scope=cfg.get("azure_openai_ad_scope") or None,
        managed_identity_client_id=cfg.get("azure_openai_managed_identity_client_id") or None,
        optimizer_endpoint=cfg.get("optimizer_azure_openai_endpoint") or None,
        optimizer_api_version=cfg.get("optimizer_azure_openai_api_version") or None,
        optimizer_api_key=cfg.get("optimizer_azure_openai_api_key") or None,
        optimizer_auth_mode=cfg.get("optimizer_azure_openai_auth_mode") or None,
        optimizer_ad_scope=cfg.get("optimizer_azure_openai_ad_scope") or None,
        optimizer_managed_identity_client_id=(
            cfg.get("optimizer_azure_openai_managed_identity_client_id") or None
        ),
        target_endpoint=cfg.get("target_azure_openai_endpoint") or None,
        target_api_version=cfg.get("target_azure_openai_api_version") or None,
        target_api_key=cfg.get("target_azure_openai_api_key") or None,
        target_auth_mode=cfg.get("target_azure_openai_auth_mode") or None,
        target_ad_scope=cfg.get("target_azure_openai_ad_scope") or None,
        target_managed_identity_client_id=(
            cfg.get("target_azure_openai_managed_identity_client_id") or None
        ),
    )
    set_optimizer_backend(cfg.get("optimizer_backend", "openai_chat"))
    set_target_backend(cfg.get("target_backend", "openai_chat"))
    set_optimizer_deployment(cfg.get("optimizer_model", default_model_for_backend(backend)))
    set_target_deployment(cfg.get("target_model", default_model_for_backend(backend)))
    configure_codex_exec(
        path=cfg.get("codex_exec_path", "codex"),
        sandbox=cfg.get("codex_exec_sandbox", "workspace-write"),
        profile=cfg.get("codex_exec_profile", ""),
        full_auto=cfg.get("codex_exec_full_auto", False),
        reasoning_effort=cfg.get("codex_exec_reasoning_effort", "none"),
        use_sdk=cfg.get("codex_exec_use_sdk", None),
        network_access=cfg.get("codex_exec_network_access", False),
        web_search=cfg.get("codex_exec_web_search", False),
        approval_policy=cfg.get("codex_exec_approval_policy", "never"),
    )
    configure_claude_code_exec(
        path=cfg.get("claude_code_exec_path", "claude"),
        profile=cfg.get("claude_code_exec_profile", ""),
        use_sdk=cfg.get("claude_code_exec_use_sdk", None),
        effort=cfg.get("claude_code_exec_effort", cfg.get("reasoning_effort", "medium")),
        max_thinking_tokens=cfg.get("claude_code_exec_max_thinking_tokens", 16384),
    )
    set_reasoning_effort(cfg.get("reasoning_effort", "") or None)

    # Build adapter
    adapter = get_adapter(cfg)
    adapter.setup(cfg)

    seed = cfg.get("seed", 42)
    split = args.split or "all"

    if split == "all":
        # "all" concatenates the three splits, each built with env count 0.
        items = (
            adapter.build_eval_env(0, "train", seed)
            + adapter.build_eval_env(0, "valid_seen", seed)
            + adapter.build_eval_env(0, "valid_unseen", seed)
        )
    else:
        env_num = cfg.get("test_env_num", 0)
        items = adapter.build_eval_env(env_num, split, seed)

    print(f"\n [eval] split={split} items={len(items)}")
    print(f" [eval] out_root={out_root}")
    print(f"{'='*60}")

    # Run rollout
    results = adapter.rollout(items, skill_content, out_root)

    # Score
    hard, soft = compute_score(results)
    print(f"\n{'='*60}")
    print(f" Results: hard={hard:.4f} soft={soft:.4f} (n={len(results)})")
    print(f"{'='*60}")

    # Save summary.  ``ensure_ascii=False`` can emit non-ASCII characters
    # (e.g. from the skill path), so the file must be opened as UTF-8.
    summary = {
        "skill": skill_path,
        "split": split,
        "n_items": len(results),
        "hard": hard,
        "soft": soft,
    }
    with open(os.path.join(out_root, "eval_summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print(f" Saved to: {out_root}")
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
if __name__ == "__main__":
|
|
451
|
+
main()
|