skillopt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. scripts/__init__.py +0 -0
  2. scripts/eval_only.py +451 -0
  3. scripts/train.py +548 -0
  4. skillopt/__init__.py +28 -0
  5. skillopt/config.py +286 -0
  6. skillopt/datasets/__init__.py +7 -0
  7. skillopt/datasets/base.py +512 -0
  8. skillopt/engine/__init__.py +9 -0
  9. skillopt/engine/trainer.py +2083 -0
  10. skillopt/envs/__init__.py +1 -0
  11. skillopt/envs/_template/env_template.py +196 -0
  12. skillopt/envs/_template/loader_template.py +87 -0
  13. skillopt/envs/alfworld/__init__.py +5 -0
  14. skillopt/envs/alfworld/adapter.py +459 -0
  15. skillopt/envs/alfworld/dataloader.py +123 -0
  16. skillopt/envs/alfworld/reflect.py +4 -0
  17. skillopt/envs/alfworld/rollout.py +347 -0
  18. skillopt/envs/alfworld/vendor/__init__.py +9 -0
  19. skillopt/envs/alfworld/vendor/alfworld_envs.py +221 -0
  20. skillopt/envs/alfworld/vendor/alfworld_projection.py +60 -0
  21. skillopt/envs/alfworld/vendor/alfworld_prompts.py +8 -0
  22. skillopt/envs/alfworld/vendor/env_base.py +84 -0
  23. skillopt/envs/alfworld/vendor/env_manager.py +139 -0
  24. skillopt/envs/alfworld/vendor/memory.py +87 -0
  25. skillopt/envs/base.py +309 -0
  26. skillopt/envs/docvqa/__init__.py +1 -0
  27. skillopt/envs/docvqa/adapter.py +115 -0
  28. skillopt/envs/docvqa/dataloader.py +61 -0
  29. skillopt/envs/docvqa/evaluator.py +113 -0
  30. skillopt/envs/docvqa/rollout.py +391 -0
  31. skillopt/envs/livemathematicianbench/__init__.py +1 -0
  32. skillopt/envs/livemathematicianbench/adapter.py +162 -0
  33. skillopt/envs/livemathematicianbench/dataloader.py +308 -0
  34. skillopt/envs/livemathematicianbench/evaluator.py +62 -0
  35. skillopt/envs/livemathematicianbench/reflect.py +4 -0
  36. skillopt/envs/livemathematicianbench/rollout.py +434 -0
  37. skillopt/envs/officeqa/__init__.py +1 -0
  38. skillopt/envs/officeqa/adapter.py +135 -0
  39. skillopt/envs/officeqa/dataloader.py +71 -0
  40. skillopt/envs/officeqa/evaluator.py +46 -0
  41. skillopt/envs/officeqa/rollout.py +799 -0
  42. skillopt/envs/officeqa/tool_runtime.py +552 -0
  43. skillopt/envs/searchqa/__init__.py +1 -0
  44. skillopt/envs/searchqa/adapter.py +129 -0
  45. skillopt/envs/searchqa/dataloader.py +42 -0
  46. skillopt/envs/searchqa/evaluator.py +100 -0
  47. skillopt/envs/searchqa/reflect.py +4 -0
  48. skillopt/envs/searchqa/rollout.py +481 -0
  49. skillopt/envs/spreadsheetbench/__init__.py +5 -0
  50. skillopt/envs/spreadsheetbench/adapter.py +192 -0
  51. skillopt/envs/spreadsheetbench/codegen_agent.py +726 -0
  52. skillopt/envs/spreadsheetbench/dataloader.py +37 -0
  53. skillopt/envs/spreadsheetbench/evaluator.py +158 -0
  54. skillopt/envs/spreadsheetbench/executor.py +67 -0
  55. skillopt/envs/spreadsheetbench/react_agent.py +395 -0
  56. skillopt/envs/spreadsheetbench/reflect.py +4 -0
  57. skillopt/envs/spreadsheetbench/rollout.py +934 -0
  58. skillopt/evaluation/__init__.py +13 -0
  59. skillopt/evaluation/gate.py +148 -0
  60. skillopt/gradient/__init__.py +15 -0
  61. skillopt/gradient/aggregate.py +253 -0
  62. skillopt/gradient/reflect.py +588 -0
  63. skillopt/model/__init__.py +512 -0
  64. skillopt/model/azure_openai.py +915 -0
  65. skillopt/model/backend_config.py +185 -0
  66. skillopt/model/claude_backend.py +359 -0
  67. skillopt/model/codex_backend.py +664 -0
  68. skillopt/model/codex_harness.py +1057 -0
  69. skillopt/model/common.py +229 -0
  70. skillopt/model/minimax_backend.py +277 -0
  71. skillopt/model/qwen_backend.py +455 -0
  72. skillopt/model/router.py +236 -0
  73. skillopt/optimizer/__init__.py +15 -0
  74. skillopt/optimizer/clip.py +109 -0
  75. skillopt/optimizer/lr_autonomous.py +108 -0
  76. skillopt/optimizer/meta_skill.py +79 -0
  77. skillopt/optimizer/rewrite.py +59 -0
  78. skillopt/optimizer/scheduler.py +127 -0
  79. skillopt/optimizer/select.py +4 -0
  80. skillopt/optimizer/skill.py +164 -0
  81. skillopt/optimizer/slow_update.py +396 -0
  82. skillopt/optimizer/update_modes.py +136 -0
  83. skillopt/prompts/__init__.py +63 -0
  84. skillopt/scheduler/__init__.py +8 -0
  85. skillopt/types.py +306 -0
  86. skillopt/utils/__init__.py +4 -0
  87. skillopt/utils/json_utils.py +42 -0
  88. skillopt/utils/scoring.py +28 -0
  89. skillopt-0.1.0.dist-info/LICENSE +21 -0
  90. skillopt-0.1.0.dist-info/METADATA +444 -0
  91. skillopt-0.1.0.dist-info/RECORD +97 -0
  92. skillopt-0.1.0.dist-info/WHEEL +5 -0
  93. skillopt-0.1.0.dist-info/entry_points.txt +3 -0
  94. skillopt-0.1.0.dist-info/top_level.txt +3 -0
  95. skillopt_webui/__init__.py +0 -0
  96. skillopt_webui/__main__.py +3 -0
  97. skillopt_webui/app.py +550 -0
scripts/__init__.py ADDED
File without changes
scripts/eval_only.py ADDED
@@ -0,0 +1,451 @@
1
+ #!/usr/bin/env python3
2
+ """SkillOpt eval-only: run a single skill on a dataset without training.
3
+
4
+ Usage
5
+ -----
6
+ python scripts/eval_only.py \
7
+ --config configs/spreadsheetbench/default.yaml \
8
+ --skill skillopt/envs/spreadsheetbench/skills/initial.md \
9
+ --split_dir /path/to/split \
10
+ --out_root outputs/eval_skill0
11
+
12
+ All YAML keys can be overridden from the CLI, same as train.py.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import datetime
18
+ import json
19
+ import os
20
+ import sys
21
+
22
+ _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
23
+ _PROJECT_ROOT = os.path.dirname(_SCRIPT_DIR)
24
+ if _PROJECT_ROOT not in sys.path:
25
+ sys.path.insert(0, _PROJECT_ROOT)
26
+
27
+ from skillopt.model import (
28
+ configure_azure_openai,
29
+ configure_claude_code_exec,
30
+ configure_codex_exec,
31
+ set_reasoning_effort,
32
+ set_target_backend,
33
+ set_target_deployment,
34
+ set_optimizer_backend,
35
+ set_optimizer_deployment,
36
+ )
37
+ from skillopt.model.common import default_model_for_backend, normalize_backend_name
38
+
39
+ _OPENAI_DEFAULT_MODEL_SENTINELS = {"gpt-5.4", "gpt-5.5"}
40
+ from skillopt.utils import compute_score
41
+
42
+
43
+ # ── Reuse registry from train.py ───────────────────────────────────────────
44
+
45
+ _ENV_REGISTRY: dict[str, type] = {}
46
+
47
+
48
def _register_builtins() -> None:
    """Populate ``_ENV_REGISTRY`` with every built-in adapter that imports.

    Each entry maps an environment name to the module path and class name of
    its adapter. Registration is best-effort: an adapter whose module cannot
    be imported (``ImportError``), or whose module does not expose the
    expected class, is skipped without an error so that the remaining
    environments stay usable. Calling this function repeatedly is harmless;
    it simply re-assigns the same registry keys.
    """
    import importlib

    # (environment name, adapter module, adapter class) in registration order.
    builtin_adapters = (
        ("alfworld", "skillopt.envs.alfworld.adapter", "ALFWorldAdapter"),
        ("searchqa", "skillopt.envs.searchqa.adapter", "SearchQAAdapter"),
        (
            "livemathematicianbench",
            "skillopt.envs.livemathematicianbench.adapter",
            "LiveMathematicianBenchAdapter",
        ),
        ("babyvision", "skillopt.envs.babyvision.adapter", "BabyVisionAdapter"),
        (
            "spreadsheetbench",
            "skillopt.envs.spreadsheetbench.adapter",
            "SpreadsheetBenchAdapter",
        ),
        ("mmrb", "skillopt.envs.mmrb.adapter", "MMRBAdapter"),
        ("docvqa", "skillopt.envs.docvqa.adapter", "DocVQAAdapter"),
        ("mathverse", "skillopt.envs.mathverse.adapter", "MathVerseAdapter"),
        ("officeqa", "skillopt.envs.officeqa.adapter", "OfficeQAAdapter"),
        ("sealqa", "skillopt.envs.sealqa.adapter", "SealQAAdapter"),
        ("swebench", "skillopt.envs.swebench.adapter", "SWEBenchAdapter"),
    )

    for env_name, module_path, class_name in builtin_adapters:
        try:
            module = importlib.import_module(module_path)
        except ImportError:
            # Optional environment (or one of its dependencies) is absent.
            continue
        adapter_cls = getattr(module, class_name, None)
        if adapter_cls is not None:
            _ENV_REGISTRY[env_name] = adapter_cls
104
+
105
+
106
def get_adapter(cfg: dict):
    """Instantiate the environment adapter selected by ``cfg["env"]``.

    The built-in adapters are (re-)registered first. The adapter class is
    then constructed with only those config entries whose keys match
    keyword-passable parameters of its ``__init__``.

    Args:
        cfg: Flat configuration mapping. ``cfg["env"]`` names the environment
            and defaults to ``"alfworld"`` when absent.

    Returns:
        A new instance of the registered adapter class.

    Raises:
        ValueError: If the requested environment is not in the registry.
    """
    import inspect

    _register_builtins()
    env_name = cfg.get("env", "alfworld")
    if env_name not in _ENV_REGISTRY:
        raise ValueError(
            f"Unknown environment '{env_name}'. "
            f"Available: {list(_ENV_REGISTRY.keys())}"
        )
    adapter_cls = _ENV_REGISTRY[env_name]

    # Only parameters that can legitimately be supplied by keyword are
    # forwarded. ``*args`` / ``**kwargs`` / positional-only parameters are
    # excluded so that a config key that happens to be called "args" or
    # "kwargs" is never passed through as a bogus keyword argument.
    keyword_kinds = (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
    init_params = inspect.signature(adapter_cls.__init__).parameters
    adapter_kwargs = {
        name: cfg[name]
        for name, param in init_params.items()
        if name != "self" and param.kind in keyword_kinds and name in cfg
    }
    return adapter_cls(**adapter_kwargs)
121
+
122
+
123
+ # ── CLI ────────────────────────────────────────────────────────────────────
124
+
125
def _BOOL(x: object) -> bool:  # noqa: N802 - name kept for existing callers
    """Convert a command-line value into a boolean for ``argparse``.

    Accepted spellings are case-insensitive and ignore surrounding
    whitespace. ``true``/``1``/``yes``/``y``/``t``/``on`` give ``True``;
    ``false``/``0``/``no``/``n``/``f``/``off`` and the empty string give
    ``False``.

    Raises:
        argparse.ArgumentTypeError: For any other text, so that a typo such
            as ``ture`` is reported as a usage error instead of silently
            disabling the flag.
    """
    text = str(x).strip().lower()
    if text in ("true", "1", "yes", "y", "t", "on"):
        return True
    if text in ("false", "0", "no", "n", "f", "off", ""):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean (true/false, yes/no, 1/0), got {x!r}"
    )
126
+
127
+
128
def parse_args() -> argparse.Namespace:
    """Build the eval-only command-line parser and parse the process arguments.

    ``--config`` and ``--skill`` are required. ``--split`` defaults to
    ``"all"`` and ``--cfg-options`` to an empty list. Every other option is a
    legacy flat override with no default, so it is ``None`` unless supplied.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description="SkillOpt eval-only")
    add = parser.add_argument

    def add_typed(names, kind) -> None:
        # Register a run of plain ``--name`` options sharing one converter.
        for name in names:
            add(f"--{name}", type=kind)

    add("--config", type=str, required=True)
    add("--skill", type=str, required=True,
        help="Path to skill .md file to evaluate")
    add("--split", type=str, default="all",
        help="Which split to eval: train/valid_seen/valid_unseen/all (default: all)")
    add("--cfg-options", nargs="+", default=[],
        help="Override config: section.key=value")

    # Legacy flat overrides
    add("--env", type=str)
    add("--backend", type=str,
        choices=["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec"])
    add_typed(
        ("optimizer_model", "target_model", "optimizer_backend", "target_backend"),
        str,
    )
    add("--reasoning_effort", type=str,
        choices=["", "low", "medium", "high", "xhigh", "max"])

    # Azure OpenAI connection settings: legacy names first, then the shared,
    # optimizer-specific and target-specific groups.
    add_typed(("azure_endpoint", "azure_api_version", "azure_api_key"), str)
    azure_fields = (
        "endpoint",
        "api_version",
        "api_key",
        "auth_mode",
        "ad_scope",
        "managed_identity_client_id",
    )
    for scope in ("azure_openai", "optimizer_azure_openai", "target_azure_openai"):
        add_typed([f"{scope}_{field}" for field in azure_fields], str)

    # Codex exec settings
    add_typed(("codex_exec_path", "codex_exec_sandbox", "codex_exec_profile"), str)
    add("--codex_exec_full_auto", type=_BOOL)
    add_typed(("codex_exec_reasoning_effort", "codex_exec_use_sdk"), str)
    add_typed(("codex_exec_network_access", "codex_exec_web_search"), _BOOL)
    add("--codex_exec_approval_policy", type=str)

    # Claude Code exec settings
    add_typed(
        (
            "claude_code_exec_path",
            "claude_code_exec_profile",
            "claude_code_exec_use_sdk",
            "claude_code_exec_effort",
        ),
        str,
    )
    add("--claude_code_exec_max_thinking_tokens", type=int)

    # Output, data and split selection
    add_typed(("out_root", "data_path"), str)
    add("--split_mode", type=str,
        choices=["ratio", "split_dir"])
    add("--split_ratio", type=str)
    add("--split_seed", type=int)
    add_typed(("split_dir", "split_output_dir", "data_root"), str)

    # Runtime knobs
    add_typed(
        ("max_turns", "workers", "max_api_workers", "seed", "test_env_num"),
        int,
    )
    add("--mode", type=str,
        help="SpreadsheetBench: single/multi/react (default comes from config)")
    return parser.parse_args()
199
+
200
+
201
def main() -> None:
    """Evaluate one skill file on a dataset and write ``eval_summary.json``.

    Steps, in order:

    1. Parse the CLI and load the config, applying ``--cfg-options`` and the
       legacy flat ``--key value`` overrides.
    2. Resolve the optimizer/target backends and substitute backend-specific
       default models where the config still holds an OpenAI default.
    3. Pick (or create) the output directory and read the skill file.
    4. Configure the model backends.
    5. Build the adapter, collect the evaluation items for the requested
       split, run the rollout and score the results.
    6. Print the scores and save a JSON summary under the output directory.
    """
    args = parse_args()

    from skillopt.config import load_config as _load, flatten_config, is_structured

    cfg = _load(args.config, overrides=args.cfg_options)
    structured = is_structured(cfg)

    # Apply legacy --key value overrides
    cli = {
        k: v
        for k, v in vars(args).items()
        if v is not None and k not in ("config", "skill", "split", "cfg_options")
    }
    if cli:
        if structured:
            from skillopt.config import apply_overrides

            # Flat CLI name -> dotted key in the structured config. Names
            # missing from this table fall back to the ``env.`` section.
            _MAP = {
                "backend": "model.backend",
                "optimizer_model": "model.optimizer",
                "target_model": "model.target",
                "optimizer_backend": "model.optimizer_backend",
                "target_backend": "model.target_backend",
                "reasoning_effort": "model.reasoning_effort",
                "azure_endpoint": "model.azure_endpoint",
                "azure_api_version": "model.azure_api_version",
                "azure_api_key": "model.azure_api_key",
                "azure_openai_endpoint": "model.azure_openai_endpoint",
                "azure_openai_api_version": "model.azure_openai_api_version",
                "azure_openai_api_key": "model.azure_openai_api_key",
                "azure_openai_auth_mode": "model.azure_openai_auth_mode",
                "azure_openai_ad_scope": "model.azure_openai_ad_scope",
                "azure_openai_managed_identity_client_id": "model.azure_openai_managed_identity_client_id",
                "optimizer_azure_openai_endpoint": "model.optimizer_azure_openai_endpoint",
                "optimizer_azure_openai_api_version": "model.optimizer_azure_openai_api_version",
                "optimizer_azure_openai_api_key": "model.optimizer_azure_openai_api_key",
                "optimizer_azure_openai_auth_mode": "model.optimizer_azure_openai_auth_mode",
                "optimizer_azure_openai_ad_scope": "model.optimizer_azure_openai_ad_scope",
                "optimizer_azure_openai_managed_identity_client_id": "model.optimizer_azure_openai_managed_identity_client_id",
                "target_azure_openai_endpoint": "model.target_azure_openai_endpoint",
                "target_azure_openai_api_version": "model.target_azure_openai_api_version",
                "target_azure_openai_api_key": "model.target_azure_openai_api_key",
                "target_azure_openai_auth_mode": "model.target_azure_openai_auth_mode",
                "target_azure_openai_ad_scope": "model.target_azure_openai_ad_scope",
                "target_azure_openai_managed_identity_client_id": "model.target_azure_openai_managed_identity_client_id",
                "codex_exec_path": "model.codex_exec_path",
                "codex_exec_sandbox": "model.codex_exec_sandbox",
                "codex_exec_profile": "model.codex_exec_profile",
                "codex_exec_full_auto": "model.codex_exec_full_auto",
                "codex_exec_reasoning_effort": "model.codex_exec_reasoning_effort",
                "codex_exec_use_sdk": "model.codex_exec_use_sdk",
                "codex_exec_network_access": "model.codex_exec_network_access",
                "codex_exec_web_search": "model.codex_exec_web_search",
                "codex_exec_approval_policy": "model.codex_exec_approval_policy",
                "claude_code_exec_path": "model.claude_code_exec_path",
                "claude_code_exec_profile": "model.claude_code_exec_profile",
                "claude_code_exec_use_sdk": "model.claude_code_exec_use_sdk",
                "claude_code_exec_effort": "model.claude_code_exec_effort",
                "claude_code_exec_max_thinking_tokens": "model.claude_code_exec_max_thinking_tokens",
                "seed": "train.seed",
                "test_env_num": "evaluation.test_env_num",
                "env": "env.name",
                "out_root": "env.out_root",
            }
            mapped = []
            for k, v in cli.items():
                dotted = _MAP.get(k)
                if dotted:
                    mapped.append(f"{dotted}={v}")
                else:
                    mapped.append(f"env.{k}={v}")
            apply_overrides(cfg, mapped)
        else:
            cfg.update(cli)

    cfg = flatten_config(cfg) if structured else cfg

    # Back-fill the newer azure_openai_* keys from their legacy aliases when
    # only the legacy key carries a value.
    for new_key, old_key in (
        ("azure_openai_endpoint", "azure_endpoint"),
        ("azure_openai_api_version", "azure_api_version"),
        ("azure_openai_api_key", "azure_api_key"),
    ):
        if cfg.get(new_key) in (None, "") and cfg.get(old_key) not in (None, ""):
            cfg[new_key] = cfg[old_key]

    # A backend counts as "explicit" when given via --backend or via a
    # ``model.backend=...`` entry in --cfg-options.
    explicit_backend = getattr(args, "backend", None)
    if explicit_backend is None:
        for option in args.cfg_options or []:
            key, sep, value = str(option).partition("=")
            # ``sep`` is empty for a token without "=", which carries no
            # value and is therefore ignored instead of raising IndexError.
            if sep and key.strip() == "model.backend":
                explicit_backend = value.strip()
                break

    backend = normalize_backend_name(
        cfg.get("model_backend") or cfg.get("target_backend") or "azure_openai"
    )

    def _has_model_override(dotted_key: str, legacy_key: str) -> bool:
        # True when the user set this model on the command line, either via
        # the legacy flag or via a dotted --cfg-options entry.
        if getattr(args, legacy_key, None) is not None:
            return True
        for option in args.cfg_options or []:
            key = str(option).partition("=")[0].strip()
            if key == dotted_key:
                return True
        return False

    if explicit_backend is not None:
        backend = normalize_backend_name(explicit_backend)
        cfg["model_backend"] = backend
        if backend in {"claude", "claude_chat"}:
            cfg.setdefault("optimizer_backend", "claude_chat")
            cfg.setdefault("target_backend", "claude_chat")
        elif backend in {"codex", "codex_exec"}:
            cfg.setdefault("optimizer_backend", "openai_chat")
            cfg.setdefault("target_backend", "codex_exec")
        elif backend == "claude_code_exec":
            cfg.setdefault("optimizer_backend", "openai_chat")
            cfg.setdefault("target_backend", "claude_code_exec")
        else:
            cfg.setdefault("optimizer_backend", "openai_chat")
            cfg.setdefault("target_backend", "openai_chat")
    else:
        cfg.setdefault("optimizer_backend", "openai_chat")
        cfg.setdefault("target_backend", "openai_chat")

    # Replace a leftover OpenAI default model with the Claude default when a
    # Claude backend is selected and the user did not choose a model.
    if cfg.get("optimizer_backend") == "claude_chat":
        if (
            str(cfg.get("optimizer_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
            and not _has_model_override("model.optimizer", "optimizer_model")
        ):
            cfg["optimizer_model"] = default_model_for_backend("claude_chat")
    if cfg.get("target_backend") == "claude_chat":
        if (
            str(cfg.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
            and not _has_model_override("model.target", "target_model")
        ):
            cfg["target_model"] = default_model_for_backend("claude_chat")
    if cfg.get("target_backend") == "claude_code_exec":
        if (
            str(cfg.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
            and not _has_model_override("model.target", "target_model")
        ):
            cfg["target_model"] = default_model_for_backend("claude_chat")

    if not cfg.get("out_root"):
        env = cfg.get("env", "unknown")
        # ``target_model`` may be present but None/empty; fall back to
        # "unknown" rather than calling .replace() on a non-string.
        model = str(cfg.get("target_model") or "unknown").replace("/", "-")
        ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        cfg["out_root"] = os.path.join("outputs", f"eval_{env}_{model}_{ts}")

    cfg["out_root"] = os.path.abspath(cfg["out_root"])

    out_root = cfg["out_root"]
    os.makedirs(out_root, exist_ok=True)

    # Load skill (explicit UTF-8 so the result does not depend on the locale)
    skill_path = os.path.abspath(args.skill)
    with open(skill_path, encoding="utf-8") as f:
        skill_content = f.read()
    print(f" [skill] {skill_path} ({len(skill_content)} chars)")

    # Configure models
    configure_azure_openai(
        endpoint=(cfg.get("azure_openai_endpoint") or cfg.get("azure_endpoint") or None),
        api_version=(cfg.get("azure_openai_api_version") or cfg.get("azure_api_version") or None),
        api_key=(cfg.get("azure_openai_api_key") or cfg.get("azure_api_key") or None),
        auth_mode=cfg.get("azure_openai_auth_mode") or None,
        ad_scope=cfg.get("azure_openai_ad_scope") or None,
        managed_identity_client_id=cfg.get("azure_openai_managed_identity_client_id") or None,
        optimizer_endpoint=cfg.get("optimizer_azure_openai_endpoint") or None,
        optimizer_api_version=cfg.get("optimizer_azure_openai_api_version") or None,
        optimizer_api_key=cfg.get("optimizer_azure_openai_api_key") or None,
        optimizer_auth_mode=cfg.get("optimizer_azure_openai_auth_mode") or None,
        optimizer_ad_scope=cfg.get("optimizer_azure_openai_ad_scope") or None,
        optimizer_managed_identity_client_id=(
            cfg.get("optimizer_azure_openai_managed_identity_client_id") or None
        ),
        target_endpoint=cfg.get("target_azure_openai_endpoint") or None,
        target_api_version=cfg.get("target_azure_openai_api_version") or None,
        target_api_key=cfg.get("target_azure_openai_api_key") or None,
        target_auth_mode=cfg.get("target_azure_openai_auth_mode") or None,
        target_ad_scope=cfg.get("target_azure_openai_ad_scope") or None,
        target_managed_identity_client_id=(
            cfg.get("target_azure_openai_managed_identity_client_id") or None
        ),
    )
    set_optimizer_backend(cfg.get("optimizer_backend", "openai_chat"))
    set_target_backend(cfg.get("target_backend", "openai_chat"))
    set_optimizer_deployment(cfg.get("optimizer_model", default_model_for_backend(backend)))
    set_target_deployment(cfg.get("target_model", default_model_for_backend(backend)))
    configure_codex_exec(
        path=cfg.get("codex_exec_path", "codex"),
        sandbox=cfg.get("codex_exec_sandbox", "workspace-write"),
        profile=cfg.get("codex_exec_profile", ""),
        full_auto=cfg.get("codex_exec_full_auto", False),
        reasoning_effort=cfg.get("codex_exec_reasoning_effort", "none"),
        use_sdk=cfg.get("codex_exec_use_sdk", None),
        network_access=cfg.get("codex_exec_network_access", False),
        web_search=cfg.get("codex_exec_web_search", False),
        approval_policy=cfg.get("codex_exec_approval_policy", "never"),
    )
    configure_claude_code_exec(
        path=cfg.get("claude_code_exec_path", "claude"),
        profile=cfg.get("claude_code_exec_profile", ""),
        use_sdk=cfg.get("claude_code_exec_use_sdk", None),
        effort=cfg.get("claude_code_exec_effort", cfg.get("reasoning_effort", "medium")),
        max_thinking_tokens=cfg.get("claude_code_exec_max_thinking_tokens", 16384),
    )
    set_reasoning_effort(cfg.get("reasoning_effort", "") or None)

    # Build adapter
    adapter = get_adapter(cfg)
    adapter.setup(cfg)

    seed = cfg.get("seed", 42)
    split = args.split or "all"

    if split == "all":
        # "all" concatenates the three named splits, each built with an
        # environment count of 0.
        items = (
            adapter.build_eval_env(0, "train", seed)
            + adapter.build_eval_env(0, "valid_seen", seed)
            + adapter.build_eval_env(0, "valid_unseen", seed)
        )
    else:
        env_num = cfg.get("test_env_num", 0)
        items = adapter.build_eval_env(env_num, split, seed)

    print(f"\n [eval] split={split} items={len(items)}")
    print(f" [eval] out_root={out_root}")
    print(f"{'='*60}")

    # Run rollout
    results = adapter.rollout(items, skill_content, out_root)

    # Score
    hard, soft = compute_score(results)
    print(f"\n{'='*60}")
    print(f" Results: hard={hard:.4f} soft={soft:.4f} (n={len(results)})")
    print(f"{'='*60}")

    # Save summary. ensure_ascii=False can emit non-ASCII characters (for
    # example from the skill path), so the file is written as UTF-8.
    summary = {
        "skill": skill_path,
        "split": split,
        "n_items": len(results),
        "hard": hard,
        "soft": soft,
    }
    with open(os.path.join(out_root, "eval_summary.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print(f" Saved to: {out_root}")
448
+
449
+
450
+ if __name__ == "__main__":
451
+ main()