skillopt 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. skillopt-0.1.0/LICENSE +21 -0
  2. skillopt-0.1.0/PKG-INFO +444 -0
  3. skillopt-0.1.0/README.md +395 -0
  4. skillopt-0.1.0/pyproject.toml +75 -0
  5. skillopt-0.1.0/scripts/__init__.py +0 -0
  6. skillopt-0.1.0/scripts/eval_only.py +451 -0
  7. skillopt-0.1.0/scripts/train.py +548 -0
  8. skillopt-0.1.0/setup.cfg +4 -0
  9. skillopt-0.1.0/skillopt/__init__.py +28 -0
  10. skillopt-0.1.0/skillopt/config.py +286 -0
  11. skillopt-0.1.0/skillopt/datasets/__init__.py +7 -0
  12. skillopt-0.1.0/skillopt/datasets/base.py +512 -0
  13. skillopt-0.1.0/skillopt/engine/__init__.py +9 -0
  14. skillopt-0.1.0/skillopt/engine/trainer.py +2083 -0
  15. skillopt-0.1.0/skillopt/envs/__init__.py +1 -0
  16. skillopt-0.1.0/skillopt/envs/_template/env_template.py +196 -0
  17. skillopt-0.1.0/skillopt/envs/_template/loader_template.py +87 -0
  18. skillopt-0.1.0/skillopt/envs/alfworld/__init__.py +5 -0
  19. skillopt-0.1.0/skillopt/envs/alfworld/adapter.py +459 -0
  20. skillopt-0.1.0/skillopt/envs/alfworld/dataloader.py +123 -0
  21. skillopt-0.1.0/skillopt/envs/alfworld/reflect.py +4 -0
  22. skillopt-0.1.0/skillopt/envs/alfworld/rollout.py +347 -0
  23. skillopt-0.1.0/skillopt/envs/alfworld/vendor/__init__.py +9 -0
  24. skillopt-0.1.0/skillopt/envs/alfworld/vendor/alfworld_envs.py +221 -0
  25. skillopt-0.1.0/skillopt/envs/alfworld/vendor/alfworld_projection.py +60 -0
  26. skillopt-0.1.0/skillopt/envs/alfworld/vendor/alfworld_prompts.py +8 -0
  27. skillopt-0.1.0/skillopt/envs/alfworld/vendor/env_base.py +84 -0
  28. skillopt-0.1.0/skillopt/envs/alfworld/vendor/env_manager.py +139 -0
  29. skillopt-0.1.0/skillopt/envs/alfworld/vendor/memory.py +87 -0
  30. skillopt-0.1.0/skillopt/envs/base.py +309 -0
  31. skillopt-0.1.0/skillopt/envs/docvqa/__init__.py +1 -0
  32. skillopt-0.1.0/skillopt/envs/docvqa/adapter.py +115 -0
  33. skillopt-0.1.0/skillopt/envs/docvqa/dataloader.py +61 -0
  34. skillopt-0.1.0/skillopt/envs/docvqa/evaluator.py +113 -0
  35. skillopt-0.1.0/skillopt/envs/docvqa/rollout.py +391 -0
  36. skillopt-0.1.0/skillopt/envs/livemathematicianbench/__init__.py +1 -0
  37. skillopt-0.1.0/skillopt/envs/livemathematicianbench/adapter.py +162 -0
  38. skillopt-0.1.0/skillopt/envs/livemathematicianbench/dataloader.py +308 -0
  39. skillopt-0.1.0/skillopt/envs/livemathematicianbench/evaluator.py +62 -0
  40. skillopt-0.1.0/skillopt/envs/livemathematicianbench/reflect.py +4 -0
  41. skillopt-0.1.0/skillopt/envs/livemathematicianbench/rollout.py +434 -0
  42. skillopt-0.1.0/skillopt/envs/officeqa/__init__.py +1 -0
  43. skillopt-0.1.0/skillopt/envs/officeqa/adapter.py +135 -0
  44. skillopt-0.1.0/skillopt/envs/officeqa/dataloader.py +71 -0
  45. skillopt-0.1.0/skillopt/envs/officeqa/evaluator.py +46 -0
  46. skillopt-0.1.0/skillopt/envs/officeqa/rollout.py +799 -0
  47. skillopt-0.1.0/skillopt/envs/officeqa/tool_runtime.py +552 -0
  48. skillopt-0.1.0/skillopt/envs/searchqa/__init__.py +1 -0
  49. skillopt-0.1.0/skillopt/envs/searchqa/adapter.py +129 -0
  50. skillopt-0.1.0/skillopt/envs/searchqa/dataloader.py +42 -0
  51. skillopt-0.1.0/skillopt/envs/searchqa/evaluator.py +100 -0
  52. skillopt-0.1.0/skillopt/envs/searchqa/reflect.py +4 -0
  53. skillopt-0.1.0/skillopt/envs/searchqa/rollout.py +481 -0
  54. skillopt-0.1.0/skillopt/envs/spreadsheetbench/__init__.py +5 -0
  55. skillopt-0.1.0/skillopt/envs/spreadsheetbench/adapter.py +192 -0
  56. skillopt-0.1.0/skillopt/envs/spreadsheetbench/codegen_agent.py +726 -0
  57. skillopt-0.1.0/skillopt/envs/spreadsheetbench/dataloader.py +37 -0
  58. skillopt-0.1.0/skillopt/envs/spreadsheetbench/evaluator.py +158 -0
  59. skillopt-0.1.0/skillopt/envs/spreadsheetbench/executor.py +67 -0
  60. skillopt-0.1.0/skillopt/envs/spreadsheetbench/react_agent.py +395 -0
  61. skillopt-0.1.0/skillopt/envs/spreadsheetbench/reflect.py +4 -0
  62. skillopt-0.1.0/skillopt/envs/spreadsheetbench/rollout.py +934 -0
  63. skillopt-0.1.0/skillopt/evaluation/__init__.py +13 -0
  64. skillopt-0.1.0/skillopt/evaluation/gate.py +148 -0
  65. skillopt-0.1.0/skillopt/gradient/__init__.py +15 -0
  66. skillopt-0.1.0/skillopt/gradient/aggregate.py +253 -0
  67. skillopt-0.1.0/skillopt/gradient/reflect.py +588 -0
  68. skillopt-0.1.0/skillopt/model/__init__.py +512 -0
  69. skillopt-0.1.0/skillopt/model/azure_openai.py +915 -0
  70. skillopt-0.1.0/skillopt/model/backend_config.py +185 -0
  71. skillopt-0.1.0/skillopt/model/claude_backend.py +359 -0
  72. skillopt-0.1.0/skillopt/model/codex_backend.py +664 -0
  73. skillopt-0.1.0/skillopt/model/codex_harness.py +1057 -0
  74. skillopt-0.1.0/skillopt/model/common.py +229 -0
  75. skillopt-0.1.0/skillopt/model/minimax_backend.py +277 -0
  76. skillopt-0.1.0/skillopt/model/qwen_backend.py +455 -0
  77. skillopt-0.1.0/skillopt/model/router.py +236 -0
  78. skillopt-0.1.0/skillopt/optimizer/__init__.py +15 -0
  79. skillopt-0.1.0/skillopt/optimizer/clip.py +109 -0
  80. skillopt-0.1.0/skillopt/optimizer/lr_autonomous.py +108 -0
  81. skillopt-0.1.0/skillopt/optimizer/meta_skill.py +79 -0
  82. skillopt-0.1.0/skillopt/optimizer/rewrite.py +59 -0
  83. skillopt-0.1.0/skillopt/optimizer/scheduler.py +127 -0
  84. skillopt-0.1.0/skillopt/optimizer/select.py +4 -0
  85. skillopt-0.1.0/skillopt/optimizer/skill.py +164 -0
  86. skillopt-0.1.0/skillopt/optimizer/slow_update.py +396 -0
  87. skillopt-0.1.0/skillopt/optimizer/update_modes.py +136 -0
  88. skillopt-0.1.0/skillopt/prompts/__init__.py +63 -0
  89. skillopt-0.1.0/skillopt/scheduler/__init__.py +8 -0
  90. skillopt-0.1.0/skillopt/types.py +306 -0
  91. skillopt-0.1.0/skillopt/utils/__init__.py +4 -0
  92. skillopt-0.1.0/skillopt/utils/json_utils.py +42 -0
  93. skillopt-0.1.0/skillopt/utils/scoring.py +28 -0
  94. skillopt-0.1.0/skillopt.egg-info/PKG-INFO +444 -0
  95. skillopt-0.1.0/skillopt.egg-info/SOURCES.txt +103 -0
  96. skillopt-0.1.0/skillopt.egg-info/dependency_links.txt +1 -0
  97. skillopt-0.1.0/skillopt.egg-info/entry_points.txt +3 -0
  98. skillopt-0.1.0/skillopt.egg-info/requires.txt +33 -0
  99. skillopt-0.1.0/skillopt.egg-info/top_level.txt +4 -0
  100. skillopt-0.1.0/skillopt_webui/__init__.py +0 -0
  101. skillopt-0.1.0/skillopt_webui/__main__.py +3 -0
  102. skillopt-0.1.0/skillopt_webui/app.py +550 -0
  103. skillopt-0.1.0/tests/test_json_utils.py +112 -0
  104. skillopt-0.1.0/tests/test_scoring.py +106 -0
  105. skillopt-0.1.0/tests/test_types.py +249 -0
skillopt-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Microsoft Corporation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,444 @@
1
+ Metadata-Version: 2.1
2
+ Name: skillopt
3
+ Version: 0.1.0
4
+ Summary: SkillOpt: Agentic Skill Optimization via Reflective Training Loops
5
+ Author: SkillOpt Team
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/microsoft/SkillOpt
8
+ Project-URL: Documentation, https://microsoft.github.io/SkillOpt
9
+ Project-URL: Repository, https://github.com/microsoft/SkillOpt
10
+ Project-URL: Issues, https://github.com/microsoft/SkillOpt/issues
11
+ Keywords: agent,prompt-optimization,skill-learning,LLM,agentic
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: openai>=1.30.0
24
+ Requires-Dist: pyyaml>=6.0
25
+ Requires-Dist: numpy>=1.24.0
26
+ Requires-Dist: openpyxl>=3.1.0
27
+ Requires-Dist: azure-identity>=1.15.0
28
+ Requires-Dist: azure-core>=1.30.0
29
+ Requires-Dist: httpx>=0.27.0
30
+ Provides-Extra: alfworld
31
+ Requires-Dist: alfworld>=0.4.0; extra == "alfworld"
32
+ Requires-Dist: gymnasium>=0.29.0; extra == "alfworld"
33
+ Provides-Extra: claude
34
+ Requires-Dist: claude-agent-sdk>=0.1.0; extra == "claude"
35
+ Provides-Extra: qwen
36
+ Requires-Dist: vllm>=0.4.0; extra == "qwen"
37
+ Provides-Extra: docs
38
+ Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
39
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "docs"
40
+ Provides-Extra: webui
41
+ Requires-Dist: gradio>=4.0.0; extra == "webui"
42
+ Provides-Extra: dev
43
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
44
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
45
+ Provides-Extra: all
46
+ Requires-Dist: alfworld>=0.4.0; extra == "all"
47
+ Requires-Dist: gymnasium>=0.29.0; extra == "all"
48
+ Requires-Dist: claude-agent-sdk>=0.1.0; extra == "all"
49
+
50
+ # SkillOpt: Executive Strategy for Self-Evolving Agent Skills
51
+
52
+ *Train agent skills like you train neural networks — with epochs, (mini-)batch sizes, learning rates, and validation gates — but without touching model weights.*
53
+
54
+ [![Project Page](https://img.shields.io/badge/Project%20Page-SkillOpt-8dbb3c)](https://microsoft.github.io/SkillOpt/) [![Paper](https://img.shields.io/badge/Paper-arXiv-b31b1b)](https://arxiv.org/abs/2605.23904) [![Project Video](https://img.shields.io/badge/Project%20Video-Watch%20Demo-ff0000)](https://youtu.be/JUBMDTCiM0M) [![Python 3.10+](https://img.shields.io/badge/Python-3.10%2B-blue.svg)](https://www.python.org/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
55
+
56
+ ---
57
+
58
+ ## Overview
59
+
60
+ Modern agent skills are usually hand-crafted, generated one-shot by a strong
61
+ LLM, or evolved through loosely controlled self-revision — none of which
62
+ behaves like a deep-learning optimizer for the skill itself, and none of
63
+ which reliably improves over its starting point under feedback.
64
+
65
+ **SkillOpt treats the skill document as the trainable state of a frozen
66
+ agent**, and trains it with the discipline that makes weight-space
67
+ optimization reproducible. A separate optimizer model turns scored rollouts
68
+ into bounded add / delete / replace edits on a single skill document; a
69
+ candidate edit is accepted only when it strictly improves a held-out
70
+ validation score. A textual learning-rate budget, a rejected-edit buffer,
71
+ and an epoch-wise slow / meta update make skill training stable while
72
+ adding **zero inference-time model calls** at deployment.
73
+
74
+ The deployed artifact is a compact `best_skill.md` (typically 300–2,000
75
+ tokens) that runs against the unchanged target model. Across **six
76
+ benchmarks, seven target models, and three execution harnesses** (direct
77
+ chat, Codex CLI, Claude Code CLI), SkillOpt is best or tied-best on **all
78
+ 52 evaluated (model, benchmark, harness) cells** and on GPT-5.5 lifts the
79
+ average no-skill accuracy by **+23.5 points in direct chat, +24.8 inside
80
+ the Codex agentic loop, and +19.1 inside Claude Code**. Optimized skill
81
+ artifacts transfer across model scales, between Codex and Claude Code
82
+ harnesses, and to nearby benchmarks without further optimization.
83
+
84
+ For the full method, ablations, and per-cell results see the [paper](https://arxiv.org/abs/2605.23904); for a visual walkthrough of the loop see the [project page](https://microsoft.github.io/SkillOpt/); for deeper API / backend / benchmark docs see [`docs/`](docs/).
85
+
86
+ ## 🎬 Demo Video
87
+
88
+ https://github.com/user-attachments/assets/eb12d3bc-371c-467f-904d-91b61f339ed7
89
+
90
+ <p align="center">
91
+ <a href="https://youtu.be/JUBMDTCiM0M"><b>▶ Watch the full demo on YouTube</b></a>
92
+ </p>
93
+
94
+ ---
95
+
96
+ ## Install
97
+
98
+ ### Requirements
99
+
100
+ - Python 3.10+
101
+
102
+ ```bash
103
+ git clone https://github.com/microsoft/SkillOpt.git
104
+ cd SkillOpt
105
+ pip install -e .
106
+
107
+ # For the ALFWorld benchmark (optional):
108
+ pip install -e ".[alfworld]"
109
+ alfworld-download
110
+ ```
111
+
112
+ ### Configure API Credentials
113
+
114
+ ```bash
115
+ cp .env.example .env
116
+ # Edit .env with your API credentials, then:
117
+ source .env
118
+ ```
119
+
120
+ #### Azure OpenAI *(recommended)*
121
+
122
+ ```bash
123
+ export AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com/"
124
+ # Option 1: API key auth
125
+ export AZURE_OPENAI_API_KEY="your-key"
126
+ # Option 2: Azure CLI auth (no API key needed)
127
+ export AZURE_OPENAI_AUTH_MODE="azure_cli"
128
+ ```
129
+
130
+ > **Note:** `AZURE_OPENAI_ENDPOINT` is required for all three values of `AZURE_OPENAI_AUTH_MODE` (`api_key`, `azure_cli`, `openai_compatible`). Without it, all LLM calls will fail.
131
+
132
+ #### OpenAI-compatible endpoints
133
+
134
+ ```bash
135
+ export AZURE_OPENAI_ENDPOINT="https://api.openai.com/v1"
136
+ export AZURE_OPENAI_API_KEY="sk-..."
137
+ export AZURE_OPENAI_AUTH_MODE="openai_compatible"
138
+ ```
139
+
140
+ This routes all calls through the plain OpenAI Python client (no Azure auth, no `api-version` header).
141
+
142
+ > **Note:** SkillOpt reuses the `AZURE_OPENAI_*` env var names even in this mode — there is no separate `OPENAI_API_KEY` knob.
143
+
144
+ #### Anthropic Claude
145
+
146
+ ```bash
147
+ export ANTHROPIC_API_KEY="sk-ant-..."
148
+ ```
149
+
150
+ #### Qwen *(local vLLM)*
151
+
152
+ ```bash
153
+ export QWEN_CHAT_BASE_URL="http://localhost:8000/v1"
154
+ export QWEN_CHAT_MODEL="Qwen/Qwen3.5-4B"
155
+ ```
156
+
157
+ `qwen_chat` can also be used as the optimizer backend. When the optimizer and
158
+ target should point to different local vLLM services, use the role-specific
159
+ settings:
160
+
161
+ ```bash
162
+ python scripts/train.py \
163
+ --config configs/searchqa/default.yaml \
164
+ --optimizer_backend qwen_chat \
165
+ --target_backend qwen_chat \
166
+ --optimizer_model Qwen/Qwen3.5-4B \
167
+ --target_model Qwen/Qwen3.5-4B \
168
+ --optimizer_qwen_chat_base_url http://localhost:8001/v1 \
169
+ --target_qwen_chat_base_url http://localhost:8000/v1
170
+ ```
171
+
172
+ #### MiniMax
173
+
174
+ ```bash
175
+ export MINIMAX_BASE_URL="https://api.minimax.io/v1"
176
+ export MINIMAX_API_KEY="..."
177
+ export MINIMAX_MODEL="MiniMax-M2.7"
178
+ ```
179
+
180
+ ---
181
+
182
+ ## Quick Start
183
+
184
+ ### Training
185
+
186
+ ```bash
187
+ # Minimal example — train on SearchQA:
188
+ python scripts/train.py \
189
+ --config configs/searchqa/default.yaml \
190
+ --split_dir /path/to/your/searchqa_split \
191
+ --azure_openai_endpoint https://your-resource.openai.azure.com/ \
192
+ --optimizer_model gpt-5.5 \
193
+ --target_model gpt-5.5
194
+
195
+ # Train on LiveMathematicianBench:
196
+ python scripts/train.py \
197
+ --config configs/livemathematicianbench/default.yaml \
198
+ --split_dir /path/to/your/livemath_split \
199
+ --azure_openai_endpoint https://your-resource.openai.azure.com/ \
200
+ --optimizer_model gpt-5.5 \
201
+ --target_model gpt-5.5
202
+
203
+ # Train on ALFWorld:
204
+ python scripts/train.py \
205
+ --config configs/alfworld/default.yaml \
206
+ --split_dir data/alfworld_path_split \
207
+ --azure_openai_endpoint https://your-resource.openai.azure.com/ \
208
+ --optimizer_model gpt-5.5 \
209
+ --target_model gpt-5.5
210
+ ```
211
+
212
+ Key CLI arguments:
213
+
214
+ | Argument | Description | Example |
215
+ |---|---|---|
216
+ | `--config` | Benchmark config YAML | `configs/searchqa/default.yaml` |
217
+ | `--split_dir` | Path to data split directory | `/path/to/split` |
218
+ | `--azure_openai_endpoint` | Azure OpenAI endpoint URL | `https://your-resource.openai.azure.com/` |
219
+ | `--optimizer_model` | Optimizer model deployment name | `gpt-5.5` |
220
+ | `--target_model` | Target model deployment name | `gpt-5.5` |
221
+ | `--num_epochs` | Number of training epochs | `4` |
222
+ | `--batch_size` | Batch size per step | `40` |
223
+ | `--workers` | Parallel rollout workers | `8` |
224
+ | `--out_root` | Output directory | `outputs/my_run` |
225
+
226
+ ### Eval Only
227
+
228
+ Evaluate a trained skill on specific data splits without training:
229
+
230
+ ```bash
231
+ # Evaluate the packaged GPT-5.5 SearchQA skill on the test split:
232
+ python scripts/eval_only.py \
233
+ --config configs/searchqa/default.yaml \
234
+ --skill ckpt/searchqa/gpt5.5_skill.md \
235
+ --split valid_unseen \
236
+ --split_dir /path/to/searchqa_split \
237
+ --azure_openai_endpoint https://your-resource.openai.azure.com/
238
+
239
+ # Evaluate on all splits (train + val + test):
240
+ python scripts/eval_only.py \
241
+ --config configs/searchqa/default.yaml \
242
+ --skill ckpt/searchqa/gpt5.5_skill.md \
243
+ --split all \
244
+ --split_dir /path/to/searchqa_split \
245
+ --azure_openai_endpoint https://your-resource.openai.azure.com/
246
+ ```
247
+
248
+ To evaluate a skill produced by your own training run, set `--skill` to that run's best-skill path, for example `outputs/my_run/best_skill.md`.
249
+
250
+ | Split | Description |
251
+ |---|---|
252
+ | `valid_unseen` | Test set |
253
+ | `valid_seen` | Validation set |
254
+ | `train` | Training set |
255
+ | `all` | All splits combined (default) |
256
+
257
+ ### Output Structure
258
+
259
+ Each training run writes to a structured output directory:
260
+
261
+ ```
262
+ outputs/<run_name>/
263
+ ├── config.json # Flattened runtime config
264
+ ├── history.json # Per-step training history
265
+ ├── runtime_state.json # Resume checkpoint
266
+ ├── best_skill.md # Best validated skill document
267
+ ├── skills/skill_vXXXX.md # Skill snapshot per step
268
+ ├── steps/step_XXXX/ # Per-step artifacts (patches, evals)
269
+ ├── slow_update/epoch_XX/ # Slow update logs
270
+ └── meta_skill/epoch_XX/ # Meta skill logs
271
+ ```
272
+
273
+ Re-running the same command auto-resumes from the last completed step.
274
+
275
+ ### Pretrained Skill Artifacts
276
+
277
+ We provide a subset of the paper's main Table 1 GPT-5.5 optimized skills in
278
+ [`ckpt/`](ckpt/) as reference artifacts. Use them with `scripts/eval_only.py`
279
+ to evaluate the provided skills on a matching data split without re-running
280
+ training. See [`ckpt/README.md`](ckpt/README.md) for the full per-benchmark
281
+ commands. This is the first artifact batch; we plan to continue uploading
282
+ the remaining optimized skills and benchmark split manifests as they are
283
+ cleaned and verified.
284
+
285
+ ---
286
+
287
+ ## Data Preparation
288
+
289
+ ### Directory layout
290
+
291
+ SkillOpt expects data in a **split directory** with `train/`, `val/`, `test/` subdirectories, each containing a JSON file (e.g., `items.json`):
292
+
293
+ ```
294
+ data/my_split/
295
+ ├── train/items.json
296
+ ├── val/items.json
297
+ └── test/items.json
298
+ ```
299
+
300
+ Each JSON file is an array of task items. The required fields depend on the benchmark. For example, SearchQA items look like:
301
+
302
+ ```json
303
+ [
304
+ {
305
+ "id": "unique_item_id",
306
+ "question": "Who wrote the novel ...",
307
+ "context": "[DOC] relevant passage text ...",
308
+ "answers": ["expected answer"]
309
+ }
310
+ ]
311
+ ```
312
+
313
+ See `skillopt/envs/<benchmark>/dataloader.py` for the exact format each benchmark expects.
314
+
315
+ > **Note:** Most benchmark datasets are not included in this repository. Prepare your own data following the format above. The exact SearchQA split used in the paper is provided at [`data/searchqa_id_split/`](data/searchqa_id_split) (400 train / 200 val / 1400 test). We are preparing the remaining benchmark split manifests for upload.
316
+
317
+ ### Supported Benchmarks
318
+
319
+ | Benchmark | Type | Config |
320
+ |---|---|---|
321
+ | SearchQA | QA | `configs/searchqa/default.yaml` |
322
+ | ALFWorld | Embodied agent | `configs/alfworld/default.yaml` |
323
+ | DocVQA | Document QA | `configs/docvqa/default.yaml` |
324
+ | LiveMathematicianBench | Math | `configs/livemathematicianbench/default.yaml` |
325
+ | SpreadsheetBench | Code generation | `configs/spreadsheetbench/default.yaml` |
326
+ | OfficeQA | Tool-augmented QA | `configs/officeqa/default.yaml` |
327
+
328
+ ---
329
+
330
+ ## Configuration
331
+
332
+ ### Default settings and paper-reproduction knobs
333
+
334
+ `configs/_base_/default.yaml` is the single source of truth for SkillOpt's
335
+ runtime knobs. Out of the box, every included benchmark config inherits
336
+ from it and keeps the paper protocol visible: 4 epochs, rollout batch 40,
337
+ reflection minibatch 8, textual learning rate 4 with cosine decay, strict
338
+ hard validation gating, and slow-update + meta-skill enabled. One detail to
339
+ watch is slow-update acceptance: the current `main` default is the newer
340
+ post-submission force-accept mode, while the paper protocol and the
341
+ paper-aligned skills under `ckpt/` use the gated semantics described in
342
+ paper Section 3.6.
343
+
344
+ ### Slow-update acceptance mode
345
+
346
+ The epoch-boundary slow / meta update can be applied in one of two ways, controlled
347
+ by `optimizer.slow_update_gate_with_selection`:
348
+
349
+ ```yaml
350
+ optimizer:
351
+ slow_update_gate_with_selection: false # current main default
352
+ ```
353
+
354
+ - **`false`** *(current `main` default)*: force-accept. The
355
+ slow-update guidance is injected into both `current_skill` and
356
+ `best_skill` unconditionally at the epoch boundary. This is the newer
357
+ post-submission behavior on `main`.
358
+ - **`true`** *(paper / ckpt-skill reproduction)*: gated, matching paper
359
+ Section 3.6 verbatim. The slow-update candidate is evaluated on the
360
+ selection split and accepted only if it passes the same validation gate
361
+ as a step-level edit. Use this setting when re-running optimization to
362
+ match the paper protocol and the provenance of the provided `ckpt/` skills.
363
+
364
+ The trainer prints which mode is active at startup
365
+ (`[slow update] acceptance=...`). See issue #22 for the discussion that
366
+ led to the flag.
367
+
368
+ ### Gate metric (`hard` / `soft` / `mixed`)
369
+
370
+ The validation gate compares candidate vs. current skills on the selection
371
+ split using `gate_metric`:
372
+
373
+ - **`hard`** *(default, paper)*: exact-match accuracy; the candidate's score
374
+ must be strictly greater than the current score.
375
+ - **`soft`**: per-item soft / partial-credit score. Useful when the
376
+ selection split is small (e.g. ≤10 items) and the reward is continuous,
377
+ where the discrete hard gate often rejects every candidate.
378
+ - **`mixed`**: weighted average, `(1 - w) * hard + w * soft`, with `w`
379
+ set by `gate_mixed_weight` (default `0.5`).
380
+
381
+ Default is `hard`. Use the optional feature config below to switch.
382
+
383
+ ### Optional feature configs
384
+
385
+ These are **not** default SkillOpt settings — they are optional feature configs
386
+ contributed by users for specific scenarios. The paper-reported numbers
387
+ were obtained with the default settings, not these.
388
+
389
+ - **[`configs/features/soft_gate.yaml`](configs/features/soft_gate.yaml)**
390
+ *(PR #25, contributed by [@lvbaocheng](https://github.com/lvbaocheng))* —
391
+ switches `gate_metric` to `soft` (or `mixed`). See the comment at the
392
+ top of the file for when to use it and when not to.
393
+
394
+ ---
395
+
396
+ ## Extensibility & WebUI
397
+
398
+ ### Adding a new backend
399
+
400
+ A backend = a chat / exec target (e.g. `openai_chat`, `claude_chat`,
401
+ `qwen_chat`, `minimax_chat`, `codex_exec`, `claude_code_exec`). See
402
+ [`docs/guide/new-backend.md`](docs/guide/new-backend.md) for the full
403
+ contract; in short you add a `skillopt/model/<name>_backend.py` module,
404
+ register it in `skillopt/model/common.py` + `backend_config.py`, and wire
405
+ it through the router in `skillopt/model/__init__.py`. `qwen_backend.py`
406
+ and `minimax_backend.py` are good templates.
407
+
408
+ ### Adding a new benchmark
409
+
410
+ A benchmark = a `skillopt/envs/<name>/` package with a `dataloader.py`, a
411
+ `rollout.py`, and an `initial.md` seed skill. See
412
+ [`docs/guide/new-benchmark.md`](docs/guide/new-benchmark.md) for the full
413
+ contract; the simplest reference is `skillopt/envs/searchqa/`.
414
+
415
+ ### WebUI
416
+
417
+ Launch the monitoring dashboard (optional):
418
+
419
+ ```bash
420
+ pip install -e ".[webui]"
421
+ python -m skillopt_webui.app
422
+ ```
423
+
424
+ | Flag | Default | Description |
425
+ |---|---|---|
426
+ | `--port` | 7860 | Server port |
427
+ | `--host` | `0.0.0.0` | Bind address |
428
+ | `--share` | off | Create a public Gradio share link |
429
+
430
+ ---
431
+
432
+ ## Citation
433
+
434
+ ```bibtex
435
+ @misc{yang2026skilloptexecutivestrategyselfevolving,
436
+ title={SkillOpt: Executive Strategy for Self-Evolving Agent Skills},
437
+ author={Yifan Yang and Ziyang Gong and Weiquan Huang and Qihao Yang and Ziwei Zhou and Zisu Huang and Yan Li and Xuemei Gao and Qi Dai and Bei Liu and Kai Qiu and Yuqing Yang and Dongdong Chen and Xue Yang and Chong Luo},
438
+ year={2026},
439
+ eprint={2605.23904},
440
+ archivePrefix={arXiv},
441
+ primaryClass={cs.AI},
442
+ url={https://arxiv.org/abs/2605.23904}
443
+ }
444
+ ```