agent-tune-kit 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_tune_kit-0.3.6/.codex-plugin/plugin.json +41 -0
- agent_tune_kit-0.3.6/.gitignore +39 -0
- agent_tune_kit-0.3.6/PKG-INFO +314 -0
- agent_tune_kit-0.3.6/README.en.md +300 -0
- agent_tune_kit-0.3.6/README.md +302 -0
- agent_tune_kit-0.3.6/README.zh-CN.md +302 -0
- agent_tune_kit-0.3.6/docs/codex_agent_tuning_prd.md +305 -0
- agent_tune_kit-0.3.6/docs/shared-versioning-and-confirmation.md +167 -0
- agent_tune_kit-0.3.6/docs/skill-template-pack-usage.md +149 -0
- agent_tune_kit-0.3.6/pyproject.toml +75 -0
- agent_tune_kit-0.3.6/scripts/check-release.py +255 -0
- agent_tune_kit-0.3.6/scripts/install_plugin.py +28 -0
- agent_tune_kit-0.3.6/scripts/publish-pypi.sh +18 -0
- agent_tune_kit-0.3.6/scripts/publish-release.py +271 -0
- agent_tune_kit-0.3.6/scripts/validate_skill_pack.py +714 -0
- agent_tune_kit-0.3.6/skills/atk-find-failures/SKILL.md +81 -0
- agent_tune_kit-0.3.6/skills/atk-find-failures-by-rule/SKILL.md +83 -0
- agent_tune_kit-0.3.6/skills/atk-init/SKILL.md +127 -0
- agent_tune_kit-0.3.6/skills/atk-init-failure-rule/SKILL.md +83 -0
- agent_tune_kit-0.3.6/skills/atk-report/SKILL.md +114 -0
- agent_tune_kit-0.3.6/skills/atk-run/SKILL.md +102 -0
- agent_tune_kit-0.3.6/skills/atk-status/SKILL.md +87 -0
- agent_tune_kit-0.3.6/skills/atk-tune/SKILL.md +94 -0
- agent_tune_kit-0.3.6/skills/atk-visualize-failures/SKILL.md +93 -0
- agent_tune_kit-0.3.6/skills/atk-visualize-failures/assets/app.js +725 -0
- agent_tune_kit-0.3.6/skills/atk-visualize-failures/assets/page.html +106 -0
- agent_tune_kit-0.3.6/skills/atk-visualize-failures/assets/styles.css +548 -0
- agent_tune_kit-0.3.6/skills/atk-visualize-failures/scripts/generate_failure_browser.py +487 -0
- agent_tune_kit-0.3.6/src/agent_tune_kit/__init__.py +3 -0
- agent_tune_kit-0.3.6/src/agent_tune_kit/cli.py +15 -0
- agent_tune_kit-0.3.6/src/agent_tune_kit/installer.py +915 -0
- agent_tune_kit-0.3.6/templates/.atk/runner/eval_runner.py.md +508 -0
- agent_tune_kit-0.3.6/templates/.atk/runner/failure_rule.py.md +101 -0
- agent_tune_kit-0.3.6/templates/.gitkeep +0 -0
- agent_tune_kit-0.3.6/tests/__init__.py +0 -0
- agent_tune_kit-0.3.6/tests/test_generate_failure_browser.py +240 -0
- agent_tune_kit-0.3.6/tests/test_install_plugin.py +398 -0
- agent_tune_kit-0.3.6/tests/test_release_scripts.py +67 -0
- agent_tune_kit-0.3.6/tests/test_runner_template_row_logging.py +204 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "agent-tune-kit",
|
|
3
|
+
"version": "0.3.6",
|
|
4
|
+
"description": "Local Codex plugin for iterative Agent tuning with guided Skills, reusable runner templates, versioned results, and static validation.",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "hustyichi",
|
|
7
|
+
"email": "hustyichi@163.com",
|
|
8
|
+
"url": "https://github.com/hustyichi"
|
|
9
|
+
},
|
|
10
|
+
"homepage": "https://github.com/hustyichi/agent-tune-kit",
|
|
11
|
+
"repository": "https://github.com/hustyichi/agent-tune-kit",
|
|
12
|
+
"license": "UNLICENSED",
|
|
13
|
+
"keywords": [
|
|
14
|
+
"agent-tune-kit",
|
|
15
|
+
"codex-skills",
|
|
16
|
+
"evaluation",
|
|
17
|
+
"reporting",
|
|
18
|
+
"local-plugin"
|
|
19
|
+
],
|
|
20
|
+
"skills": "./skills/",
|
|
21
|
+
"interface": {
|
|
22
|
+
"displayName": "Agent Tune Kit",
|
|
23
|
+
"shortDescription": "Run a guided local Agent tuning loop",
|
|
24
|
+
"longDescription": "Agent Tune Kit provides Codex Skills for generating local test runners, finding failing cases, reporting regressions and improvements, and applying focused Agent tuning with versioned local artifacts.",
|
|
25
|
+
"developerName": "hustyichi",
|
|
26
|
+
"category": "Coding",
|
|
27
|
+
"capabilities": [
|
|
28
|
+
"Interactive",
|
|
29
|
+
"Read",
|
|
30
|
+
"Write"
|
|
31
|
+
],
|
|
32
|
+
"websiteURL": "https://github.com/hustyichi/agent-tune-kit",
|
|
33
|
+
"defaultPrompt": [
|
|
34
|
+
"Check Agent tuning status",
|
|
35
|
+
"Find Agent failure cases",
|
|
36
|
+
"Generate an Agent tuning report"
|
|
37
|
+
],
|
|
38
|
+
"brandColor": "#334155",
|
|
39
|
+
"screenshots": []
|
|
40
|
+
}
|
|
41
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# macOS / editor noise
|
|
2
|
+
.DS_Store
|
|
3
|
+
*.swp
|
|
4
|
+
*.swo
|
|
5
|
+
*~
|
|
6
|
+
|
|
7
|
+
# Local runtime state
|
|
8
|
+
.omx/
|
|
9
|
+
|
|
10
|
+
# Environment and secrets
|
|
11
|
+
.env
|
|
12
|
+
.env.*
|
|
13
|
+
!.env.example
|
|
14
|
+
|
|
15
|
+
# Python caches and virtual environments
|
|
16
|
+
__pycache__/
|
|
17
|
+
*.py[cod]
|
|
18
|
+
*$py.class
|
|
19
|
+
.pytest_cache/
|
|
20
|
+
.ruff_cache/
|
|
21
|
+
.mypy_cache/
|
|
22
|
+
.coverage
|
|
23
|
+
htmlcov/
|
|
24
|
+
.venv/
|
|
25
|
+
venv/
|
|
26
|
+
env/
|
|
27
|
+
|
|
28
|
+
# Build and package output
|
|
29
|
+
build/
|
|
30
|
+
dist/
|
|
31
|
+
*.egg-info/
|
|
32
|
+
|
|
33
|
+
# Node dependencies and caches, if frontend/tooling is added later
|
|
34
|
+
node_modules/
|
|
35
|
+
.npm/
|
|
36
|
+
.pnpm-store/
|
|
37
|
+
|
|
38
|
+
# Agent tune kit generated outputs
|
|
39
|
+
.atk/
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-tune-kit
|
|
3
|
+
Version: 0.3.6
|
|
4
|
+
Summary: Local Codex plugin for iterative Agent tuning with guided Skills, reusable runner templates, versioned results, and static validation.
|
|
5
|
+
Project-URL: Homepage, https://github.com/hustyichi/agent-tune-kit
|
|
6
|
+
Project-URL: Repository, https://github.com/hustyichi/agent-tune-kit
|
|
7
|
+
Author-email: hustyichi <hustyichi@163.com>
|
|
8
|
+
License: UNLICENSED
|
|
9
|
+
Keywords: agent-tune-kit,codex-skills,evaluation,local-plugin,reporting
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
12
|
+
Requires-Python: >=3.11
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# Agent Tune Kit
|
|
16
|
+
|
|
17
|
+
English | [简体中文](README.md)
|
|
18
|
+
|
|
19
|
+
Agent Tune Kit is a **local Codex plugin** that helps you evaluate and improve the quality of your own local Agent.
|
|
20
|
+
|
|
21
|
+
If you already have a working Agent but are not sure where it fails, why it fails, or what to tune next, this project lets Codex help you run a complete loop: batch test the Agent, find failure cases, write an analysis report, tune the Agent, and verify the next run.
|
|
22
|
+
|
|
23
|
+
Its main advantage is a **low-friction start**. You do not need to design a complex evaluation schema or expose a universal Agent interface first. Bring a local Agent project and a small evaluation dataset; Codex reads the code and data samples, then generates the project-specific runner and tuning workflow.
|
|
24
|
+
|
|
25
|
+
## Who it is for
|
|
26
|
+
|
|
27
|
+
Use this if you have:
|
|
28
|
+
|
|
29
|
+
- a local Agent, chatbot, tool-using Agent, or RAG Agent;
|
|
30
|
+
- a few test questions, sample inputs, expected answers, or human-judgable results;
|
|
31
|
+
- a need to quickly find weak spots and let Codex help tune prompts, code, parameters, or tool configuration;
|
|
32
|
+
- a desire to keep each tuning loop traceable with result files and reports.
|
|
33
|
+
|
|
34
|
+
You do not need a full evaluation platform to start. For the first validation, 5 to 20 CSV rows are enough.
|
|
35
|
+
|
|
36
|
+
## Prerequisites
|
|
37
|
+
|
|
38
|
+
You only need:
|
|
39
|
+
|
|
40
|
+
- Codex with local plugin/Skill support.
|
|
41
|
+
- Python 3.11 or newer.
|
|
42
|
+
- A local Agent project that Codex can inspect and edit.
|
|
43
|
+
- A simple evaluation dataset, preferably CSV. Column names do not need to follow a strict schema; Codex will infer inputs and expected results where possible.
|
|
44
|
+
|
|
45
|
+
Create a git checkpoint before tuning if you want an easy rollback path. Agent Tune Kit does not automate Agent tuning rollback; installer rollback only restores local marketplace/plugin-store install state.
|
|
46
|
+
|
|
47
|
+
## Quickstart: install the plugin
|
|
48
|
+
|
|
49
|
+
No repository clone is needed for normal use. Run the packaged installer directly with uvx:
|
|
50
|
+
|
|
51
|
+
```sh
|
|
52
|
+
uvx --from agent-tune-kit atk install
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
For a persistent command, install the tool first, then run `atk`:
|
|
56
|
+
|
|
57
|
+
```sh
|
|
58
|
+
uv tool install agent-tune-kit
|
|
59
|
+
atk install
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
If you prefer pipx:
|
|
63
|
+
|
|
64
|
+
```sh
|
|
65
|
+
pipx install agent-tune-kit
|
|
66
|
+
atk install
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
The installer validates the packaged plugin manifest, adds the plugin to the Personal marketplace, writes or updates `~/.agents/plugins/marketplace.json`, copies the packaged payload into `~/plugins/agent-tune-kit`, and runs local smoke/status checks by default. It proves local files and marketplace state only; it does not bypass or modify hidden Codex UI enablement state.
|
|
70
|
+
|
|
71
|
+
Useful helper commands:
|
|
72
|
+
|
|
73
|
+
```sh
|
|
74
|
+
atk preview --smoke # preview only; no writes
|
|
75
|
+
atk status # read local install status and next steps
|
|
76
|
+
atk rollback --backup <backup-id> # restore installer-managed local install state only
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
When an existing marketplace/plugin-store conflict is found, interactive terminals prompt before replacement. Noninteractive replacement requires `--yes --force`; destructive replacement creates a backup first and prints a rollback command. The installer supports explicit subcommands only and does not keep old entry points; use `preview` for no-write preview.
|
|
80
|
+
|
|
81
|
+
Contributor checkout path, for editing this repository itself:
|
|
82
|
+
|
|
83
|
+
```sh
|
|
84
|
+
git clone git@github.com:hustyichi/agent-tune-kit.git
|
|
85
|
+
cd agent-tune-kit
|
|
86
|
+
uv sync
|
|
87
|
+
uv run atk install
|
|
88
|
+
# or: python3 scripts/install_plugin.py install
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
After install, Agent Tune Kit should be visible/available in `/plugins`.
|
|
92
|
+
|
|
93
|
+
You still need to enable it in Codex:
|
|
94
|
+
|
|
95
|
+
```text
|
|
96
|
+
/plugins
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Select `Agent Tune Kit` in the plugin list and follow the UI prompt to install/enable it. After you enable it in the UI, `$atk-status` and the other Skill commands should appear in autocomplete.
|
|
100
|
+
|
|
101
|
+
If the plugin is enabled in `/plugins` but `$atk-status` still does not appear in the current session, that is expected: Codex usually loads plugin Skills when a session starts, so newly enabled plugins may not be hot-loaded into an already running session. Restart Codex, or close the current Codex session and reopen this project, then type `$atk-status` again to verify.
|
|
102
|
+
|
|
103
|
+
If your environment cannot use local plugins, do not work around it by copying individual `skills/*` directories on their own; this repository now treats the local Codex plugin install path as the only recommended entry point.
|
|
104
|
+
|
|
105
|
+
## Maintainer release to PyPI
|
|
106
|
+
|
|
107
|
+
The release scripts follow the two-step release gate/publish shape used by `agent-tune-cli`: default mode is a dry run, and uploads only happen with an explicit `--publish`.
|
|
108
|
+
|
|
109
|
+
Run the full local release gate first. It checks version alignment, static validation, tests, `uv build --no-sources`, archive contents, and packaged `atk` smoke installs outside the repository:
|
|
110
|
+
|
|
111
|
+
```sh
|
|
112
|
+
UV_NO_CONFIG=1 uv run python scripts/check-release.py
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Prepare clean `dist/` artifacts without uploading:
|
|
116
|
+
|
|
117
|
+
```sh
|
|
118
|
+
UV_NO_CONFIG=1 uv run python scripts/publish-release.py
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Publish to TestPyPI first:
|
|
122
|
+
|
|
123
|
+
```sh
|
|
124
|
+
export UV_PUBLISH_TOKEN='pypi-your-testpypi-token'
|
|
125
|
+
UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository testpypi --publish
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
After TestPyPI install validation, publish to PyPI:
|
|
129
|
+
|
|
130
|
+
```sh
|
|
131
|
+
export UV_PUBLISH_TOKEN='pypi-your-pypi-token'
|
|
132
|
+
UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository pypi --publish
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
The publish script checks whether the current `project.name` + `project.version` already exists before uploading. If it exists, bump the version in `pyproject.toml`, `.codex-plugin/plugin.json`, and `src/agent_tune_kit/__init__.py` first. Never commit or paste PyPI tokens.
|
|
136
|
+
|
|
137
|
+
For the fixed production PyPI path, you can run the zero-argument wrapper:
|
|
138
|
+
|
|
139
|
+
```sh
|
|
140
|
+
scripts/publish-pypi.sh
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
It is equivalent to `UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository pypi --publish`, but checks that `UV_PUBLISH_TOKEN` is set first.
|
|
144
|
+
|
|
145
|
+
## Minimal tuning loop
|
|
146
|
+
|
|
147
|
+
Run these steps in **your Agent repository**, not in this Agent Tune Kit repository.
|
|
148
|
+
|
|
149
|
+
### 1. Generate a test runner
|
|
150
|
+
|
|
151
|
+
Run:
|
|
152
|
+
|
|
153
|
+
```text
|
|
154
|
+
$atk-init
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Point Codex to your Agent entrypoint and evaluation dataset. Codex generates:
|
|
158
|
+
|
|
159
|
+
```text
|
|
160
|
+
.atk/runner/eval_runner.py
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
The runner keeps your original dataset columns and adds the Agent's actual output as `agent_output`. It also adds `agent_output_log_path`; when trustworthy Python `logging` capture is configured, this column points to row-specific files such as `logs/row_000001.log` for serial or same-process concurrent runs.
|
|
164
|
+
|
|
165
|
+
`$atk-init` first snapshots the provided dataset into `.atk/datasets/`, and the generated runner reads that project-local copy. If a same-name snapshot already exists with identical content, it is reused; if the name exists with different content, ATK uses readable incrementing names such as `dataset_2.csv` and `dataset_3.csv`.
|
|
166
|
+
|
|
167
|
+
### 2. Run the Agent on the dataset
|
|
168
|
+
|
|
169
|
+
Run:
|
|
170
|
+
|
|
171
|
+
```text
|
|
172
|
+
$atk-run
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
This writes:
|
|
176
|
+
|
|
177
|
+
```text
|
|
178
|
+
.atk/results/v1/eval_results.csv
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
If row logging is active, the same version also contains `.atk/results/v1/logs/row_*.log`. Row logs are generated for configured same-process Python `logging` capture in serial runs and, when `CONCURRENT_ROW_LOGGING_ENABLED` remains enabled, with `--concurrency > 1`. The runner only writes records emitted while an ATK row context is active; stdout/stderr, subprocess, multiprocess, and post-row background logs remain out of scope. If concurrent row logging is disabled, concurrent runs visibly downgrade to `app.log`/CSV evidence instead of creating row logs.
|
|
182
|
+
|
|
183
|
+
### 3. Find failing cases
|
|
184
|
+
|
|
185
|
+
For the simplest path, let Codex judge which cases failed:
|
|
186
|
+
|
|
187
|
+
```text
|
|
188
|
+
$atk-find-failures
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
If you already have a clear rule, first create or update the reusable rule script:
|
|
192
|
+
|
|
193
|
+
```text
|
|
194
|
+
$atk-init-failure-rule rule: mark a row as failed when the expected field differs from agent_output
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Codex uses the rule you provide in the command to generate the rule script at:
|
|
198
|
+
|
|
199
|
+
```text
|
|
200
|
+
.atk/runner/failure_rule.py
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Then execute that rule script to write the failing cases:
|
|
204
|
+
|
|
205
|
+
```text
|
|
206
|
+
$atk-find-failures-by-rule
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
If `.atk/runner/failure_rule.py` is missing, `$atk-find-failures-by-rule` stops and tells you to run `$atk-init-failure-rule` first.
|
|
210
|
+
|
|
211
|
+
The failing cases are written to:
|
|
212
|
+
|
|
213
|
+
```text
|
|
214
|
+
.atk/results/v1/failure_cases.csv
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### 4. Generate the analysis report
|
|
218
|
+
|
|
219
|
+
Run:
|
|
220
|
+
|
|
221
|
+
```text
|
|
222
|
+
$atk-report
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Codex writes:
|
|
226
|
+
|
|
227
|
+
```text
|
|
228
|
+
.atk/results/v1/report.md
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
The report summarizes test results, failure cases, likely causes, and recommended tuning priorities.
|
|
232
|
+
|
|
233
|
+
### 5. Optionally review failures in HTML
|
|
234
|
+
|
|
235
|
+
Run:
|
|
236
|
+
|
|
237
|
+
```text
|
|
238
|
+
$atk-visualize-failures
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Codex writes:
|
|
242
|
+
|
|
243
|
+
```text
|
|
244
|
+
.atk/results/v1/failure_cases.html
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
This optional browser can run any time `failure_cases.csv` exists. If same-version `report.md` exists, it is used as best-effort, non-blocking context; missing or unparseable report context does not block the visualization. The Skill uses a fixed plugin-owned stdlib generator script, so output is deterministic and dependency-free while still offering expected-vs-actual review, search/filter/pagination, schema-adaptive role switching, and safe relative log links.
|
|
248
|
+
|
|
249
|
+
### 6. Let Codex tune the Agent
|
|
250
|
+
|
|
251
|
+
Run:
|
|
252
|
+
|
|
253
|
+
```text
|
|
254
|
+
$atk-tune
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
Codex edits the Agent based on the report and records the tuning plan in:
|
|
258
|
+
|
|
259
|
+
```text
|
|
260
|
+
.atk/results/v1/tuning_plan.md
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## Verify that tuning worked
|
|
264
|
+
|
|
265
|
+
After tuning, run the test again:
|
|
266
|
+
|
|
267
|
+
```text
|
|
268
|
+
$atk-run
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
This creates `.atk/results/v2/eval_results.csv`. Then run:
|
|
272
|
+
|
|
273
|
+
```text
|
|
274
|
+
$atk-find-failures
|
|
275
|
+
$atk-report
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
Starting with the second loop, the report reads the previous `tuning_plan.md` and tells you whether the target failures were resolved, partially resolved, unresolved, or impossible to judge.
|
|
279
|
+
|
|
280
|
+
## Expected output
|
|
281
|
+
|
|
282
|
+
```text
|
|
283
|
+
.atk/
|
|
284
|
+
├── datasets/
|
|
285
|
+
│ └── service_source_codes.csv
|
|
286
|
+
├── runner/
|
|
287
|
+
│ ├── eval_runner.py
|
|
288
|
+
│ └── failure_rule.py
|
|
289
|
+
└── results/
|
|
290
|
+
├── v1/
|
|
291
|
+
│ ├── eval_results.csv
|
|
292
|
+
│ ├── logs/ # optional row logs
|
|
293
|
+
│ │ └── row_000001.log
|
|
294
|
+
│ ├── failure_cases.csv
|
|
295
|
+
│ ├── failure_cases.html # optional failure browser
|
|
296
|
+
│ ├── report.md
|
|
297
|
+
│ └── tuning_plan.md
|
|
298
|
+
└── v2/
|
|
299
|
+
└── ...
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
Most users only need to read `eval_results.csv`, `failure_cases.csv`, optional `failure_cases.html`, `report.md`, and row logs linked from `agent_output_log_path` when available. Version directories are managed automatically.
|
|
303
|
+
|
|
304
|
+
## Available Skills
|
|
305
|
+
|
|
306
|
+
- `$atk-status`: inspect progress and recommend the next step.
|
|
307
|
+
- `$atk-init`: generate a test runner for the current Agent.
|
|
308
|
+
- `$atk-run`: run the test runner and create the current result version.
|
|
309
|
+
- `$atk-find-failures`: let Codex identify failing cases.
|
|
310
|
+
- `$atk-init-failure-rule`: create or update `.atk/runner/failure_rule.py`.
|
|
311
|
+
- `$atk-find-failures-by-rule`: execute `.atk/runner/failure_rule.py` to identify failing cases with explicit rules.
|
|
312
|
+
- `$atk-report`: generate analysis and cross-loop validation.
|
|
313
|
+
- `$atk-visualize-failures`: generate optional `.atk/results/vN/failure_cases.html` from current `failure_cases.csv`.
|
|
314
|
+
- `$atk-tune`: tune the Agent and record the tuning plan.
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
# Agent Tune Kit
|
|
2
|
+
|
|
3
|
+
English | [简体中文](README.md)
|
|
4
|
+
|
|
5
|
+
Agent Tune Kit is a **local Codex plugin** that helps you evaluate and improve the quality of your own local Agent.
|
|
6
|
+
|
|
7
|
+
If you already have a working Agent but are not sure where it fails, why it fails, or what to tune next, this project lets Codex help you run a complete loop: batch test the Agent, find failure cases, write an analysis report, tune the Agent, and verify the next run.
|
|
8
|
+
|
|
9
|
+
Its main advantage is a **low-friction start**. You do not need to design a complex evaluation schema or expose a universal Agent interface first. Bring a local Agent project and a small evaluation dataset; Codex reads the code and data samples, then generates the project-specific runner and tuning workflow.
|
|
10
|
+
|
|
11
|
+
## Who it is for
|
|
12
|
+
|
|
13
|
+
Use this if you have:
|
|
14
|
+
|
|
15
|
+
- a local Agent, chatbot, tool-using Agent, or RAG Agent;
|
|
16
|
+
- a few test questions, sample inputs, expected answers, or human-judgable results;
|
|
17
|
+
- a need to quickly find weak spots and let Codex help tune prompts, code, parameters, or tool configuration;
|
|
18
|
+
- a desire to keep each tuning loop traceable with result files and reports.
|
|
19
|
+
|
|
20
|
+
You do not need a full evaluation platform to start. For the first validation, 5 to 20 CSV rows are enough.
|
|
21
|
+
|
|
22
|
+
## Prerequisites
|
|
23
|
+
|
|
24
|
+
You only need:
|
|
25
|
+
|
|
26
|
+
- Codex with local plugin/Skill support.
|
|
27
|
+
- Python 3.11 or newer.
|
|
28
|
+
- A local Agent project that Codex can inspect and edit.
|
|
29
|
+
- A simple evaluation dataset, preferably CSV. Column names do not need to follow a strict schema; Codex will infer inputs and expected results where possible.
|
|
30
|
+
|
|
31
|
+
Create a git checkpoint before tuning if you want an easy rollback path. Agent Tune Kit does not automate Agent tuning rollback; installer rollback only restores local marketplace/plugin-store install state.
|
|
32
|
+
|
|
33
|
+
## Quickstart: install the plugin
|
|
34
|
+
|
|
35
|
+
No repository clone is needed for normal use. Run the packaged installer directly with uvx:
|
|
36
|
+
|
|
37
|
+
```sh
|
|
38
|
+
uvx --from agent-tune-kit atk install
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
For a persistent command, install the tool first, then run `atk`:
|
|
42
|
+
|
|
43
|
+
```sh
|
|
44
|
+
uv tool install agent-tune-kit
|
|
45
|
+
atk install
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
If you prefer pipx:
|
|
49
|
+
|
|
50
|
+
```sh
|
|
51
|
+
pipx install agent-tune-kit
|
|
52
|
+
atk install
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
The installer validates the packaged plugin manifest, adds the plugin to the Personal marketplace, writes or updates `~/.agents/plugins/marketplace.json`, copies the packaged payload into `~/plugins/agent-tune-kit`, and runs local smoke/status checks by default. It proves local files and marketplace state only; it does not bypass or modify hidden Codex UI enablement state.
|
|
56
|
+
|
|
57
|
+
Useful helper commands:
|
|
58
|
+
|
|
59
|
+
```sh
|
|
60
|
+
atk preview --smoke # preview only; no writes
|
|
61
|
+
atk status # read local install status and next steps
|
|
62
|
+
atk rollback --backup <backup-id> # restore installer-managed local install state only
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
When an existing marketplace/plugin-store conflict is found, interactive terminals prompt before replacement. Noninteractive replacement requires `--yes --force`; destructive replacement creates a backup first and prints a rollback command. The installer supports explicit subcommands only and does not keep old entry points; use `preview` for no-write preview.
|
|
66
|
+
|
|
67
|
+
Contributor checkout path, for editing this repository itself:
|
|
68
|
+
|
|
69
|
+
```sh
|
|
70
|
+
git clone git@github.com:hustyichi/agent-tune-kit.git
|
|
71
|
+
cd agent-tune-kit
|
|
72
|
+
uv sync
|
|
73
|
+
uv run atk install
|
|
74
|
+
# or: python3 scripts/install_plugin.py install
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
After install, Agent Tune Kit should be visible/available in `/plugins`.
|
|
78
|
+
|
|
79
|
+
You still need to enable it in Codex:
|
|
80
|
+
|
|
81
|
+
```text
|
|
82
|
+
/plugins
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Select `Agent Tune Kit` in the plugin list and follow the UI prompt to install/enable it. After you enable it in the UI, `$atk-status` and the other Skill commands should appear in autocomplete.
|
|
86
|
+
|
|
87
|
+
If the plugin is enabled in `/plugins` but `$atk-status` still does not appear in the current session, that is expected: Codex usually loads plugin Skills when a session starts, so newly enabled plugins may not be hot-loaded into an already running session. Restart Codex, or close the current Codex session and reopen this project, then type `$atk-status` again to verify.
|
|
88
|
+
|
|
89
|
+
If your environment cannot use local plugins, do not work around it by copying individual `skills/*` directories on their own; this repository now treats the local Codex plugin install path as the only recommended entry point.
|
|
90
|
+
|
|
91
|
+
## Maintainer release to PyPI
|
|
92
|
+
|
|
93
|
+
The release scripts follow the two-step release gate/publish shape used by `agent-tune-cli`: default mode is a dry run, and uploads only happen with an explicit `--publish`.
|
|
94
|
+
|
|
95
|
+
Run the full local release gate first. It checks version alignment, static validation, tests, `uv build --no-sources`, archive contents, and packaged `atk` smoke installs outside the repository:
|
|
96
|
+
|
|
97
|
+
```sh
|
|
98
|
+
UV_NO_CONFIG=1 uv run python scripts/check-release.py
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Prepare clean `dist/` artifacts without uploading:
|
|
102
|
+
|
|
103
|
+
```sh
|
|
104
|
+
UV_NO_CONFIG=1 uv run python scripts/publish-release.py
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Publish to TestPyPI first:
|
|
108
|
+
|
|
109
|
+
```sh
|
|
110
|
+
export UV_PUBLISH_TOKEN='pypi-your-testpypi-token'
|
|
111
|
+
UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository testpypi --publish
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
After TestPyPI install validation, publish to PyPI:
|
|
115
|
+
|
|
116
|
+
```sh
|
|
117
|
+
export UV_PUBLISH_TOKEN='pypi-your-pypi-token'
|
|
118
|
+
UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository pypi --publish
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
The publish script checks whether the current `project.name` + `project.version` already exists before uploading. If it exists, bump the version in `pyproject.toml`, `.codex-plugin/plugin.json`, and `src/agent_tune_kit/__init__.py` first. Never commit or paste PyPI tokens.
|
|
122
|
+
|
|
123
|
+
For the fixed production PyPI path, you can run the zero-argument wrapper:
|
|
124
|
+
|
|
125
|
+
```sh
|
|
126
|
+
scripts/publish-pypi.sh
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
It is equivalent to `UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository pypi --publish`, but checks that `UV_PUBLISH_TOKEN` is set first.
|
|
130
|
+
|
|
131
|
+
## Minimal tuning loop
|
|
132
|
+
|
|
133
|
+
Run these steps in **your Agent repository**, not in this Agent Tune Kit repository.
|
|
134
|
+
|
|
135
|
+
### 1. Generate a test runner
|
|
136
|
+
|
|
137
|
+
Run:
|
|
138
|
+
|
|
139
|
+
```text
|
|
140
|
+
$atk-init
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Point Codex to your Agent entrypoint and evaluation dataset. Codex generates:
|
|
144
|
+
|
|
145
|
+
```text
|
|
146
|
+
.atk/runner/eval_runner.py
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
The runner keeps your original dataset columns and adds the Agent's actual output as `agent_output`. It also adds `agent_output_log_path`; when trustworthy Python `logging` capture is configured, this column points to row-specific files such as `logs/row_000001.log` for serial or same-process concurrent runs.
|
|
150
|
+
|
|
151
|
+
`$atk-init` first snapshots the provided dataset into `.atk/datasets/`, and the generated runner reads that project-local copy. If a same-name snapshot already exists with identical content, it is reused; if the name exists with different content, ATK uses readable incrementing names such as `dataset_2.csv` and `dataset_3.csv`.
|
|
152
|
+
|
|
153
|
+
### 2. Run the Agent on the dataset
|
|
154
|
+
|
|
155
|
+
Run:
|
|
156
|
+
|
|
157
|
+
```text
|
|
158
|
+
$atk-run
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
This writes:
|
|
162
|
+
|
|
163
|
+
```text
|
|
164
|
+
.atk/results/v1/eval_results.csv
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
If row logging is active, the same version also contains `.atk/results/v1/logs/row_*.log`. Row logs are generated for configured same-process Python `logging` capture in serial runs and, when `CONCURRENT_ROW_LOGGING_ENABLED` remains enabled, with `--concurrency > 1`. The runner only writes records emitted while an ATK row context is active; stdout/stderr, subprocess, multiprocess, and post-row background logs remain out of scope. If concurrent row logging is disabled, concurrent runs visibly downgrade to `app.log`/CSV evidence instead of creating row logs.
|
|
168
|
+
|
|
169
|
+
### 3. Find failing cases
|
|
170
|
+
|
|
171
|
+
For the simplest path, let Codex judge which cases failed:
|
|
172
|
+
|
|
173
|
+
```text
|
|
174
|
+
$atk-find-failures
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
If you already have a clear rule, first create or update the reusable rule script:
|
|
178
|
+
|
|
179
|
+
```text
|
|
180
|
+
$atk-init-failure-rule rule: mark a row as failed when the expected field differs from agent_output
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Codex uses the rule you provide in the command to generate the rule script at:
|
|
184
|
+
|
|
185
|
+
```text
|
|
186
|
+
.atk/runner/failure_rule.py
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Then execute that rule script to write the failing cases:
|
|
190
|
+
|
|
191
|
+
```text
|
|
192
|
+
$atk-find-failures-by-rule
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
If `.atk/runner/failure_rule.py` is missing, `$atk-find-failures-by-rule` stops and tells you to run `$atk-init-failure-rule` first.
|
|
196
|
+
|
|
197
|
+
The failing cases are written to:
|
|
198
|
+
|
|
199
|
+
```text
|
|
200
|
+
.atk/results/v1/failure_cases.csv
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### 4. Generate the analysis report
|
|
204
|
+
|
|
205
|
+
Run:
|
|
206
|
+
|
|
207
|
+
```text
|
|
208
|
+
$atk-report
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Codex writes:
|
|
212
|
+
|
|
213
|
+
```text
|
|
214
|
+
.atk/results/v1/report.md
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
The report summarizes test results, failure cases, likely causes, and recommended tuning priorities.
|
|
218
|
+
|
|
219
|
+
### 5. Optionally review failures in HTML
|
|
220
|
+
|
|
221
|
+
Run:
|
|
222
|
+
|
|
223
|
+
```text
|
|
224
|
+
$atk-visualize-failures
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Codex writes:
|
|
228
|
+
|
|
229
|
+
```text
|
|
230
|
+
.atk/results/v1/failure_cases.html
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
This optional browser can run any time `failure_cases.csv` exists. If same-version `report.md` exists, it is used as best-effort, non-blocking context; missing or unparseable report context does not block the visualization. The Skill uses a fixed plugin-owned stdlib generator script, so output is deterministic and dependency-free while still offering expected-vs-actual review, search/filter/pagination, schema-adaptive role switching, and safe relative log links.
|
|
234
|
+
|
|
235
|
+
### 6. Let Codex tune the Agent
|
|
236
|
+
|
|
237
|
+
Run:
|
|
238
|
+
|
|
239
|
+
```text
|
|
240
|
+
$atk-tune
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
Codex edits the Agent based on the report and records the tuning plan in:
|
|
244
|
+
|
|
245
|
+
```text
|
|
246
|
+
.atk/results/v1/tuning_plan.md
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
## Verify that tuning worked
|
|
250
|
+
|
|
251
|
+
After tuning, run the test again:
|
|
252
|
+
|
|
253
|
+
```text
|
|
254
|
+
$atk-run
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
This creates `.atk/results/v2/eval_results.csv`. Then run:
|
|
258
|
+
|
|
259
|
+
```text
|
|
260
|
+
$atk-find-failures
|
|
261
|
+
$atk-report
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
Starting with the second loop, the report reads the previous `tuning_plan.md` and tells you whether the target failures were resolved, partially resolved, unresolved, or impossible to judge.
|
|
265
|
+
|
|
266
|
+
## Expected output
|
|
267
|
+
|
|
268
|
+
```text
|
|
269
|
+
.atk/
|
|
270
|
+
├── datasets/
|
|
271
|
+
│ └── service_source_codes.csv
|
|
272
|
+
├── runner/
|
|
273
|
+
│ ├── eval_runner.py
|
|
274
|
+
│ └── failure_rule.py
|
|
275
|
+
└── results/
|
|
276
|
+
├── v1/
|
|
277
|
+
│ ├── eval_results.csv
|
|
278
|
+
│ ├── logs/ # optional row logs
|
|
279
|
+
│ │ └── row_000001.log
|
|
280
|
+
│ ├── failure_cases.csv
|
|
281
|
+
│ ├── failure_cases.html # optional failure browser
|
|
282
|
+
│ ├── report.md
|
|
283
|
+
│ └── tuning_plan.md
|
|
284
|
+
└── v2/
|
|
285
|
+
└── ...
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
Most users only need to read `eval_results.csv`, `failure_cases.csv`, optional `failure_cases.html`, `report.md`, and row logs linked from `agent_output_log_path` when available. Version directories are managed automatically.
|
|
289
|
+
|
|
290
|
+
## Available Skills
|
|
291
|
+
|
|
292
|
+
- `$atk-status`: inspect progress and recommend the next step.
|
|
293
|
+
- `$atk-init`: generate a test runner for the current Agent.
|
|
294
|
+
- `$atk-run`: run the test runner and create the current result version.
|
|
295
|
+
- `$atk-find-failures`: let Codex identify failing cases.
|
|
296
|
+
- `$atk-init-failure-rule`: create or update `.atk/runner/failure_rule.py`.
|
|
297
|
+
- `$atk-find-failures-by-rule`: execute `.atk/runner/failure_rule.py` to identify failing cases with explicit rules.
|
|
298
|
+
- `$atk-report`: generate analysis and cross-loop validation.
|
|
299
|
+
- `$atk-visualize-failures`: generate optional `.atk/results/vN/failure_cases.html` from current `failure_cases.csv`.
|
|
300
|
+
- `$atk-tune`: tune the Agent and record the tuning plan.
|