agent-tune-kit 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. agent_tune_kit-0.3.6/.codex-plugin/plugin.json +41 -0
  2. agent_tune_kit-0.3.6/.gitignore +39 -0
  3. agent_tune_kit-0.3.6/PKG-INFO +314 -0
  4. agent_tune_kit-0.3.6/README.en.md +300 -0
  5. agent_tune_kit-0.3.6/README.md +302 -0
  6. agent_tune_kit-0.3.6/README.zh-CN.md +302 -0
  7. agent_tune_kit-0.3.6/docs/codex_agent_tuning_prd.md +305 -0
  8. agent_tune_kit-0.3.6/docs/shared-versioning-and-confirmation.md +167 -0
  9. agent_tune_kit-0.3.6/docs/skill-template-pack-usage.md +149 -0
  10. agent_tune_kit-0.3.6/pyproject.toml +75 -0
  11. agent_tune_kit-0.3.6/scripts/check-release.py +255 -0
  12. agent_tune_kit-0.3.6/scripts/install_plugin.py +28 -0
  13. agent_tune_kit-0.3.6/scripts/publish-pypi.sh +18 -0
  14. agent_tune_kit-0.3.6/scripts/publish-release.py +271 -0
  15. agent_tune_kit-0.3.6/scripts/validate_skill_pack.py +714 -0
  16. agent_tune_kit-0.3.6/skills/atk-find-failures/SKILL.md +81 -0
  17. agent_tune_kit-0.3.6/skills/atk-find-failures-by-rule/SKILL.md +83 -0
  18. agent_tune_kit-0.3.6/skills/atk-init/SKILL.md +127 -0
  19. agent_tune_kit-0.3.6/skills/atk-init-failure-rule/SKILL.md +83 -0
  20. agent_tune_kit-0.3.6/skills/atk-report/SKILL.md +114 -0
  21. agent_tune_kit-0.3.6/skills/atk-run/SKILL.md +102 -0
  22. agent_tune_kit-0.3.6/skills/atk-status/SKILL.md +87 -0
  23. agent_tune_kit-0.3.6/skills/atk-tune/SKILL.md +94 -0
  24. agent_tune_kit-0.3.6/skills/atk-visualize-failures/SKILL.md +93 -0
  25. agent_tune_kit-0.3.6/skills/atk-visualize-failures/assets/app.js +725 -0
  26. agent_tune_kit-0.3.6/skills/atk-visualize-failures/assets/page.html +106 -0
  27. agent_tune_kit-0.3.6/skills/atk-visualize-failures/assets/styles.css +548 -0
  28. agent_tune_kit-0.3.6/skills/atk-visualize-failures/scripts/generate_failure_browser.py +487 -0
  29. agent_tune_kit-0.3.6/src/agent_tune_kit/__init__.py +3 -0
  30. agent_tune_kit-0.3.6/src/agent_tune_kit/cli.py +15 -0
  31. agent_tune_kit-0.3.6/src/agent_tune_kit/installer.py +915 -0
  32. agent_tune_kit-0.3.6/templates/.atk/runner/eval_runner.py.md +508 -0
  33. agent_tune_kit-0.3.6/templates/.atk/runner/failure_rule.py.md +101 -0
  34. agent_tune_kit-0.3.6/templates/.gitkeep +0 -0
  35. agent_tune_kit-0.3.6/tests/__init__.py +0 -0
  36. agent_tune_kit-0.3.6/tests/test_generate_failure_browser.py +240 -0
  37. agent_tune_kit-0.3.6/tests/test_install_plugin.py +398 -0
  38. agent_tune_kit-0.3.6/tests/test_release_scripts.py +67 -0
  39. agent_tune_kit-0.3.6/tests/test_runner_template_row_logging.py +204 -0
@@ -0,0 +1,41 @@
1
+ {
2
+ "name": "agent-tune-kit",
3
+ "version": "0.3.6",
4
+ "description": "Local Codex plugin for iterative Agent tuning with guided Skills, reusable runner templates, versioned results, and static validation.",
5
+ "author": {
6
+ "name": "hustyichi",
7
+ "email": "hustyichi@163.com",
8
+ "url": "https://github.com/hustyichi"
9
+ },
10
+ "homepage": "https://github.com/hustyichi/agent-tune-kit",
11
+ "repository": "https://github.com/hustyichi/agent-tune-kit",
12
+ "license": "UNLICENSED",
13
+ "keywords": [
14
+ "agent-tune-kit",
15
+ "codex-skills",
16
+ "evaluation",
17
+ "reporting",
18
+ "local-plugin"
19
+ ],
20
+ "skills": "./skills/",
21
+ "interface": {
22
+ "displayName": "Agent Tune Kit",
23
+ "shortDescription": "Run a guided local Agent tuning loop",
24
+ "longDescription": "Agent Tune Kit provides Codex Skills for generating local test runners, finding failing cases, reporting regressions and improvements, and applying focused Agent tuning with versioned local artifacts.",
25
+ "developerName": "hustyichi",
26
+ "category": "Coding",
27
+ "capabilities": [
28
+ "Interactive",
29
+ "Read",
30
+ "Write"
31
+ ],
32
+ "websiteURL": "https://github.com/hustyichi/agent-tune-kit",
33
+ "defaultPrompt": [
34
+ "Check Agent tuning status",
35
+ "Find Agent failure cases",
36
+ "Generate an Agent tuning report"
37
+ ],
38
+ "brandColor": "#334155",
39
+ "screenshots": []
40
+ }
41
+ }
@@ -0,0 +1,39 @@
1
+ # macOS / editor noise
2
+ .DS_Store
3
+ *.swp
4
+ *.swo
5
+ *~
6
+
7
+ # Local runtime state
8
+ .omx/
9
+
10
+ # Environment and secrets
11
+ .env
12
+ .env.*
13
+ !.env.example
14
+
15
+ # Python caches and virtual environments
16
+ __pycache__/
17
+ *.py[cod]
18
+ *$py.class
19
+ .pytest_cache/
20
+ .ruff_cache/
21
+ .mypy_cache/
22
+ .coverage
23
+ htmlcov/
24
+ .venv/
25
+ venv/
26
+ env/
27
+
28
+ # Build and package output
29
+ build/
30
+ dist/
31
+ *.egg-info/
32
+
33
+ # Node dependencies and caches, if frontend/tooling is added later
34
+ node_modules/
35
+ .npm/
36
+ .pnpm-store/
37
+
38
+ # Agent tune kit generated outputs
39
+ .atk/
@@ -0,0 +1,314 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-tune-kit
3
+ Version: 0.3.6
4
+ Summary: Local Codex plugin for iterative Agent tuning with guided Skills, reusable runner templates, versioned results, and static validation.
5
+ Project-URL: Homepage, https://github.com/hustyichi/agent-tune-kit
6
+ Project-URL: Repository, https://github.com/hustyichi/agent-tune-kit
7
+ Author-email: hustyichi <hustyichi@163.com>
8
+ License: UNLICENSED
9
+ Keywords: agent-tune-kit,codex-skills,evaluation,local-plugin,reporting
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3 :: Only
12
+ Requires-Python: >=3.11
13
+ Description-Content-Type: text/markdown
14
+
15
+ # Agent Tune Kit
16
+
17
+ English | [简体中文](README.md)
18
+
19
+ Agent Tune Kit is a **local Codex plugin** that helps you evaluate and improve the quality of your own local Agent.
20
+
21
+ If you already have a working Agent but are not sure where it fails, why it fails, or what to tune next, this project lets Codex help you run a complete loop: batch test the Agent, find failure cases, write an analysis report, tune the Agent, and verify the next run.
22
+
23
+ Its main advantage is a **low-friction start**. You do not need to design a complex evaluation schema or expose a universal Agent interface first. Bring a local Agent project and a small evaluation dataset; Codex reads the code and data samples, then generates the project-specific runner and tuning workflow.
24
+
25
+ ## Who it is for
26
+
27
+ Use this if you have:
28
+
29
+ - a local Agent, chatbot, tool-using Agent, or RAG Agent;
30
+ - a few test questions, sample inputs, expected answers, or human-judgable results;
31
+ - a need to quickly find weak spots and let Codex help tune prompts, code, parameters, or tool configuration;
32
+ - a desire to keep each tuning loop traceable with result files and reports.
33
+
34
+ You do not need a full evaluation platform to start. For the first validation, 5 to 20 CSV rows are enough.
35
+
36
+ ## Prerequisites
37
+
38
+ You only need:
39
+
40
+ - Codex with local plugin/Skill support.
41
+ - Python 3.11 or newer (the package declares `Requires-Python: >=3.11`).
42
+ - A local Agent project that Codex can inspect and edit.
43
+ - A simple evaluation dataset, preferably CSV. Column names do not need to follow a strict schema; Codex will infer inputs and expected results where possible.
44
+
45
+ Create a git checkpoint before tuning if you want an easy rollback path. Agent Tune Kit does not automate Agent tuning rollback; installer rollback only restores local marketplace/plugin-store install state.
46
+
47
+ ## Quickstart: install the plugin
48
+
49
+ No repository clone is needed for normal use. Run the packaged installer directly with uvx:
50
+
51
+ ```sh
52
+ uvx --from agent-tune-kit atk install
53
+ ```
54
+
55
+ For a persistent command, install the tool first, then run `atk`:
56
+
57
+ ```sh
58
+ uv tool install agent-tune-kit
59
+ atk install
60
+ ```
61
+
62
+ If you prefer pipx:
63
+
64
+ ```sh
65
+ pipx install agent-tune-kit
66
+ atk install
67
+ ```
68
+
69
+ The installer validates the packaged plugin manifest, adds the plugin to the Personal marketplace, writes or updates `~/.agents/plugins/marketplace.json`, copies the packaged payload into `~/plugins/agent-tune-kit`, and runs local smoke/status checks by default. It proves local files and marketplace state only; it does not bypass or modify hidden Codex UI enablement state.
70
+
71
+ Useful helper commands:
72
+
73
+ ```sh
74
+ atk preview --smoke # preview only; no writes
75
+ atk status # read local install status and next steps
76
+ atk rollback --backup <backup-id> # restore installer-managed local install state only
77
+ ```
78
+
79
+ When an existing marketplace/plugin-store conflict is found, interactive terminals prompt before replacement. Noninteractive replacement requires `--yes --force`; destructive replacement creates a backup first and prints a rollback command. The installer supports explicit subcommands only and does not keep old entry points; use `preview` for no-write preview.
80
+
81
+ Contributor checkout path, for editing this repository itself:
82
+
83
+ ```sh
84
+ git clone git@github.com:hustyichi/agent-tune-kit.git
85
+ cd agent-tune-kit
86
+ uv sync
87
+ uv run atk install
88
+ # or: python3 scripts/install_plugin.py install
89
+ ```
90
+
91
+ After install, Agent Tune Kit should be visible/available in `/plugins`.
92
+
93
+ You still need to enable it in Codex:
94
+
95
+ ```text
96
+ /plugins
97
+ ```
98
+
99
+ Select `Agent Tune Kit` in the plugin list and follow the UI prompt to install/enable it. After you enable it in the UI, `$atk-status` and the other Skill commands should appear in autocomplete.
100
+
101
+ If the plugin is enabled in `/plugins` but `$atk-status` still does not appear in the current session, that is expected: Codex usually loads plugin Skills when a session starts, so newly enabled plugins may not be hot-loaded into an already running session. Restart Codex, or close the current Codex session and reopen this project, then type `$atk-status` again to verify.
102
+
103
+ If your environment cannot use local plugins, do not copy individual `skills/*` directories piecemeal as a workaround; this repository now treats the local Codex plugin install path as the only recommended entry point.
104
+
105
+ ## Maintainer release to PyPI
106
+
107
+ The release scripts follow the two-step release gate/publish shape used by `agent-tune-cli`: default mode is a dry run, and uploads only happen with an explicit `--publish`.
108
+
109
+ Run the full local release gate first. It checks version alignment, static validation, tests, `uv build --no-sources`, archive contents, and packaged `atk` smoke installs outside the repository:
110
+
111
+ ```sh
112
+ UV_NO_CONFIG=1 uv run python scripts/check-release.py
113
+ ```
114
+
115
+ Prepare clean `dist/` artifacts without uploading:
116
+
117
+ ```sh
118
+ UV_NO_CONFIG=1 uv run python scripts/publish-release.py
119
+ ```
120
+
121
+ Publish to TestPyPI first:
122
+
123
+ ```sh
124
+ export UV_PUBLISH_TOKEN='pypi-your-testpypi-token'
125
+ UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository testpypi --publish
126
+ ```
127
+
128
+ After TestPyPI install validation, publish to PyPI:
129
+
130
+ ```sh
131
+ export UV_PUBLISH_TOKEN='pypi-your-pypi-token'
132
+ UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository pypi --publish
133
+ ```
134
+
135
+ The publish script checks whether the current `project.name` + `project.version` already exists before uploading. If it exists, bump the version in `pyproject.toml`, `.codex-plugin/plugin.json`, and `src/agent_tune_kit/__init__.py` first. Never commit or paste PyPI tokens.
136
+
137
+ For the fixed production PyPI path, you can run the zero-argument wrapper:
138
+
139
+ ```sh
140
+ scripts/publish-pypi.sh
141
+ ```
142
+
143
+ It is equivalent to `UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository pypi --publish`, but checks that `UV_PUBLISH_TOKEN` is set first.
144
+
145
+ ## Minimal tuning loop
146
+
147
+ Run these steps in **your Agent repository**, not in this Agent Tune Kit repository.
148
+
149
+ ### 1. Generate a test runner
150
+
151
+ Run:
152
+
153
+ ```text
154
+ $atk-init
155
+ ```
156
+
157
+ Point Codex to your Agent entrypoint and evaluation dataset. Codex generates:
158
+
159
+ ```text
160
+ .atk/runner/eval_runner.py
161
+ ```
162
+
163
+ The runner keeps your original dataset columns and adds the Agent's actual output as `agent_output`. It also adds `agent_output_log_path`; when trustworthy Python `logging` capture is configured, this column points to row-specific files such as `logs/row_000001.log` for serial or same-process concurrent runs.
164
+
165
+ `$atk-init` first snapshots the provided dataset into `.atk/datasets/`, and the generated runner reads that project-local copy. If a same-name snapshot already exists with identical content, it is reused; if the name exists with different content, ATK uses readable incrementing names such as `dataset_2.csv` and `dataset_3.csv`.
166
+
167
+ ### 2. Run the Agent on the dataset
168
+
169
+ Run:
170
+
171
+ ```text
172
+ $atk-run
173
+ ```
174
+
175
+ This writes:
176
+
177
+ ```text
178
+ .atk/results/v1/eval_results.csv
179
+ ```
180
+
181
+ If row logging is active, the same version also contains `.atk/results/v1/logs/row_*.log`. Row logs are generated when same-process Python `logging` capture is configured: always in serial runs, and in concurrent runs (`--concurrency > 1`) as long as `CONCURRENT_ROW_LOGGING_ENABLED` remains enabled. The runner only writes records emitted while an ATK row context is active; stdout/stderr, subprocess, multiprocess, and post-row background logs remain out of scope. If concurrent row logging is disabled, concurrent runs visibly downgrade to `app.log`/CSV evidence instead of creating row logs.
182
+
183
+ ### 3. Find failing cases
184
+
185
+ For the simplest path, let Codex judge which cases failed:
186
+
187
+ ```text
188
+ $atk-find-failures
189
+ ```
190
+
191
+ If you already have a clear rule, first create or update the reusable rule script:
192
+
193
+ ```text
194
+ $atk-init-failure-rule rule: mark a row as failed when the expected field differs from agent_output
195
+ ```
196
+
197
+ Codex uses the rule you provide in the command to generate the rule script at:
198
+
199
+ ```text
200
+ .atk/runner/failure_rule.py
201
+ ```
202
+
203
+ Then execute that rule script to write the failing cases:
204
+
205
+ ```text
206
+ $atk-find-failures-by-rule
207
+ ```
208
+
209
+ If `.atk/runner/failure_rule.py` is missing, `$atk-find-failures-by-rule` stops and tells you to run `$atk-init-failure-rule` first.
210
+
211
+ The failing cases are written to:
212
+
213
+ ```text
214
+ .atk/results/v1/failure_cases.csv
215
+ ```
216
+
217
+ ### 4. Generate the analysis report
218
+
219
+ Run:
220
+
221
+ ```text
222
+ $atk-report
223
+ ```
224
+
225
+ Codex writes:
226
+
227
+ ```text
228
+ .atk/results/v1/report.md
229
+ ```
230
+
231
+ The report summarizes test results, failure cases, likely causes, and recommended tuning priorities.
232
+
233
+ ### 5. Optionally review failures in HTML
234
+
235
+ Run:
236
+
237
+ ```text
238
+ $atk-visualize-failures
239
+ ```
240
+
241
+ Codex writes:
242
+
243
+ ```text
244
+ .atk/results/v1/failure_cases.html
245
+ ```
246
+
247
+ This optional browser can run any time `failure_cases.csv` exists. If same-version `report.md` exists, it is used as best-effort, non-blocking context; missing or unparseable report context does not block the visualization. The Skill uses a fixed plugin-owned stdlib generator script, so output is deterministic and dependency-free while still offering expected-vs-actual review, search/filter/pagination, schema-adaptive role switching, and safe relative log links.
248
+
249
+ ### 6. Let Codex tune the Agent
250
+
251
+ Run:
252
+
253
+ ```text
254
+ $atk-tune
255
+ ```
256
+
257
+ Codex edits the Agent based on the report and records the tuning plan in:
258
+
259
+ ```text
260
+ .atk/results/v1/tuning_plan.md
261
+ ```
262
+
263
+ ## Verify that tuning worked
264
+
265
+ After tuning, run the test again:
266
+
267
+ ```text
268
+ $atk-run
269
+ ```
270
+
271
+ This creates `.atk/results/v2/eval_results.csv`. Then run:
272
+
273
+ ```text
274
+ $atk-find-failures
275
+ $atk-report
276
+ ```
277
+
278
+ Starting with the second loop, the report reads the previous `tuning_plan.md` and tells you whether the target failures were resolved, partially resolved, unresolved, or impossible to judge.
279
+
280
+ ## Expected output
281
+
282
+ ```text
283
+ .atk/
284
+ ├── datasets/
285
+ │ └── service_source_codes.csv
286
+ ├── runner/
287
+ │ ├── eval_runner.py
288
+ │ └── failure_rule.py
289
+ └── results/
290
+ ├── v1/
291
+ │ ├── eval_results.csv
292
+ │ ├── logs/ # optional row logs
293
+ │ │ └── row_000001.log
294
+ │ ├── failure_cases.csv
295
+ │ ├── failure_cases.html # optional failure browser
296
+ │ ├── report.md
297
+ │ └── tuning_plan.md
298
+ └── v2/
299
+ └── ...
300
+ ```
301
+
302
+ Most users only need to read `eval_results.csv`, `failure_cases.csv`, optional `failure_cases.html`, `report.md`, and row logs linked from `agent_output_log_path` when available. Version directories are managed automatically.
303
+
304
+ ## Available Skills
305
+
306
+ - `$atk-status`: inspect progress and recommend the next step.
307
+ - `$atk-init`: generate a test runner for the current Agent.
308
+ - `$atk-run`: run the test runner and create the current result version.
309
+ - `$atk-find-failures`: let Codex identify failing cases.
310
+ - `$atk-init-failure-rule`: create or update `.atk/runner/failure_rule.py`.
311
+ - `$atk-find-failures-by-rule`: execute `.atk/runner/failure_rule.py` to identify failing cases with explicit rules.
312
+ - `$atk-report`: generate analysis and cross-loop validation.
313
+ - `$atk-visualize-failures`: generate optional `.atk/results/vN/failure_cases.html` from current `failure_cases.csv`.
314
+ - `$atk-tune`: tune the Agent and record the tuning plan.
@@ -0,0 +1,300 @@
1
+ # Agent Tune Kit
2
+
3
+ English | [简体中文](README.md)
4
+
5
+ Agent Tune Kit is a **local Codex plugin** that helps you evaluate and improve the quality of your own local Agent.
6
+
7
+ If you already have a working Agent but are not sure where it fails, why it fails, or what to tune next, this project lets Codex help you run a complete loop: batch test the Agent, find failure cases, write an analysis report, tune the Agent, and verify the next run.
8
+
9
+ Its main advantage is a **low-friction start**. You do not need to design a complex evaluation schema or expose a universal Agent interface first. Bring a local Agent project and a small evaluation dataset; Codex reads the code and data samples, then generates the project-specific runner and tuning workflow.
10
+
11
+ ## Who it is for
12
+
13
+ Use this if you have:
14
+
15
+ - a local Agent, chatbot, tool-using Agent, or RAG Agent;
16
+ - a few test questions, sample inputs, expected answers, or human-judgable results;
17
+ - a need to quickly find weak spots and let Codex help tune prompts, code, parameters, or tool configuration;
18
+ - a desire to keep each tuning loop traceable with result files and reports.
19
+
20
+ You do not need a full evaluation platform to start. For the first validation, 5 to 20 CSV rows are enough.
21
+
22
+ ## Prerequisites
23
+
24
+ You only need:
25
+
26
+ - Codex with local plugin/Skill support.
27
+ - Python 3.11 or newer (the package declares `Requires-Python: >=3.11`).
28
+ - A local Agent project that Codex can inspect and edit.
29
+ - A simple evaluation dataset, preferably CSV. Column names do not need to follow a strict schema; Codex will infer inputs and expected results where possible.
30
+
31
+ Create a git checkpoint before tuning if you want an easy rollback path. Agent Tune Kit does not automate Agent tuning rollback; installer rollback only restores local marketplace/plugin-store install state.
32
+
33
+ ## Quickstart: install the plugin
34
+
35
+ No repository clone is needed for normal use. Run the packaged installer directly with uvx:
36
+
37
+ ```sh
38
+ uvx --from agent-tune-kit atk install
39
+ ```
40
+
41
+ For a persistent command, install the tool first, then run `atk`:
42
+
43
+ ```sh
44
+ uv tool install agent-tune-kit
45
+ atk install
46
+ ```
47
+
48
+ If you prefer pipx:
49
+
50
+ ```sh
51
+ pipx install agent-tune-kit
52
+ atk install
53
+ ```
54
+
55
+ The installer validates the packaged plugin manifest, adds the plugin to the Personal marketplace, writes or updates `~/.agents/plugins/marketplace.json`, copies the packaged payload into `~/plugins/agent-tune-kit`, and runs local smoke/status checks by default. It proves local files and marketplace state only; it does not bypass or modify hidden Codex UI enablement state.
56
+
57
+ Useful helper commands:
58
+
59
+ ```sh
60
+ atk preview --smoke # preview only; no writes
61
+ atk status # read local install status and next steps
62
+ atk rollback --backup <backup-id> # restore installer-managed local install state only
63
+ ```
64
+
65
+ When an existing marketplace/plugin-store conflict is found, interactive terminals prompt before replacement. Noninteractive replacement requires `--yes --force`; destructive replacement creates a backup first and prints a rollback command. The installer supports explicit subcommands only and does not keep old entry points; use `preview` for no-write preview.
66
+
67
+ Contributor checkout path, for editing this repository itself:
68
+
69
+ ```sh
70
+ git clone git@github.com:hustyichi/agent-tune-kit.git
71
+ cd agent-tune-kit
72
+ uv sync
73
+ uv run atk install
74
+ # or: python3 scripts/install_plugin.py install
75
+ ```
76
+
77
+ After install, Agent Tune Kit should be visible/available in `/plugins`.
78
+
79
+ You still need to enable it in Codex:
80
+
81
+ ```text
82
+ /plugins
83
+ ```
84
+
85
+ Select `Agent Tune Kit` in the plugin list and follow the UI prompt to install/enable it. After you enable it in the UI, `$atk-status` and the other Skill commands should appear in autocomplete.
86
+
87
+ If the plugin is enabled in `/plugins` but `$atk-status` still does not appear in the current session, that is expected: Codex usually loads plugin Skills when a session starts, so newly enabled plugins may not be hot-loaded into an already running session. Restart Codex, or close the current Codex session and reopen this project, then type `$atk-status` again to verify.
88
+
89
+ If your environment cannot use local plugins, do not copy individual `skills/*` directories piecemeal as a workaround; this repository now treats the local Codex plugin install path as the only recommended entry point.
90
+
91
+ ## Maintainer release to PyPI
92
+
93
+ The release scripts follow the two-step release gate/publish shape used by `agent-tune-cli`: default mode is a dry run, and uploads only happen with an explicit `--publish`.
94
+
95
+ Run the full local release gate first. It checks version alignment, static validation, tests, `uv build --no-sources`, archive contents, and packaged `atk` smoke installs outside the repository:
96
+
97
+ ```sh
98
+ UV_NO_CONFIG=1 uv run python scripts/check-release.py
99
+ ```
100
+
101
+ Prepare clean `dist/` artifacts without uploading:
102
+
103
+ ```sh
104
+ UV_NO_CONFIG=1 uv run python scripts/publish-release.py
105
+ ```
106
+
107
+ Publish to TestPyPI first:
108
+
109
+ ```sh
110
+ export UV_PUBLISH_TOKEN='pypi-your-testpypi-token'
111
+ UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository testpypi --publish
112
+ ```
113
+
114
+ After TestPyPI install validation, publish to PyPI:
115
+
116
+ ```sh
117
+ export UV_PUBLISH_TOKEN='pypi-your-pypi-token'
118
+ UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository pypi --publish
119
+ ```
120
+
121
+ The publish script checks whether the current `project.name` + `project.version` already exists before uploading. If it exists, bump the version in `pyproject.toml`, `.codex-plugin/plugin.json`, and `src/agent_tune_kit/__init__.py` first. Never commit or paste PyPI tokens.
122
+
123
+ For the fixed production PyPI path, you can run the zero-argument wrapper:
124
+
125
+ ```sh
126
+ scripts/publish-pypi.sh
127
+ ```
128
+
129
+ It is equivalent to `UV_NO_CONFIG=1 uv run python scripts/publish-release.py --repository pypi --publish`, but checks that `UV_PUBLISH_TOKEN` is set first.
130
+
131
+ ## Minimal tuning loop
132
+
133
+ Run these steps in **your Agent repository**, not in this Agent Tune Kit repository.
134
+
135
+ ### 1. Generate a test runner
136
+
137
+ Run:
138
+
139
+ ```text
140
+ $atk-init
141
+ ```
142
+
143
+ Point Codex to your Agent entrypoint and evaluation dataset. Codex generates:
144
+
145
+ ```text
146
+ .atk/runner/eval_runner.py
147
+ ```
148
+
149
+ The runner keeps your original dataset columns and adds the Agent's actual output as `agent_output`. It also adds `agent_output_log_path`; when trustworthy Python `logging` capture is configured, this column points to row-specific files such as `logs/row_000001.log` for serial or same-process concurrent runs.
150
+
151
+ `$atk-init` first snapshots the provided dataset into `.atk/datasets/`, and the generated runner reads that project-local copy. If a same-name snapshot already exists with identical content, it is reused; if the name exists with different content, ATK uses readable incrementing names such as `dataset_2.csv` and `dataset_3.csv`.
152
+
153
+ ### 2. Run the Agent on the dataset
154
+
155
+ Run:
156
+
157
+ ```text
158
+ $atk-run
159
+ ```
160
+
161
+ This writes:
162
+
163
+ ```text
164
+ .atk/results/v1/eval_results.csv
165
+ ```
166
+
167
+ If row logging is active, the same version also contains `.atk/results/v1/logs/row_*.log`. Row logs are generated when same-process Python `logging` capture is configured: always in serial runs, and in concurrent runs (`--concurrency > 1`) as long as `CONCURRENT_ROW_LOGGING_ENABLED` remains enabled. The runner only writes records emitted while an ATK row context is active; stdout/stderr, subprocess, multiprocess, and post-row background logs remain out of scope. If concurrent row logging is disabled, concurrent runs visibly downgrade to `app.log`/CSV evidence instead of creating row logs.
168
+
169
+ ### 3. Find failing cases
170
+
171
+ For the simplest path, let Codex judge which cases failed:
172
+
173
+ ```text
174
+ $atk-find-failures
175
+ ```
176
+
177
+ If you already have a clear rule, first create or update the reusable rule script:
178
+
179
+ ```text
180
+ $atk-init-failure-rule rule: mark a row as failed when the expected field differs from agent_output
181
+ ```
182
+
183
+ Codex uses the rule you provide in the command to generate the rule script at:
184
+
185
+ ```text
186
+ .atk/runner/failure_rule.py
187
+ ```
188
+
189
+ Then execute that rule script to write the failing cases:
190
+
191
+ ```text
192
+ $atk-find-failures-by-rule
193
+ ```
194
+
195
+ If `.atk/runner/failure_rule.py` is missing, `$atk-find-failures-by-rule` stops and tells you to run `$atk-init-failure-rule` first.
196
+
197
+ The failing cases are written to:
198
+
199
+ ```text
200
+ .atk/results/v1/failure_cases.csv
201
+ ```
202
+
203
+ ### 4. Generate the analysis report
204
+
205
+ Run:
206
+
207
+ ```text
208
+ $atk-report
209
+ ```
210
+
211
+ Codex writes:
212
+
213
+ ```text
214
+ .atk/results/v1/report.md
215
+ ```
216
+
217
+ The report summarizes test results, failure cases, likely causes, and recommended tuning priorities.
218
+
219
+ ### 5. Optionally review failures in HTML
220
+
221
+ Run:
222
+
223
+ ```text
224
+ $atk-visualize-failures
225
+ ```
226
+
227
+ Codex writes:
228
+
229
+ ```text
230
+ .atk/results/v1/failure_cases.html
231
+ ```
232
+
233
+ This optional browser can run any time `failure_cases.csv` exists. If same-version `report.md` exists, it is used as best-effort, non-blocking context; missing or unparseable report context does not block the visualization. The Skill uses a fixed plugin-owned stdlib generator script, so output is deterministic and dependency-free while still offering expected-vs-actual review, search/filter/pagination, schema-adaptive role switching, and safe relative log links.
234
+
235
+ ### 6. Let Codex tune the Agent
236
+
237
+ Run:
238
+
239
+ ```text
240
+ $atk-tune
241
+ ```
242
+
243
+ Codex edits the Agent based on the report and records the tuning plan in:
244
+
245
+ ```text
246
+ .atk/results/v1/tuning_plan.md
247
+ ```
248
+
249
+ ## Verify that tuning worked
250
+
251
+ After tuning, run the test again:
252
+
253
+ ```text
254
+ $atk-run
255
+ ```
256
+
257
+ This creates `.atk/results/v2/eval_results.csv`. Then run:
258
+
259
+ ```text
260
+ $atk-find-failures
261
+ $atk-report
262
+ ```
263
+
264
+ Starting with the second loop, the report reads the previous `tuning_plan.md` and tells you whether the target failures were resolved, partially resolved, unresolved, or impossible to judge.
265
+
266
+ ## Expected output
267
+
268
+ ```text
269
+ .atk/
270
+ ├── datasets/
271
+ │ └── service_source_codes.csv
272
+ ├── runner/
273
+ │ ├── eval_runner.py
274
+ │ └── failure_rule.py
275
+ └── results/
276
+ ├── v1/
277
+ │ ├── eval_results.csv
278
+ │ ├── logs/ # optional row logs
279
+ │ │ └── row_000001.log
280
+ │ ├── failure_cases.csv
281
+ │ ├── failure_cases.html # optional failure browser
282
+ │ ├── report.md
283
+ │ └── tuning_plan.md
284
+ └── v2/
285
+ └── ...
286
+ ```
287
+
288
+ Most users only need to read `eval_results.csv`, `failure_cases.csv`, optional `failure_cases.html`, `report.md`, and row logs linked from `agent_output_log_path` when available. Version directories are managed automatically.
289
+
290
+ ## Available Skills
291
+
292
+ - `$atk-status`: inspect progress and recommend the next step.
293
+ - `$atk-init`: generate a test runner for the current Agent.
294
+ - `$atk-run`: run the test runner and create the current result version.
295
+ - `$atk-find-failures`: let Codex identify failing cases.
296
+ - `$atk-init-failure-rule`: create or update `.atk/runner/failure_rule.py`.
297
+ - `$atk-find-failures-by-rule`: execute `.atk/runner/failure_rule.py` to identify failing cases with explicit rules.
298
+ - `$atk-report`: generate analysis and cross-loop validation.
299
+ - `$atk-visualize-failures`: generate optional `.atk/results/vN/failure_cases.html` from current `failure_cases.csv`.
300
+ - `$atk-tune`: tune the Agent and record the tuning plan.