agentpack-cli 0.3.9__tar.gz → 0.3.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/PKG-INFO +80 -2
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/README.md +79 -1
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/pyproject.toml +1 -1
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/__init__.py +1 -1
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/cli.py +2 -0
- agentpack_cli-0.3.10/src/agentpack/commands/eval_cmd.py +264 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/tune.py +24 -0
- agentpack_cli-0.3.10/src/agentpack/core/evals.py +939 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/.gitignore +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/LICENSE +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/antigravity.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/base.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/claude.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/codex.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/cursor.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/detect.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/generic.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/windsurf.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/dependency_graph.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/go_imports.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/java_imports.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/js_ts_imports.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/monorepo.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/naming_signals.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/python_imports.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/ranking.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/repo_map.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/role_inference.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/rust_imports.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/symbols.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/task_classifier.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/tests.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/application/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/application/pack_service.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/_shared.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/benchmark.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/claude_cmd.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/diff.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/doctor.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/explain.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/guard.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/hook_cmd.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/ignore_cmd.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/init.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/install.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/mcp_cmd.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/migrate.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/monitor.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/pack.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/quickstart.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/repair.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/scan.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/stats.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/status.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/summarize.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/watch.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/bootstrap.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/cache.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/config.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/context_pack.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/diff.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/git.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/git_hooks.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/global_install.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/ignore.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/merkle.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/models.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/redactor.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/scanner.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/snapshot.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/task_freshness.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/token_estimator.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/vscode_tasks.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/data/agentpack.md +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/antigravity.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/claude.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/codex.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/cursor.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/windsurf.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/agents.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/git_hooks.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/global_install.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/platform.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/vscode_tasks.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/mcp_server.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/renderers/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/renderers/compact.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/renderers/markdown.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/renderers/receipts.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/session/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/session/state.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/summaries/__init__.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/summaries/base.py +0 -0
- {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/summaries/offline.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agentpack-cli
|
|
3
|
-
Version: 0.3.9
|
|
3
|
+
Version: 0.3.10
|
|
4
4
|
Summary: Local context engine for AI coding agents that ranks relevant files and builds task-focused context packs.
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -40,13 +40,14 @@ Description-Content-Type: text/markdown
|
|
|
40
40
|
# AgentPack
|
|
41
41
|
|
|
42
42
|
[](https://pypi.org/project/agentpack-cli/)
|
|
43
|
+
[](https://pepy.tech/projects/agentpack-cli)
|
|
43
44
|
[](https://www.npmjs.com/package/@vishal2612200/agentpack)
|
|
44
45
|
[](https://www.npmjs.com/package/@vishal2612200/agentpack)
|
|
45
46
|
[](https://pypi.org/project/agentpack-cli/)
|
|
46
47
|
[](https://opensource.org/licenses/MIT)
|
|
47
48
|
[](https://github.com/vishal2612200/agentpack/actions/workflows/ci.yml)
|
|
48
49
|
|
|
49
|
-
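> **Status: alpha (v0.3.9).** Works, tested, used in real sessions. Python and JavaScript/TypeScript are the best-supported languages. Public benchmark proof exists for the current suite, but broader repo coverage is still growing. API may change before 1.0.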
|
|
50
|
+
> **Status: alpha (v0.3.10).** Works, tested, used in real sessions. Python and JavaScript/TypeScript are the best-supported languages. Public benchmark proof exists for the current suite, but broader repo coverage is still growing. API may change before 1.0.
|
|
50
51
|
>
|
|
51
52
|
> **Platform note:** macOS, Linux, and Windows are supported. Windows support targets PowerShell plus Git for Windows. `cmd.exe` and bare Git setups are not a supported path yet.
|
|
52
53
|
|
|
@@ -601,6 +602,7 @@ Command map:
|
|
|
601
602
|
| `agentpack doctor` | Audit hooks, agent files, CLI path, and repo health |
|
|
602
603
|
| `agentpack explain` | Understand why a file was selected or omitted |
|
|
603
604
|
| `agentpack benchmark` | Measure recall, precision, and misses against real tasks |
|
|
605
|
+
| `agentpack eval` | Run deterministic failure evals with tests, diff limits, and taxonomy labels |
|
|
604
606
|
| `agentpack tune` | Suggest fixes from recent pack metrics and benchmark misses |
|
|
605
607
|
| `agentpack status` | Inspect current pack freshness and metadata |
|
|
606
608
|
| `agentpack diff` | Show what changed between context snapshots |
|
|
@@ -1211,6 +1213,82 @@ This command does not pretend a pack is correct. It gives the next thing to insp
|
|
|
1211
1213
|
|
|
1212
1214
|
---
|
|
1213
1215
|
|
|
1216
|
+
### `agentpack eval`
|
|
1217
|
+
|
|
1218
|
+
Run deterministic failure evals. AgentPack does not run the coding agent and
|
|
1219
|
+
does not use an LLM judge; it verifies the current or replayed worktree with
|
|
1220
|
+
commands and diff policies.
|
|
1221
|
+
|
|
1222
|
+
```bash
|
|
1223
|
+
agentpack eval --init
|
|
1224
|
+
# edit .agentpack/evals.toml with real failures and checks
|
|
1225
|
+
agentpack eval
|
|
1226
|
+
agentpack eval --case auth-timeout --prove-targets
|
|
1227
|
+
agentpack eval --capture auth-timeout --failure-class context --check "pytest tests/test_auth.py -q"
|
|
1228
|
+
agentpack eval --watch --until-pass
|
|
1229
|
+
agentpack eval --replay --prove-targets
|
|
1230
|
+
agentpack eval --variant baseline
|
|
1231
|
+
agentpack eval --variant agentpack
|
|
1232
|
+
agentpack eval --compare-variants baseline:agentpack
|
|
1233
|
+
agentpack eval --ci-template
|
|
1234
|
+
agentpack eval --report
|
|
1235
|
+
```
|
|
1236
|
+
|
|
1237
|
+
Example case:
|
|
1238
|
+
|
|
1239
|
+
```toml
|
|
1240
|
+
[[cases]]
|
|
1241
|
+
id = "auth-timeout"
|
|
1242
|
+
task = "fix auth token timeout"
|
|
1243
|
+
failure_class = "context"
|
|
1244
|
+
failure_source = "agent_failed"
|
|
1245
|
+
base_ref = "HEAD"
|
|
1246
|
+
patch_file = ".agentpack/evals/auth-timeout.patch"
|
|
1247
|
+
required_changed_files = ["src/auth/token.py"]
|
|
1248
|
+
forbidden_changed_files = ["src/db/**"]
|
|
1249
|
+
max_changed_files = 5
|
|
1250
|
+
max_changed_lines = 250
|
|
1251
|
+
agent = "codex"
|
|
1252
|
+
context_file = ".agentpack/context.md"
|
|
1253
|
+
context_hash = "..."
|
|
1254
|
+
selected_files = ["src/auth/token.py", "tests/test_auth.py"]
|
|
1255
|
+
|
|
1256
|
+
[[cases.checks]]
|
|
1257
|
+
name = "tests"
|
|
1258
|
+
command = "pytest tests/test_auth.py -q"
|
|
1259
|
+
timeout_s = 120
|
|
1260
|
+
retries = 1 # optional, marks pass-after-fail checks as flaky
|
|
1261
|
+
```
|
|
1262
|
+
|
|
1263
|
+
Use `eval` after an agent run: capture the real failure, add deterministic
|
|
1264
|
+
checks such as tests, typecheck, lint, schema validation, API contract tests,
|
|
1265
|
+
diff size, forbidden files, or golden outputs, then rerun until the harness
|
|
1266
|
+
passes. The model can propose; the harness must verify.
|
|
1267
|
+
|
|
1268
|
+
For hands-free local iteration, keep `agentpack eval --watch --until-pass`
|
|
1269
|
+
running in a terminal while the agent or developer edits. It reruns when the
|
|
1270
|
+
case file, patch artifacts, golden files, or git diff content changes and stops
|
|
1271
|
+
when all deterministic checks pass. `--capture` stores the current patch under
|
|
1272
|
+
`.agentpack/evals/<case-id>.patch` plus context metadata; `--replay` checks out
|
|
1273
|
+
`base_ref` into an isolated git worktree, applies that patch, and runs the same
|
|
1274
|
+
deterministic checks there. To measure AgentPack's contribution, run the same
|
|
1275
|
+
case with `--variant baseline` and then with `--variant agentpack`;
|
|
1276
|
+
`--compare-variants baseline:agentpack` reports which cases improved, regressed,
|
|
1277
|
+
stayed unchanged, or still need both sides. Use `--ci-template` to scaffold a
|
|
1278
|
+
GitHub Actions workflow for `benchmarks/evals.toml`.
|
|
1279
|
+
|
|
1280
|
+
Eval files are executable trust boundaries: commands in `checks.command` run
|
|
1281
|
+
locally and in CI. Review eval TOML from contributors with the same care as
|
|
1282
|
+
shell scripts or workflow files.
|
|
1283
|
+
|
|
1284
|
+
Captured patch artifacts are secret-scanned with the same local redactor used
|
|
1285
|
+
for context packs before they are written. If a patch line contains a real
|
|
1286
|
+
secret, the artifact stores `[REDACTED:<type>]` and the case records
|
|
1287
|
+
`patch_redaction_warnings`. Secret-bearing patches may replay with redacted
|
|
1288
|
+
values; replace secrets with safe fixture values when exact replay matters.
|
|
1289
|
+
|
|
1290
|
+
---
|
|
1291
|
+
|
|
1214
1292
|
### `agentpack status`
|
|
1215
1293
|
|
|
1216
1294
|
Check whether the context pack is stale.
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
# AgentPack
|
|
2
2
|
|
|
3
3
|
[](https://pypi.org/project/agentpack-cli/)
|
|
4
|
+
[](https://pepy.tech/projects/agentpack-cli)
|
|
4
5
|
[](https://www.npmjs.com/package/@vishal2612200/agentpack)
|
|
5
6
|
[](https://www.npmjs.com/package/@vishal2612200/agentpack)
|
|
6
7
|
[](https://pypi.org/project/agentpack-cli/)
|
|
7
8
|
[](https://opensource.org/licenses/MIT)
|
|
8
9
|
[](https://github.com/vishal2612200/agentpack/actions/workflows/ci.yml)
|
|
9
10
|
|
|
10
|
-
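> **Status: alpha (v0.3.9).** Works, tested, used in real sessions. Python and JavaScript/TypeScript are the best-supported languages. Public benchmark proof exists for the current suite, but broader repo coverage is still growing. API may change before 1.0.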
|
|
11
|
+
> **Status: alpha (v0.3.10).** Works, tested, used in real sessions. Python and JavaScript/TypeScript are the best-supported languages. Public benchmark proof exists for the current suite, but broader repo coverage is still growing. API may change before 1.0.
|
|
11
12
|
>
|
|
12
13
|
> **Platform note:** macOS, Linux, and Windows are supported. Windows support targets PowerShell plus Git for Windows. `cmd.exe` and bare Git setups are not a supported path yet.
|
|
13
14
|
|
|
@@ -562,6 +563,7 @@ Command map:
|
|
|
562
563
|
| `agentpack doctor` | Audit hooks, agent files, CLI path, and repo health |
|
|
563
564
|
| `agentpack explain` | Understand why a file was selected or omitted |
|
|
564
565
|
| `agentpack benchmark` | Measure recall, precision, and misses against real tasks |
|
|
566
|
+
| `agentpack eval` | Run deterministic failure evals with tests, diff limits, and taxonomy labels |
|
|
565
567
|
| `agentpack tune` | Suggest fixes from recent pack metrics and benchmark misses |
|
|
566
568
|
| `agentpack status` | Inspect current pack freshness and metadata |
|
|
567
569
|
| `agentpack diff` | Show what changed between context snapshots |
|
|
@@ -1172,6 +1174,82 @@ This command does not pretend a pack is correct. It gives the next thing to insp
|
|
|
1172
1174
|
|
|
1173
1175
|
---
|
|
1174
1176
|
|
|
1177
|
+
### `agentpack eval`
|
|
1178
|
+
|
|
1179
|
+
Run deterministic failure evals. AgentPack does not run the coding agent and
|
|
1180
|
+
does not use an LLM judge; it verifies the current or replayed worktree with
|
|
1181
|
+
commands and diff policies.
|
|
1182
|
+
|
|
1183
|
+
```bash
|
|
1184
|
+
agentpack eval --init
|
|
1185
|
+
# edit .agentpack/evals.toml with real failures and checks
|
|
1186
|
+
agentpack eval
|
|
1187
|
+
agentpack eval --case auth-timeout --prove-targets
|
|
1188
|
+
agentpack eval --capture auth-timeout --failure-class context --check "pytest tests/test_auth.py -q"
|
|
1189
|
+
agentpack eval --watch --until-pass
|
|
1190
|
+
agentpack eval --replay --prove-targets
|
|
1191
|
+
agentpack eval --variant baseline
|
|
1192
|
+
agentpack eval --variant agentpack
|
|
1193
|
+
agentpack eval --compare-variants baseline:agentpack
|
|
1194
|
+
agentpack eval --ci-template
|
|
1195
|
+
agentpack eval --report
|
|
1196
|
+
```
|
|
1197
|
+
|
|
1198
|
+
Example case:
|
|
1199
|
+
|
|
1200
|
+
```toml
|
|
1201
|
+
[[cases]]
|
|
1202
|
+
id = "auth-timeout"
|
|
1203
|
+
task = "fix auth token timeout"
|
|
1204
|
+
failure_class = "context"
|
|
1205
|
+
failure_source = "agent_failed"
|
|
1206
|
+
base_ref = "HEAD"
|
|
1207
|
+
patch_file = ".agentpack/evals/auth-timeout.patch"
|
|
1208
|
+
required_changed_files = ["src/auth/token.py"]
|
|
1209
|
+
forbidden_changed_files = ["src/db/**"]
|
|
1210
|
+
max_changed_files = 5
|
|
1211
|
+
max_changed_lines = 250
|
|
1212
|
+
agent = "codex"
|
|
1213
|
+
context_file = ".agentpack/context.md"
|
|
1214
|
+
context_hash = "..."
|
|
1215
|
+
selected_files = ["src/auth/token.py", "tests/test_auth.py"]
|
|
1216
|
+
|
|
1217
|
+
[[cases.checks]]
|
|
1218
|
+
name = "tests"
|
|
1219
|
+
command = "pytest tests/test_auth.py -q"
|
|
1220
|
+
timeout_s = 120
|
|
1221
|
+
retries = 1 # optional, marks pass-after-fail checks as flaky
|
|
1222
|
+
```
|
|
1223
|
+
|
|
1224
|
+
Use `eval` after an agent run: capture the real failure, add deterministic
|
|
1225
|
+
checks such as tests, typecheck, lint, schema validation, API contract tests,
|
|
1226
|
+
diff size, forbidden files, or golden outputs, then rerun until the harness
|
|
1227
|
+
passes. The model can propose; the harness must verify.
|
|
1228
|
+
|
|
1229
|
+
For hands-free local iteration, keep `agentpack eval --watch --until-pass`
|
|
1230
|
+
running in a terminal while the agent or developer edits. It reruns when the
|
|
1231
|
+
case file, patch artifacts, golden files, or git diff content changes and stops
|
|
1232
|
+
when all deterministic checks pass. `--capture` stores the current patch under
|
|
1233
|
+
`.agentpack/evals/<case-id>.patch` plus context metadata; `--replay` checks out
|
|
1234
|
+
`base_ref` into an isolated git worktree, applies that patch, and runs the same
|
|
1235
|
+
deterministic checks there. To measure AgentPack's contribution, run the same
|
|
1236
|
+
case with `--variant baseline` and then with `--variant agentpack`;
|
|
1237
|
+
`--compare-variants baseline:agentpack` reports which cases improved, regressed,
|
|
1238
|
+
stayed unchanged, or still need both sides. Use `--ci-template` to scaffold a
|
|
1239
|
+
GitHub Actions workflow for `benchmarks/evals.toml`.
|
|
1240
|
+
|
|
1241
|
+
Eval files are executable trust boundaries: commands in `checks.command` run
|
|
1242
|
+
locally and in CI. Review eval TOML from contributors with the same care as
|
|
1243
|
+
shell scripts or workflow files.
|
|
1244
|
+
|
|
1245
|
+
Captured patch artifacts are secret-scanned with the same local redactor used
|
|
1246
|
+
for context packs before they are written. If a patch line contains a real
|
|
1247
|
+
secret, the artifact stores `[REDACTED:<type>]` and the case records
|
|
1248
|
+
`patch_redaction_warnings`. Secret-bearing patches may replay with redacted
|
|
1249
|
+
values; replace secrets with safe fixture values when exact replay matters.
|
|
1250
|
+
|
|
1251
|
+
---
|
|
1252
|
+
|
|
1175
1253
|
### `agentpack status`
|
|
1176
1254
|
|
|
1177
1255
|
Check whether the context pack is stale.
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
from rich import box
|
|
9
|
+
|
|
10
|
+
from agentpack.commands._shared import console, _root
|
|
11
|
+
from agentpack.core.evals import (
|
|
12
|
+
FAILURE_CLASSES,
|
|
13
|
+
append_captured_eval_case,
|
|
14
|
+
compare_eval_variants,
|
|
15
|
+
default_eval_cases_path,
|
|
16
|
+
eval_results_path,
|
|
17
|
+
eval_watch_fingerprint,
|
|
18
|
+
load_eval_cases,
|
|
19
|
+
load_eval_result_records,
|
|
20
|
+
persist_eval_results,
|
|
21
|
+
run_eval_suite,
|
|
22
|
+
scaffold_eval_cases,
|
|
23
|
+
write_eval_ci_template,
|
|
24
|
+
write_eval_report,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def register(app: typer.Typer) -> None:
|
|
29
|
+
@app.command(name="eval")
|
|
30
|
+
def eval_command(
|
|
31
|
+
init: bool = typer.Option(False, "--init", is_flag=True, help="Scaffold .agentpack/evals.toml and exit."),
|
|
32
|
+
cases: str = typer.Option("", "--cases", help="Path to eval TOML file (default: .agentpack/evals.toml)."),
|
|
33
|
+
case: str = typer.Option("", "--case", help="Run one eval case by id."),
|
|
34
|
+
prove_targets: bool = typer.Option(False, "--prove-targets", is_flag=True, help="Exit non-zero when any eval case fails."),
|
|
35
|
+
capture: str = typer.Option("", "--capture", help="Append a case from current git diff using this id."),
|
|
36
|
+
failure_class: str = typer.Option("context", "--failure-class", help=f"Failure class ({' | '.join(FAILURE_CLASSES)})."),
|
|
37
|
+
failure_source: str = typer.Option("agent_failed", "--failure-source", help="Failure source for captured cases."),
|
|
38
|
+
check: list[str] | None = typer.Option(None, "--check", help="Deterministic command check for --capture. Repeatable."),
|
|
39
|
+
task: str = typer.Option("", "--task", help="Task text for --capture."),
|
|
40
|
+
base_ref: str = typer.Option("HEAD", "--base-ref", help="Git base ref for diff checks."),
|
|
41
|
+
report: bool = typer.Option(False, "--report", is_flag=True, help="Write benchmarks/results/YYYY-MM-DD-eval.md."),
|
|
42
|
+
ci_template: bool = typer.Option(False, "--ci-template", is_flag=True, help="Scaffold .github/workflows/agentpack-eval.yml and exit."),
|
|
43
|
+
variant: str = typer.Option("agentpack", "--variant", help="Result variant label, e.g. baseline or agentpack."),
|
|
44
|
+
compare_variants: str = typer.Option("", "--compare-variants", help="Compare latest results as BASELINE:VARIANT."),
|
|
45
|
+
replay: bool = typer.Option(False, "--replay", is_flag=True, help="Run cases in isolated git worktrees using captured patch_file artifacts."),
|
|
46
|
+
watch: bool = typer.Option(False, "--watch", is_flag=True, help="Rerun evals when git diff state changes."),
|
|
47
|
+
interval: float = typer.Option(2.0, "--interval", help="Watch polling interval in seconds."),
|
|
48
|
+
max_runs: int = typer.Option(0, "--max-runs", help="Maximum watch runs (0 = unlimited)."),
|
|
49
|
+
until_pass: bool = typer.Option(False, "--until-pass", is_flag=True, help="Stop watch mode after all cases pass."),
|
|
50
|
+
agent: str = typer.Option("", "--agent", help="Agent label to store with --capture metadata."),
|
|
51
|
+
prompt_file: str = typer.Option("", "--prompt-file", help="Prompt artifact path to store with --capture."),
|
|
52
|
+
context_file: str = typer.Option(".agentpack/context.md", "--context-file", help="Context artifact path to store with --capture."),
|
|
53
|
+
) -> None:
|
|
54
|
+
"""Run deterministic eval cases without using an LLM judge."""
|
|
55
|
+
root = _root()
|
|
56
|
+
cases_path = Path(cases) if cases else default_eval_cases_path(root)
|
|
57
|
+
|
|
58
|
+
if compare_variants:
|
|
59
|
+
_print_variant_comparison(root, compare_variants)
|
|
60
|
+
return
|
|
61
|
+
|
|
62
|
+
if ci_template:
|
|
63
|
+
out = write_eval_ci_template(root)
|
|
64
|
+
console.print(f"[green]✓[/] Created [bold]{out}[/]")
|
|
65
|
+
return
|
|
66
|
+
|
|
67
|
+
if init:
|
|
68
|
+
out = scaffold_eval_cases(root)
|
|
69
|
+
console.print(f"[green]✓[/] Created [bold]{out}[/]")
|
|
70
|
+
console.print(" Edit it with real failures, then run [bold]agentpack eval[/].")
|
|
71
|
+
return
|
|
72
|
+
|
|
73
|
+
if capture:
|
|
74
|
+
try:
|
|
75
|
+
captured = append_captured_eval_case(
|
|
76
|
+
cases_path,
|
|
77
|
+
root=root,
|
|
78
|
+
case_id=capture,
|
|
79
|
+
failure_class=failure_class,
|
|
80
|
+
checks=check or [],
|
|
81
|
+
task=task,
|
|
82
|
+
failure_source=failure_source,
|
|
83
|
+
base_ref=base_ref,
|
|
84
|
+
agent=agent,
|
|
85
|
+
prompt_file=prompt_file,
|
|
86
|
+
context_file=context_file,
|
|
87
|
+
)
|
|
88
|
+
except ValueError as exc:
|
|
89
|
+
console.print(f"[red]{exc}[/]")
|
|
90
|
+
raise typer.Exit(1) from exc
|
|
91
|
+
console.print(f"[green]✓[/] Captured eval case [bold]{captured.id}[/] in [bold]{cases_path}[/]")
|
|
92
|
+
console.print(f" Required changed files: {len(captured.required_changed_files)}")
|
|
93
|
+
if captured.patch_redaction_warnings:
|
|
94
|
+
console.print(f" [yellow]Redacted {len(captured.patch_redaction_warnings)} secret(s) from patch artifact.[/]")
|
|
95
|
+
return
|
|
96
|
+
|
|
97
|
+
if report and not cases_path.exists():
|
|
98
|
+
records = load_eval_result_records(eval_results_path(root))
|
|
99
|
+
out = write_eval_report(root, records)
|
|
100
|
+
console.print(f"[green]✓[/] Wrote eval report: [bold]{out}[/]")
|
|
101
|
+
return
|
|
102
|
+
|
|
103
|
+
if not cases_path.exists():
|
|
104
|
+
console.print(f"[yellow]No eval cases file found at {cases_path}[/]")
|
|
105
|
+
console.print(" Run [bold]agentpack eval --init[/] to scaffold one.")
|
|
106
|
+
raise typer.Exit(1)
|
|
107
|
+
|
|
108
|
+
try:
|
|
109
|
+
eval_cases = load_eval_cases(cases_path)
|
|
110
|
+
except ValueError as exc:
|
|
111
|
+
console.print(f"[red]{exc}[/]")
|
|
112
|
+
raise typer.Exit(1) from exc
|
|
113
|
+
|
|
114
|
+
if case:
|
|
115
|
+
eval_cases = [item for item in eval_cases if item.id == case]
|
|
116
|
+
if not eval_cases:
|
|
117
|
+
console.print(f"[yellow]No eval case found with id: {case}[/]")
|
|
118
|
+
raise typer.Exit(1)
|
|
119
|
+
|
|
120
|
+
if not eval_cases:
|
|
121
|
+
console.print("[yellow]No eval cases defined.[/]")
|
|
122
|
+
raise typer.Exit(1)
|
|
123
|
+
|
|
124
|
+
if watch:
|
|
125
|
+
results = _watch_eval_cases(
|
|
126
|
+
root,
|
|
127
|
+
eval_cases,
|
|
128
|
+
variant=variant,
|
|
129
|
+
replay=replay,
|
|
130
|
+
interval=interval,
|
|
131
|
+
max_runs=max_runs,
|
|
132
|
+
until_pass=until_pass,
|
|
133
|
+
extra_paths=[cases_path],
|
|
134
|
+
)
|
|
135
|
+
else:
|
|
136
|
+
results = _run_once(root, eval_cases, variant=variant, replay=replay)
|
|
137
|
+
|
|
138
|
+
if report:
|
|
139
|
+
records = load_eval_result_records(eval_results_path(root))
|
|
140
|
+
out = write_eval_report(root, records)
|
|
141
|
+
console.print(f"[green]✓[/] Wrote eval report: [bold]{out}[/]")
|
|
142
|
+
|
|
143
|
+
if prove_targets and not all(result.passed for result in results):
|
|
144
|
+
raise typer.Exit(2)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _run_once(root: Path, eval_cases, *, variant: str, replay: bool):
|
|
148
|
+
console.print(f"\n[bold]Running {len(eval_cases)} deterministic eval case(s)...[/]\n")
|
|
149
|
+
results = run_eval_suite(root, eval_cases, variant=variant, replay=replay)
|
|
150
|
+
persist_eval_results(root, results)
|
|
151
|
+
_print_results(results)
|
|
152
|
+
return results
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _watch_eval_cases(
|
|
156
|
+
root: Path,
|
|
157
|
+
eval_cases,
|
|
158
|
+
*,
|
|
159
|
+
variant: str,
|
|
160
|
+
replay: bool,
|
|
161
|
+
interval: float,
|
|
162
|
+
max_runs: int,
|
|
163
|
+
until_pass: bool,
|
|
164
|
+
extra_paths: list[Path],
|
|
165
|
+
):
|
|
166
|
+
if interval <= 0:
|
|
167
|
+
console.print("[red]--interval must be greater than 0[/]")
|
|
168
|
+
raise typer.Exit(1)
|
|
169
|
+
if max_runs < 0:
|
|
170
|
+
console.print("[red]--max-runs must be 0 or greater[/]")
|
|
171
|
+
raise typer.Exit(1)
|
|
172
|
+
|
|
173
|
+
console.print("[bold]Watching deterministic evals.[/] Press Ctrl-C to stop.")
|
|
174
|
+
last_fingerprint = ""
|
|
175
|
+
last_results = []
|
|
176
|
+
patch_paths = [root / case.patch_file for case in eval_cases if case.patch_file]
|
|
177
|
+
golden_paths = [root / golden.expected for case in eval_cases for golden in case.golden_files]
|
|
178
|
+
watched_paths = extra_paths + patch_paths + golden_paths
|
|
179
|
+
runs = 0
|
|
180
|
+
try:
|
|
181
|
+
while True:
|
|
182
|
+
fingerprint = eval_watch_fingerprint(root, eval_cases, extra_paths=watched_paths)
|
|
183
|
+
if fingerprint != last_fingerprint:
|
|
184
|
+
runs += 1
|
|
185
|
+
last_fingerprint = fingerprint
|
|
186
|
+
last_results = _run_once(root, eval_cases, variant=variant, replay=replay)
|
|
187
|
+
if until_pass and all(result.passed for result in last_results):
|
|
188
|
+
console.print("[green]✓[/] All eval cases pass; watch stopped.")
|
|
189
|
+
break
|
|
190
|
+
if max_runs and runs >= max_runs:
|
|
191
|
+
break
|
|
192
|
+
time.sleep(interval)
|
|
193
|
+
except KeyboardInterrupt:
|
|
194
|
+
console.print("\n[yellow]Eval watch stopped.[/]")
|
|
195
|
+
return last_results
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _print_results(results) -> None:
|
|
199
|
+
tbl = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
|
|
200
|
+
tbl.add_column("case", max_width=32)
|
|
201
|
+
tbl.add_column("status", width=8)
|
|
202
|
+
tbl.add_column("class", max_width=18)
|
|
203
|
+
tbl.add_column("checks", justify="right")
|
|
204
|
+
tbl.add_column("changed", justify="right")
|
|
205
|
+
tbl.add_column("lines", justify="right")
|
|
206
|
+
tbl.add_column("time", justify="right")
|
|
207
|
+
|
|
208
|
+
for result in results:
|
|
209
|
+
status = "[green]pass[/]" if result.passed else "[red]fail[/]"
|
|
210
|
+
failed = len(result.failed_checks)
|
|
211
|
+
checks = f"{len(result.checks) - failed}/{len(result.checks)}"
|
|
212
|
+
tbl.add_row(
|
|
213
|
+
result.case.id,
|
|
214
|
+
status,
|
|
215
|
+
result.case.failure_class,
|
|
216
|
+
checks,
|
|
217
|
+
str(len(result.changed_files)),
|
|
218
|
+
str(result.changed_lines),
|
|
219
|
+
f"{result.duration_s:.2f}s",
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
console.print(tbl)
|
|
223
|
+
for result in results:
|
|
224
|
+
for check in result.failed_checks:
|
|
225
|
+
detail = f": {check.detail}" if check.detail else ""
|
|
226
|
+
console.print(f" [red]![/] {result.case.id} / {check.name}{detail}", soft_wrap=True)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _print_variant_comparison(root: Path, compare_variants: str) -> None:
|
|
230
|
+
try:
|
|
231
|
+
baseline, variant = compare_variants.split(":", 1)
|
|
232
|
+
except ValueError as exc:
|
|
233
|
+
console.print("[red]--compare-variants must use BASELINE:VARIANT, e.g. baseline:agentpack[/]")
|
|
234
|
+
raise typer.Exit(1) from exc
|
|
235
|
+
records = load_eval_result_records(eval_results_path(root))
|
|
236
|
+
comparison = compare_eval_variants(records, baseline, variant)
|
|
237
|
+
|
|
238
|
+
tbl = Table(title=f"Eval Variant Comparison: {baseline} → {variant}", box=box.SIMPLE, show_header=True, padding=(0, 1))
|
|
239
|
+
tbl.add_column("case", max_width=36)
|
|
240
|
+
tbl.add_column(baseline, justify="center")
|
|
241
|
+
tbl.add_column(variant, justify="center")
|
|
242
|
+
tbl.add_column("status", max_width=12)
|
|
243
|
+
for row in comparison["rows"]:
|
|
244
|
+
tbl.add_row(
|
|
245
|
+
row["case_id"],
|
|
246
|
+
_pass_label(row["baseline_passed"]),
|
|
247
|
+
_pass_label(row["variant_passed"]),
|
|
248
|
+
row["status"],
|
|
249
|
+
)
|
|
250
|
+
console.print(tbl)
|
|
251
|
+
console.print(
|
|
252
|
+
f" improved [bold green]{comparison['improved']}[/] "
|
|
253
|
+
f"regressed [bold red]{comparison['regressed']}[/] "
|
|
254
|
+
f"unchanged [bold]{comparison['unchanged']}[/] "
|
|
255
|
+
f"incomplete [bold yellow]{comparison['incomplete']}[/]"
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _pass_label(value) -> str:
|
|
260
|
+
if value is True:
|
|
261
|
+
return "[green]pass[/]"
|
|
262
|
+
if value is False:
|
|
263
|
+
return "[red]fail[/]"
|
|
264
|
+
return "[yellow]-[/]"
|
|
@@ -49,6 +49,7 @@ def _build_tuning_suggestions(root: Path, *, include_benchmark: bool = True) ->
|
|
|
49
49
|
suggestions: list[TuningSuggestion] = []
|
|
50
50
|
metrics = _load_jsonl(root / ".agentpack" / "metrics.jsonl")
|
|
51
51
|
benchmark = _load_jsonl(root / ".agentpack" / "benchmark_results.jsonl") if include_benchmark else []
|
|
52
|
+
eval_results = _load_jsonl(root / ".agentpack" / "eval_results.jsonl")
|
|
52
53
|
|
|
53
54
|
accuracy_rows = [row for row in metrics if "selection_recall" in row][-10:]
|
|
54
55
|
if accuracy_rows:
|
|
@@ -107,6 +108,29 @@ def _build_tuning_suggestions(root: Path, *, include_benchmark: bool = True) ->
|
|
|
107
108
|
suggestion = "Use `agentpack explain --omitted --task <task>` to inspect the miss."
|
|
108
109
|
suggestions.append(TuningSuggestion("benchmark misses", f"{count} miss(es): {status}", suggestion))
|
|
109
110
|
|
|
111
|
+
if eval_results:
|
|
112
|
+
class_counts: dict[str, int] = {}
|
|
113
|
+
check_counts: dict[str, int] = {}
|
|
114
|
+
for row in eval_results[-20:]:
|
|
115
|
+
if row.get("passed") is False:
|
|
116
|
+
failure_class = str(row.get("failure_class") or "unknown")
|
|
117
|
+
class_counts[failure_class] = class_counts.get(failure_class, 0) + 1
|
|
118
|
+
for check in row.get("failed_checks", []) or []:
|
|
119
|
+
if isinstance(check, str):
|
|
120
|
+
check_counts[check] = check_counts.get(check, 0) + 1
|
|
121
|
+
for failure_class, count in sorted(class_counts.items(), key=lambda item: (-item[1], item[0]))[:4]:
|
|
122
|
+
suggestions.append(TuningSuggestion(
|
|
123
|
+
"eval failures",
|
|
124
|
+
f"{count} failure(s): {failure_class}",
|
|
125
|
+
"Use `agentpack eval --report` and inspect the failing deterministic checks before changing prompts or scoring.",
|
|
126
|
+
))
|
|
127
|
+
for check, count in sorted(check_counts.items(), key=lambda item: (-item[1], item[0]))[:3]:
|
|
128
|
+
suggestions.append(TuningSuggestion(
|
|
129
|
+
"eval checks",
|
|
130
|
+
f"{count} failure(s): {check}",
|
|
131
|
+
"Strengthen or narrow this harness check if it is flaky; fix the agent workflow if it is deterministic.",
|
|
132
|
+
))
|
|
133
|
+
|
|
110
134
|
return suggestions
|
|
111
135
|
|
|
112
136
|
|