agentpack-cli 0.3.9__tar.gz → 0.3.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/PKG-INFO +80 -2
  2. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/README.md +79 -1
  3. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/pyproject.toml +1 -1
  4. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/__init__.py +1 -1
  5. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/cli.py +2 -0
  6. agentpack_cli-0.3.10/src/agentpack/commands/eval_cmd.py +264 -0
  7. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/tune.py +24 -0
  8. agentpack_cli-0.3.10/src/agentpack/core/evals.py +939 -0
  9. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/.gitignore +0 -0
  10. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/LICENSE +0 -0
  11. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/__init__.py +0 -0
  12. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/antigravity.py +0 -0
  13. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/base.py +0 -0
  14. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/claude.py +0 -0
  15. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/codex.py +0 -0
  16. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/cursor.py +0 -0
  17. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/detect.py +0 -0
  18. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/generic.py +0 -0
  19. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/adapters/windsurf.py +0 -0
  20. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/__init__.py +0 -0
  21. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/dependency_graph.py +0 -0
  22. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/go_imports.py +0 -0
  23. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/java_imports.py +0 -0
  24. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/js_ts_imports.py +0 -0
  25. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/monorepo.py +0 -0
  26. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/naming_signals.py +0 -0
  27. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/python_imports.py +0 -0
  28. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/ranking.py +0 -0
  29. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/repo_map.py +0 -0
  30. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/role_inference.py +0 -0
  31. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/rust_imports.py +0 -0
  32. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/symbols.py +0 -0
  33. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/task_classifier.py +0 -0
  34. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/analysis/tests.py +0 -0
  35. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/application/__init__.py +0 -0
  36. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/application/pack_service.py +0 -0
  37. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/__init__.py +0 -0
  38. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/_shared.py +0 -0
  39. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/benchmark.py +0 -0
  40. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/claude_cmd.py +0 -0
  41. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/diff.py +0 -0
  42. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/doctor.py +0 -0
  43. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/explain.py +0 -0
  44. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/guard.py +0 -0
  45. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/hook_cmd.py +0 -0
  46. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/ignore_cmd.py +0 -0
  47. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/init.py +0 -0
  48. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/install.py +0 -0
  49. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/mcp_cmd.py +0 -0
  50. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/migrate.py +0 -0
  51. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/monitor.py +0 -0
  52. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/pack.py +0 -0
  53. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/quickstart.py +0 -0
  54. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/repair.py +0 -0
  55. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/scan.py +0 -0
  56. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/stats.py +0 -0
  57. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/status.py +0 -0
  58. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/summarize.py +0 -0
  59. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/commands/watch.py +0 -0
  60. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/__init__.py +0 -0
  61. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/bootstrap.py +0 -0
  62. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/cache.py +0 -0
  63. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/config.py +0 -0
  64. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/context_pack.py +0 -0
  65. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/diff.py +0 -0
  66. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/git.py +0 -0
  67. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/git_hooks.py +0 -0
  68. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/global_install.py +0 -0
  69. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/ignore.py +0 -0
  70. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/merkle.py +0 -0
  71. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/models.py +0 -0
  72. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/redactor.py +0 -0
  73. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/scanner.py +0 -0
  74. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/snapshot.py +0 -0
  75. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/task_freshness.py +0 -0
  76. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/token_estimator.py +0 -0
  77. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/core/vscode_tasks.py +0 -0
  78. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/data/agentpack.md +0 -0
  79. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/__init__.py +0 -0
  80. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/antigravity.py +0 -0
  81. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/claude.py +0 -0
  82. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/codex.py +0 -0
  83. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/cursor.py +0 -0
  84. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/installers/windsurf.py +0 -0
  85. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/__init__.py +0 -0
  86. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/agents.py +0 -0
  87. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/git_hooks.py +0 -0
  88. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/global_install.py +0 -0
  89. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/platform.py +0 -0
  90. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/integrations/vscode_tasks.py +0 -0
  91. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/mcp_server.py +0 -0
  92. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/renderers/__init__.py +0 -0
  93. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/renderers/compact.py +0 -0
  94. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/renderers/markdown.py +0 -0
  95. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/renderers/receipts.py +0 -0
  96. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/session/__init__.py +0 -0
  97. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/session/state.py +0 -0
  98. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/summaries/__init__.py +0 -0
  99. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/summaries/base.py +0 -0
  100. {agentpack_cli-0.3.9 → agentpack_cli-0.3.10}/src/agentpack/summaries/offline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentpack-cli
3
- Version: 0.3.9
3
+ Version: 0.3.10
4
4
  Summary: Local context engine for AI coding agents that ranks relevant files and builds task-focused context packs.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -40,13 +40,14 @@ Description-Content-Type: text/markdown
40
40
  # AgentPack
41
41
 
42
42
  [![PyPI version](https://img.shields.io/pypi/v/agentpack-cli.svg)](https://pypi.org/project/agentpack-cli/)
43
+ [![PyPI Downloads](https://static.pepy.tech/personalized-badge/agentpack-cli?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/agentpack-cli)
43
44
  [![npm version](https://img.shields.io/npm/v/@vishal2612200/agentpack.svg)](https://www.npmjs.com/package/@vishal2612200/agentpack)
44
45
  [![npm downloads](https://img.shields.io/npm/dm/@vishal2612200/agentpack.svg)](https://www.npmjs.com/package/@vishal2612200/agentpack)
45
46
  [![Python versions](https://img.shields.io/pypi/pyversions/agentpack-cli.svg)](https://pypi.org/project/agentpack-cli/)
46
47
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
47
48
  [![CI](https://github.com/vishal2612200/agentpack/actions/workflows/ci.yml/badge.svg)](https://github.com/vishal2612200/agentpack/actions/workflows/ci.yml)
48
49
 
49
- > **Status: alpha (v0.3.9).** Works, tested, used in real sessions. Python and JavaScript/TypeScript are the best-supported languages. Public benchmark proof exists for the current suite, but broader repo coverage is still growing. API may change before 1.0.
50
+ > **Status: alpha (v0.3.10).** Works, tested, used in real sessions. Python and JavaScript/TypeScript are the best-supported languages. Public benchmark proof exists for the current suite, but broader repo coverage is still growing. API may change before 1.0.
50
51
  >
51
52
  > **Platform note:** macOS, Linux, and Windows are supported. Windows support targets PowerShell plus Git for Windows. `cmd.exe` and bare Git setups are not a supported path yet.
52
53
 
@@ -601,6 +602,7 @@ Command map:
601
602
  | `agentpack doctor` | Audit hooks, agent files, CLI path, and repo health |
602
603
  | `agentpack explain` | Understand why a file was selected or omitted |
603
604
  | `agentpack benchmark` | Measure recall, precision, and misses against real tasks |
605
+ | `agentpack eval` | Run deterministic failure evals with tests, diff limits, and taxonomy labels |
604
606
  | `agentpack tune` | Suggest fixes from recent pack metrics and benchmark misses |
605
607
  | `agentpack status` | Inspect current pack freshness and metadata |
606
608
  | `agentpack diff` | Show what changed between context snapshots |
@@ -1211,6 +1213,82 @@ This command does not pretend a pack is correct. It gives the next thing to insp
1211
1213
 
1212
1214
  ---
1213
1215
 
1216
+ ### `agentpack eval`
1217
+
1218
+ Run deterministic failure evals. AgentPack does not run the coding agent and
1219
+ does not use an LLM judge; it verifies the current or replayed worktree with
1220
+ commands and diff policies.
1221
+
1222
+ ```bash
1223
+ agentpack eval --init
1224
+ # edit .agentpack/evals.toml with real failures and checks
1225
+ agentpack eval
1226
+ agentpack eval --case auth-timeout --prove-targets
1227
+ agentpack eval --capture auth-timeout --failure-class context --check "pytest tests/test_auth.py -q"
1228
+ agentpack eval --watch --until-pass
1229
+ agentpack eval --replay --prove-targets
1230
+ agentpack eval --variant baseline
1231
+ agentpack eval --variant agentpack
1232
+ agentpack eval --compare-variants baseline:agentpack
1233
+ agentpack eval --ci-template
1234
+ agentpack eval --report
1235
+ ```
1236
+
1237
+ Example case:
1238
+
1239
+ ```toml
1240
+ [[cases]]
1241
+ id = "auth-timeout"
1242
+ task = "fix auth token timeout"
1243
+ failure_class = "context"
1244
+ failure_source = "agent_failed"
1245
+ base_ref = "HEAD"
1246
+ patch_file = ".agentpack/evals/auth-timeout.patch"
1247
+ required_changed_files = ["src/auth/token.py"]
1248
+ forbidden_changed_files = ["src/db/**"]
1249
+ max_changed_files = 5
1250
+ max_changed_lines = 250
1251
+ agent = "codex"
1252
+ context_file = ".agentpack/context.md"
1253
+ context_hash = "..."
1254
+ selected_files = ["src/auth/token.py", "tests/test_auth.py"]
1255
+
1256
+ [[cases.checks]]
1257
+ name = "tests"
1258
+ command = "pytest tests/test_auth.py -q"
1259
+ timeout_s = 120
1260
+ retries = 1 # optional, marks pass-after-fail checks as flaky
1261
+ ```
1262
+
1263
+ Use `eval` after an agent run: capture the real failure, add deterministic
1264
+ checks such as tests, typecheck, lint, schema validation, API contract tests,
1265
+ diff size, forbidden files, or golden outputs, then rerun until the harness
1266
+ passes. The model can propose; the harness must verify.
1267
+
1268
+ For hands-free local iteration, keep `agentpack eval --watch --until-pass`
1269
+ running in a terminal while the agent or developer edits. It reruns when the
1270
+ case file, patch artifacts, golden files, or git diff content changes and stops
1271
+ when all deterministic checks pass. `--capture` stores the current patch under
1272
+ `.agentpack/evals/<case-id>.patch` plus context metadata; `--replay` checks out
1273
+ `base_ref` into an isolated git worktree, applies that patch, and runs the same
1274
+ deterministic checks there. To measure AgentPack's contribution, run the same
1275
+ case with `--variant baseline` and then with `--variant agentpack`;
1276
+ `--compare-variants baseline:agentpack` reports which cases improved, regressed,
1277
+ stayed unchanged, or are incomplete because one variant has no recorded result yet. Use `--ci-template` to scaffold a
1278
+ GitHub Actions workflow for `benchmarks/evals.toml`.
1279
+
1280
+ Eval files are executable trust boundaries: commands in `checks.command` run
1281
+ locally and in CI. Review eval TOML from contributors with the same care as
1282
+ shell scripts or workflow files.
1283
+
1284
+ Captured patch artifacts are secret-scanned with the same local redactor used
1285
+ for context packs before they are written. If a patch line contains a real
1286
+ secret, the artifact stores `[REDACTED:<type>]` and the case records
1287
+ `patch_redaction_warnings`. Secret-bearing patches may replay with redacted
1288
+ values; replace secrets with safe fixture values when exact replay matters.
1289
+
1290
+ ---
1291
+
1214
1292
  ### `agentpack status`
1215
1293
 
1216
1294
  Check whether the context pack is stale.
@@ -1,13 +1,14 @@
1
1
  # AgentPack
2
2
 
3
3
  [![PyPI version](https://img.shields.io/pypi/v/agentpack-cli.svg)](https://pypi.org/project/agentpack-cli/)
4
+ [![PyPI Downloads](https://static.pepy.tech/personalized-badge/agentpack-cli?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/agentpack-cli)
4
5
  [![npm version](https://img.shields.io/npm/v/@vishal2612200/agentpack.svg)](https://www.npmjs.com/package/@vishal2612200/agentpack)
5
6
  [![npm downloads](https://img.shields.io/npm/dm/@vishal2612200/agentpack.svg)](https://www.npmjs.com/package/@vishal2612200/agentpack)
6
7
  [![Python versions](https://img.shields.io/pypi/pyversions/agentpack-cli.svg)](https://pypi.org/project/agentpack-cli/)
7
8
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
9
  [![CI](https://github.com/vishal2612200/agentpack/actions/workflows/ci.yml/badge.svg)](https://github.com/vishal2612200/agentpack/actions/workflows/ci.yml)
9
10
 
10
- > **Status: alpha (v0.3.9).** Works, tested, used in real sessions. Python and JavaScript/TypeScript are the best-supported languages. Public benchmark proof exists for the current suite, but broader repo coverage is still growing. API may change before 1.0.
11
+ > **Status: alpha (v0.3.10).** Works, tested, used in real sessions. Python and JavaScript/TypeScript are the best-supported languages. Public benchmark proof exists for the current suite, but broader repo coverage is still growing. API may change before 1.0.
11
12
  >
12
13
  > **Platform note:** macOS, Linux, and Windows are supported. Windows support targets PowerShell plus Git for Windows. `cmd.exe` and bare Git setups are not a supported path yet.
13
14
 
@@ -562,6 +563,7 @@ Command map:
562
563
  | `agentpack doctor` | Audit hooks, agent files, CLI path, and repo health |
563
564
  | `agentpack explain` | Understand why a file was selected or omitted |
564
565
  | `agentpack benchmark` | Measure recall, precision, and misses against real tasks |
566
+ | `agentpack eval` | Run deterministic failure evals with tests, diff limits, and taxonomy labels |
565
567
  | `agentpack tune` | Suggest fixes from recent pack metrics and benchmark misses |
566
568
  | `agentpack status` | Inspect current pack freshness and metadata |
567
569
  | `agentpack diff` | Show what changed between context snapshots |
@@ -1172,6 +1174,82 @@ This command does not pretend a pack is correct. It gives the next thing to insp
1172
1174
 
1173
1175
  ---
1174
1176
 
1177
+ ### `agentpack eval`
1178
+
1179
+ Run deterministic failure evals. AgentPack does not run the coding agent and
1180
+ does not use an LLM judge; it verifies the current or replayed worktree with
1181
+ commands and diff policies.
1182
+
1183
+ ```bash
1184
+ agentpack eval --init
1185
+ # edit .agentpack/evals.toml with real failures and checks
1186
+ agentpack eval
1187
+ agentpack eval --case auth-timeout --prove-targets
1188
+ agentpack eval --capture auth-timeout --failure-class context --check "pytest tests/test_auth.py -q"
1189
+ agentpack eval --watch --until-pass
1190
+ agentpack eval --replay --prove-targets
1191
+ agentpack eval --variant baseline
1192
+ agentpack eval --variant agentpack
1193
+ agentpack eval --compare-variants baseline:agentpack
1194
+ agentpack eval --ci-template
1195
+ agentpack eval --report
1196
+ ```
1197
+
1198
+ Example case:
1199
+
1200
+ ```toml
1201
+ [[cases]]
1202
+ id = "auth-timeout"
1203
+ task = "fix auth token timeout"
1204
+ failure_class = "context"
1205
+ failure_source = "agent_failed"
1206
+ base_ref = "HEAD"
1207
+ patch_file = ".agentpack/evals/auth-timeout.patch"
1208
+ required_changed_files = ["src/auth/token.py"]
1209
+ forbidden_changed_files = ["src/db/**"]
1210
+ max_changed_files = 5
1211
+ max_changed_lines = 250
1212
+ agent = "codex"
1213
+ context_file = ".agentpack/context.md"
1214
+ context_hash = "..."
1215
+ selected_files = ["src/auth/token.py", "tests/test_auth.py"]
1216
+
1217
+ [[cases.checks]]
1218
+ name = "tests"
1219
+ command = "pytest tests/test_auth.py -q"
1220
+ timeout_s = 120
1221
+ retries = 1 # optional, marks pass-after-fail checks as flaky
1222
+ ```
1223
+
1224
+ Use `eval` after an agent run: capture the real failure, add deterministic
1225
+ checks such as tests, typecheck, lint, schema validation, API contract tests,
1226
+ diff size, forbidden files, or golden outputs, then rerun until the harness
1227
+ passes. The model can propose; the harness must verify.
1228
+
1229
+ For hands-free local iteration, keep `agentpack eval --watch --until-pass`
1230
+ running in a terminal while the agent or developer edits. It reruns when the
1231
+ case file, patch artifacts, golden files, or git diff content changes and stops
1232
+ when all deterministic checks pass. `--capture` stores the current patch under
1233
+ `.agentpack/evals/<case-id>.patch` plus context metadata; `--replay` checks out
1234
+ `base_ref` into an isolated git worktree, applies that patch, and runs the same
1235
+ deterministic checks there. To measure AgentPack's contribution, run the same
1236
+ case with `--variant baseline` and then with `--variant agentpack`;
1237
+ `--compare-variants baseline:agentpack` reports which cases improved, regressed,
1238
+ stayed unchanged, or are incomplete because one variant has no recorded result yet. Use `--ci-template` to scaffold a
1239
+ GitHub Actions workflow for `benchmarks/evals.toml`.
1240
+
1241
+ Eval files are executable trust boundaries: commands in `checks.command` run
1242
+ locally and in CI. Review eval TOML from contributors with the same care as
1243
+ shell scripts or workflow files.
1244
+
1245
+ Captured patch artifacts are secret-scanned with the same local redactor used
1246
+ for context packs before they are written. If a patch line contains a real
1247
+ secret, the artifact stores `[REDACTED:<type>]` and the case records
1248
+ `patch_redaction_warnings`. Secret-bearing patches may replay with redacted
1249
+ values; replace secrets with safe fixture values when exact replay matters.
1250
+
1251
+ ---
1252
+
1175
1253
  ### `agentpack status`
1176
1254
 
1177
1255
  Check whether the context pack is stale.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "agentpack-cli"
3
- version = "0.3.9"
3
+ version = "0.3.10"
4
4
  description = "Local context engine for AI coding agents that ranks relevant files and builds task-focused context packs."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -1,3 +1,3 @@
1
1
  """AgentPack — task-aware context packing for AI coding agents."""
2
2
 
3
- __version__ = "0.3.9"
3
+ __version__ = "0.3.10"
@@ -6,6 +6,7 @@ from agentpack.commands import (
6
6
  claude_cmd,
7
7
  diff,
8
8
  doctor,
9
+ eval_cmd,
9
10
  explain,
10
11
  guard,
11
12
  hook_cmd,
@@ -60,6 +61,7 @@ for mod in [
60
61
  explain,
61
62
  guard,
62
63
  doctor,
64
+ eval_cmd,
63
65
  tune,
64
66
  watch,
65
67
  claude_cmd,
@@ -0,0 +1,264 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ import time
5
+
6
+ import typer
7
+ from rich.table import Table
8
+ from rich import box
9
+
10
+ from agentpack.commands._shared import console, _root
11
+ from agentpack.core.evals import (
12
+ FAILURE_CLASSES,
13
+ append_captured_eval_case,
14
+ compare_eval_variants,
15
+ default_eval_cases_path,
16
+ eval_results_path,
17
+ eval_watch_fingerprint,
18
+ load_eval_cases,
19
+ load_eval_result_records,
20
+ persist_eval_results,
21
+ run_eval_suite,
22
+ scaffold_eval_cases,
23
+ write_eval_ci_template,
24
+ write_eval_report,
25
+ )
26
+
27
+
28
def register(app: typer.Typer) -> None:
    """Attach the ``eval`` command to *app*.

    The command is declared as a closure so that the ``@app.command``
    decorator binds it to the Typer application passed in by the caller.
    """

    @app.command(name="eval")
    def eval_command(
        init: bool = typer.Option(False, "--init", is_flag=True, help="Scaffold .agentpack/evals.toml and exit."),
        cases: str = typer.Option("", "--cases", help="Path to eval TOML file (default: .agentpack/evals.toml)."),
        case: str = typer.Option("", "--case", help="Run one eval case by id."),
        prove_targets: bool = typer.Option(False, "--prove-targets", is_flag=True, help="Exit non-zero when any eval case fails."),
        capture: str = typer.Option("", "--capture", help="Append a case from current git diff using this id."),
        failure_class: str = typer.Option("context", "--failure-class", help=f"Failure class ({' | '.join(FAILURE_CLASSES)})."),
        failure_source: str = typer.Option("agent_failed", "--failure-source", help="Failure source for captured cases."),
        check: list[str] | None = typer.Option(None, "--check", help="Deterministic command check for --capture. Repeatable."),
        task: str = typer.Option("", "--task", help="Task text for --capture."),
        base_ref: str = typer.Option("HEAD", "--base-ref", help="Git base ref for diff checks."),
        report: bool = typer.Option(False, "--report", is_flag=True, help="Write benchmarks/results/YYYY-MM-DD-eval.md."),
        ci_template: bool = typer.Option(False, "--ci-template", is_flag=True, help="Scaffold .github/workflows/agentpack-eval.yml and exit."),
        variant: str = typer.Option("agentpack", "--variant", help="Result variant label, e.g. baseline or agentpack."),
        compare_variants: str = typer.Option("", "--compare-variants", help="Compare latest results as BASELINE:VARIANT."),
        replay: bool = typer.Option(False, "--replay", is_flag=True, help="Run cases in isolated git worktrees using captured patch_file artifacts."),
        watch: bool = typer.Option(False, "--watch", is_flag=True, help="Rerun evals when git diff state changes."),
        interval: float = typer.Option(2.0, "--interval", help="Watch polling interval in seconds."),
        max_runs: int = typer.Option(0, "--max-runs", help="Maximum watch runs (0 = unlimited)."),
        until_pass: bool = typer.Option(False, "--until-pass", is_flag=True, help="Stop watch mode after all cases pass."),
        agent: str = typer.Option("", "--agent", help="Agent label to store with --capture metadata."),
        prompt_file: str = typer.Option("", "--prompt-file", help="Prompt artifact path to store with --capture."),
        context_file: str = typer.Option(".agentpack/context.md", "--context-file", help="Context artifact path to store with --capture."),
    ) -> None:
        """Run deterministic eval cases without using an LLM judge."""
        # NOTE: the docstring above is kept to one line on purpose; it is the
        # text Typer shows as the command help.
        #
        # Dynamic values (exception text, paths, ids) are escaped before being
        # interpolated into Rich markup so that brackets in them are printed
        # literally instead of being parsed as style tags.
        from rich.markup import escape

        root = _root()
        cases_path = Path(cases) if cases else default_eval_cases_path(root)

        # --- One-shot modes: each one does its work and returns. -----------
        if compare_variants:
            _print_variant_comparison(root, compare_variants)
            return

        if ci_template:
            out = write_eval_ci_template(root)
            console.print(f"[green]✓[/] Created [bold]{escape(str(out))}[/]")
            return

        if init:
            out = scaffold_eval_cases(root)
            console.print(f"[green]✓[/] Created [bold]{escape(str(out))}[/]")
            console.print(" Edit it with real failures, then run [bold]agentpack eval[/].")
            return

        if capture:
            try:
                captured = append_captured_eval_case(
                    cases_path,
                    root=root,
                    case_id=capture,
                    failure_class=failure_class,
                    checks=check or [],
                    task=task,
                    failure_source=failure_source,
                    base_ref=base_ref,
                    agent=agent,
                    prompt_file=prompt_file,
                    context_file=context_file,
                )
            except ValueError as exc:
                console.print(f"[red]{escape(str(exc))}[/]")
                raise typer.Exit(1) from exc
            console.print(
                f"[green]✓[/] Captured eval case [bold]{escape(str(captured.id))}[/] "
                f"in [bold]{escape(str(cases_path))}[/]"
            )
            console.print(f" Required changed files: {len(captured.required_changed_files)}")
            if captured.patch_redaction_warnings:
                console.print(f" [yellow]Redacted {len(captured.patch_redaction_warnings)} secret(s) from patch artifact.[/]")
            return

        # --report without a cases file: build the report from the results
        # already recorded instead of failing on the missing file.
        if report and not cases_path.exists():
            records = load_eval_result_records(eval_results_path(root))
            out = write_eval_report(root, records)
            console.print(f"[green]✓[/] Wrote eval report: [bold]{escape(str(out))}[/]")
            return

        if not cases_path.exists():
            console.print(f"[yellow]No eval cases file found at {escape(str(cases_path))}[/]")
            console.print(" Run [bold]agentpack eval --init[/] to scaffold one.")
            raise typer.Exit(1)

        # --- Load and filter the cases to run. -----------------------------
        try:
            eval_cases = load_eval_cases(cases_path)
        except ValueError as exc:
            console.print(f"[red]{escape(str(exc))}[/]")
            raise typer.Exit(1) from exc

        if case:
            eval_cases = [item for item in eval_cases if item.id == case]
            if not eval_cases:
                console.print(f"[yellow]No eval case found with id: {escape(case)}[/]")
                raise typer.Exit(1)

        if not eval_cases:
            console.print("[yellow]No eval cases defined.[/]")
            raise typer.Exit(1)

        # --- Execute: either a polling watch loop or a single run. ---------
        if watch:
            results = _watch_eval_cases(
                root,
                eval_cases,
                variant=variant,
                replay=replay,
                interval=interval,
                max_runs=max_runs,
                until_pass=until_pass,
                extra_paths=[cases_path],
            )
        else:
            results = _run_once(root, eval_cases, variant=variant, replay=replay)

        if report:
            records = load_eval_result_records(eval_results_path(root))
            out = write_eval_report(root, records)
            console.print(f"[green]✓[/] Wrote eval report: [bold]{escape(str(out))}[/]")

        # Exit code 2 signals "evals ran but at least one case failed"; the
        # usage/configuration errors above exit with code 1.
        if prove_targets and not all(result.passed for result in results):
            raise typer.Exit(2)
145
+
146
+
147
def _run_once(root: Path, eval_cases, *, variant: str, replay: bool):
    """Run the suite a single time, record the results, and print them.

    Args:
        root: Repository root handed to the eval runner and result store.
        eval_cases: Cases to execute.
        variant: Variant label forwarded to ``run_eval_suite``.
        replay: Replay flag forwarded to ``run_eval_suite``.

    Returns:
        Whatever ``run_eval_suite`` returned for this run.
    """
    case_count = len(eval_cases)
    console.print(f"\n[bold]Running {case_count} deterministic eval case(s)...[/]\n")

    outcomes = run_eval_suite(root, eval_cases, variant=variant, replay=replay)
    # Persist before printing so the results file is written first.
    persist_eval_results(root, outcomes)
    _print_results(outcomes)
    return outcomes
153
+
154
+
155
def _watch_eval_cases(
    root: Path,
    eval_cases,
    *,
    variant: str,
    replay: bool,
    interval: float,
    max_runs: int,
    until_pass: bool,
    extra_paths: list[Path],
):
    """Poll for changes and rerun the eval cases whenever the fingerprint moves.

    Args:
        root: Repository root.
        eval_cases: Cases to run on every change.
        variant: Variant label forwarded to each run.
        replay: Replay flag forwarded to each run.
        interval: Seconds to sleep between polls; must be greater than 0.
        max_runs: Stop after this many runs; 0 means no limit.
        until_pass: Stop as soon as a run has every case passing.
        extra_paths: Additional files to include in the change fingerprint.

    Returns:
        The results of the most recent run, or an empty list if the loop was
        interrupted before any run completed.

    Raises:
        typer.Exit: With code 1 when ``interval`` or ``max_runs`` is invalid.
    """
    if interval <= 0:
        console.print("[red]--interval must be greater than 0[/]")
        raise typer.Exit(1)
    if max_runs < 0:
        console.print("[red]--max-runs must be 0 or greater[/]")
        raise typer.Exit(1)

    console.print("[bold]Watching deterministic evals.[/] Press Ctrl-C to stop.")

    # ``None`` never equals a computed fingerprint, so the first poll always
    # triggers a run, even if the fingerprint helper returns an empty string.
    last_fingerprint = None
    last_results = []

    # Patch artifacts and golden files are watched along with the caller's
    # extra paths (the cases file, for the ``eval`` command).
    patch_paths = [root / case.patch_file for case in eval_cases if case.patch_file]
    golden_paths = [root / golden.expected for case in eval_cases for golden in case.golden_files]
    watched_paths = [*extra_paths, *patch_paths, *golden_paths]

    # NOTE(review): ``eval_cases`` is fixed for the whole session. A change to
    # the cases file triggers a rerun, but this function does not reload the
    # cases from disk.
    runs = 0
    try:
        while True:
            fingerprint = eval_watch_fingerprint(root, eval_cases, extra_paths=watched_paths)
            if fingerprint != last_fingerprint:
                runs += 1
                last_fingerprint = fingerprint
                last_results = _run_once(root, eval_cases, variant=variant, replay=replay)
                if until_pass and all(result.passed for result in last_results):
                    console.print("[green]✓[/] All eval cases pass; watch stopped.")
                    break
                if max_runs and runs >= max_runs:
                    break
            time.sleep(interval)
    except KeyboardInterrupt:
        # Ctrl-C is the documented way to stop; return what we have so far.
        console.print("\n[yellow]Eval watch stopped.[/]")
    return last_results
196
+
197
+
198
def _print_results(results) -> None:
    """Print a summary table of eval results, then list every failed check.

    Each row shows the case id, pass/fail status, failure class, passed/total
    check count, number of changed files, changed line count, and duration.

    Case ids, failure classes, check names and check details are escaped
    before printing so that brackets in them are shown literally rather than
    being interpreted as Rich markup tags.

    Args:
        results: Iterable of eval results exposing ``case``, ``passed``,
            ``checks``, ``failed_checks``, ``changed_files``,
            ``changed_lines`` and ``duration_s``.
    """
    from rich.markup import escape

    tbl = Table(box=box.SIMPLE, show_header=True, padding=(0, 1))
    tbl.add_column("case", max_width=32)
    tbl.add_column("status", width=8)
    tbl.add_column("class", max_width=18)
    tbl.add_column("checks", justify="right")
    tbl.add_column("changed", justify="right")
    tbl.add_column("lines", justify="right")
    tbl.add_column("time", justify="right")

    for result in results:
        status = "[green]pass[/]" if result.passed else "[red]fail[/]"
        total = len(result.checks)
        failed = len(result.failed_checks)
        tbl.add_row(
            escape(str(result.case.id)),
            status,
            escape(str(result.case.failure_class)),
            f"{total - failed}/{total}",
            str(len(result.changed_files)),
            str(result.changed_lines),
            f"{result.duration_s:.2f}s",
        )

    console.print(tbl)

    # One line per failed check, after the table, so details are not
    # truncated by column widths.
    for result in results:
        case_id = escape(str(result.case.id))
        for check in result.failed_checks:
            detail = f": {escape(str(check.detail))}" if check.detail else ""
            console.print(f" [red]![/] {case_id} / {escape(str(check.name))}{detail}", soft_wrap=True)
227
+
228
+
229
def _print_variant_comparison(root: Path, compare_variants: str) -> None:
    """Print a per-case comparison of two recorded eval variants.

    Args:
        root: Repository root used to locate the recorded eval results.
        compare_variants: Spec of the form ``BASELINE:VARIANT``. Only the
            first ``:`` separates the two labels.

    Raises:
        typer.Exit: With code 1 when the spec has no ``:`` or when either
            side of it is empty or whitespace-only.
    """
    baseline, separator, variant = compare_variants.partition(":")
    # A spec such as "baseline:" or ":agentpack" has a separator but an empty
    # side; reject it with the usage error rather than comparing against an
    # empty label.
    if not separator or not baseline.strip() or not variant.strip():
        console.print("[red]--compare-variants must use BASELINE:VARIANT, e.g. baseline:agentpack[/]")
        raise typer.Exit(1)

    records = load_eval_result_records(eval_results_path(root))
    comparison = compare_eval_variants(records, baseline, variant)

    tbl = Table(title=f"Eval Variant Comparison: {baseline} → {variant}", box=box.SIMPLE, show_header=True, padding=(0, 1))
    tbl.add_column("case", max_width=36)
    tbl.add_column(baseline, justify="center")
    tbl.add_column(variant, justify="center")
    tbl.add_column("status", max_width=12)
    for row in comparison["rows"]:
        tbl.add_row(
            row["case_id"],
            _pass_label(row["baseline_passed"]),
            _pass_label(row["variant_passed"]),
            row["status"],
        )
    console.print(tbl)

    # Totals line under the table.
    console.print(
        f" improved [bold green]{comparison['improved']}[/] "
        f"regressed [bold red]{comparison['regressed']}[/] "
        f"unchanged [bold]{comparison['unchanged']}[/] "
        f"incomplete [bold yellow]{comparison['incomplete']}[/]"
    )
257
+
258
+
259
+ def _pass_label(value) -> str:
260
+ if value is True:
261
+ return "[green]pass[/]"
262
+ if value is False:
263
+ return "[red]fail[/]"
264
+ return "[yellow]-[/]"
@@ -49,6 +49,7 @@ def _build_tuning_suggestions(root: Path, *, include_benchmark: bool = True) ->
49
49
  suggestions: list[TuningSuggestion] = []
50
50
  metrics = _load_jsonl(root / ".agentpack" / "metrics.jsonl")
51
51
  benchmark = _load_jsonl(root / ".agentpack" / "benchmark_results.jsonl") if include_benchmark else []
52
+ eval_results = _load_jsonl(root / ".agentpack" / "eval_results.jsonl")
52
53
 
53
54
  accuracy_rows = [row for row in metrics if "selection_recall" in row][-10:]
54
55
  if accuracy_rows:
@@ -107,6 +108,29 @@ def _build_tuning_suggestions(root: Path, *, include_benchmark: bool = True) ->
107
108
  suggestion = "Use `agentpack explain --omitted --task <task>` to inspect the miss."
108
109
  suggestions.append(TuningSuggestion("benchmark misses", f"{count} miss(es): {status}", suggestion))
109
110
 
111
+ if eval_results:
112
+ class_counts: dict[str, int] = {}
113
+ check_counts: dict[str, int] = {}
114
+ for row in eval_results[-20:]:
115
+ if row.get("passed") is False:
116
+ failure_class = str(row.get("failure_class") or "unknown")
117
+ class_counts[failure_class] = class_counts.get(failure_class, 0) + 1
118
+ for check in row.get("failed_checks", []) or []:
119
+ if isinstance(check, str):
120
+ check_counts[check] = check_counts.get(check, 0) + 1
121
+ for failure_class, count in sorted(class_counts.items(), key=lambda item: (-item[1], item[0]))[:4]:
122
+ suggestions.append(TuningSuggestion(
123
+ "eval failures",
124
+ f"{count} failure(s): {failure_class}",
125
+ "Use `agentpack eval --report` and inspect the failing deterministic checks before changing prompts or scoring.",
126
+ ))
127
+ for check, count in sorted(check_counts.items(), key=lambda item: (-item[1], item[0]))[:3]:
128
+ suggestions.append(TuningSuggestion(
129
+ "eval checks",
130
+ f"{count} failure(s): {check}",
131
+ "Strengthen or narrow this harness check if it is flaky; fix the agent workflow if it is deterministic.",
132
+ ))
133
+
110
134
  return suggestions
111
135
 
112
136