agentpack-cli 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/PKG-INFO +69 -12
  2. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/README.md +68 -11
  3. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/pyproject.toml +1 -1
  4. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/__init__.py +1 -1
  5. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/dependency_graph.py +27 -15
  6. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/ranking.py +90 -0
  7. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/application/pack_service.py +16 -2
  8. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/benchmark.py +342 -4
  9. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/context_pack.py +11 -1
  10. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/mcp_server.py +71 -16
  11. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/.gitignore +0 -0
  12. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/LICENSE +0 -0
  13. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/adapters/__init__.py +0 -0
  14. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/adapters/antigravity.py +0 -0
  15. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/adapters/base.py +0 -0
  16. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/adapters/claude.py +0 -0
  17. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/adapters/codex.py +0 -0
  18. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/adapters/cursor.py +0 -0
  19. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/adapters/detect.py +0 -0
  20. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/adapters/generic.py +0 -0
  21. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/adapters/windsurf.py +0 -0
  22. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/__init__.py +0 -0
  23. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/go_imports.py +0 -0
  24. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/java_imports.py +0 -0
  25. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/js_ts_imports.py +0 -0
  26. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/monorepo.py +0 -0
  27. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/python_imports.py +0 -0
  28. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/repo_map.py +0 -0
  29. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/rust_imports.py +0 -0
  30. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/symbols.py +0 -0
  31. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/task_classifier.py +0 -0
  32. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/analysis/tests.py +0 -0
  33. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/application/__init__.py +0 -0
  34. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/cli.py +0 -0
  35. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/__init__.py +0 -0
  36. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/_shared.py +0 -0
  37. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/claude_cmd.py +0 -0
  38. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/diff.py +0 -0
  39. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/doctor.py +0 -0
  40. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/explain.py +0 -0
  41. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/hook_cmd.py +0 -0
  42. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/init.py +0 -0
  43. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/install.py +0 -0
  44. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/mcp_cmd.py +0 -0
  45. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/monitor.py +0 -0
  46. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/pack.py +0 -0
  47. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/quickstart.py +0 -0
  48. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/repair.py +0 -0
  49. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/scan.py +0 -0
  50. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/stats.py +0 -0
  51. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/status.py +0 -0
  52. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/summarize.py +0 -0
  53. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/tune.py +0 -0
  54. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/commands/watch.py +0 -0
  55. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/__init__.py +0 -0
  56. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/bootstrap.py +0 -0
  57. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/cache.py +0 -0
  58. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/config.py +0 -0
  59. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/diff.py +0 -0
  60. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/git.py +0 -0
  61. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/git_hooks.py +0 -0
  62. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/global_install.py +0 -0
  63. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/ignore.py +0 -0
  64. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/merkle.py +0 -0
  65. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/models.py +0 -0
  66. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/redactor.py +0 -0
  67. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/scanner.py +0 -0
  68. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/snapshot.py +0 -0
  69. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/token_estimator.py +0 -0
  70. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/core/vscode_tasks.py +0 -0
  71. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/data/agentpack.md +0 -0
  72. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/installers/__init__.py +0 -0
  73. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/installers/antigravity.py +0 -0
  74. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/installers/claude.py +0 -0
  75. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/installers/codex.py +0 -0
  76. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/installers/cursor.py +0 -0
  77. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/installers/windsurf.py +0 -0
  78. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/integrations/__init__.py +0 -0
  79. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/integrations/agents.py +0 -0
  80. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/integrations/git_hooks.py +0 -0
  81. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/integrations/global_install.py +0 -0
  82. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/integrations/vscode_tasks.py +0 -0
  83. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/renderers/__init__.py +0 -0
  84. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/renderers/compact.py +0 -0
  85. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/renderers/markdown.py +0 -0
  86. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/renderers/receipts.py +0 -0
  87. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/session/__init__.py +0 -0
  88. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/session/state.py +0 -0
  89. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/summaries/__init__.py +0 -0
  90. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/summaries/base.py +0 -0
  91. {agentpack_cli-0.2.1 → agentpack_cli-0.2.2}/src/agentpack/summaries/offline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentpack-cli
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Task-aware context packing for AI coding agents — Claude, Cursor, Windsurf, Codex, and Antigravity
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -151,10 +151,19 @@ Use real repo evals instead of trusting compression numbers:
151
151
  ```bash
152
152
  agentpack benchmark --init
153
153
  # add historical tasks and files actually changed
154
- agentpack benchmark --compare --misses
154
+ agentpack benchmark --compare --misses --public-table
155
+ agentpack benchmark --public-repos --prove-targets --misses --public-table
155
156
  agentpack benchmark --results-template
156
157
  ```
157
158
 
159
+ For public proof, use several real repositories or anonymized historical task
160
+ sets and publish the generated table from `benchmarks/results/*-public.md`.
161
+ This repo includes a curated public smoke suite in
162
+ `benchmarks/public-repos.toml`; it evaluates real commits from Pallets Click,
163
+ ItsDangerous, and MarkupSafe by checking out each commit's parent and scoring
164
+ against files actually changed by the commit. Synthetic fixtures are useful
165
+ regression tests, but should not be presented as market proof.
166
+
158
167
  ## Debugging Selection
159
168
 
160
169
  When AgentPack misses a file, the next command should explain the miss:
@@ -170,6 +179,41 @@ agentpack explain --task "fix billing webhook" --budget-plan
170
179
 
171
180
  This is the core reliability loop: pack, measure recall, inspect misses, then tune task wording, `.agentignore`, or scoring weights.
172
181
 
182
+ ## MCP-First Workflow
183
+
184
+ For MCP-capable agents, the preferred workflow is pull-based:
185
+
186
+ 1. Call `start_task(task)` when a new task begins. AgentPack writes `.agentpack/task.md`, packs context, and returns ranked markdown.
187
+ 2. Call `get_context()` when you need the latest cached pack; it tells you if the pack is stale.
188
+ 3. Call `get_delta_context()` after edits or hook hints to see what changed without loading the full pack.
189
+ 4. Call `explain_file(path)` or `get_related_files(path)` when a file looks relevant or suspicious.
190
+
191
+ The CLI remains the setup/debug/release path. MCP is the best interactive path because the agent can ask for only the context it needs instead of relying on one static startup blob.
192
+
193
+ ## Before / After Agent Behavior
194
+
195
+ Without AgentPack:
196
+
197
+ ```text
198
+ User: fix auth token expiry
199
+ Agent: rg "auth"; opens router; opens middleware; opens tests; opens config;
200
+ asks for more files; eventually finds token/session code.
201
+ Cost: repeated repo exploration and many unrelated file reads.
202
+ ```
203
+
204
+ With AgentPack:
205
+
206
+ ```text
207
+ User: fix auth token expiry
208
+ Agent: calls start_task("fix auth token expiry")
209
+ AgentPack: returns ranked files with reasons:
210
+ 1. src/auth/token.py — filename/content match, changed dependency
211
+ 2. src/auth/session.py — related implementation
212
+ 3. tests/test_auth.py — paired test
213
+ Agent: verifies those files, edits, runs tests, checks misses if needed.
214
+ Cost: starts from a measured map, then still verifies source normally.
215
+ ```
216
+
173
217
  ## When it helps
174
218
 
175
219
  | Workflow | Value |
@@ -766,7 +810,8 @@ Register in Claude Code settings (`~/.claude/settings.json`):
766
810
 
767
811
  | Tool | Description |
768
812
  |---|---|
769
- | `pack_context(task, mode, budget, max_tokens)` | Generate a ranked context pack for a task. Returns packed markdown, truncated to `max_tokens` (default 20,000). |
813
+ | `start_task(task, mode, budget, max_tokens)` | Recommended MCP-first entry point. Writes `.agentpack/task.md`, generates a ranked pack, and returns packed markdown. |
814
+ | `pack_context(task, mode, budget, max_tokens)` | Generate a ranked context pack. If `task` is provided, writes it to `.agentpack/task.md`; if omitted, reads `task.md` or infers from git. |
770
815
  | `get_context()` | Return the latest pre-built pack instantly (no repack). Prepends a freshness/staleness header so you know if it's stale. |
771
816
  | `refresh()` | Refresh using the current `task.md` or git-inferred task. |
772
817
  | `explain_file(path, task)` | Show score, inclusion mode, reasons, symbols, imports, and importers for one file. |
@@ -779,7 +824,7 @@ Register in Claude Code settings (`~/.claude/settings.json`):
779
824
  > **Stale context** — repo changed since last pack (generated: ...). Run pack_context() to refresh.
780
825
  ```
781
826
 
782
- **Smart truncation:** `pack_context()` keeps headers intact and trims file content blocks to fit the token budget, appending a note about how many files were omitted.
827
+ **Smart truncation:** `start_task()` and `pack_context()` keep headers intact and trim file content blocks to fit the token budget, appending a note about how many files were omitted.
783
828
 
784
829
  Zero API calls — all analysis is offline. Summary cache keyed by file hash: cold run parallelises AST parsing across CPU cores; warm cache hits are instant.
785
830
 
@@ -831,8 +876,10 @@ agentpack benchmark --init # scaffold .agentpack
831
876
  agentpack benchmark --results-template # scaffold publishable results note
832
877
  agentpack benchmark # run all cases in benchmark.toml
833
878
  agentpack benchmark --sample-fixtures # source checkout demo evals
879
+ agentpack benchmark --public-repos # real public commit evals
834
880
  agentpack benchmark --misses # explain expected-file misses
835
881
  agentpack benchmark --prove-targets # fail if recall/token precision targets miss
882
+ agentpack benchmark --public-table # write benchmarks/results/*-public.md
836
883
  ```
837
884
 
838
885
  Output per case:
@@ -889,6 +936,15 @@ Use `--misses` when recall is low. It prints each expected file that was not sel
889
936
 
890
937
  Use `--prove-targets` in CI or release prep when benchmark cases have `expected_files`. By default it requires average recall >=60% and token precision >=50%; tune with `--min-recall` and `--min-token-precision`.
891
938
 
939
+ Use `--public-repos` from an AgentPack source checkout to run the committed
940
+ real-repo smoke suite:
941
+
942
+ ```bash
943
+ agentpack benchmark --public-repos --prove-targets --misses --public-table
944
+ ```
945
+
946
+ Use `--public-table` after adding real historical tasks to write a publishable Markdown table with per-repo/task recall, token precision, rank@K, pack size, and miss count. This is the recommended artifact for README claims, release notes, and external benchmarks.
947
+
892
948
  Add `task_type` to group results by workflow area. Benchmark summaries report average precision, recall, F1, and token noise by type, so a repo can show "backend-api is good, frontend-web is noisy" instead of hiding that under one aggregate.
893
949
 
894
950
  ---
@@ -1318,7 +1374,7 @@ src/agentpack/
1318
1374
  compact.py # compact protocol format for session context files
1319
1375
  receipts.py # context receipt formatter
1320
1376
 
1321
- mcp_server.py # MCP tools: pack_context, get_context, explain, related, stats, delta
1377
+ mcp_server.py # MCP tools: start_task, pack_context, get_context, explain, related, stats, delta
1322
1378
 
1323
1379
  session/
1324
1380
  state.py # SessionState dataclass + load/save/create/stop helpers
@@ -1356,6 +1412,7 @@ src/agentpack/
1356
1412
  - **Repo maps are first-class context**: `analysis/repo_map.py` builds a compact semantic map before file context, and its token cost is reserved before file selection.
1357
1413
  - **Metrics feed history learning**: selection accuracy records hit/noise paths, token precision, mode counts, and mode tokens. Later packs gently penalize repeated noisy paths unless they are currently changed.
1358
1414
  - **Git history feeds recall**: files that historically changed in the same commits as live changed files receive a small boost, helping related tests, schemas, services, and configs surface without forcing full-content inclusion.
1415
+ - **Second-pass expansion is guarded**: after first scoring, strong seeds can lift two-hop import, reverse-import, config, and related-test neighbors only when they share task or domain signal.
1359
1416
  - **Co-change is guarded by precision history**: one-off co-change neighbors are ignored, and paths repeatedly measured as noise do not get revived by history boosts.
1360
1417
  - **Precision guardrails adapt to bad history**: when summary token precision stays near zero, later packs raise the summary score floor, cap summaries more aggressively, and suppress summaries entirely for no-live-change packs. Weak filename-only matches are also damped unless other signals confirm them.
1361
1418
  - **`AdapterRegistry` maps agent → adapter**: adding a new agent output format requires one entry in `AdapterRegistry.get()`, not changes to `PackService`.
@@ -1364,7 +1421,7 @@ src/agentpack/
1364
1421
  - **`integrations/` vs `core/`**: git hooks, shell rc patching, and VS Code tasks are infrastructure concerns — they live in `integrations/`, not `core/`. `core/` is pure domain logic.
1365
1422
  - **Adapters render; installers configure**: `adapters/` knows how to write a context file for an agent. `installers/` knows how to configure the agent's tool (CLAUDE.md, .cursorrules, settings.json). They are separate concerns and separate classes.
1366
1423
  - **Agent integration contract is shared**: `integrations/agents.py` defines install, audit, and repair behavior for Claude, Cursor, Windsurf, Codex, Antigravity, and Generic. `install`, `repair`, `doctor --agent all`, and release verification use the same contract.
1367
- - **MCP and hooks use deltas when possible**: MCP exposes `get_delta_context()`, and prompt hooks can emit task/top-file/delta hints instead of injecting the full context every time.
1424
+ - **MCP is the interactive path**: `start_task()` writes task state and returns a fresh pack, while `get_context()`, `get_delta_context()`, `explain_file()`, and `get_related_files()` let agents pull follow-up context on demand.
1368
1425
 
1369
1426
  ---
1370
1427
 
@@ -1383,7 +1440,7 @@ src/agentpack/
1383
1440
 
1384
1441
  - **Windows**: not supported. Git hooks use POSIX shell (`#!/bin/sh`, `>/dev/null 2>&1 &`). The Claude Code session hooks use `python3` and `rm -f`. Contributions welcome.
1385
1442
  - **Monorepos**: workspace-aware ranking supports npm/pnpm, Cargo, and `go.work` layouts. `--workspace` creates filtered per-workspace outputs. Package dependency hints currently come from npm/pnpm `package.json`; Cargo/Go workspace membership is detected, but package-manager dependency edges for Cargo/Go are not yet modeled.
1386
- - **Public benchmark proof**: source-checkout fixture results are useful regressions, not market proof. Use `agentpack benchmark --results-template` to publish real historical task results.
1443
+ - **Public benchmark proof**: `benchmarks/public-repos.toml` is a curated smoke suite over real public commits, and `benchmarks/results/2026-05-15-public.md` records the current proof run. Treat it as a floor, not a leaderboard; expand cases before broad external claims.
1387
1444
  - **Symbol extraction**: Python (AST, full) and JavaScript/TypeScript (regex, arrow functions + classes) are well-supported. Go, Rust, Java, Kotlin have import graph traversal but no symbol extraction — they fall back to file-level summaries.
1388
1445
  - **Selection recall**: ranking is heuristic. It can miss files when task language differs from code language, when repos have unusual architecture, or when important files are only connected at runtime.
1389
1446
  - **Secret redaction**: covers AWS keys, GitHub tokens, OpenAI/Anthropic keys, JWTs, and private key blocks. Not a substitute for a dedicated secrets scanner on sensitive repos.
@@ -1394,12 +1451,12 @@ src/agentpack/
1394
1451
 
1395
1452
  ## Roadmap
1396
1453
 
1397
- Next release target: **0.3.0 = public benchmark expansion + npm publish hardening**.
1454
+ Next release target: **0.3.0 = public proof + npm publish hardening**.
1398
1455
 
1399
- - Expand public source-checkout fixtures and publish reproducible `benchmark --sample-fixtures --compare --misses` output.
1400
- - Raise recall on real historical tasks while keeping token precision healthy; target 60%+ recall, 50%+ token precision, and balanced packs under 25k tokens.
1401
- - Improve second-pass expansion beyond current imports, reverse imports, related tests, historical co-change, and workspace hints with framework route/service/schema pairs.
1402
- - Make MCP pull flows more prominent so agents can ask for `explain_file`, `get_related_files`, and `get_delta_context` instead of relying only on a static startup pack.
1456
+ - Expand the public real-repo suite beyond the current curated Pallets smoke set.
1457
+ - Keep recall gains measured with `--prove-targets`; target 60%+ recall, 50%+ token precision, and task packs under 25k tokens.
1458
+ - Extend second-pass expansion with framework route/service/schema pairs once benchmark misses prove the pattern.
1459
+ - Make npm publishing reliable by adding `NPM_TOKEN` and rerunning the npm release workflow.
1403
1460
  - Keep integration contracts stable across Claude, Cursor, Windsurf, Codex, Antigravity, and Generic before any 1.0 work.
1404
1461
 
1405
1462
  ---
@@ -112,10 +112,19 @@ Use real repo evals instead of trusting compression numbers:
112
112
  ```bash
113
113
  agentpack benchmark --init
114
114
  # add historical tasks and files actually changed
115
- agentpack benchmark --compare --misses
115
+ agentpack benchmark --compare --misses --public-table
116
+ agentpack benchmark --public-repos --prove-targets --misses --public-table
116
117
  agentpack benchmark --results-template
117
118
  ```
118
119
 
120
+ For public proof, use several real repositories or anonymized historical task
121
+ sets and publish the generated table from `benchmarks/results/*-public.md`.
122
+ This repo includes a curated public smoke suite in
123
+ `benchmarks/public-repos.toml`; it evaluates real commits from Pallets Click,
124
+ ItsDangerous, and MarkupSafe by checking out each commit's parent and scoring
125
+ against files actually changed by the commit. Synthetic fixtures are useful
126
+ regression tests, but should not be presented as market proof.
127
+
119
128
  ## Debugging Selection
120
129
 
121
130
  When AgentPack misses a file, the next command should explain the miss:
@@ -131,6 +140,41 @@ agentpack explain --task "fix billing webhook" --budget-plan
131
140
 
132
141
  This is the core reliability loop: pack, measure recall, inspect misses, then tune task wording, `.agentignore`, or scoring weights.
133
142
 
143
+ ## MCP-First Workflow
144
+
145
+ For MCP-capable agents, the preferred workflow is pull-based:
146
+
147
+ 1. Call `start_task(task)` when a new task begins. AgentPack writes `.agentpack/task.md`, packs context, and returns ranked markdown.
148
+ 2. Call `get_context()` when you need the latest cached pack; it tells you if the pack is stale.
149
+ 3. Call `get_delta_context()` after edits or hook hints to see what changed without loading the full pack.
150
+ 4. Call `explain_file(path)` or `get_related_files(path)` when a file looks relevant or suspicious.
151
+
152
+ The CLI remains the setup/debug/release path. MCP is the best interactive path because the agent can ask for only the context it needs instead of relying on one static startup blob.
153
+
154
+ ## Before / After Agent Behavior
155
+
156
+ Without AgentPack:
157
+
158
+ ```text
159
+ User: fix auth token expiry
160
+ Agent: rg "auth"; opens router; opens middleware; opens tests; opens config;
161
+ asks for more files; eventually finds token/session code.
162
+ Cost: repeated repo exploration and many unrelated file reads.
163
+ ```
164
+
165
+ With AgentPack:
166
+
167
+ ```text
168
+ User: fix auth token expiry
169
+ Agent: calls start_task("fix auth token expiry")
170
+ AgentPack: returns ranked files with reasons:
171
+ 1. src/auth/token.py — filename/content match, changed dependency
172
+ 2. src/auth/session.py — related implementation
173
+ 3. tests/test_auth.py — paired test
174
+ Agent: verifies those files, edits, runs tests, checks misses if needed.
175
+ Cost: starts from a measured map, then still verifies source normally.
176
+ ```
177
+
134
178
  ## When it helps
135
179
 
136
180
  | Workflow | Value |
@@ -727,7 +771,8 @@ Register in Claude Code settings (`~/.claude/settings.json`):
727
771
 
728
772
  | Tool | Description |
729
773
  |---|---|
730
- | `pack_context(task, mode, budget, max_tokens)` | Generate a ranked context pack for a task. Returns packed markdown, truncated to `max_tokens` (default 20,000). |
774
+ | `start_task(task, mode, budget, max_tokens)` | Recommended MCP-first entry point. Writes `.agentpack/task.md`, generates a ranked pack, and returns packed markdown. |
775
+ | `pack_context(task, mode, budget, max_tokens)` | Generate a ranked context pack. If `task` is provided, writes it to `.agentpack/task.md`; if omitted, reads `task.md` or infers from git. |
731
776
  | `get_context()` | Return the latest pre-built pack instantly (no repack). Prepends a freshness/staleness header so you know if it's stale. |
732
777
  | `refresh()` | Refresh using the current `task.md` or git-inferred task. |
733
778
  | `explain_file(path, task)` | Show score, inclusion mode, reasons, symbols, imports, and importers for one file. |
@@ -740,7 +785,7 @@ Register in Claude Code settings (`~/.claude/settings.json`):
740
785
  > **Stale context** — repo changed since last pack (generated: ...). Run pack_context() to refresh.
741
786
  ```
742
787
 
743
- **Smart truncation:** `pack_context()` keeps headers intact and trims file content blocks to fit the token budget, appending a note about how many files were omitted.
788
+ **Smart truncation:** `start_task()` and `pack_context()` keep headers intact and trim file content blocks to fit the token budget, appending a note about how many files were omitted.
744
789
 
745
790
  Zero API calls — all analysis is offline. Summary cache keyed by file hash: cold run parallelises AST parsing across CPU cores; warm cache hits are instant.
746
791
 
@@ -792,8 +837,10 @@ agentpack benchmark --init # scaffold .agentpack
792
837
  agentpack benchmark --results-template # scaffold publishable results note
793
838
  agentpack benchmark # run all cases in benchmark.toml
794
839
  agentpack benchmark --sample-fixtures # source checkout demo evals
840
+ agentpack benchmark --public-repos # real public commit evals
795
841
  agentpack benchmark --misses # explain expected-file misses
796
842
  agentpack benchmark --prove-targets # fail if recall/token precision targets miss
843
+ agentpack benchmark --public-table # write benchmarks/results/*-public.md
797
844
  ```
798
845
 
799
846
  Output per case:
@@ -850,6 +897,15 @@ Use `--misses` when recall is low. It prints each expected file that was not sel
850
897
 
851
898
  Use `--prove-targets` in CI or release prep when benchmark cases have `expected_files`. By default it requires average recall >=60% and token precision >=50%; tune with `--min-recall` and `--min-token-precision`.
852
899
 
900
+ Use `--public-repos` from an AgentPack source checkout to run the committed
901
+ real-repo smoke suite:
902
+
903
+ ```bash
904
+ agentpack benchmark --public-repos --prove-targets --misses --public-table
905
+ ```
906
+
907
+ Use `--public-table` after adding real historical tasks to write a publishable Markdown table with per-repo/task recall, token precision, rank@K, pack size, and miss count. This is the recommended artifact for README claims, release notes, and external benchmarks.
908
+
853
909
  Add `task_type` to group results by workflow area. Benchmark summaries report average precision, recall, F1, and token noise by type, so a repo can show "backend-api is good, frontend-web is noisy" instead of hiding that under one aggregate.
854
910
 
855
911
  ---
@@ -1279,7 +1335,7 @@ src/agentpack/
1279
1335
  compact.py # compact protocol format for session context files
1280
1336
  receipts.py # context receipt formatter
1281
1337
 
1282
- mcp_server.py # MCP tools: pack_context, get_context, explain, related, stats, delta
1338
+ mcp_server.py # MCP tools: start_task, pack_context, get_context, explain, related, stats, delta
1283
1339
 
1284
1340
  session/
1285
1341
  state.py # SessionState dataclass + load/save/create/stop helpers
@@ -1317,6 +1373,7 @@ src/agentpack/
1317
1373
  - **Repo maps are first-class context**: `analysis/repo_map.py` builds a compact semantic map before file context, and its token cost is reserved before file selection.
1318
1374
  - **Metrics feed history learning**: selection accuracy records hit/noise paths, token precision, mode counts, and mode tokens. Later packs gently penalize repeated noisy paths unless they are currently changed.
1319
1375
  - **Git history feeds recall**: files that historically changed in the same commits as live changed files receive a small boost, helping related tests, schemas, services, and configs surface without forcing full-content inclusion.
1376
+ - **Second-pass expansion is guarded**: after first scoring, strong seeds can lift two-hop import, reverse-import, config, and related-test neighbors only when they share task or domain signal.
1320
1377
  - **Co-change is guarded by precision history**: one-off co-change neighbors are ignored, and paths repeatedly measured as noise do not get revived by history boosts.
1321
1378
  - **Precision guardrails adapt to bad history**: when summary token precision stays near zero, later packs raise the summary score floor, cap summaries more aggressively, and suppress summaries entirely for no-live-change packs. Weak filename-only matches are also damped unless other signals confirm them.
1322
1379
  - **`AdapterRegistry` maps agent → adapter**: adding a new agent output format requires one entry in `AdapterRegistry.get()`, not changes to `PackService`.
@@ -1325,7 +1382,7 @@ src/agentpack/
1325
1382
  - **`integrations/` vs `core/`**: git hooks, shell rc patching, and VS Code tasks are infrastructure concerns — they live in `integrations/`, not `core/`. `core/` is pure domain logic.
1326
1383
  - **Adapters render; installers configure**: `adapters/` knows how to write a context file for an agent. `installers/` knows how to configure the agent's tool (CLAUDE.md, .cursorrules, settings.json). They are separate concerns and separate classes.
1327
1384
  - **Agent integration contract is shared**: `integrations/agents.py` defines install, audit, and repair behavior for Claude, Cursor, Windsurf, Codex, Antigravity, and Generic. `install`, `repair`, `doctor --agent all`, and release verification use the same contract.
1328
- - **MCP and hooks use deltas when possible**: MCP exposes `get_delta_context()`, and prompt hooks can emit task/top-file/delta hints instead of injecting the full context every time.
1385
+ - **MCP is the interactive path**: `start_task()` writes task state and returns a fresh pack, while `get_context()`, `get_delta_context()`, `explain_file()`, and `get_related_files()` let agents pull follow-up context on demand.
1329
1386
 
1330
1387
  ---
1331
1388
 
@@ -1344,7 +1401,7 @@ src/agentpack/
1344
1401
 
1345
1402
  - **Windows**: not supported. Git hooks use POSIX shell (`#!/bin/sh`, `>/dev/null 2>&1 &`). The Claude Code session hooks use `python3` and `rm -f`. Contributions welcome.
1346
1403
  - **Monorepos**: workspace-aware ranking supports npm/pnpm, Cargo, and `go.work` layouts. `--workspace` creates filtered per-workspace outputs. Package dependency hints currently come from npm/pnpm `package.json`; Cargo/Go workspace membership is detected, but package-manager dependency edges for Cargo/Go are not yet modeled.
1347
- - **Public benchmark proof**: source-checkout fixture results are useful regressions, not market proof. Use `agentpack benchmark --results-template` to publish real historical task results.
1404
+ - **Public benchmark proof**: `benchmarks/public-repos.toml` is a curated smoke suite over real public commits, and `benchmarks/results/2026-05-15-public.md` records the current proof run. Treat it as a floor, not a leaderboard; expand the set of cases before making broad external claims.
1348
1405
  - **Symbol extraction**: Python (AST, full) and JavaScript/TypeScript (regex, arrow functions + classes) are well-supported. Go, Rust, Java, Kotlin have import graph traversal but no symbol extraction — they fall back to file-level summaries.
1349
1406
  - **Selection recall**: ranking is heuristic. It can miss files when task language differs from code language, when repos have unusual architecture, or when important files are only connected at runtime.
1350
1407
  - **Secret redaction**: covers AWS keys, GitHub tokens, OpenAI/Anthropic keys, JWTs, and private key blocks. Not a substitute for a dedicated secrets scanner on sensitive repos.
@@ -1355,12 +1412,12 @@ src/agentpack/
1355
1412
 
1356
1413
  ## Roadmap
1357
1414
 
1358
- Next release target: **0.3.0 = public benchmark expansion + npm publish hardening**.
1415
+ Next release target: **0.3.0 = public proof + npm publish hardening**.
1359
1416
 
1360
- - Expand public source-checkout fixtures and publish reproducible `benchmark --sample-fixtures --compare --misses` output.
1361
- - Raise recall on real historical tasks while keeping token precision healthy; target 60%+ recall, 50%+ token precision, and balanced packs under 25k tokens.
1362
- - Improve second-pass expansion beyond current imports, reverse imports, related tests, historical co-change, and workspace hints with framework route/service/schema pairs.
1363
- - Make MCP pull flows more prominent so agents can ask for `explain_file`, `get_related_files`, and `get_delta_context` instead of relying only on a static startup pack.
1417
+ - Expand the public real-repo suite beyond the current curated Pallets smoke set.
1418
+ - Keep recall gains measured with `--prove-targets`; target 60%+ recall, 50%+ token precision, and task packs under 25k tokens.
1419
+ - Extend second-pass expansion with framework route/service/schema pairs once benchmark misses prove the pattern.
1420
+ - Make npm publishing reliable by adding `NPM_TOKEN` and rerunning the npm release workflow.
1364
1421
  - Keep integration contracts stable across Claude, Cursor, Windsurf, Codex, Antigravity, and Generic before any 1.0 work.
1365
1422
 
1366
1423
  ---
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "agentpack-cli"
3
- version = "0.2.1"
3
+ version = "0.2.2"
4
4
  description = "Task-aware context packing for AI coding agents — Claude, Cursor, Windsurf, Codex, and Antigravity"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -1,3 +1,3 @@
1
1
  """AgentPack — task-aware context packing for AI coding agents."""
2
2
 
3
- __version__ = "0.2.1"
3
+ __version__ = "0.2.2"
@@ -37,8 +37,9 @@ def build(
37
37
  if summaries and fi.path in summaries:
38
38
  cached_imports = summaries[fi.path].get("imports", [])
39
39
  if cached_imports:
40
- graph.nodes[fi.path].imports = cached_imports
41
- for dep in cached_imports:
40
+ resolved_cached = _resolve_imports(fi.path, fi.language, cached_imports, root, path_set)
41
+ graph.nodes[fi.path].imports = resolved_cached
42
+ for dep in resolved_cached:
42
43
  if dep in graph:
43
44
  graph.nodes[dep].imported_by.append(fi.path)
44
45
  continue
@@ -58,19 +59,7 @@ def build(
58
59
  elif lang in ("java", "kotlin"):
59
60
  raw_imports = java_imports(fi.abs_path, cached)
60
61
 
61
- resolved: list[str] = []
62
- for imp in raw_imports:
63
- if imp.startswith("."):
64
- if lang == "python":
65
- r = py_resolve(fi.path, imp, root)
66
- elif lang in ("javascript", "typescript"):
67
- r = js_resolve(fi.path, imp, root)
68
- else:
69
- r = None
70
- if r and r in path_set:
71
- resolved.append(r)
72
- else:
73
- resolved.append(imp)
62
+ resolved = _resolve_imports(fi.path, lang, raw_imports, root, path_set)
74
63
 
75
64
  graph.nodes[fi.path].imports = resolved
76
65
  for dep in resolved:
@@ -78,3 +67,26 @@ def build(
78
67
  graph.nodes[dep].imported_by.append(fi.path)
79
68
 
80
69
  return graph
70
+
71
+
72
+ def _resolve_imports(
73
+ importer: str,
74
+ language: str | None,
75
+ imports: list[str],
76
+ root: Path,
77
+ path_set: set[str],
78
+ ) -> list[str]:
79
+ resolved: list[str] = []
80
+ for imp in imports:
81
+ if imp.startswith("."):
82
+ if language == "python":
83
+ r = py_resolve(importer, imp, root)
84
+ elif language in ("javascript", "typescript"):
85
+ r = js_resolve(importer, imp, root)
86
+ else:
87
+ r = None
88
+ if r and r in path_set:
89
+ resolved.append(r)
90
+ else:
91
+ resolved.append(imp)
92
+ return resolved
@@ -695,6 +695,96 @@ def boost_recall_neighbors(
695
695
  return result
696
696
 
697
697
 
698
+ def boost_second_pass_expansion(
699
+ scored: list[tuple[FileInfo, float, list[str]]],
700
+ dep_graph: DependencyGraph,
701
+ keywords: set[str] | dict[str, float],
702
+ weights: ScoringWeights | None = None,
703
+ *,
704
+ seed_limit: int = 10,
705
+ max_boosts: int = 32,
706
+ ) -> list[tuple[FileInfo, float, list[str]]]:
707
+ """Boost guarded two-hop neighbours around strong first-pass seeds.
708
+
709
+ This is deliberately conservative: it only boosts files that are close to a
710
+ strong seed and share task/domain signal, are paired tests, or are config
711
+ files. That raises recall for adjacent implementation files without turning
712
+ broad task wording into repo-wide expansion.
713
+ """
714
+ if not scored:
715
+ return scored
716
+ w = weights or _DEFAULT_WEIGHTS
717
+ path_map = {fi.path: (fi, score, reasons) for fi, score, reasons in scored}
718
+ keyword_tokens = set(_keyword_token_weights(keywords)) - _PATH_NOISE_TOKENS
719
+
720
+ seed_paths = [
721
+ fi.path
722
+ for fi, score, reasons in sorted(scored, key=lambda row: row[1], reverse=True)
723
+ if score >= 100
724
+ or any(
725
+ reason.startswith((
726
+ "modified",
727
+ "staged",
728
+ "workspace match",
729
+ "cross-layer related",
730
+ "recall neighbor",
731
+ "historically co-changed",
732
+ ))
733
+ for reason in reasons
734
+ )
735
+ ][:seed_limit]
736
+ if not seed_paths:
737
+ return scored
738
+
739
+ boosts: dict[str, tuple[float, str, str]] = {}
740
+
741
+ def neighbours(path: str) -> set[str]:
742
+ node = dep_graph.get(path)
743
+ return {p for p in (*node.imports, *node.imported_by, *node.tests) if p in path_map and p != path}
744
+
745
+ for seed in seed_paths:
746
+ seed_domains = _domain_tokens(seed) | keyword_tokens
747
+ first_hop = neighbours(seed)
748
+ second_hop = {candidate for hop in first_hop for candidate in neighbours(hop)}
749
+ for candidate in sorted(second_hop - {seed} - first_hop):
750
+ fi, _score, _reasons = path_map[candidate]
751
+ if fi.ignored or fi.binary:
752
+ continue
753
+ candidate_domains = _domain_tokens(candidate)
754
+ is_test_pair = _is_test_file(candidate) and (
755
+ any(_test_matches_source(candidate, hop) for hop in first_hop | {seed})
756
+ or any(_test_matches_source(candidate, hop) for hop in seed_domains)
757
+ )
758
+ has_domain_signal = bool(candidate_domains & seed_domains)
759
+ has_config_signal = _is_config_file(candidate) and bool(seed_domains)
760
+ if not (is_test_pair or has_domain_signal or has_config_signal):
761
+ continue
762
+ amount = w.recall_neighbor * 0.5
763
+ label = "second-pass related test" if is_test_pair else "second-pass recall neighbor"
764
+ if has_domain_signal:
765
+ amount += 4
766
+ current = boosts.get(candidate)
767
+ if current is None or amount > current[0]:
768
+ boosts[candidate] = (amount, seed, label)
769
+
770
+ if not boosts:
771
+ return scored
772
+ keep = {
773
+ path: value
774
+ for path, value in sorted(boosts.items(), key=lambda item: item[1][0], reverse=True)[:max_boosts]
775
+ }
776
+
777
+ result: list[tuple[FileInfo, float, list[str]]] = []
778
+ for fi, score, reasons in scored:
779
+ boost = keep.get(fi.path)
780
+ if boost:
781
+ amount, seed, label = boost
782
+ score += amount
783
+ reasons = reasons + [f"{label} of {seed}"]
784
+ result.append((fi, score, reasons))
785
+ return result
786
+
787
+
698
788
  def boost_monorepo_workspaces(
699
789
  scored: list[tuple[FileInfo, float, list[str]]],
700
790
  *,
@@ -25,6 +25,7 @@ from agentpack.analysis.ranking import (
25
25
  boost_cross_layer_related,
26
26
  boost_monorepo_workspaces,
27
27
  boost_recall_neighbors,
28
+ boost_second_pass_expansion,
28
29
  generic_task_term_ratio,
29
30
  )
30
31
  from agentpack.analysis.repo_map import build_repo_map
@@ -217,6 +218,7 @@ class FileRanker:
217
218
  weights=cfg.scoring,
218
219
  )
219
220
  scored = boost_recall_neighbors(scored, dep_graph, changes.all_changed, weights=cfg.scoring)
221
+ scored = boost_second_pass_expansion(scored, dep_graph, keyword_weights, weights=cfg.scoring)
220
222
  scored = boost_cross_layer_related(scored, keyword_weights, weights=cfg.scoring)
221
223
  scored = boost_paired_tests(scored, weights=cfg.scoring)
222
224
  if root is not None:
@@ -314,7 +316,12 @@ class PackPlanner:
314
316
  root, cfg, request.mode, rank_result.generic_ratio, no_live_changes=not changes.all_changed
315
317
  ),
316
318
  max_summary_files=_guarded_summary_cap(
317
- root, cfg, request.mode, rank_result.generic_ratio, no_live_changes=not changes.all_changed
319
+ root,
320
+ cfg,
321
+ request.mode,
322
+ rank_result.generic_ratio,
323
+ no_live_changes=not changes.all_changed,
324
+ effective_budget=effective_budget,
318
325
  ),
319
326
  )
320
327
  phase_times["select"] = time.perf_counter() - t0
@@ -768,12 +775,19 @@ def _guarded_summary_cap(
768
775
  generic_ratio: float = 0.0,
769
776
  *,
770
777
  no_live_changes: bool = False,
778
+ effective_budget: int = 0,
771
779
  ) -> int:
772
780
  cap = _summary_cap_for_mode(cfg, mode, generic_ratio)
781
+ if no_live_changes and effective_budget and effective_budget <= 2500 and cap > 0:
782
+ cap = min(cap, 4 if mode == "minimal" else 6)
773
783
  avg_summary_precision, rows = _recent_summary_token_precision(root)
774
784
  if rows < 3:
775
785
  if no_live_changes and cap > 0:
776
- return min(cap, 8)
786
+ if effective_budget and effective_budget <= 2500:
787
+ return min(cap, 4 if mode == "minimal" else 6)
788
+ if effective_budget and effective_budget <= 6000:
789
+ return min(cap, 12 if mode == "minimal" else 16)
790
+ return min(cap, 16)
777
791
  return cap
778
792
  if avg_summary_precision <= 0.05:
779
793
  if no_live_changes: