agentscamp 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/README.md +3 -3
  2. package/content/commands/add-caching.md +79 -0
  3. package/content/commands/audit-accessibility.md +101 -0
  4. package/content/commands/clean-branches.md +113 -0
  5. package/content/commands/review-tests.md +98 -0
  6. package/content/commands/scaffold-github-action.md +94 -0
  7. package/content/commands/setup-precommit-hooks.md +72 -0
  8. package/content/commands/write-design-doc.md +78 -0
  9. package/content/manifest.json +425 -3
  10. package/content/skills/agent-trajectory-evaluator.md +59 -0
  11. package/content/skills/alerting-rules-tuner.md +49 -0
  12. package/content/skills/canary-release-planner.md +35 -0
  13. package/content/skills/cold-start-optimizer.md +83 -0
  14. package/content/skills/connection-pool-tuner.md +46 -0
  15. package/content/skills/contract-test-designer.md +70 -0
  16. package/content/skills/dependency-upgrade-planner.md +42 -0
  17. package/content/skills/devcontainer-designer.md +40 -0
  18. package/content/skills/distributed-tracing-instrumenter.md +42 -0
  19. package/content/skills/idempotency-designer.md +47 -0
  20. package/content/skills/memory-leak-hunter.md +35 -0
  21. package/content/skills/mutation-test-runner.md +64 -0
  22. package/content/skills/pagination-designer.md +51 -0
  23. package/content/skills/property-test-designer.md +63 -0
  24. package/content/skills/query-plan-analyzer.md +49 -0
  25. package/content/skills/runbook-writer.md +83 -0
  26. package/content/skills/security-headers-hardener.md +79 -0
  27. package/content/skills/semantic-cache-designer.md +40 -0
  28. package/content/skills/slo-definer.md +38 -0
  29. package/content/skills/strangler-fig-migrator.md +47 -0
  30. package/content/skills/structured-logging-designer.md +42 -0
  31. package/content/skills/threat-model-builder.md +46 -0
  32. package/content/skills/token-usage-profiler.md +39 -0
  33. package/package.json +1 -1
@@ -1,10 +1,10 @@
1
1
  {
2
2
  "schemaVersion": 1,
3
- "generatedAt": "2026-06-18T01:57:52.358Z",
3
+ "generatedAt": "2026-06-18T02:36:19.351Z",
4
4
  "counts": {
5
5
  "agents": 58,
6
- "skills": 52,
7
- "commands": 43
6
+ "skills": 75,
7
+ "commands": 50
8
8
  },
9
9
  "items": [
10
10
  {
@@ -886,6 +886,20 @@
886
886
  "installAs": "agents/workflow-orchestrator.md",
887
887
  "url": "https://agentscamp.com/agents/meta-orchestration/workflow-orchestrator"
888
888
  },
889
+ {
890
+ "id": "commands/add-caching",
891
+ "type": "command",
892
+ "slug": "add-caching",
893
+ "category": "perf",
894
+ "title": "Add Caching",
895
+ "description": "Add a caching layer to one expensive function or endpoint correctly — confirm it's cacheable, design the cache key/TTL/layer/invalidation, handle stampedes, wrap the call in one place, and report the design.",
896
+ "topics": [
897
+ "devops-infra"
898
+ ],
899
+ "file": "commands/add-caching.md",
900
+ "installAs": "commands/add-caching.md",
901
+ "url": "https://agentscamp.com/commands/perf/add-caching"
902
+ },
889
903
  {
890
904
  "id": "commands/add-docstrings",
891
905
  "type": "command",
@@ -946,6 +960,20 @@
946
960
  "installAs": "commands/add-streaming-endpoint.md",
947
961
  "url": "https://agentscamp.com/commands/scaffold/add-streaming-endpoint"
948
962
  },
963
+ {
964
+ "id": "commands/audit-accessibility",
965
+ "type": "command",
966
+ "slug": "audit-accessibility",
967
+ "category": "analyze",
968
+ "title": "Audit Accessibility",
969
+ "description": "Audit a component or page for accessibility against WCAG — semantics, names, keyboard, ARIA, contrast, forms, motion.",
970
+ "topics": [
971
+ "review-qa"
972
+ ],
973
+ "file": "commands/audit-accessibility.md",
974
+ "installAs": "commands/audit-accessibility.md",
975
+ "url": "https://agentscamp.com/commands/analyze/audit-accessibility"
976
+ },
949
977
  {
950
978
  "id": "commands/benchmark-rerankers",
951
979
  "type": "command",
@@ -975,6 +1003,20 @@
975
1003
  "installAs": "commands/breakdown-task.md",
976
1004
  "url": "https://agentscamp.com/commands/plan/breakdown-task"
977
1005
  },
1006
+ {
1007
+ "id": "commands/clean-branches",
1008
+ "type": "command",
1009
+ "slug": "clean-branches",
1010
+ "category": "git",
1011
+ "title": "Clean Branches",
1012
+ "description": "Safely prune merged and stale Git branches: drop dead remote-tracking refs, list merged candidates for review, then delete with the safe -d variant.",
1013
+ "topics": [
1014
+ "review-qa"
1015
+ ],
1016
+ "file": "commands/clean-branches.md",
1017
+ "installAs": "commands/clean-branches.md",
1018
+ "url": "https://agentscamp.com/commands/git/clean-branches"
1019
+ },
978
1020
  {
979
1021
  "id": "commands/commit",
980
1022
  "type": "command",
@@ -1317,6 +1359,20 @@
1317
1359
  "installAs": "commands/review-pr.md",
1318
1360
  "url": "https://agentscamp.com/commands/review/review-pr"
1319
1361
  },
1362
+ {
1363
+ "id": "commands/review-tests",
1364
+ "type": "command",
1365
+ "slug": "review-tests",
1366
+ "category": "review",
1367
+ "title": "Review Tests",
1368
+ "description": "Review the quality of a test suite, not just whether it passes — find weak assertions, missing edge cases, and tests coupled to implementation.",
1369
+ "topics": [
1370
+ "review-qa"
1371
+ ],
1372
+ "file": "commands/review-tests.md",
1373
+ "installAs": "commands/review-tests.md",
1374
+ "url": "https://agentscamp.com/commands/review/review-tests"
1375
+ },
1320
1376
  {
1321
1377
  "id": "commands/run-evals",
1322
1378
  "type": "command",
@@ -1346,6 +1402,20 @@
1346
1402
  "installAs": "commands/scaffold-dockerfile.md",
1347
1403
  "url": "https://agentscamp.com/commands/scaffold/scaffold-dockerfile"
1348
1404
  },
1405
+ {
1406
+ "id": "commands/scaffold-github-action",
1407
+ "type": "command",
1408
+ "slug": "scaffold-github-action",
1409
+ "category": "scaffold",
1410
+ "title": "Scaffold GitHub Action",
1411
+ "description": "Scaffold a hardened GitHub Actions workflow for a stated goal, wired to the project's real test/lint/build commands.",
1412
+ "topics": [
1413
+ "devops-infra"
1414
+ ],
1415
+ "file": "commands/scaffold-github-action.md",
1416
+ "installAs": "commands/scaffold-github-action.md",
1417
+ "url": "https://agentscamp.com/commands/scaffold/scaffold-github-action"
1418
+ },
1349
1419
  {
1350
1420
  "id": "commands/scaffold-pgvector-schema",
1351
1421
  "type": "command",
@@ -1450,6 +1520,20 @@
1450
1520
  "installAs": "commands/setup-claude-ci.md",
1451
1521
  "url": "https://agentscamp.com/commands/workflow/setup-claude-ci"
1452
1522
  },
1523
+ {
1524
+ "id": "commands/setup-precommit-hooks",
1525
+ "type": "command",
1526
+ "slug": "setup-precommit-hooks",
1527
+ "category": "workflow",
1528
+ "title": "Setup Pre-commit Hooks",
1529
+ "description": "Set up fast pre-commit hooks that catch problems before they land — detect the repo's existing stack and hook mechanism, run lint/format/typecheck plus a secret scan on staged files only, keep the slow test suite in CI, and make the setup reproducible for the whole team.",
1530
+ "topics": [
1531
+ "devops-infra"
1532
+ ],
1533
+ "file": "commands/setup-precommit-hooks.md",
1534
+ "installAs": "commands/setup-precommit-hooks.md",
1535
+ "url": "https://agentscamp.com/commands/workflow/setup-precommit-hooks"
1536
+ },
1453
1537
  {
1454
1538
  "id": "commands/sync-branch",
1455
1539
  "type": "command",
@@ -1493,6 +1577,21 @@
1493
1577
  "installAs": "commands/update-readme.md",
1494
1578
  "url": "https://agentscamp.com/commands/docs/update-readme"
1495
1579
  },
1580
+ {
1581
+ "id": "commands/write-design-doc",
1582
+ "type": "command",
1583
+ "slug": "write-design-doc",
1584
+ "category": "plan",
1585
+ "title": "Write Design Doc",
1586
+ "description": "Explore the codebase and write a decision-oriented design doc / RFC for a feature or system change.",
1587
+ "topics": [
1588
+ "architecture",
1589
+ "workflow-prompting"
1590
+ ],
1591
+ "file": "commands/write-design-doc.md",
1592
+ "installAs": "commands/write-design-doc.md",
1593
+ "url": "https://agentscamp.com/commands/plan/write-design-doc"
1594
+ },
1496
1595
  {
1497
1596
  "id": "commands/write-tests",
1498
1597
  "type": "command",
@@ -1536,6 +1635,35 @@
1536
1635
  "installAs": "skills/agent-memory-designer/SKILL.md",
1537
1636
  "url": "https://agentscamp.com/skills/workflow/agent-memory-designer"
1538
1637
  },
1638
+ {
1639
+ "id": "skills/agent-trajectory-evaluator",
1640
+ "type": "skill",
1641
+ "slug": "agent-trajectory-evaluator",
1642
+ "category": "data",
1643
+ "title": "Agent Trajectory Evaluator",
1644
+ "description": "Evaluate a multi-step AI agent's whole run — tool calls, intermediate steps, and final result — not just final-answer correctness, so you can pinpoint WHERE it went wrong. Use when building or debugging a tool-using or multi-step agent, when final-answer-only evals can't explain failures, or when a prompt/model change quietly makes the agent less efficient or more error-prone even though the answer still looks right.",
1645
+ "topics": [
1646
+ "llm-evals",
1647
+ "ai-agents-systems"
1648
+ ],
1649
+ "file": "skills/agent-trajectory-evaluator.md",
1650
+ "installAs": "skills/agent-trajectory-evaluator/SKILL.md",
1651
+ "url": "https://agentscamp.com/skills/data/agent-trajectory-evaluator"
1652
+ },
1653
+ {
1654
+ "id": "skills/alerting-rules-tuner",
1655
+ "type": "skill",
1656
+ "slug": "alerting-rules-tuner",
1657
+ "category": "observability",
1658
+ "title": "Alerting Rules Tuner",
1659
+ "description": "Cut alert noise and make every page mean something — rewrite alerting rules to fire on user-felt symptoms (error rate, latency SLO burn, failed requests) instead of causes (high CPU, full disk), with duration windows and severity routing so only urgent, actionable conditions reach a human. Use when on-call is fatigued by low-value pages, when real incidents get missed in the noise, or when alerts fire on causes rather than impact.",
1660
+ "topics": [
1661
+ "devops-infra"
1662
+ ],
1663
+ "file": "skills/alerting-rules-tuner.md",
1664
+ "installAs": "skills/alerting-rules-tuner/SKILL.md",
1665
+ "url": "https://agentscamp.com/skills/observability/alerting-rules-tuner"
1666
+ },
1539
1667
  {
1540
1668
  "id": "skills/architecture-diagram-generator",
1541
1669
  "type": "skill",
@@ -1592,6 +1720,20 @@
1592
1720
  "installAs": "skills/bundle-analyzer/SKILL.md",
1593
1721
  "url": "https://agentscamp.com/skills/performance/bundle-analyzer"
1594
1722
  },
1723
+ {
1724
+ "id": "skills/canary-release-planner",
1725
+ "type": "skill",
1726
+ "slug": "canary-release-planner",
1727
+ "category": "release",
1728
+ "title": "Canary Release Planner",
1729
+ "description": "Design a canary / progressive rollout so a bad release reaches 1% of users instead of 100% — staged traffic with bake times, gating metrics compared against the concurrently-running stable baseline, and automated promote-or-rollback. Use when shipping a risky change, when you want automatic rollback on regression, or when moving off all-at-once deploys.",
1730
+ "topics": [
1731
+ "devops-infra"
1732
+ ],
1733
+ "file": "skills/canary-release-planner.md",
1734
+ "installAs": "skills/canary-release-planner/SKILL.md",
1735
+ "url": "https://agentscamp.com/skills/release/canary-release-planner"
1736
+ },
1595
1737
  {
1596
1738
  "id": "skills/changelog-from-prs",
1597
1739
  "type": "skill",
@@ -1634,6 +1776,48 @@
1634
1776
  "installAs": "skills/claude-settings-auditor/SKILL.md",
1635
1777
  "url": "https://agentscamp.com/skills/workflow/claude-settings-auditor"
1636
1778
  },
1779
+ {
1780
+ "id": "skills/cold-start-optimizer",
1781
+ "type": "skill",
1782
+ "slug": "cold-start-optimizer",
1783
+ "category": "performance",
1784
+ "title": "Cold Start Optimizer",
1785
+ "description": "Cut cold-start latency for serverless functions and slow-booting apps by measuring the init breakdown, then attacking the dominant phase — artifact size, eager imports, eager connections, or under-provisioned memory — instead of reflexively buying provisioned concurrency. Use when serverless p99 spikes on the first request, when a function times out during init, or when scale-to-zero is hurting user-facing latency.",
1786
+ "topics": [
1787
+ "devops-infra"
1788
+ ],
1789
+ "file": "skills/cold-start-optimizer.md",
1790
+ "installAs": "skills/cold-start-optimizer/SKILL.md",
1791
+ "url": "https://agentscamp.com/skills/performance/cold-start-optimizer"
1792
+ },
1793
+ {
1794
+ "id": "skills/connection-pool-tuner",
1795
+ "type": "skill",
1796
+ "slug": "connection-pool-tuner",
1797
+ "category": "database",
1798
+ "title": "Connection Pool Tuner",
1799
+ "description": "Size and tune a database connection pool from the real constraint — the database's shared max_connections and its core count — so total connections (per-instance pool × instance count) stay safely under the cap and a too-large pool stops adding latency. Use when the app throws 'too many connections' or pool-acquire timeouts, when the DB is saturated by connection count, or when deploying to serverless.",
1800
+ "topics": [
1801
+ "devops-infra"
1802
+ ],
1803
+ "file": "skills/connection-pool-tuner.md",
1804
+ "installAs": "skills/connection-pool-tuner/SKILL.md",
1805
+ "url": "https://agentscamp.com/skills/database/connection-pool-tuner"
1806
+ },
1807
+ {
1808
+ "id": "skills/contract-test-designer",
1809
+ "type": "skill",
1810
+ "slug": "contract-test-designer",
1811
+ "category": "testing",
1812
+ "title": "Contract Test Designer",
1813
+ "description": "Design consumer-driven contract tests between services so an API provider can't break its consumers unnoticed — without slow, flaky full end-to-end environments. Use when independent services or teams integrate over an API, when integration bugs only surface in staging or prod, or when E2E suites are too slow and brittle to catch breaking API changes.",
1814
+ "topics": [
1815
+ "review-qa"
1816
+ ],
1817
+ "file": "skills/contract-test-designer.md",
1818
+ "installAs": "skills/contract-test-designer/SKILL.md",
1819
+ "url": "https://agentscamp.com/skills/testing/contract-test-designer"
1820
+ },
1637
1821
  {
1638
1822
  "id": "skills/conventional-commits",
1639
1823
  "type": "skill",
@@ -1690,6 +1874,48 @@
1690
1874
  "installAs": "skills/dependency-audit/SKILL.md",
1691
1875
  "url": "https://agentscamp.com/skills/security/dependency-audit"
1692
1876
  },
1877
+ {
1878
+ "id": "skills/dependency-upgrade-planner",
1879
+ "type": "skill",
1880
+ "slug": "dependency-upgrade-planner",
1881
+ "category": "refactor",
1882
+ "title": "Dependency Upgrade Planner",
1883
+ "description": "Plan and de-risk a major dependency, framework, or runtime upgrade — map the full version path, read every intermediate migration guide, and pin the breaking changes to your actual call sites instead of bumping the number and hoping. Use when a key dependency is several majors behind, when a security advisory forces an upgrade, or before a framework migration.",
1884
+ "topics": [
1885
+ "devops-infra"
1886
+ ],
1887
+ "file": "skills/dependency-upgrade-planner.md",
1888
+ "installAs": "skills/dependency-upgrade-planner/SKILL.md",
1889
+ "url": "https://agentscamp.com/skills/refactor/dependency-upgrade-planner"
1890
+ },
1891
+ {
1892
+ "id": "skills/devcontainer-designer",
1893
+ "type": "skill",
1894
+ "slug": "devcontainer-designer",
1895
+ "category": "workflow",
1896
+ "title": "Dev Container Designer",
1897
+ "description": "Design a reproducible dev environment (Dev Container / Docker) so onboarding is one command and 'works on my machine' dies — by detecting the project's real stack and versions, authoring a devcontainer.json (+ Dockerfile/compose) that pins the runtime to what the repo targets, wires dependent services, caches dependencies, and injects secrets instead of baking them. Use when new contributors struggle to set up the project, when environment drift causes inconsistent behavior, or when standardizing tooling across a team.",
1898
+ "topics": [
1899
+ "devops-infra"
1900
+ ],
1901
+ "file": "skills/devcontainer-designer.md",
1902
+ "installAs": "skills/devcontainer-designer/SKILL.md",
1903
+ "url": "https://agentscamp.com/skills/workflow/devcontainer-designer"
1904
+ },
1905
+ {
1906
+ "id": "skills/distributed-tracing-instrumenter",
1907
+ "type": "skill",
1908
+ "slug": "distributed-tracing-instrumenter",
1909
+ "category": "observability",
1910
+ "title": "Distributed Tracing Instrumenter",
1911
+ "description": "Instrument a service (or a chain of services) with OpenTelemetry so a single request can be followed end-to-end — context propagated across every hop including async/queue boundaries, spans at the boundaries that matter, deliberate trace-wide sampling, and trace_id stamped on log lines. Use when latency or failures span multiple services, when you have logs but can't reconstruct a request's full path, or when adopting OpenTelemetry.",
1912
+ "topics": [
1913
+ "devops-infra"
1914
+ ],
1915
+ "file": "skills/distributed-tracing-instrumenter.md",
1916
+ "installAs": "skills/distributed-tracing-instrumenter/SKILL.md",
1917
+ "url": "https://agentscamp.com/skills/observability/distributed-tracing-instrumenter"
1918
+ },
1693
1919
  {
1694
1920
  "id": "skills/embedding-index-tuner",
1695
1921
  "type": "skill",
@@ -1804,6 +2030,20 @@
1804
2030
  "installAs": "skills/human-in-the-loop-gate/SKILL.md",
1805
2031
  "url": "https://agentscamp.com/skills/workflow/human-in-the-loop-gate"
1806
2032
  },
2033
+ {
2034
+ "id": "skills/idempotency-designer",
2035
+ "type": "skill",
2036
+ "slug": "idempotency-designer",
2037
+ "category": "api",
2038
+ "title": "Idempotency Designer",
2039
+ "description": "Make unsafe, retryable API operations idempotent so a client retry or a network hiccup can't double-charge, double-create, or double-send — design a client-supplied idempotency key, an atomic store-and-check (unique constraint or conditional write), in-flight conflict handling, and a retention policy. Use when a POST/mutation can be retried (payments, order creation, sends, webhooks), or when duplicate side effects have already shown up in production.",
2040
+ "topics": [
2041
+ "architecture"
2042
+ ],
2043
+ "file": "skills/idempotency-designer.md",
2044
+ "installAs": "skills/idempotency-designer/SKILL.md",
2045
+ "url": "https://agentscamp.com/skills/api/idempotency-designer"
2046
+ },
1807
2047
  {
1808
2048
  "id": "skills/llm-as-judge-scorer",
1809
2049
  "type": "skill",
@@ -1889,6 +2129,20 @@
1889
2129
  "installAs": "skills/mcp-server-scaffolder/SKILL.md",
1890
2130
  "url": "https://agentscamp.com/skills/api/mcp-server-scaffolder"
1891
2131
  },
2132
+ {
2133
+ "id": "skills/memory-leak-hunter",
2134
+ "type": "skill",
2135
+ "slug": "memory-leak-hunter",
2136
+ "category": "performance",
2137
+ "title": "Memory Leak Hunter",
2138
+ "description": "Find and fix a memory leak in a running app: confirm it's a real leak under steady load, diff two heap snapshots to name the growing object and its retention path, cut the root reference that blocks collection, and re-run to confirm memory plateaus. Use when RSS climbs until OOM/restart, heap grows unbounded across a steady workload, or GC pauses worsen the longer the process runs.",
2139
+ "topics": [
2140
+ "review-qa"
2141
+ ],
2142
+ "file": "skills/memory-leak-hunter.md",
2143
+ "installAs": "skills/memory-leak-hunter/SKILL.md",
2144
+ "url": "https://agentscamp.com/skills/performance/memory-leak-hunter"
2145
+ },
1892
2146
  {
1893
2147
  "id": "skills/migration-writer",
1894
2148
  "type": "skill",
@@ -1932,6 +2186,20 @@
1932
2186
  "installAs": "skills/multimodal-document-extractor/SKILL.md",
1933
2187
  "url": "https://agentscamp.com/skills/data/multimodal-document-extractor"
1934
2188
  },
2189
+ {
2190
+ "id": "skills/mutation-test-runner",
2191
+ "type": "skill",
2192
+ "slug": "mutation-test-runner",
2193
+ "category": "testing",
2194
+ "title": "Mutation Test Runner",
2195
+ "description": "Measure whether a test suite actually catches bugs by running mutation testing — introduce small faults into the code and check which ones a test kills versus which slip through silently. Use when line coverage is high but bugs still ship, when you suspect tests assert weakly, or to find the exact assertions a suite is missing.",
2196
+ "topics": [
2197
+ "review-qa"
2198
+ ],
2199
+ "file": "skills/mutation-test-runner.md",
2200
+ "installAs": "skills/mutation-test-runner/SKILL.md",
2201
+ "url": "https://agentscamp.com/skills/testing/mutation-test-runner"
2202
+ },
1935
2203
  {
1936
2204
  "id": "skills/openapi-doc-writer",
1937
2205
  "type": "skill",
@@ -1946,6 +2214,20 @@
1946
2214
  "installAs": "skills/openapi-doc-writer/SKILL.md",
1947
2215
  "url": "https://agentscamp.com/skills/docs/openapi-doc-writer"
1948
2216
  },
2217
+ {
2218
+ "id": "skills/pagination-designer",
2219
+ "type": "skill",
2220
+ "slug": "pagination-designer",
2221
+ "category": "api",
2222
+ "title": "Pagination Designer",
2223
+ "description": "Design correct, scalable pagination (plus the filtering and sorting that ride with it) for a list endpoint — pick cursor (keyset) vs offset and justify it, define an opaque cursor with a unique tiebreaker so no row is skipped or repeated, return a consistent envelope, bound page size, and name the indexes the sort actually needs. Use when adding a list endpoint, when OFFSET pagination crawls on a large table, or when clients see duplicate or missing rows while paging.",
2224
+ "topics": [
2225
+ "architecture"
2226
+ ],
2227
+ "file": "skills/pagination-designer.md",
2228
+ "installAs": "skills/pagination-designer/SKILL.md",
2229
+ "url": "https://agentscamp.com/skills/api/pagination-designer"
2230
+ },
1949
2231
  {
1950
2232
  "id": "skills/plugin-scaffolder",
1951
2233
  "type": "skill",
@@ -2045,6 +2327,20 @@
2045
2327
  "installAs": "skills/prompt-regression-tester/SKILL.md",
2046
2328
  "url": "https://agentscamp.com/skills/data/prompt-regression-tester"
2047
2329
  },
2330
+ {
2331
+ "id": "skills/property-test-designer",
2332
+ "type": "skill",
2333
+ "slug": "property-test-designer",
2334
+ "category": "testing",
2335
+ "title": "Property Test Designer",
2336
+ "description": "Design property-based tests — generate hundreds of random inputs and assert invariants that must hold for ALL of them — to surface the edge cases hand-picked examples never reach. Use when code has a large input space (parsers, serializers, encoders, math, data transforms), when a bug keeps slipping through despite green example tests, or when you can't enumerate every case worth checking.",
2337
+ "topics": [
2338
+ "review-qa"
2339
+ ],
2340
+ "file": "skills/property-test-designer.md",
2341
+ "installAs": "skills/property-test-designer/SKILL.md",
2342
+ "url": "https://agentscamp.com/skills/testing/property-test-designer"
2343
+ },
2048
2344
  {
2049
2345
  "id": "skills/provider-fallback-wrapper",
2050
2346
  "type": "skill",
@@ -2073,6 +2369,20 @@
2073
2369
  "installAs": "skills/qlora-finetune-runner/SKILL.md",
2074
2370
  "url": "https://agentscamp.com/skills/data/qlora-finetune-runner"
2075
2371
  },
2372
+ {
2373
+ "id": "skills/query-plan-analyzer",
2374
+ "type": "skill",
2375
+ "slug": "query-plan-analyzer",
2376
+ "category": "database",
2377
+ "title": "Query Plan Analyzer",
2378
+ "description": "Read a slow query's execution plan and turn it into a concrete fix — the exact index to add, the rewrite, or the ANALYZE to run — by getting the REAL plan with EXPLAIN ANALYZE (actual rows + timing, not estimates), finding the offending node, and confirming the fix removes it. Use when one specific query is slow and you need to know WHY, not just that it is.",
2379
+ "topics": [
2380
+ "devops-infra"
2381
+ ],
2382
+ "file": "skills/query-plan-analyzer.md",
2383
+ "installAs": "skills/query-plan-analyzer/SKILL.md",
2384
+ "url": "https://agentscamp.com/skills/database/query-plan-analyzer"
2385
+ },
2076
2386
  {
2077
2387
  "id": "skills/rate-limiter-designer",
2078
2388
  "type": "skill",
@@ -2117,6 +2427,20 @@
2117
2427
  "installAs": "skills/readme-generator/SKILL.md",
2118
2428
  "url": "https://agentscamp.com/skills/docs/readme-generator"
2119
2429
  },
2430
+ {
2431
+ "id": "skills/runbook-writer",
2432
+ "type": "skill",
2433
+ "slug": "runbook-writer",
2434
+ "category": "docs",
2435
+ "title": "Runbook Writer",
2436
+ "description": "Write an operational runbook a half-asleep on-call engineer can execute at 3am — scoped to ONE alert, leading with how to confirm the problem, the copy-pasteable mitigation that stops user pain, then diagnosis, escalation, and verification. Use when an alert has no documented response, after an incident exposed a missing procedure, or when standing up on-call for a service.",
2437
+ "topics": [
2438
+ "devops-infra"
2439
+ ],
2440
+ "file": "skills/runbook-writer.md",
2441
+ "installAs": "skills/runbook-writer/SKILL.md",
2442
+ "url": "https://agentscamp.com/skills/docs/runbook-writer"
2443
+ },
2120
2444
  {
2121
2445
  "id": "skills/secret-scanner",
2122
2446
  "type": "skill",
@@ -2131,6 +2455,34 @@
2131
2455
  "installAs": "skills/secret-scanner/SKILL.md",
2132
2456
  "url": "https://agentscamp.com/skills/security/secret-scanner"
2133
2457
  },
2458
+ {
2459
+ "id": "skills/security-headers-hardener",
2460
+ "type": "skill",
2461
+ "slug": "security-headers-hardener",
2462
+ "category": "security",
2463
+ "title": "Security Headers Hardener",
2464
+ "description": "Audit and harden a web app's or API's HTTP security headers — Content-Security-Policy, HSTS, X-Content-Type-Options, frame-ancestors, Referrer-Policy, Permissions-Policy, and CORS — and produce a staged rollout that won't break the site. Use before a launch, during a security pass, or when a scanner (Mozilla Observatory, securityheaders.com, a pentest) flags missing or weak headers. Audits and edits header config; rolls CSP out Report-Only first.",
2465
+ "topics": [
2466
+ "review-qa"
2467
+ ],
2468
+ "file": "skills/security-headers-hardener.md",
2469
+ "installAs": "skills/security-headers-hardener/SKILL.md",
2470
+ "url": "https://agentscamp.com/skills/security/security-headers-hardener"
2471
+ },
2472
+ {
2473
+ "id": "skills/semantic-cache-designer",
2474
+ "type": "skill",
2475
+ "slug": "semantic-cache-designer",
2476
+ "category": "data",
2477
+ "title": "Semantic Cache Designer",
2478
+ "description": "Design a semantic cache for LLM responses — serve a cached answer when a new query is similar enough to a past one — to cut cost and latency on repetitive traffic, with the similarity threshold calibrated on real query pairs and a cache key that prevents cross-user/model leaks. Use when an LLM app sees many near-duplicate prompts (FAQs, support, search), when token spend on repetitive queries is high, or when latency on common questions matters.",
2479
+ "topics": [
2480
+ "llm-app-dev"
2481
+ ],
2482
+ "file": "skills/semantic-cache-designer.md",
2483
+ "installAs": "skills/semantic-cache-designer/SKILL.md",
2484
+ "url": "https://agentscamp.com/skills/data/semantic-cache-designer"
2485
+ },
2134
2486
  {
2135
2487
  "id": "skills/semver-advisor",
2136
2488
  "type": "skill",
@@ -2145,6 +2497,20 @@
2145
2497
  "installAs": "skills/semver-advisor/SKILL.md",
2146
2498
  "url": "https://agentscamp.com/skills/release/semver-advisor"
2147
2499
  },
2500
+ {
2501
+ "id": "skills/slo-definer",
2502
+ "type": "skill",
2503
+ "slug": "slo-definer",
2504
+ "category": "observability",
2505
+ "title": "SLO Definer",
2506
+ "description": "Turn a vague reliability goal into concrete SLIs, SLOs, an error budget, and burn-rate alerts — service-level indicators measured at the user-facing boundary, targets over a rolling window, and a written policy for what happens when the budget runs out. Use when a service has no defined reliability target, when on-call is noisy and alert-fatigued, or before you commit to an SLA you can't measure.",
2507
+ "topics": [
2508
+ "devops-infra"
2509
+ ],
2510
+ "file": "skills/slo-definer.md",
2511
+ "installAs": "skills/slo-definer/SKILL.md",
2512
+ "url": "https://agentscamp.com/skills/observability/slo-definer"
2513
+ },
2148
2514
  {
2149
2515
  "id": "skills/sql-optimizer",
2150
2516
  "type": "skill",
@@ -2159,6 +2525,34 @@
2159
2525
  "installAs": "skills/sql-optimizer/SKILL.md",
2160
2526
  "url": "https://agentscamp.com/skills/data/sql-optimizer"
2161
2527
  },
2528
+ {
2529
+ "id": "skills/strangler-fig-migrator",
2530
+ "type": "skill",
2531
+ "slug": "strangler-fig-migrator",
2532
+ "category": "refactor",
2533
+ "title": "Strangler Fig Migrator",
2534
+ "description": "Plan the incremental replacement of a legacy module or service using the strangler-fig pattern — grow new code around the old behind an interception seam until the old is dead, instead of a big-bang rewrite. Use when a legacy system is too risky to rewrite at once, or when migrating off a deprecated framework/dependency gradually while staying shippable and rollback-able at every step.",
2535
+ "topics": [
2536
+ "architecture"
2537
+ ],
2538
+ "file": "skills/strangler-fig-migrator.md",
2539
+ "installAs": "skills/strangler-fig-migrator/SKILL.md",
2540
+ "url": "https://agentscamp.com/skills/refactor/strangler-fig-migrator"
2541
+ },
2542
+ {
2543
+ "id": "skills/structured-logging-designer",
2544
+ "type": "skill",
2545
+ "slug": "structured-logging-designer",
2546
+ "category": "observability",
2547
+ "title": "Structured Logging Designer",
2548
+ "description": "Design a structured (JSON) logging strategy with a stable field schema, correlation-ID propagation, and a disciplined level policy — then migrate ad-hoc string logs toward it. Use when logs are unsearchable plain text, when debugging a request across services means grepping multiple log streams by hand, or when standing up logging for a new service.",
2549
+ "topics": [
2550
+ "devops-infra"
2551
+ ],
2552
+ "file": "skills/structured-logging-designer.md",
2553
+ "installAs": "skills/structured-logging-designer/SKILL.md",
2554
+ "url": "https://agentscamp.com/skills/observability/structured-logging-designer"
2555
+ },
2162
2556
  {
2163
2557
  "id": "skills/test-scaffolder",
2164
2558
  "type": "skill",
@@ -2173,6 +2567,34 @@
2173
2567
  "installAs": "skills/test-scaffolder/SKILL.md",
2174
2568
  "url": "https://agentscamp.com/skills/testing/test-scaffolder"
2175
2569
  },
2570
+ {
2571
+ "id": "skills/threat-model-builder",
2572
+ "type": "skill",
2573
+ "slug": "threat-model-builder",
2574
+ "category": "security",
2575
+ "title": "Threat Model Builder",
2576
+ "description": "Build a practical threat model for a feature or system using STRIDE — diagram the data flow, mark trust boundaries, enumerate concrete threats where data crosses them, and prioritize by likelihood × impact so security is reasoned about before shipping instead of bolted on after. Use when designing a feature that touches auth, money, or sensitive data, running a security design review, or hardening before a launch.",
2577
+ "topics": [
2578
+ "review-qa"
2579
+ ],
2580
+ "file": "skills/threat-model-builder.md",
2581
+ "installAs": "skills/threat-model-builder/SKILL.md",
2582
+ "url": "https://agentscamp.com/skills/security/threat-model-builder"
2583
+ },
2584
+ {
2585
+ "id": "skills/token-usage-profiler",
2586
+ "type": "skill",
2587
+ "slug": "token-usage-profiler",
2588
+ "category": "data",
2589
+ "title": "Token Usage Profiler",
2590
+ "description": "Measure and attribute LLM token usage and cost across an app — input vs output tokens by feature, route, model, and tenant — then rank the waste and the specific lever to cut it. Use when LLM spend is high or climbing with no clear cause, before scaling a feature that calls a model, or when you need per-feature or per-tenant cost attribution for billing or budgets.",
2591
+ "topics": [
2592
+ "llm-app-dev"
2593
+ ],
2594
+ "file": "skills/token-usage-profiler.md",
2595
+ "installAs": "skills/token-usage-profiler/SKILL.md",
2596
+ "url": "https://agentscamp.com/skills/data/token-usage-profiler"
2597
+ },
2176
2598
  {
2177
2599
  "id": "skills/tool-definition-generator",
2178
2600
  "type": "skill",
@@ -0,0 +1,59 @@
1
+ ---
2
+ name: "agent-trajectory-evaluator"
3
+ description: "Evaluate a multi-step AI agent's whole run — tool calls, intermediate steps, and final result — not just final-answer correctness, so you can pinpoint WHERE it went wrong. Use when building or debugging a tool-using or multi-step agent, when final-answer-only evals can't explain failures, or when a prompt/model change quietly makes the agent less efficient or more error-prone even though the answer still looks right."
4
+ allowed-tools: "Read, Grep, Glob, Bash"
5
+ version: 1.0.0
6
+ ---
7
+
8
+ Final-answer evals tell you the agent failed; they don't tell you *where*. An agent that returns the right number might have called the wrong tool first, looped on a flaky API, or stumbled into the answer through a path that collapses on the next input. This skill makes the agent's **process** inspectable: capture the full trajectory — every decision, tool call, argument, and result — then score it on the axes that actually predict failure, asserting what's checkable and judging only what isn't.
9
+
10
+ ## When to use this skill
11
+ - You're building or debugging a tool-using / multi-step agent and a final-answer eval says "wrong" without saying why.
12
+ - A prompt or model change kept the answers correct but you suspect the agent got slower, looped more, or recovers worse — and you need to prove it.
13
+ - You're adding a new tool and want to confirm the agent selects it correctly instead of brute-forcing with the old one.
14
+ - Failures are intermittent and you can't tell whether the agent is fragile (lucky path) or robust (sound path).
15
+
16
+ ## Instructions
17
+
18
+ 1. **Capture the full trajectory as a structured, replayable log — one record per step.** Final-answer-only logging is the root cause of un-diagnosable failures. Each step records: the model's decision (the assistant turn, including thinking-block summaries if present), the tool called and its exact arguments, the raw tool result (success/error), and any externalized state (files written, working dir, retry count). Use a stable schema so two runs diff cleanly:
19
+ ```json
20
+ {"run_id": "...", "task_id": "...", "step": 3,
21
+ "decision": "call search_orders to find the open order",
22
+ "tool": "search_orders", "args": {"customer_id": "C-118", "status": "open"},
23
+ "result": {"ok": true, "rows": 2}, "is_error": false,
24
+ "latency_ms": 410, "state": {"retries": 0}}
25
+ ```
26
+ Pull this from your agent loop's tool-call records (or the Managed Agents event stream: `agent.tool_use` / `agent.tool_result` / `agent.custom_tool_use` events carry tool name, input, and result). Persist trajectories to disk so a baseline run is a diffable artifact, not a console scroll-by.
27
+
28
+ 2. **Build a fixed, version-controlled eval set of representative tasks — and deliberately include trap tasks.** A good set has three buckets: (a) routine tasks the agent should handle cleanly, (b) tasks that *require* tool use (the answer isn't in the prompt, so the agent must select and call the right tool), and (c) tasks engineered to trip a known failure mode — a tool that returns an error on the first call (does it recover?), an ambiguous request (does it loop?), a distractor tool that looks relevant but is wrong (does it mis-select?). Pin the set; an eval set that drifts can't catch regressions. Each task carries its own expected-trajectory assertions, including its per-task step budget; step 3 defines the axes they cover and step 4 shows how to write them as code.
29
+
30
+ 3. **Score every trajectory on five axes, not one.** Final-answer correctness is necessary but insufficient. For each task, evaluate:
31
+ - **Tool selection** — did it call the right tool for each sub-goal? (mis-selection often produces a right answer via a wrong, slow path)
32
+ - **Argument correctness** — were the tool arguments right? (a call that should have sent `status: "open"` but sent `status: "all"` can still return the target row by luck)
33
+ - **Step efficiency** — did it stay within a step budget, or did it repeat calls, loop, or take a needless detour? Measure against a per-task budget, not a global one.
34
+ - **Error recovery** — when a tool returned an error, did the agent recover sensibly (retry once, switch approach) or thrash / give up?
35
+ - **Goal completion** — did it actually finish the task, distinct from "the final text looks plausible"?
36
+
37
+ 4. **Split scoring into programmatic assertions and a narrow LLM-judge — assert everything you can.** An LLM-judge over a whole trajectory is noisy and expensive, and it will rationalize a broken path. So check the deterministic axes with code: exact tool-name assertions, argument equality (or schema match), and step-count budgets are all plain comparisons against the trajectory you captured.
38
+ ```python
39
+ tools = [s["tool"] for s in trajectory]
40
+ assert tools[0] == "search_orders", f"wrong first tool: {tools[0]}"
41
+ assert trajectory[0]["args"]["status"] == "open"
42
+ assert len(trajectory) <= task["step_budget"], f"{len(trajectory)} steps > budget"
43
+ assert not any(s["is_error"] for s in trajectory[-2:]), "ended on an error"
44
+ ```
45
+ Reserve the LLM-judge for the genuinely subjective steps only — "was this reasoning step sound given the prior result?", "was this summary faithful to the tool output?" — and judge **one step at a time** with the step's inputs in context, not the entire run. Default both the agent-under-test and the judge to the latest, most capable Claude model (`claude-opus-4-8`); use a *different* sample or framing for the judge so it isn't grading its own twin, and keep the judge's rubric to one criterion per call.
46
+
47
+ 5. **Diff every candidate trajectory against a stored baseline and report the regressions.** This is what catches the silent ones. After a prompt or model change, re-run the fixed eval set and compare trajectory-for-trajectory against the baseline: tools added/removed/reordered, argument changes, step-count delta, new error-recovery loops, latency delta. A change that keeps the final answer correct but adds two steps, introduces a retry loop, or swaps a precise tool for a brute-force one is a **regression** — surface it even though the answer still passes. Promote a candidate to the new baseline only when the diff is empty or every change is reviewed and intended.
48
+
49
+ > [!WARNING]
50
+ > Grading only the final answer hides process failures. An agent can reach the right answer through a path that is broken, expensive, or lucky — wrong tool, redundant loop, a crash it recovered from by chance — and that path will break on the very next input. The final answer being correct is *not* evidence the agent worked correctly.
51
+
52
+ > [!WARNING]
53
+ > An LLM-judge over a whole trajectory is noisy and tends to rationalize whatever path it sees. Assert the checkable steps — tool names, argument values, step counts — with code, and give the judge exactly one subjective step and one criterion at a time. A judge asked "was this whole run good?" will hand-wave; a judge asked "was *this* summary faithful to *this* tool output?" gives a usable signal.
54
+
55
+ ## Output
56
+ - **Trajectory schema** — the per-step record (decision, tool, args, result, is_error, latency, state) and where each field comes from in your agent loop or event stream.
57
+ - **Per-axis rubric** — the five axes (tool selection, argument correctness, step efficiency, error recovery, goal completion) with the concrete check for each task.
58
+ - **Assertion-vs-judge split** — the deterministic assertions written as code, and the short list of subjective steps routed to a single-criterion LLM-judge (agent and judge both on `claude-opus-4-8`).
59
+ - **Baseline-diff regression report** — a per-task diff of the candidate run against the stored baseline (tools reordered/added/removed, arg changes, step-count and latency deltas, new recovery loops), flagging every regression even where the final answer still passes, plus a verdict on whether to promote the candidate to baseline.