@netlify/axis 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230)
  1. package/README.md +977 -0
  2. package/dist/adapters/base/acp-adapter.d.ts +44 -0
  3. package/dist/adapters/base/acp-adapter.d.ts.map +1 -0
  4. package/dist/adapters/base/acp-adapter.js +559 -0
  5. package/dist/adapters/base/acp-adapter.js.map +1 -0
  6. package/dist/adapters/base/agent-adapter.d.ts +132 -0
  7. package/dist/adapters/base/agent-adapter.d.ts.map +1 -0
  8. package/dist/adapters/base/agent-adapter.js +212 -0
  9. package/dist/adapters/base/agent-adapter.js.map +1 -0
  10. package/dist/adapters/claude-code.d.ts +3 -0
  11. package/dist/adapters/claude-code.d.ts.map +1 -0
  12. package/dist/adapters/claude-code.js +138 -0
  13. package/dist/adapters/claude-code.js.map +1 -0
  14. package/dist/adapters/claude-sdk.d.ts +11 -0
  15. package/dist/adapters/claude-sdk.d.ts.map +1 -0
  16. package/dist/adapters/claude-sdk.js +46 -0
  17. package/dist/adapters/claude-sdk.js.map +1 -0
  18. package/dist/adapters/codex.d.ts +3 -0
  19. package/dist/adapters/codex.d.ts.map +1 -0
  20. package/dist/adapters/codex.js +183 -0
  21. package/dist/adapters/codex.js.map +1 -0
  22. package/dist/adapters/gemini-acp.d.ts +11 -0
  23. package/dist/adapters/gemini-acp.d.ts.map +1 -0
  24. package/dist/adapters/gemini-acp.js +60 -0
  25. package/dist/adapters/gemini-acp.js.map +1 -0
  26. package/dist/adapters/gemini.d.ts +3 -0
  27. package/dist/adapters/gemini.d.ts.map +1 -0
  28. package/dist/adapters/gemini.js +222 -0
  29. package/dist/adapters/gemini.js.map +1 -0
  30. package/dist/adapters/goose.d.ts +3 -0
  31. package/dist/adapters/goose.d.ts.map +1 -0
  32. package/dist/adapters/goose.js +9 -0
  33. package/dist/adapters/goose.js.map +1 -0
  34. package/dist/adapters/registry.d.ts +7 -0
  35. package/dist/adapters/registry.d.ts.map +1 -0
  36. package/dist/adapters/registry.js +37 -0
  37. package/dist/adapters/registry.js.map +1 -0
  38. package/dist/adapters/utils/mcp.d.ts +23 -0
  39. package/dist/adapters/utils/mcp.d.ts.map +1 -0
  40. package/dist/adapters/utils/mcp.js +114 -0
  41. package/dist/adapters/utils/mcp.js.map +1 -0
  42. package/dist/adapters/utils/resolve.d.ts +20 -0
  43. package/dist/adapters/utils/resolve.d.ts.map +1 -0
  44. package/dist/adapters/utils/resolve.js +48 -0
  45. package/dist/adapters/utils/resolve.js.map +1 -0
  46. package/dist/adapters/utils/skills.d.ts +17 -0
  47. package/dist/adapters/utils/skills.d.ts.map +1 -0
  48. package/dist/adapters/utils/skills.js +52 -0
  49. package/dist/adapters/utils/skills.js.map +1 -0
  50. package/dist/adapters/utils/token-estimator.d.ts +21 -0
  51. package/dist/adapters/utils/token-estimator.d.ts.map +1 -0
  52. package/dist/adapters/utils/token-estimator.js +37 -0
  53. package/dist/adapters/utils/token-estimator.js.map +1 -0
  54. package/dist/baselines/diff.d.ts +9 -0
  55. package/dist/baselines/diff.d.ts.map +1 -0
  56. package/dist/baselines/diff.js +83 -0
  57. package/dist/baselines/diff.js.map +1 -0
  58. package/dist/baselines/index.d.ts +3 -0
  59. package/dist/baselines/index.d.ts.map +1 -0
  60. package/dist/baselines/index.js +3 -0
  61. package/dist/baselines/index.js.map +1 -0
  62. package/dist/baselines/store.d.ts +19 -0
  63. package/dist/baselines/store.d.ts.map +1 -0
  64. package/dist/baselines/store.js +104 -0
  65. package/dist/baselines/store.js.map +1 -0
  66. package/dist/cli.d.ts +3 -0
  67. package/dist/cli.d.ts.map +1 -0
  68. package/dist/cli.js +487 -0
  69. package/dist/cli.js.map +1 -0
  70. package/dist/config/loader.d.ts +8 -0
  71. package/dist/config/loader.d.ts.map +1 -0
  72. package/dist/config/loader.js +99 -0
  73. package/dist/config/loader.js.map +1 -0
  74. package/dist/config/validator.d.ts +11 -0
  75. package/dist/config/validator.d.ts.map +1 -0
  76. package/dist/config/validator.js +203 -0
  77. package/dist/config/validator.js.map +1 -0
  78. package/dist/docs-site/_astro/cli.DDWZtG0-.css +1 -0
  79. package/dist/docs-site/cli/index.html +18 -0
  80. package/dist/docs-site/configuration/index.html +121 -0
  81. package/dist/docs-site/content-assets.mjs +1 -0
  82. package/dist/docs-site/content-modules.mjs +1 -0
  83. package/dist/docs-site/data-store.json +9 -0
  84. package/dist/docs-site/index.html +69 -0
  85. package/dist/docs-site/quickstart/index.html +59 -0
  86. package/dist/docs-site/running/index.html +87 -0
  87. package/dist/docs-site/scoring/index.html +135 -0
  88. package/dist/index.d.ts +19 -0
  89. package/dist/index.d.ts.map +1 -0
  90. package/dist/index.js +15 -0
  91. package/dist/index.js.map +1 -0
  92. package/dist/report-ui/index.html +291 -0
  93. package/dist/report-ui/mock-data.json +298 -0
  94. package/dist/reports/html.d.ts +7 -0
  95. package/dist/reports/html.d.ts.map +1 -0
  96. package/dist/reports/html.js +27 -0
  97. package/dist/reports/html.js.map +1 -0
  98. package/dist/reports/reader.d.ts +21 -0
  99. package/dist/reports/reader.d.ts.map +1 -0
  100. package/dist/reports/reader.js +110 -0
  101. package/dist/reports/reader.js.map +1 -0
  102. package/dist/reports/writer.d.ts +14 -0
  103. package/dist/reports/writer.d.ts.map +1 -0
  104. package/dist/reports/writer.js +106 -0
  105. package/dist/reports/writer.js.map +1 -0
  106. package/dist/runner/lifecycle.d.ts +10 -0
  107. package/dist/runner/lifecycle.d.ts.map +1 -0
  108. package/dist/runner/lifecycle.js +58 -0
  109. package/dist/runner/lifecycle.js.map +1 -0
  110. package/dist/runner/runner.d.ts +34 -0
  111. package/dist/runner/runner.d.ts.map +1 -0
  112. package/dist/runner/runner.js +330 -0
  113. package/dist/runner/runner.js.map +1 -0
  114. package/dist/scoring/category-score.d.ts +52 -0
  115. package/dist/scoring/category-score.d.ts.map +1 -0
  116. package/dist/scoring/category-score.js +157 -0
  117. package/dist/scoring/category-score.js.map +1 -0
  118. package/dist/scoring/composite.d.ts +5 -0
  119. package/dist/scoring/composite.d.ts.map +1 -0
  120. package/dist/scoring/composite.js +24 -0
  121. package/dist/scoring/composite.js.map +1 -0
  122. package/dist/scoring/deep-eval.d.ts +25 -0
  123. package/dist/scoring/deep-eval.d.ts.map +1 -0
  124. package/dist/scoring/deep-eval.js +382 -0
  125. package/dist/scoring/deep-eval.js.map +1 -0
  126. package/dist/scoring/goal-achievement.d.ts +5 -0
  127. package/dist/scoring/goal-achievement.d.ts.map +1 -0
  128. package/dist/scoring/goal-achievement.js +241 -0
  129. package/dist/scoring/goal-achievement.js.map +1 -0
  130. package/dist/scoring/index.d.ts +22 -0
  131. package/dist/scoring/index.d.ts.map +1 -0
  132. package/dist/scoring/index.js +115 -0
  133. package/dist/scoring/index.js.map +1 -0
  134. package/dist/scoring/parse-json.d.ts +6 -0
  135. package/dist/scoring/parse-json.d.ts.map +1 -0
  136. package/dist/scoring/parse-json.js +18 -0
  137. package/dist/scoring/parse-json.js.map +1 -0
  138. package/dist/scoring/sparse-index.d.ts +15 -0
  139. package/dist/scoring/sparse-index.d.ts.map +1 -0
  140. package/dist/scoring/sparse-index.js +338 -0
  141. package/dist/scoring/sparse-index.js.map +1 -0
  142. package/dist/scoring/triage.d.ts +15 -0
  143. package/dist/scoring/triage.d.ts.map +1 -0
  144. package/dist/scoring/triage.js +204 -0
  145. package/dist/scoring/triage.js.map +1 -0
  146. package/dist/skills/resolver.d.ts +19 -0
  147. package/dist/skills/resolver.d.ts.map +1 -0
  148. package/dist/skills/resolver.js +95 -0
  149. package/dist/skills/resolver.js.map +1 -0
  150. package/dist/transcript/categorize.d.ts +24 -0
  151. package/dist/transcript/categorize.d.ts.map +1 -0
  152. package/dist/transcript/categorize.js +233 -0
  153. package/dist/transcript/categorize.js.map +1 -0
  154. package/dist/transcript/classify.d.ts +7 -0
  155. package/dist/transcript/classify.d.ts.map +1 -0
  156. package/dist/transcript/classify.js +32 -0
  157. package/dist/transcript/classify.js.map +1 -0
  158. package/dist/transcript/extract.d.ts +24 -0
  159. package/dist/transcript/extract.d.ts.map +1 -0
  160. package/dist/transcript/extract.js +266 -0
  161. package/dist/transcript/extract.js.map +1 -0
  162. package/dist/transcript/index.d.ts +3 -0
  163. package/dist/transcript/index.d.ts.map +1 -0
  164. package/dist/transcript/index.js +2 -0
  165. package/dist/transcript/index.js.map +1 -0
  166. package/dist/transcript/normalize.d.ts +15 -0
  167. package/dist/transcript/normalize.d.ts.map +1 -0
  168. package/dist/transcript/normalize.js +160 -0
  169. package/dist/transcript/normalize.js.map +1 -0
  170. package/dist/transcript/types.d.ts +92 -0
  171. package/dist/transcript/types.d.ts.map +1 -0
  172. package/dist/transcript/types.js +2 -0
  173. package/dist/transcript/types.js.map +1 -0
  174. package/dist/transcript/urls.d.ts +10 -0
  175. package/dist/transcript/urls.d.ts.map +1 -0
  176. package/dist/transcript/urls.js +31 -0
  177. package/dist/transcript/urls.js.map +1 -0
  178. package/dist/types/agent.d.ts +80 -0
  179. package/dist/types/agent.d.ts.map +1 -0
  180. package/dist/types/agent.js +2 -0
  181. package/dist/types/agent.js.map +1 -0
  182. package/dist/types/baseline.d.ts +65 -0
  183. package/dist/types/baseline.d.ts.map +1 -0
  184. package/dist/types/baseline.js +2 -0
  185. package/dist/types/baseline.js.map +1 -0
  186. package/dist/types/config.d.ts +76 -0
  187. package/dist/types/config.d.ts.map +1 -0
  188. package/dist/types/config.js +2 -0
  189. package/dist/types/config.js.map +1 -0
  190. package/dist/types/index.d.ts +8 -0
  191. package/dist/types/index.d.ts.map +1 -0
  192. package/dist/types/index.js +8 -0
  193. package/dist/types/index.js.map +1 -0
  194. package/dist/types/output.d.ts +70 -0
  195. package/dist/types/output.d.ts.map +1 -0
  196. package/dist/types/output.js +15 -0
  197. package/dist/types/output.js.map +1 -0
  198. package/dist/types/report.d.ts +37 -0
  199. package/dist/types/report.d.ts.map +1 -0
  200. package/dist/types/report.js +2 -0
  201. package/dist/types/report.js.map +1 -0
  202. package/dist/types/scenario.d.ts +23 -0
  203. package/dist/types/scenario.d.ts.map +1 -0
  204. package/dist/types/scenario.js +2 -0
  205. package/dist/types/scenario.js.map +1 -0
  206. package/dist/types/scoring.d.ts +176 -0
  207. package/dist/types/scoring.d.ts.map +1 -0
  208. package/dist/types/scoring.js +2 -0
  209. package/dist/types/scoring.js.map +1 -0
  210. package/dist/ui/AnimatedTokens.d.ts +29 -0
  211. package/dist/ui/AnimatedTokens.d.ts.map +1 -0
  212. package/dist/ui/AnimatedTokens.js +53 -0
  213. package/dist/ui/AnimatedTokens.js.map +1 -0
  214. package/dist/ui/App.d.ts +6 -0
  215. package/dist/ui/App.d.ts.map +1 -0
  216. package/dist/ui/App.js +16 -0
  217. package/dist/ui/App.js.map +1 -0
  218. package/dist/ui/LiveDuration.d.ts +20 -0
  219. package/dist/ui/LiveDuration.d.ts.map +1 -0
  220. package/dist/ui/LiveDuration.js +31 -0
  221. package/dist/ui/LiveDuration.js.map +1 -0
  222. package/dist/ui/LiveStatus.d.ts +7 -0
  223. package/dist/ui/LiveStatus.d.ts.map +1 -0
  224. package/dist/ui/LiveStatus.js +52 -0
  225. package/dist/ui/LiveStatus.js.map +1 -0
  226. package/dist/ui/format.d.ts +29 -0
  227. package/dist/ui/format.d.ts.map +1 -0
  228. package/dist/ui/format.js +514 -0
  229. package/dist/ui/format.js.map +1 -0
  230. package/package.json +65 -0
@@ -0,0 +1,87 @@
1
+ <!DOCTYPE html><html lang="en"> <head><meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Running Tests - AXIS Docs</title><meta name="description" content="Documentation for AXIS, the Agent eXperience Index Score synthetic testing framework for AI agents."><link rel="stylesheet" href="/_astro/cli.DDWZtG0-.css"></head> <body> <header class="site-header"> <a href="/" class="site-logo" aria-label="AXIS home"> <span class="site-logo-mark"><span class="logo-ax">AX</span><span class="logo-i">I</span>S</span> </a> <div class="site-header-links"> <a href="https://github.com/netlify/axis">GitHub</a> <a href="https://www.npmjs.com/package/@netlify/axis">npm</a> </div> <button class="mobile-menu-btn" id="menu-btn" aria-label="Toggle navigation">&#9776;</button> </header> <div class="site-shell"> <div class="sidebar-backdrop" id="sidebar-backdrop"></div> <aside class="sidebar" id="sidebar"> <nav class="sidebar-nav"> <div class="sidebar-section-label">Getting Started</div> <a href="/" class="sidebar-link">What is AXIS</a> <a href="/quickstart" class="sidebar-link">Quick Start</a> <div class="sidebar-section-label">How It Works</div> <a href="/scoring" class="sidebar-link">Scoring Framework</a> <a href="/running" class="sidebar-link active">Running Tests</a> <div class="sidebar-section-label">Reference</div> <a href="/cli" class="sidebar-link">CLI</a> <a href="/configuration" class="sidebar-link">Configuration</a> </nav> </aside> <main class="main-content"> <h1>Running AXIS Tests</h1> <p class="lead">
2
+ How AXIS executes scenarios, manages agent processes, and produces reports.
3
+ </p> <h2>Execution Model</h2> <p>
4
+ When you run <code>axis run</code>, AXIS loads your config, discovers scenarios, and executes
5
+ each scenario/agent combination as an independent job. Jobs run in parallel up to the configured
6
+ concurrency limit.
7
+ </p> <p>
8
+ Each job follows the same lifecycle:
9
+ </p> <ol> <li>Run setup actions (if defined in the scenario).</li> <li>Spawn the agent process in an isolated workspace.</li> <li>Stream and capture the full interaction transcript.</li> <li>Score the transcript against the rubric (unless <code>--no-score</code> is set).</li> <li>Run teardown actions (if defined).</li> <li>Save the result to the report.</li> </ol> <h2>Built-in Adapters</h2> <p>
10
+ AXIS ships with adapters for popular AI coding agents. Each adapter handles CLI resolution,
11
+ process spawning, transcript capture, and output normalization.
12
+ </p> <table> <thead> <tr> <th>Adapter</th> <th>CLI Binary</th> <th>Required Env</th> <th>Default Flags</th> </tr> </thead> <tbody> <tr> <td><code>claude-code</code></td> <td><code>claude</code></td> <td><code>ANTHROPIC_API_KEY</code></td> <td><code>dangerously-skip-permissions</code></td> </tr> <tr> <td><code>codex</code></td> <td><code>codex</code></td> <td><code>CODEX_API_KEY</code></td> <td><code>full-auto</code>, <code>skip-git-repo-check</code></td> </tr> <tr> <td><code>gemini</code></td> <td><code>gemini</code></td> <td><code>GEMINI_API_KEY</code></td> <td><code>yolo</code></td> </tr> <tr> <td><code>goose</code></td> <td><code>goose</code></td> <td>None</td> <td>None</td> </tr> <tr> <td><code>claude-sdk</code></td> <td>SDK</td> <td><code>ANTHROPIC_API_KEY</code></td> <td>None</td> </tr> <tr> <td><code>gemini-acp</code></td> <td>ACP</td> <td><code>GEMINI_API_KEY</code></td> <td>None</td> </tr> </tbody> </table> <p>
13
+ CLI binaries are resolved automatically. If not found locally, AXIS falls back to
14
+ <code>npx --yes &lt;package&gt;</code> silently.
15
+ </p> <h2>Custom Adapters</h2> <p>
16
+ Create a custom adapter module using <code>createAgentAdapter()</code> and register it in your
17
+ <a href="/configuration">config</a>.
18
+ </p> <pre><code>// adapters/my-agent.ts
19
+ import { createAgentAdapter } from &quot;@netlify/axis&quot;;
20
+
21
+ export default createAgentAdapter&lt;{ stdout: string }&gt;({
22
+ name: &quot;my-agent&quot;,
23
+ resolveCommand: () =&gt; ({ command: &quot;my-cli&quot;, prefixArgs: [] }),
24
+ buildArgs: (input) =&gt; [input.prompt],
25
+ initialState: () =&gt; ({ stdout: &quot;&quot; }),
26
+ streamConfig: {
27
+ mode: &quot;aggregate&quot;,
28
+ onChunk: (chunk, ctx) =&gt; {
29
+ ctx.state.stdout += chunk;
30
+ },
31
+ },
32
+ getResult: (ctx) =&gt; ({
33
+ result: ctx.state.stdout.trim() || null,
34
+ }),
35
+ });</code></pre> <p>
36
+ Adapters support two stream modes: <strong>lines</strong> (NDJSON, one JSON object per stdout
37
+ line) and <strong>aggregate</strong> (raw chunks accumulated in state). The module must export
38
+ an <code>AgentAdapter</code> as default or as a named <code>adapter</code> export.
39
+ </p> <h2>Workspace Isolation</h2> <p>
40
+ Each agent run gets a fresh temporary directory as its workspace. AXIS isolates the following:
41
+ </p> <ul> <li><strong>HOME directory:</strong> Set to the workspace to prevent global config leakage.</li> <li><strong>Adapter-specific dirs:</strong> <code>CLAUDE_CONFIG_DIR</code>, <code>CODEX_HOME</code>, <code>GEMINI_CLI_HOME</code>.</li> <li><strong>Environment variables:</strong> Only explicitly listed vars and system essentials are passed through.</li> </ul> <h2>Reports</h2> <p>
42
+ Every run automatically saves a report to <code>.axis/reports/</code>.
43
+ </p> <pre><code>.axis/reports/{reportId}/
44
+ report.json # Manifest with summary + metadata
45
+ scenarios/{key}/{agent}.json # Full result with transcript
46
+ scenarios/{key}/{agent}.raw.ndjson # Raw stdout (--debug only)
47
+ scenarios/{key}/{agent}.sparse-index.txt # Scoring reference</code></pre> <p>
48
+ Use <code>axis reports</code> to list, view, and export reports. See the
49
+ <a href="/cli">CLI reference</a> for all available options.
50
+ </p> <h2>Baselines</h2> <p>
51
+ Baselines let you snapshot scores and detect regressions in future runs. They are stored in
52
+ <code>.axis/baselines/</code> and designed to be checked into version control.
53
+ </p> <pre><code># 1. Run your scenarios
54
+ axis run
55
+
56
+ # 2. Save the results as a baseline
57
+ axis baseline set
58
+
59
+ # 3. In future runs, compare against the baseline
60
+ axis run --compare-baseline
61
+
62
+ # 4. Or diff explicitly
63
+ axis baseline diff</code></pre> <p>
64
+ Baseline diff uses a noise tolerance of 1 point. Score deltas of 0 to 1 are reported
65
+ as unchanged. The diff command exits with code 1 if any regressions are detected, making it
66
+ suitable for CI gating.
67
+ </p> <h2>CI Integration</h2> <p>
68
+ AXIS is designed to work in CI environments. Key patterns:
69
+ </p> <ul> <li>Use <code>--json</code> for machine-readable output.</li> <li>Use <code>--compare-baseline</code> to gate on regressions (exit code 1).</li> <li>Set <code>--concurrency</code> to control resource usage.</li> <li>Pass API keys via environment variables.</li> </ul> <pre><code># GitHub Actions example
70
+ - name: Run AXIS tests
71
+ env:
72
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
73
+ run: npx @netlify/axis run --json --compare-baseline</code></pre> <footer class="site-footer">
74
+ AXIS is maintained by Netlify.
75
+ </footer> </main> </div> <script>
76
// Mobile navigation: the menu button and the sidebar backdrop both
// open or close the sidebar.
const btn = document.getElementById("menu-btn");
const sidebar = document.getElementById("sidebar");
const backdrop = document.getElementById("sidebar-backdrop");

/** Flip the "open" class on the sidebar and on its backdrop together. */
function toggle() {
  for (const panel of [sidebar, backdrop]) {
    panel.classList.toggle("open");
  }
}

// Clicking either the menu button or the backdrop toggles the sidebar.
for (const trigger of [btn, backdrop]) {
  trigger.addEventListener("click", toggle);
}
87
+ </script> </body> </html>
@@ -0,0 +1,135 @@
1
+ <!DOCTYPE html><html lang="en"> <head><meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Scoring Framework - AXIS Docs</title><meta name="description" content="Documentation for AXIS, the Agent eXperience Index Score synthetic testing framework for AI agents."><link rel="stylesheet" href="/_astro/cli.DDWZtG0-.css"></head> <body> <header class="site-header"> <a href="/" class="site-logo" aria-label="AXIS home"> <span class="site-logo-mark"><span class="logo-ax">AX</span><span class="logo-i">I</span>S</span> </a> <div class="site-header-links"> <a href="https://github.com/netlify/axis">GitHub</a> <a href="https://www.npmjs.com/package/@netlify/axis">npm</a> </div> <button class="mobile-menu-btn" id="menu-btn" aria-label="Toggle navigation">&#9776;</button> </header> <div class="site-shell"> <div class="sidebar-backdrop" id="sidebar-backdrop"></div> <aside class="sidebar" id="sidebar"> <nav class="sidebar-nav"> <div class="sidebar-section-label">Getting Started</div> <a href="/" class="sidebar-link">What is AXIS</a> <a href="/quickstart" class="sidebar-link">Quick Start</a> <div class="sidebar-section-label">How It Works</div> <a href="/scoring" class="sidebar-link active">Scoring Framework</a> <a href="/running" class="sidebar-link">Running Tests</a> <div class="sidebar-section-label">Reference</div> <a href="/cli" class="sidebar-link">CLI</a> <a href="/configuration" class="sidebar-link">Configuration</a> </nav> </aside> <main class="main-content"> <h1>AXIS Scoring Framework</h1> <p class="lead">
2
+ AXIS produces a composite 0 to 100 AXIS Result by evaluating four independent dimensions
3
+ of agent performance. Each dimension captures a different aspect of how the agent interacted
4
+ with your system.
5
+ </p> <h2>The Four Dimensions</h2> <div class="dimension-wheel"> <a href="#agent" class="dimension-label dimension-label-agent" data-segment="agent"> <span class="dimension-pct" style="color: var(--color-agent)">20%</span> <span class="dimension-name">Agent</span> <span class="dimension-desc">Planning, reasoning, self-organization</span> </a> <a href="#goal-achievement" class="dimension-label dimension-label-goal" data-segment="goal"> <span class="dimension-pct" style="color: var(--accent)">40%</span> <span class="dimension-name">Goal Achievement</span> <span class="dimension-desc">Evaluated against your scenario rubric</span> </a> <svg class="dimension-chart" viewBox="0 0 200 200" role="img" aria-label="Scoring dimensions: Goal Achievement 40%, Environment 20%, Service 20%, Agent 20%"> <!-- Goal Achievement — 40%, starts at 12 o'clock --> <circle cx="100" cy="100" r="70" fill="none" stroke="var(--accent)" stroke-width="28" stroke-dasharray="173.7 266.1" stroke-dashoffset="111.1" data-segment="goal"></circle> <!-- Environment — 20% --> <circle cx="100" cy="100" r="70" fill="none" stroke="var(--color-env)" stroke-width="28" stroke-dasharray="85.8 354.0" stroke-dashoffset="375.0" data-segment="env"></circle> <!-- Service — 20% --> <circle cx="100" cy="100" r="70" fill="none" stroke="var(--color-svc)" stroke-width="28" stroke-dasharray="85.8 354.0" stroke-dashoffset="287.0" data-segment="svc"></circle> <!-- Agent — 20% --> <circle cx="100" cy="100" r="70" fill="none" stroke="var(--color-agent)" stroke-width="28" stroke-dasharray="85.8 354.0" stroke-dashoffset="199.0" data-segment="agent"></circle> <!-- Center label — default --> <text class="center-default" x="100" y="95" text-anchor="middle" fill="var(--text-muted)" font-family="Inter, -apple-system, system-ui, sans-serif" font-size="12" font-weight="700" letter-spacing="0.5">AXIS</text> <text class="center-default" x="100" y="112" text-anchor="middle" fill="var(--text-muted)" font-family="Inter, -apple-system, 
system-ui, sans-serif" font-size="11">Result</text> <!-- Center label — hover (hidden by default) --> <text class="center-pct" x="100" y="98" text-anchor="middle" font-family="Inter, -apple-system, system-ui, sans-serif" font-size="28" font-weight="800" opacity="0">40%</text> <text class="center-name" x="100" y="116" text-anchor="middle" fill="var(--text-muted)" font-family="Inter, -apple-system, system-ui, sans-serif" font-size="10" font-weight="500" letter-spacing="0.3" opacity="0">Goal Achievement</text> </svg> <a href="#service" class="dimension-label dimension-label-svc" data-segment="svc"> <span class="dimension-pct" style="color: var(--color-svc)">20%</span> <span class="dimension-name">Service</span> <span class="dimension-desc">APIs, MCP tools, network requests</span> </a> <a href="#environment" class="dimension-label dimension-label-env" data-segment="env"> <span class="dimension-pct" style="color: var(--color-env)">20%</span> <span class="dimension-name">Environment</span> <span class="dimension-desc">Shell, filesystem, git, build tools</span> </a> </div> <h2>Composite AXIS Result</h2> <p>
6
+ The final AXIS Result is the weighted average of all four dimension scores.
7
+ </p> <pre><code>AXIS Result = (Goal Achievement &times; 0.4) + (Environment &times; 0.2) + (Service &times; 0.2) + (Agent &times; 0.2)</code></pre> <p>
8
+ All dimension scores are 0 to 100. The composite result is rounded to the nearest whole number.
9
+ </p> <h2 id="goal-achievement">Goal Achievement</h2> <p>
10
+ Goal Achievement is evaluated by a Judge that reads the full agent transcript and compares
11
+ the outcome against the rubric checks you defined in the scenario. Each check receives a
12
+ score from 0 to 10, which is scaled to 0 to 100 and combined using the check weights.
13
+ </p> <p>
14
+ This is the only dimension driven entirely by your rubric. The other three dimensions are
15
+ calculated from the agent's interaction transcript.
16
+ </p> <h2 id="interaction-signals">Interaction Signals</h2> <p>
17
+ The Environment, Service, and Agent dimensions are scored by analyzing every tool interaction
18
+ in the agent's transcript. Each interaction is evaluated on five signals.
19
+ </p> <table> <thead> <tr> <th>Signal</th> <th>Method</th> <th>What It Measures</th> </tr> </thead> <tbody> <tr> <td><strong>Success</strong></td> <td>Judge</td> <td>Did the interaction complete without errors? Were the results usable?</td> </tr> <tr> <td><strong>Speed</strong></td> <td>Heuristic</td> <td>How long did the interaction take relative to expectations for its category?</td> </tr> <tr> <td><strong>Weight</strong></td> <td>Judge</td> <td>Was the tool invocation right-sized? Did the agent request more or less than needed?</td> </tr> <tr> <td><strong>Relevance</strong></td> <td>Judge</td> <td>Was the tool output relevant and useful for completing the task?</td> </tr> <tr> <td><strong>Necessity</strong></td> <td>Judge</td> <td>Were the interactions in this category actually needed, or were they avoidable?</td> </tr> </tbody> </table> <p> <strong>Judge</strong> signals are evaluated by an LLM that reads the full content of each tool
20
+ call and its result. <strong>Heuristic</strong> signals are computed deterministically from
21
+ measured values like duration, with no LLM involved.
22
+ </p> <h3>Signal Weights by Category</h3> <p>
23
+ Each category emphasizes different signals based on what matters most for that type of interaction.
24
+ </p> <table> <thead> <tr> <th>Signal</th> <th>Environment</th> <th>Service</th> <th>Agent</th> </tr> </thead> <tbody> <tr> <td>Success</td> <td>0.35</td> <td>0.25</td> <td>0.15</td> </tr> <tr> <td>Speed</td> <td>0.15</td> <td>0.15</td> <td>0.15</td> </tr> <tr> <td>Weight</td> <td>0.15</td> <td>0.20</td> <td>0.20</td> </tr> <tr> <td>Relevance</td> <td>0.15</td> <td>0.20</td> <td>0.25</td> </tr> <tr> <td>Necessity</td> <td>0.20</td> <td>0.20</td> <td>0.25</td> </tr> </tbody> </table> <p>
25
+ Environment scores weight Success most heavily because failed shell commands or file operations
26
+ are the most direct signal of a poor interaction. Speed is weighted equally across all categories
27
+ at 0.15 — the category-specific speed thresholds (below) handle the fact that different operation
28
+ types have different expected durations. Agent scores weight Necessity, Weight, and Relevance more
29
+ heavily because self-organization quality is best measured by whether the agent's own actions were
30
+ purposeful and well-scoped.
31
+ </p> <h3>Speed Thresholds</h3> <p>
32
+ Speed scores are based on how long each interaction took. Thresholds vary by category because
33
+ different types of operations have different expected durations.
34
+ </p> <table> <thead> <tr> <th>Category</th> <th>Excellent</th> <th>Good</th> <th>Fair</th> <th>Slow</th> <th>Very Slow</th> </tr> </thead> <tbody> <tr> <td>Environment</td> <td>&le;500ms</td> <td>&le;2s</td> <td>&le;5s</td> <td>&le;10s</td> <td>&gt;10s</td> </tr> <tr> <td>Service</td> <td>&le;2s</td> <td>&le;5s</td> <td>&le;10s</td> <td>&le;25s</td> <td>&gt;25s</td> </tr> <tr> <td>Agent</td> <td>&le;2s</td> <td>&le;5s</td> <td>&le;15s</td> <td>&le;30s</td> <td>&gt;30s</td> </tr> </tbody> </table> <h2>Score Calibration</h2> <p>
35
+ Raw signal scores are mapped to a 0 to 100 scale using an S-curve rather than linear scaling.
36
+ This means:
37
+ </p> <ul> <li>Improving from 20 to 50 is relatively easy (fixing obvious problems).</li> <li>Improving from 80 to 95 requires significant quality gains.</li> <li>A score of 50 represents median performance for that category.</li> </ul> <p>
38
+ Speed is aggregated using a <strong>severity-weighted average</strong>: slow interactions pull the
39
+ score down disproportionately rather than being hidden by many fast interactions. Other signals
40
+ (success, weight, relevance) are weighted by context size, so a failed API call that returns a
41
+ large error response influences the score more than a trivial file read.
42
+ </p> <h2>Transcript Categorization</h2> <p>
43
+ Every tool interaction is classified into a category based on the tool name. This classification
44
+ determines which dimension the interaction contributes to.
45
+ </p> <h3 id="environment">Environment</h3> <p>OS, filesystem, and dev tooling interactions:</p> <ul> <li><strong>Shell:</strong> bash, shell, terminal, exec.</li> <li><strong>File ops:</strong> read, write, edit, glob, grep, cat, head, tail, find, ls, mkdir, rm, cp, mv.</li> <li><strong>Version control:</strong> git.</li> <li><strong>Package managers:</strong> npm, yarn, pip, cargo, go, brew, apt.</li> <li><strong>Build and test:</strong> make, tsc, docker, kubectl, node, python.</li> </ul> <h3 id="agent">Agent</h3> <p>Self-organization and metacognition:</p> <ul> <li><strong>Tool discovery:</strong> toolsearch, listtoolsets, list_tools.</li> <li><strong>Task management:</strong> taskcreate, taskupdate, tasklist, todo_read, todo_write.</li> <li><strong>Planning:</strong> enterplanmode, exitplanmode.</li> <li><strong>User interaction:</strong> askuserquestion, askfollowupquestion.</li> <li><strong>Skill invocation:</strong> skill.</li> </ul> <h3 id="service">Service</h3> <p>
46
+ Everything else: external APIs, MCP tools, network calls, and custom services. Any tool not
47
+ matching the environment or agent patterns is classified as a service interaction.
48
+ </p> <div class="callout callout-info"> <div class="callout-title">Multi-Category Interactions</div> <p>
49
+ Some interactions span categories. For example, running <code>curl</code> via bash is both an
50
+ environment interaction (shell command) and a service interaction (network call). Environment
51
+ tools that target agent-internal paths (like <code>.claude/</code>) are reclassified as agent
52
+ interactions.
53
+ </p> </div> <h2>Interpreting Scores</h2> <table> <thead> <tr> <th>Range</th> <th>Interpretation</th> </tr> </thead> <tbody> <tr> <td><strong>90 to 100</strong></td> <td>Excellent. Agent completed the task efficiently with minimal waste.</td> </tr> <tr> <td><strong>75 to 89</strong></td> <td>Good. Task completed with minor inefficiencies or missed optimizations.</td> </tr> <tr> <td><strong>50 to 74</strong></td> <td>Fair. Notable issues in execution quality, speed, or unnecessary operations.</td> </tr> <tr> <td><strong>Below 50</strong></td> <td>Poor. Significant failures, errors, or excessive waste in the execution.</td> </tr> </tbody> </table> <p>
54
+ When a category score falls below 75, the CLI displays <strong>score insights</strong> that
55
+ identify the weakest signal, helping you understand where the agent struggled.
56
+ </p> <script>
57
+ const wheel = document.querySelector(".dimension-wheel");
58
+ if (wheel) {
59
// Look up the parts of the wheel once; the hover handlers reuse them.
const [segments, labels, defaults] = [
  ".dimension-chart circle[data-segment]", // ring segments of the chart
  ".dimension-label", // dimension links placed around the chart
  ".center-default", // center text shown while nothing is highlighted
].map((selector) => wheel.querySelectorAll(selector));
const [pctEl, nameEl] = [".center-pct", ".center-name"].map((selector) =>
  wheel.querySelector(selector),
);

// Center-label content per segment key; the keys mirror the data-segment
// attribute values used in the markup.
const info = {
  goal: {
    pct: "40%",
    name: "Goal Achievement",
    color: "var(--accent)",
  },
  env: {
    pct: "20%",
    name: "Environment",
    color: "var(--color-env)",
  },
  svc: {
    pct: "20%",
    name: "Service",
    color: "var(--color-svc)",
  },
  agent: {
    pct: "20%",
    name: "Agent",
    color: "var(--color-agent)",
  },
};
71
+
72
/**
 * Emphasize one dimension of the wheel and show its details in the center.
 *
 * The matching ring segment is drawn fully opaque with a thicker stroke, all
 * other segments are dimmed, the default center text is hidden, and the
 * percentage / name text is filled in from the `info` table.
 *
 * @param {string} seg - Segment key, as read from an element's `data-segment`
 *   attribute by the hover handlers. Keys that are not own entries of `info`
 *   are ignored.
 * @returns {void}
 */
function highlight(seg) {
  // Resolve the entry before touching the DOM. An unknown or missing key
  // (e.g. a label without a data-segment attribute) would otherwise dim every
  // segment, hide the default text and then throw on `d.pct`. The own-property
  // check also keeps inherited names such as "toString" from matching.
  const d = Object.prototype.hasOwnProperty.call(info, seg) ? info[seg] : undefined;
  if (!d) {
    return;
  }

  // Fade/grow segments
  segments.forEach(s => {
    const isActive = s.dataset.segment === seg;
    s.style.opacity = isActive ? "1" : "0.15";
    s.style.strokeWidth = isActive ? "34" : "28";
  });

  // Swap center text
  defaults.forEach(t => { t.style.opacity = "0"; });
  pctEl.textContent = d.pct;
  pctEl.setAttribute("fill", d.color);
  pctEl.style.opacity = "1";
  nameEl.textContent = d.name;
  nameEl.style.opacity = "1";
}
87
+
88
/**
 * Put the wheel back into its idle state: every segment fully opaque at the
 * normal stroke width, label highlight styles cleared, the default center
 * text visible, and the percentage / name overlay hidden.
 *
 * @returns {void}
 */
function reset() {
  for (const ring of segments) {
    ring.style.opacity = "1";
    ring.style.strokeWidth = "28";
  }

  // An empty string removes the inline value set while hovering.
  for (const item of labels) {
    item.style.background = "";
    item.style.boxShadow = "";
  }

  for (const text of defaults) {
    text.style.opacity = "1";
  }

  for (const overlay of [pctEl, nameEl]) {
    overlay.style.opacity = "0";
  }
}
95
+
96
// Legend labels: hovering a label highlights its dimension on the wheel and
// gives the label a raised look; moving away restores the idle state.
for (const label of labels) {
  const onEnter = () => {
    highlight(label.dataset.segment);
    label.style.background = "var(--light-gray)";
    label.style.boxShadow = "var(--shadow)";
  };

  label.addEventListener("mouseenter", onEnter);
  label.addEventListener("mouseleave", reset);
}
104
+
105
// Ring segments mirror the legend: hovering a segment highlights it and its
// matching label, and clicking a segment forwards the click to that label.
for (const circle of segments) {
  // The label is looked up at event time, scoped to this wheel.
  const findLabel = () =>
    wheel.querySelector(`.dimension-label[data-segment="${circle.dataset.segment}"]`);

  circle.addEventListener("mouseenter", () => {
    highlight(circle.dataset.segment);
    const label = findLabel();
    if (!label) {
      return;
    }
    label.style.background = "var(--light-gray)";
    label.style.boxShadow = "var(--shadow)";
  });

  circle.addEventListener("mouseleave", reset);

  circle.addEventListener("click", () => {
    const label = findLabel();
    if (label) {
      label.click();
    }
  });
}
120
+ }
121
+ </script> <footer class="site-footer">
122
+ AXIS is maintained by Netlify.
123
+ </footer> </main> </div> <script>
124
+ const btn = document.getElementById("menu-btn");
125
+ const sidebar = document.getElementById("sidebar");
126
+ const backdrop = document.getElementById("sidebar-backdrop");
127
+
128
/**
 * Open or close the sidebar together with its backdrop.
 *
 * The sidebar's "open" class is flipped and the backdrop is then forced to the
 * same state. Toggling both independently would leave them permanently
 * inverted if their classes ever got out of step.
 *
 * @returns {void}
 */
function toggle() {
  // classList.toggle returns true when the class is present afterwards.
  const isOpen = sidebar.classList.toggle("open");
  backdrop.classList.toggle("open", isOpen);
}
132
+
133
// Both the menu button and the backdrop open/close the sidebar on click.
for (const trigger of [btn, backdrop]) {
  trigger.addEventListener("click", toggle);
}
135
+ </script> </body> </html>
@@ -0,0 +1,19 @@
1
+ export { run } from "./runner/runner.js";
2
+ export type { RunOutput, RunResult, RunOptions } from "./runner/runner.js";
3
+ export { loadConfig, discoverScenarios } from "./config/loader.js";
4
+ export { getAdapter, registerAdapter } from "./adapters/registry.js";
5
+ export { createAgentAdapter } from "./adapters/base/agent-adapter.js";
6
+ export type { AgentAdapterSpec, SetupContext, StreamContext, ResultContext, AdapterResult, } from "./adapters/base/agent-adapter.js";
7
+ export { createAcpBasedAdapter } from "./adapters/base/acp-adapter.js";
8
+ export type { AcpAdapterSpec } from "./adapters/base/acp-adapter.js";
9
+ export { scoreResults, scoreRunResult, buildScoredOutput, buildMcpCategoryOverrides } from "./scoring/index.js";
10
+ export { buildSparseIndex } from "./scoring/sparse-index.js";
11
+ export { categorizeInteraction } from "./transcript/categorize.js";
12
+ export { normalizeTranscript, toTranscriptAnalysis } from "./transcript/index.js";
13
+ export type { NormalizedEntry, NormalizedTranscript, ExtractedUrl, EntryAnalysis, TranscriptAnalysis, } from "./transcript/index.js";
14
+ export { writeReportToStore } from "./reports/writer.js";
15
+ export { listReports, readReport, readScenarioResult } from "./reports/reader.js";
16
+ export { generateReportHtml } from "./reports/html.js";
17
+ export { setBaseline, readBaseline, listBaselines, deleteBaseline, diffBaseline, DEFAULT_BASELINE_NAME, } from "./baselines/index.js";
18
+ export * from "./types/index.js";
19
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AACzC,YAAY,EAAE,SAAS,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AAC3E,OAAO,EAAE,UAAU,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AACnE,OAAO,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACrE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,YAAY,EACV,gBAAgB,EAChB,YAAY,EACZ,aAAa,EACb,aAAa,EACb,aAAa,GACd,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAAE,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AACvE,YAAY,EAAE,cAAc,EAAE,MAAM,gCAAgC,CAAC;AACrE,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,iBAAiB,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAChH,OAAO,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAC7D,OAAO,EAAE,qBAAqB,EAAE,MAAM,4BAA4B,CAAC;AACnE,OAAO,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,uBAAuB,CAAC;AAClF,YAAY,EACV,eAAe,EACf,oBAAoB,EACpB,YAAY,EACZ,aAAa,EACb,kBAAkB,GACnB,MAAM,uBAAuB,CAAC;AAC/B,OAAO,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAC;AACzD,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAC;AAClF,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EACL,WAAW,EACX,YAAY,EACZ,aAAa,EACb,cAAc,EACd,YAAY,EACZ,qBAAqB,GACtB,MAAM,sBAAsB,CAAC;AAC9B,cAAc,kBAAkB,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,15 @@
1
+ export { run } from "./runner/runner.js";
2
+ export { loadConfig, discoverScenarios } from "./config/loader.js";
3
+ export { getAdapter, registerAdapter } from "./adapters/registry.js";
4
+ export { createAgentAdapter } from "./adapters/base/agent-adapter.js";
5
+ export { createAcpBasedAdapter } from "./adapters/base/acp-adapter.js";
6
+ export { scoreResults, scoreRunResult, buildScoredOutput, buildMcpCategoryOverrides } from "./scoring/index.js";
7
+ export { buildSparseIndex } from "./scoring/sparse-index.js";
8
+ export { categorizeInteraction } from "./transcript/categorize.js";
9
+ export { normalizeTranscript, toTranscriptAnalysis } from "./transcript/index.js";
10
+ export { writeReportToStore } from "./reports/writer.js";
11
+ export { listReports, readReport, readScenarioResult } from "./reports/reader.js";
12
+ export { generateReportHtml } from "./reports/html.js";
13
+ export { setBaseline, readBaseline, listBaselines, deleteBaseline, diffBaseline, DEFAULT_BASELINE_NAME, } from "./baselines/index.js";
14
+ export * from "./types/index.js";
15
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAEzC,OAAO,EAAE,UAAU,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AACnE,OAAO,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,wBAAwB,CAAC;AACrE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AAQtE,OAAO,EAAE,qBAAqB,EAAE,MAAM,gCAAgC,CAAC;AAEvE,OAAO,EAAE,YAAY,EAAE,cAAc,EAAE,iBAAiB,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAChH,OAAO,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAC7D,OAAO,EAAE,qBAAqB,EAAE,MAAM,4BAA4B,CAAC;AACnE,OAAO,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,uBAAuB,CAAC;AAQlF,OAAO,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAC;AACzD,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAC;AAClF,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EACL,WAAW,EACX,YAAY,EACZ,aAAa,EACb,cAAc,EACd,YAAY,EACZ,qBAAqB,GACtB,MAAM,sBAAsB,CAAC;AAC9B,cAAc,kBAAkB,CAAC"}