@netlify/axis 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230) hide show
  1. package/README.md +977 -0
  2. package/dist/adapters/base/acp-adapter.d.ts +44 -0
  3. package/dist/adapters/base/acp-adapter.d.ts.map +1 -0
  4. package/dist/adapters/base/acp-adapter.js +559 -0
  5. package/dist/adapters/base/acp-adapter.js.map +1 -0
  6. package/dist/adapters/base/agent-adapter.d.ts +132 -0
  7. package/dist/adapters/base/agent-adapter.d.ts.map +1 -0
  8. package/dist/adapters/base/agent-adapter.js +212 -0
  9. package/dist/adapters/base/agent-adapter.js.map +1 -0
  10. package/dist/adapters/claude-code.d.ts +3 -0
  11. package/dist/adapters/claude-code.d.ts.map +1 -0
  12. package/dist/adapters/claude-code.js +138 -0
  13. package/dist/adapters/claude-code.js.map +1 -0
  14. package/dist/adapters/claude-sdk.d.ts +11 -0
  15. package/dist/adapters/claude-sdk.d.ts.map +1 -0
  16. package/dist/adapters/claude-sdk.js +46 -0
  17. package/dist/adapters/claude-sdk.js.map +1 -0
  18. package/dist/adapters/codex.d.ts +3 -0
  19. package/dist/adapters/codex.d.ts.map +1 -0
  20. package/dist/adapters/codex.js +183 -0
  21. package/dist/adapters/codex.js.map +1 -0
  22. package/dist/adapters/gemini-acp.d.ts +11 -0
  23. package/dist/adapters/gemini-acp.d.ts.map +1 -0
  24. package/dist/adapters/gemini-acp.js +60 -0
  25. package/dist/adapters/gemini-acp.js.map +1 -0
  26. package/dist/adapters/gemini.d.ts +3 -0
  27. package/dist/adapters/gemini.d.ts.map +1 -0
  28. package/dist/adapters/gemini.js +222 -0
  29. package/dist/adapters/gemini.js.map +1 -0
  30. package/dist/adapters/goose.d.ts +3 -0
  31. package/dist/adapters/goose.d.ts.map +1 -0
  32. package/dist/adapters/goose.js +9 -0
  33. package/dist/adapters/goose.js.map +1 -0
  34. package/dist/adapters/registry.d.ts +7 -0
  35. package/dist/adapters/registry.d.ts.map +1 -0
  36. package/dist/adapters/registry.js +37 -0
  37. package/dist/adapters/registry.js.map +1 -0
  38. package/dist/adapters/utils/mcp.d.ts +23 -0
  39. package/dist/adapters/utils/mcp.d.ts.map +1 -0
  40. package/dist/adapters/utils/mcp.js +114 -0
  41. package/dist/adapters/utils/mcp.js.map +1 -0
  42. package/dist/adapters/utils/resolve.d.ts +20 -0
  43. package/dist/adapters/utils/resolve.d.ts.map +1 -0
  44. package/dist/adapters/utils/resolve.js +48 -0
  45. package/dist/adapters/utils/resolve.js.map +1 -0
  46. package/dist/adapters/utils/skills.d.ts +17 -0
  47. package/dist/adapters/utils/skills.d.ts.map +1 -0
  48. package/dist/adapters/utils/skills.js +52 -0
  49. package/dist/adapters/utils/skills.js.map +1 -0
  50. package/dist/adapters/utils/token-estimator.d.ts +21 -0
  51. package/dist/adapters/utils/token-estimator.d.ts.map +1 -0
  52. package/dist/adapters/utils/token-estimator.js +37 -0
  53. package/dist/adapters/utils/token-estimator.js.map +1 -0
  54. package/dist/baselines/diff.d.ts +9 -0
  55. package/dist/baselines/diff.d.ts.map +1 -0
  56. package/dist/baselines/diff.js +83 -0
  57. package/dist/baselines/diff.js.map +1 -0
  58. package/dist/baselines/index.d.ts +3 -0
  59. package/dist/baselines/index.d.ts.map +1 -0
  60. package/dist/baselines/index.js +3 -0
  61. package/dist/baselines/index.js.map +1 -0
  62. package/dist/baselines/store.d.ts +19 -0
  63. package/dist/baselines/store.d.ts.map +1 -0
  64. package/dist/baselines/store.js +104 -0
  65. package/dist/baselines/store.js.map +1 -0
  66. package/dist/cli.d.ts +3 -0
  67. package/dist/cli.d.ts.map +1 -0
  68. package/dist/cli.js +487 -0
  69. package/dist/cli.js.map +1 -0
  70. package/dist/config/loader.d.ts +8 -0
  71. package/dist/config/loader.d.ts.map +1 -0
  72. package/dist/config/loader.js +99 -0
  73. package/dist/config/loader.js.map +1 -0
  74. package/dist/config/validator.d.ts +11 -0
  75. package/dist/config/validator.d.ts.map +1 -0
  76. package/dist/config/validator.js +203 -0
  77. package/dist/config/validator.js.map +1 -0
  78. package/dist/docs-site/_astro/cli.DDWZtG0-.css +1 -0
  79. package/dist/docs-site/cli/index.html +18 -0
  80. package/dist/docs-site/configuration/index.html +121 -0
  81. package/dist/docs-site/content-assets.mjs +1 -0
  82. package/dist/docs-site/content-modules.mjs +1 -0
  83. package/dist/docs-site/data-store.json +9 -0
  84. package/dist/docs-site/index.html +69 -0
  85. package/dist/docs-site/quickstart/index.html +59 -0
  86. package/dist/docs-site/running/index.html +87 -0
  87. package/dist/docs-site/scoring/index.html +135 -0
  88. package/dist/index.d.ts +19 -0
  89. package/dist/index.d.ts.map +1 -0
  90. package/dist/index.js +15 -0
  91. package/dist/index.js.map +1 -0
  92. package/dist/report-ui/index.html +291 -0
  93. package/dist/report-ui/mock-data.json +298 -0
  94. package/dist/reports/html.d.ts +7 -0
  95. package/dist/reports/html.d.ts.map +1 -0
  96. package/dist/reports/html.js +27 -0
  97. package/dist/reports/html.js.map +1 -0
  98. package/dist/reports/reader.d.ts +21 -0
  99. package/dist/reports/reader.d.ts.map +1 -0
  100. package/dist/reports/reader.js +110 -0
  101. package/dist/reports/reader.js.map +1 -0
  102. package/dist/reports/writer.d.ts +14 -0
  103. package/dist/reports/writer.d.ts.map +1 -0
  104. package/dist/reports/writer.js +106 -0
  105. package/dist/reports/writer.js.map +1 -0
  106. package/dist/runner/lifecycle.d.ts +10 -0
  107. package/dist/runner/lifecycle.d.ts.map +1 -0
  108. package/dist/runner/lifecycle.js +58 -0
  109. package/dist/runner/lifecycle.js.map +1 -0
  110. package/dist/runner/runner.d.ts +34 -0
  111. package/dist/runner/runner.d.ts.map +1 -0
  112. package/dist/runner/runner.js +330 -0
  113. package/dist/runner/runner.js.map +1 -0
  114. package/dist/scoring/category-score.d.ts +52 -0
  115. package/dist/scoring/category-score.d.ts.map +1 -0
  116. package/dist/scoring/category-score.js +157 -0
  117. package/dist/scoring/category-score.js.map +1 -0
  118. package/dist/scoring/composite.d.ts +5 -0
  119. package/dist/scoring/composite.d.ts.map +1 -0
  120. package/dist/scoring/composite.js +24 -0
  121. package/dist/scoring/composite.js.map +1 -0
  122. package/dist/scoring/deep-eval.d.ts +25 -0
  123. package/dist/scoring/deep-eval.d.ts.map +1 -0
  124. package/dist/scoring/deep-eval.js +382 -0
  125. package/dist/scoring/deep-eval.js.map +1 -0
  126. package/dist/scoring/goal-achievement.d.ts +5 -0
  127. package/dist/scoring/goal-achievement.d.ts.map +1 -0
  128. package/dist/scoring/goal-achievement.js +241 -0
  129. package/dist/scoring/goal-achievement.js.map +1 -0
  130. package/dist/scoring/index.d.ts +22 -0
  131. package/dist/scoring/index.d.ts.map +1 -0
  132. package/dist/scoring/index.js +115 -0
  133. package/dist/scoring/index.js.map +1 -0
  134. package/dist/scoring/parse-json.d.ts +6 -0
  135. package/dist/scoring/parse-json.d.ts.map +1 -0
  136. package/dist/scoring/parse-json.js +18 -0
  137. package/dist/scoring/parse-json.js.map +1 -0
  138. package/dist/scoring/sparse-index.d.ts +15 -0
  139. package/dist/scoring/sparse-index.d.ts.map +1 -0
  140. package/dist/scoring/sparse-index.js +338 -0
  141. package/dist/scoring/sparse-index.js.map +1 -0
  142. package/dist/scoring/triage.d.ts +15 -0
  143. package/dist/scoring/triage.d.ts.map +1 -0
  144. package/dist/scoring/triage.js +204 -0
  145. package/dist/scoring/triage.js.map +1 -0
  146. package/dist/skills/resolver.d.ts +19 -0
  147. package/dist/skills/resolver.d.ts.map +1 -0
  148. package/dist/skills/resolver.js +95 -0
  149. package/dist/skills/resolver.js.map +1 -0
  150. package/dist/transcript/categorize.d.ts +24 -0
  151. package/dist/transcript/categorize.d.ts.map +1 -0
  152. package/dist/transcript/categorize.js +233 -0
  153. package/dist/transcript/categorize.js.map +1 -0
  154. package/dist/transcript/classify.d.ts +7 -0
  155. package/dist/transcript/classify.d.ts.map +1 -0
  156. package/dist/transcript/classify.js +32 -0
  157. package/dist/transcript/classify.js.map +1 -0
  158. package/dist/transcript/extract.d.ts +24 -0
  159. package/dist/transcript/extract.d.ts.map +1 -0
  160. package/dist/transcript/extract.js +266 -0
  161. package/dist/transcript/extract.js.map +1 -0
  162. package/dist/transcript/index.d.ts +3 -0
  163. package/dist/transcript/index.d.ts.map +1 -0
  164. package/dist/transcript/index.js +2 -0
  165. package/dist/transcript/index.js.map +1 -0
  166. package/dist/transcript/normalize.d.ts +15 -0
  167. package/dist/transcript/normalize.d.ts.map +1 -0
  168. package/dist/transcript/normalize.js +160 -0
  169. package/dist/transcript/normalize.js.map +1 -0
  170. package/dist/transcript/types.d.ts +92 -0
  171. package/dist/transcript/types.d.ts.map +1 -0
  172. package/dist/transcript/types.js +2 -0
  173. package/dist/transcript/types.js.map +1 -0
  174. package/dist/transcript/urls.d.ts +10 -0
  175. package/dist/transcript/urls.d.ts.map +1 -0
  176. package/dist/transcript/urls.js +31 -0
  177. package/dist/transcript/urls.js.map +1 -0
  178. package/dist/types/agent.d.ts +80 -0
  179. package/dist/types/agent.d.ts.map +1 -0
  180. package/dist/types/agent.js +2 -0
  181. package/dist/types/agent.js.map +1 -0
  182. package/dist/types/baseline.d.ts +65 -0
  183. package/dist/types/baseline.d.ts.map +1 -0
  184. package/dist/types/baseline.js +2 -0
  185. package/dist/types/baseline.js.map +1 -0
  186. package/dist/types/config.d.ts +76 -0
  187. package/dist/types/config.d.ts.map +1 -0
  188. package/dist/types/config.js +2 -0
  189. package/dist/types/config.js.map +1 -0
  190. package/dist/types/index.d.ts +8 -0
  191. package/dist/types/index.d.ts.map +1 -0
  192. package/dist/types/index.js +8 -0
  193. package/dist/types/index.js.map +1 -0
  194. package/dist/types/output.d.ts +70 -0
  195. package/dist/types/output.d.ts.map +1 -0
  196. package/dist/types/output.js +15 -0
  197. package/dist/types/output.js.map +1 -0
  198. package/dist/types/report.d.ts +37 -0
  199. package/dist/types/report.d.ts.map +1 -0
  200. package/dist/types/report.js +2 -0
  201. package/dist/types/report.js.map +1 -0
  202. package/dist/types/scenario.d.ts +23 -0
  203. package/dist/types/scenario.d.ts.map +1 -0
  204. package/dist/types/scenario.js +2 -0
  205. package/dist/types/scenario.js.map +1 -0
  206. package/dist/types/scoring.d.ts +176 -0
  207. package/dist/types/scoring.d.ts.map +1 -0
  208. package/dist/types/scoring.js +2 -0
  209. package/dist/types/scoring.js.map +1 -0
  210. package/dist/ui/AnimatedTokens.d.ts +29 -0
  211. package/dist/ui/AnimatedTokens.d.ts.map +1 -0
  212. package/dist/ui/AnimatedTokens.js +53 -0
  213. package/dist/ui/AnimatedTokens.js.map +1 -0
  214. package/dist/ui/App.d.ts +6 -0
  215. package/dist/ui/App.d.ts.map +1 -0
  216. package/dist/ui/App.js +16 -0
  217. package/dist/ui/App.js.map +1 -0
  218. package/dist/ui/LiveDuration.d.ts +20 -0
  219. package/dist/ui/LiveDuration.d.ts.map +1 -0
  220. package/dist/ui/LiveDuration.js +31 -0
  221. package/dist/ui/LiveDuration.js.map +1 -0
  222. package/dist/ui/LiveStatus.d.ts +7 -0
  223. package/dist/ui/LiveStatus.d.ts.map +1 -0
  224. package/dist/ui/LiveStatus.js +52 -0
  225. package/dist/ui/LiveStatus.js.map +1 -0
  226. package/dist/ui/format.d.ts +29 -0
  227. package/dist/ui/format.d.ts.map +1 -0
  228. package/dist/ui/format.js +514 -0
  229. package/dist/ui/format.js.map +1 -0
  230. package/package.json +65 -0
@@ -0,0 +1,330 @@
1
+ import * as fs from "node:fs";
2
+ import * as os from "node:os";
3
+ import * as path from "node:path";
4
+ import { loadConfig, discoverScenarios } from "../config/loader.js";
5
+ import { getAdapter, registerAdapter } from "../adapters/registry.js";
6
+ import { executeLifecycleActions } from "./lifecycle.js";
7
+ import { silentLogger as defaultLogger, formatError } from "../types/output.js";
8
+ import { resolveSkills, skillSourceString } from "../skills/resolver.js";
9
+ /** Host environment variable names that buildJobEnv always allows into a job's environment (copied only when set in process.env). */
10
+ const SYSTEM_VARS = ["PATH", "USER", "SHELL", "LANG", "TERM", "TMPDIR"];
11
+ /** Pass-through variable names buildJobEnv falls back to when the loaded config has no `env` list. */
12
+ const DEFAULT_PASS_ENV = ["ANTHROPIC_API_KEY", "CODEX_API_KEY", "GEMINI_API_KEY"];
13
/**
 * Run every discovered (agent, scenario) job and return the aggregated output.
 *
 * Phases, in order: load the config, register custom adapters, discover jobs,
 * pre-flight each distinct adapter (required env vars, optional install step),
 * resolve skills once, execute the jobs under the concurrency limit, and
 * finally summarise the results with `buildOutput`.
 *
 * @param options - Optional settings. This function reads `logger`,
 *   `configPath`, `agentFilter`, `scenarioFilter`, `refreshSkills`,
 *   `concurrency`, `onResult`, `registerCleanup` and `debug`.
 * @returns The value produced by `buildOutput` for the collected results.
 * @throws Error when a custom adapter module does not export an object with a
 *   `run` function, or when an adapter's required environment variables are
 *   missing from the filtered job environment.
 */
export async function run(options = {}) {
    const logger = options.logger ?? defaultLogger;
    const runStart = Date.now();
    const { config, configDir } = await loadConfig(options.configPath);
    // --- Load custom adapters from config ---
    if (config.adapters) {
        // Loaded locally so this block needs no extra file-level import.
        const { pathToFileURL } = await import("node:url");
        for (const [name, modulePath] of Object.entries(config.adapters)) {
            const absPath = path.resolve(configDir, modulePath);
            // import() expects a URL-style specifier. A raw absolute path is
            // rejected on Windows ("C:\..." parses as an unknown URL scheme) and
            // characters such as "#" or "?" would be read as URL syntax, so the
            // path is converted to a file:// URL first.
            const mod = await import(pathToFileURL(absPath).href);
            const adapter = mod.default ?? mod.adapter;
            if (!adapter || typeof adapter.run !== "function") {
                throw new Error(`Custom adapter "${name}" at ${modulePath} must export a valid AgentAdapter ` +
                    `(as default export or named "adapter" export).`);
            }
            registerAdapter(name, adapter);
        }
    }
    // --- Discovery phase ---
    const jobs = [];
    const agents = normalizeAgents(config.agents);
    for (const { name: agentName, config: agentConfig } of agents) {
        if (options.agentFilter?.length && !options.agentFilter.includes(agentName)) {
            continue;
        }
        const scenarios = await discoverScenarios(configDir, config.scenarios, agentConfig.scenarios);
        const filteredScenarios = options.scenarioFilter?.length
            ? scenarios.filter((s) => options.scenarioFilter.includes(s.key))
            : scenarios;
        for (const scenario of filteredScenarios) {
            // Scenario-level agent override: if set, only listed agents run this scenario
            if (scenario.agents && !scenario.agents.includes(agentName)) {
                continue;
            }
            jobs.push({ index: jobs.length, agentName, agentConfig, scenario, configDir, axisConfig: config });
        }
    }
    if (jobs.length === 0) {
        logger.info("No jobs discovered.");
        return buildOutput(runStart, []);
    }
    // --- Initialize job state tracker ---
    const jobStates = jobs.map((job) => ({
        scenarioKey: job.scenario.key,
        agentName: job.agentName,
        status: "pending",
    }));
    const updateStatus = (index, status, durationMs) => {
        const patch = { status, durationMs };
        // Stamp the start time on the first transition into "running" so the
        // live UI can tick an elapsed-duration counter.
        if (status === "running" && jobStates[index].runStartedAt === undefined) {
            patch.runStartedAt = Date.now();
        }
        jobStates[index] = { ...jobStates[index], ...patch };
        logger.onJobUpdate?.(jobStates);
    };
    /**
     * Monotonic live-token bump — drops any non-increasing estimates. Setting
     * `final` true stamps `tokensFinal` so the UI knows the number is now the
     * authoritative total (from `metadata.tokenUsage`), not an estimate.
     */
    const updateTokens = (index, tokens, final = false) => {
        const prev = jobStates[index].liveTokens ?? 0;
        const grew = tokens > prev;
        const newlyFinal = final && !jobStates[index].tokensFinal;
        if (!grew && !newlyFinal)
            return;
        jobStates[index] = {
            ...jobStates[index],
            liveTokens: grew ? tokens : prev,
            ...(newlyFinal ? { tokensFinal: true } : {}),
        };
        logger.onJobUpdate?.(jobStates);
    };
    // Build filtered environment once for all jobs
    const jobEnv = buildJobEnv(config);
    // --- Validate required env vars and resolve CLI binaries for each adapter ---
    // This runs BEFORE the initial onJobUpdate so that any logger.info calls
    // from ensureInstalled (e.g. npx fallback messages) don't interfere with
    // ink's cursor tracking when it starts rendering the live display.
    const checkedAdapters = new Set();
    for (const job of jobs) {
        if (checkedAdapters.has(job.agentConfig.adapter))
            continue;
        checkedAdapters.add(job.agentConfig.adapter);
        const adapter = getAdapter(job.agentConfig.adapter);
        const required = adapter.requiredEnv?.() ?? [];
        const missing = required.filter((key) => !jobEnv[key]);
        if (missing.length > 0) {
            throw new Error(`The "${job.agentConfig.adapter}" adapter requires environment variable${missing.length > 1 ? "s" : ""} ${missing.join(", ")} ` +
                `but ${missing.length > 1 ? "they are" : "it is"} not set. ` +
                `Add ${missing.length > 1 ? "them" : "it"} to your shell environment or to the "env" array in axis.config.json.`);
        }
        // Resolve CLI binary (direct or npx fallback)
        if (adapter.ensureInstalled) {
            await adapter.ensureInstalled(logger);
        }
    }
    // --- Resolve skills (once, before any jobs start) ---
    // Entries are de-duplicated by their source string; the first occurrence wins.
    const allSkillEntries = new Map();
    const collectSkills = (entries) => {
        for (const entry of entries ?? []) {
            const key = skillSourceString(entry);
            if (!allSkillEntries.has(key))
                allSkillEntries.set(key, entry);
        }
    };
    collectSkills(config.skills);
    for (const job of jobs) {
        collectSkills(job.agentConfig.skills);
        collectSkills(job.scenario.skills);
    }
    const resolvedSkillMap = new Map();
    if (allSkillEntries.size > 0) {
        const entries = [...allSkillEntries.values()];
        const resolved = await resolveSkills({
            sources: entries,
            configDir,
            cacheDir: path.join(configDir, ".axis", "skills-cache"),
            logger,
            refresh: options.refreshSkills,
        });
        // Map iteration order is insertion order, so keys line up with `entries`
        // and therefore with the positions in `resolved`.
        const keys = [...allSkillEntries.keys()];
        for (let i = 0; i < keys.length; i++) {
            resolvedSkillMap.set(keys[i], resolved[i]);
        }
    }
    // Emit initial state after pre-flight so ink's first render is clean
    logger.onJobUpdate?.(jobStates);
    // --- Execute jobs with concurrency control ---
    const concurrency = options.concurrency ?? Infinity;
    const tasks = jobs.map((job) => async () => {
        const { result, cleanup } = await executeJob(job, jobEnv, logger, updateStatus, updateTokens, resolvedSkillMap, options.registerCleanup, options.debug);
        try {
            // Allow external processing (e.g. scoring/verification) before teardown.
            // If onResult returns a Promise, we await it so the judge can verify
            // results before teardown scripts destroy resources.
            if (options.onResult) {
                await options.onResult(result);
            }
        }
        finally {
            await cleanup();
        }
        return result;
    });
    const results = await runWithConcurrency(tasks, concurrency);
    return buildOutput(runStart, results);
}
167
/**
 * Execute one job: create an isolated workspace, run the scenario's setup
 * actions, invoke the agent adapter, and report status/token updates.
 *
 * On success the workspace is left in place and a `cleanup` callback is
 * returned so the caller can inspect the result before teardown runs. On any
 * failure — including a failing setup action — the job is marked "failed",
 * `cleanup` is run here, and the error is rethrown.
 *
 * @param job - Job record; `index`, `agentName`, `agentConfig`, `scenario` and `axisConfig` are read.
 * @param env - Filtered base environment shared by all jobs.
 * @param logger - Logger; `verbose` is optional, `error` is used for teardown failures.
 * @param updateStatus - Callback `(index, status, durationMs?)` for status transitions.
 * @param updateTokens - Callback `(index, tokens, final?)` for live token counts.
 * @param resolvedSkillMap - Map from skill source string to its resolved skill.
 * @param registerCleanup - Optional hook used to register a workspace-removal callback.
 * @param debug - When truthy, the adapter is asked to capture raw output.
 * @returns `{ result, cleanup }` where `cleanup` runs teardown and removes the workspace.
 */
async function executeJob(job, env, logger, updateStatus, updateTokens, resolvedSkillMap, registerCleanup, debug) {
    const { index, agentName, agentConfig, scenario, axisConfig } = job;
    const label = `${scenario.key} (${agentName})`;
    const jobStart = Date.now();
    // Create isolated workspace and point HOME there so agents
    // don't pick up the user's global settings (e.g. ~/.claude/).
    const workspace = createWorkspace();
    const adapter = getAdapter(agentConfig.adapter);
    const adapterIsolation = adapter.isolationEnv?.(workspace) ?? {};
    // Later spreads win: the shared env overrides adapter isolation vars, and
    // HOME is always forced to the workspace.
    const jobEnv = { ...adapterIsolation, ...env, HOME: workspace };
    logger.verbose?.(`[${label}] Workspace: ${workspace}`);
    // Register workspace for cleanup on process signal (Ctrl-C)
    registerCleanup?.(() => {
        try {
            fs.rmSync(workspace, { recursive: true, force: true });
        }
        catch {
            /* best-effort */
        }
    });
    const cleanup = async () => {
        if (scenario.teardown?.length) {
            logger.verbose?.(`[${label}] Running teardown...`);
            // Teardown failures are logged, never thrown, so the workspace
            // removal below always gets a chance to run.
            await executeLifecycleActions(scenario.teardown, workspace, jobEnv).catch((teardownErr) => {
                logger.error(`[${label}] Teardown failed: ${formatError(teardownErr)}`);
            });
        }
        try {
            fs.rmSync(workspace, { recursive: true, force: true });
            logger.verbose?.(`[${label}] Cleaned up workspace: ${workspace}`);
        }
        catch {
            logger.verbose?.(`[${label}] Failed to clean up workspace: ${workspace}`);
        }
    };
    try {
        // Setup runs inside the try so that a failing setup action still marks
        // the job as failed and releases the workspace (previously it leaked
        // the temp directory and left the job stuck in "setup").
        if (scenario.setup?.length) {
            updateStatus(index, "setup");
            logger.verbose?.(`[${label}] Running setup...`);
            await executeLifecycleActions(scenario.setup, workspace, jobEnv);
        }
        updateStatus(index, "running");
        logger.verbose?.(`[${label}] Executing agent...`);
        // Merge top-level + per-agent + per-scenario skills, deduplicate by source
        const skillEntries = [...(axisConfig.skills ?? []), ...(agentConfig.skills ?? []), ...(scenario.skills ?? [])];
        const seenSkills = new Set();
        const agentSkills = [];
        for (const entry of skillEntries) {
            const key = skillSourceString(entry);
            if (seenSkills.has(key))
                continue;
            seenSkills.add(key);
            const resolved = resolvedSkillMap.get(key);
            if (resolved)
                agentSkills.push(resolved);
        }
        const output = await adapter.run({
            prompt: scenario.prompt,
            config: agentConfig,
            scenario,
            workingDirectory: workspace,
            env: jobEnv,
            registerCleanup,
            captureRawOutput: !!debug,
            mcpServers: axisConfig.mcp_servers,
            resolvedSkills: agentSkills.length > 0 ? agentSkills : undefined,
            onTokenProgress: (tokens) => updateTokens(index, tokens),
        });
        // Snap the live counter up to the real total (input + output + cache).
        // The UI animates up to this value — it won't exceed it because
        // `updateTokens` is monotonic. Passing `final: true` marks `tokensFinal`
        // so the UI can drop the `~` approximation prefix once the animation
        // catches up.
        const usage = output.metadata.tokenUsage;
        if (usage) {
            const realTotal = (usage.input ?? 0) + (usage.output ?? 0) + (usage.cacheReadInput ?? 0);
            updateTokens(index, realTotal, true);
        }
        // Use the adapter-reported duration when it is truthy, else wall-clock time.
        const durationMs = output.metadata.durationMs || Date.now() - jobStart;
        const failed = output.metadata.exitCode !== 0 || !!output.metadata.error;
        updateStatus(index, failed ? "failed" : "done", durationMs);
        return {
            result: {
                scenarioKey: scenario.key,
                scenarioName: scenario.name,
                agentName,
                prompt: scenario.prompt,
                rubric: scenario.rubric,
                agentConfig,
                output,
            },
            cleanup,
        };
    }
    catch (err) {
        updateStatus(index, "failed", Date.now() - jobStart);
        // On unexpected errors, clean up immediately (nothing to verify)
        await cleanup();
        throw err;
    }
}
269
+ /**
270
+ * Run async tasks with a concurrency limit.
271
+ * Results are returned in the same order as the input tasks.
272
+ * When limit is Infinity, all tasks run simultaneously (same as Promise.all).
273
+ */
274
+ async function runWithConcurrency(tasks, limit) {
275
+ if (tasks.length === 0)
276
+ return [];
277
+ const results = new Array(tasks.length);
278
+ let nextIndex = 0;
279
+ async function worker() {
280
+ while (nextIndex < tasks.length) {
281
+ const i = nextIndex++;
282
+ results[i] = await tasks[i]();
283
+ }
284
+ }
285
+ const workerCount = Math.min(Number.isFinite(limit) ? limit : tasks.length, tasks.length);
286
+ await Promise.all(Array.from({ length: workerCount }, () => worker()));
287
+ return results;
288
+ }
289
/**
 * Create a fresh, uniquely named directory under the OS temp directory.
 *
 * @returns The path of the new directory; its name starts with "axis-".
 */
function createWorkspace() {
    const prefix = path.join(os.tmpdir(), "axis-");
    return fs.mkdtempSync(prefix);
}
292
/**
 * Build the filtered environment handed to jobs.
 *
 * Only allow-listed names are copied from `process.env`: the SYSTEM_VARS
 * entries plus either `config.env` or, when that is null/undefined,
 * DEFAULT_PASS_ENV. Names that are not set in `process.env` are omitted.
 *
 * @param config - Loaded config; only its optional `env` array is read.
 * @returns A new plain object mapping each allowed, defined name to its value.
 */
function buildJobEnv(config) {
    const allowedKeys = [...SYSTEM_VARS, ...(config.env ?? DEFAULT_PASS_ENV)];
    const env = {};
    for (const key of allowedKeys.filter((name) => process.env[name] !== undefined)) {
        env[key] = process.env[key];
    }
    return env;
}
303
/**
 * Turn the config's agent list into `{ name, config }` records.
 *
 * A string entry becomes `{ adapter: entry }`; an object entry is used as-is.
 * The first agent for a given adapter is named after the adapter; each later
 * one gets a numeric suffix (`adapter-2`, `adapter-3`, ...).
 *
 * @param agents - Iterable of adapter names and/or agent config objects.
 * @returns Array of `{ name, config }` in input order.
 */
function normalizeAgents(agents) {
    const occurrences = new Map();
    return Array.from(agents, (entry) => {
        const config = typeof entry === "string" ? { adapter: entry } : entry;
        const adapterName = config.adapter;
        const nth = (occurrences.get(adapterName) ?? 0) + 1;
        occurrences.set(adapterName, nth);
        const name = nth > 1 ? `${adapterName}-${nth}` : adapterName;
        return { name, config };
    });
}
316
/**
 * Assemble the final run output from the collected job results.
 *
 * A result counts as completed when its `output.metadata.exitCode` is 0 and
 * `output.metadata.error` is falsy; every other result counts as failed.
 *
 * @param runStart - Epoch milliseconds at which the run started.
 * @param results - Array of job results.
 * @returns Object with `version`, ISO `timestamp`, `durationMs`, the
 *   `results` array itself, and a `summary` of total/completed/failed counts.
 */
function buildOutput(runStart, results) {
    const succeeded = (r) => r.output.metadata.exitCode === 0 && !r.output.metadata.error;
    const completed = results.reduce((count, r) => (succeeded(r) ? count + 1 : count), 0);
    const total = results.length;
    const timestamp = new Date().toISOString();
    const durationMs = Date.now() - runStart;
    return {
        version: "0.1.0",
        timestamp,
        durationMs,
        results,
        summary: { total, completed, failed: total - completed },
    };
}
330
+ //# sourceMappingURL=runner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/runner/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,UAAU,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACpE,OAAO,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AACtE,OAAO,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AAEzD,OAAO,EAAE,YAAY,IAAI,aAAa,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAGhF,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAC;AA4CzE,kEAAkE;AAClE,MAAM,WAAW,GAAG,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;AAExE,gDAAgD;AAChD,MAAM,gBAAgB,GAAG,CAAC,mBAAmB,EAAE,eAAe,EAAE,gBAAgB,CAAC,CAAC;AAElF,MAAM,CAAC,KAAK,UAAU,GAAG,CAAC,UAAsB,EAAE;IAChD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,IAAI,aAAa,CAAC;IAC/C,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC5B,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;IAEnE,2CAA2C;IAC3C,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;QACpB,KAAK,MAAM,CAAC,IAAI,EAAE,UAAU,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;YACjE,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC;YACpD,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;YAClC,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,IAAI,GAAG,CAAC,OAAO,CAAC;YAC3C,IAAI,CAAC,OAAO,IAAI,OAAO,OAAO,CAAC,GAAG,KAAK,UAAU,EAAE,CAAC;gBAClD,MAAM,IAAI,KAAK,CACb,mBAAmB,IAAI,QAAQ,UAAU,oCAAoC;oBAC3E,gDAAgD,CACnD,CAAC;YACJ,CAAC;YACD,eAAe,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;IAED,0BAA0B;IAC1B,MAAM,IAAI,GAAU,EAAE,CAAC;IACvB,MAAM,MAAM,GAAG,eAAe,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAE9C,KAAK,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,MAAM,EAAE,WAAW,EAAE,IAAI,MAAM,EAAE,CAAC;QAC9D,IAAI,OAAO,CAAC,WAAW,EAAE,MAAM,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;YAC5E,SAAS;QACX,CAAC;QAED,MAAM,SAAS,GAAG,MAAM,iBAAiB,CAAC,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,WAAW,CAAC,SAAS,CAAC,CAAC;QAE9F,MAAM,iBAAiB,GAAG,OAAO,CAAC,cAAc,EAAE,MAAM;YACtD,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,cAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC
,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC;YAC3E,CAAC,CAAC,SAAS,CAAC;QAEd,KAAK,MAAM,QAAQ,IAAI,iBAAiB,EAAE,CAAC;YACzC,8EAA8E;YAC9E,IAAI,QAAQ,CAAC,MAAM,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC5D,SAAS;YACX,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,QAAQ,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,EAAE,CAAC,CAAC;QACrG,CAAC;IACH,CAAC;IAED,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,MAAM,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QACnC,OAAO,WAAW,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IACnC,CAAC;IAED,uCAAuC;IACvC,MAAM,SAAS,GAAe,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QAC/C,WAAW,EAAE,GAAG,CAAC,QAAQ,CAAC,GAAG;QAC7B,SAAS,EAAE,GAAG,CAAC,SAAS;QACxB,MAAM,EAAE,SAAsB;KAC/B,CAAC,CAAC,CAAC;IAEJ,MAAM,YAAY,GAAG,CAAC,KAAa,EAAE,MAAiB,EAAE,UAAmB,EAAE,EAAE;QAC7E,MAAM,KAAK,GAAsB,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC;QACxD,qEAAqE;QACrE,gDAAgD;QAChD,IAAI,MAAM,KAAK,SAAS,IAAI,SAAS,CAAC,KAAK,CAAC,CAAC,YAAY,KAAK,SAAS,EAAE,CAAC;YACxE,KAAK,CAAC,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAClC,CAAC;QACD,SAAS,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,KAAK,CAAC,EAAE,GAAG,KAAK,EAAE,CAAC;QACrD,MAAM,CAAC,WAAW,EAAE,CAAC,SAAS,CAAC,CAAC;IAClC,CAAC,CAAC;IAEF;;;;OAIG;IACH,MAAM,YAAY,GAAG,CAAC,KAAa,EAAE,MAAc,EAAE,KAAK,GAAG,KAAK,EAAE,EAAE;QACpE,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC;QAC9C,MAAM,IAAI,GAAG,MAAM,GAAG,IAAI,CAAC;QAC3B,MAAM,UAAU,GAAG,KAAK,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,WAAW,CAAC;QAC1D,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU;YAAE,OAAO;QACjC,SAAS,CAAC,KAAK,CAAC,GAAG;YACjB,GAAG,SAAS,CAAC,KAAK,CAAC;YACnB,UAAU,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI;YAChC,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SAC7C,CAAC;QACF,MAAM,CAAC,WAAW,EAAE,CAAC,SAAS,CAAC,CAAC;IAClC,CAAC,CAAC;IAEF,+CAA+C;IAC/C,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;IAEnC,+EAA+E;IAC/E,yEAAyE;IACzE,yEAAyE;IACzE,mEAAmE;IACnE,MAAM,eAAe,GAAG,IAAI,GAAG,EAAU,CAAC;IAC1C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,CAAC,OAAO,CAAC;YAAE,SAAS;QAC3D
,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;QAE7C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;QACpD,MAAM,QAAQ,GAAG,OAAO,CAAC,WAAW,EAAE,EAAE,IAAI,EAAE,CAAC;QAC/C,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;QACvD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,MAAM,IAAI,KAAK,CACb,QAAQ,GAAG,CAAC,WAAW,CAAC,OAAO,0CAA0C,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;gBAC7H,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,OAAO,YAAY;gBAC5D,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,uEAAuE,CACnH,CAAC;QACJ,CAAC;QAED,8CAA8C;QAC9C,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;YAC5B,MAAM,OAAO,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC;QACxC,CAAC;IACH,CAAC;IAED,uDAAuD;IACvD,MAAM,eAAe,GAAG,IAAI,GAAG,EAAuB,CAAC;IACvD,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;QACxC,MAAM,GAAG,GAAG,iBAAiB,CAAC,KAAK,CAAC,CAAC;QACrC,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,eAAe,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IACjE,CAAC;IACD,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,KAAK,MAAM,KAAK,IAAI,GAAG,CAAC,WAAW,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;YACjD,MAAM,GAAG,GAAG,iBAAiB,CAAC,KAAK,CAAC,CAAC;YACrC,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,eAAe,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QACjE,CAAC;QACD,KAAK,MAAM,KAAK,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;YAC9C,MAAM,GAAG,GAAG,iBAAiB,CAAC,KAAK,CAAC,CAAC;YACrC,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,eAAe,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QACjE,CAAC;IACH,CAAC;IAED,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAAyB,CAAC;IAC1D,IAAI,eAAe,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,CAAC,GAAG,eAAe,CAAC,MAAM,EAAE,CAAC,CAAC;QAC9C,MAAM,QAAQ,GAAG,MAAM,aAAa,CAAC;YACnC,OAAO,EAAE,OAAO;YAChB,SAAS;YACT,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,cAAc,CAAC;YACvD,MAAM;YACN,OAAO,EAAE,OAAO,CAAC,aAAa;SAC/B,CAAC,CAAC;QACH,MAAM,IAAI,GAAG,CAAC,GAAG,eAAe,CAAC,IAAI,EAAE,CAAC,CAAC;QACzC,KAAK,IAAI,CAAC,GAAG,C
AAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,gBAAgB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7C,CAAC;IACH,CAAC;IAED,qEAAqE;IACrE,MAAM,CAAC,WAAW,EAAE,CAAC,SAAS,CAAC,CAAC;IAEhC,gDAAgD;IAChD,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,QAAQ,CAAC;IACpD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,KAAK,IAAI,EAAE;QACzC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,GAAG,MAAM,UAAU,CAC1C,GAAG,EACH,MAAM,EACN,MAAM,EACN,YAAY,EACZ,YAAY,EACZ,gBAAgB,EAChB,OAAO,CAAC,eAAe,EACvB,OAAO,CAAC,KAAK,CACd,CAAC;QAEF,IAAI,CAAC;YACH,yEAAyE;YACzE,qEAAqE;YACrE,qDAAqD;YACrD,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;gBACrB,MAAM,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YACjC,CAAC;QACH,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,EAAE,CAAC;QAClB,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IAE7D,OAAO,WAAW,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;AACxC,CAAC;AAQD,KAAK,UAAU,UAAU,CACvB,GAAQ,EACR,GAA2B,EAC3B,MAAc,EACd,YAA6E,EAC7E,YAAsE,EACtE,gBAA4C,EAC5C,eAA0C,EAC1C,KAAe;IAEf,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,WAAW,EAAE,QAAQ,EAAE,UAAU,EAAE,GAAG,GAAG,CAAC;IACpE,MAAM,KAAK,GAAG,GAAG,QAAQ,CAAC,GAAG,KAAK,SAAS,GAAG,CAAC;IAC/C,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE5B,2DAA2D;IAC3D,8DAA8D;IAC9D,MAAM,SAAS,GAAG,eAAe,EAAE,CAAC;IACpC,MAAM,OAAO,GAAG,UAAU,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAChD,MAAM,gBAAgB,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;IACjE,MAAM,MAAM,GAAG,EAAE,GAAG,gBAAgB,EAAE,GAAG,GAAG,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;IAChE,MAAM,CAAC,OAAO,EAAE,CAAC,IAAI,KAAK,gBAAgB,SAAS,EAAE,CAAC,CAAC;IAEvD,4DAA4D;IAC5D,eAAe,EAAE,CAAC,GAAG,EAAE;QACrB,IAAI,CAAC;YACH,EAAE,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,CAAC;QAAC,MAAM,CAAC;YACP,iBAAiB;QACnB,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,MAAM,OAAO,GAAG,KAAK,IAAI,EAAE;QACzB,IAAI,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,CAAC;YAC9B,MAAM,CAAC,OAAO,EAAE,CAAC,IAAI,KAAK,uBAAuB,CAAC,CAAC;YACnD,MAAM,uBAAuB,CAAC,QAAQ,CAAC,QAAQ,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC,WAAW,EAAE,EAAE;gBACxF,
MAAM,CAAC,KAAK,CAAC,IAAI,KAAK,sBAAsB,WAAW,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC;YAC1E,CAAC,CAAC,CAAC;QACL,CAAC;QACD,IAAI,CAAC;YACH,EAAE,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YACvD,MAAM,CAAC,OAAO,EAAE,CAAC,IAAI,KAAK,2BAA2B,SAAS,EAAE,CAAC,CAAC;QACpE,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,CAAC,OAAO,EAAE,CAAC,IAAI,KAAK,mCAAmC,SAAS,EAAE,CAAC,CAAC;QAC5E,CAAC;IACH,CAAC,CAAC;IAEF,QAAQ;IACR,IAAI,QAAQ,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC;QAC3B,YAAY,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;QAC7B,MAAM,CAAC,OAAO,EAAE,CAAC,IAAI,KAAK,oBAAoB,CAAC,CAAC;QAChD,MAAM,uBAAuB,CAAC,QAAQ,CAAC,KAAK,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;IACnE,CAAC;IAED,IAAI,CAAC;QACH,YAAY,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;QAC/B,MAAM,CAAC,OAAO,EAAE,CAAC,IAAI,KAAK,sBAAsB,CAAC,CAAC;QAElD,2EAA2E;QAC3E,MAAM,YAAY,GAAG,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,IAAI,EAAE,CAAC,EAAE,GAAG,CAAC,WAAW,CAAC,MAAM,IAAI,EAAE,CAAC,EAAE,GAAG,CAAC,QAAQ,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,CAAC;QAC/G,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAC;QACrC,MAAM,WAAW,GAAoB,EAAE,CAAC;QACxC,KAAK,MAAM,KAAK,IAAI,YAAY,EAAE,CAAC;YACjC,MAAM,GAAG,GAAG,iBAAiB,CAAC,KAAK,CAAC,CAAC;YACrC,IAAI,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,SAAS;YAClC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACpB,MAAM,QAAQ,GAAG,gBAAgB,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC3C,IAAI,QAAQ;gBAAE,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC3C,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC/B,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,MAAM,EAAE,WAAW;YACnB,QAAQ;YACR,gBAAgB,EAAE,SAAS;YAC3B,GAAG,EAAE,MAAM;YACX,eAAe;YACf,gBAAgB,EAAE,CAAC,CAAC,KAAK;YACzB,UAAU,EAAE,UAAU,CAAC,WAAW;YAClC,cAAc,EAAE,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;YAChE,eAAe,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,YAAY,CAAC,KAAK,EAAE,MAAM,CAAC;SACzD,CAAC,CAAC;QAEH,uEAAuE;QACvE,gEAAgE;QAChE,yEAAyE;QACzE,qEAAqE;QACrE,cAAc;QACd,MAAM,KAAK,GAAG,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC;QACzC,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,SAAS,GAAG,CAAC,KAAK,CAAC,KAAK,IAAI,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,cAAc,IAAI,CAAC,CAAC,CAAC;YACzF,YAAY,CAAC,KAAK,EAAE,SAA
S,EAAE,IAAI,CAAC,CAAC;QACvC,CAAC;QAED,MAAM,UAAU,GAAG,MAAM,CAAC,QAAQ,CAAC,UAAU,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,QAAQ,CAAC;QACvE,MAAM,MAAM,GAAG,MAAM,CAAC,QAAQ,CAAC,QAAQ,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;QACzE,YAAY,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAE5D,OAAO;YACL,MAAM,EAAE;gBACN,WAAW,EAAE,QAAQ,CAAC,GAAG;gBACzB,YAAY,EAAE,QAAQ,CAAC,IAAI;gBAC3B,SAAS;gBACT,MAAM,EAAE,QAAQ,CAAC,MAAM;gBACvB,MAAM,EAAE,QAAQ,CAAC,MAAM;gBACvB,WAAW;gBACX,MAAM;aACP;YACD,OAAO;SACR,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,YAAY,CAAC,KAAK,EAAE,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,QAAQ,CAAC,CAAC;QACrD,iEAAiE;QACjE,MAAM,OAAO,EAAE,CAAC;QAChB,MAAM,GAAG,CAAC;IACZ,CAAC;AACH,CAAC;AAED;;;;GAIG;AACH,KAAK,UAAU,kBAAkB,CAAI,KAA8B,EAAE,KAAa;IAChF,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAElC,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7C,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,UAAU,MAAM;QACnB,OAAO,SAAS,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,CAAC,GAAG,SAAS,EAAE,CAAC;YACtB,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;QAChC,CAAC;IACH,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAC1F,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACvE,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,eAAe;IACtB,OAAO,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,OAAO,CAAC,CAAC,CAAC;AACzD,CAAC;AAED,SAAS,WAAW,CAAC,MAAkB;IACrC,MAAM,WAAW,GAAG,MAAM,CAAC,GAAG,IAAI,gBAAgB,CAAC;IACnD,MAAM,WAAW,GAAG,CAAC,GAAG,WAAW,EAAE,GAAG,WAAW,CAAC,CAAC;IAErD,MAAM,GAAG,GAA2B,EAAE,CAAC;IACvC,KAAK,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QAC9B,IAAI,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,SAAS,EAAE,CAAC;YACnC,GAAG,CAAC,GAAG,CAAC,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAE,CAAC;QAC/B,CAAC;IACH,CAAC;IAED,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,eAAe,CAAC,MAAgC;IACvD,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC7C,MAAM
,MAAM,GAAiD,EAAE,CAAC;IAEhE,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,MAAM,GAAgB,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;QAEnF,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC;QAChC,MAAM,KAAK,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;QAClD,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QAEhC,MAAM,IAAI,GAAG,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,GAAG,QAAQ,IAAI,KAAK,EAAE,CAAC;QAC7D,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;IAChC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,WAAW,CAAC,QAAgB,EAAE,OAAoB;IACzD,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,QAAQ,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;IAE7G,OAAO;QACL,OAAO,EAAE,OAAO;QAChB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,QAAQ;QACjC,OAAO;QACP,OAAO,EAAE;YACP,KAAK,EAAE,OAAO,CAAC,MAAM;YACrB,SAAS;YACT,MAAM,EAAE,OAAO,CAAC,MAAM,GAAG,SAAS;SACnC;KACF,CAAC;AACJ,CAAC"}
@@ -0,0 +1,52 @@
1
+ import type { InteractionCategory, Interaction, InteractionAudit, NecessityJudgment, CategoryScore } from "../types/scoring.js";
2
+ /** How each audit dimension contributes to a category's raw score. */
3
+ export declare const CATEGORY_DIMENSION_WEIGHTS: Record<InteractionCategory, Record<string, number>>;
4
+ /** Calibration parameters for the log-normal CDF mapping. */
5
+ export interface CalibrationParams {
6
+ /** The raw score (0-1) that maps to 50/100. */
7
+ median: number;
8
+ /** Controls the spread — lower = steeper curve. */
9
+ sigma: number;
10
+ }
11
+ /** Initial calibration — to be tuned from real-world data. */
12
+ export declare const DEFAULT_CALIBRATION: Record<InteractionCategory, CalibrationParams>;
13
+ export declare const DEFAULT_AUDIT_SCORES: {
14
+ readonly success: 1;
15
+ readonly speed: 1;
16
+ readonly weight: 1;
17
+ readonly contextRelevance: 1;
18
+ };
19
+ /**
20
+ * Approximation of the standard normal CDF using Abramowitz & Stegun.
21
+ */
22
+ export declare function normalCDF(x: number): number;
23
+ /**
24
+ * Map a raw 0-1 score through a log-normal CDF to produce 0-100.
25
+ *
26
+ * The log-normal mapping ensures:
27
+ * - Improving from bad (20) to mediocre (50) is "easier" (smaller raw improvement needed)
28
+ * - Improving from good (80) to great (95) requires significant raw improvement
29
+ * - The mapping is S-shaped, rewarding getting out of the "bad" zone
30
+ */
31
+ export declare function logNormalScore(rawScore: number, median: number, sigma: number): number;
32
+ /**
33
+ * Severity-weighted average: bad scores pull harder than good scores push.
34
+ *
35
+ * Each value's effective weight is `(1 - value)² + 1`. Perfect scores (1.0)
36
+ * get weight 1, while worse scores get progressively heavier, making outlier
37
+ * problems hard to hide behind many good results.
38
+ *
39
+ * @param values - Scores in the 0-1 range
40
+ */
41
+ export declare function severityWeightedAverage(values: number[]): number;
42
+ /**
43
+ * Aggregate a single audit dimension across interactions in a category,
44
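+ * weighted by each interaction's contextBytes (minimum weight 1). The "speed" dimension is the exception: it uses a severity-weighted average instead.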
45
+ */
46
+ export declare function aggregateDimension(audits: InteractionAudit[], interactions: Interaction[], dimension: "success" | "speed" | "weight" | "contextRelevance"): number;
47
+ /**
48
+ * Compute the full category score from audits, necessity judgment, and interactions.
49
+ * Applies dimension weights and log-normal mapping.
50
+ */
51
+ export declare function computeCategoryScore(category: InteractionCategory, audits: InteractionAudit[], necessity: NecessityJudgment, interactions: Interaction[], calibration?: CalibrationParams): CategoryScore;
52
+ //# sourceMappingURL=category-score.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"category-score.d.ts","sourceRoot":"","sources":["../../src/scoring/category-score.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,mBAAmB,EACnB,WAAW,EACX,gBAAgB,EAChB,iBAAiB,EACjB,aAAa,EACd,MAAM,qBAAqB,CAAC;AAI7B,sEAAsE;AACtE,eAAO,MAAM,0BAA0B,EAAE,MAAM,CAAC,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAsB1F,CAAC;AAIF,6DAA6D;AAC7D,MAAM,WAAW,iBAAiB;IAChC,+CAA+C;IAC/C,MAAM,EAAE,MAAM,CAAC;IACf,mDAAmD;IACnD,KAAK,EAAE,MAAM,CAAC;CACf;AAED,8DAA8D;AAC9D,eAAO,MAAM,mBAAmB,EAAE,MAAM,CAAC,mBAAmB,EAAE,iBAAiB,CAI9E,CAAC;AAKF,eAAO,MAAM,oBAAoB;;;;;CAKvB,CAAC;AAIX;;GAEG;AACH,wBAAgB,SAAS,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,CAc3C;AAED;;;;;;;GAOG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,MAAM,CAQtF;AAID;;;;;;;;GAQG;AACH,wBAAgB,uBAAuB,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,CAahE;AAED;;;GAGG;AACH,wBAAgB,kBAAkB,CAChC,MAAM,EAAE,gBAAgB,EAAE,EAC1B,YAAY,EAAE,WAAW,EAAE,EAC3B,SAAS,EAAE,SAAS,GAAG,OAAO,GAAG,QAAQ,GAAG,kBAAkB,GAC7D,MAAM,CAqBR;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,CAClC,QAAQ,EAAE,mBAAmB,EAC7B,MAAM,EAAE,gBAAgB,EAAE,EAC1B,SAAS,EAAE,iBAAiB,EAC5B,YAAY,EAAE,WAAW,EAAE,EAC3B,WAAW,CAAC,EAAE,iBAAiB,GAC9B,aAAa,CAwCf"}
@@ -0,0 +1,157 @@
1
+ // --- Dimension weights per category ---
2
/**
 * Share of each scoring dimension in a category's raw score.
 *
 * Keyed by interaction category, then by dimension name. Within every
 * category the five shares add up to 1.
 */
export const CATEGORY_DIMENSION_WEIGHTS = {
  environment: {
    // Env tool failures are critical.
    success: 0.35,
    // Speed matters less for env.
    speed: 0.15,
    // Output size is somewhat relevant.
    weight: 0.15,
    // Was the output useful?
    relevance: 0.15,
    // Did we need to do this at all?
    necessity: 0.2,
  },
  service: {
    // Service failures matter.
    success: 0.25,
    // API latency.
    speed: 0.15,
    // Did we fetch too much / too little?
    weight: 0.2,
    // Was the API data actionable?
    relevance: 0.2,
    // Were these calls needed?
    necessity: 0.2,
  },
  agent: {
    // The agent rarely "fails" explicitly.
    success: 0.15,
    // Thinking time.
    speed: 0.15,
    // Was the reasoning concise?
    weight: 0.2,
    // Was the reasoning productive?
    relevance: 0.25,
    // Was the reasoning needed?
    necessity: 0.25,
  },
};
26
/**
 * Starting calibration for the log-normal mapping, one entry per category.
 * These values are initial guesses, to be tuned from real-world data.
 *
 * `median` is the raw score (0-1) that maps to 50/100; a lower `sigma`
 * gives a steeper curve.
 */
export const DEFAULT_CALIBRATION = {
  environment: {
    median: 0.75,
    sigma: 0.5,
  },
  service: {
    median: 0.65,
    sigma: 0.5,
  },
  agent: {
    median: 0.7,
    sigma: 0.5,
  },
};
32
// --- Default scores for interactions the LLM missed ---
/**
 * Fallback value for each audit dimension when nothing was evaluated.
 * Every dimension defaults to a perfect 1, so only real issues lower a score.
 */
export const DEFAULT_AUDIT_SCORES = {
  success: 1,
  speed: 1,
  weight: 1,
  contextRelevance: 1,
};
40
+ // --- Math utilities ---
41
/**
 * Approximate the standard normal cumulative distribution function using the
 * Abramowitz & Stegun polynomial approximation.
 *
 * The polynomial is evaluated at |x| / sqrt(2) and the result is mirrored
 * for negative inputs.
 *
 * @param x - Point at which to evaluate the CDF.
 * @returns Approximate value of the CDF at `x` (0 for -Infinity, 1 for +Infinity).
 */
export function normalCDF(x) {
  // Polynomial coefficients listed highest order first, i.e. in Horner order.
  const [leading, ...lowerOrder] = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];
  const p = 0.3275911;

  const u = Math.abs(x) / Math.SQRT2;
  const t = 1 / (1 + p * u);

  // Horner evaluation: ((((a5*t + a4)*t + a3)*t + a2)*t + a1)
  const poly = lowerOrder.reduce((acc, coefficient) => acc * t + coefficient, leading);
  const y = 1 - poly * t * Math.exp(-(u * u));

  // Mirror around 0.5 for negative inputs.
  return x < 0 ? 0.5 * (1 - y) : 0.5 * (1 + y);
}
57
/**
 * Convert a raw 0-1 score into a 0-100 score via a log-normal CDF.
 *
 * Raw scores at or below 0 map straight to 0 and raw scores at or above 1
 * map straight to 100; everything in between goes through the S-shaped
 * curve, where `median` is the raw score that lands on 50 and `sigma`
 * controls the spread.
 *
 * The intent of the log-normal shape:
 * - moving from bad (20) to mediocre (50) needs a smaller raw improvement
 * - moving from good (80) to great (95) needs a significant raw improvement
 *
 * @param rawScore - Composite raw score, nominally in the 0-1 range.
 * @param median - Raw score that maps to 50.
 * @param sigma - Spread of the curve.
 * @returns Integer score (rounded with Math.round).
 */
export function logNormalScore(rawScore, median, sigma) {
  // Hard clamps at both ends of the raw range.
  if (rawScore <= 0) {
    return 0;
  }
  if (rawScore >= 1) {
    return 100;
  }

  // Distance from the median in log space, in units of sigma.
  const logRaw = Math.log(rawScore);
  const logMedian = Math.log(median);
  const z = (logRaw - logMedian) / sigma;

  const percentile = normalCDF(z) * 100;
  return Math.round(percentile);
}
74
+ // --- Aggregation ---
75
/**
 * Average in which poor scores count for more than good ones.
 *
 * A value `v` contributes with weight `(1 - v)² + 1`: a perfect 1.0 gets
 * weight 1 and the weight grows as the value drops, so a few bad results
 * cannot be hidden behind many good ones.
 *
 * @param values - Scores in the 0-1 range
 * @returns The weighted mean, or 1.0 when `values` is empty.
 */
export function severityWeightedAverage(values) {
  if (values.length === 0) {
    return 1.0;
  }

  const totals = values.reduce(
    (acc, score) => {
      const severity = (1 - score) ** 2 + 1;
      acc.weighted += score * severity;
      acc.weight += severity;
      return acc;
    },
    { weighted: 0, weight: 0 },
  );

  return totals.weighted / totals.weight;
}
96
/**
 * Collapse one audit dimension across a set of audits into a single value.
 *
 * - With no audits, the dimension's entry in DEFAULT_AUDIT_SCORES is returned.
 * - "speed" is aggregated with severityWeightedAverage, so bad latency pulls
 *   harder; contextBytes is not used for it.
 * - Every other dimension is a weighted mean, where each audit's weight is
 *   the contextBytes of the interaction with the same id (at least 1, and 1
 *   when no matching interaction or contextBytes exists).
 *
 * @param audits - Audits to aggregate.
 * @param interactions - Interactions used to look up contextBytes by id.
 * @param dimension - "success" | "speed" | "weight" | "contextRelevance".
 * @returns The aggregated value for the dimension.
 */
export function aggregateDimension(audits, interactions, dimension) {
  if (audits.length === 0) {
    return DEFAULT_AUDIT_SCORES[dimension];
  }

  if (dimension === "speed") {
    const speeds = audits.map((audit) => audit.speed);
    return severityWeightedAverage(speeds);
  }

  // Index interactions by id; a later duplicate id replaces an earlier one.
  const byId = new Map();
  for (const interaction of interactions) {
    byId.set(interaction.id, interaction);
  }

  let weightSum = 0;
  let scoreSum = 0;
  audits.forEach((audit) => {
    const bytes = byId.get(audit.id)?.contextBytes ?? 1;
    const w = Math.max(1, bytes);
    scoreSum += audit[dimension] * w;
    weightSum += w;
  });

  if (weightSum > 0) {
    return scoreSum / weightSum;
  }
  return DEFAULT_AUDIT_SCORES[dimension];
}
118
/**
 * Build the score record for one interaction category.
 *
 * Audits and interactions are first narrowed to those whose `categories`
 * include `category`. Each audit dimension is aggregated with
 * aggregateDimension, the necessity judgment's score is used as-is, and the
 * five values are combined using CATEGORY_DIMENSION_WEIGHTS. The composite
 * is then mapped to 0-100 with logNormalScore.
 *
 * @param category - Category to score.
 * @param audits - All audits; filtered by category here.
 * @param necessity - Necessity judgment for the category (its `score` is read).
 * @param interactions - All interactions; filtered by category here.
 * @param calibration - Optional override; defaults to DEFAULT_CALIBRATION[category].
 * @returns Score, counts, per-dimension percentages, the category's audits,
 *   and the necessity judgment.
 */
export function computeCategoryScore(category, audits, necessity, interactions, calibration) {
  const belongsToCategory = (item) => item.categories.includes(category);
  const categoryInteractions = interactions.filter(belongsToCategory);
  const categoryAudits = audits.filter(belongsToCategory);

  // Audits whose rationale is "default" are not counted as audited.
  let auditedCount = 0;
  for (const audit of categoryAudits) {
    if (audit.rationale !== "default") {
      auditedCount += 1;
    }
  }

  const weights = CATEGORY_DIMENSION_WEIGHTS[category];
  const cal = calibration ?? DEFAULT_CALIBRATION[category];

  // Aggregate each dimension over this category's audits.
  const aggregate = (dimension) => aggregateDimension(categoryAudits, categoryInteractions, dimension);
  const successRaw = aggregate("success");
  const speedRaw = aggregate("speed");
  const weightRaw = aggregate("weight");
  const relevanceRaw = aggregate("contextRelevance");
  const necessityRaw = necessity.score;

  // Weighted composite raw score (0-1); summed in a fixed left-to-right order.
  const rawScore =
    successRaw * weights.success +
    speedRaw * weights.speed +
    weightRaw * weights.weight +
    relevanceRaw * weights.relevance +
    necessityRaw * weights.necessity;

  const toPercent = (value) => Math.round(value * 100);

  return {
    score: logNormalScore(rawScore, cal.median, cal.sigma),
    interactionCount: categoryInteractions.length,
    auditedCount,
    dimensions: {
      success: toPercent(successRaw),
      speed: toPercent(speedRaw),
      weight: toPercent(weightRaw),
      relevance: toPercent(relevanceRaw),
      necessity: toPercent(necessityRaw),
    },
    audits: categoryAudits,
    necessity,
  };
}
157
+ //# sourceMappingURL=category-score.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"category-score.js","sourceRoot":"","sources":["../../src/scoring/category-score.ts"],"names":[],"mappings":"AAQA,yCAAyC;AAEzC,sEAAsE;AACtE,MAAM,CAAC,MAAM,0BAA0B,GAAwD;IAC7F,WAAW,EAAE;QACX,OAAO,EAAE,IAAI,EAAE,iCAAiC;QAChD,KAAK,EAAE,IAAI,EAAE,6BAA6B;QAC1C,MAAM,EAAE,IAAI,EAAE,mCAAmC;QACjD,SAAS,EAAE,IAAI,EAAE,wBAAwB;QACzC,SAAS,EAAE,GAAG,EAAE,gCAAgC;KACjD;IACD,OAAO,EAAE;QACP,OAAO,EAAE,IAAI,EAAE,0BAA0B;QACzC,KAAK,EAAE,IAAI,EAAE,cAAc;QAC3B,MAAM,EAAE,GAAG,EAAE,qCAAqC;QAClD,SAAS,EAAE,GAAG,EAAE,8BAA8B;QAC9C,SAAS,EAAE,GAAG,EAAE,0BAA0B;KAC3C;IACD,KAAK,EAAE;QACL,OAAO,EAAE,IAAI,EAAE,kCAAkC;QACjD,KAAK,EAAE,IAAI,EAAE,gBAAgB;QAC7B,MAAM,EAAE,GAAG,EAAE,4BAA4B;QACzC,SAAS,EAAE,IAAI,EAAE,+BAA+B;QAChD,SAAS,EAAE,IAAI,EAAE,2BAA2B;KAC7C;CACF,CAAC;AAYF,8DAA8D;AAC9D,MAAM,CAAC,MAAM,mBAAmB,GAAmD;IACjF,WAAW,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE;IACzC,OAAO,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE;IACrC,KAAK,EAAE,EAAE,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,EAAE;CACnC,CAAC;AAEF,yDAAyD;AACzD,+EAA+E;AAE/E,MAAM,CAAC,MAAM,oBAAoB,GAAG;IAClC,OAAO,EAAE,GAAG;IACZ,KAAK,EAAE,GAAG;IACV,MAAM,EAAE,GAAG;IACX,gBAAgB,EAAE,GAAG;CACb,CAAC;AAEX,yBAAyB;AAEzB;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,CAAS;IACjC,MAAM,EAAE,GAAG,WAAW,CAAC;IACvB,MAAM,EAAE,GAAG,CAAC,WAAW,CAAC;IACxB,MAAM,EAAE,GAAG,WAAW,CAAC;IACvB,MAAM,EAAE,GAAG,CAAC,WAAW,CAAC;IACxB,MAAM,EAAE,GAAG,WAAW,CAAC;IACvB,MAAM,CAAC,GAAG,SAAS,CAAC;IAEpB,MAAM,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC5B,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC;IACtC,MAAM,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC;IACjC,MAAM,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC;IAE5F,OAAO,GAAG,GAAG,CAAC,GAAG,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC;AAChC,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,QAAgB,EAAE,MAAc,EAAE,KAAa;IAC5E,IAAI,Q
AAQ,IAAI,CAAC;QAAE,OAAO,CAAC,CAAC;IAC5B,IAAI,QAAQ,IAAI,CAAC;QAAE,OAAO,GAAG,CAAC;IAE9B,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,GAAG,KAAK,CAAC;IAC1D,MAAM,GAAG,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;IAEzB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC;AAC/B,CAAC;AAED,sBAAsB;AAEtB;;;;;;;;GAQG;AACH,MAAM,UAAU,uBAAuB,CAAC,MAAgB;IACtD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,GAAG,CAAC;IAEpC,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACvB,MAAM,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC3B,WAAW,IAAI,CAAC,GAAG,CAAC,CAAC;QACrB,WAAW,IAAI,CAAC,CAAC;IACnB,CAAC;IAED,OAAO,WAAW,GAAG,WAAW,CAAC;AACnC,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kBAAkB,CAChC,MAA0B,EAC1B,YAA2B,EAC3B,SAA8D;IAE9D,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,oBAAoB,CAAC,SAAS,CAAC,CAAC;IAEhE,wEAAwE;IACxE,IAAI,SAAS,KAAK,OAAO,EAAE,CAAC;QAC1B,OAAO,uBAAuB,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC;IAC7D,CAAC;IAED,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAEnE,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACjD,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,WAAW,EAAE,YAAY,IAAI,CAAC,CAAC,CAAC;QACtD,WAAW,IAAI,KAAK,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;QACpC,WAAW,IAAI,CAAC,CAAC;IACnB,CAAC;IAED,OAAO,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,GAAG,WAAW,CAAC,CAAC,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC;AACvF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,oBAAoB,CAClC,QAA6B,EAC7B,MAA0B,EAC1B,SAA4B,EAC5B,YAA2B,EAC3B,WAA+B;IAE/B,MAAM,oBAAoB,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;IACzF,MAAM,cAAc,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC7E,MAAM,YAAY,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAA
S,KAAK,SAAS,CAAC,CAAC,MAAM,CAAC;IAEpF,MAAM,OAAO,GAAG,0BAA0B,CAAC,QAAQ,CAAC,CAAC;IACrD,MAAM,GAAG,GAAG,WAAW,IAAI,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IAEzD,2BAA2B;IAC3B,MAAM,UAAU,GAAG,kBAAkB,CAAC,cAAc,EAAE,oBAAoB,EAAE,SAAS,CAAC,CAAC;IACvF,MAAM,QAAQ,GAAG,kBAAkB,CAAC,cAAc,EAAE,oBAAoB,EAAE,OAAO,CAAC,CAAC;IACnF,MAAM,SAAS,GAAG,kBAAkB,CAAC,cAAc,EAAE,oBAAoB,EAAE,QAAQ,CAAC,CAAC;IACrF,MAAM,YAAY,GAAG,kBAAkB,CAAC,cAAc,EAAE,oBAAoB,EAAE,kBAAkB,CAAC,CAAC;IAClG,MAAM,YAAY,GAAG,SAAS,CAAC,KAAK,CAAC;IAErC,qCAAqC;IACrC,MAAM,QAAQ,GACZ,UAAU,GAAG,OAAO,CAAC,OAAO;QAC5B,QAAQ,GAAG,OAAO,CAAC,KAAK;QACxB,SAAS,GAAG,OAAO,CAAC,MAAM;QAC1B,YAAY,GAAG,OAAO,CAAC,SAAS;QAChC,YAAY,GAAG,OAAO,CAAC,SAAS,CAAC;IAEnC,6BAA6B;IAC7B,MAAM,KAAK,GAAG,cAAc,CAAC,QAAQ,EAAE,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,KAAK,CAAC,CAAC;IAE9D,OAAO;QACL,KAAK;QACL,gBAAgB,EAAE,oBAAoB,CAAC,MAAM;QAC7C,YAAY;QACZ,UAAU,EAAE;YACV,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,UAAU,GAAG,GAAG,CAAC;YACrC,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,GAAG,GAAG,CAAC;YACjC,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC;YACnC,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,GAAG,CAAC;YACzC,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,GAAG,CAAC;SAC1C;QACD,MAAM,EAAE,cAAc;QACtB,SAAS;KACV,CAAC;AACJ,CAAC"}
@@ -0,0 +1,5 @@
1
+ import type { ScoringWeights } from "../types/config.js";
2
+ /** Validate that scoring weights are non-negative, not all zero, and sum to approximately 1.0 (within 0.01). Throws an Error otherwise. */
3
+ export declare function validateWeights(weights: ScoringWeights): void;
4
+ export declare function computeComposite(goalAchievementScore: number, environmentScore: number, serviceScore: number, agentScore: number, weights: ScoringWeights): number;
5
+ //# sourceMappingURL=composite.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"composite.d.ts","sourceRoot":"","sources":["../../src/scoring/composite.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AAEzD,+EAA+E;AAC/E,wBAAgB,eAAe,CAAC,OAAO,EAAE,cAAc,GAAG,IAAI,CAkB7D;AAED,wBAAgB,gBAAgB,CAC9B,oBAAoB,EAAE,MAAM,EAC5B,gBAAgB,EAAE,MAAM,EACxB,YAAY,EAAE,MAAM,EACpB,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,cAAc,GACtB,MAAM,CAUR"}
@@ -0,0 +1,24 @@
1
/**
 * Validate the composite scoring weights.
 *
 * Every weight must be a finite, non-negative number, at least one weight
 * must be non-zero, and the four weights must sum to 1.0 within a tolerance
 * of 0.01.
 *
 * @param weights - Object carrying the `goal_achievement`, `environment`,
 *   `service` and `agent` weights.
 * @throws Error if any weight is missing or not a finite number, if any
 *   weight is negative, if all weights are zero, or if the sum is not
 *   approximately 1.0.
 */
export function validateWeights(weights) {
  const { goal_achievement, environment, service, agent } = weights;
  const entries = [
    ["goal_achievement", goal_achievement],
    ["environment", environment],
    ["service", service],
    ["agent", agent],
  ];

  // NaN, undefined (a missing key) and non-numeric values fail every
  // relational check below, so without this guard they would pass validation
  // and turn the composite score into NaN.
  for (const [name, value] of entries) {
    if (typeof value !== "number" || !Number.isFinite(value)) {
      throw new Error(`Scoring weight "${name}" must be a finite number (got ${String(value)})`);
    }
  }

  if (entries.some(([, value]) => value < 0)) {
    throw new Error("Scoring weights must be non-negative");
  }

  const sum = goal_achievement + environment + service + agent;
  if (sum === 0) {
    throw new Error("Scoring weights must not all be zero");
  }

  if (Math.abs(sum - 1.0) > 0.01) {
    throw new Error(`Scoring weights must sum to 1.0 (got ${sum.toFixed(3)}). ` +
      `Received: goal_achievement=${goal_achievement}, environment=${environment}, service=${service}, agent=${agent}`);
  }
}
16
/**
 * Combine the four component scores into one composite score.
 *
 * The weights are checked with validateWeights first (which throws on
 * invalid weights); each score is then multiplied by its weight and the
 * total is rounded to the nearest integer.
 *
 * @param goalAchievementScore - Score weighted by `weights.goal_achievement`.
 * @param environmentScore - Score weighted by `weights.environment`.
 * @param serviceScore - Score weighted by `weights.service`.
 * @param agentScore - Score weighted by `weights.agent`.
 * @param weights - Scoring weights.
 * @returns The rounded weighted sum.
 */
export function computeComposite(goalAchievementScore, environmentScore, serviceScore, agentScore, weights) {
  validateWeights(weights);

  const goalPart = goalAchievementScore * weights.goal_achievement;
  const environmentPart = environmentScore * weights.environment;
  const servicePart = serviceScore * weights.service;
  const agentPart = agentScore * weights.agent;

  return Math.round(goalPart + environmentPart + servicePart + agentPart);
}
24
+ //# sourceMappingURL=composite.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"composite.js","sourceRoot":"","sources":["../../src/scoring/composite.ts"],"names":[],"mappings":"AAEA,+EAA+E;AAC/E,MAAM,UAAU,eAAe,CAAC,OAAuB;IACrD,MAAM,EAAE,gBAAgB,EAAE,WAAW,EAAE,OAAO,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC;IAElE,IAAI,gBAAgB,GAAG,CAAC,IAAI,WAAW,GAAG,CAAC,IAAI,OAAO,GAAG,CAAC,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;QACxE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;IAC1D,CAAC;IAED,MAAM,GAAG,GAAG,gBAAgB,GAAG,WAAW,GAAG,OAAO,GAAG,KAAK,CAAC;IAC7D,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC;QACd,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;IAC1D,CAAC;IAED,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,GAAG,CAAC,GAAG,IAAI,EAAE,CAAC;QAC/B,MAAM,IAAI,KAAK,CACb,wCAAwC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK;YACzD,8BAA8B,gBAAgB,iBAAiB,WAAW,aAAa,OAAO,WAAW,KAAK,EAAE,CACnH,CAAC;IACJ,CAAC;AACH,CAAC;AAED,MAAM,UAAU,gBAAgB,CAC9B,oBAA4B,EAC5B,gBAAwB,EACxB,YAAoB,EACpB,UAAkB,EAClB,OAAuB;IAEvB,eAAe,CAAC,OAAO,CAAC,CAAC;IAEzB,MAAM,QAAQ,GACZ,oBAAoB,GAAG,OAAO,CAAC,gBAAgB;QAC/C,gBAAgB,GAAG,OAAO,CAAC,WAAW;QACtC,YAAY,GAAG,OAAO,CAAC,OAAO;QAC9B,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC;IAE7B,OAAO,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;AAC9B,CAAC"}
@@ -0,0 +1,25 @@
1
+ import type { NormalizedTranscript } from "../transcript/types.js";
2
+ import type { RunResult } from "../types/output.js";
3
+ import type { DeepEvalResult, Interaction, SparseIndex, TriageResult } from "../types/scoring.js";
4
+ /**
5
+ * Run the deep evaluation LLM pass.
6
+ *
7
+ * Speed is always computed heuristically from interaction timing data (no LLM needed).
8
+ * The LLM evaluates ALL interactions for success, weight, contextRelevance,
9
+ * and necessity per category.
10
+ */
11
+ export declare function runDeepEval(result: RunResult, sparseIndex: SparseIndex, triage: TriageResult, normalized: NormalizedTranscript): Promise<DeepEvalResult>;
12
+ /**
13
+ * Compute a heuristic speed score (0-1) for an interaction based on
14
+ * duration and category. Deterministic — no LLM needed.
15
+ *
16
+ * Thresholds are generous to account for system overhead
17
+ * (SDK roundtrips, sandbox setup, process spawning).
18
+ */
19
+ export declare function computeHeuristicSpeed(interaction: Interaction): number;
20
+ /**
21
+ * Parse the deep eval LLM response.
22
+ * Fills in default audits for interactions the LLM missed and default necessity for missing categories.
23
+ */
24
+ export declare function parseDeepEvalResponse(responseText: string, sparseIndex: SparseIndex): DeepEvalResult;
25
+ //# sourceMappingURL=deep-eval.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"deep-eval.d.ts","sourceRoot":"","sources":["../../src/scoring/deep-eval.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAmB,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AACpF,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACpD,OAAO,KAAK,EACV,cAAc,EACd,WAAW,EAIX,WAAW,EACX,YAAY,EACb,MAAM,qBAAqB,CAAC;AAa7B;;;;;;GAMG;AACH,wBAAsB,WAAW,CAC/B,MAAM,EAAE,SAAS,EACjB,WAAW,EAAE,WAAW,EACxB,MAAM,EAAE,YAAY,EACpB,UAAU,EAAE,oBAAoB,GAC/B,OAAO,CAAC,cAAc,CAAC,CAoBzB;AAED;;;;;;GAMG;AACH,wBAAgB,qBAAqB,CAAC,WAAW,EAAE,WAAW,GAAG,MAAM,CAgCtE;AAqMD;;;GAGG;AACH,wBAAgB,qBAAqB,CAAC,YAAY,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,GAAG,cAAc,CA+CpG"}