@dailephd/my-dev-kit-lab 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/README.md +272 -0
  2. package/benchmarks/contracts/benchmark-project-profiles.json +1199 -0
  3. package/benchmarks/contracts/todo-behavior.md +70 -0
  4. package/benchmarks/contracts/todo-benchmark-case.json +227 -0
  5. package/benchmarks/projects/README.md +34 -0
  6. package/benchmarks/projects/task-analytics-large-mixed/README.md +1 -0
  7. package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/__init__.py +3 -0
  8. package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/fixtures.py +6 -0
  9. package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/metrics.py +29 -0
  10. package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/models.py +21 -0
  11. package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/parser.py +16 -0
  12. package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/pipeline.py +9 -0
  13. package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/quality.py +8 -0
  14. package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/reporting.py +11 -0
  15. package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_metrics.py +19 -0
  16. package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_parser.py +15 -0
  17. package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_quality.py +19 -0
  18. package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_reporting.py +15 -0
  19. package/benchmarks/projects/task-analytics-large-mixed/ts/package.json +12 -0
  20. package/benchmarks/projects/task-analytics-large-mixed/ts/src/index.ts +11 -0
  21. package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/analyticsSnapshot.ts +20 -0
  22. package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/project.ts +5 -0
  23. package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/task.ts +10 -0
  24. package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/buildProjectLeaderboard.ts +7 -0
  25. package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/formatTaskHealthReport.ts +13 -0
  26. package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/buildAnalyticsSnapshot.ts +39 -0
  27. package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/completeTask.ts +10 -0
  28. package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/createTask.ts +21 -0
  29. package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/listTasksByProject.ts +6 -0
  30. package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/projectStore.ts +20 -0
  31. package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/taskStore.ts +44 -0
  32. package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/projectValidation.ts +12 -0
  33. package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/taskValidation.ts +18 -0
  34. package/benchmarks/projects/task-analytics-large-mixed/ts/tests/buildAnalyticsSnapshot.test.ts +48 -0
  35. package/benchmarks/projects/task-analytics-large-mixed/ts/tests/completeTask.test.ts +21 -0
  36. package/benchmarks/projects/task-analytics-large-mixed/ts/tests/createTask.test.ts +31 -0
  37. package/benchmarks/projects/task-analytics-large-mixed/ts/tests/listTasksByProject.test.ts +18 -0
  38. package/benchmarks/projects/task-analytics-large-mixed/ts/tests/reporting.test.ts +19 -0
  39. package/benchmarks/projects/task-analytics-large-mixed/ts/tsconfig.json +12 -0
  40. package/benchmarks/projects/task-analytics-large-mixed/ts/vitest.config.ts +5 -0
  41. package/benchmarks/projects/task-workflow-medium-ts/README.md +1 -0
  42. package/benchmarks/projects/task-workflow-medium-ts/package.json +12 -0
  43. package/benchmarks/projects/task-workflow-medium-ts/src/index.ts +9 -0
  44. package/benchmarks/projects/task-workflow-medium-ts/src/models/project.ts +6 -0
  45. package/benchmarks/projects/task-workflow-medium-ts/src/models/task.ts +39 -0
  46. package/benchmarks/projects/task-workflow-medium-ts/src/services/completeTask.ts +15 -0
  47. package/benchmarks/projects/task-workflow-medium-ts/src/services/createTask.ts +26 -0
  48. package/benchmarks/projects/task-workflow-medium-ts/src/services/filterTasks.ts +17 -0
  49. package/benchmarks/projects/task-workflow-medium-ts/src/services/importTasks.ts +33 -0
  50. package/benchmarks/projects/task-workflow-medium-ts/src/services/summarizeTasks.ts +30 -0
  51. package/benchmarks/projects/task-workflow-medium-ts/src/store/taskStore.ts +76 -0
  52. package/benchmarks/projects/task-workflow-medium-ts/src/utils/deterministicId.ts +3 -0
  53. package/benchmarks/projects/task-workflow-medium-ts/src/validation/taskValidation.ts +45 -0
  54. package/benchmarks/projects/task-workflow-medium-ts/tests/completeTask.test.ts +16 -0
  55. package/benchmarks/projects/task-workflow-medium-ts/tests/createTask.test.ts +21 -0
  56. package/benchmarks/projects/task-workflow-medium-ts/tests/filterTasks.test.ts +18 -0
  57. package/benchmarks/projects/task-workflow-medium-ts/tests/importTasks.test.ts +22 -0
  58. package/benchmarks/projects/task-workflow-medium-ts/tests/summarizeTasks.test.ts +29 -0
  59. package/benchmarks/projects/task-workflow-medium-ts/tsconfig.json +12 -0
  60. package/benchmarks/projects/task-workflow-medium-ts/vitest.config.ts +5 -0
  61. package/benchmarks/projects/todo-js/README.md +3 -0
  62. package/benchmarks/projects/todo-js/package.json +11 -0
  63. package/benchmarks/projects/todo-js/src/index.js +2 -0
  64. package/benchmarks/projects/todo-js/src/taskService.js +37 -0
  65. package/benchmarks/projects/todo-js/src/taskStore.js +28 -0
  66. package/benchmarks/projects/todo-js/tests/taskService.test.js +45 -0
  67. package/benchmarks/projects/todo-js/vitest.config.js +5 -0
  68. package/benchmarks/projects/todo-mixed-ts-py/README.md +3 -0
  69. package/benchmarks/projects/todo-mixed-ts-py/package.json +13 -0
  70. package/benchmarks/projects/todo-mixed-ts-py/python/task_service.py +76 -0
  71. package/benchmarks/projects/todo-mixed-ts-py/src/taskCli.ts +38 -0
  72. package/benchmarks/projects/todo-mixed-ts-py/tests/mixedBoundary.test.ts +18 -0
  73. package/benchmarks/projects/todo-mixed-ts-py/tsconfig.json +12 -0
  74. package/benchmarks/projects/todo-mixed-ts-py/vitest.config.ts +5 -0
  75. package/benchmarks/projects/todo-python/README.md +3 -0
  76. package/benchmarks/projects/todo-python/src/__init__.py +4 -0
  77. package/benchmarks/projects/todo-python/src/task_service.py +32 -0
  78. package/benchmarks/projects/todo-python/src/task_store.py +28 -0
  79. package/benchmarks/projects/todo-python/tests/test_task_service.py +52 -0
  80. package/benchmarks/projects/todo-ts/README.md +3 -0
  81. package/benchmarks/projects/todo-ts/package.json +12 -0
  82. package/benchmarks/projects/todo-ts/src/index.ts +2 -0
  83. package/benchmarks/projects/todo-ts/src/taskService.ts +41 -0
  84. package/benchmarks/projects/todo-ts/src/taskStore.ts +34 -0
  85. package/benchmarks/projects/todo-ts/tests/taskService.test.ts +45 -0
  86. package/benchmarks/projects/todo-ts/tsconfig.json +12 -0
  87. package/benchmarks/projects/todo-ts/vitest.config.ts +5 -0
  88. package/dist/scripts/build-gallery.js +3 -0
  89. package/dist/scripts/capture-demo-report.js +3 -0
  90. package/dist/scripts/evaluate-token-savings.js +2 -0
  91. package/dist/scripts/experiments/describeExperiment.js +143 -0
  92. package/dist/scripts/experiments/listExperiments.js +44 -0
  93. package/dist/scripts/experiments/runExperiment.js +199 -0
  94. package/dist/scripts/generate-experiment-plots.js +3 -0
  95. package/dist/scripts/generate-prompt-variants.js +2 -0
  96. package/dist/scripts/render-experiment-report.js +2 -0
  97. package/dist/scripts/run-agent-prompt.js +2 -0
  98. package/dist/scripts/run-controlled-experiment.js +2 -0
  99. package/dist/scripts/run-final-demo.js +3 -0
  100. package/dist/scripts/run-lab-demo.js +5 -0
  101. package/dist/scripts/run-visualization-demos.js +3 -0
  102. package/dist/scripts/security/runCodeql.js +57 -0
  103. package/dist/scripts/security/runDependencyChecks.js +57 -0
  104. package/dist/scripts/security/runFuzzSmoke.js +29 -0
  105. package/dist/scripts/security/runPackageChecks.js +56 -0
  106. package/dist/scripts/security/runSemgrep.js +63 -0
  107. package/dist/scripts/security/validate.js +117 -0
  108. package/dist/scripts/verify-benchmarks.js +202 -0
  109. package/dist/src/agents/adapters/claudeAdapter.js +37 -0
  110. package/dist/src/agents/adapters/codexAdapter.js +110 -0
  111. package/dist/src/agents/adapters/fakeAgentAdapter.js +101 -0
  112. package/dist/src/agents/agentRegistry.js +21 -0
  113. package/dist/src/agents/index.js +7 -0
  114. package/dist/src/agents/parseAgentTokenUsage.js +137 -0
  115. package/dist/src/agents/runAgentPrompt.js +38 -0
  116. package/dist/src/agents/types.js +1 -0
  117. package/dist/src/commands/buildGalleryCommand.js +56 -0
  118. package/dist/src/commands/captureDemoReport.js +116 -0
  119. package/dist/src/commands/evaluateTokenSavings.js +175 -0
  120. package/dist/src/commands/generateExperimentPlotsCommand.js +38 -0
  121. package/dist/src/commands/generatePromptVariants.js +67 -0
  122. package/dist/src/commands/renderExperimentReportCommand.js +131 -0
  123. package/dist/src/commands/runAgentPromptCommand.js +132 -0
  124. package/dist/src/commands/runControlledExperimentCommand.js +174 -0
  125. package/dist/src/commands/runFinalDemoCommand.js +123 -0
  126. package/dist/src/commands/runLabDemo.js +62 -0
  127. package/dist/src/commands/runVisualizationDemosCommand.js +67 -0
  128. package/dist/src/core/commandLine.js +59 -0
  129. package/dist/src/core/countTokens.js +8 -0
  130. package/dist/src/core/fileGlobs.js +100 -0
  131. package/dist/src/core/localProjectTarget.js +75 -0
  132. package/dist/src/core/pathSafety.js +19 -0
  133. package/dist/src/core/pythonCommand.js +30 -0
  134. package/dist/src/core/resolveCommand.js +110 -0
  135. package/dist/src/core/runMeasuredCommand.js +143 -0
  136. package/dist/src/evaluation/benchmarkMetadata.js +207 -0
  137. package/dist/src/evaluation/buildExperimentMatrix.js +75 -0
  138. package/dist/src/evaluation/classifyAgentRunOutcome.js +40 -0
  139. package/dist/src/evaluation/compareExperimentRuns.js +79 -0
  140. package/dist/src/evaluation/compareTokenSavings.js +47 -0
  141. package/dist/src/evaluation/controlledExperimentTypes.js +1 -0
  142. package/dist/src/evaluation/index.js +18 -0
  143. package/dist/src/evaluation/parseAgentAnswer.js +230 -0
  144. package/dist/src/evaluation/projectComplexity.js +126 -0
  145. package/dist/src/evaluation/projectFileTree.js +83 -0
  146. package/dist/src/evaluation/readEvaluationCases.js +59 -0
  147. package/dist/src/evaluation/renderTokenSavingsReportInput.js +55 -0
  148. package/dist/src/evaluation/runControlledExperiment.js +158 -0
  149. package/dist/src/evaluation/runMyDevKitRetrieval.js +197 -0
  150. package/dist/src/evaluation/runRawFullFileBaseline.js +31 -0
  151. package/dist/src/evaluation/scoreCorrectness.js +127 -0
  152. package/dist/src/evaluation/types.js +1 -0
  153. package/dist/src/evaluation/writeExperimentArtifacts.js +104 -0
  154. package/dist/src/evaluation/writeTokenSavingsArtifacts.js +57 -0
  155. package/dist/src/experiments/config.js +24 -0
  156. package/dist/src/experiments/defaultRegistry.js +7 -0
  157. package/dist/src/experiments/errors.js +18 -0
  158. package/dist/src/experiments/index.js +9 -0
  159. package/dist/src/experiments/outputPaths.js +25 -0
  160. package/dist/src/experiments/plugins/contextStrategyComparison/config.js +37 -0
  161. package/dist/src/experiments/plugins/contextStrategyComparison/index.js +3 -0
  162. package/dist/src/experiments/plugins/contextStrategyComparison/plugin.js +83 -0
  163. package/dist/src/experiments/plugins/contextStrategyComparison/resultMapping.js +260 -0
  164. package/dist/src/experiments/plugins/index.js +1 -0
  165. package/dist/src/experiments/registry.js +43 -0
  166. package/dist/src/experiments/results.js +48 -0
  167. package/dist/src/experiments/runner.js +181 -0
  168. package/dist/src/experiments/target.js +8 -0
  169. package/dist/src/experiments/types.js +1 -0
  170. package/dist/src/gallery/index.js +2 -0
  171. package/dist/src/gallery/types.js +1 -0
  172. package/dist/src/gallery/writeGalleryManifest.js +214 -0
  173. package/dist/src/index.js +12 -0
  174. package/dist/src/plots/buildExperimentPlotData.js +137 -0
  175. package/dist/src/plots/index.js +4 -0
  176. package/dist/src/plots/renderSvgChart.js +82 -0
  177. package/dist/src/plots/types.js +1 -0
  178. package/dist/src/plots/writePlotArtifacts.js +46 -0
  179. package/dist/src/prompts/buildPromptContext.js +68 -0
  180. package/dist/src/prompts/generateMyDevKitPrompt.js +106 -0
  181. package/dist/src/prompts/generatePromptVariants.js +36 -0
  182. package/dist/src/prompts/generateRawFullFilePrompt.js +97 -0
  183. package/dist/src/prompts/index.js +7 -0
  184. package/dist/src/prompts/measurePromptComplexity.js +41 -0
  185. package/dist/src/prompts/types.js +1 -0
  186. package/dist/src/prompts/writePromptArtifacts.js +43 -0
  187. package/dist/src/report/buildExperimentReportInput.js +339 -0
  188. package/dist/src/report/experimentReportTypes.js +1 -0
  189. package/dist/src/report/experiments/buildPluginExperimentReport.js +153 -0
  190. package/dist/src/report/experiments/experimentReportModel.js +1 -0
  191. package/dist/src/report/experiments/index.js +4 -0
  192. package/dist/src/report/experiments/renderPluginExperimentReportHtml.js +133 -0
  193. package/dist/src/report/experiments/writePluginExperimentReports.js +30 -0
  194. package/dist/src/report/index.js +8 -0
  195. package/dist/src/report/renderExperimentHtmlReport.js +354 -0
  196. package/dist/src/report/renderHtmlReport.js +103 -0
  197. package/dist/src/report/types.js +10 -0
  198. package/dist/src/report/writeExperimentReportArtifacts.js +38 -0
  199. package/dist/src/report/writeReportArtifacts.js +39 -0
  200. package/dist/src/screenshot/captureReportScreenshot.js +75 -0
  201. package/dist/src/screenshot/index.js +2 -0
  202. package/dist/src/screenshot/types.js +1 -0
  203. package/dist/src/securityValidation/artifacts.js +15 -0
  204. package/dist/src/securityValidation/cliAdversarial/adversarialCliConfig.js +38 -0
  205. package/dist/src/securityValidation/cliAdversarial/dataVolumeChecks.js +194 -0
  206. package/dist/src/securityValidation/cliAdversarial/jsonStdoutChecks.js +359 -0
  207. package/dist/src/securityValidation/cliAdversarial/malformedArtifactChecks.js +284 -0
  208. package/dist/src/securityValidation/cliAdversarial/malformedArtifactFixtures.js +79 -0
  209. package/dist/src/securityValidation/cliAdversarial/pathBoundaryChecks.js +431 -0
  210. package/dist/src/securityValidation/cliAdversarial/pathCases.js +144 -0
  211. package/dist/src/securityValidation/cliAdversarial/readOnlyBoundaryChecks.js +294 -0
  212. package/dist/src/securityValidation/cliAdversarial/runAdversarialCheck.js +149 -0
  213. package/dist/src/securityValidation/cliAdversarial/subprocessSafetyChecks.js +214 -0
  214. package/dist/src/securityValidation/cliAdversarial/tempWorkspace.js +160 -0
  215. package/dist/src/securityValidation/commandRunner.js +136 -0
  216. package/dist/src/securityValidation/config.js +39 -0
  217. package/dist/src/securityValidation/dependencies/parseNpmAudit.js +115 -0
  218. package/dist/src/securityValidation/dependencies/parseNpmLs.js +71 -0
  219. package/dist/src/securityValidation/dependencies/parseNpmOutdated.js +41 -0
  220. package/dist/src/securityValidation/dependencies/runDependencyChecks.js +239 -0
  221. package/dist/src/securityValidation/dependencies/runOsvScanner.js +43 -0
  222. package/dist/src/securityValidation/fuzz/fuzzHarness.js +61 -0
  223. package/dist/src/securityValidation/fuzz/fuzzTargets.js +204 -0
  224. package/dist/src/securityValidation/fuzz/randomInput.js +0 -0
  225. package/dist/src/securityValidation/index.js +34 -0
  226. package/dist/src/securityValidation/packageChecks/forbiddenPackageContents.js +67 -0
  227. package/dist/src/securityValidation/packageChecks/parseNpmPackDryRun.js +56 -0
  228. package/dist/src/securityValidation/packageChecks/runPackageChecks.js +88 -0
  229. package/dist/src/securityValidation/report/renderSecurityReport.js +248 -0
  230. package/dist/src/securityValidation/report/securityReportTypes.js +1 -0
  231. package/dist/src/securityValidation/staticScans/codeql.js +66 -0
  232. package/dist/src/securityValidation/staticScans/semgrep.js +180 -0
  233. package/dist/src/securityValidation/testMatrix.js +535 -0
  234. package/dist/src/securityValidation/types.js +34 -0
  235. package/dist/src/securityValidation/validate/resolveTarget.js +32 -0
  236. package/dist/src/securityValidation/validate/runSecurityValidation.js +169 -0
  237. package/dist/src/securityValidation/validate/verdict.js +73 -0
  238. package/dist/src/visualizationDemos/buildMyDevKitVisualizationCommands.js +59 -0
  239. package/dist/src/visualizationDemos/index.js +4 -0
  240. package/dist/src/visualizationDemos/runVisualizationDemos.js +82 -0
  241. package/dist/src/visualizationDemos/types.js +1 -0
  242. package/dist/src/visualizationDemos/writeVisualizationDemoArtifacts.js +25 -0
  243. package/docs/METRICS.md +286 -0
  244. package/examples/demo-report-input.json +78 -0
  245. package/examples/lab-demo-cases.json +35 -0
  246. package/examples/real-agent-campaign-cases.json +118 -0
  247. package/examples/token-savings-cases.json +122 -0
  248. package/package.json +91 -0
  249. package/tests/fixtures/fake-adversarial-cli.js +152 -0
  250. package/tests/fixtures/fake-my-dev-kit-cli.js +83 -0
@@ -0,0 +1,158 @@
1
+ import path from "node:path";
2
+ import { runAgentPrompt } from "../agents/index.js";
3
+ import { generatePromptVariants } from "../prompts/index.js";
4
+ import { buildExperimentMatrix } from "./buildExperimentMatrix.js";
5
+ import { classifyAgentRunOutcome } from "./classifyAgentRunOutcome.js";
6
+ import { compareExperimentRuns } from "./compareExperimentRuns.js";
7
+ import { parseAgentAnswer } from "./parseAgentAnswer.js";
8
+ import { scoreCorrectness } from "./scoreCorrectness.js";
9
+ import { buildExperimentSummary, writeExperimentArtifacts } from "./writeExperimentArtifacts.js";
10
/**
 * Runs every cell of the experiment matrix and writes the resulting artifacts.
 *
 * Missing config fields are defaulted: `agents` to `["fake-agent"]`,
 * `strategies` to `["raw-full-file", "my-dev-kit-guided"]`, `complexityLevels`
 * to `["short"]`, `continueOnFailure` to `true` and `includeRealAgents` to
 * `false`. Cells are executed one after another in matrix order.
 *
 * @param {object} args
 * @param {Array<object>} args.cases - Evaluation cases, matched to matrix cells by `id`.
 * @param {object} args.config - Experiment configuration; `outDir` is resolved against `repoRoot`.
 * @param {*} [args.projectProfiles] - Forwarded unchanged to prompt generation.
 * @param {string} [args.repoRoot] - Defaults to `process.cwd()`.
 * @param {object} [args.env] - Environment handed to the agent runner; defaults to `process.env`.
 * @returns {Promise<*>} The value returned by `writeExperimentArtifacts`.
 * @throws {Error} When a matrix cell references an unknown case id or no prompt
 *   variant can be generated for it.
 */
export async function runControlledExperiment(args) {
  const repoRoot = args.repoRoot ?? process.cwd();
  const config = {
    ...args.config,
    agents: args.config.agents ?? ["fake-agent"],
    strategies: args.config.strategies ?? ["raw-full-file", "my-dev-kit-guided"],
    complexityLevels: args.config.complexityLevels ?? ["short"],
    continueOnFailure: args.config.continueOnFailure ?? true,
    includeRealAgents: args.config.includeRealAgents ?? false
  };
  const matrix = buildExperimentMatrix({ cases: args.cases, config });

  // Loop-invariant values are resolved once instead of once per cell.
  const outDir = path.resolve(repoRoot, config.outDir);
  const env = args.env ?? process.env;

  // Index cases by id once; the first case wins on duplicate ids, which is
  // what a linear `find` over `args.cases` would have returned.
  const casesById = new Map();
  for (const candidate of args.cases) {
    if (!casesById.has(candidate.id)) {
      casesById.set(candidate.id, candidate);
    }
  }

  const runs = [];
  for (const cell of matrix) {
    const evaluationCase = casesById.get(cell.caseId);
    if (!evaluationCase) {
      throw new Error(`Evaluation case not found while running matrix: ${cell.caseId}`);
    }
    const promptVariant = buildPromptVariant({
      evaluationCase,
      projectProfiles: args.projectProfiles,
      strategy: cell.strategy,
      complexityLevel: cell.complexityLevel
    });
    const run = await executeExperimentCell({
      runId: cell.runId,
      agentId: cell.agentId,
      promptVariant,
      repoRoot,
      runDir: path.join(outDir, "runs", cell.runId),
      timeoutMs: config.timeoutMs,
      requireAgents: config.requireAgents ?? false,
      // Command templates are only looked up for the codex and claude agents.
      commandTemplate: cell.agentId === "codex" || cell.agentId === "claude" ? config.commandTemplates?.[cell.agentId] : undefined,
      env
    });
    runs.push(run);
    // Stop early only when the caller explicitly opted out of continuing.
    if (run.status !== "completed" && config.continueOnFailure === false) {
      break;
    }
  }

  const comparisons = compareExperimentRuns(runs);
  const summary = buildExperimentSummary({ config, runs, comparisons });
  return writeExperimentArtifacts({
    outDir,
    config,
    runs,
    comparisons,
    summary
  });
}
60
/**
 * Executes a single matrix cell: runs the agent, parses its answer, classifies
 * the outcome and scores correctness.
 *
 * Any error thrown by the agent runner is converted into a synthetic failed
 * agent result, so this function still returns a run record for that cell.
 *
 * @param {object} args - Cell inputs (`runId`, `agentId`, `promptVariant`,
 *   `repoRoot`, `runDir`, `timeoutMs`, `requireAgents`, `commandTemplate`, `env`).
 * @returns {Promise<object>} The run record for this cell.
 */
async function executeExperimentCell(args) {
  const { promptVariant } = args;

  let agentRunResult;
  try {
    agentRunResult = await runAgentPrompt({
      runId: args.runId,
      agentId: args.agentId,
      promptVariant,
      promptText: promptVariant.promptText,
      cwd: args.repoRoot,
      outDir: args.runDir,
      timeoutMs: args.timeoutMs,
      requireAvailable: args.requireAgents,
      commandTemplate: args.commandTemplate,
      env: args.env
    });
  } catch (failure) {
    agentRunResult = buildSyntheticFailureResult(args, failure);
  }

  const answerKey = promptVariant.expectedAnswerKey;
  const parsedAnswer = parseAgentAnswer({
    text: agentRunResult.finalAnswerText,
    answerKey,
    tokenUsage: agentRunResult.tokenUsage
  });
  const outcome = classifyAgentRunOutcome({ agentRunResult, parsedAnswer });
  const correctness = scoreCorrectness({
    caseId: promptVariant.caseId,
    answerKey,
    parsedAnswer,
    status: outcome.status
  });

  return {
    runId: args.runId,
    caseId: promptVariant.caseId,
    benchmarkProject: promptVariant.benchmarkProject,
    agentId: args.agentId,
    promptStrategy: promptVariant.strategy,
    promptComplexityLevel: promptVariant.complexityLevel,
    promptVariantId: promptVariant.id,
    promptTextForArtifact: promptVariant.promptText,
    projectComplexityLevel: promptVariant.projectProfile.complexityLevel,
    projectComplexityScore: promptVariant.projectProfile.complexityScore,
    promptMetrics: promptVariant.promptMetrics,
    agentRunResult,
    parsedAnswer,
    correctness,
    status: outcome.status,
    statusReason: outcome.statusReason,
    startedAt: agentRunResult.startedAt,
    endedAt: agentRunResult.endedAt,
    durationMs: agentRunResult.durationMs,
    tokenUsage: agentRunResult.tokenUsage,
    tokenUsageSource: agentRunResult.tokenUsageSource,
    tokenUsageReliability: agentRunResult.tokenUsageReliability,
    warnings: outcome.warnings,
    errors: outcome.errors,
    artifactPaths: {}
  };
}
119
/**
 * Generates the single prompt variant for one case / strategy / complexity
 * combination.
 *
 * @param {object} args - `evaluationCase`, `projectProfiles`, `strategy`, `complexityLevel`.
 * @returns {object} The first variant produced by `generatePromptVariants`.
 * @throws {Error} When no variant is produced.
 */
function buildPromptVariant(args) {
  const request = {
    cases: [args.evaluationCase],
    projectProfiles: args.projectProfiles,
    strategies: [args.strategy],
    complexityLevels: [args.complexityLevel]
  };
  const [firstVariant] = generatePromptVariants(request);
  if (firstVariant) {
    return firstVariant;
  }
  throw new Error(`Failed to generate prompt variant for case: ${args.evaluationCase.id}`);
}
131
/**
 * Builds a stand-in agent run result for a cell whose agent invocation threw.
 *
 * The result is marked `failed`, has zero duration (start and end share one
 * timestamp), an empty answer, unavailable token usage, and carries the error
 * message as its only entry in `errors`.
 *
 * @param {object} args - Cell inputs; reads `runId`, `agentId`, `repoRoot` and `promptVariant`.
 * @param {*} error - The thrown value; non-Error values are stringified.
 * @returns {object} The synthetic failed agent run result.
 */
function buildSyntheticFailureResult(args, error) {
  const { runId, agentId, repoRoot, promptVariant } = args;
  const timestamp = new Date().toISOString();

  let reason;
  if (error instanceof Error) {
    reason = error.message;
  } else {
    reason = String(error);
  }

  let surface = "cli";
  if (agentId === "fake-agent") {
    surface = "simulated";
  }

  return {
    runId,
    agentId,
    displayName: agentId,
    surface,
    promptVariantId: promptVariant.id,
    promptStrategy: promptVariant.strategy,
    promptComplexityLevel: promptVariant.complexityLevel,
    startedAt: timestamp,
    endedAt: timestamp,
    durationMs: 0,
    status: "failed",
    exitCode: null,
    command: agentId,
    args: [],
    cwd: repoRoot,
    finalAnswerText: "",
    finalAnswerParseStatus: "empty",
    tokenUsage: { source: "unavailable" },
    tokenUsageSource: "unavailable",
    tokenUsageReliability: "unavailable",
    warnings: [],
    errors: [reason]
  };
}
@@ -0,0 +1,197 @@
1
+ import path from "node:path";
2
+ import { countEstimatedTokens, countTextChars, tokenCountMethod } from "../core/countTokens.js";
3
+ import { runMeasuredCommand } from "../core/runMeasuredCommand.js";
4
/**
 * Parses `text` as JSON, returning `undefined` instead of throwing when the
 * text is not valid JSON.
 *
 * @param {string} text - Candidate JSON text.
 * @returns {*} The parsed value, or `undefined` on a parse failure.
 */
function parseJsonIfPossible(text) {
  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    parsed = undefined;
  }
  return parsed;
}
12
/**
 * Extracts the list of object-valued search results from a parsed payload.
 *
 * A payload that is itself an array is used directly. Otherwise the first of
 * the keys `results`, `matches`, `items`, `data` that holds an array is used.
 * Entries that are falsy or not of type "object" are dropped.
 *
 * @param {*} payload - Parsed search output of unknown shape.
 * @returns {Array<object>} The object entries found, or an empty array.
 */
function readSearchResults(payload) {
  const keepObjects = (entries) => entries.filter((entry) => Boolean(entry) && typeof entry === "object");

  if (Array.isArray(payload)) {
    return keepObjects(payload);
  }
  const isRecord = Boolean(payload) && typeof payload === "object";
  if (!isRecord) {
    return [];
  }
  const listKey = ["results", "matches", "items", "data"].find((key) => Array.isArray(payload[key]));
  return listKey === undefined ? [] : keepObjects(payload[listKey]);
}
27
/**
 * Normalizes a search candidate into `nodeId`, `file` and `symbol` fields.
 *
 * Each field takes the first non-empty string found among its alias keys and
 * is `undefined` when none of them holds one.
 *
 * @param {object} candidate - One search result entry.
 * @returns {{nodeId: (string|undefined), file: (string|undefined), symbol: (string|undefined)}}
 */
function pickCandidateFields(candidate) {
  const firstNonEmptyString = (aliases) => {
    const match = aliases.find((alias) => typeof candidate[alias] === "string" && candidate[alias]);
    return match === undefined ? undefined : candidate[match];
  };

  const nodeId = firstNonEmptyString(["nodeId", "id", "node", "symbolId"]);
  const file = firstNonEmptyString(["file", "path", "filePath"]);
  const symbol = firstNonEmptyString(["symbol", "name", "label"]);
  return { nodeId, file, symbol };
}
42
/**
 * Retrieves context for one evaluation case by driving the my-dev-kit CLI:
 * `index`, then `search`, then (for the first search candidate that exposes a
 * node id) `lookup`, `slice` and `source`.
 *
 * The context text is the first non-blank output among source, slice, lookup
 * and the raw search output, in that order. Every executed command is recorded
 * in `commands` in execution order.
 *
 * @param {object} options
 * @param {object} options.evaluationCase - Reads `id`, `absoluteTargetRoot`, `sourceRoots` and `query`.
 * @param {string} options.outputDir - Base directory for command and index output.
 * @param {string} options.kitCommand - Command string used to invoke the kit.
 * @param {boolean} [options.requireKit] - When truthy, a failed `index` or `search` throws instead of skipping.
 * @returns {Promise<object>} Retrieval result; `skipped` is true when no context was produced.
 * @throws {Error} When `requireKit` is truthy and `index` or `search` fails.
 */
export async function runMyDevKitRetrieval(options) {
  const started = Date.now();
  const warnings = [];
  const { evaluationCase } = options;
  const commandsDir = path.join(options.outputDir, "commands", evaluationCase.id);
  const indexesDir = path.join(options.outputDir, "indexes", evaluationCase.id);
  const commands = [];

  // Runs one kit subcommand with the shared settings and records it.
  const runKitCommand = async (commandId, extraArgs) => {
    const result = await runMeasuredCommand({
      commandId,
      commandString: options.kitCommand,
      cwd: process.cwd(),
      outDir: commandsDir,
      extraArgs
    });
    commands.push(result);
    return result;
  };

  // Runs a best-effort subcommand: its stdout on success, otherwise a warning and "".
  const readOptionalOutput = async (commandId, extraArgs) => {
    const result = await runKitCommand(commandId, extraArgs);
    if (result.ok) {
      return result.stdout;
    }
    warnings.push(`my-dev-kit ${commandId} command failed.`);
    return "";
  };

  // Result shape shared by every early exit that produced no context.
  const buildSkippedResult = () => ({
    caseId: evaluationCase.id,
    skipped: true,
    warnings,
    totalChars: 0,
    totalEstimatedTokens: 0,
    tokenCountMethod,
    contextText: "",
    filesRead: [],
    commands,
    durationMs: Date.now() - started
  });

  const indexCommand = await runKitCommand("index", [
    "index",
    "--root",
    evaluationCase.absoluteTargetRoot,
    ...evaluationCase.sourceRoots.flatMap((sourceRoot) => ["--src", sourceRoot]),
    "--out",
    indexesDir,
    "--json"
  ]);
  if (!indexCommand.ok) {
    if (options.requireKit) {
      throw new Error(indexCommand.error || `my-dev-kit index failed with exit code ${indexCommand.exitCode}`);
    }
    warnings.push("my-dev-kit index command was unavailable or failed.");
    return buildSkippedResult();
  }

  const searchCommand = await runKitCommand("search", [
    "search",
    "--index",
    indexesDir,
    "--query",
    evaluationCase.query,
    "--json"
  ]);
  if (!searchCommand.ok) {
    if (options.requireKit) {
      throw new Error(searchCommand.error || `my-dev-kit search failed with exit code ${searchCommand.exitCode}`);
    }
    warnings.push("my-dev-kit search command failed.");
    return buildSkippedResult();
  }

  // Only the top-ranked candidate is followed up.
  const [selected] = readSearchResults(parseJsonIfPossible(searchCommand.stdout));
  if (!selected) {
    warnings.push("No my-dev-kit search candidate was found.");
    return buildSkippedResult();
  }

  const {
    nodeId: selectedNodeId,
    file: selectedFile,
    symbol: selectedSymbol
  } = pickCandidateFields(selected);

  let lookupOutput = "";
  let sliceOutput = "";
  let sourceOutput = "";
  if (selectedNodeId) {
    // Kept sequential so `commands` preserves lookup -> slice -> source order.
    lookupOutput = await readOptionalOutput("lookup", ["lookup", "--index", indexesDir, "--node", selectedNodeId, "--json"]);
    sliceOutput = await readOptionalOutput("slice", ["slice", "--index", indexesDir, "--node", selectedNodeId, "--json"]);
    sourceOutput = await readOptionalOutput("source", ["source", "--index", indexesDir, "--node", selectedNodeId, "--max-lines", "160", "--format", "numbered"]);
  } else {
    warnings.push("No my-dev-kit node id was available after search.");
  }

  // Prefer the most specific output; fall back to the raw search output.
  const contextText = [sourceOutput, sliceOutput, lookupOutput, searchCommand.stdout].find((text) => text && text.trim().length > 0) ?? "";
  const filesRead = selectedFile ? [selectedFile] : [];
  return {
    caseId: evaluationCase.id,
    skipped: contextText.length === 0,
    warnings,
    totalChars: countTextChars(contextText),
    totalEstimatedTokens: countEstimatedTokens(contextText),
    tokenCountMethod,
    contextText,
    filesRead,
    commands,
    selectedNodeId,
    selectedFile,
    selectedSymbol,
    durationMs: Date.now() - started
  };
}
@@ -0,0 +1,31 @@
1
+ import { readFileSync, statSync } from "node:fs";
2
+ import { collectFilesForGlobs } from "../core/fileGlobs.js";
3
+ import { countEstimatedTokens, countTextChars, tokenCountMethod } from "../core/countTokens.js";
4
/**
 * Builds the "raw full file" baseline context for an evaluation case by
 * concatenating every file matched by the case's include globs, each preceded
 * by a `=== FILE: <relativePath> ===` header.
 *
 * @param {object} evaluationCase - Reads `id`, `targetRoot`, `absoluteTargetRoot` and `rawIncludeGlobs`.
 * @returns {Promise<object>} Baseline result with the included files, the
 *   concatenated context text, its size metrics and the elapsed time.
 * @throws {Error} When the target root cannot be stat'ed (the underlying error
 *   is attached as `cause`) or is not a directory.
 */
export async function runRawFullFileBaseline(evaluationCase) {
  const started = Date.now();
  let stats;
  try {
    stats = statSync(evaluationCase.absoluteTargetRoot);
  } catch (error) {
    // Keep the established message but preserve the real reason (ENOENT,
    // EACCES, invalid path, ...) so the failure stays diagnosable.
    throw new Error(`Target root does not exist: ${evaluationCase.targetRoot}`, { cause: error });
  }
  if (!stats.isDirectory()) {
    throw new Error(`Target root is not a directory: ${evaluationCase.targetRoot}`);
  }
  const files = collectFilesForGlobs(evaluationCase.absoluteTargetRoot, evaluationCase.rawIncludeGlobs);
  const contextText = files
    .map(({ absolutePath, relativePath }) => `=== FILE: ${relativePath} ===\n${readFileSync(absolutePath, "utf8")}\n`)
    .join("\n");
  return {
    caseId: evaluationCase.id,
    targetRoot: evaluationCase.absoluteTargetRoot,
    filesIncluded: files.map((file) => file.relativePath),
    totalFiles: files.length,
    totalChars: countTextChars(contextText),
    totalEstimatedTokens: countEstimatedTokens(contextText),
    tokenCountMethod,
    contextText,
    durationMs: Date.now() - started
  };
}
@@ -0,0 +1,127 @@
1
/** Human-readable description of the scoring formula, echoed in every score. */
export const CORRECTNESS_FORMULA = "correctnessScore = 0.25 * fileMatchScore + 0.25 * symbolMatchScore + 0.50 * factMatchScore; empty file or symbol categories are neutral at 1.0.";
/**
 * Scores a parsed agent answer against an answer key.
 *
 * @param {object} args - Reads `caseId`, `status`, `answerKey`
 *   (`expectedFiles`, `expectedSymbols`, `expectedFacts`, `minimumCorrectFacts`)
 *   and `parsedAnswer` (`parseStatus`, `relevantFiles`, `relevantSymbols`,
 *   `expectedFactsFound`, `answerText`).
 * @returns {object} Per-category scores, found/total counters, the combined
 *   `correctnessScore`, the `passed` verdict and de-duplicated `failureReasons`.
 */
export function scoreCorrectness(args) {
    const { answerKey, parsedAnswer, status } = args;
    const { expectedFiles, expectedSymbols, expectedFacts, minimumCorrectFacts } = answerKey;
    const passingScore = 0.7;
    const failureReasons = [];
    // "invalid-output" is not reported from the status; a failed parse is
    // reported separately just below.
    const statusIsAcceptable = !status || status === "completed" || status === "invalid-output";
    if (!statusIsAcceptable) {
        failureReasons.push(statusFailureReason(status));
    }
    if (parsedAnswer.parseStatus === "failed") {
        failureReasons.push("invalid output");
    }
    const expectedFilesFound = countMatches(expectedFiles, parsedAnswer.relevantFiles);
    const expectedSymbolsFound = countMatches(expectedSymbols, parsedAnswer.relevantSymbols);
    const matchedFacts = matchFacts(answerKey, parsedAnswer.expectedFactsFound, parsedAnswer.answerText);
    const matchedFactIds = matchedFacts.map((fact) => fact.id);
    const foundFactCount = matchedFacts.length;
    const requiredFacts = expectedFacts.filter((fact) => fact.required);
    const requiredFactsTotal = requiredFacts.length;
    const requiredFactsFound = matchedFacts.filter((fact) => fact.required).length;
    const optionalFactsTotal = expectedFacts.length - requiredFactsTotal;
    const optionalFactsFound = foundFactCount - requiredFactsFound;
    const fileMatchScore = categoryScore(expectedFilesFound, expectedFiles.length);
    const symbolMatchScore = categoryScore(expectedSymbolsFound, expectedSymbols.length);
    const factMatchScore = weightedFactScore(answerKey, matchedFactIds);
    const correctnessScore = round(0.25 * fileMatchScore + 0.25 * symbolMatchScore + 0.5 * factMatchScore);
    for (const fact of requiredFacts) {
        const wasMatched = matchedFacts.some((match) => match.id === fact.id);
        if (!wasMatched) {
            failureReasons.push(`missing required fact: ${fact.id}`);
        }
    }
    if (foundFactCount < minimumCorrectFacts) {
        failureReasons.push(`too few facts found: ${foundFactCount}/${minimumCorrectFacts}`);
    }
    if (expectedFiles.length > 0 && expectedFilesFound < expectedFiles.length) {
        failureReasons.push("missing expected file");
    }
    if (expectedSymbols.length > 0 && expectedSymbolsFound < expectedSymbols.length) {
        failureReasons.push("missing expected symbol");
    }
    if (correctnessScore < passingScore) {
        failureReasons.push("score below threshold");
    }
    // The explicit comparisons are kept alongside the empty-reasons check: they
    // still decide the verdict when a comparison above is false without having
    // pushed a reason (e.g. a non-numeric minimumCorrectFacts).
    const hasNoFailures = failureReasons.length === 0;
    const hasAllRequiredFacts = requiredFactsFound === requiredFactsTotal;
    const hasEnoughFacts = foundFactCount >= minimumCorrectFacts;
    const meetsThreshold = correctnessScore >= passingScore;
    const passed = hasNoFailures && hasAllRequiredFacts && hasEnoughFacts && meetsThreshold;
    return {
        caseId: args.caseId,
        fileMatchScore,
        symbolMatchScore,
        factMatchScore,
        correctnessScore,
        requiredFactsFound,
        requiredFactsTotal,
        optionalFactsFound,
        optionalFactsTotal,
        expectedFilesFound,
        expectedFilesTotal: expectedFiles.length,
        expectedSymbolsFound,
        expectedSymbolsTotal: expectedSymbols.length,
        passed,
        failureReasons: unique(failureReasons),
        formula: CORRECTNESS_FORMULA
    };
}
62
/**
 * Maps a run status to the failure-reason text used in correctness scores.
 *
 * @param {string} status - Run status identifier.
 * @returns {string} The matching reason text, or `status` itself when the
 *   status is not one of the known values.
 */
function statusFailureReason(status) {
    switch (status) {
        case "agent-limit-reached":
            return "agent limit reached";
        case "agent-unavailable":
            return "agent unavailable";
        case "timeout":
            return "timeout";
        case "failed":
            return "agent run failed";
        case "skipped":
            return "agent run skipped";
        case "invalid-output":
            return "invalid output";
        default:
            return status;
    }
}
77
/**
 * Counts how many expected items are matched by at least one reported item.
 *
 * Two strings match when, after `normalize`, one contains the other, or when,
 * after `normalizePath`, they are equal or one ends with `/` + the other.
 * Values that normalize to an empty string never match: every string contains
 * the empty string, so a blank or punctuation-only entry would otherwise count
 * as a hit for every expected item.
 *
 * @param {string[]} expected - Items from the answer key.
 * @param {string[]} actual - Items reported in the parsed answer.
 * @returns {number} Number of entries in `expected` that have a match.
 */
function countMatches(expected, actual) {
    // Normalize each candidate once rather than once per expected item.
    const candidates = actual.map((candidate) => ({
        asText: normalize(candidate),
        asPath: normalizePath(candidate)
    }));
    return expected.filter((item) => {
        const expectedText = normalize(item);
        const expectedPath = normalizePath(item);
        return candidates.some(({ asText, asPath }) => {
            const textMatches = asText.length > 0 &&
                expectedText.length > 0 &&
                (asText.includes(expectedText) || expectedText.includes(asText));
            if (textMatches) {
                return true;
            }
            return (asPath.length > 0 &&
                expectedPath.length > 0 &&
                (asPath === expectedPath ||
                    asPath.endsWith(`/${expectedPath}`) ||
                    expectedPath.endsWith(`/${asPath}`)));
        });
    }).length;
}
93
/**
 * Selects the expected facts that the answer covers.
 *
 * A fact is covered when its normalized id or text is listed in
 * `expectedFactsFound`, or occurs as a substring of the normalized answer text.
 * An id or text that normalizes to an empty string is ignored, because the
 * empty string is a substring of every answer and would mark the fact as found
 * unconditionally.
 *
 * @param {object} answerKey - Answer key; reads `expectedFacts` (`id`, `text`).
 * @param {string[]} expectedFactsFound - Fact ids or texts the answer claims.
 * @param {string} answerText - Free-form answer text.
 * @returns {object[]} The entries of `answerKey.expectedFacts` that matched,
 *   in their original order.
 */
function matchFacts(answerKey, expectedFactsFound, answerText) {
    const reported = new Set(expectedFactsFound.map(normalize));
    const normalizedAnswer = normalize(answerText);
    const isCovered = (needle) => needle.length > 0 && (reported.has(needle) || normalizedAnswer.includes(needle));
    return answerKey.expectedFacts.filter((fact) => {
        const factId = normalize(fact.id);
        const factText = normalize(fact.text);
        return isCovered(factId) || isCovered(factText);
    });
}
102
/**
 * Computes the weight-proportional share of expected facts that were found.
 *
 * @param {object} answerKey - Answer key; reads `expectedFacts` (`id`, `weight`).
 * @param {Iterable<string>} factIds - Ids of the facts that were matched.
 * @returns {number} Found weight divided by total weight, rounded via `round`;
 *   `1` when there are no expected facts or their weights sum to zero.
 */
function weightedFactScore(answerKey, factIds) {
    const { expectedFacts } = answerKey;
    if (expectedFacts.length === 0) {
        return 1;
    }
    const matchedIds = new Set(factIds);
    let totalWeight = 0;
    let matchedWeight = 0;
    for (const fact of expectedFacts) {
        totalWeight += fact.weight;
        if (matchedIds.has(fact.id)) {
            matchedWeight += fact.weight;
        }
    }
    if (totalWeight === 0) {
        return 1;
    }
    return round(matchedWeight / totalWeight);
}
113
/**
 * Scores one match category as the rounded fraction `found / total`.
 *
 * @param {number} found - Number of expected items that were matched.
 * @param {number} total - Number of expected items.
 * @returns {number} `1` when `total` is zero (an empty category is neutral),
 *   otherwise the fraction rounded via `round`.
 */
function categoryScore(found, total) {
    if (total === 0) {
        return 1;
    }
    const fraction = found / total;
    return round(fraction);
}
116
/**
 * Canonicalizes text for fuzzy comparison: lower-cases it and reduces it to
 * its runs of `a-z` / `0-9` characters joined by single spaces.
 *
 * @param {string} value - Text to canonicalize.
 * @returns {string} The canonical form; empty when `value` contains no
 *   ASCII letters or digits.
 */
function normalize(value) {
    const lowered = value.toLowerCase();
    // Splitting on every run of other characters and dropping the empty edge
    // pieces yields the same text as replacing each run with one space and
    // trimming the result.
    const words = lowered.split(/[^a-z0-9]+/).filter((word) => word.length > 0);
    return words.join(" ");
}
119
/**
 * Canonicalizes a file path for comparison: trims surrounding whitespace,
 * lower-cases, converts backslashes to forward slashes and drops one leading
 * `./` or `/`.
 *
 * Whitespace is trimmed before the prefix is stripped; otherwise a value such
 * as `" ./src/a.ts"` would keep its `./` because the prefix pattern is anchored
 * at the very start of the string.
 *
 * @param {string} value - Path as written in an answer key or an answer.
 * @returns {string} The canonical path.
 */
function normalizePath(value) {
    const unified = value.trim().toLowerCase().replace(/\\/g, "/");
    // The final trim keeps the result free of edge whitespace even when the
    // stripped prefix was followed by a space.
    return unified.replace(/^\.?\//, "").trim();
}
122
/**
 * Rounds a number to four decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` rounded to the nearest 1/10000.
 */
function round(value) {
    const scaled = Math.round(value * 10000);
    return scaled / 10000;
}
125
/**
 * Removes duplicate entries while keeping first-occurrence order.
 *
 * @param {Iterable<*>} values - Values to de-duplicate.
 * @returns {Array<*>} A new array holding each distinct value once.
 */
function unique(values) {
    const distinct = new Set(values);
    return Array.from(distinct);
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,104 @@
1
+ import { mkdir, writeFile } from "node:fs/promises";
2
+ import path from "node:path";
3
/**
 * Writes all experiment artifacts below `args.outDir`.
 *
 * For every run a `runs/<runId>/` directory receives `prompt.txt`,
 * `agent-run-result.json`, `parsed-answer.json` and `correctness-score.json`,
 * and the run object is given an `artifactPaths` property pointing at them.
 * Afterwards the summary, runs, comparisons and config JSON files are written
 * to the output directory itself.
 *
 * @param {object} args - Reads `outDir`, `runs`, `summary`, `comparisons` and
 *   `config`. Each entry of `runs` is mutated (gains `artifactPaths`).
 * @returns {Promise<object>} `summary`, `runs`, `comparisons`, the top-level
 *   `artifactPaths` and `warnings` (taken from `summary.warnings`).
 */
export async function writeExperimentArtifacts(args) {
    // Every JSON artifact is pretty-printed and newline-terminated.
    const toJsonText = (value) => `${JSON.stringify(value, null, 2)}\n`;
    const outDir = path.resolve(args.outDir);
    const runsDir = path.join(outDir, "runs");
    await mkdir(runsDir, { recursive: true });
    for (const run of args.runs) {
        const runDir = path.join(runsDir, run.runId);
        await mkdir(runDir, { recursive: true });
        const runArtifactPaths = {
            promptPath: path.join(runDir, "prompt.txt"),
            agentRunResultPath: path.join(runDir, "agent-run-result.json"),
            parsedAnswerPath: path.join(runDir, "parsed-answer.json"),
            correctnessScorePath: path.join(runDir, "correctness-score.json")
        };
        await writeFile(runArtifactPaths.promptPath, getPromptTextFromRun(run), "utf8");
        await writeFile(runArtifactPaths.agentRunResultPath, toJsonText(run.agentRunResult), "utf8");
        await writeFile(runArtifactPaths.parsedAnswerPath, toJsonText(run.parsedAnswer), "utf8");
        await writeFile(runArtifactPaths.correctnessScorePath, toJsonText(run.correctness), "utf8");
        // Attached only after all four files were written, so the paths are
        // part of the run when experiment-runs.json is serialized below.
        run.artifactPaths = runArtifactPaths;
    }
    const artifactPaths = {
        summaryPath: path.join(outDir, "experiment-summary.json"),
        runsPath: path.join(outDir, "experiment-runs.json"),
        comparisonsPath: path.join(outDir, "experiment-comparisons.json"),
        configPath: path.join(outDir, "experiment-config.json"),
        runsDir
    };
    await writeFile(artifactPaths.summaryPath, toJsonText(args.summary), "utf8");
    await writeFile(artifactPaths.runsPath, toJsonText({ generatedAt: args.summary.generatedAt, runs: args.runs }), "utf8");
    await writeFile(artifactPaths.comparisonsPath, toJsonText({ generatedAt: args.summary.generatedAt, comparisons: args.comparisons }), "utf8");
    await writeFile(artifactPaths.configPath, toJsonText(sanitizeConfig(args.config)), "utf8");
    return {
        summary: args.summary,
        runs: args.runs,
        comparisons: args.comparisons,
        artifactPaths,
        warnings: args.summary.warnings
    };
}
44
/**
 * Aggregates runs and raw-vs-my-dev-kit comparisons into an experiment summary.
 *
 * @param {object} args - Reads `runs`, `comparisons`, `config`
 *   (`casesPath`, `projectProfilesPath`) and the optional `generatedAt`.
 * @returns {object} Distinct sorted agents/strategies/complexity levels,
 *   per-status run counts, averages over the finite comparison metrics, three
 *   yes/no/`null` answers and the concatenated warnings. `generatedAt`
 *   defaults to the current time as an ISO string.
 */
export function buildExperimentSummary(args) {
    const { runs, comparisons, config } = args;
    const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);
    // Values of one comparison metric, keeping only finite numbers.
    const finiteMetric = (metricName) => comparisons
        .map((comparison) => comparison[metricName])
        .filter((value) => isFiniteNumber(value));
    // Distinct values of one run field in default sort order.
    const distinctSorted = (fieldName) => [...new Set(runs.map((run) => run[fieldName]))].sort();
    const tokenSavings = finiteMetric("tokenSavingsPercent");
    const durationReductions = finiteMetric("durationReductionPercent");
    const correctnessDeltas = finiteMetric("correctnessDelta");
    const completedComparisons = comparisons.filter((comparison) => {
        return comparison.rawStatus === "completed" && comparison.myDevKitStatus === "completed";
    });
    const averageTokenSavingsPercent = averageOrNull(tokenSavings);
    const averageDurationReductionPercent = averageOrNull(durationReductions);
    const runWarnings = runs.flatMap((run) => run.warnings);
    const comparisonWarnings = comparisons.flatMap((comparison) => comparison.warnings);
    return {
        generatedAt: args.generatedAt ?? new Date().toISOString(),
        casesPath: config.casesPath,
        projectProfilesPath: config.projectProfilesPath,
        agents: distinctSorted("agentId"),
        strategies: distinctSorted("promptStrategy"),
        complexityLevels: distinctSorted("promptComplexityLevel"),
        totalRuns: runs.length,
        completedRuns: countStatus(runs, "completed"),
        failedRuns: countStatus(runs, "failed"),
        skippedRuns: countStatus(runs, "skipped"),
        unavailableRuns: countStatus(runs, "agent-unavailable"),
        limitReachedRuns: countStatus(runs, "agent-limit-reached"),
        timeoutRuns: countStatus(runs, "timeout"),
        invalidOutputRuns: countStatus(runs, "invalid-output"),
        totalComparisons: comparisons.length,
        averageTokenSavingsPercent,
        averageDurationReductionPercent,
        averageCorrectnessDelta: averageOrNull(correctnessDeltas),
        // Each answer is null when there is no data to base it on.
        answerDoesMyDevKitSaveTokens: tokenSavings.length === 0 ? null : averageTokenSavingsPercent > 0,
        answerDoesMyDevKitPreserveCorrectness: completedComparisons.length === 0
            ? null
            : completedComparisons.every((comparison) => comparison.sameCorrectnessPass),
        answerDoesMyDevKitReduceExecutionTime: durationReductions.length === 0 ? null : averageDurationReductionPercent > 0,
        warnings: [...runWarnings, ...comparisonWarnings]
    };
}
85
/**
 * Returns the prompt text stored on a run for the `prompt.txt` artifact.
 *
 * @param {object} run - Run record; reads `promptTextForArtifact`.
 * @returns {string} The stored text, or an empty string when it is `null`
 *   or `undefined`.
 */
function getPromptTextFromRun(run) {
    const { promptTextForArtifact } = run;
    return promptTextForArtifact ?? "";
}
88
/**
 * Produces the copy of the experiment config that is written to disk.
 *
 * The config is shallow-copied and every truthy entry of `commandTemplates`
 * is shallow-copied as well (keeping the same `args` reference); falsy
 * entries are passed through unchanged.
 * NOTE(review): no field is removed or masked here despite the name — confirm
 * whether redaction was intended.
 *
 * @param {object} config - Experiment configuration.
 * @returns {object} The copy; `commandTemplates` is `undefined` when the input
 *   has no truthy `commandTemplates`.
 */
function sanitizeConfig(config) {
    const { commandTemplates } = config;
    const copyEntry = ([templateKey, template]) => {
        if (!template) {
            return [templateKey, template];
        }
        return [templateKey, { ...template, args: template.args }];
    };
    let copiedTemplates;
    if (commandTemplates) {
        copiedTemplates = Object.fromEntries(Object.entries(commandTemplates).map((entry) => copyEntry(entry)));
    }
    return {
        ...config,
        commandTemplates: copiedTemplates
    };
}
96
/**
 * Counts the runs whose `status` is strictly equal to the given status.
 *
 * @param {object[]} runs - Run records; reads `status`.
 * @param {string} status - Status to count.
 * @returns {number} Number of matching runs.
 */
function countStatus(runs, status) {
    let matching = 0;
    for (const run of runs) {
        if (run.status === status) {
            matching += 1;
        }
    }
    return matching;
}
99
/**
 * Computes the arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number|null} The mean, or `null` for an empty list.
 */
function averageOrNull(values) {
    const count = values.length;
    if (count === 0) {
        return null;
    }
    // Summed left to right starting from 0.
    let total = 0;
    for (const value of values) {
        total += value;
    }
    return total / count;
}
@@ -0,0 +1,57 @@
1
+ import { mkdir, writeFile } from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { normalizeLabReport } from "../report/types.js";
4
+ import { renderHtmlReport } from "../report/renderHtmlReport.js";
5
+ import { renderTokenSavingsReportInput } from "./renderTokenSavingsReportInput.js";
6
/**
 * Writes the token-savings summary JSON, runs JSON and HTML report into
 * `options.outDir`.
 *
 * Screenshot warnings and failures are appended to a copy of the summary
 * warnings; that list is used for the rendered report, the summary JSON and the
 * return value. The PNG path is recorded in `artifactPaths`, but this function
 * does not write the PNG.
 *
 * @param {object} options - Reads `outDir`, `summary`, `runs`,
 *   `comparisonCases`, `commandConfig`, `screenshot` (`warning`, `status`,
 *   `error`) and `generatedAt`.
 * @returns {Promise<object>} `summary`, `runs`, the normalized `report`,
 *   `screenshot`, `artifactPaths` and the combined `warnings`.
 */
export async function writeTokenSavingsArtifacts(options) {
    const outDir = path.resolve(options.outDir);
    await mkdir(outDir, { recursive: true });
    const inOutDir = (fileName) => path.join(outDir, fileName);
    const artifactPaths = {
        summaryPath: inOutDir("token-savings-summary.json"),
        runsPath: inOutDir("token-savings-runs.json"),
        htmlPath: inOutDir("token-savings-report.html"),
        pngPath: inOutDir("token-savings-report.png")
    };
    // Copy so the caller's summary.warnings array is not mutated.
    const warnings = [...options.summary.warnings];
    const { screenshot } = options;
    if (screenshot.warning) {
        warnings.push(screenshot.warning);
    }
    const screenshotFailedWithError = screenshot.status === "failed" && screenshot.error;
    if (screenshotFailedWithError) {
        warnings.push(`PNG screenshot capture failed: ${screenshot.error}`);
    }
    const { summaryPath, runsPath, htmlPath } = artifactPaths;
    const report = renderTokenSavingsReportInput({
        summary: options.summary,
        cases: options.comparisonCases,
        commandConfig: options.commandConfig,
        // The report input is given the three text artifacts only.
        artifactPaths: { summaryPath, runsPath, htmlPath },
        warnings
    });
    const normalizedReport = normalizeLabReport(report, options.generatedAt);
    const { tokenCountMethod } = options.summary;
    const summaryDocument = {
        summary: options.summary,
        tokenCountMethod,
        generatedAt: normalizedReport.generatedAt,
        commandConfiguration: options.commandConfig,
        warnings,
        screenshot: options.screenshot,
        artifactPaths
    };
    await writeFile(summaryPath, JSON.stringify(summaryDocument, null, 2), "utf8");
    const runsDocument = {
        generatedAt: normalizedReport.generatedAt,
        tokenCountMethod,
        runs: options.runs
    };
    await writeFile(runsPath, JSON.stringify(runsDocument, null, 2), "utf8");
    await writeFile(htmlPath, renderHtmlReport(normalizedReport), "utf8");
    return {
        summary: options.summary,
        runs: options.runs,
        report: normalizedReport,
        screenshot: options.screenshot,
        artifactPaths,
        warnings
    };
}