@dailephd/my-dev-kit-lab 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -0
- package/benchmarks/contracts/benchmark-project-profiles.json +1199 -0
- package/benchmarks/contracts/todo-behavior.md +70 -0
- package/benchmarks/contracts/todo-benchmark-case.json +227 -0
- package/benchmarks/projects/README.md +34 -0
- package/benchmarks/projects/task-analytics-large-mixed/README.md +1 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/__init__.py +3 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/fixtures.py +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/metrics.py +29 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/models.py +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/parser.py +16 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/pipeline.py +9 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/quality.py +8 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/reporting.py +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_metrics.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_parser.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_quality.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_reporting.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/package.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/index.ts +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/analyticsSnapshot.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/project.ts +5 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/task.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/buildProjectLeaderboard.ts +7 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/formatTaskHealthReport.ts +13 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/buildAnalyticsSnapshot.ts +39 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/completeTask.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/createTask.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/listTasksByProject.ts +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/projectStore.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/taskStore.ts +44 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/projectValidation.ts +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/taskValidation.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/buildAnalyticsSnapshot.test.ts +48 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/completeTask.test.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/createTask.test.ts +31 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/listTasksByProject.test.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/reporting.test.ts +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/vitest.config.ts +5 -0
- package/benchmarks/projects/task-workflow-medium-ts/README.md +1 -0
- package/benchmarks/projects/task-workflow-medium-ts/package.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/index.ts +9 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/project.ts +6 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/task.ts +39 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/completeTask.ts +15 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/createTask.ts +26 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/filterTasks.ts +17 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/importTasks.ts +33 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/summarizeTasks.ts +30 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/store/taskStore.ts +76 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/utils/deterministicId.ts +3 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/validation/taskValidation.ts +45 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/completeTask.test.ts +16 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/createTask.test.ts +21 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/filterTasks.test.ts +18 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/importTasks.test.ts +22 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/summarizeTasks.test.ts +29 -0
- package/benchmarks/projects/task-workflow-medium-ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-js/README.md +3 -0
- package/benchmarks/projects/todo-js/package.json +11 -0
- package/benchmarks/projects/todo-js/src/index.js +2 -0
- package/benchmarks/projects/todo-js/src/taskService.js +37 -0
- package/benchmarks/projects/todo-js/src/taskStore.js +28 -0
- package/benchmarks/projects/todo-js/tests/taskService.test.js +45 -0
- package/benchmarks/projects/todo-js/vitest.config.js +5 -0
- package/benchmarks/projects/todo-mixed-ts-py/README.md +3 -0
- package/benchmarks/projects/todo-mixed-ts-py/package.json +13 -0
- package/benchmarks/projects/todo-mixed-ts-py/python/task_service.py +76 -0
- package/benchmarks/projects/todo-mixed-ts-py/src/taskCli.ts +38 -0
- package/benchmarks/projects/todo-mixed-ts-py/tests/mixedBoundary.test.ts +18 -0
- package/benchmarks/projects/todo-mixed-ts-py/tsconfig.json +12 -0
- package/benchmarks/projects/todo-mixed-ts-py/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-python/README.md +3 -0
- package/benchmarks/projects/todo-python/src/__init__.py +4 -0
- package/benchmarks/projects/todo-python/src/task_service.py +32 -0
- package/benchmarks/projects/todo-python/src/task_store.py +28 -0
- package/benchmarks/projects/todo-python/tests/test_task_service.py +52 -0
- package/benchmarks/projects/todo-ts/README.md +3 -0
- package/benchmarks/projects/todo-ts/package.json +12 -0
- package/benchmarks/projects/todo-ts/src/index.ts +2 -0
- package/benchmarks/projects/todo-ts/src/taskService.ts +41 -0
- package/benchmarks/projects/todo-ts/src/taskStore.ts +34 -0
- package/benchmarks/projects/todo-ts/tests/taskService.test.ts +45 -0
- package/benchmarks/projects/todo-ts/tsconfig.json +12 -0
- package/benchmarks/projects/todo-ts/vitest.config.ts +5 -0
- package/dist/scripts/build-gallery.js +3 -0
- package/dist/scripts/capture-demo-report.js +3 -0
- package/dist/scripts/evaluate-token-savings.js +2 -0
- package/dist/scripts/experiments/describeExperiment.js +143 -0
- package/dist/scripts/experiments/listExperiments.js +44 -0
- package/dist/scripts/experiments/runExperiment.js +199 -0
- package/dist/scripts/generate-experiment-plots.js +3 -0
- package/dist/scripts/generate-prompt-variants.js +2 -0
- package/dist/scripts/render-experiment-report.js +2 -0
- package/dist/scripts/run-agent-prompt.js +2 -0
- package/dist/scripts/run-controlled-experiment.js +2 -0
- package/dist/scripts/run-final-demo.js +3 -0
- package/dist/scripts/run-lab-demo.js +5 -0
- package/dist/scripts/run-visualization-demos.js +3 -0
- package/dist/scripts/security/runCodeql.js +57 -0
- package/dist/scripts/security/runDependencyChecks.js +57 -0
- package/dist/scripts/security/runFuzzSmoke.js +29 -0
- package/dist/scripts/security/runPackageChecks.js +56 -0
- package/dist/scripts/security/runSemgrep.js +63 -0
- package/dist/scripts/security/validate.js +117 -0
- package/dist/scripts/verify-benchmarks.js +202 -0
- package/dist/src/agents/adapters/claudeAdapter.js +37 -0
- package/dist/src/agents/adapters/codexAdapter.js +110 -0
- package/dist/src/agents/adapters/fakeAgentAdapter.js +101 -0
- package/dist/src/agents/agentRegistry.js +21 -0
- package/dist/src/agents/index.js +7 -0
- package/dist/src/agents/parseAgentTokenUsage.js +137 -0
- package/dist/src/agents/runAgentPrompt.js +38 -0
- package/dist/src/agents/types.js +1 -0
- package/dist/src/commands/buildGalleryCommand.js +56 -0
- package/dist/src/commands/captureDemoReport.js +116 -0
- package/dist/src/commands/evaluateTokenSavings.js +175 -0
- package/dist/src/commands/generateExperimentPlotsCommand.js +38 -0
- package/dist/src/commands/generatePromptVariants.js +67 -0
- package/dist/src/commands/renderExperimentReportCommand.js +131 -0
- package/dist/src/commands/runAgentPromptCommand.js +132 -0
- package/dist/src/commands/runControlledExperimentCommand.js +174 -0
- package/dist/src/commands/runFinalDemoCommand.js +123 -0
- package/dist/src/commands/runLabDemo.js +62 -0
- package/dist/src/commands/runVisualizationDemosCommand.js +67 -0
- package/dist/src/core/commandLine.js +59 -0
- package/dist/src/core/countTokens.js +8 -0
- package/dist/src/core/fileGlobs.js +100 -0
- package/dist/src/core/localProjectTarget.js +75 -0
- package/dist/src/core/pathSafety.js +19 -0
- package/dist/src/core/pythonCommand.js +30 -0
- package/dist/src/core/resolveCommand.js +110 -0
- package/dist/src/core/runMeasuredCommand.js +143 -0
- package/dist/src/evaluation/benchmarkMetadata.js +207 -0
- package/dist/src/evaluation/buildExperimentMatrix.js +75 -0
- package/dist/src/evaluation/classifyAgentRunOutcome.js +40 -0
- package/dist/src/evaluation/compareExperimentRuns.js +79 -0
- package/dist/src/evaluation/compareTokenSavings.js +47 -0
- package/dist/src/evaluation/controlledExperimentTypes.js +1 -0
- package/dist/src/evaluation/index.js +18 -0
- package/dist/src/evaluation/parseAgentAnswer.js +230 -0
- package/dist/src/evaluation/projectComplexity.js +126 -0
- package/dist/src/evaluation/projectFileTree.js +83 -0
- package/dist/src/evaluation/readEvaluationCases.js +59 -0
- package/dist/src/evaluation/renderTokenSavingsReportInput.js +55 -0
- package/dist/src/evaluation/runControlledExperiment.js +158 -0
- package/dist/src/evaluation/runMyDevKitRetrieval.js +197 -0
- package/dist/src/evaluation/runRawFullFileBaseline.js +31 -0
- package/dist/src/evaluation/scoreCorrectness.js +127 -0
- package/dist/src/evaluation/types.js +1 -0
- package/dist/src/evaluation/writeExperimentArtifacts.js +104 -0
- package/dist/src/evaluation/writeTokenSavingsArtifacts.js +57 -0
- package/dist/src/experiments/config.js +24 -0
- package/dist/src/experiments/defaultRegistry.js +7 -0
- package/dist/src/experiments/errors.js +18 -0
- package/dist/src/experiments/index.js +9 -0
- package/dist/src/experiments/outputPaths.js +25 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/config.js +37 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/index.js +3 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/plugin.js +83 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/resultMapping.js +260 -0
- package/dist/src/experiments/plugins/index.js +1 -0
- package/dist/src/experiments/registry.js +43 -0
- package/dist/src/experiments/results.js +48 -0
- package/dist/src/experiments/runner.js +181 -0
- package/dist/src/experiments/target.js +8 -0
- package/dist/src/experiments/types.js +1 -0
- package/dist/src/gallery/index.js +2 -0
- package/dist/src/gallery/types.js +1 -0
- package/dist/src/gallery/writeGalleryManifest.js +214 -0
- package/dist/src/index.js +12 -0
- package/dist/src/plots/buildExperimentPlotData.js +137 -0
- package/dist/src/plots/index.js +4 -0
- package/dist/src/plots/renderSvgChart.js +82 -0
- package/dist/src/plots/types.js +1 -0
- package/dist/src/plots/writePlotArtifacts.js +46 -0
- package/dist/src/prompts/buildPromptContext.js +68 -0
- package/dist/src/prompts/generateMyDevKitPrompt.js +106 -0
- package/dist/src/prompts/generatePromptVariants.js +36 -0
- package/dist/src/prompts/generateRawFullFilePrompt.js +97 -0
- package/dist/src/prompts/index.js +7 -0
- package/dist/src/prompts/measurePromptComplexity.js +41 -0
- package/dist/src/prompts/types.js +1 -0
- package/dist/src/prompts/writePromptArtifacts.js +43 -0
- package/dist/src/report/buildExperimentReportInput.js +339 -0
- package/dist/src/report/experimentReportTypes.js +1 -0
- package/dist/src/report/experiments/buildPluginExperimentReport.js +153 -0
- package/dist/src/report/experiments/experimentReportModel.js +1 -0
- package/dist/src/report/experiments/index.js +4 -0
- package/dist/src/report/experiments/renderPluginExperimentReportHtml.js +133 -0
- package/dist/src/report/experiments/writePluginExperimentReports.js +30 -0
- package/dist/src/report/index.js +8 -0
- package/dist/src/report/renderExperimentHtmlReport.js +354 -0
- package/dist/src/report/renderHtmlReport.js +103 -0
- package/dist/src/report/types.js +10 -0
- package/dist/src/report/writeExperimentReportArtifacts.js +38 -0
- package/dist/src/report/writeReportArtifacts.js +39 -0
- package/dist/src/screenshot/captureReportScreenshot.js +75 -0
- package/dist/src/screenshot/index.js +2 -0
- package/dist/src/screenshot/types.js +1 -0
- package/dist/src/securityValidation/artifacts.js +15 -0
- package/dist/src/securityValidation/cliAdversarial/adversarialCliConfig.js +38 -0
- package/dist/src/securityValidation/cliAdversarial/dataVolumeChecks.js +194 -0
- package/dist/src/securityValidation/cliAdversarial/jsonStdoutChecks.js +359 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactChecks.js +284 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactFixtures.js +79 -0
- package/dist/src/securityValidation/cliAdversarial/pathBoundaryChecks.js +431 -0
- package/dist/src/securityValidation/cliAdversarial/pathCases.js +144 -0
- package/dist/src/securityValidation/cliAdversarial/readOnlyBoundaryChecks.js +294 -0
- package/dist/src/securityValidation/cliAdversarial/runAdversarialCheck.js +149 -0
- package/dist/src/securityValidation/cliAdversarial/subprocessSafetyChecks.js +214 -0
- package/dist/src/securityValidation/cliAdversarial/tempWorkspace.js +160 -0
- package/dist/src/securityValidation/commandRunner.js +136 -0
- package/dist/src/securityValidation/config.js +39 -0
- package/dist/src/securityValidation/dependencies/parseNpmAudit.js +115 -0
- package/dist/src/securityValidation/dependencies/parseNpmLs.js +71 -0
- package/dist/src/securityValidation/dependencies/parseNpmOutdated.js +41 -0
- package/dist/src/securityValidation/dependencies/runDependencyChecks.js +239 -0
- package/dist/src/securityValidation/dependencies/runOsvScanner.js +43 -0
- package/dist/src/securityValidation/fuzz/fuzzHarness.js +61 -0
- package/dist/src/securityValidation/fuzz/fuzzTargets.js +204 -0
- package/dist/src/securityValidation/fuzz/randomInput.js +0 -0
- package/dist/src/securityValidation/index.js +34 -0
- package/dist/src/securityValidation/packageChecks/forbiddenPackageContents.js +67 -0
- package/dist/src/securityValidation/packageChecks/parseNpmPackDryRun.js +56 -0
- package/dist/src/securityValidation/packageChecks/runPackageChecks.js +88 -0
- package/dist/src/securityValidation/report/renderSecurityReport.js +248 -0
- package/dist/src/securityValidation/report/securityReportTypes.js +1 -0
- package/dist/src/securityValidation/staticScans/codeql.js +66 -0
- package/dist/src/securityValidation/staticScans/semgrep.js +180 -0
- package/dist/src/securityValidation/testMatrix.js +535 -0
- package/dist/src/securityValidation/types.js +34 -0
- package/dist/src/securityValidation/validate/resolveTarget.js +32 -0
- package/dist/src/securityValidation/validate/runSecurityValidation.js +169 -0
- package/dist/src/securityValidation/validate/verdict.js +73 -0
- package/dist/src/visualizationDemos/buildMyDevKitVisualizationCommands.js +59 -0
- package/dist/src/visualizationDemos/index.js +4 -0
- package/dist/src/visualizationDemos/runVisualizationDemos.js +82 -0
- package/dist/src/visualizationDemos/types.js +1 -0
- package/dist/src/visualizationDemos/writeVisualizationDemoArtifacts.js +25 -0
- package/docs/METRICS.md +286 -0
- package/examples/demo-report-input.json +78 -0
- package/examples/lab-demo-cases.json +35 -0
- package/examples/real-agent-campaign-cases.json +118 -0
- package/examples/token-savings-cases.json +122 -0
- package/package.json +91 -0
- package/tests/fixtures/fake-adversarial-cli.js +152 -0
- package/tests/fixtures/fake-my-dev-kit-cli.js +83 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { runAgentPrompt } from "../agents/index.js";
|
|
3
|
+
import { generatePromptVariants } from "../prompts/index.js";
|
|
4
|
+
import { buildExperimentMatrix } from "./buildExperimentMatrix.js";
|
|
5
|
+
import { classifyAgentRunOutcome } from "./classifyAgentRunOutcome.js";
|
|
6
|
+
import { compareExperimentRuns } from "./compareExperimentRuns.js";
|
|
7
|
+
import { parseAgentAnswer } from "./parseAgentAnswer.js";
|
|
8
|
+
import { scoreCorrectness } from "./scoreCorrectness.js";
|
|
9
|
+
import { buildExperimentSummary, writeExperimentArtifacts } from "./writeExperimentArtifacts.js";
|
|
10
|
+
/**
 * Runs every cell of the experiment matrix in sequence and writes the experiment artifacts.
 *
 * Defaults are applied for `agents`, `strategies`, `complexityLevels`,
 * `continueOnFailure` and `includeRealAgents`; all other config keys pass through unchanged.
 * The loop stops early when a run's status is not "completed" and
 * `continueOnFailure` is exactly `false`.
 *
 * @param {object} args
 * @param {object} args.config - Experiment configuration; `outDir` is resolved against the repo root.
 * @param {Array<object>} args.cases - Evaluation cases, looked up by `id` for each matrix cell.
 * @param {*} args.projectProfiles - Forwarded to prompt-variant generation.
 * @param {string} [args.repoRoot] - Defaults to `process.cwd()`.
 * @param {object} [args.env] - Environment passed to each agent run; defaults to `process.env`.
 * @returns {Promise<*>} Resolves to whatever `writeExperimentArtifacts` produces.
 * @throws {Error} When a matrix cell names a case id that is absent from `args.cases`.
 */
export async function runControlledExperiment(args) {
    const repoRoot = args.repoRoot ?? process.cwd();
    const config = {
        ...args.config,
        agents: args.config.agents ?? ["fake-agent"],
        strategies: args.config.strategies ?? ["raw-full-file", "my-dev-kit-guided"],
        complexityLevels: args.config.complexityLevels ?? ["short"],
        continueOnFailure: args.config.continueOnFailure ?? true,
        includeRealAgents: args.config.includeRealAgents ?? false
    };
    const matrix = buildExperimentMatrix({ cases: args.cases, config });
    // Loop-invariant values: resolve them once instead of once per matrix cell.
    const outDir = path.resolve(repoRoot, config.outDir);
    const env = args.env ?? process.env;
    // Index cases by id once; the first occurrence wins, matching Array.prototype.find.
    const casesById = new Map();
    for (const candidate of args.cases) {
        if (!casesById.has(candidate.id)) {
            casesById.set(candidate.id, candidate);
        }
    }
    const runs = [];
    for (const cell of matrix) {
        const evaluationCase = casesById.get(cell.caseId);
        if (!evaluationCase) {
            throw new Error(`Evaluation case not found while running matrix: ${cell.caseId}`);
        }
        const promptVariant = buildPromptVariant({
            evaluationCase,
            projectProfiles: args.projectProfiles,
            strategy: cell.strategy,
            complexityLevel: cell.complexityLevel
        });
        // Only the codex and claude agents receive a configured command template.
        const usesCommandTemplate = cell.agentId === "codex" || cell.agentId === "claude";
        const run = await executeExperimentCell({
            runId: cell.runId,
            agentId: cell.agentId,
            promptVariant,
            repoRoot,
            runDir: path.join(outDir, "runs", cell.runId),
            timeoutMs: config.timeoutMs,
            requireAgents: config.requireAgents ?? false,
            commandTemplate: usesCommandTemplate ? config.commandTemplates?.[cell.agentId] : undefined,
            env
        });
        runs.push(run);
        if (run.status !== "completed" && config.continueOnFailure === false) {
            break;
        }
    }
    const comparisons = compareExperimentRuns(runs);
    const summary = buildExperimentSummary({ config, runs, comparisons });
    return writeExperimentArtifacts({
        outDir,
        config,
        runs,
        comparisons,
        summary
    });
}
|
|
60
|
+
/**
 * Executes a single matrix cell: runs the agent, parses its answer, classifies
 * the outcome, scores correctness, and assembles the run record.
 *
 * Any error thrown while invoking the agent is converted into a synthetic
 * "failed" agent result, so the cell always yields a record.
 *
 * @param {object} args - Cell inputs (`runId`, `agentId`, `promptVariant`, `repoRoot`,
 *   `runDir`, `timeoutMs`, `requireAgents`, `commandTemplate`, `env`).
 * @returns {Promise<object>} The run record for this cell.
 */
async function executeExperimentCell(args) {
    const { promptVariant } = args;
    let agentRunResult;
    try {
        const request = {
            runId: args.runId,
            agentId: args.agentId,
            promptVariant,
            promptText: promptVariant.promptText,
            cwd: args.repoRoot,
            outDir: args.runDir,
            timeoutMs: args.timeoutMs,
            requireAvailable: args.requireAgents,
            commandTemplate: args.commandTemplate,
            env: args.env
        };
        agentRunResult = await runAgentPrompt(request);
    }
    catch (failure) {
        // Keep the experiment going: record the failure as a run result.
        agentRunResult = buildSyntheticFailureResult(args, failure);
    }
    const answerKey = promptVariant.expectedAnswerKey;
    const parsedAnswer = parseAgentAnswer({
        text: agentRunResult.finalAnswerText,
        answerKey,
        tokenUsage: agentRunResult.tokenUsage
    });
    const classification = classifyAgentRunOutcome({ agentRunResult, parsedAnswer });
    const correctness = scoreCorrectness({
        caseId: promptVariant.caseId,
        answerKey,
        parsedAnswer,
        status: classification.status
    });
    const { projectProfile } = promptVariant;
    return {
        runId: args.runId,
        caseId: promptVariant.caseId,
        benchmarkProject: promptVariant.benchmarkProject,
        agentId: args.agentId,
        promptStrategy: promptVariant.strategy,
        promptComplexityLevel: promptVariant.complexityLevel,
        promptVariantId: promptVariant.id,
        promptTextForArtifact: promptVariant.promptText,
        projectComplexityLevel: projectProfile.complexityLevel,
        projectComplexityScore: projectProfile.complexityScore,
        promptMetrics: promptVariant.promptMetrics,
        agentRunResult,
        parsedAnswer,
        correctness,
        status: classification.status,
        statusReason: classification.statusReason,
        startedAt: agentRunResult.startedAt,
        endedAt: agentRunResult.endedAt,
        durationMs: agentRunResult.durationMs,
        tokenUsage: agentRunResult.tokenUsage,
        tokenUsageSource: agentRunResult.tokenUsageSource,
        tokenUsageReliability: agentRunResult.tokenUsageReliability,
        warnings: classification.warnings,
        errors: classification.errors,
        artifactPaths: {}
    };
}
|
|
119
|
+
/**
 * Generates the single prompt variant for one case / strategy / complexity-level combination.
 *
 * @param {object} args
 * @param {object} args.evaluationCase - The case to build a prompt for.
 * @param {*} args.projectProfiles - Forwarded to `generatePromptVariants`.
 * @param {string} args.strategy - Prompt strategy for this cell.
 * @param {string} args.complexityLevel - Prompt complexity level for this cell.
 * @returns {object} The first variant returned by `generatePromptVariants`.
 * @throws {Error} When no variant is produced.
 */
function buildPromptVariant(args) {
    const { evaluationCase, projectProfiles, strategy, complexityLevel } = args;
    const generated = generatePromptVariants({
        cases: [evaluationCase],
        projectProfiles,
        strategies: [strategy],
        complexityLevels: [complexityLevel]
    });
    const [firstVariant] = generated;
    if (firstVariant) {
        return firstVariant;
    }
    throw new Error(`Failed to generate prompt variant for case: ${evaluationCase.id}`);
}
|
|
131
|
+
/**
 * Builds a placeholder agent run result for a cell whose agent invocation threw.
 *
 * The result has status "failed", zero duration (start and end share one
 * timestamp), an empty answer, unavailable token usage, and the error message
 * as its only entry in `errors`.
 *
 * @param {object} args - The cell inputs (`runId`, `agentId`, `promptVariant`, `repoRoot`).
 * @param {unknown} error - The thrown value; non-Error values are stringified.
 * @returns {object} A synthetic agent run result.
 */
function buildSyntheticFailureResult(args, error) {
    const timestamp = new Date().toISOString();
    let failureMessage;
    if (error instanceof Error) {
        failureMessage = error.message;
    }
    else {
        failureMessage = String(error);
    }
    const { runId, agentId, promptVariant, repoRoot } = args;
    const isSimulated = agentId === "fake-agent";
    return {
        runId,
        agentId,
        displayName: agentId,
        surface: isSimulated ? "simulated" : "cli",
        promptVariantId: promptVariant.id,
        promptStrategy: promptVariant.strategy,
        promptComplexityLevel: promptVariant.complexityLevel,
        startedAt: timestamp,
        endedAt: timestamp,
        durationMs: 0,
        status: "failed",
        exitCode: null,
        command: agentId,
        args: [],
        cwd: repoRoot,
        finalAnswerText: "",
        finalAnswerParseStatus: "empty",
        tokenUsage: { source: "unavailable" },
        tokenUsageSource: "unavailable",
        tokenUsageReliability: "unavailable",
        warnings: [],
        errors: [failureMessage]
    };
}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { countEstimatedTokens, countTextChars, tokenCountMethod } from "../core/countTokens.js";
|
|
3
|
+
import { runMeasuredCommand } from "../core/runMeasuredCommand.js";
|
|
4
|
+
/**
 * Parses `text` as JSON, yielding `undefined` instead of throwing when parsing fails.
 *
 * @param {string} text - Candidate JSON text.
 * @returns {*} The parsed value, or `undefined` if `JSON.parse` throws.
 */
function parseJsonIfPossible(text) {
    let parsed;
    try {
        parsed = JSON.parse(text);
    }
    catch {
        // Unparseable output is an expected case; report it as "no payload".
        parsed = undefined;
    }
    return parsed;
}
|
|
12
|
+
/**
 * Extracts the list of object-like search results from a parsed search payload.
 *
 * Accepts either a bare array or an object that carries the array under the
 * first of `results`, `matches`, `items`, `data` whose value is an array.
 * Entries that are falsy or not of type "object" are dropped.
 *
 * @param {*} payload - Parsed search output (may be anything, including `undefined`).
 * @returns {Array<object>} The object-like entries, or an empty array when none are found.
 */
function readSearchResults(payload) {
    const keepObjects = (entries) => entries.filter((entry) => Boolean(entry) && typeof entry === "object");
    if (Array.isArray(payload)) {
        return keepObjects(payload);
    }
    const isRecord = Boolean(payload) && typeof payload === "object";
    if (!isRecord) {
        return [];
    }
    // The first key holding an array wins, even if that array is empty.
    const arrayKey = ["results", "matches", "items", "data"].find((key) => Array.isArray(payload[key]));
    return arrayKey === undefined ? [] : keepObjects(payload[arrayKey]);
}
|
|
27
|
+
/**
 * Normalizes a search candidate into `{ nodeId, file, symbol }`.
 *
 * Each field takes the first non-empty string found among its accepted
 * property names, in order; it is `undefined` when none qualifies.
 *
 * @param {object} candidate - One search result entry.
 * @returns {{nodeId: (string|undefined), file: (string|undefined), symbol: (string|undefined)}}
 */
function pickCandidateFields(candidate) {
    const firstNonEmptyString = (propertyNames) => {
        const match = propertyNames.find((name) => typeof candidate[name] === "string" && candidate[name] !== "");
        return match === undefined ? undefined : candidate[match];
    };
    const nodeId = firstNonEmptyString(["nodeId", "id", "node", "symbolId"]);
    const file = firstNonEmptyString(["file", "path", "filePath"]);
    const symbol = firstNonEmptyString(["symbol", "name", "label"]);
    return { nodeId, file, symbol };
}
|
|
42
|
+
/**
 * Retrieves context for one evaluation case by driving the my-dev-kit CLI:
 * `index`, then `search`, then (when the top search hit exposes a node id)
 * `lookup`, `slice` and `source` for that node.
 *
 * The returned `contextText` is the first non-blank output among source,
 * slice, lookup and raw search stdout, in that order. Every command that was
 * run is recorded in `commands`.
 *
 * @param {object} options
 * @param {object} options.evaluationCase - Supplies `id`, `absoluteTargetRoot`, `sourceRoots`, `query`.
 * @param {string} options.outputDir - Base directory for command logs and indexes.
 * @param {string} options.kitCommand - Command string used to invoke the kit.
 * @param {boolean} [options.requireKit] - When truthy, a failing `index` or `search` throws
 *   instead of producing a skipped result.
 * @returns {Promise<object>} Retrieval result; `skipped` is true when no context was obtained.
 * @throws {Error} When `requireKit` is truthy and `index` or `search` fails.
 */
export async function runMyDevKitRetrieval(options) {
    const started = Date.now();
    const warnings = [];
    const commands = [];
    const caseId = options.evaluationCase.id;
    const commandsDir = path.join(options.outputDir, "commands", caseId);
    const indexesDir = path.join(options.outputDir, "indexes", caseId);

    // Runs one kit sub-command with the shared settings and records it in `commands`.
    const runKit = async (commandId, extraArgs) => {
        const command = await runMeasuredCommand({
            commandId,
            commandString: options.kitCommand,
            cwd: process.cwd(),
            outDir: commandsDir,
            extraArgs
        });
        commands.push(command);
        return command;
    };

    // Builds the empty "skipped" result shared by all early exits.
    const buildSkippedResult = () => ({
        caseId,
        skipped: true,
        warnings,
        totalChars: 0,
        totalEstimatedTokens: 0,
        tokenCountMethod,
        contextText: "",
        filesRead: [],
        commands,
        durationMs: Date.now() - started
    });

    // Handles a failed mandatory step: throw when the kit is required, otherwise warn and skip.
    const abandonAfterFailure = (command, commandId, warning) => {
        if (options.requireKit) {
            throw new Error(command.error || `my-dev-kit ${commandId} failed with exit code ${command.exitCode}`);
        }
        warnings.push(warning);
        return buildSkippedResult();
    };

    const indexCommand = await runKit("index", [
        "index",
        "--root",
        options.evaluationCase.absoluteTargetRoot,
        ...options.evaluationCase.sourceRoots.flatMap((sourceRoot) => ["--src", sourceRoot]),
        "--out",
        indexesDir,
        "--json"
    ]);
    if (!indexCommand.ok) {
        return abandonAfterFailure(indexCommand, "index", "my-dev-kit index command was unavailable or failed.");
    }

    const searchCommand = await runKit("search", [
        "search",
        "--index",
        indexesDir,
        "--query",
        options.evaluationCase.query,
        "--json"
    ]);
    if (!searchCommand.ok) {
        return abandonAfterFailure(searchCommand, "search", "my-dev-kit search command failed.");
    }

    // Only the top-ranked candidate is followed up.
    const candidates = readSearchResults(parseJsonIfPossible(searchCommand.stdout));
    const selected = candidates[0];
    if (!selected) {
        warnings.push("No my-dev-kit search candidate was found.");
        return buildSkippedResult();
    }

    const { nodeId: selectedNodeId, file: selectedFile, symbol: selectedSymbol } = pickCandidateFields(selected);
    const nodeOutputs = { lookup: "", slice: "", source: "" };
    if (selectedNodeId) {
        // Node-scoped follow-ups run sequentially; a failure only adds a warning.
        const nodeCommands = [
            ["lookup", ["--json"]],
            ["slice", ["--json"]],
            ["source", ["--max-lines", "160", "--format", "numbered"]]
        ];
        for (const [commandId, trailingArgs] of nodeCommands) {
            const command = await runKit(commandId, [commandId, "--index", indexesDir, "--node", selectedNodeId, ...trailingArgs]);
            if (command.ok) {
                nodeOutputs[commandId] = command.stdout;
            }
            else {
                warnings.push(`my-dev-kit ${commandId} command failed.`);
            }
        }
    }
    else {
        warnings.push("No my-dev-kit node id was available after search.");
    }

    // Prefer the most specific output that actually contains text.
    const contextText = [nodeOutputs.source, nodeOutputs.slice, nodeOutputs.lookup, searchCommand.stdout]
        .find((text) => text && text.trim().length > 0) ?? "";
    const filesRead = selectedFile ? [selectedFile] : [];
    return {
        caseId,
        skipped: contextText.length === 0,
        warnings,
        totalChars: countTextChars(contextText),
        totalEstimatedTokens: countEstimatedTokens(contextText),
        tokenCountMethod,
        contextText,
        filesRead,
        commands,
        selectedNodeId,
        selectedFile,
        selectedSymbol,
        durationMs: Date.now() - started
    };
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { readFileSync, statSync } from "node:fs";
|
|
2
|
+
import { collectFilesForGlobs } from "../core/fileGlobs.js";
|
|
3
|
+
import { countEstimatedTokens, countTextChars, tokenCountMethod } from "../core/countTokens.js";
|
|
4
|
+
/**
 * Builds the "raw full file" baseline context for one evaluation case by
 * concatenating every file under the target root that matches the case's
 * include globs.
 *
 * Each file is emitted as `=== FILE: <relativePath> ===`, a newline, the file
 * content decoded as UTF-8 and a trailing newline; the per-file sections are
 * joined with one extra newline.
 *
 * @param {object} evaluationCase - Case descriptor. Reads `id`, `targetRoot`
 *   (used only in error messages), `absoluteTargetRoot` and `rawIncludeGlobs`.
 * @returns {Promise<object>} Baseline record: the included relative paths,
 *   character/token totals of the concatenated text, the text itself and the
 *   elapsed wall-clock time in milliseconds.
 * @throws {Error} When the target root cannot be stat'ed (the underlying
 *   filesystem error is attached as `cause`) or is not a directory.
 */
export async function runRawFullFileBaseline(evaluationCase) {
    const started = Date.now();
    let stats;
    try {
        stats = statSync(evaluationCase.absoluteTargetRoot);
    }
    catch (error) {
        // statSync can fail for reasons other than a missing path (EACCES,
        // ELOOP, ...). The message is kept for callers that match on it, and
        // the real failure stays reachable through `cause`.
        throw new Error(`Target root does not exist: ${evaluationCase.targetRoot}`, { cause: error });
    }
    if (!stats.isDirectory()) {
        throw new Error(`Target root is not a directory: ${evaluationCase.targetRoot}`);
    }
    const files = collectFilesForGlobs(evaluationCase.absoluteTargetRoot, evaluationCase.rawIncludeGlobs);
    const contextText = files
        .map(({ absolutePath, relativePath }) => `=== FILE: ${relativePath} ===\n${readFileSync(absolutePath, "utf8")}\n`)
        .join("\n");
    return {
        caseId: evaluationCase.id,
        targetRoot: evaluationCase.absoluteTargetRoot,
        filesIncluded: files.map((file) => file.relativePath),
        totalFiles: files.length,
        totalChars: countTextChars(contextText),
        totalEstimatedTokens: countEstimatedTokens(contextText),
        tokenCountMethod,
        contextText,
        durationMs: Date.now() - started
    };
}
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
/** Human-readable description of the scoring formula, echoed into every score record. */
export const CORRECTNESS_FORMULA = "correctnessScore = 0.25 * fileMatchScore + 0.25 * symbolMatchScore + 0.50 * factMatchScore; empty file or symbol categories are neutral at 1.0.";
/**
 * Scores a parsed agent answer against an answer key.
 *
 * The overall score is 0.25 * file score + 0.25 * symbol score + 0.5 * fact
 * score. A run passes only when no failure reason was recorded, every required
 * fact was matched, at least `minimumCorrectFacts` facts were matched and the
 * overall score is at least 0.7.
 *
 * @param {object} args - Reads `caseId`, the optional run `status`,
 *   `answerKey` (`expectedFiles`, `expectedSymbols`, `expectedFacts`,
 *   `minimumCorrectFacts`) and `parsedAnswer` (`parseStatus`, `relevantFiles`,
 *   `relevantSymbols`, `expectedFactsFound`, `answerText`).
 * @returns {object} Per-category scores, found/total counters, the pass flag,
 *   de-duplicated failure reasons and the formula description.
 */
export function scoreCorrectness(args) {
    const { answerKey, parsedAnswer } = args;
    const reasons = [];
    const runStatus = args.status;
    // "invalid-output" is not reported from the status; only a failed parse
    // (checked next) records the "invalid output" reason.
    if (runStatus && runStatus !== "completed" && runStatus !== "invalid-output") {
        reasons.push(statusFailureReason(runStatus));
    }
    if (parsedAnswer.parseStatus === "failed") {
        reasons.push("invalid output");
    }

    const expectedFilesTotal = answerKey.expectedFiles.length;
    const expectedSymbolsTotal = answerKey.expectedSymbols.length;
    const expectedFilesFound = countMatches(answerKey.expectedFiles, parsedAnswer.relevantFiles);
    const expectedSymbolsFound = countMatches(answerKey.expectedSymbols, parsedAnswer.relevantSymbols);

    const matchedFacts = matchFacts(answerKey, parsedAnswer.expectedFactsFound, parsedAnswer.answerText);
    const requiredFacts = answerKey.expectedFacts.filter((fact) => fact.required);
    const requiredFactsTotal = requiredFacts.length;
    const requiredFactsFound = matchedFacts.filter((fact) => fact.required).length;
    const optionalFactsTotal = answerKey.expectedFacts.length - requiredFactsTotal;
    const optionalFactsFound = matchedFacts.length - requiredFactsFound;
    const foundFactCount = matchedFacts.length;

    const fileMatchScore = categoryScore(expectedFilesFound, expectedFilesTotal);
    const symbolMatchScore = categoryScore(expectedSymbolsFound, expectedSymbolsTotal);
    const factMatchScore = weightedFactScore(answerKey, matchedFacts.map((fact) => fact.id));
    const correctnessScore = round(0.25 * fileMatchScore + 0.25 * symbolMatchScore + 0.5 * factMatchScore);

    // Failure reasons are appended in a fixed order; `unique` keeps the first
    // occurrence of each one.
    for (const fact of requiredFacts) {
        const wasMatched = matchedFacts.some((match) => match.id === fact.id);
        if (!wasMatched) {
            reasons.push(`missing required fact: ${fact.id}`);
        }
    }
    if (foundFactCount < answerKey.minimumCorrectFacts) {
        reasons.push(`too few facts found: ${foundFactCount}/${answerKey.minimumCorrectFacts}`);
    }
    if (expectedFilesTotal > 0 && expectedFilesFound < expectedFilesTotal) {
        reasons.push("missing expected file");
    }
    if (expectedSymbolsTotal > 0 && expectedSymbolsFound < expectedSymbolsTotal) {
        reasons.push("missing expected symbol");
    }
    if (correctnessScore < 0.7) {
        reasons.push("score below threshold");
    }

    const passed = reasons.length === 0 &&
        requiredFactsFound === requiredFactsTotal &&
        foundFactCount >= answerKey.minimumCorrectFacts &&
        correctnessScore >= 0.7;

    return {
        caseId: args.caseId,
        fileMatchScore,
        symbolMatchScore,
        factMatchScore,
        correctnessScore,
        requiredFactsFound,
        requiredFactsTotal,
        optionalFactsFound,
        optionalFactsTotal,
        expectedFilesFound,
        expectedFilesTotal,
        expectedSymbolsFound,
        expectedSymbolsTotal,
        passed,
        failureReasons: unique(reasons),
        formula: CORRECTNESS_FORMULA
    };
}
|
|
62
|
+
/**
 * Maps a run status to the failure-reason text used in score records.
 *
 * @param {string} status - Run status identifier.
 * @returns {string} The matching reason text, or `status` itself when the
 *   status is not one of the known identifiers.
 */
function statusFailureReason(status) {
    switch (status) {
        case "agent-limit-reached":
            return "agent limit reached";
        case "agent-unavailable":
            return "agent unavailable";
        case "timeout":
            return "timeout";
        case "failed":
            return "agent run failed";
        case "skipped":
            return "agent run skipped";
        case "invalid-output":
            return "invalid output";
        default:
            return status;
    }
}
|
|
77
|
+
/**
 * Counts how many expected items (file paths or symbol names) are matched by
 * at least one candidate from the answer.
 *
 * A candidate matches an expected item when either:
 *  - their `normalize`d texts are equal or one contains the other, or
 *  - their `normalizePath`d forms are equal or one is a `/`-delimited suffix
 *    of the other.
 *
 * @param {string[]} expected - Items from the answer key.
 * @param {string[]} actual - Candidates reported in the parsed answer.
 * @returns {number} Number of expected items with at least one match.
 */
function countMatches(expected, actual) {
    return expected.filter((item) => {
        const expectedNormalized = normalize(item);
        const expectedPath = normalizePath(item);
        return actual.some((candidate) => {
            const actualNormalized = normalize(candidate);
            const actualPath = normalizePath(candidate);
            // `"abc".includes("")` is always true, so a blank or
            // punctuation-only candidate (or expected item) would otherwise
            // match everything and inflate the score. Text comparison is only
            // meaningful when both sides kept some content.
            const textMatch = expectedNormalized !== "" &&
                actualNormalized !== "" &&
                (actualNormalized === expectedNormalized ||
                    actualNormalized.includes(expectedNormalized) ||
                    expectedNormalized.includes(actualNormalized));
            if (textMatch) {
                return true;
            }
            // Exact path equality stays unconditional so punctuation-only
            // identifiers (e.g. "$") can still match themselves. The suffix
            // checks need a non-empty suffix, since `endsWith("/")` would
            // accept any path that ends in a slash.
            return (actualPath === expectedPath ||
                (expectedPath !== "" && actualPath.endsWith(`/${expectedPath}`)) ||
                (actualPath !== "" && expectedPath.endsWith(`/${actualPath}`)));
        });
    }).length;
}
|
|
93
|
+
/**
 * Selects the answer-key facts that the answer is considered to contain.
 *
 * A fact counts as found when its normalized id or normalized text is either
 * listed in `expectedFactsFound` (after normalization) or appears as a
 * substring of the normalized answer text.
 *
 * @param {object} answerKey - Answer key; reads `expectedFacts` (`id`, `text`).
 * @param {string[]} expectedFactsFound - Fact ids/texts the answer claims.
 * @param {string} answerText - Free-form answer text.
 * @returns {object[]} The matching fact objects from `answerKey.expectedFacts`,
 *   in answer-key order.
 */
function matchFacts(answerKey, expectedFactsFound, answerText) {
    const reported = new Set(expectedFactsFound.map(normalize));
    const normalizedAnswer = normalize(answerText);
    // An id or text that normalizes to "" carries no signal: `includes("")` is
    // always true, and a blank claimed entry would match it in the set. Such
    // needles are never treated as found.
    const isMentioned = (needle) => needle !== "" && (reported.has(needle) || normalizedAnswer.includes(needle));
    return answerKey.expectedFacts.filter((fact) => {
        const factId = normalize(fact.id);
        const factText = normalize(fact.text);
        return isMentioned(factId) || isMentioned(factText);
    });
}
|
|
102
|
+
/**
 * Computes the weight-based share of expected facts that were found.
 *
 * @param {object} answerKey - Answer key; reads `expectedFacts` (`id`, `weight`).
 * @param {string[]} factIds - Ids of the facts that were matched.
 * @returns {number} Matched weight divided by total weight, rounded with
 *   `round`; 1 when there are no expected facts or the total weight is 0.
 */
function weightedFactScore(answerKey, factIds) {
    const facts = answerKey.expectedFacts;
    if (facts.length === 0) {
        return 1;
    }
    const matchedIds = new Set(factIds);
    let totalWeight = 0;
    let matchedWeight = 0;
    facts.forEach((fact) => {
        totalWeight = totalWeight + fact.weight;
        if (matchedIds.has(fact.id)) {
            matchedWeight = matchedWeight + fact.weight;
        }
    });
    // With no weight to distribute, the category is treated as fully satisfied.
    if (totalWeight === 0) {
        return 1;
    }
    return round(matchedWeight / totalWeight);
}
|
|
113
|
+
/**
 * Fraction of expected items found in one category.
 *
 * @param {number} found - Number of expected items that were matched.
 * @param {number} total - Number of expected items.
 * @returns {number} 1 when `total` is 0 (empty categories are neutral),
 *   otherwise `found / total` rounded with `round`.
 */
function categoryScore(found, total) {
    if (total === 0) {
        return 1;
    }
    return round(found / total);
}
|
|
116
|
+
/**
 * Reduces a string to its lowercase ASCII alphanumeric tokens, separated by
 * single spaces. Every run of other characters acts as a token separator.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} Space-joined tokens; "" when no token remains.
 */
function normalize(value) {
    const tokens = value
        .toLowerCase()
        .split(/[^a-z0-9]+/)
        .filter((token) => token.length > 0);
    return tokens.join(" ");
}
|
|
119
|
+
/**
 * Normalizes a file path for comparison: surrounding whitespace removed,
 * lowercased, backslashes turned into forward slashes, and one leading `./`
 * or `/` stripped.
 *
 * @param {string} value - Path as written in the answer key or the answer.
 * @returns {string} Normalized path.
 */
function normalizePath(value) {
    // Trim first: the prefix pattern is anchored at the start of the string,
    // so leading whitespace would otherwise leave "./" in place and make
    // otherwise equal paths compare as different. The trailing trim keeps the
    // previous behaviour for whitespace that follows the stripped prefix.
    return value
        .trim()
        .toLowerCase()
        .replace(/\\/g, "/")
        .replace(/^\.?\//, "")
        .trim();
}
|
|
122
|
+
/**
 * Rounds a number to four decimal places using `Math.round`.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` rounded to 1/10000 precision.
 */
function round(value) {
    const scale = 10000;
    const scaled = Math.round(value * scale);
    return scaled / scale;
}
|
|
125
|
+
/**
 * Removes duplicate entries while keeping first-occurrence order.
 *
 * @param {Iterable<*>} values - Values to de-duplicate.
 * @returns {Array<*>} New array holding each distinct value once.
 */
function unique(values) {
    const seen = new Set(values);
    return Array.from(seen);
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
/**
 * Writes all experiment artifacts below `args.outDir`.
 *
 * For every run, a `runs/<runId>/` directory receives `prompt.txt`,
 * `agent-run-result.json`, `parsed-answer.json` and `correctness-score.json`,
 * and the run object is given an `artifactPaths` property pointing at them.
 * Afterwards the experiment-level summary, runs, comparisons and config JSON
 * files are written to the output directory. JSON files are pretty-printed
 * with two-space indentation and end with a newline.
 *
 * @param {object} args - Reads `outDir`, `runs`, `summary`, `comparisons`
 *   and `config`.
 * @returns {Promise<object>} The summary, runs and comparisons that were
 *   passed in, the experiment-level artifact paths, and `summary.warnings`.
 */
export async function writeExperimentArtifacts(args) {
    const toJsonText = (payload) => `${JSON.stringify(payload, null, 2)}\n`;
    const rootDir = path.resolve(args.outDir);
    const runsDir = path.join(rootDir, "runs");
    await mkdir(runsDir, { recursive: true });

    for (const run of args.runs) {
        const runDir = path.join(runsDir, run.runId);
        await mkdir(runDir, { recursive: true });
        const perRunPaths = {
            promptPath: path.join(runDir, "prompt.txt"),
            agentRunResultPath: path.join(runDir, "agent-run-result.json"),
            parsedAnswerPath: path.join(runDir, "parsed-answer.json"),
            correctnessScorePath: path.join(runDir, "correctness-score.json")
        };
        await writeFile(perRunPaths.promptPath, getPromptTextFromRun(run), "utf8");
        await writeFile(perRunPaths.agentRunResultPath, toJsonText(run.agentRunResult), "utf8");
        await writeFile(perRunPaths.parsedAnswerPath, toJsonText(run.parsedAnswer), "utf8");
        await writeFile(perRunPaths.correctnessScorePath, toJsonText(run.correctness), "utf8");
        // Attached before the aggregate runs file is serialized below, so the
        // per-run paths end up inside experiment-runs.json as well.
        run.artifactPaths = perRunPaths;
    }

    const artifactPaths = {
        summaryPath: path.join(rootDir, "experiment-summary.json"),
        runsPath: path.join(rootDir, "experiment-runs.json"),
        comparisonsPath: path.join(rootDir, "experiment-comparisons.json"),
        configPath: path.join(rootDir, "experiment-config.json"),
        runsDir
    };
    const generatedAt = args.summary.generatedAt;
    await writeFile(artifactPaths.summaryPath, toJsonText(args.summary), "utf8");
    await writeFile(artifactPaths.runsPath, toJsonText({ generatedAt, runs: args.runs }), "utf8");
    await writeFile(artifactPaths.comparisonsPath, toJsonText({ generatedAt, comparisons: args.comparisons }), "utf8");
    await writeFile(artifactPaths.configPath, toJsonText(sanitizeConfig(args.config)), "utf8");

    return {
        summary: args.summary,
        runs: args.runs,
        comparisons: args.comparisons,
        artifactPaths,
        warnings: args.summary.warnings
    };
}
|
|
44
|
+
/**
 * Aggregates runs and raw-vs-my-dev-kit comparisons into one summary record.
 *
 * Averages only consider comparison values that are finite numbers. The three
 * `answerDoes…` fields are `null` when there is no data to answer them:
 * token and duration answers need at least one finite value, and the
 * correctness answer needs at least one comparison where both sides completed.
 *
 * @param {object} args - Reads `runs`, `comparisons`, `config`
 *   (`casesPath`, `projectProfilesPath`) and the optional `generatedAt`.
 * @returns {object} Summary with distinct sorted agents/strategies/complexity
 *   levels, per-status run counts, averages, the three answers and the
 *   concatenated run and comparison warnings.
 */
export function buildExperimentSummary(args) {
    const { runs, comparisons, config } = args;
    const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);
    const finiteMetric = (key) => comparisons.map((comparison) => comparison[key]).filter(isFiniteNumber);
    const distinctSorted = (key) => [...new Set(runs.map((run) => run[key]))].sort();

    const tokenSavings = finiteMetric("tokenSavingsPercent");
    const durationReductions = finiteMetric("durationReductionPercent");
    const correctnessDeltas = finiteMetric("correctnessDelta");
    const averageTokenSavingsPercent = averageOrNull(tokenSavings);
    const averageDurationReductionPercent = averageOrNull(durationReductions);
    const averageCorrectnessDelta = averageOrNull(correctnessDeltas);

    // Correctness preservation is judged only on pairs where both sides ran
    // to completion.
    const bothCompleted = comparisons.filter((comparison) => comparison.rawStatus === "completed" && comparison.myDevKitStatus === "completed");

    let savesTokens = null;
    if (tokenSavings.length !== 0) {
        savesTokens = averageTokenSavingsPercent > 0;
    }
    let preservesCorrectness = null;
    if (bothCompleted.length !== 0) {
        preservesCorrectness = bothCompleted.every((comparison) => comparison.sameCorrectnessPass);
    }
    let reducesTime = null;
    if (durationReductions.length !== 0) {
        reducesTime = averageDurationReductionPercent > 0;
    }

    return {
        generatedAt: args.generatedAt ?? new Date().toISOString(),
        casesPath: config.casesPath,
        projectProfilesPath: config.projectProfilesPath,
        agents: distinctSorted("agentId"),
        strategies: distinctSorted("promptStrategy"),
        complexityLevels: distinctSorted("promptComplexityLevel"),
        totalRuns: runs.length,
        completedRuns: countStatus(runs, "completed"),
        failedRuns: countStatus(runs, "failed"),
        skippedRuns: countStatus(runs, "skipped"),
        unavailableRuns: countStatus(runs, "agent-unavailable"),
        limitReachedRuns: countStatus(runs, "agent-limit-reached"),
        timeoutRuns: countStatus(runs, "timeout"),
        invalidOutputRuns: countStatus(runs, "invalid-output"),
        totalComparisons: comparisons.length,
        averageTokenSavingsPercent,
        averageDurationReductionPercent,
        averageCorrectnessDelta,
        answerDoesMyDevKitSaveTokens: savesTokens,
        answerDoesMyDevKitPreserveCorrectness: preservesCorrectness,
        answerDoesMyDevKitReduceExecutionTime: reducesTime,
        warnings: [
            ...runs.flatMap((run) => run.warnings),
            ...comparisons.flatMap((comparison) => comparison.warnings)
        ]
    };
}
|
|
85
|
+
/**
 * Returns the prompt text stored on a run for artifact output.
 *
 * @param {object} run - Run record; reads `promptTextForArtifact`.
 * @returns {string} The stored text, or "" when it is null or undefined.
 */
function getPromptTextFromRun(run) {
    const { promptTextForArtifact } = run;
    if (promptTextForArtifact === null || promptTextForArtifact === undefined) {
        return "";
    }
    return promptTextForArtifact;
}
|
|
88
|
+
/**
 * Produces the config object that is written to `experiment-config.json`.
 *
 * The result is a shallow copy of `config` whose `commandTemplates` map and
 * each truthy template inside it are shallow-copied too (`args` is carried
 * over as-is). When `config.commandTemplates` is falsy the property is set to
 * `undefined`.
 *
 * NOTE(review): despite the name, no field is removed or masked here —
 * confirm whether redaction was intended.
 *
 * @param {object} config - Experiment configuration.
 * @returns {object} Copied configuration.
 */
function sanitizeConfig(config) {
    const templates = config.commandTemplates;
    let copiedTemplates;
    if (templates) {
        const copiedEntries = Object.entries(templates).map(([agentKey, template]) => {
            if (!template) {
                return [agentKey, template];
            }
            return [agentKey, { ...template, args: template.args }];
        });
        copiedTemplates = Object.fromEntries(copiedEntries);
    }
    return {
        ...config,
        commandTemplates: copiedTemplates
    };
}
|
|
96
|
+
/**
 * Counts the runs whose `status` strictly equals the given status.
 *
 * @param {object[]} runs - Run records.
 * @param {string} status - Status to count.
 * @returns {number} Number of matching runs.
 */
function countStatus(runs, status) {
    return runs.reduce((count, run) => (run.status === status ? count + 1 : count), 0);
}
|
|
99
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number|null} The mean, or `null` for an empty list.
 */
function averageOrNull(values) {
    const count = values.length;
    if (count === 0) {
        return null;
    }
    let total = 0;
    values.forEach((value) => {
        total = total + value;
    });
    return total / count;
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { normalizeLabReport } from "../report/types.js";
|
|
4
|
+
import { renderHtmlReport } from "../report/renderHtmlReport.js";
|
|
5
|
+
import { renderTokenSavingsReportInput } from "./renderTokenSavingsReportInput.js";
|
|
6
|
+
/**
 * Writes the token-savings summary JSON, runs JSON and HTML report into
 * `options.outDir` (created if missing).
 *
 * The warning list starts as a copy of `summary.warnings`; the screenshot's
 * own warning and, for a failed capture with an error, a
 * "PNG screenshot capture failed" message are appended. The same list is used
 * for the rendered report, the summary file and the return value. The PNG
 * path is part of the reported artifact paths, but this function does not
 * write that file.
 *
 * @param {object} options - Reads `outDir`, `summary`, `runs`,
 *   `comparisonCases`, `commandConfig`, `screenshot` and `generatedAt`.
 * @returns {Promise<object>} Summary, runs, the normalized report, the
 *   screenshot record, artifact paths and the collected warnings.
 */
export async function writeTokenSavingsArtifacts(options) {
    const outDir = path.resolve(options.outDir);
    await mkdir(outDir, { recursive: true });

    const { summary, screenshot, commandConfig, runs } = options;
    const inOutDir = (fileName) => path.join(outDir, fileName);
    const toJsonText = (payload) => JSON.stringify(payload, null, 2);

    const artifactPaths = {
        summaryPath: inOutDir("token-savings-summary.json"),
        runsPath: inOutDir("token-savings-runs.json"),
        htmlPath: inOutDir("token-savings-report.html"),
        pngPath: inOutDir("token-savings-report.png")
    };

    const warnings = Array.from(summary.warnings);
    if (screenshot.warning) {
        warnings.push(screenshot.warning);
    }
    const captureFailed = screenshot.status === "failed" && screenshot.error;
    if (captureFailed) {
        warnings.push(`PNG screenshot capture failed: ${screenshot.error}`);
    }

    const { summaryPath, runsPath, htmlPath } = artifactPaths;
    const report = renderTokenSavingsReportInput({
        summary,
        cases: options.comparisonCases,
        commandConfig,
        artifactPaths: { summaryPath, runsPath, htmlPath },
        warnings
    });
    const normalizedReport = normalizeLabReport(report, options.generatedAt);
    const { tokenCountMethod } = summary;

    const summaryDocument = {
        summary,
        tokenCountMethod,
        generatedAt: normalizedReport.generatedAt,
        commandConfiguration: commandConfig,
        warnings,
        screenshot,
        artifactPaths
    };
    await writeFile(summaryPath, toJsonText(summaryDocument), "utf8");

    const runsDocument = {
        generatedAt: normalizedReport.generatedAt,
        tokenCountMethod,
        runs
    };
    await writeFile(runsPath, toJsonText(runsDocument), "utf8");

    await writeFile(htmlPath, renderHtmlReport(normalizedReport), "utf8");

    return {
        summary,
        runs,
        report: normalizedReport,
        screenshot,
        artifactPaths,
        warnings
    };
}
|