@dailephd/my-dev-kit-lab 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -0
- package/benchmarks/contracts/benchmark-project-profiles.json +1199 -0
- package/benchmarks/contracts/todo-behavior.md +70 -0
- package/benchmarks/contracts/todo-benchmark-case.json +227 -0
- package/benchmarks/projects/README.md +34 -0
- package/benchmarks/projects/task-analytics-large-mixed/README.md +1 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/__init__.py +3 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/fixtures.py +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/metrics.py +29 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/models.py +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/parser.py +16 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/pipeline.py +9 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/quality.py +8 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/reporting.py +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_metrics.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_parser.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_quality.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_reporting.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/package.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/index.ts +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/analyticsSnapshot.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/project.ts +5 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/task.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/buildProjectLeaderboard.ts +7 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/formatTaskHealthReport.ts +13 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/buildAnalyticsSnapshot.ts +39 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/completeTask.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/createTask.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/listTasksByProject.ts +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/projectStore.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/taskStore.ts +44 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/projectValidation.ts +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/taskValidation.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/buildAnalyticsSnapshot.test.ts +48 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/completeTask.test.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/createTask.test.ts +31 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/listTasksByProject.test.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/reporting.test.ts +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/vitest.config.ts +5 -0
- package/benchmarks/projects/task-workflow-medium-ts/README.md +1 -0
- package/benchmarks/projects/task-workflow-medium-ts/package.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/index.ts +9 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/project.ts +6 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/task.ts +39 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/completeTask.ts +15 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/createTask.ts +26 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/filterTasks.ts +17 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/importTasks.ts +33 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/summarizeTasks.ts +30 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/store/taskStore.ts +76 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/utils/deterministicId.ts +3 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/validation/taskValidation.ts +45 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/completeTask.test.ts +16 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/createTask.test.ts +21 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/filterTasks.test.ts +18 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/importTasks.test.ts +22 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/summarizeTasks.test.ts +29 -0
- package/benchmarks/projects/task-workflow-medium-ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-js/README.md +3 -0
- package/benchmarks/projects/todo-js/package.json +11 -0
- package/benchmarks/projects/todo-js/src/index.js +2 -0
- package/benchmarks/projects/todo-js/src/taskService.js +37 -0
- package/benchmarks/projects/todo-js/src/taskStore.js +28 -0
- package/benchmarks/projects/todo-js/tests/taskService.test.js +45 -0
- package/benchmarks/projects/todo-js/vitest.config.js +5 -0
- package/benchmarks/projects/todo-mixed-ts-py/README.md +3 -0
- package/benchmarks/projects/todo-mixed-ts-py/package.json +13 -0
- package/benchmarks/projects/todo-mixed-ts-py/python/task_service.py +76 -0
- package/benchmarks/projects/todo-mixed-ts-py/src/taskCli.ts +38 -0
- package/benchmarks/projects/todo-mixed-ts-py/tests/mixedBoundary.test.ts +18 -0
- package/benchmarks/projects/todo-mixed-ts-py/tsconfig.json +12 -0
- package/benchmarks/projects/todo-mixed-ts-py/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-python/README.md +3 -0
- package/benchmarks/projects/todo-python/src/__init__.py +4 -0
- package/benchmarks/projects/todo-python/src/task_service.py +32 -0
- package/benchmarks/projects/todo-python/src/task_store.py +28 -0
- package/benchmarks/projects/todo-python/tests/test_task_service.py +52 -0
- package/benchmarks/projects/todo-ts/README.md +3 -0
- package/benchmarks/projects/todo-ts/package.json +12 -0
- package/benchmarks/projects/todo-ts/src/index.ts +2 -0
- package/benchmarks/projects/todo-ts/src/taskService.ts +41 -0
- package/benchmarks/projects/todo-ts/src/taskStore.ts +34 -0
- package/benchmarks/projects/todo-ts/tests/taskService.test.ts +45 -0
- package/benchmarks/projects/todo-ts/tsconfig.json +12 -0
- package/benchmarks/projects/todo-ts/vitest.config.ts +5 -0
- package/dist/scripts/build-gallery.js +3 -0
- package/dist/scripts/capture-demo-report.js +3 -0
- package/dist/scripts/evaluate-token-savings.js +2 -0
- package/dist/scripts/experiments/describeExperiment.js +143 -0
- package/dist/scripts/experiments/listExperiments.js +44 -0
- package/dist/scripts/experiments/runExperiment.js +199 -0
- package/dist/scripts/generate-experiment-plots.js +3 -0
- package/dist/scripts/generate-prompt-variants.js +2 -0
- package/dist/scripts/render-experiment-report.js +2 -0
- package/dist/scripts/run-agent-prompt.js +2 -0
- package/dist/scripts/run-controlled-experiment.js +2 -0
- package/dist/scripts/run-final-demo.js +3 -0
- package/dist/scripts/run-lab-demo.js +5 -0
- package/dist/scripts/run-visualization-demos.js +3 -0
- package/dist/scripts/security/runCodeql.js +57 -0
- package/dist/scripts/security/runDependencyChecks.js +57 -0
- package/dist/scripts/security/runFuzzSmoke.js +29 -0
- package/dist/scripts/security/runPackageChecks.js +56 -0
- package/dist/scripts/security/runSemgrep.js +63 -0
- package/dist/scripts/security/validate.js +117 -0
- package/dist/scripts/verify-benchmarks.js +202 -0
- package/dist/src/agents/adapters/claudeAdapter.js +37 -0
- package/dist/src/agents/adapters/codexAdapter.js +110 -0
- package/dist/src/agents/adapters/fakeAgentAdapter.js +101 -0
- package/dist/src/agents/agentRegistry.js +21 -0
- package/dist/src/agents/index.js +7 -0
- package/dist/src/agents/parseAgentTokenUsage.js +137 -0
- package/dist/src/agents/runAgentPrompt.js +38 -0
- package/dist/src/agents/types.js +1 -0
- package/dist/src/commands/buildGalleryCommand.js +56 -0
- package/dist/src/commands/captureDemoReport.js +116 -0
- package/dist/src/commands/evaluateTokenSavings.js +175 -0
- package/dist/src/commands/generateExperimentPlotsCommand.js +38 -0
- package/dist/src/commands/generatePromptVariants.js +67 -0
- package/dist/src/commands/renderExperimentReportCommand.js +131 -0
- package/dist/src/commands/runAgentPromptCommand.js +132 -0
- package/dist/src/commands/runControlledExperimentCommand.js +174 -0
- package/dist/src/commands/runFinalDemoCommand.js +123 -0
- package/dist/src/commands/runLabDemo.js +62 -0
- package/dist/src/commands/runVisualizationDemosCommand.js +67 -0
- package/dist/src/core/commandLine.js +59 -0
- package/dist/src/core/countTokens.js +8 -0
- package/dist/src/core/fileGlobs.js +100 -0
- package/dist/src/core/localProjectTarget.js +75 -0
- package/dist/src/core/pathSafety.js +19 -0
- package/dist/src/core/pythonCommand.js +30 -0
- package/dist/src/core/resolveCommand.js +110 -0
- package/dist/src/core/runMeasuredCommand.js +143 -0
- package/dist/src/evaluation/benchmarkMetadata.js +207 -0
- package/dist/src/evaluation/buildExperimentMatrix.js +75 -0
- package/dist/src/evaluation/classifyAgentRunOutcome.js +40 -0
- package/dist/src/evaluation/compareExperimentRuns.js +79 -0
- package/dist/src/evaluation/compareTokenSavings.js +47 -0
- package/dist/src/evaluation/controlledExperimentTypes.js +1 -0
- package/dist/src/evaluation/index.js +18 -0
- package/dist/src/evaluation/parseAgentAnswer.js +230 -0
- package/dist/src/evaluation/projectComplexity.js +126 -0
- package/dist/src/evaluation/projectFileTree.js +83 -0
- package/dist/src/evaluation/readEvaluationCases.js +59 -0
- package/dist/src/evaluation/renderTokenSavingsReportInput.js +55 -0
- package/dist/src/evaluation/runControlledExperiment.js +158 -0
- package/dist/src/evaluation/runMyDevKitRetrieval.js +197 -0
- package/dist/src/evaluation/runRawFullFileBaseline.js +31 -0
- package/dist/src/evaluation/scoreCorrectness.js +127 -0
- package/dist/src/evaluation/types.js +1 -0
- package/dist/src/evaluation/writeExperimentArtifacts.js +104 -0
- package/dist/src/evaluation/writeTokenSavingsArtifacts.js +57 -0
- package/dist/src/experiments/config.js +24 -0
- package/dist/src/experiments/defaultRegistry.js +7 -0
- package/dist/src/experiments/errors.js +18 -0
- package/dist/src/experiments/index.js +9 -0
- package/dist/src/experiments/outputPaths.js +25 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/config.js +37 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/index.js +3 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/plugin.js +83 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/resultMapping.js +260 -0
- package/dist/src/experiments/plugins/index.js +1 -0
- package/dist/src/experiments/registry.js +43 -0
- package/dist/src/experiments/results.js +48 -0
- package/dist/src/experiments/runner.js +181 -0
- package/dist/src/experiments/target.js +8 -0
- package/dist/src/experiments/types.js +1 -0
- package/dist/src/gallery/index.js +2 -0
- package/dist/src/gallery/types.js +1 -0
- package/dist/src/gallery/writeGalleryManifest.js +214 -0
- package/dist/src/index.js +12 -0
- package/dist/src/plots/buildExperimentPlotData.js +137 -0
- package/dist/src/plots/index.js +4 -0
- package/dist/src/plots/renderSvgChart.js +82 -0
- package/dist/src/plots/types.js +1 -0
- package/dist/src/plots/writePlotArtifacts.js +46 -0
- package/dist/src/prompts/buildPromptContext.js +68 -0
- package/dist/src/prompts/generateMyDevKitPrompt.js +106 -0
- package/dist/src/prompts/generatePromptVariants.js +36 -0
- package/dist/src/prompts/generateRawFullFilePrompt.js +97 -0
- package/dist/src/prompts/index.js +7 -0
- package/dist/src/prompts/measurePromptComplexity.js +41 -0
- package/dist/src/prompts/types.js +1 -0
- package/dist/src/prompts/writePromptArtifacts.js +43 -0
- package/dist/src/report/buildExperimentReportInput.js +339 -0
- package/dist/src/report/experimentReportTypes.js +1 -0
- package/dist/src/report/experiments/buildPluginExperimentReport.js +153 -0
- package/dist/src/report/experiments/experimentReportModel.js +1 -0
- package/dist/src/report/experiments/index.js +4 -0
- package/dist/src/report/experiments/renderPluginExperimentReportHtml.js +133 -0
- package/dist/src/report/experiments/writePluginExperimentReports.js +30 -0
- package/dist/src/report/index.js +8 -0
- package/dist/src/report/renderExperimentHtmlReport.js +354 -0
- package/dist/src/report/renderHtmlReport.js +103 -0
- package/dist/src/report/types.js +10 -0
- package/dist/src/report/writeExperimentReportArtifacts.js +38 -0
- package/dist/src/report/writeReportArtifacts.js +39 -0
- package/dist/src/screenshot/captureReportScreenshot.js +75 -0
- package/dist/src/screenshot/index.js +2 -0
- package/dist/src/screenshot/types.js +1 -0
- package/dist/src/securityValidation/artifacts.js +15 -0
- package/dist/src/securityValidation/cliAdversarial/adversarialCliConfig.js +38 -0
- package/dist/src/securityValidation/cliAdversarial/dataVolumeChecks.js +194 -0
- package/dist/src/securityValidation/cliAdversarial/jsonStdoutChecks.js +359 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactChecks.js +284 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactFixtures.js +79 -0
- package/dist/src/securityValidation/cliAdversarial/pathBoundaryChecks.js +431 -0
- package/dist/src/securityValidation/cliAdversarial/pathCases.js +144 -0
- package/dist/src/securityValidation/cliAdversarial/readOnlyBoundaryChecks.js +294 -0
- package/dist/src/securityValidation/cliAdversarial/runAdversarialCheck.js +149 -0
- package/dist/src/securityValidation/cliAdversarial/subprocessSafetyChecks.js +214 -0
- package/dist/src/securityValidation/cliAdversarial/tempWorkspace.js +160 -0
- package/dist/src/securityValidation/commandRunner.js +136 -0
- package/dist/src/securityValidation/config.js +39 -0
- package/dist/src/securityValidation/dependencies/parseNpmAudit.js +115 -0
- package/dist/src/securityValidation/dependencies/parseNpmLs.js +71 -0
- package/dist/src/securityValidation/dependencies/parseNpmOutdated.js +41 -0
- package/dist/src/securityValidation/dependencies/runDependencyChecks.js +239 -0
- package/dist/src/securityValidation/dependencies/runOsvScanner.js +43 -0
- package/dist/src/securityValidation/fuzz/fuzzHarness.js +61 -0
- package/dist/src/securityValidation/fuzz/fuzzTargets.js +204 -0
- package/dist/src/securityValidation/fuzz/randomInput.js +0 -0
- package/dist/src/securityValidation/index.js +34 -0
- package/dist/src/securityValidation/packageChecks/forbiddenPackageContents.js +67 -0
- package/dist/src/securityValidation/packageChecks/parseNpmPackDryRun.js +56 -0
- package/dist/src/securityValidation/packageChecks/runPackageChecks.js +88 -0
- package/dist/src/securityValidation/report/renderSecurityReport.js +248 -0
- package/dist/src/securityValidation/report/securityReportTypes.js +1 -0
- package/dist/src/securityValidation/staticScans/codeql.js +66 -0
- package/dist/src/securityValidation/staticScans/semgrep.js +180 -0
- package/dist/src/securityValidation/testMatrix.js +535 -0
- package/dist/src/securityValidation/types.js +34 -0
- package/dist/src/securityValidation/validate/resolveTarget.js +32 -0
- package/dist/src/securityValidation/validate/runSecurityValidation.js +169 -0
- package/dist/src/securityValidation/validate/verdict.js +73 -0
- package/dist/src/visualizationDemos/buildMyDevKitVisualizationCommands.js +59 -0
- package/dist/src/visualizationDemos/index.js +4 -0
- package/dist/src/visualizationDemos/runVisualizationDemos.js +82 -0
- package/dist/src/visualizationDemos/types.js +1 -0
- package/dist/src/visualizationDemos/writeVisualizationDemoArtifacts.js +25 -0
- package/docs/METRICS.md +286 -0
- package/examples/demo-report-input.json +78 -0
- package/examples/lab-demo-cases.json +35 -0
- package/examples/real-agent-campaign-cases.json +118 -0
- package/examples/token-savings-cases.json +122 -0
- package/package.json +91 -0
- package/tests/fixtures/fake-adversarial-cli.js +152 -0
- package/tests/fixtures/fake-my-dev-kit-cli.js +83 -0
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
import { access, readFile } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { readBenchmarkProjectProfiles, readEvaluationCases } from "../evaluation/index.js";
|
|
4
|
+
// Fallback character budget for each prompt excerpt when options.maxPromptChars is not supplied.
const DEFAULT_MAX_PROMPT_CHARS = 1800;
|
|
5
|
+
// Fallback number of file-tree entries kept per project when options.maxFileTreeEntries is not supplied.
const DEFAULT_MAX_FILE_TREE_ENTRIES = 80;
|
|
6
|
+
/**
 * Assembles the input model for a controlled-experiment report from the JSON
 * artifacts stored in an experiment directory.
 *
 * Required artifacts (experiment-summary/runs/comparisons/config .json) must be
 * readable and valid JSON; prompt, plot and visualization artifacts are
 * optional and only produce warnings when unavailable.
 *
 * @param {object} options
 * @param {string} options.experimentDir - Experiment directory, resolved against repoRoot.
 * @param {string} [options.repoRoot] - Base directory; defaults to process.cwd().
 * @param {string} [options.plotsDir] - Optional plots directory, resolved against repoRoot.
 * @param {string} [options.visualizationsDir] - Optional visualization directory, resolved against repoRoot.
 * @param {number} [options.maxPromptChars] - Prompt excerpt length cap.
 * @param {number} [options.maxFileTreeEntries] - File-tree entry cap per project.
 * @param {string} [options.title] - Report title override.
 * @param {string} [options.subtitle] - Report subtitle override.
 * @returns {Promise<object>} The report input model, including the raw artifacts.
 * @throws {Error} When a required experiment artifact is missing or invalid.
 */
export async function buildExperimentReportInput(options) {
    const repoRoot = path.resolve(options.repoRoot ?? process.cwd());
    const experimentDir = path.resolve(repoRoot, options.experimentDir);
    const warnings = [];
    // Read sequentially so that, when several artifacts are missing, the error
    // always names the first one in this fixed order.
    const summary = await readRequiredJson(experimentDir, "experiment-summary.json");
    const runsPayload = await readRequiredJson(experimentDir, "experiment-runs.json");
    const comparisonsPayload = await readRequiredJson(experimentDir, "experiment-comparisons.json");
    const config = await readRequiredJson(experimentDir, "experiment-config.json");
    const runs = runsPayload.runs ?? [];
    const comparisons = comparisonsPayload.comparisons ?? [];
    const profilesPath = typeof summary.projectProfilesPath === "string" ? summary.projectProfilesPath : "benchmarks/contracts/benchmark-project-profiles.json";
    const profiles = await readBenchmarkProjectProfiles(path.resolve(repoRoot, profilesPath), repoRoot);
    const cases = await readEvaluationCases(path.resolve(repoRoot, summary.casesPath), repoRoot, {
        projectProfiles: profiles,
        requireProjectProfileRef: true
    });
    // Only projects and cases that actually appear in the runs are reported;
    // ids without a matching profile/case are dropped.
    const selectedProjectIds = unique(runs.map((run) => run.benchmarkProject));
    const selectedCaseIds = unique(runs.map((run) => run.caseId));
    const selectedProfiles = selectedProjectIds
        .map((projectId) => profiles.find((profile) => profile.projectId === projectId))
        .filter((profile) => Boolean(profile));
    const selectedCases = selectedCaseIds
        .map((caseId) => cases.find((evaluationCase) => evaluationCase.id === caseId))
        .filter((evaluationCase) => Boolean(evaluationCase));
    // The optional-artifact builders append to `warnings`; keep them sequential
    // so the warning order stays stable.
    const promptSections = await buildPromptSections({
        experimentDir,
        runs,
        warnings,
        maxPromptChars: options.maxPromptChars ?? DEFAULT_MAX_PROMPT_CHARS
    });
    const plotSections = await buildPlotSections({ repoRoot, plotsDir: options.plotsDir, warnings });
    const visualizationSections = await buildVisualizationSections({ repoRoot, visualizationsDir: options.visualizationsDir, warnings });
    const aggregate = buildAggregateAnswers({ summary, runs, comparisons });
    // Artifacts may omit their warnings arrays; treat a missing list as empty
    // instead of throwing on the spread or leaking `undefined` into the report.
    const mergedWarnings = unique([
        ...(summary.warnings ?? []),
        ...runs.flatMap((run) => run.warnings ?? []),
        ...comparisons.flatMap((comparison) => comparison.warnings ?? []),
        ...warnings
    ]);
    return {
        generatedAt: new Date().toISOString(),
        sourceExperimentDir: experimentDir,
        title: options.title ?? "Controlled Experiment Report",
        subtitle: options.subtitle ?? "raw-full-file vs my-dev-kit-guided strategy comparison",
        executiveSummary: aggregate,
        methodology: buildMethodology(summary),
        projectProfiles: selectedProfiles.map(buildProjectSection),
        benchmarkCases: selectedCases.map(buildBenchmarkCaseSection),
        fileTreeSections: selectedProfiles.map((profile) => buildFileTreeSection(profile, options.maxFileTreeEntries ?? DEFAULT_MAX_FILE_TREE_ENTRIES)),
        promptComparisonSections: promptSections,
        agentRunSections: runs,
        correctnessSections: runs,
        tokenSections: comparisons,
        timingSections: comparisons,
        comparisonSections: comparisons,
        plotSections,
        visualizationSections,
        formulaSections: buildFormulaSections(),
        limitations: buildLimitations(runs),
        warnings: mergedWarnings,
        artifactLinks: buildArtifactLinks(experimentDir, runs),
        nextSteps: [
            "Run optional Codex and Claude experiments when local CLI sessions and account limits allow.",
            "Use JSON artifacts as the source of truth for follow-up analysis."
        ],
        rawArtifacts: {
            summary,
            runs,
            comparisons,
            config
        }
    };
}
|
|
73
|
+
/**
 * Builds the plot sections from `plot-data.json` inside the optional plots
 * directory.
 *
 * @param {object} args
 * @param {string} args.repoRoot - Base directory used to resolve plotsDir.
 * @param {string} [args.plotsDir] - Plots directory; when falsy no plots are reported.
 * @param {string[]} args.warnings - Receives a warning if the plot data cannot be used.
 * @returns {Promise<object[]>} One `{ id, title, kind, path }` entry per plot, or [] on any failure.
 */
async function buildPlotSections(args) {
    if (!args.plotsDir) {
        return [];
    }
    const plotsDir = path.resolve(args.repoRoot, args.plotsDir);
    const dataFile = path.join(plotsDir, "plot-data.json");
    try {
        const rawText = await readFile(dataFile, "utf8");
        const data = JSON.parse(rawText);
        const plots = data.plots ?? [];
        // Each chart is expected at <plotsDir>/charts/<plot id>.svg.
        return plots.map((plot) => {
            return {
                id: plot.id,
                title: plot.title,
                kind: "svg",
                path: path.join(plotsDir, "charts", `${plot.id}.svg`)
            };
        });
    }
    catch (error) {
        // Plots are optional: record why they were skipped and carry on.
        const reason = error instanceof Error ? error.message : String(error);
        args.warnings.push(`Plot artifacts unavailable: ${reason}`);
        return [];
    }
}
|
|
91
|
+
/**
 * Builds the visualization-demo sections from `visualization-demo-runs.json`
 * inside the optional visualizations directory.
 *
 * @param {object} args
 * @param {string} args.repoRoot - Base directory used to resolve visualizationsDir.
 * @param {string} [args.visualizationsDir] - Demo output directory; when falsy nothing is reported.
 * @param {string[]} args.warnings - Receives a warning if the demo data cannot be used.
 * @returns {Promise<object[]>} One summary entry per demo run, or [] on any failure.
 */
async function buildVisualizationSections(args) {
    if (!args.visualizationsDir) {
        return [];
    }
    const visualizationsDir = path.resolve(args.repoRoot, args.visualizationsDir);
    const runsFile = path.join(visualizationsDir, "visualization-demo-runs.json");
    try {
        const rawText = await readFile(runsFile, "utf8");
        const runsPayload = JSON.parse(rawText);
        const demoRuns = runsPayload.runs ?? [];
        return demoRuns.map((run) => {
            // A run that is not ok is surfaced as a warning, not a failure.
            const status = run.ok ? "completed" : "warning";
            return {
                id: run.id,
                name: run.name,
                status,
                durationMs: run.durationMs,
                producedArtifactPaths: run.producedArtifactPaths ?? [],
                warnings: run.warnings ?? []
            };
        });
    }
    catch (error) {
        // Demos are optional: record why they were skipped and carry on.
        const reason = error instanceof Error ? error.message : String(error);
        args.warnings.push(`Visualization demo artifacts unavailable: ${reason}`);
        return [];
    }
}
|
|
111
|
+
/**
 * Derives the executive-summary answers (token savings, correctness, timing)
 * and run counters from the experiment summary, runs and comparisons.
 *
 * @param {object} args
 * @param {object} args.summary - Parsed experiment summary with run counters.
 * @param {object[]} args.runs - Experiment runs; used to detect fake-agent-only experiments.
 * @param {object[]} args.comparisons - Paired-run comparisons.
 * @returns {object} Answers, counters, reliability counts and a one-paragraph summary text.
 */
export function buildAggregateAnswers(args) {
    const reliabilityCounts = countBy(args.comparisons.map((comparison) => comparison.reliabilityLabel));
    // Token savings only count comparisons that flag a usable token comparison.
    const tokenAnswer = answerFromComparisonValues(args.comparisons
        .filter((comparison) => comparison.tokenComparisonAvailable)
        .map((comparison) => comparison.tokenSavingsPercent));
    const correctnessAnswer = answerCorrectness(args.comparisons);
    const timingAnswer = answerFromComparisonValues(args.comparisons.map((comparison) => comparison.durationReductionPercent));
    const fakeOnly = args.runs.length > 0 && args.runs.every((run) => run.agentId === "fake-agent");
    // Sum only counters that are real numbers: a single missing counter would
    // otherwise turn the sum into NaN and silently suppress the qualifier.
    const externalIssues = [
        args.summary.limitReachedRuns,
        args.summary.timeoutRuns,
        args.summary.unavailableRuns,
        args.summary.invalidOutputRuns
    ].reduce((total, count) => total + (Number.isFinite(count) ? count : 0), 0);
    let qualifier = "";
    if (fakeOnly) {
        qualifier = " Results are deterministic fake-agent smoke evidence, not real Codex or Claude evidence.";
    }
    else if (externalIssues > 0) {
        qualifier = " Some real-agent outcomes were incomplete and qualify the aggregate answers.";
    }
    return {
        doesMyDevKitSaveTokens: tokenAnswer,
        doesMyDevKitPreserveCorrectness: correctnessAnswer,
        doesMyDevKitReduceExecutionTime: timingAnswer,
        completedRuns: args.summary.completedRuns,
        failedRuns: args.summary.failedRuns,
        unavailableRuns: args.summary.unavailableRuns,
        limitReachedRuns: args.summary.limitReachedRuns,
        timeoutRuns: args.summary.timeoutRuns,
        invalidOutputRuns: args.summary.invalidOutputRuns,
        comparisonReliabilityCounts: reliabilityCounts,
        summaryText: [
            `Completed ${args.summary.completedRuns} of ${args.summary.totalRuns} runs across ${args.summary.totalComparisons} comparisons.`,
            `Token savings: ${tokenAnswer}. Correctness preserved: ${correctnessAnswer}. Execution time reduced: ${timingAnswer}.`,
            qualifier.trim()
        ]
            // An empty qualifier is dropped so no trailing space is produced.
            .filter(Boolean)
            .join(" ")
    };
}
|
|
143
|
+
/**
 * Reads and parses a JSON artifact that the report cannot be built without.
 *
 * @param {string} experimentDir - Directory containing the artifact.
 * @param {string} fileName - Artifact file name inside experimentDir.
 * @returns {Promise<any>} The parsed JSON value.
 * @throws {Error} When the file cannot be read or parsed. The message names the
 *   artifact, and the underlying error is preserved as `cause` so callers can
 *   tell a missing file (e.g. ENOENT) from malformed JSON.
 */
async function readRequiredJson(experimentDir, fileName) {
    const filePath = path.join(experimentDir, fileName);
    try {
        return JSON.parse(await readFile(filePath, "utf8"));
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        throw new Error(`Missing or invalid required experiment artifact ${fileName}: ${message}`, { cause: error });
    }
}
|
|
153
|
+
/**
 * Builds one prompt-comparison section per run, with a length-capped excerpt
 * of the prompt text.
 *
 * The prompt is read from the run's recorded prompt path, or from
 * `<experimentDir>/runs/<runId>/prompt.txt` when none is recorded. If the file
 * cannot be read, the run's `promptTextForArtifact` (or an empty string) is
 * used and a warning is recorded.
 *
 * @param {object} args
 * @param {string} args.experimentDir - Experiment directory used for the default prompt path.
 * @param {object[]} args.runs - Experiment runs.
 * @param {string[]} args.warnings - Receives a warning per unreadable prompt file.
 * @param {number} args.maxPromptChars - Maximum excerpt length before truncation.
 * @returns {Promise<object[]>} Prompt sections in run order.
 */
async function buildPromptSections(args) {
    const sections = [];
    // Sequential on purpose: keeps sections and warnings in run order.
    for (const run of args.runs) {
        // A run may lack `artifactPaths` entirely; fall back to the default
        // location instead of throwing on the property access.
        const promptPath = run.artifactPaths?.promptPath ?? path.join(args.experimentDir, "runs", run.runId, "prompt.txt");
        let promptText = run.promptTextForArtifact ?? "";
        try {
            promptText = await readFile(promptPath, "utf8");
        }
        catch {
            // The prompt file is optional; keep the in-memory fallback text.
            args.warnings.push(`Optional prompt artifact missing for run ${run.runId}.`);
        }
        const truncated = promptText.length > args.maxPromptChars;
        sections.push({
            runId: run.runId,
            caseId: run.caseId,
            agentId: run.agentId,
            strategy: run.promptStrategy,
            complexityLevel: run.promptComplexityLevel,
            promptPath,
            promptExcerpt: truncated ? `${promptText.slice(0, args.maxPromptChars)}\n...` : promptText,
            promptWasTruncated: truncated,
            metrics: run.promptMetrics
        });
    }
    return sections;
}
|
|
179
|
+
/**
 * Wraps a benchmark project profile together with the subset of its complexity
 * metrics selected by pickComplexityMetrics.
 *
 * @param {object} profile - Benchmark project profile with a `complexityMetrics` object.
 * @returns {{ profile: object, complexityMetrics: object }}
 */
function buildProjectSection(profile) {
    const complexityMetrics = pickComplexityMetrics(profile.complexityMetrics);
    return { profile, complexityMetrics };
}
|
|
185
|
+
/**
 * Copies the reported complexity metrics into a new object, in a fixed key
 * order, ignoring any other properties on the input.
 *
 * @param {object} metrics - Source metrics object.
 * @returns {object} New object holding exactly the reported metric keys.
 */
function pickComplexityMetrics(metrics) {
    const reportedKeys = [
        "fileCount",
        "sourceFileCount",
        "testFileCount",
        "totalLinesOfCode",
        "sourceLinesOfCode",
        "testLinesOfCode",
        "languageCount",
        "internalImportCount",
        "exportedSymbolEstimate",
        "taskCount",
        "expectedRelevantFilesAverage",
        "expectedRelevantSymbolsAverage",
        "maxFileLines",
        "averageFileLines"
    ];
    // Keys absent from the source are still present in the result (as undefined).
    return Object.fromEntries(reportedKeys.map((key) => [key, metrics[key]]));
}
|
|
203
|
+
/**
 * Produces a capped view of a project's file tree.
 *
 * @param {object} profile - Project profile exposing `projectId` and `fileTree.entries`.
 * @param {number} maxEntries - Maximum number of entries to keep.
 * @returns {{ projectId: *, entries: Array, totalEntries: number, truncated: boolean }}
 */
function buildFileTreeSection(profile, maxEntries) {
    const projectId = profile.projectId;
    const allEntries = profile.fileTree.entries;
    const totalEntries = allEntries.length;
    const entries = allEntries.slice(0, maxEntries);
    // `truncated` tells the reader that more entries exist than are listed.
    const truncated = totalEntries > maxEntries;
    return { projectId, entries, totalEntries, truncated };
}
|
|
211
|
+
/**
 * Flattens an evaluation case into the shape used by the report.
 *
 * Expected files and symbols come from the case's answer key when it provides
 * them, otherwise from the case itself; expected facts and the minimum
 * correct-fact count come only from the answer key (defaulting to [] and 0).
 *
 * @param {object} evaluationCase - Evaluation case, optionally carrying `answerKey`.
 * @returns {object} Report section for the case.
 */
function buildBenchmarkCaseSection(evaluationCase) {
    const { answerKey } = evaluationCase;
    // Read one answer-key field, falling back when the key or field is nullish.
    const fromAnswerKey = (field, fallback) => answerKey?.[field] ?? fallback;
    const section = {
        caseId: evaluationCase.id,
        title: evaluationCase.title,
        benchmarkProject: evaluationCase.benchmarkProject,
        query: evaluationCase.query,
        expectedOperation: evaluationCase.expectedOperation,
        expectedFiles: fromAnswerKey("expectedFiles", evaluationCase.expectedFiles),
        expectedSymbols: fromAnswerKey("expectedSymbols", evaluationCase.expectedSymbols),
        expectedFacts: fromAnswerKey("expectedFacts", []),
        minimumCorrectFacts: fromAnswerKey("minimumCorrectFacts", 0),
        notes: evaluationCase.notes
    };
    return section;
}
|
|
226
|
+
/**
 * Produces the methodology statements shown in the report: four derived from
 * the experiment summary, followed by five fixed statements.
 *
 * @param {object} summary - Experiment summary with `strategies`, `complexityLevels`,
 *   `agents` arrays and an optional `projectProfilesPath`.
 * @returns {string[]} Methodology sentences in display order.
 */
function buildMethodology(summary) {
    const commaList = (items) => items.join(", ");
    const profilesSource = summary.projectProfilesPath ?? "benchmark project profiles";
    const experimentSpecific = [
        `Benchmark projects were selected from ${profilesSource}.`,
        `Prompt strategies tested: ${commaList(summary.strategies)}.`,
        `Prompt complexity levels tested: ${commaList(summary.complexityLevels)}.`,
        `Agents tested: ${commaList(summary.agents)}.`
    ];
    const fixedStatements = [
        "Run statuses distinguish completed, failed, skipped, unavailable, external limit, timeout, and invalid-output outcomes.",
        "Correctness scoring is deterministic and answer-key based; no semantic LLM judging is used.",
        "Token comparison uses agent session token totals only when both paired runs provide totalTokens.",
        "Timing comparison uses measured run duration from normalized agent run results.",
        "Provider telemetry dashboards and OpenTelemetry collection are not part of this report."
    ];
    return [...experimentSpecific, ...fixedStatements];
}
|
|
239
|
+
/**
 * Returns the fixed formula descriptions (correctness, tokens, timing) shown in
 * the report. A fresh array of fresh objects is created on every call.
 *
 * @returns {Array<{ id: string, title: string, formula: string, notes: string[] }>}
 */
function buildFormulaSections() {
    const describe = (id, title, formula, notes) => ({ id, title, formula, notes });
    const correctness = describe(
        "correctness",
        "Correctness Score",
        "correctnessScore = 0.25 * fileMatchScore + 0.25 * symbolMatchScore + 0.50 * factMatchScore",
        [
            "Pass condition: required facts found unless minimumCorrectFacts applies.",
            "Pass condition: found facts >= minimumCorrectFacts.",
            "Pass condition: correctnessScore >= 0.70."
        ]
    );
    const tokens = describe(
        "tokens",
        "Token Savings",
        "tokenDelta = rawTotalTokens - myDevKitTotalTokens; tokenSavingsPercent = tokenDelta / rawTotalTokens * 100",
        [
            "Computed only when both paired runs expose totalTokens and rawTotalTokens is greater than zero.",
            "Prompt estimated tokens are shown separately and are not substituted for provider session tokens."
        ]
    );
    const timing = describe(
        "timing",
        "Execution Time Reduction",
        "durationDeltaMs = rawDurationMs - myDevKitDurationMs; durationReductionPercent = durationDeltaMs / rawDurationMs * 100",
        ["Computed only when both paired runs expose durationMs and rawDurationMs is greater than zero."]
    );
    return [correctness, tokens, timing];
}
|
|
268
|
+
/**
 * Lists the caveats shown in the report. When there is at least one run and
 * every run used the "fake-agent" agent, a fake-agent caveat is placed first.
 *
 * @param {object[]} runs - Experiment runs; only `agentId` is inspected.
 * @returns {string[]} Limitation sentences in display order.
 */
function buildLimitations(runs) {
    const limitations = [
        "Codex and Claude may fail due external account, usage, or session limits.",
        "Token usage depends on agent output availability.",
        "Prompt estimated tokens are not provider session tokens.",
        "Correctness scoring is answer-key based and deterministic, not semantic.",
        "Charts are deterministic static SVG summaries, not provider telemetry dashboards.",
        "Visualization demos are bounded command smoke checks and may report unsupported graph commands as warnings."
    ];
    // An empty run list never counts as fake-only.
    const hasRealAgentRun = runs.some((run) => run.agentId !== "fake-agent");
    const fakeOnly = runs.length !== 0 && !hasRealAgentRun;
    if (fakeOnly) {
        limitations.unshift("fake-agent results are deterministic smoke results, not real Codex or Claude evidence.");
    }
    return limitations;
}
|
|
280
|
+
/**
 * Builds the list of artifact links shown in an experiment report.
 *
 * The list starts with the metric glossary (resolved from the current working
 * directory) and the four experiment-level JSON files, followed by a prompt link
 * and a correctness-score link for every run.
 *
 * @param {string} experimentDir - Directory holding the experiment artifacts.
 * @param {Array<{runId: string, artifactPaths: {promptPath?: string, correctnessScorePath?: string}}>} runs
 *   Runs to link; a recorded artifact path wins over the conventional
 *   `<experimentDir>/runs/<runId>/...` location.
 * @returns {Array<{label: string, path: string, kind: string}>} Link descriptors.
 */
function buildArtifactLinks(experimentDir, runs) {
    const experimentJson = (label, fileName) => ({
        label,
        path: path.join(experimentDir, fileName),
        kind: "json"
    });
    const runFile = (runId, fileName) => path.join(experimentDir, "runs", runId, fileName);
    const experimentLinks = [
        { label: "Metric glossary", path: path.resolve("docs", "METRICS.md"), kind: "markdown" },
        experimentJson("Experiment summary", "experiment-summary.json"),
        experimentJson("Experiment runs", "experiment-runs.json"),
        experimentJson("Experiment comparisons", "experiment-comparisons.json"),
        experimentJson("Experiment config", "experiment-config.json")
    ];
    const runLinks = runs.flatMap((run) => {
        const promptLink = {
            label: `${run.runId} prompt`,
            path: run.artifactPaths.promptPath ?? runFile(run.runId, "prompt.txt"),
            kind: "text"
        };
        const correctnessLink = {
            label: `${run.runId} correctness`,
            path: run.artifactPaths.correctnessScorePath ?? runFile(run.runId, "correctness-score.json"),
            kind: "json"
        };
        return [promptLink, correctnessLink];
    });
    return [...experimentLinks, ...runLinks];
}
|
|
297
|
+
/**
 * Reduces a list of per-comparison deltas to a yes/no/mixed answer.
 *
 * Only finite numbers are considered; everything else is ignored.
 *
 * @param {Array<unknown>} values - Delta values, possibly containing non-numbers.
 * @returns {"unavailable"|"yes"|"no"|"mixed"} "unavailable" when no finite number
 *   is present, "yes" when strictly more than half of the usable values are
 *   positive, "no" when strictly more than half are negative, otherwise "mixed".
 */
function answerFromComparisonValues(values) {
    let usableCount = 0;
    let positiveCount = 0;
    let negativeCount = 0;
    for (const value of values) {
        if (typeof value !== "number" || !Number.isFinite(value)) {
            continue;
        }
        usableCount += 1;
        if (value > 0) {
            positiveCount += 1;
        }
        else if (value < 0) {
            negativeCount += 1;
        }
    }
    if (usableCount === 0) {
        return "unavailable";
    }
    const majorityThreshold = usableCount / 2;
    if (positiveCount > majorityThreshold) {
        return "yes";
    }
    return negativeCount > majorityThreshold ? "no" : "mixed";
}
|
|
310
|
+
/**
 * Answers whether correctness was preserved across paired comparisons.
 *
 * Only comparisons where both the raw run and the my-dev-kit run have status
 * "completed" are counted.
 *
 * @param {Array<{rawStatus: string, myDevKitStatus: string, sameCorrectnessPass?: boolean}>} comparisons
 * @returns {"unavailable"|"inconclusive"|"yes"|"no"|"mixed"} "unavailable" when
 *   there are no comparisons, "inconclusive" when none completed on both sides,
 *   "yes"/"no" when all/none of the completed pairs have `sameCorrectnessPass`,
 *   otherwise "mixed".
 */
function answerCorrectness(comparisons) {
    const tally = comparisons.reduce((totals, comparison) => {
        const bothCompleted = comparison.rawStatus === "completed" && comparison.myDevKitStatus === "completed";
        if (bothCompleted) {
            totals.paired += 1;
            if (comparison.sameCorrectnessPass) {
                totals.preserved += 1;
            }
        }
        return totals;
    }, { paired: 0, preserved: 0 });
    if (tally.paired === 0) {
        return comparisons.length === 0 ? "unavailable" : "inconclusive";
    }
    if (tally.preserved === tally.paired) {
        return "yes";
    }
    return tally.preserved === 0 ? "no" : "mixed";
}
|
|
322
|
+
/**
 * Counts how often each value occurs.
 *
 * Tallies are accumulated in a Map and converted to a plain object at the end,
 * so values that collide with `Object.prototype` members ("constructor",
 * "toString", "__proto__", ...) are counted like any other key instead of
 * picking up the inherited property.
 *
 * @param {Array<unknown>} values - Values to tally; each is used as a property key
 *   (non-symbol values are converted to strings).
 * @returns {Record<string, number>} Plain object mapping each key to its count.
 */
function countBy(values) {
    const tallies = values.reduce((seen, value) => {
        // Mirror property-key coercion so 1 and "1" share a bucket.
        const key = typeof value === "symbol" ? value : String(value);
        return seen.set(key, (seen.get(key) ?? 0) + 1);
    }, new Map());
    // Object.fromEntries defines own data properties, so "__proto__" is kept as a key.
    return Object.fromEntries(tallies);
}
|
|
328
|
+
/**
 * Removes duplicate entries while keeping first-occurrence order.
 *
 * @param {Iterable<unknown>} values - Values to de-duplicate (Set equality).
 * @returns {Array<unknown>} New array containing each distinct value once.
 */
function unique(values) {
    const distinct = new Set(values);
    return Array.from(distinct);
}
|
|
331
|
+
/**
 * Reports whether `access` succeeds for the given path.
 *
 * Despite the "assert" name this never throws: any error raised by `access`
 * is turned into a `false` result.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `access(filePath)` resolves, otherwise `false`.
 */
export async function assertExperimentReportCanRead(filePath) {
    let readable = true;
    try {
        await access(filePath);
    }
    catch {
        readable = false;
    }
    return readable;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
/**
 * Assembles the report object for a plugin experiment run.
 *
 * @param {object} args
 * @param {object} args.run - The experiment run (cases, variants, metrics,
 *   artifacts, warnings, failures, target, ...).
 * @param {object} args.plugin - Plugin descriptor, copied into the report as-is.
 * @param {string} [args.outputRoot] - Output directory; falls back to
 *   `run.metadata.outputRoot` and then to `null`.
 * @param {string} [args.generatedAt] - Timestamp; defaults to the current time in ISO format.
 * @returns {object} Report with metadata, per-variant and per-case summaries,
 *   findings, interpretation, and a copy of the run under `rawRun`. Artifact
 *   paths are passed through `relativizeArtifacts` with the output root.
 */
export function buildPluginExperimentReport(args) {
    const { run } = args;
    const outputRoot = args.outputRoot ?? readString(run.metadata?.outputRoot) ?? null;
    const allOutcomes = run.cases.flatMap(({ outcomes }) => outcomes);
    const metadata = {
        generatedAt: args.generatedAt ?? new Date().toISOString(),
        runId: run.runId,
        startedAt: run.startedAt,
        completedAt: run.completedAt ?? null,
        status: run.status,
        outputRoot,
    };
    const target = {
        ...run.target,
        mode: run.target.isSelf ? "self" : "external target",
    };
    return {
        metadata,
        plugin: args.plugin,
        target,
        summary: run.summary ?? null,
        variants: run.variants.map((variant) => buildVariantSummary(variant, allOutcomes)),
        cases: run.cases.map((experimentCase) => buildCaseSummary(experimentCase)),
        metrics: run.metrics,
        artifacts: relativizeArtifacts(run.artifacts, outputRoot),
        warnings: run.warnings,
        failures: run.failures,
        skippedOutcomes: allOutcomes.filter((outcome) => outcome.status === "skipped"),
        findings: buildFindings(run),
        interpretation: buildInterpretation(run),
        // The raw run is embedded too, with its own relativized copy of the artifacts.
        rawRun: {
            ...run,
            artifacts: relativizeArtifacts(run.artifacts, outputRoot),
        },
    };
}
|
|
35
|
+
/**
 * Summarizes the outcomes that belong to one variant.
 *
 * @param {object} variant - Variant descriptor; its fields are copied into the result.
 * @param {Array<{variantId: string, status: string, metrics: Array<object>}>} outcomes
 *   Outcomes from every case; only those whose `variantId` equals `variant.id` are used.
 * @returns {object} The variant fields plus outcome counts per status and the
 *   flattened metrics of the matching outcomes.
 */
function buildVariantSummary(variant, outcomes) {
    const ownOutcomes = outcomes.filter((candidate) => candidate.variantId === variant.id);
    const tally = (status) => countStatus(ownOutcomes, status);
    return {
        ...variant,
        outcomeCount: ownOutcomes.length,
        completedOutcomes: tally("completed"),
        partialOutcomes: tally("partial"),
        failedOutcomes: tally("failed"),
        skippedOutcomes: tally("skipped"),
        metrics: ownOutcomes.flatMap(({ metrics }) => metrics),
    };
}
|
|
47
|
+
/**
 * Summarizes one experiment case.
 *
 * @param {{id: string, name: string, outcomes: Array<{status: string}>}} experimentCase
 * @returns {object} Case id and name, the aggregate status from `summarizeStatus`,
 *   outcome counts per status, and the original outcomes array.
 */
function buildCaseSummary(experimentCase) {
    const { id, name, outcomes } = experimentCase;
    const tally = (status) => countStatus(outcomes, status);
    return {
        id,
        name,
        status: summarizeStatus(outcomes),
        outcomeCount: outcomes.length,
        completedOutcomes: tally("completed"),
        partialOutcomes: tally("partial"),
        failedOutcomes: tally("failed"),
        skippedOutcomes: tally("skipped"),
        outcomes,
    };
}
|
|
60
|
+
/**
 * Flattens warnings, failures and skipped outcomes into one findings list.
 *
 * @param {{warnings: Array<object>, failures: Array<object>, cases: Array<{outcomes: Array<object>}>}} run
 * @returns {Array<{severity: string, code: string, message: string, caseId: *, variantId: *}>}
 *   Warnings first, then failures, then one "outcome-skipped" finding per
 *   outcome whose status is "skipped".
 */
function buildFindings(run) {
    // Skipped outcomes are collected first, case by case, in their original order.
    const skipFindings = [];
    run.cases.forEach((experimentCase) => {
        experimentCase.outcomes.forEach((outcome) => {
            if (outcome.status !== "skipped") {
                return;
            }
            skipFindings.push({
                severity: "skip",
                code: "outcome-skipped",
                message: `Outcome skipped: ${outcome.id}`,
                caseId: outcome.caseId,
                variantId: outcome.variantId,
            });
        });
    });
    // Warnings and failures share a shape; only the severity label differs.
    const asFinding = (severity) => ({ code, message, caseId, variantId }) => ({
        severity,
        code,
        message,
        caseId,
        variantId,
    });
    const warningFindings = run.warnings.map(asFinding("warning"));
    const failureFindings = run.failures.map(asFinding("failure"));
    return [...warningFindings, ...failureFindings, ...skipFindings];
}
|
|
88
|
+
/**
 * Produces a short human-readable interpretation of a run.
 *
 * For the "context-strategy-comparison" plugin the summary names the
 * best-supported strategy, derived from the average token-savings and
 * correctness-delta metrics, and lists whichever averages are available.
 * Every other plugin gets a generic status sentence.
 *
 * @param {{pluginId: string, status: string, metrics: Array<object>, warnings: Array<object>, failures: Array<object>}} run
 * @returns {{summary: string, recommendedNextStep: string}}
 */
function buildInterpretation(run) {
    if (run.pluginId !== "context-strategy-comparison") {
        const summary = `Experiment ${run.pluginId} finished with status ${run.status}.`;
        const needsReview = run.failures.length > 0 || run.warnings.length > 0;
        return {
            summary,
            recommendedNextStep: needsReview
                ? "Review warnings and failures before comparing results."
                : "Use the JSON report as the source of truth for follow-up analysis.",
        };
    }
    const tokenSavings = metricNumber(run.metrics, "average-token-savings-percent");
    const correctnessDelta = metricNumber(run.metrics, "average-correctness-delta");
    const durationReduction = metricNumber(run.metrics, "average-duration-reduction-percent");
    // Guided wins only with positive token savings and no correctness loss;
    // a correctness loss otherwise favors raw; anything else is inconclusive.
    let better = "inconclusive";
    if (tokenSavings !== undefined && tokenSavings > 0 && (correctnessDelta ?? 0) >= 0) {
        better = "my-dev-kit-guided";
    }
    else if (correctnessDelta !== undefined && correctnessDelta < 0) {
        better = "raw-full-file";
    }
    const sentences = [
        "raw-full-file vs my-dev-kit-guided comparison",
        `Best-supported strategy: ${better}.`,
    ];
    if (tokenSavings !== undefined) {
        sentences.push(`Average token savings: ${tokenSavings}%.`);
    }
    if (durationReduction !== undefined) {
        sentences.push(`Average duration reduction: ${durationReduction}%.`);
    }
    if (correctnessDelta !== undefined) {
        sentences.push(`Average correctness delta: ${correctnessDelta}.`);
    }
    let recommendedNextStep = "Review warnings, skipped outcomes, and failures before using this run as evidence.";
    if (run.status === "completed") {
        recommendedNextStep = "Review case-level outcomes and repeat with real agents if this was a fake-agent smoke run.";
    }
    return {
        summary: sentences.join(" "),
        recommendedNextStep,
    };
}
|
|
120
|
+
/**
 * Rewrites artifact paths so they are relative to the output root when the
 * artifact lives inside it.
 *
 * A path is left untouched when it equals the output root, lies outside it, or
 * (on Windows) resolves to a different drive. "Outside" is decided on the
 * leading `..` path segment, so in-root names that merely begin with two dots
 * (for example `..cache/file.json`) are still relativized.
 *
 * @param {Array<{path?: string}>} artifacts - Artifact descriptors.
 * @param {string|null|undefined} outputRoot - Root directory; when falsy the
 *   input array is returned unchanged.
 * @returns {Array<object>} The input array, or a new array in which artifacts with
 *   a path are shallow copies (artifacts without a path are passed through).
 */
function relativizeArtifacts(artifacts, outputRoot) {
    if (!outputRoot) {
        return artifacts;
    }
    return artifacts.map((artifact) => {
        if (!artifact.path) {
            return artifact;
        }
        const relative = path.relative(outputRoot, artifact.path);
        // Compare whole segments: "../x" escapes the root, "..x" does not.
        const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
        const isInsideRoot = relative !== "" && !escapesRoot && !path.isAbsolute(relative);
        return {
            ...artifact,
            path: isInsideRoot ? relative : artifact.path,
        };
    });
}
|
|
133
|
+
/**
 * Counts the outcomes that have exactly the given status.
 *
 * @param {Array<{status: string}>} outcomes
 * @param {string} status - Status to match with strict equality.
 * @returns {number} Number of matching outcomes.
 */
function countStatus(outcomes, status) {
    return outcomes.reduce((matches, outcome) => (outcome.status === status ? matches + 1 : matches), 0);
}
|
|
136
|
+
/**
 * Collapses the statuses of a case's outcomes into a single status.
 *
 * @param {Array<{status: string}>} outcomes
 * @returns {"skipped"|"completed"|"partial"|"failed"} "skipped" for no outcomes
 *   or all-skipped, "completed" when every outcome completed, "partial" when
 *   at least one (but not all) completed, otherwise "failed".
 */
function summarizeStatus(outcomes) {
    if (outcomes.length === 0) {
        return "skipped";
    }
    let completedCount = 0;
    let skippedCount = 0;
    let otherCount = 0;
    outcomes.forEach(({ status }) => {
        if (status === "completed") {
            completedCount += 1;
        }
        else if (status === "skipped") {
            skippedCount += 1;
        }
        else {
            otherCount += 1;
        }
    });
    if (skippedCount === 0 && otherCount === 0) {
        return "completed";
    }
    if (completedCount === 0 && otherCount === 0) {
        return "skipped";
    }
    return completedCount > 0 ? "partial" : "failed";
}
|
|
147
|
+
/**
 * Looks up a metric by id and returns its value when it is a finite number.
 *
 * @param {Array<{id: string, value?: unknown}>} metrics
 * @param {string} id - Metric id to find; the first match is used.
 * @returns {number|undefined} The finite numeric value, or `undefined` when the
 *   metric is missing or its value is not a finite number.
 */
function metricNumber(metrics, id) {
    for (const metric of metrics) {
        if (metric.id !== id) {
            continue;
        }
        // Number.isFinite is false for every non-number, so no separate typeof check is needed.
        return Number.isFinite(metric.value) ? metric.value : undefined;
    }
    return undefined;
}
|
|
151
|
+
/**
 * Returns the value when it is a string with visible content.
 *
 * Whitespace-only strings count as missing, which is the same rule the report
 * writer applies when it reads the output root from run metadata; without it a
 * blank output root would be accepted here and used for path relativization.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string|undefined} The original (untrimmed) string, or `undefined`.
 */
function readString(value) {
    return typeof value === "string" && value.trim() ? value : undefined;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
 * Renders a plugin experiment report as a standalone HTML document.
 *
 * The page has an inline stylesheet and these sections: a header (plugin name,
 * description, status badge, generation time and run id), "Plugin And Target",
 * "Interpretation", "Variants", "Cases", "Metrics", "Warnings, Skips, And
 * Failures" and "Artifacts". Values interpolated directly are passed through
 * `escapeHtml`; tabular data is rendered with `table`. When `report.findings`
 * is empty the findings section shows a short placeholder paragraph instead of
 * a table.
 *
 * @param {object} report - Report object; reads `plugin`, `metadata`, `target`,
 *   `interpretation`, `variants`, `cases`, `metrics`, `findings` and `artifacts`.
 * @returns {string} The complete HTML document.
 */
export function renderPluginExperimentReportHtml(report) {
    return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>${escapeHtml(report.plugin.name)} Report</title>
<style>
:root { color-scheme: light; font-family: Arial, Helvetica, sans-serif; }
body { margin: 0; color: #18212b; background: #f6f8fb; }
main { max-width: 1120px; margin: 0 auto; padding: 28px; }
h1 { margin: 0 0 6px; font-size: 32px; }
h2 { margin: 0 0 12px; font-size: 22px; }
h3 { margin: 14px 0 8px; font-size: 17px; }
p, li { line-height: 1.45; }
section, .hero { background: #fff; border: 1px solid #d7e0ea; border-radius: 8px; padding: 18px; margin-bottom: 16px; }
table { width: 100%; border-collapse: collapse; margin: 8px 0 16px; background: #fff; }
th, td { border: 1px solid #d9e1ea; padding: 8px 10px; text-align: left; vertical-align: top; }
th { background: #edf2f7; }
code { font-family: Consolas, "Courier New", monospace; }
.muted { color: #5b6b7b; }
.grid { display: grid; grid-template-columns: repeat(3, minmax(0, 1fr)); gap: 12px; }
.card { border: 1px solid #d9e1ea; border-radius: 6px; padding: 12px; background: #fbfcfe; }
.badge { display: inline-block; border-radius: 999px; padding: 3px 9px; font-size: 12px; font-weight: 700; text-transform: uppercase; background: #e8eef7; }
.status-completed { background: #dcf4e4; color: #0c6b30; }
.status-partial, .status-skipped { background: #fff0ce; color: #785400; }
.status-failed { background: #fde2e2; color: #8a1f1f; }
@media (max-width: 860px) { main { padding: 14px; } .grid { grid-template-columns: 1fr; } table { font-size: 13px; } }
</style>
</head>
<body>
<main>
<header class="hero">
<h1>${escapeHtml(report.plugin.name)}</h1>
<p>${escapeHtml(report.plugin.description)}</p>
<p><span class="badge status-${escapeHtml(report.metadata.status)}">${escapeHtml(report.metadata.status)}</span></p>
<p class="muted">Generated ${escapeHtml(report.metadata.generatedAt)} for run <code>${escapeHtml(report.metadata.runId)}</code></p>
</header>

<section>
<h2>Plugin And Target</h2>
${table(["Field", "Value"], [
["Plugin ID", report.plugin.id],
["Plugin schema", report.plugin.schemaVersion],
["Mode", report.target.mode],
["Tool root", report.target.toolRoot],
["Target root", report.target.targetRoot],
["Target package", report.target.packageName ? `${report.target.packageName}@${report.target.packageVersion ?? "unknown"}` : "unavailable"],
["Target branch/commit", report.target.hasGit ? `${report.target.branch ?? "unknown"} / ${report.target.commit ?? "unknown"}` : "unavailable"],
["Has package.json", String(report.target.hasPackageJson)],
["Has lockfile", String(report.target.hasLockfile)],
["Output root", report.metadata.outputRoot ?? "unavailable"],
])}
</section>

<section>
<h2>Interpretation</h2>
<p>${escapeHtml(report.interpretation.summary)}</p>
<p><strong>Recommended next step:</strong> ${escapeHtml(report.interpretation.recommendedNextStep)}</p>
</section>

<section>
<h2>Variants</h2>
${table(["Variant", "Completed", "Partial", "Failed", "Skipped"], report.variants.map((variant) => [
`${variant.name} (${variant.id})`,
String(variant.completedOutcomes),
String(variant.partialOutcomes),
String(variant.failedOutcomes),
String(variant.skippedOutcomes),
]))}
</section>

<section>
<h2>Cases</h2>
${table(["Case", "Status", "Completed", "Partial", "Failed", "Skipped"], report.cases.map((experimentCase) => [
`${experimentCase.name} (${experimentCase.id})`,
experimentCase.status,
String(experimentCase.completedOutcomes),
String(experimentCase.partialOutcomes),
String(experimentCase.failedOutcomes),
String(experimentCase.skippedOutcomes),
]))}
</section>

<section>
<h2>Metrics</h2>
${table(["Metric", "Value", "Unit", "Variant", "Case"], report.metrics.map((metric) => [
`${metric.name} (${metric.id})`,
String(metric.value ?? ""),
metric.unit ?? "",
metric.variantId ?? "",
metric.caseId ?? "",
]))}
</section>

<section>
<h2>Warnings, Skips, And Failures</h2>
${report.findings.length ? table(["Severity", "Code", "Message", "Variant", "Case"], report.findings.map((finding) => [
finding.severity,
finding.code,
finding.message,
finding.variantId ?? "",
finding.caseId ?? "",
])) : "<p>No warnings, skips, or failures.</p>"}
</section>

<section>
<h2>Artifacts</h2>
${table(["Artifact", "Kind", "Path", "Variant", "Case"], report.artifacts.map((artifact) => [
`${artifact.label} (${artifact.id})`,
artifact.kind,
artifact.path ?? "",
artifact.variantId ?? "",
artifact.caseId ?? "",
]))}
</section>
</main>
</body>
</html>`;
}
|
|
121
|
+
/**
 * Renders an HTML table from header labels and rows of cell values.
 *
 * Every header and cell is passed through `escapeHtml` before being placed in
 * the markup.
 *
 * @param {Array<unknown>} headers - Column labels, one `<th>` each.
 * @param {Array<Array<unknown>>} rows - Row data, one `<td>` per cell.
 * @returns {string} `<table>` markup with a `<thead>` and a `<tbody>`.
 */
function table(headers, rows) {
    const headerCells = headers.map((header) => `<th>${escapeHtml(header)}</th>`).join("");
    const bodyRows = rows
        .map((row) => {
            const cells = row.map((cell) => `<td>${escapeHtml(cell)}</td>`).join("");
            return `<tr>${cells}</tr>`;
        })
        .join("");
    return `<table><thead><tr>${headerCells}</tr></thead><tbody>${bodyRows}</tbody></table>`;
}
|
|
126
|
+
/**
 * Escapes a value for safe inclusion in HTML text or a quoted attribute.
 *
 * `null` and `undefined` become the empty string; everything else is converted
 * with `String()` first. The ampersand is replaced before the other characters
 * so the entities produced by later replacements are not escaped a second time.
 *
 * @param {unknown} value - Value to escape.
 * @returns {string} The text with `&`, `<`, `>`, `"` and `'` replaced by HTML entities.
 */
function escapeHtml(value) {
    return String(value ?? "")
        .replaceAll("&", "&amp;")
        .replaceAll("<", "&lt;")
        .replaceAll(">", "&gt;")
        .replaceAll('"', "&quot;")
        .replaceAll("'", "&#39;");
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { resolveWithinRoot } from "../../core/pathSafety.js";
|
|
4
|
+
import { buildPluginExperimentReport } from "./buildPluginExperimentReport.js";
|
|
5
|
+
import { renderPluginExperimentReportHtml } from "./renderPluginExperimentReportHtml.js";
|
|
6
|
+
/**
 * Builds a plugin experiment report and writes it as `report.json` and
 * `report.html` into the output directory.
 *
 * @param {object} args
 * @param {object} args.run - Experiment run to report on.
 * @param {object} args.plugin - Plugin descriptor.
 * @param {string} [args.outputRoot] - Output directory; falls back to a
 *   non-blank `run.metadata.outputRoot`.
 * @param {string} [args.generatedAt] - Timestamp forwarded to the report builder.
 * @returns {Promise<{report: object, outputPaths: {outDir: string, jsonPath: string, htmlPath: string}}>}
 *   The report and the resolved paths; the same pair is what the JSON file contains.
 * @throws {Error} When no output root is available.
 */
export async function writePluginExperimentReports(args) {
    const { run, plugin, generatedAt } = args;
    const requestedRoot = args.outputRoot ?? readString(run.metadata?.outputRoot);
    if (!requestedRoot) {
        throw new Error("Plugin experiment report output root is required.");
    }
    const outDir = path.resolve(requestedRoot);
    const outputPaths = {
        outDir,
        jsonPath: resolveWithinRoot(outDir, "report.json"),
        htmlPath: resolveWithinRoot(outDir, "report.html"),
    };
    await mkdir(outDir, { recursive: true });
    const report = buildPluginExperimentReport({ run, plugin, outputRoot: outDir, generatedAt });
    const result = { report, outputPaths };
    const jsonDocument = `${JSON.stringify(result, null, 2)}\n`;
    await writeFile(outputPaths.jsonPath, jsonDocument, "utf8");
    const htmlDocument = renderPluginExperimentReportHtml(report);
    await writeFile(outputPaths.htmlPath, htmlDocument, "utf8");
    return result;
}
|
|
28
|
+
/**
 * Returns the value when it is a string containing non-whitespace characters.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string|undefined} The original (untrimmed) string, or `undefined`
 *   for non-strings and blank strings.
 */
function readString(value) {
    if (typeof value !== "string") {
        return undefined;
    }
    const hasContent = value.trim().length > 0;
    return hasContent ? value : undefined;
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export * from "./types.js";
|
|
2
|
+
export * from "./renderHtmlReport.js";
|
|
3
|
+
export * from "./writeReportArtifacts.js";
|
|
4
|
+
export * from "./experimentReportTypes.js";
|
|
5
|
+
export * from "./buildExperimentReportInput.js";
|
|
6
|
+
export * from "./renderExperimentHtmlReport.js";
|
|
7
|
+
export * from "./writeExperimentReportArtifacts.js";
|
|
8
|
+
export * from "./experiments/index.js";
|