@dailephd/my-dev-kit-lab 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -0
- package/benchmarks/contracts/benchmark-project-profiles.json +1199 -0
- package/benchmarks/contracts/todo-behavior.md +70 -0
- package/benchmarks/contracts/todo-benchmark-case.json +227 -0
- package/benchmarks/projects/README.md +34 -0
- package/benchmarks/projects/task-analytics-large-mixed/README.md +1 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/__init__.py +3 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/fixtures.py +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/metrics.py +29 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/models.py +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/parser.py +16 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/pipeline.py +9 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/quality.py +8 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/reporting.py +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_metrics.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_parser.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_quality.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_reporting.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/package.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/index.ts +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/analyticsSnapshot.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/project.ts +5 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/task.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/buildProjectLeaderboard.ts +7 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/formatTaskHealthReport.ts +13 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/buildAnalyticsSnapshot.ts +39 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/completeTask.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/createTask.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/listTasksByProject.ts +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/projectStore.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/taskStore.ts +44 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/projectValidation.ts +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/taskValidation.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/buildAnalyticsSnapshot.test.ts +48 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/completeTask.test.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/createTask.test.ts +31 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/listTasksByProject.test.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/reporting.test.ts +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/vitest.config.ts +5 -0
- package/benchmarks/projects/task-workflow-medium-ts/README.md +1 -0
- package/benchmarks/projects/task-workflow-medium-ts/package.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/index.ts +9 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/project.ts +6 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/task.ts +39 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/completeTask.ts +15 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/createTask.ts +26 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/filterTasks.ts +17 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/importTasks.ts +33 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/summarizeTasks.ts +30 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/store/taskStore.ts +76 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/utils/deterministicId.ts +3 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/validation/taskValidation.ts +45 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/completeTask.test.ts +16 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/createTask.test.ts +21 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/filterTasks.test.ts +18 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/importTasks.test.ts +22 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/summarizeTasks.test.ts +29 -0
- package/benchmarks/projects/task-workflow-medium-ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-js/README.md +3 -0
- package/benchmarks/projects/todo-js/package.json +11 -0
- package/benchmarks/projects/todo-js/src/index.js +2 -0
- package/benchmarks/projects/todo-js/src/taskService.js +37 -0
- package/benchmarks/projects/todo-js/src/taskStore.js +28 -0
- package/benchmarks/projects/todo-js/tests/taskService.test.js +45 -0
- package/benchmarks/projects/todo-js/vitest.config.js +5 -0
- package/benchmarks/projects/todo-mixed-ts-py/README.md +3 -0
- package/benchmarks/projects/todo-mixed-ts-py/package.json +13 -0
- package/benchmarks/projects/todo-mixed-ts-py/python/task_service.py +76 -0
- package/benchmarks/projects/todo-mixed-ts-py/src/taskCli.ts +38 -0
- package/benchmarks/projects/todo-mixed-ts-py/tests/mixedBoundary.test.ts +18 -0
- package/benchmarks/projects/todo-mixed-ts-py/tsconfig.json +12 -0
- package/benchmarks/projects/todo-mixed-ts-py/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-python/README.md +3 -0
- package/benchmarks/projects/todo-python/src/__init__.py +4 -0
- package/benchmarks/projects/todo-python/src/task_service.py +32 -0
- package/benchmarks/projects/todo-python/src/task_store.py +28 -0
- package/benchmarks/projects/todo-python/tests/test_task_service.py +52 -0
- package/benchmarks/projects/todo-ts/README.md +3 -0
- package/benchmarks/projects/todo-ts/package.json +12 -0
- package/benchmarks/projects/todo-ts/src/index.ts +2 -0
- package/benchmarks/projects/todo-ts/src/taskService.ts +41 -0
- package/benchmarks/projects/todo-ts/src/taskStore.ts +34 -0
- package/benchmarks/projects/todo-ts/tests/taskService.test.ts +45 -0
- package/benchmarks/projects/todo-ts/tsconfig.json +12 -0
- package/benchmarks/projects/todo-ts/vitest.config.ts +5 -0
- package/dist/scripts/build-gallery.js +3 -0
- package/dist/scripts/capture-demo-report.js +3 -0
- package/dist/scripts/evaluate-token-savings.js +2 -0
- package/dist/scripts/experiments/describeExperiment.js +143 -0
- package/dist/scripts/experiments/listExperiments.js +44 -0
- package/dist/scripts/experiments/runExperiment.js +199 -0
- package/dist/scripts/generate-experiment-plots.js +3 -0
- package/dist/scripts/generate-prompt-variants.js +2 -0
- package/dist/scripts/render-experiment-report.js +2 -0
- package/dist/scripts/run-agent-prompt.js +2 -0
- package/dist/scripts/run-controlled-experiment.js +2 -0
- package/dist/scripts/run-final-demo.js +3 -0
- package/dist/scripts/run-lab-demo.js +5 -0
- package/dist/scripts/run-visualization-demos.js +3 -0
- package/dist/scripts/security/runCodeql.js +57 -0
- package/dist/scripts/security/runDependencyChecks.js +57 -0
- package/dist/scripts/security/runFuzzSmoke.js +29 -0
- package/dist/scripts/security/runPackageChecks.js +56 -0
- package/dist/scripts/security/runSemgrep.js +63 -0
- package/dist/scripts/security/validate.js +117 -0
- package/dist/scripts/verify-benchmarks.js +202 -0
- package/dist/src/agents/adapters/claudeAdapter.js +37 -0
- package/dist/src/agents/adapters/codexAdapter.js +110 -0
- package/dist/src/agents/adapters/fakeAgentAdapter.js +101 -0
- package/dist/src/agents/agentRegistry.js +21 -0
- package/dist/src/agents/index.js +7 -0
- package/dist/src/agents/parseAgentTokenUsage.js +137 -0
- package/dist/src/agents/runAgentPrompt.js +38 -0
- package/dist/src/agents/types.js +1 -0
- package/dist/src/commands/buildGalleryCommand.js +56 -0
- package/dist/src/commands/captureDemoReport.js +116 -0
- package/dist/src/commands/evaluateTokenSavings.js +175 -0
- package/dist/src/commands/generateExperimentPlotsCommand.js +38 -0
- package/dist/src/commands/generatePromptVariants.js +67 -0
- package/dist/src/commands/renderExperimentReportCommand.js +131 -0
- package/dist/src/commands/runAgentPromptCommand.js +132 -0
- package/dist/src/commands/runControlledExperimentCommand.js +174 -0
- package/dist/src/commands/runFinalDemoCommand.js +123 -0
- package/dist/src/commands/runLabDemo.js +62 -0
- package/dist/src/commands/runVisualizationDemosCommand.js +67 -0
- package/dist/src/core/commandLine.js +59 -0
- package/dist/src/core/countTokens.js +8 -0
- package/dist/src/core/fileGlobs.js +100 -0
- package/dist/src/core/localProjectTarget.js +75 -0
- package/dist/src/core/pathSafety.js +19 -0
- package/dist/src/core/pythonCommand.js +30 -0
- package/dist/src/core/resolveCommand.js +110 -0
- package/dist/src/core/runMeasuredCommand.js +143 -0
- package/dist/src/evaluation/benchmarkMetadata.js +207 -0
- package/dist/src/evaluation/buildExperimentMatrix.js +75 -0
- package/dist/src/evaluation/classifyAgentRunOutcome.js +40 -0
- package/dist/src/evaluation/compareExperimentRuns.js +79 -0
- package/dist/src/evaluation/compareTokenSavings.js +47 -0
- package/dist/src/evaluation/controlledExperimentTypes.js +1 -0
- package/dist/src/evaluation/index.js +18 -0
- package/dist/src/evaluation/parseAgentAnswer.js +230 -0
- package/dist/src/evaluation/projectComplexity.js +126 -0
- package/dist/src/evaluation/projectFileTree.js +83 -0
- package/dist/src/evaluation/readEvaluationCases.js +59 -0
- package/dist/src/evaluation/renderTokenSavingsReportInput.js +55 -0
- package/dist/src/evaluation/runControlledExperiment.js +158 -0
- package/dist/src/evaluation/runMyDevKitRetrieval.js +197 -0
- package/dist/src/evaluation/runRawFullFileBaseline.js +31 -0
- package/dist/src/evaluation/scoreCorrectness.js +127 -0
- package/dist/src/evaluation/types.js +1 -0
- package/dist/src/evaluation/writeExperimentArtifacts.js +104 -0
- package/dist/src/evaluation/writeTokenSavingsArtifacts.js +57 -0
- package/dist/src/experiments/config.js +24 -0
- package/dist/src/experiments/defaultRegistry.js +7 -0
- package/dist/src/experiments/errors.js +18 -0
- package/dist/src/experiments/index.js +9 -0
- package/dist/src/experiments/outputPaths.js +25 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/config.js +37 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/index.js +3 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/plugin.js +83 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/resultMapping.js +260 -0
- package/dist/src/experiments/plugins/index.js +1 -0
- package/dist/src/experiments/registry.js +43 -0
- package/dist/src/experiments/results.js +48 -0
- package/dist/src/experiments/runner.js +181 -0
- package/dist/src/experiments/target.js +8 -0
- package/dist/src/experiments/types.js +1 -0
- package/dist/src/gallery/index.js +2 -0
- package/dist/src/gallery/types.js +1 -0
- package/dist/src/gallery/writeGalleryManifest.js +214 -0
- package/dist/src/index.js +12 -0
- package/dist/src/plots/buildExperimentPlotData.js +137 -0
- package/dist/src/plots/index.js +4 -0
- package/dist/src/plots/renderSvgChart.js +82 -0
- package/dist/src/plots/types.js +1 -0
- package/dist/src/plots/writePlotArtifacts.js +46 -0
- package/dist/src/prompts/buildPromptContext.js +68 -0
- package/dist/src/prompts/generateMyDevKitPrompt.js +106 -0
- package/dist/src/prompts/generatePromptVariants.js +36 -0
- package/dist/src/prompts/generateRawFullFilePrompt.js +97 -0
- package/dist/src/prompts/index.js +7 -0
- package/dist/src/prompts/measurePromptComplexity.js +41 -0
- package/dist/src/prompts/types.js +1 -0
- package/dist/src/prompts/writePromptArtifacts.js +43 -0
- package/dist/src/report/buildExperimentReportInput.js +339 -0
- package/dist/src/report/experimentReportTypes.js +1 -0
- package/dist/src/report/experiments/buildPluginExperimentReport.js +153 -0
- package/dist/src/report/experiments/experimentReportModel.js +1 -0
- package/dist/src/report/experiments/index.js +4 -0
- package/dist/src/report/experiments/renderPluginExperimentReportHtml.js +133 -0
- package/dist/src/report/experiments/writePluginExperimentReports.js +30 -0
- package/dist/src/report/index.js +8 -0
- package/dist/src/report/renderExperimentHtmlReport.js +354 -0
- package/dist/src/report/renderHtmlReport.js +103 -0
- package/dist/src/report/types.js +10 -0
- package/dist/src/report/writeExperimentReportArtifacts.js +38 -0
- package/dist/src/report/writeReportArtifacts.js +39 -0
- package/dist/src/screenshot/captureReportScreenshot.js +75 -0
- package/dist/src/screenshot/index.js +2 -0
- package/dist/src/screenshot/types.js +1 -0
- package/dist/src/securityValidation/artifacts.js +15 -0
- package/dist/src/securityValidation/cliAdversarial/adversarialCliConfig.js +38 -0
- package/dist/src/securityValidation/cliAdversarial/dataVolumeChecks.js +194 -0
- package/dist/src/securityValidation/cliAdversarial/jsonStdoutChecks.js +359 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactChecks.js +284 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactFixtures.js +79 -0
- package/dist/src/securityValidation/cliAdversarial/pathBoundaryChecks.js +431 -0
- package/dist/src/securityValidation/cliAdversarial/pathCases.js +144 -0
- package/dist/src/securityValidation/cliAdversarial/readOnlyBoundaryChecks.js +294 -0
- package/dist/src/securityValidation/cliAdversarial/runAdversarialCheck.js +149 -0
- package/dist/src/securityValidation/cliAdversarial/subprocessSafetyChecks.js +214 -0
- package/dist/src/securityValidation/cliAdversarial/tempWorkspace.js +160 -0
- package/dist/src/securityValidation/commandRunner.js +136 -0
- package/dist/src/securityValidation/config.js +39 -0
- package/dist/src/securityValidation/dependencies/parseNpmAudit.js +115 -0
- package/dist/src/securityValidation/dependencies/parseNpmLs.js +71 -0
- package/dist/src/securityValidation/dependencies/parseNpmOutdated.js +41 -0
- package/dist/src/securityValidation/dependencies/runDependencyChecks.js +239 -0
- package/dist/src/securityValidation/dependencies/runOsvScanner.js +43 -0
- package/dist/src/securityValidation/fuzz/fuzzHarness.js +61 -0
- package/dist/src/securityValidation/fuzz/fuzzTargets.js +204 -0
- package/dist/src/securityValidation/fuzz/randomInput.js +0 -0
- package/dist/src/securityValidation/index.js +34 -0
- package/dist/src/securityValidation/packageChecks/forbiddenPackageContents.js +67 -0
- package/dist/src/securityValidation/packageChecks/parseNpmPackDryRun.js +56 -0
- package/dist/src/securityValidation/packageChecks/runPackageChecks.js +88 -0
- package/dist/src/securityValidation/report/renderSecurityReport.js +248 -0
- package/dist/src/securityValidation/report/securityReportTypes.js +1 -0
- package/dist/src/securityValidation/staticScans/codeql.js +66 -0
- package/dist/src/securityValidation/staticScans/semgrep.js +180 -0
- package/dist/src/securityValidation/testMatrix.js +535 -0
- package/dist/src/securityValidation/types.js +34 -0
- package/dist/src/securityValidation/validate/resolveTarget.js +32 -0
- package/dist/src/securityValidation/validate/runSecurityValidation.js +169 -0
- package/dist/src/securityValidation/validate/verdict.js +73 -0
- package/dist/src/visualizationDemos/buildMyDevKitVisualizationCommands.js +59 -0
- package/dist/src/visualizationDemos/index.js +4 -0
- package/dist/src/visualizationDemos/runVisualizationDemos.js +82 -0
- package/dist/src/visualizationDemos/types.js +1 -0
- package/dist/src/visualizationDemos/writeVisualizationDemoArtifacts.js +25 -0
- package/docs/METRICS.md +286 -0
- package/examples/demo-report-input.json +78 -0
- package/examples/lab-demo-cases.json +35 -0
- package/examples/real-agent-campaign-cases.json +118 -0
- package/examples/token-savings-cases.json +122 -0
- package/package.json +91 -0
- package/tests/fixtures/fake-adversarial-cli.js +152 -0
- package/tests/fixtures/fake-my-dev-kit-cli.js +83 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
/**
 * Parses a free-form agent answer into the structured fields used for scoring.
 *
 * Strategies are tried in order: an empty answer yields a "failed" result, a
 * JSON answer (whole text or fenced block) is preferred next, and otherwise
 * `Key: value` lines are read, with markdown headings as a fallback for list
 * fields that are still empty.
 *
 * @param {object} args
 * @param {string} args.text Raw agent output.
 * @param {*} [args.tokenUsage] Copied onto the result unchanged.
 * @param {object} [args.answerKey] Optional answer key forwarded to fact matching.
 * @returns {object} Parsed answer whose `parseStatus` is "parsed", "partial" or "failed".
 */
export function parseAgentAnswer(args) {
    const { tokenUsage, answerKey } = args;
    const text = args.text.trim();
    if (text.length === 0) {
        return emptyParsedAnswer("failed", ["Agent answer was empty."], tokenUsage);
    }
    const fromJson = parseJsonAnswer(text, tokenUsage);
    if (fromJson) {
        return enrichFacts(fromJson, text, answerKey);
    }
    const fields = collectFieldValues(text);
    // Returns the values of the first alias that is present in the field map.
    const lookup = (...aliases) => {
        for (const alias of aliases) {
            const found = fields.get(alias);
            if (found != null) {
                return found;
            }
        }
        return undefined;
    };
    const warnings = [];
    const answerField = fields.get("answer")?.join("\n")?.trim();
    const parsed = {
        answerText: answerField || text,
        relevantFiles: splitListValues(lookup("relevantfiles", "files")),
        relevantSymbols: splitListValues(lookup("relevantsymbols", "symbols")),
        expectedFactsFound: splitListValues(lookup("expectedfactsfound", "facts", "factids")),
        confidence: firstValue(fields.get("confidence")),
        commandsRun: splitListValues(lookup("commandsrun", "commands")),
        selectedContext: splitListValues(lookup("selectedcontext", "context")),
        fullFileReads: splitListValues(fields.get("fullfilereads")),
        fullFileReadJustifications: splitListValues(fields.get("fullfilereadjustifications")),
        parseStatus: "parsed",
        warnings,
        tokenUsage
    };
    // List fields that the key/value pass left empty are retried as markdown sections.
    const sectionFallbacks = [
        ["relevantFiles", ["Relevant Files", "Files"]],
        ["relevantSymbols", ["Relevant Symbols", "Symbols"]],
        ["expectedFactsFound", ["Expected Facts Found", "Facts Found", "Facts"]],
        ["commandsRun", ["Commands Run", "Commands"]]
    ];
    for (const [property, headings] of sectionFallbacks) {
        if (parsed[property].length === 0) {
            parsed[property] = parseMarkdownSection(text, headings);
        }
    }
    parsed.expectedFactsFound = normalizeFactMatches(parsed.expectedFactsFound, text, answerKey);
    const hasAnswerText = parsed.answerText.length > 0;
    const hasScoringFields = parsed.relevantFiles.length > 0 ||
        parsed.relevantSymbols.length > 0 ||
        parsed.expectedFactsFound.length > 0;
    if (!hasAnswerText || !hasScoringFields) {
        parsed.parseStatus = hasAnswerText ? "partial" : "failed";
        warnings.push("Agent answer did not include enough structured fields for full parsing.");
    }
    return parsed;
}
|
|
45
|
+
/**
 * Builds a parsed answer from the first JSON candidate that decodes to a
 * plain (non-array) object.
 *
 * @param {string} text Trimmed agent output.
 * @param {*} tokenUsage Copied onto the result unchanged.
 * @returns {object | undefined} The parsed answer, or `undefined` when no
 *   candidate decodes to an object.
 */
function parseJsonAnswer(text, tokenUsage) {
    for (const candidate of collectJsonCandidates(text)) {
        try {
            const value = JSON.parse(candidate);
            const isRecord = value !== null && typeof value === "object" && !Array.isArray(value);
            if (isRecord) {
                return {
                    answerText: readString(value, "answer", "answerText", "finalAnswer") ?? "",
                    relevantFiles: readStringArray(value, "relevantFiles", "files"),
                    relevantSymbols: readStringArray(value, "relevantSymbols", "symbols"),
                    expectedFactsFound: readStringArray(value, "expectedFactsFound", "facts", "factIds"),
                    confidence: readString(value, "confidence"),
                    commandsRun: readStringArray(value, "commandsRun", "commands"),
                    selectedContext: readStringArray(value, "selectedContext", "context"),
                    fullFileReads: readStringArray(value, "fullFileReads"),
                    fullFileReadJustifications: readStringArray(value, "fullFileReadJustifications"),
                    parseStatus: "parsed",
                    warnings: [],
                    tokenUsage
                };
            }
        }
        catch {
            // Fenced blocks in mixed markdown are frequently not JSON at all;
            // a candidate that fails to decode is skipped on purpose.
        }
    }
    return undefined;
}
|
|
74
|
+
/**
 * Collects substrings of `text` that might be JSON objects.
 *
 * The whole trimmed text comes first when it is wrapped in braces, followed by
 * the body of every fenced code block (optionally tagged `json`) whose trimmed
 * content starts with `{`.
 *
 * @param {string} text Agent output to scan.
 * @returns {string[]} Candidate strings in discovery order; may contain duplicates.
 */
function collectJsonCandidates(text) {
    const wholeText = text.trim();
    const wholeTextCandidates = wholeText.startsWith("{") && wholeText.endsWith("}") ? [wholeText] : [];
    const fencedBodies = Array.from(text.matchAll(/```(?:json)?\s*([\s\S]*?)```/gi), (match) => match[1]?.trim());
    const fencedCandidates = fencedBodies.filter((body) => body?.startsWith("{"));
    return [...wholeTextCandidates, ...fencedCandidates];
}
|
|
88
|
+
/**
 * Scans `text` line by line for `Key: value` fields.
 *
 * A field line may be prefixed by a bullet and the key may be wrapped in up to
 * two markup characters. Later indented or bulleted lines are appended to the
 * most recently seen key.
 *
 * @param {string} text Agent output to scan.
 * @returns {Map<string, string[]>} Normalized key mapped to its collected values.
 */
function collectFieldValues(text) {
    const collected = new Map();
    let activeKey;
    for (const line of text.split(/\r?\n/)) {
        const fieldMatch = line.match(/^\s*(?:[-*]\s*)?(?:[*_`]{0,2})([A-Za-z][A-Za-z0-9 _-]{1,40})(?:[*_`]{0,2})\s*:\s*(.*)$/);
        if (fieldMatch) {
            const [, rawKey, rawValue = ""] = fieldMatch;
            activeKey = normalizeKey(rawKey);
            if (!collected.has(activeKey)) {
                collected.set(activeKey, []);
            }
            collected.get(activeKey).push(stripMarkupOnlyValue(rawValue));
            continue;
        }
        if (!activeKey) {
            continue;
        }
        // Continuation lines are either indented content or bullet items.
        const isIndentedContent = /^\s+[-*]?\s*\S/.test(line);
        const isBulletItem = /^\s*[-*]\s+\S/.test(line);
        if (isIndentedContent || isBulletItem) {
            collected.get(activeKey)?.push(line.trim().replace(/^[-*]\s*/, ""));
        }
    }
    return collected;
}
|
|
106
|
+
/**
 * Extracts list items found under any markdown heading named in `headings`.
 *
 * Every heading line switches collection on or off depending on whether its
 * normalized title matches, so items from several matching sections are
 * merged. Inside a section, bullet lines contribute their text and other
 * lines that contain a comma are split as a list.
 *
 * @param {string} text Agent output to scan.
 * @param {string[]} headings Accepted heading titles (compared after normalization).
 * @returns {string[]} De-duplicated items in the order they were found.
 */
function parseMarkdownSection(text, headings) {
    // Normalize the accepted titles once instead of once per heading line.
    const wantedHeadings = new Set(headings.map(normalizeKey));
    const values = [];
    let inSection = false;
    for (const line of text.split(/\r?\n/)) {
        const heading = line.match(/^\s*#{1,6}\s*(.+?)\s*$/);
        if (heading) {
            // A heading always ends the previous section; it opens a new one
            // only when its title is one we are looking for. No separate
            // "next heading" check is needed after this branch.
            inSection = wantedHeadings.has(normalizeKey(heading[1]));
            continue;
        }
        if (!inSection) {
            continue;
        }
        const bullet = line.match(/^\s*[-*]\s+(.+)$/);
        if (bullet) {
            values.push(bullet[1].trim());
        }
        else if (line.includes(",")) {
            values.push(...splitListValues([line]));
        }
    }
    return unique(values);
}
|
|
132
|
+
/**
 * Finishes a JSON-derived answer: reconciles its facts against the answer
 * key, falls back to the raw text when no answer string was provided, and
 * downgrades the status when no scoring field is populated.
 *
 * @param {object} parsed Result of the JSON parse; mutated in place.
 * @param {string} text Full agent output.
 * @param {object} [answerKey] Optional answer key forwarded to fact matching.
 * @returns {object} The same `parsed` object.
 */
function enrichFacts(parsed, text, answerKey) {
    parsed.expectedFactsFound = normalizeFactMatches(parsed.expectedFactsFound, text, answerKey);
    if (parsed.answerText.length === 0) {
        parsed.answerText = text;
    }
    const scoringFields = [parsed.relevantFiles, parsed.relevantSymbols, parsed.expectedFactsFound];
    const hasNoScoringFields = scoringFields.every((items) => items.length === 0);
    if (hasNoScoringFields) {
        parsed.parseStatus = "partial";
        parsed.warnings.push("JSON agent answer did not include scoring fields.");
    }
    return parsed;
}
|
|
143
|
+
/**
 * Reconciles the facts an agent reported with the expected facts of an
 * answer key.
 *
 * Without an answer key the reported values are only cleaned and
 * de-duplicated. With one, an expected fact counts as found when the agent
 * listed its id or text, mentioned its id anywhere in the answer as a whole
 * token sequence, or quoted its text (longer than 20 normalized characters)
 * verbatim. Matched fact ids are returned first, followed by the cleaned
 * reported values.
 *
 * @param {string[]} values Facts as reported by the agent.
 * @param {string} fullText Full agent output.
 * @param {{ expectedFacts: Array<{ id: string, text: string }> }} [answerKey]
 * @returns {string[]} De-duplicated fact identifiers and cleaned values.
 */
function normalizeFactMatches(values, fullText, answerKey) {
    const cleanedValues = values.map(cleanListItem).filter(Boolean);
    if (!answerKey) {
        return unique(cleanedValues);
    }
    const normalizedValues = new Set(values.map(normalizeMatchText));
    const normalizedFullText = normalizeMatchText(fullText);
    // normalizeMatchText separates tokens with single spaces, so padding both
    // sides turns a substring search into a whole-token search. Without it the
    // id "fact-1" would be credited when the answer only mentions "fact-10".
    const paddedFullText = ` ${normalizedFullText} `;
    const matches = [];
    for (const fact of answerKey.expectedFacts) {
        const normalizedId = normalizeMatchText(fact.id);
        const normalizedFactText = normalizeMatchText(fact.text);
        // An empty key would match any text, so it never counts as evidence.
        const listedById = normalizedId.length > 0 && normalizedValues.has(normalizedId);
        const listedByText = normalizedFactText.length > 0 && normalizedValues.has(normalizedFactText);
        const idMentioned = normalizedId.length > 0 && paddedFullText.includes(` ${normalizedId} `);
        const textQuoted = normalizedFactText.length > 20 && normalizedFullText.includes(normalizedFactText);
        if (listedById || listedByText || idMentioned || textQuoted) {
            matches.push(fact.id);
        }
    }
    return unique([...matches, ...cleanedValues]);
}
|
|
161
|
+
/**
 * Flattens raw field values into individual list items.
 *
 * Each value is split on commas and newlines, every piece is cleaned, and
 * empty pieces and duplicates are dropped.
 *
 * @param {string[] | undefined | null} values Raw values, if any.
 * @returns {string[]} Cleaned, de-duplicated items (empty when `values` is missing).
 */
function splitListValues(values) {
    if (!values) {
        return [];
    }
    const pieces = values.flatMap((value) => value.split(/,|\n/));
    const cleaned = pieces.map(cleanListItem);
    return unique(cleaned.filter(Boolean));
}
|
|
170
|
+
/**
 * Reduces one raw list entry to the bare identifier it names.
 *
 * When the entry contains a backtick code span, only the first span's content
 * is kept. Otherwise the entry is used as is. The result then loses a leading
 * bullet, surrounding quotes or backticks, trailing periods, and any trailing
 * " - description" suffix introduced by a hyphen, en dash or em dash.
 *
 * @param {string} value Raw list entry.
 * @returns {string} Cleaned entry; may be empty.
 */
function cleanListItem(value) {
    const codeSpan = value.match(/`([^`]+)`/);
    return (codeSpan?.[1] ?? value)
        .trim()
        .replace(/^[-*]\s*/, "")
        .replace(/^["'`]+|["'`.]+$/g, "")
        // One pass covers hyphen, en dash (U+2013) and em dash (U+2014). The
        // dashes are written as escapes so an encoding round-trip cannot
        // silently collapse them into plain hyphens.
        .replace(/\s+[-\u2013\u2014]\s+.*$/g, "")
        .trim();
}
|
|
181
|
+
/**
 * Blanks out a value that consists solely of markup characters
 * (`*`, `_`, backticks) and whitespace.
 *
 * @param {string} value Text captured after a field's colon.
 * @returns {string} An empty string for markup-only input, otherwise `value` unchanged.
 */
function stripMarkupOnlyValue(value) {
    if (/^[*_`\s]+$/.test(value)) {
        return "";
    }
    return value;
}
|
|
184
|
+
/**
 * Canonicalizes a field or heading name: lower-cases it and removes every
 * character that is not an ASCII letter or digit.
 *
 * @param {string} value Raw key text.
 * @returns {string} Compact lower-case key, e.g. "Relevant Files" becomes "relevantfiles".
 */
function normalizeKey(value) {
    const lowered = value.toLowerCase();
    // Splitting on the unwanted runs and re-joining with nothing removes them.
    return lowered.split(/[^a-z0-9]+/).join("");
}
|
|
187
|
+
/**
 * Prepares text for loose comparison: lower-cases it, replaces each run of
 * characters other than ASCII letters and digits with one space, and trims
 * the ends.
 *
 * @param {string} value Text to normalize.
 * @returns {string} Space-separated lower-case tokens.
 */
function normalizeMatchText(value) {
    const lowered = value.toLowerCase();
    const spaced = lowered.split(/[^a-z0-9]+/).join(" ");
    return spaced.trim();
}
|
|
190
|
+
/**
 * Returns the first value that still has content after trimming.
 *
 * @param {string[] | undefined | null} values Candidate values, if any.
 * @returns {string | undefined} The trimmed value, or `undefined` when there is none.
 */
function firstValue(values) {
    for (const candidate of values ?? []) {
        const trimmed = candidate.trim();
        if (trimmed.length > 0) {
            return trimmed;
        }
    }
    return undefined;
}
|
|
193
|
+
/**
 * Reads the first property among `keys` whose value is a string.
 *
 * @param {object} value Object to read from.
 * @param {...string} keys Property names in priority order.
 * @returns {string | undefined} The string found (possibly empty), or `undefined`.
 */
function readString(value, ...keys) {
    const matchingKey = keys.find((key) => typeof value[key] === "string");
    return matchingKey === undefined ? undefined : value[matchingKey];
}
|
|
201
|
+
/**
 * Reads a list of strings from the first usable property among `keys`.
 *
 * An array property is returned with its non-string items removed. A string
 * property is split into list items. Properties of any other type are
 * skipped in favour of the next key.
 *
 * @param {object} value Object to read from.
 * @param {...string} keys Property names in priority order.
 * @returns {string[]} The list found, or an empty array.
 */
function readStringArray(value, ...keys) {
    for (const key of keys) {
        const field = value[key];
        if (typeof field === "string") {
            return splitListValues([field]);
        }
        if (Array.isArray(field)) {
            return field.filter((item) => typeof item === "string");
        }
    }
    return [];
}
|
|
213
|
+
/**
 * Removes duplicates while keeping the first occurrence of each value.
 *
 * @param {Iterable<*>} values Values to de-duplicate.
 * @returns {Array<*>} New array with duplicates removed.
 */
function unique(values) {
    const seen = new Set(values);
    return Array.from(seen);
}
|
|
216
|
+
/**
 * Creates a parsed-answer record with no content: an empty answer text and a
 * fresh empty array for every list field.
 *
 * @param {string} parseStatus Status to record on the result.
 * @param {string[]} warnings Warnings to attach (stored by reference).
 * @param {*} tokenUsage Copied onto the result unchanged.
 * @returns {object} The empty parsed answer.
 */
function emptyParsedAnswer(parseStatus, warnings, tokenUsage) {
    const listFieldNames = [
        "relevantFiles",
        "relevantSymbols",
        "expectedFactsFound",
        "commandsRun",
        "selectedContext",
        "fullFileReads",
        "fullFileReadJustifications"
    ];
    return {
        answerText: "",
        // Each field receives its own array so callers can mutate one safely.
        ...Object.fromEntries(listFieldNames.map((name) => [name, []])),
        parseStatus,
        warnings,
        tokenUsage
    };
}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
/**
 * Definition of the benchmark project complexity score.
 *
 * Each factor is divided by its cap, limited to 1, and multiplied by its
 * weight. `weights` and `caps` share the same factor names.
 */
export const PROJECT_COMPLEXITY_FORMULA = {
    id: "benchmark-project-complexity-v1",
    description: "Weighted score using capped normalized source files, source lines, language count, internal imports, max file lines, expected relevant files, and expected relevant symbols.",
    scoreRange: [0, 100],
    normalizedValue: "min(value / cap, 1)",
    // Relative contribution of each factor to the score.
    weights: {
        sourceFileCount: 0.2,
        sourceLinesOfCode: 0.2,
        languageCount: 0.15,
        internalImportCount: 0.15,
        maxFileLines: 0.1,
        expectedRelevantFilesAverage: 0.1,
        expectedRelevantSymbolsAverage: 0.1
    },
    // Value at which a factor stops increasing the score.
    caps: {
        sourceFileCount: 20,
        sourceLinesOfCode: 2000,
        languageCount: 4,
        internalImportCount: 50,
        maxFileLines: 300,
        expectedRelevantFilesAverage: 10,
        expectedRelevantSymbolsAverage: 20
    }
};
|
|
27
|
+
/**
 * Combines project metrics into a single complexity score.
 *
 * Every weighted factor is normalized against its cap, multiplied by its
 * weight and summed. The total is scaled by 100 and rounded to an integer.
 *
 * @param {object} metrics Metrics carrying one numeric property per weighted factor.
 * @returns {number} Rounded complexity score.
 */
export function calculateProjectComplexityScore(metrics) {
    const { weights, caps } = PROJECT_COMPLEXITY_FORMULA;
    // Listed explicitly so the floating-point summation order stays fixed.
    const weightedFactors = [
        "sourceFileCount",
        "sourceLinesOfCode",
        "languageCount",
        "internalImportCount",
        "maxFileLines",
        "expectedRelevantFilesAverage",
        "expectedRelevantSymbolsAverage"
    ];
    let normalizedScore = 0;
    for (const factor of weightedFactors) {
        normalizedScore += weights[factor] * normalize(metrics[factor], caps[factor]);
    }
    return Math.round(normalizedScore * 100);
}
|
|
38
|
+
/**
 * Measures a benchmark project on disk.
 *
 * Only file-tree entries of kind "file" are considered. Entries with role
 * "source" or "test" count as code, and their files are read from
 * `projectRoot` to count lines, imports, exports and definitions.
 *
 * @param {string} projectRoot Directory that entry paths are relative to.
 * @param {{ entries: object[] }} fileTree Entries with `kind`, `role`, `path`,
 *   and optionally `language` and `lines`.
 * @param {object} taskStats Provides `taskCount`, `expectedRelevantFilesAverage`
 *   and `expectedRelevantSymbolsAverage`.
 * @returns {object} The computed metrics record.
 */
export function computeProjectComplexityMetrics(projectRoot, fileTree, taskStats) {
    const absolutePath = (entry) => path.join(projectRoot, entry.path);
    // Applies a per-file counter to every entry and totals the results.
    const sumOver = (entries, measure) => sum(entries.map((entry) => measure(absolutePath(entry))));
    const fileEntries = fileTree.entries.filter((entry) => entry.kind === "file");
    const codeEntries = fileEntries.filter((entry) => entry.role === "source" || entry.role === "test");
    const sourceEntries = fileEntries.filter((entry) => entry.role === "source");
    const testEntries = fileEntries.filter((entry) => entry.role === "test");
    const languages = new Set(codeEntries.map((entry) => entry.language).filter(Boolean));
    // Each code file is counted once; source and test totals reuse this map.
    const codeLineCounts = new Map(codeEntries.map((entry) => [entry.path, countApproximateCodeLines(absolutePath(entry))]));
    const codeLinesFor = (entries) => sum(entries.map((entry) => codeLineCounts.get(entry.path) ?? 0));
    const fileLineCounts = codeEntries.map((entry) => entry.lines ?? 0);
    const hasCodeFiles = fileLineCounts.length > 0;
    return {
        fileCount: fileEntries.length,
        sourceFileCount: sourceEntries.length,
        testFileCount: testEntries.length,
        totalLinesOfCode: sum([...codeLineCounts.values()]),
        sourceLinesOfCode: codeLinesFor(sourceEntries),
        testLinesOfCode: codeLinesFor(testEntries),
        languageCount: languages.size,
        dependencyFileCount: fileEntries.filter((entry) => isDependencyFile(entry.path)).length,
        internalImportCount: sumOver(sourceEntries, countInternalImports),
        exportedSymbolEstimate: sumOver(sourceEntries, countExportedSymbols),
        taskCount: taskStats.taskCount,
        expectedRelevantFilesAverage: roundToTwo(taskStats.expectedRelevantFilesAverage),
        expectedRelevantSymbolsAverage: roundToTwo(taskStats.expectedRelevantSymbolsAverage),
        maxFileLines: hasCodeFiles ? Math.max(...fileLineCounts) : 0,
        averageFileLines: hasCodeFiles ? roundToTwo(sum(fileLineCounts) / fileLineCounts.length) : 0,
        packageDependencyCount: sumOver(fileEntries, countPackageDependencies),
        functionOrClassEstimate: sumOver(sourceEntries, countFunctionsOrClasses)
    };
}
|
|
73
|
+
/**
 * Express a value as a fraction of a cap, limited to an upper bound of 1.
 *
 * No lower bound is applied, so negative inputs produce negative results.
 *
 * @param {number} value - Raw measurement.
 * @param {number} cap - Measurement that corresponds to a result of 1.
 * @returns {number} `value / cap`, or 1 when that ratio exceeds 1.
 */
function normalize(value, cap) {
    const ratio = value / cap;
    return Math.min(ratio, 1);
}
|
|
76
|
+
/**
 * Round a number to two decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` scaled by 100, rounded with `Math.round`, and scaled back.
 */
function roundToTwo(value) {
    const hundredths = Math.round(value * 100);
    return hundredths / 100;
}
|
|
79
|
+
/**
 * Add up every element of an array.
 *
 * @param {number[]} values - Numbers to total.
 * @returns {number} The running total, starting from 0 (so an empty array yields 0).
 */
function sum(values) {
    let total = 0;
    values.forEach((value) => {
        total = total + value;
    });
    return total;
}
|
|
82
|
+
/**
 * Read a file as UTF-8 text and split it into lines.
 *
 * Both `\n` and `\r\n` are treated as line breaks. A file that ends with a
 * line break produces a trailing empty string in the result.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string[]} The file content split on line breaks.
 */
function readLines(filePath) {
    const contents = readFileSync(filePath, "utf8");
    return contents.split(/\r?\n/);
}
|
|
85
|
+
/**
 * Approximate the number of code lines in a file.
 *
 * A line is counted when, after trimming, it is non-empty and does not begin
 * with `//`, `#`, or `*`. This is a prefix heuristic only: it does not parse
 * the file, so a line that opens a block comment with `/*` is still counted.
 *
 * @param {string} filePath - Path of the file to inspect.
 * @returns {number} Number of lines that pass the heuristic.
 */
function countApproximateCodeLines(filePath) {
    const commentPrefixes = ["//", "#", "*"];
    let codeLines = 0;
    for (const line of readLines(filePath)) {
        const text = line.trim();
        if (text.length === 0) {
            continue;
        }
        if (commentPrefixes.some((prefix) => text.startsWith(prefix))) {
            continue;
        }
        codeLines += 1;
    }
    return codeLines;
}
|
|
91
|
+
/**
 * Count the lines of a file that look like imports of project-internal modules.
 *
 * Each line is trimmed and counted when it matches at least one pattern:
 * an `import ... from` whose specifier starts with `.`, a bare `import` of a
 * specifier starting with `.`, a `require(` of a specifier starting with `.`
 * anywhere on the line, a `from .` statement, or a `from` statement whose
 * module name starts with `task_`, `task`, `src`, or `python`.
 *
 * @param {string} filePath - Path of the file to inspect.
 * @returns {number} Number of matching lines.
 */
function countInternalImports(filePath) {
    const internalImportPatterns = [
        /^import\s+.*from\s+["']\./,
        /^import\s+["']\./,
        /require\(["']\./,
        /^from\s+\./,
        /^from\s+(task_|task|src|python)/
    ];
    return readLines(filePath)
        .map((line) => line.trim())
        .filter((statement) => internalImportPatterns.some((pattern) => pattern.test(statement)))
        .length;
}
|
|
101
|
+
/**
 * Estimate how many symbols a file exposes.
 *
 * Each line is trimmed and counted when it starts with an `export` of a
 * function/class/const/let/var/type/interface, a `module.exports =`
 * assignment, an `exports.` assignment, an `__all__ =` assignment, or a
 * `def`/`class` statement. Because lines are trimmed first, indented
 * `def`/`class` statements are counted as well.
 *
 * @param {string} filePath - Path of the file to inspect.
 * @returns {number} Number of matching lines.
 */
function countExportedSymbols(filePath) {
    const exportPatterns = [
        /^export\s+(function|class|const|let|var|type|interface)/,
        /^module\.exports\s*=/,
        /^exports\./,
        /^__all__\s*=/,
        /^(def|class)\s+\w+/
    ];
    let exported = 0;
    for (const line of readLines(filePath)) {
        const statement = line.trim();
        if (exportPatterns.some((pattern) => pattern.test(statement))) {
            exported += 1;
        }
    }
    return exported;
}
|
|
112
|
+
/**
 * Estimate the number of function and class definitions in a file.
 *
 * Each line is trimmed and counted when it starts with an optionally
 * exported and/or async `function NAME`, an optionally exported
 * `class NAME`, or a `def NAME` / `class NAME` statement.
 *
 * @param {string} filePath - Path of the file to inspect.
 * @returns {number} Number of matching lines.
 */
function countFunctionsOrClasses(filePath) {
    const definitionPattern = /^(export\s+)?(async\s+)?function\s+\w+|^(export\s+)?class\s+\w+|^\s*(def|class)\s+\w+/;
    let definitions = 0;
    for (const line of readLines(filePath)) {
        if (definitionPattern.test(line.trim())) {
            definitions += 1;
        }
    }
    return definitions;
}
|
|
116
|
+
/**
 * Tell whether a path names a dependency manifest or lock file.
 *
 * Only the final segment of the path is compared, using POSIX (`/`)
 * separators, against `package.json`, `package-lock.json`,
 * `requirements.txt`, and `pyproject.toml`.
 *
 * @param {string} relativePath - Path using `/` separators.
 * @returns {boolean} True when the file name is one of the known manifests.
 */
function isDependencyFile(relativePath) {
    const manifestNames = ["package.json", "package-lock.json", "requirements.txt", "pyproject.toml"];
    const fileName = path.posix.basename(relativePath);
    return manifestNames.some((name) => name === fileName);
}
|
|
120
|
+
/**
 * Count the dependencies declared in a `package.json` file.
 *
 * Files with any other base name are not read and contribute 0. For a
 * `package.json`, the keys of `dependencies` and `devDependencies` are
 * counted; a missing section counts as empty.
 *
 * @param {string} filePath - Path of the candidate file.
 * @returns {number} Combined number of `dependencies` and `devDependencies` keys.
 * @throws {SyntaxError} When a `package.json` does not contain valid JSON.
 */
function countPackageDependencies(filePath) {
    const isManifest = path.basename(filePath) === "package.json";
    if (!isManifest) {
        return 0;
    }
    const manifest = JSON.parse(readFileSync(filePath, "utf8"));
    const runtimeNames = Object.keys(manifest.dependencies ?? {});
    const developmentNames = Object.keys(manifest.devDependencies ?? {});
    return runtimeNames.length + developmentNames.length;
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import { readdirSync, readFileSync, statSync } from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
/** Path segment names that cause a path to be treated as excluded. */
const EXCLUDED_SEGMENTS = new Set(["node_modules", "dist", "build", "coverage", "lab-output", ".git", "__pycache__"]);
/**
 * Tell whether any segment of a path is one of the excluded names.
 *
 * The path is split on both `/` and `\`, and segments must match an excluded
 * name exactly (for example `builds` is not excluded).
 *
 * @param {string} relativePath - Path to check.
 * @returns {boolean} True when at least one segment is excluded.
 */
export function isExcludedProjectPath(relativePath) {
    for (const segment of relativePath.split(/[\\/]/)) {
        if (EXCLUDED_SEGMENTS.has(segment)) {
            return true;
        }
    }
    return false;
}
|
|
7
|
+
/**
 * Map a file's extension to a language label.
 *
 * The extension is compared case-insensitively. Recognized extensions are
 * `.ts`, `.js`, `.py`, `.json`, and `.md`.
 *
 * @param {string} filePath - Path or file name to classify.
 * @returns {string | undefined} The language label, or `undefined` for any other extension.
 */
export function inferFileLanguage(filePath) {
    const languageByExtension = new Map([
        [".ts", "typescript"],
        [".js", "javascript"],
        [".py", "python"],
        [".json", "json"],
        [".md", "markdown"]
    ]);
    return languageByExtension.get(path.extname(filePath).toLowerCase());
}
|
|
21
|
+
/**
 * Classify a project path into a coarse role.
 *
 * Rules are applied in order and the first match wins:
 *   1. "test"     - a `test`, `tests`, or `__tests__` path segment, a file name
 *                   ending in `.test.ts` / `.test.js`, or a file name starting
 *                   with `test_`.
 *   2. "source"   - a `src`, `python`, or `py` path segment.
 *   3. "docs"     - a file name ending in `.md`.
 *   4. "config"   - one of the known manifest/config file names.
 *   5. "contract" - the path contains `contracts/`.
 *   6. "other"    - everything else.
 *
 * Backslashes are converted to `/` first. File-name checks are
 * case-insensitive; path-segment checks are case-sensitive.
 *
 * @param {string} relativePath - Path relative to the project root.
 * @param {string} kind - Entry kind ("file" or "directory"). Kept for interface
 *   compatibility; it does not influence the result.
 * @returns {"test" | "source" | "docs" | "config" | "contract" | "other"} The inferred role.
 */
export function inferFileRole(relativePath, kind) {
    const configFileNames = ["package.json", "package-lock.json", "tsconfig.json", "vitest.config.ts", "vitest.config.js", "requirements.txt", "pyproject.toml"];
    const normalized = relativePath.replace(/\\/g, "/");
    const basename = path.posix.basename(normalized).toLowerCase();
    const looksLikeTest = /(^|\/)(tests?|__tests__)(\/|$)/.test(normalized)
        || /\.test\.[tj]s$/.test(basename)
        || basename.startsWith("test_");
    if (looksLikeTest) {
        return "test";
    }
    if (/(^|\/)(src|python|py)(\/|$)/.test(normalized)) {
        return "source";
    }
    // Covers README.md too, so no separate readme comparison is needed.
    if (basename.endsWith(".md")) {
        return "docs";
    }
    if (configFileNames.includes(basename)) {
        return "config";
    }
    if (normalized.includes("contracts/")) {
        return "contract";
    }
    // Directories and files share the same fallback role.
    return "other";
}
|
|
41
|
+
/**
 * Count the lines in a file read as UTF-8 text.
 *
 * An empty file has 0 lines. Otherwise the count is the number of segments
 * obtained by splitting on `\n` or `\r\n`, so content that ends with a line
 * break counts one extra (empty) final line: `"a\n"` yields 2.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {number} The line count as described above.
 */
export function countFileLines(filePath) {
    const text = readFileSync(filePath, "utf8");
    return text.length === 0 ? 0 : text.split(/\r?\n/).length;
}
|
|
48
|
+
/**
 * Walk a project directory and list its directories and regular files.
 *
 * Entries within each directory are visited in `localeCompare` order of their
 * names, and a directory entry is emitted immediately before its contents.
 * Paths are relative to the resolved root and use `/` separators. Anything
 * for which `isExcludedProjectPath` returns true is skipped (and, for
 * directories, not descended into). Entries that are neither directories nor
 * regular files are ignored.
 *
 * @param {string} projectRoot - Directory to scan.
 * @returns {{ entries: object[] }} Directory entries carry `path`, `kind`, and
 *   `role`; file entries additionally carry `language` and `lines`.
 * @throws {Error} When the root cannot be stat'ed or a directory cannot be read.
 */
export function buildProjectFileTree(projectRoot) {
    const root = path.resolve(projectRoot);
    const entries = [];
    const byName = (left, right) => left.name.localeCompare(right.name);
    const visit = (directory) => {
        const children = readdirSync(directory, { withFileTypes: true });
        children.sort(byName);
        for (const child of children) {
            const absolutePath = path.join(directory, child.name);
            const relativePath = path.relative(root, absolutePath).replace(/\\/g, "/");
            if (isExcludedProjectPath(relativePath)) {
                continue;
            }
            if (child.isDirectory()) {
                entries.push({
                    path: relativePath,
                    kind: "directory",
                    role: inferFileRole(relativePath, "directory")
                });
                visit(absolutePath);
            }
            else if (child.isFile()) {
                entries.push({
                    path: relativePath,
                    kind: "file",
                    role: inferFileRole(relativePath, "file"),
                    language: inferFileLanguage(relativePath),
                    lines: countFileLines(absolutePath)
                });
            }
        }
    };
    // Fail up front if the root itself is missing or inaccessible.
    statSync(root);
    visit(root);
    return { entries };
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { resolveWithinRoot } from "../core/pathSafety.js";
|
|
4
|
+
import { validateAnswerKey } from "./benchmarkMetadata.js";
|
|
5
|
+
/**
 * Load and validate evaluation cases from a JSON file.
 *
 * The file must contain a JSON array. Every element must be an object with
 * non-empty string fields `id`, `title`, `benchmarkProject`, `targetRoot`,
 * and `query`, and array fields `sourceRoots`, `expectedFiles`,
 * `expectedSymbols`, and `rawIncludeGlobs`. Ids must be unique. When an
 * element has an `answerKey`, it is checked with `validateAnswerKey` and any
 * returned messages are thrown joined by newlines.
 *
 * @param {string} casesPath - Path of the JSON file to read.
 * @param {string} [repoRoot=process.cwd()] - Root that each `targetRoot` is resolved against.
 * @param {object} [options={}] - Optional validation settings.
 * @param {boolean} [options.requireProjectProfileRef] - When exactly `true`, every case must
 *   carry a `projectProfileRef` equal to the `projectId` of one of `options.projectProfiles`.
 * @param {Array<{ projectId: string }>} [options.projectProfiles] - Known project profiles.
 * @returns {Promise<object[]>} Each input case plus `absoluteTargetRoot`, the value returned by
 *   `resolveWithinRoot` for the absolute repo root and the case's `targetRoot`.
 * @throws {Error} When the file cannot be read or parsed (the underlying error is available as
 *   `cause`), or when any validation rule above fails.
 */
export async function readEvaluationCases(casesPath, repoRoot = process.cwd(), options = {}) {
    let parsed;
    try {
        parsed = JSON.parse(await readFile(casesPath, "utf8"));
    }
    catch (error) {
        // Keep the original error reachable so callers can tell I/O failures from syntax errors.
        throw new Error(`Failed to parse evaluation cases: ${error.message}`, { cause: error });
    }
    if (!Array.isArray(parsed)) {
        throw new Error("Evaluation cases file must contain an array.");
    }
    const requiredStringFields = ["id", "title", "benchmarkProject", "targetRoot", "query"];
    const requiredArrayFields = ["sourceRoots", "expectedFiles", "expectedSymbols", "rawIncludeGlobs"];
    const seenIds = new Set();
    // Built on first use and then shared by every case instead of being rebuilt per case.
    let knownProfileIds;
    return parsed.map((value, index) => {
        if (!value || typeof value !== "object") {
            throw new Error(`Invalid evaluation case at index ${index}.`);
        }
        const input = value;
        for (const field of requiredStringFields) {
            if (typeof input[field] !== "string" || input[field] === "") {
                throw new Error(`Missing required field: ${field}`);
            }
        }
        for (const field of requiredArrayFields) {
            if (!Array.isArray(input[field])) {
                throw new Error(`Missing required field: ${field}`);
            }
        }
        if (seenIds.has(input.id)) {
            throw new Error(`Duplicate evaluation case id: ${input.id}`);
        }
        seenIds.add(input.id);
        if (input.answerKey !== undefined) {
            const answerKeyErrors = validateAnswerKey(input.answerKey, `evaluation case ${input.id}`);
            if (answerKeyErrors.length > 0) {
                throw new Error(answerKeyErrors.join("\n"));
            }
        }
        if (options.requireProjectProfileRef === true) {
            if (typeof input.projectProfileRef !== "string" || input.projectProfileRef.length === 0) {
                throw new Error(`evaluation case ${input.id}: missing projectProfileRef.`);
            }
            if (knownProfileIds === undefined) {
                knownProfileIds = new Set((options.projectProfiles ?? []).map((profile) => profile.projectId));
            }
            if (!knownProfileIds.has(input.projectProfileRef)) {
                throw new Error(`evaluation case ${input.id}: unknown projectProfileRef ${input.projectProfileRef}.`);
            }
        }
        return {
            ...input,
            absoluteTargetRoot: resolveWithinRoot(path.resolve(repoRoot), input.targetRoot)
        };
    });
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
 * Assemble the report input object for the token savings evaluation.
 *
 * The result has fixed identification fields, three workflow steps, a list
 * of metrics taken from `options.summary` (averages formatted with two
 * decimals), three artifact references, and four standard warnings followed
 * by `options.warnings`. The retrieval step is marked "skipped" when any
 * case was skipped and "pass" otherwise.
 *
 * @param {object} options - Report data.
 * @param {{ casesPath: string, kitCommand: string }} options.commandConfig - Commands shown in the steps.
 * @param {object} options.summary - Provides `tokenCountMethod`, `caseCount`, `completedCaseCount`,
 *   `skippedCaseCount`, `averageRawTokens`, `averageMyDevKitTokens`, `averageTokensSaved`,
 *   `averagePercentSaved`, and `totalCommandsRun`.
 * @param {{ summaryPath: string, runsPath: string, htmlPath: string }} options.artifactPaths - Artifact locations.
 * @param {string[]} options.warnings - Extra warnings appended after the standard ones.
 * @returns {object} The report input described above.
 */
export function renderTokenSavingsReportInput(options) {
    const commandConfig = options.commandConfig;
    const summary = options.summary;
    const artifactPaths = options.artifactPaths;
    const twoDecimals = (amount) => amount.toFixed(2);

    const steps = [
        {
            id: "load-cases",
            label: "Load evaluation cases",
            command: `--cases ${commandConfig.casesPath}`,
            status: "pass",
            notes: `${summary.caseCount} cases loaded.`
        },
        {
            id: "run-raw-baseline",
            label: "Run raw full-file baseline",
            status: "pass",
            notes: "Collected deterministic full-file context for each benchmark case."
        },
        {
            id: "run-my-dev-kit",
            label: "Run external my-dev-kit retrieval",
            command: commandConfig.kitCommand,
            status: summary.skippedCaseCount > 0 ? "skipped" : "pass",
            notes: `${summary.completedCaseCount} completed, ${summary.skippedCaseCount} skipped.`
        }
    ];

    const metrics = [
        { id: "token-count-method", label: "Token count method", value: summary.tokenCountMethod },
        { id: "case-count", label: "Case count", value: summary.caseCount },
        { id: "completed-case-count", label: "Completed case count", value: summary.completedCaseCount },
        { id: "skipped-case-count", label: "Skipped case count", value: summary.skippedCaseCount },
        { id: "average-raw-tokens", label: "Average raw tokens", value: twoDecimals(summary.averageRawTokens) },
        { id: "average-my-dev-kit-tokens", label: "Average my-dev-kit tokens", value: twoDecimals(summary.averageMyDevKitTokens) },
        { id: "average-tokens-saved", label: "Average tokens saved", value: twoDecimals(summary.averageTokensSaved) },
        { id: "average-percent-saved", label: "Average percent saved", value: twoDecimals(summary.averagePercentSaved), unit: "%" },
        { id: "total-commands-run", label: "Total commands run", value: summary.totalCommandsRun }
    ];

    const artifacts = [
        { id: "summary-json", label: "Token savings summary JSON", path: artifactPaths.summaryPath, kind: "json" },
        { id: "runs-json", label: "Token savings runs JSON", path: artifactPaths.runsPath, kind: "json" },
        { id: "report-html", label: "Token savings report HTML", path: artifactPaths.htmlPath, kind: "html" }
    ];

    const warnings = [
        "Token counts are estimated using estimated_chars_div_4.",
        "This is a static context comparison.",
        "This is not provider billing telemetry.",
        "Codex and Claude telemetry are future work.",
        ...options.warnings
    ];

    return {
        reportId: "token-savings-report",
        title: "Token savings evaluation",
        projectName: "my-dev-kit-lab",
        benchmarkProject: "multiple",
        workflowName: "raw full-file context vs my-dev-kit retrieval",
        summary: "Estimated token comparison using static context size. Token counts use estimated_chars_div_4. This is not provider billing telemetry, and Codex and Claude telemetry are future work.",
        steps,
        metrics,
        artifacts,
        warnings
    };
}
|