@dailephd/my-dev-kit-lab 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -0
- package/benchmarks/contracts/benchmark-project-profiles.json +1199 -0
- package/benchmarks/contracts/todo-behavior.md +70 -0
- package/benchmarks/contracts/todo-benchmark-case.json +227 -0
- package/benchmarks/projects/README.md +34 -0
- package/benchmarks/projects/task-analytics-large-mixed/README.md +1 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/__init__.py +3 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/fixtures.py +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/metrics.py +29 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/models.py +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/parser.py +16 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/pipeline.py +9 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/quality.py +8 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/reporting.py +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_metrics.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_parser.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_quality.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_reporting.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/package.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/index.ts +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/analyticsSnapshot.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/project.ts +5 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/task.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/buildProjectLeaderboard.ts +7 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/formatTaskHealthReport.ts +13 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/buildAnalyticsSnapshot.ts +39 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/completeTask.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/createTask.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/listTasksByProject.ts +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/projectStore.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/taskStore.ts +44 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/projectValidation.ts +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/taskValidation.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/buildAnalyticsSnapshot.test.ts +48 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/completeTask.test.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/createTask.test.ts +31 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/listTasksByProject.test.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/reporting.test.ts +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/vitest.config.ts +5 -0
- package/benchmarks/projects/task-workflow-medium-ts/README.md +1 -0
- package/benchmarks/projects/task-workflow-medium-ts/package.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/index.ts +9 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/project.ts +6 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/task.ts +39 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/completeTask.ts +15 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/createTask.ts +26 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/filterTasks.ts +17 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/importTasks.ts +33 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/summarizeTasks.ts +30 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/store/taskStore.ts +76 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/utils/deterministicId.ts +3 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/validation/taskValidation.ts +45 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/completeTask.test.ts +16 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/createTask.test.ts +21 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/filterTasks.test.ts +18 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/importTasks.test.ts +22 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/summarizeTasks.test.ts +29 -0
- package/benchmarks/projects/task-workflow-medium-ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-js/README.md +3 -0
- package/benchmarks/projects/todo-js/package.json +11 -0
- package/benchmarks/projects/todo-js/src/index.js +2 -0
- package/benchmarks/projects/todo-js/src/taskService.js +37 -0
- package/benchmarks/projects/todo-js/src/taskStore.js +28 -0
- package/benchmarks/projects/todo-js/tests/taskService.test.js +45 -0
- package/benchmarks/projects/todo-js/vitest.config.js +5 -0
- package/benchmarks/projects/todo-mixed-ts-py/README.md +3 -0
- package/benchmarks/projects/todo-mixed-ts-py/package.json +13 -0
- package/benchmarks/projects/todo-mixed-ts-py/python/task_service.py +76 -0
- package/benchmarks/projects/todo-mixed-ts-py/src/taskCli.ts +38 -0
- package/benchmarks/projects/todo-mixed-ts-py/tests/mixedBoundary.test.ts +18 -0
- package/benchmarks/projects/todo-mixed-ts-py/tsconfig.json +12 -0
- package/benchmarks/projects/todo-mixed-ts-py/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-python/README.md +3 -0
- package/benchmarks/projects/todo-python/src/__init__.py +4 -0
- package/benchmarks/projects/todo-python/src/task_service.py +32 -0
- package/benchmarks/projects/todo-python/src/task_store.py +28 -0
- package/benchmarks/projects/todo-python/tests/test_task_service.py +52 -0
- package/benchmarks/projects/todo-ts/README.md +3 -0
- package/benchmarks/projects/todo-ts/package.json +12 -0
- package/benchmarks/projects/todo-ts/src/index.ts +2 -0
- package/benchmarks/projects/todo-ts/src/taskService.ts +41 -0
- package/benchmarks/projects/todo-ts/src/taskStore.ts +34 -0
- package/benchmarks/projects/todo-ts/tests/taskService.test.ts +45 -0
- package/benchmarks/projects/todo-ts/tsconfig.json +12 -0
- package/benchmarks/projects/todo-ts/vitest.config.ts +5 -0
- package/dist/scripts/build-gallery.js +3 -0
- package/dist/scripts/capture-demo-report.js +3 -0
- package/dist/scripts/evaluate-token-savings.js +2 -0
- package/dist/scripts/experiments/describeExperiment.js +143 -0
- package/dist/scripts/experiments/listExperiments.js +44 -0
- package/dist/scripts/experiments/runExperiment.js +199 -0
- package/dist/scripts/generate-experiment-plots.js +3 -0
- package/dist/scripts/generate-prompt-variants.js +2 -0
- package/dist/scripts/render-experiment-report.js +2 -0
- package/dist/scripts/run-agent-prompt.js +2 -0
- package/dist/scripts/run-controlled-experiment.js +2 -0
- package/dist/scripts/run-final-demo.js +3 -0
- package/dist/scripts/run-lab-demo.js +5 -0
- package/dist/scripts/run-visualization-demos.js +3 -0
- package/dist/scripts/security/runCodeql.js +57 -0
- package/dist/scripts/security/runDependencyChecks.js +57 -0
- package/dist/scripts/security/runFuzzSmoke.js +29 -0
- package/dist/scripts/security/runPackageChecks.js +56 -0
- package/dist/scripts/security/runSemgrep.js +63 -0
- package/dist/scripts/security/validate.js +117 -0
- package/dist/scripts/verify-benchmarks.js +202 -0
- package/dist/src/agents/adapters/claudeAdapter.js +37 -0
- package/dist/src/agents/adapters/codexAdapter.js +110 -0
- package/dist/src/agents/adapters/fakeAgentAdapter.js +101 -0
- package/dist/src/agents/agentRegistry.js +21 -0
- package/dist/src/agents/index.js +7 -0
- package/dist/src/agents/parseAgentTokenUsage.js +137 -0
- package/dist/src/agents/runAgentPrompt.js +38 -0
- package/dist/src/agents/types.js +1 -0
- package/dist/src/commands/buildGalleryCommand.js +56 -0
- package/dist/src/commands/captureDemoReport.js +116 -0
- package/dist/src/commands/evaluateTokenSavings.js +175 -0
- package/dist/src/commands/generateExperimentPlotsCommand.js +38 -0
- package/dist/src/commands/generatePromptVariants.js +67 -0
- package/dist/src/commands/renderExperimentReportCommand.js +131 -0
- package/dist/src/commands/runAgentPromptCommand.js +132 -0
- package/dist/src/commands/runControlledExperimentCommand.js +174 -0
- package/dist/src/commands/runFinalDemoCommand.js +123 -0
- package/dist/src/commands/runLabDemo.js +62 -0
- package/dist/src/commands/runVisualizationDemosCommand.js +67 -0
- package/dist/src/core/commandLine.js +59 -0
- package/dist/src/core/countTokens.js +8 -0
- package/dist/src/core/fileGlobs.js +100 -0
- package/dist/src/core/localProjectTarget.js +75 -0
- package/dist/src/core/pathSafety.js +19 -0
- package/dist/src/core/pythonCommand.js +30 -0
- package/dist/src/core/resolveCommand.js +110 -0
- package/dist/src/core/runMeasuredCommand.js +143 -0
- package/dist/src/evaluation/benchmarkMetadata.js +207 -0
- package/dist/src/evaluation/buildExperimentMatrix.js +75 -0
- package/dist/src/evaluation/classifyAgentRunOutcome.js +40 -0
- package/dist/src/evaluation/compareExperimentRuns.js +79 -0
- package/dist/src/evaluation/compareTokenSavings.js +47 -0
- package/dist/src/evaluation/controlledExperimentTypes.js +1 -0
- package/dist/src/evaluation/index.js +18 -0
- package/dist/src/evaluation/parseAgentAnswer.js +230 -0
- package/dist/src/evaluation/projectComplexity.js +126 -0
- package/dist/src/evaluation/projectFileTree.js +83 -0
- package/dist/src/evaluation/readEvaluationCases.js +59 -0
- package/dist/src/evaluation/renderTokenSavingsReportInput.js +55 -0
- package/dist/src/evaluation/runControlledExperiment.js +158 -0
- package/dist/src/evaluation/runMyDevKitRetrieval.js +197 -0
- package/dist/src/evaluation/runRawFullFileBaseline.js +31 -0
- package/dist/src/evaluation/scoreCorrectness.js +127 -0
- package/dist/src/evaluation/types.js +1 -0
- package/dist/src/evaluation/writeExperimentArtifacts.js +104 -0
- package/dist/src/evaluation/writeTokenSavingsArtifacts.js +57 -0
- package/dist/src/experiments/config.js +24 -0
- package/dist/src/experiments/defaultRegistry.js +7 -0
- package/dist/src/experiments/errors.js +18 -0
- package/dist/src/experiments/index.js +9 -0
- package/dist/src/experiments/outputPaths.js +25 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/config.js +37 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/index.js +3 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/plugin.js +83 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/resultMapping.js +260 -0
- package/dist/src/experiments/plugins/index.js +1 -0
- package/dist/src/experiments/registry.js +43 -0
- package/dist/src/experiments/results.js +48 -0
- package/dist/src/experiments/runner.js +181 -0
- package/dist/src/experiments/target.js +8 -0
- package/dist/src/experiments/types.js +1 -0
- package/dist/src/gallery/index.js +2 -0
- package/dist/src/gallery/types.js +1 -0
- package/dist/src/gallery/writeGalleryManifest.js +214 -0
- package/dist/src/index.js +12 -0
- package/dist/src/plots/buildExperimentPlotData.js +137 -0
- package/dist/src/plots/index.js +4 -0
- package/dist/src/plots/renderSvgChart.js +82 -0
- package/dist/src/plots/types.js +1 -0
- package/dist/src/plots/writePlotArtifacts.js +46 -0
- package/dist/src/prompts/buildPromptContext.js +68 -0
- package/dist/src/prompts/generateMyDevKitPrompt.js +106 -0
- package/dist/src/prompts/generatePromptVariants.js +36 -0
- package/dist/src/prompts/generateRawFullFilePrompt.js +97 -0
- package/dist/src/prompts/index.js +7 -0
- package/dist/src/prompts/measurePromptComplexity.js +41 -0
- package/dist/src/prompts/types.js +1 -0
- package/dist/src/prompts/writePromptArtifacts.js +43 -0
- package/dist/src/report/buildExperimentReportInput.js +339 -0
- package/dist/src/report/experimentReportTypes.js +1 -0
- package/dist/src/report/experiments/buildPluginExperimentReport.js +153 -0
- package/dist/src/report/experiments/experimentReportModel.js +1 -0
- package/dist/src/report/experiments/index.js +4 -0
- package/dist/src/report/experiments/renderPluginExperimentReportHtml.js +133 -0
- package/dist/src/report/experiments/writePluginExperimentReports.js +30 -0
- package/dist/src/report/index.js +8 -0
- package/dist/src/report/renderExperimentHtmlReport.js +354 -0
- package/dist/src/report/renderHtmlReport.js +103 -0
- package/dist/src/report/types.js +10 -0
- package/dist/src/report/writeExperimentReportArtifacts.js +38 -0
- package/dist/src/report/writeReportArtifacts.js +39 -0
- package/dist/src/screenshot/captureReportScreenshot.js +75 -0
- package/dist/src/screenshot/index.js +2 -0
- package/dist/src/screenshot/types.js +1 -0
- package/dist/src/securityValidation/artifacts.js +15 -0
- package/dist/src/securityValidation/cliAdversarial/adversarialCliConfig.js +38 -0
- package/dist/src/securityValidation/cliAdversarial/dataVolumeChecks.js +194 -0
- package/dist/src/securityValidation/cliAdversarial/jsonStdoutChecks.js +359 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactChecks.js +284 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactFixtures.js +79 -0
- package/dist/src/securityValidation/cliAdversarial/pathBoundaryChecks.js +431 -0
- package/dist/src/securityValidation/cliAdversarial/pathCases.js +144 -0
- package/dist/src/securityValidation/cliAdversarial/readOnlyBoundaryChecks.js +294 -0
- package/dist/src/securityValidation/cliAdversarial/runAdversarialCheck.js +149 -0
- package/dist/src/securityValidation/cliAdversarial/subprocessSafetyChecks.js +214 -0
- package/dist/src/securityValidation/cliAdversarial/tempWorkspace.js +160 -0
- package/dist/src/securityValidation/commandRunner.js +136 -0
- package/dist/src/securityValidation/config.js +39 -0
- package/dist/src/securityValidation/dependencies/parseNpmAudit.js +115 -0
- package/dist/src/securityValidation/dependencies/parseNpmLs.js +71 -0
- package/dist/src/securityValidation/dependencies/parseNpmOutdated.js +41 -0
- package/dist/src/securityValidation/dependencies/runDependencyChecks.js +239 -0
- package/dist/src/securityValidation/dependencies/runOsvScanner.js +43 -0
- package/dist/src/securityValidation/fuzz/fuzzHarness.js +61 -0
- package/dist/src/securityValidation/fuzz/fuzzTargets.js +204 -0
- package/dist/src/securityValidation/fuzz/randomInput.js +0 -0
- package/dist/src/securityValidation/index.js +34 -0
- package/dist/src/securityValidation/packageChecks/forbiddenPackageContents.js +67 -0
- package/dist/src/securityValidation/packageChecks/parseNpmPackDryRun.js +56 -0
- package/dist/src/securityValidation/packageChecks/runPackageChecks.js +88 -0
- package/dist/src/securityValidation/report/renderSecurityReport.js +248 -0
- package/dist/src/securityValidation/report/securityReportTypes.js +1 -0
- package/dist/src/securityValidation/staticScans/codeql.js +66 -0
- package/dist/src/securityValidation/staticScans/semgrep.js +180 -0
- package/dist/src/securityValidation/testMatrix.js +535 -0
- package/dist/src/securityValidation/types.js +34 -0
- package/dist/src/securityValidation/validate/resolveTarget.js +32 -0
- package/dist/src/securityValidation/validate/runSecurityValidation.js +169 -0
- package/dist/src/securityValidation/validate/verdict.js +73 -0
- package/dist/src/visualizationDemos/buildMyDevKitVisualizationCommands.js +59 -0
- package/dist/src/visualizationDemos/index.js +4 -0
- package/dist/src/visualizationDemos/runVisualizationDemos.js +82 -0
- package/dist/src/visualizationDemos/types.js +1 -0
- package/dist/src/visualizationDemos/writeVisualizationDemoArtifacts.js +25 -0
- package/docs/METRICS.md +286 -0
- package/examples/demo-report-input.json +78 -0
- package/examples/lab-demo-cases.json +35 -0
- package/examples/real-agent-campaign-cases.json +118 -0
- package/examples/token-savings-cases.json +122 -0
- package/package.json +91 -0
- package/tests/fixtures/fake-adversarial-cli.js +152 -0
- package/tests/fixtures/fake-my-dev-kit-cli.js +83 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { PROJECT_COMPLEXITY_FORMULA, calculateProjectComplexityScore } from "./projectComplexity.js";
|
|
5
|
+
/**
 * Project ids that must each have a profile; validateBenchmarkProjectProfiles
 * reports every id from this list that is missing.
 */
export const REQUIRED_BENCHMARK_PROJECT_IDS = ["todo-ts", "todo-python", "todo-js", "todo-mixed-ts-py", "task-workflow-medium-ts", "task-analytics-large-mixed"];
/** Values accepted for a profile's `complexityLevel` field. */
export const VALID_COMPLEXITY_LEVELS = new Set([
    "small",
    "medium",
    "large",
    "mixed-language"
]);
|
|
14
|
+
/**
 * Reads a benchmark project profiles JSON file, extracts its `profiles`
 * array and validates every profile.
 *
 * @param {string} profilesPath - Path of the JSON file to read.
 * @param {string} [repoRoot=process.cwd()] - Passed to validation as the base for resolving profile paths.
 * @returns {Promise<Array<object>>} The `profiles` array from the file.
 * @throws {Error} When the file cannot be read or parsed (the underlying
 *   error is attached as `cause`), when the document has no `profiles`
 *   array, or when validation reports at least one error.
 */
export async function readBenchmarkProjectProfiles(profilesPath, repoRoot = process.cwd()) {
    let parsed;
    try {
        parsed = JSON.parse(await readFile(profilesPath, "utf8"));
    }
    catch (error) {
        // This catch covers read failures (e.g. a missing file) as well as
        // JSON syntax errors. Keep the original error reachable via `cause`
        // so its code and stack are not lost behind the wrapper message.
        throw new Error(`Failed to parse benchmark project profiles: ${error.message}`, { cause: error });
    }
    const profiles = parseBenchmarkProjectProfiles(parsed);
    const errors = validateBenchmarkProjectProfiles(profiles, repoRoot);
    if (errors.length > 0) {
        throw new Error(`Invalid benchmark project profiles:\n${errors.join("\n")}`);
    }
    return profiles;
}
|
|
29
|
+
/**
 * Extracts the `profiles` array from an already-parsed profiles document.
 *
 * @param {unknown} value - Parsed JSON content of the profiles file.
 * @returns {Array<unknown>} The `profiles` array, returned as-is (entries are not inspected here).
 * @throws {Error} When `value` is not an object or has no `profiles` array.
 */
export function parseBenchmarkProjectProfiles(value) {
    const isObjectLike = Boolean(value) && typeof value === "object";
    if (!isObjectLike) {
        throw new Error("Benchmark project profiles file must contain an object.");
    }
    const { profiles } = value;
    if (Array.isArray(profiles)) {
        return profiles;
    }
    throw new Error("Benchmark project profiles file must contain a profiles array.");
}
|
|
39
|
+
/**
 * Validates the shape of an evaluation answer key.
 *
 * @param {unknown} answerKey - Candidate answer key object.
 * @param {string} label - Prefix put in front of every error message.
 * @returns {string[]} Human-readable validation errors; empty when the key is valid.
 */
export function validateAnswerKey(answerKey, label) {
    if (!answerKey || typeof answerKey !== "object") {
        return [`${label}: answerKey must be an object.`];
    }
    const errors = [];
    const candidate = answerKey;
    for (const field of ["expectedFiles", "expectedSymbols", "expectedFacts"]) {
        if (!Array.isArray(candidate[field])) {
            errors.push(`${label}: answerKey.${field} must be an array.`);
        }
    }
    if (!Number.isInteger(candidate.minimumCorrectFacts) || candidate.minimumCorrectFacts < 0) {
        errors.push(`${label}: answerKey.minimumCorrectFacts must be a nonnegative integer.`);
    }
    if (Array.isArray(candidate.expectedFacts)) {
        const factIds = new Set();
        let requiredFactCount = 0;
        for (const fact of candidate.expectedFacts) {
            if (!fact || typeof fact !== "object") {
                errors.push(`${label}: expectedFacts entries must be objects.`);
                continue;
            }
            if (typeof fact.id !== "string" || fact.id.length === 0) {
                errors.push(`${label}: expectedFacts entries must include id.`);
            }
            else if (factIds.has(fact.id)) {
                errors.push(`${label}: duplicate expected fact id ${fact.id}.`);
            }
            else {
                factIds.add(fact.id);
            }
            if (typeof fact.text !== "string" || fact.text.length === 0) {
                errors.push(`${label}: expected fact ${fact.id ?? "<unknown>"} must include text.`);
            }
            // Number.isFinite rejects non-numbers, NaN and ±Infinity in one
            // step; a plain `<= 0` comparison lets NaN through because every
            // comparison with NaN is false.
            if (!Number.isFinite(fact.weight) || fact.weight <= 0) {
                errors.push(`${label}: expected fact ${fact.id ?? "<unknown>"} must include positive weight.`);
            }
            if (typeof fact.required !== "boolean") {
                errors.push(`${label}: expected fact ${fact.id ?? "<unknown>"} must include required boolean.`);
            }
            if (fact.required === true) {
                requiredFactCount += 1;
            }
        }
        const totalFactCount = candidate.expectedFacts.length;
        if (Number.isInteger(candidate.minimumCorrectFacts) && candidate.minimumCorrectFacts > totalFactCount) {
            errors.push(`${label}: answerKey.minimumCorrectFacts cannot exceed expectedFacts length.`);
        }
        // Required plus optional facts add up to the total, so the
        // satisfiability bound is the total fact count. Unlike the check
        // above, this one also fires for non-integer minimums.
        if (requiredFactCount > 0 && candidate.minimumCorrectFacts > totalFactCount) {
            errors.push(`${label}: answerKey.minimumCorrectFacts is not satisfiable.`);
        }
    }
    if (Array.isArray(candidate.expectedFiles) && candidate.expectedFiles.length === 0) {
        errors.push(`${label}: answerKey.expectedFiles must not be empty.`);
    }
    if (Array.isArray(candidate.expectedSymbols) && candidate.expectedSymbols.length === 0) {
        errors.push(`${label}: answerKey.expectedSymbols must not be empty.`);
    }
    return errors;
}
|
|
98
|
+
/**
 * Validates a list of benchmark project profiles.
 *
 * @param {Array<unknown>} profiles - Profile entries, e.g. the array returned by parseBenchmarkProjectProfiles.
 * @param {string} [repoRoot=process.cwd()] - Forwarded to the per-profile validation.
 * @returns {string[]} All validation errors: missing required project ids
 *   first, then the errors of each profile in input order. Empty when valid.
 */
export function validateBenchmarkProjectProfiles(profiles, repoRoot = process.cwd()) {
    const errors = [];
    // Optional chaining keeps a null/undefined entry from throwing here;
    // such entries are reported as errors in the loop below.
    const ids = new Set(profiles.map((profile) => profile?.projectId));
    for (const requiredProjectId of REQUIRED_BENCHMARK_PROJECT_IDS) {
        if (!ids.has(requiredProjectId)) {
            errors.push(`Missing benchmark project profile: ${requiredProjectId}.`);
        }
    }
    for (const profile of profiles) {
        if (!profile || typeof profile !== "object") {
            // The per-profile validator reads properties off the entry, so a
            // non-object is reported here instead of being passed on.
            errors.push("Benchmark project profile entries must be objects.");
            continue;
        }
        errors.push(...validateBenchmarkProjectProfile(profile, repoRoot));
    }
    return errors;
}
|
|
111
|
+
/**
 * Validates a single benchmark project profile.
 *
 * Checks the required string and array fields, the complexity level, score,
 * metrics and formula id, that `rootPath` exists under `repoRoot`, and the
 * file tree entries.
 *
 * @param {object} profile - Profile object to validate.
 * @param {string} repoRoot - Base directory against which `profile.rootPath` is resolved.
 * @returns {string[]} Validation errors, each prefixed with `profile <projectId>`.
 */
function validateBenchmarkProjectProfile(profile, repoRoot) {
    const errors = [];
    const label = `profile ${profile.projectId ?? "<unknown>"}`;
    for (const field of ["projectId", "displayName", "description", "languageMix", "primaryLanguage", "rootPath", "benchmarkPurpose"]) {
        if (typeof profile[field] !== "string" || profile[field].length === 0) {
            errors.push(`${label}: ${field} must be a nonempty string.`);
        }
    }
    for (const field of ["languages", "sourceRoots", "testRoots", "expectedUseCases"]) {
        if (!Array.isArray(profile[field]) || profile[field].length === 0) {
            errors.push(`${label}: ${field} must be a nonempty array.`);
        }
    }
    if (!VALID_COMPLEXITY_LEVELS.has(profile.complexityLevel)) {
        errors.push(`${label}: complexityLevel must be one of ${[...VALID_COMPLEXITY_LEVELS].join(", ")}.`);
    }
    if (typeof profile.complexityScore !== "number" || profile.complexityScore < 0 || profile.complexityScore > 100) {
        errors.push(`${label}: complexityScore must be between 0 and 100.`);
    }
    if (!profile.complexityMetrics || typeof profile.complexityMetrics !== "object") {
        errors.push(`${label}: complexityMetrics must be an object.`);
    }
    else {
        errors.push(...validateComplexityMetrics(profile.complexityMetrics, label));
        // The stored score must equal the score recomputed from the metrics.
        const calculatedScore = calculateProjectComplexityScore(profile.complexityMetrics);
        if (profile.complexityScore !== calculatedScore) {
            errors.push(`${label}: complexityScore ${profile.complexityScore} does not match formula score ${calculatedScore}.`);
        }
    }
    if (profile.complexityFormula?.id !== PROJECT_COMPLEXITY_FORMULA.id) {
        errors.push(`${label}: complexityFormula.id must be ${PROJECT_COMPLEXITY_FORMULA.id}.`);
    }
    // path.resolve throws a TypeError for non-string segments. A wrongly typed
    // rootPath has already been reported by the string-field loop above, so
    // fall back to "" (the same value a missing rootPath gets) and keep
    // collecting errors instead of crashing.
    const rootPath = typeof profile.rootPath === "string" ? profile.rootPath : "";
    const projectRoot = path.resolve(repoRoot, rootPath);
    if (!existsSync(projectRoot)) {
        errors.push(`${label}: rootPath does not exist: ${profile.rootPath}.`);
    }
    if (!profile.fileTree || !Array.isArray(profile.fileTree.entries)) {
        errors.push(`${label}: fileTree.entries must be an array.`);
    }
    else {
        errors.push(...validateFileTreeEntries(profile.fileTree.entries, projectRoot, label));
    }
    return errors;
}
|
|
155
|
+
/**
 * Checks that every required complexity metric is a finite, nonnegative number.
 *
 * @param {object} metrics - The profile's `complexityMetrics` object.
 * @param {string} label - Prefix put in front of every error message.
 * @returns {string[]} One error per offending field, in the order of the field list below.
 */
function validateComplexityMetrics(metrics, label) {
    const requiredMetricFields = [
        "fileCount",
        "sourceFileCount",
        "testFileCount",
        "totalLinesOfCode",
        "sourceLinesOfCode",
        "testLinesOfCode",
        "languageCount",
        "dependencyFileCount",
        "internalImportCount",
        "exportedSymbolEstimate",
        "taskCount",
        "expectedRelevantFilesAverage",
        "expectedRelevantSymbolsAverage",
        "maxFileLines",
        "averageFileLines"
    ];
    const errors = [];
    for (const field of requiredMetricFields) {
        const metric = metrics[field];
        // Number.isFinite is false for non-numbers, NaN and ±Infinity. A bare
        // `typeof === "number"` plus `< 0` test would accept NaN and Infinity.
        if (!Number.isFinite(metric) || metric < 0) {
            errors.push(`${label}: complexityMetrics.${field} must be a nonnegative number.`);
        }
    }
    return errors;
}
|
|
181
|
+
/**
 * Validates the entries of a profile's file tree against the file system.
 *
 * @param {Array<unknown>} entries - File tree entries; `path`, `kind` and (for files) `lines` are read from each.
 * @param {string} projectRoot - Directory that entry paths are joined onto for the existence check.
 * @param {string} label - Prefix put in front of every error message.
 * @returns {string[]} Validation errors in entry order; empty when all entries are valid.
 */
function validateFileTreeEntries(entries, projectRoot, label) {
    const errors = [];
    const seenPaths = new Set();
    for (const entry of entries) {
        // `entry?.path` lets a null/undefined entry be reported as a missing
        // path instead of throwing a TypeError.
        const entryPath = entry?.path;
        if (typeof entryPath !== "string" || entryPath.length === 0) {
            errors.push(`${label}: fileTree entries must include path.`);
            continue;
        }
        // Only a whole ".." segment climbs out of the project root. A plain
        // substring test would also reject safe names such as "a..b.ts".
        const hasParentSegment = entryPath.split(/[\\/]/).includes("..");
        if (path.isAbsolute(entryPath) || hasParentSegment) {
            errors.push(`${label}: fileTree path must be a safe relative path: ${entryPath}.`);
        }
        if (seenPaths.has(entryPath)) {
            errors.push(`${label}: duplicate fileTree path: ${entryPath}.`);
        }
        seenPaths.add(entryPath);
        if (entry.kind !== "file" && entry.kind !== "directory") {
            errors.push(`${label}: fileTree path ${entryPath} has invalid kind.`);
        }
        if (!existsSync(path.join(projectRoot, entryPath))) {
            errors.push(`${label}: fileTree path does not exist: ${entryPath}.`);
        }
        // Number.isFinite also rejects NaN, which a bare `< 0` test lets through.
        if (entry.kind === "file" && (!Number.isFinite(entry.lines) || entry.lines < 0)) {
            errors.push(`${label}: fileTree file ${entryPath} must include nonnegative lines.`);
        }
    }
    return errors;
}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/** Every agent id that an experiment selection may name. */
export const ALL_EXPERIMENT_AGENTS = [
    "fake-agent",
    "codex",
    "claude"
];
/** Agents used when the experiment config does not list any. */
export const DEFAULT_EXPERIMENT_AGENTS = [
    "fake-agent"
];
/** Strategies used when the experiment config does not list any. */
export const DEFAULT_EXPERIMENT_STRATEGIES = [
    "raw-full-file",
    "my-dev-kit-guided"
];
/** Prompt complexity levels used when the experiment config does not list any. */
export const DEFAULT_EXPERIMENT_COMPLEXITIES = [
    "short"
];
// Lookup sets used to validate selections. The set of valid strategies is
// built from the default strategies, so those are the only accepted values.
const VALID_AGENTS = new Set(ALL_EXPERIMENT_AGENTS);
const VALID_STRATEGIES = new Set(DEFAULT_EXPERIMENT_STRATEGIES);
const VALID_COMPLEXITIES = new Set([
    "short",
    "medium",
    "long",
    "multi-step"
]);
|
|
8
|
+
/**
 * Expands the selected cases, agents, strategies and prompt complexity
 * levels into one matrix cell per combination.
 *
 * @param {object} args
 * @param {Array<object>} args.cases - Evaluation cases; `id` and `benchmarkProject` are read from each.
 * @param {object} args.config - Selection config. `agents`, `strategies` and
 *   `complexityLevels` fall back to the module defaults when nullish.
 *   `caseIds` and `benchmarkProjects` filter the cases when nonempty.
 *   `maxRuns` caps the number of returned cells when it is a number.
 * @returns {Array<object>} Cells ordered case, agent, strategy, complexity
 *   level, each with `caseId`, `benchmarkProject`, `agentId`, `strategy`,
 *   `complexityLevel` and `runId`.
 * @throws {Error} When a selection is rejected by validateSelections, or when
 *   a requested case id is absent from the cases left after filtering.
 */
export function buildExperimentMatrix(args) {
    const { config, cases } = args;
    const agents = config.agents ?? DEFAULT_EXPERIMENT_AGENTS;
    const strategies = config.strategies ?? DEFAULT_EXPERIMENT_STRATEGIES;
    const complexityLevels = config.complexityLevels ?? DEFAULT_EXPERIMENT_COMPLEXITIES;
    validateSelections({ agents, strategies, complexityLevels, includeRealAgents: config.includeRealAgents ?? false });
    // An empty or missing filter list means "no filtering". Sets give
    // constant-time membership checks while filtering the cases.
    const caseIdFilter = config.caseIds?.length ? new Set(config.caseIds) : undefined;
    const projectFilter = config.benchmarkProjects?.length ? new Set(config.benchmarkProjects) : undefined;
    const selectedCases = cases.filter((evaluationCase) => {
        const caseMatches = !caseIdFilter || caseIdFilter.has(evaluationCase.id);
        const projectMatches = !projectFilter || projectFilter.has(evaluationCase.benchmarkProject);
        return caseMatches && projectMatches;
    });
    if (caseIdFilter) {
        // "Found" is measured after both filters, so a requested id that
        // exists but is excluded by the project filter is reported as missing.
        const found = new Set(selectedCases.map((evaluationCase) => evaluationCase.id));
        const missing = config.caseIds.filter((caseId) => !found.has(caseId));
        if (missing.length > 0) {
            throw new Error(`Evaluation case not found: ${missing.join(", ")}`);
        }
    }
    const cells = [];
    for (const evaluationCase of selectedCases) {
        const { id: caseId, benchmarkProject } = evaluationCase;
        for (const agentId of agents) {
            for (const strategy of strategies) {
                for (const complexityLevel of complexityLevels) {
                    cells.push({
                        caseId,
                        benchmarkProject,
                        agentId,
                        strategy,
                        complexityLevel,
                        runId: buildExperimentRunId({ caseId, benchmarkProject, agentId, strategy, complexityLevel })
                    });
                }
            }
        }
    }
    if (typeof config.maxRuns !== "number") {
        return cells;
    }
    // slice() reads a negative end as "count from the back", so an unclamped
    // negative maxRuns would drop only the last few cells. Clamp it to zero
    // so a negative cap yields no runs.
    return cells.slice(0, Math.max(0, config.maxRuns));
}
|
|
50
|
+
/**
 * Builds the run id of one matrix cell.
 *
 * @param {object} args - Carries `caseId`, `benchmarkProject`, `agentId`, `strategy` and `complexityLevel`.
 * @returns {string} The five values, each passed through safeSegment, joined with ".".
 */
export function buildExperimentRunId(args) {
    const { caseId, benchmarkProject, agentId, strategy, complexityLevel } = args;
    const segments = [];
    for (const part of [caseId, benchmarkProject, agentId, strategy, complexityLevel]) {
        segments.push(safeSegment(part));
    }
    return segments.join(".");
}
|
|
53
|
+
/**
 * Rejects the first selection that is not allowed.
 *
 * Agents are checked first, then strategies, then complexity levels. Each
 * agent is checked for validity before the real-agent opt-in.
 *
 * @param {object} args - Carries `agents`, `strategies`, `complexityLevels` and `includeRealAgents`.
 * @throws {Error} On an unknown agent, strategy or complexity level, or when
 *   "codex" or "claude" is selected while `includeRealAgents` is falsy.
 */
function validateSelections(args) {
    const realAgentsAllowed = Boolean(args.includeRealAgents);
    for (const agent of args.agents) {
        const isKnownAgent = VALID_AGENTS.has(agent);
        if (!isKnownAgent) {
            throw new Error(`Invalid experiment agent: ${agent}`);
        }
        const isRealAgent = agent === "codex" || agent === "claude";
        if (isRealAgent && !realAgentsAllowed) {
            throw new Error(`Real agent ${agent} requires --include-real-agents.`);
        }
    }
    for (const strategy of args.strategies) {
        if (VALID_STRATEGIES.has(strategy)) {
            continue;
        }
        throw new Error(`Invalid experiment strategy: ${strategy}`);
    }
    for (const complexity of args.complexityLevels) {
        if (VALID_COMPLEXITIES.has(complexity)) {
            continue;
        }
        throw new Error(`Invalid prompt complexity level: ${complexity}`);
    }
}
|
|
73
|
+
/**
 * Replaces every run of characters other than ASCII letters, digits, ".",
 * "_" and "-" with a single "-".
 *
 * @param {string} value - Raw segment text.
 * @returns {string} The sanitized segment.
 */
function safeSegment(value) {
    const disallowedRun = /[^a-zA-Z0-9._-]+/g;
    const sanitized = value.replace(disallowedRun, "-");
    return sanitized;
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
 * Case-insensitive patterns; agent output matching any of them is classified
 * as having hit an external usage or session limit.
 */
const LIMIT_PATTERNS = [
    // Named limits.
    /usage\s+limit/i, /rate\s+limit/i,
    /quota/i,
    /session\s+limit/i, /limit\s+reached/i,
    // Throttling and exhaustion wording.
    /too\s+many\s+requests/i,
    /exhausted/i,
    /insufficient\s+quota/i,
    // Periodic limits.
    /daily\s+limit/i, /monthly\s+limit/i
];
|
|
13
|
+
/**
 * Maps a raw agent run result (plus the optional parsed answer) to an
 * experiment outcome status.
 *
 * The checks run in a fixed order and the first match wins: unavailable
 * agent, usage limit, timeout, failed, skipped, unparseable or partially
 * parseable answer, and finally completed. The limit and timeout checks look
 * at the final answer text together with all warnings and errors, whatever
 * the run status is.
 *
 * @param {object} args
 * @param {object} args.agentRunResult - Supplies `status`, `finalAnswerText`, `warnings` and `errors`.
 * @param {object} [args.parsedAnswer] - Optional; `warnings` and `parseStatus` are read from it.
 * @returns {{status: string, statusReason: string, warnings: Array, errors: Array}}
 *   The classification. `warnings` merges run and parse warnings; `errors` is a copy of the run errors.
 */
export function classifyAgentRunOutcome(args) {
    const { agentRunResult: result, parsedAnswer } = args;
    const warnings = [...result.warnings, ...(parsedAnswer?.warnings ?? [])];
    const errors = [...result.errors];
    // Every outcome carries the same warnings and errors; only status and reason vary.
    const outcome = (status, statusReason) => ({ status, statusReason, warnings, errors });
    const combinedText = [result.finalAnswerText, ...result.warnings, ...result.errors].join("\n");
    const wasSkipped = result.status === "skipped";
    if (wasSkipped && /not available|unavailable|not found/i.test(combinedText)) {
        return outcome("agent-unavailable", "Agent executable is unavailable.");
    }
    const hitLimit = LIMIT_PATTERNS.some((pattern) => pattern.test(combinedText));
    if (hitLimit) {
        return outcome("agent-limit-reached", "Agent output indicates an external usage or session limit.");
    }
    if (/timed out after|timeout|timed out/i.test(combinedText)) {
        return outcome("timeout", "Agent command timed out.");
    }
    if (result.status === "failed") {
        return outcome("failed", result.errors[0] ?? "Agent run failed.");
    }
    if (wasSkipped) {
        return outcome("skipped", result.warnings[0] ?? "Agent run was skipped.");
    }
    if (result.status === "completed") {
        const parseStatus = parsedAnswer?.parseStatus;
        if (parseStatus === "failed") {
            return outcome("invalid-output", "Agent completed but answer could not be parsed for scoring.");
        }
        if (parseStatus === "partial") {
            return outcome("invalid-output", "Agent completed but answer was only partially parseable for scoring.");
        }
    }
    return outcome("completed", "Agent completed and answer was scoreable.");
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
 * Pairs "raw-full-file" and "my-dev-kit-guided" runs and compares them.
 *
 * Runs are grouped by case id, benchmark project, agent id and prompt
 * complexity level. For each group the first run of each strategy is taken and
 * one comparison record is produced; groups containing neither strategy are
 * dropped. Deltas are computed as raw minus guided for tokens and duration,
 * and guided minus raw for correctness.
 *
 * @param {Iterable<object>} runs - Experiment runs. Each run is read for
 *   `caseId`, `benchmarkProject`, `agentId`, `promptComplexityLevel`,
 *   `promptStrategy`, `runId`, `status`, `durationMs`,
 *   `tokenUsage.totalTokens`, `correctness.correctnessScore` and
 *   `correctness.passed`.
 * @returns {object[]} One comparison record per group, in first-seen group order.
 */
export function compareExperimentRuns(runs) {
    const groups = new Map();
    for (const run of runs) {
        const key = [run.caseId, run.benchmarkProject, run.agentId, run.promptComplexityLevel].join("|");
        // Append in place; rebuilding the bucket array for every run is quadratic per group.
        const bucket = groups.get(key);
        if (bucket) {
            bucket.push(run);
        } else {
            groups.set(key, [run]);
        }
    }
    const comparisons = [];
    for (const groupRuns of groups.values()) {
        const raw = groupRuns.find((run) => run.promptStrategy === "raw-full-file");
        const guided = groupRuns.find((run) => run.promptStrategy === "my-dev-kit-guided");
        // Identity fields come from whichever side exists, preferring the raw run.
        const anchor = raw ?? guided;
        if (!anchor) {
            continue;
        }
        const warnings = [];
        if (!raw || !guided) {
            warnings.push("Comparison is missing one paired strategy run.");
        }
        const rawTotalTokens = raw?.tokenUsage.totalTokens;
        const myDevKitTotalTokens = guided?.tokenUsage.totalTokens;
        const tokenComparisonAvailable = typeof rawTotalTokens === "number" && typeof myDevKitTotalTokens === "number";
        if (!tokenComparisonAvailable) {
            warnings.push("Token comparison unavailable because one or both runs lack total token usage.");
        }
        const tokenDelta = tokenComparisonAvailable ? rawTotalTokens - myDevKitTotalTokens : undefined;
        // Percentages are left undefined when the raw baseline is 0 to avoid dividing by zero.
        const tokenSavingsPercent = tokenComparisonAvailable && rawTotalTokens !== 0 && typeof tokenDelta === "number" ? round((tokenDelta / rawTotalTokens) * 100) : undefined;
        const durationDeltaMs = typeof raw?.durationMs === "number" && typeof guided?.durationMs === "number" ? raw.durationMs - guided.durationMs : undefined;
        const durationReductionPercent = typeof durationDeltaMs === "number" && raw && raw.durationMs !== 0 ? round((durationDeltaMs / raw.durationMs) * 100) : undefined;
        const correctnessDelta = raw && guided ? round(guided.correctness.correctnessScore - raw.correctness.correctnessScore) : undefined;
        comparisons.push({
            comparisonId: [anchor.caseId, anchor.benchmarkProject, anchor.agentId, anchor.promptComplexityLevel].join("."),
            caseId: anchor.caseId,
            benchmarkProject: anchor.benchmarkProject,
            agentId: anchor.agentId,
            complexityLevel: anchor.promptComplexityLevel,
            rawRunId: raw?.runId,
            myDevKitRunId: guided?.runId,
            rawStatus: raw?.status,
            myDevKitStatus: guided?.status,
            rawCorrectnessScore: raw?.correctness.correctnessScore,
            myDevKitCorrectnessScore: guided?.correctness.correctnessScore,
            sameCorrectnessPass: Boolean(raw && guided && raw.correctness.passed === guided.correctness.passed),
            correctnessDelta,
            rawDurationMs: raw?.durationMs,
            myDevKitDurationMs: guided?.durationMs,
            durationDeltaMs,
            durationReductionPercent,
            rawTotalTokens,
            myDevKitTotalTokens,
            tokenDelta,
            tokenSavingsPercent,
            tokenComparisonAvailable,
            reliabilityLabel: labelReliability(raw, guided, tokenComparisonAvailable),
            warnings
        });
    }
    return comparisons;
}
|
|
59
|
+
/**
 * Rounds a number to four decimal places.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` scaled by 10000, rounded with `Math.round`, and scaled back.
 */
function round(value) {
    const scale = 10000;
    const scaled = Math.round(value * scale);
    return scaled / scale;
}
|
|
62
|
+
/**
 * Labels how trustworthy a raw-vs-guided comparison is.
 *
 * Checks are applied in order, first match wins:
 *   - either run missing                          -> "partial"
 *   - either status "agent-limit-reached"         -> "limit-reached"
 *   - either status "agent-unavailable"           -> "unavailable"
 *   - both statuses "completed"                   -> "strong" when token totals
 *     are comparable, otherwise "correctness-only"
 *   - either status "failed" or "invalid-output"  -> "failed"
 *   - anything else                               -> "partial"
 *
 * @param {object|undefined} raw - Raw-strategy run; only `status` is read.
 * @param {object|undefined} guided - Guided-strategy run; only `status` is read.
 * @param {boolean} tokenComparisonAvailable - Whether both runs report total tokens.
 * @returns {string} The reliability label.
 */
function labelReliability(raw, guided, tokenComparisonAvailable) {
    if (!(raw && guided)) {
        return "partial";
    }
    const statuses = [raw.status, guided.status];
    const eitherIs = (status) => statuses.includes(status);

    if (eitherIs("agent-limit-reached")) {
        return "limit-reached";
    }
    if (eitherIs("agent-unavailable")) {
        return "unavailable";
    }
    if (statuses.every((status) => status === "completed")) {
        if (tokenComparisonAvailable) {
            return "strong";
        }
        return "correctness-only";
    }
    if (eitherIs("failed") || eitherIs("invalid-output")) {
        return "failed";
    }
    return "partial";
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { tokenCountMethod } from "../core/countTokens.js";
|
|
2
|
+
/**
 * Compares raw full-file baselines against my-dev-kit retrieval, case by case.
 *
 * Each evaluation yields one case record with character/token counts, tokens
 * saved (raw minus my-dev-kit) and the percentage saved relative to the raw
 * baseline (0 when the raw baseline has 0 tokens). The summary averages and
 * token totals cover only cases that were not skipped; command and duration
 * totals cover every case.
 *
 * @param {Array<{evaluationCase: object, rawBaseline: object, myDevKit: object}>} evaluations
 *   Evaluation results to compare.
 * @returns {{cases: object[], summary: object}} Per-case records and the aggregate summary.
 */
export function compareTokenSavings(evaluations) {
    const toCaseResult = ({ evaluationCase, rawBaseline, myDevKit }) => {
        const rawTokens = rawBaseline.totalEstimatedTokens;
        const tokensSaved = rawTokens - myDevKit.totalEstimatedTokens;
        let percentSaved = 0;
        if (rawTokens !== 0) {
            percentSaved = (tokensSaved / rawTokens) * 100;
        }
        return {
            caseId: evaluationCase.id,
            title: evaluationCase.title,
            benchmarkProject: evaluationCase.benchmarkProject,
            rawChars: rawBaseline.totalChars,
            rawEstimatedTokens: rawBaseline.totalEstimatedTokens,
            myDevKitChars: myDevKit.totalChars,
            myDevKitEstimatedTokens: myDevKit.totalEstimatedTokens,
            tokensSaved,
            percentSaved,
            filesReadRaw: rawBaseline.totalFiles,
            filesReadMyDevKit: myDevKit.filesRead.length,
            commandsRun: myDevKit.commands.length,
            durationMsRaw: rawBaseline.durationMs,
            durationMsMyDevKit: myDevKit.durationMs,
            skipped: myDevKit.skipped,
            warnings: [...myDevKit.warnings]
        };
    };

    const cases = evaluations.map(toCaseResult);
    const completed = cases.filter((entry) => !entry.skipped);

    // Sum / mean of a per-case number over the non-skipped cases only.
    const sumCompleted = (pick) => {
        let total = 0;
        for (const entry of completed) {
            total = total + pick(entry);
        }
        return total;
    };
    const meanCompleted = (pick) => {
        if (completed.length === 0) {
            return 0;
        }
        return sumCompleted(pick) / completed.length;
    };

    // Command and duration totals deliberately include skipped cases.
    let totalCommandsRun = 0;
    let totalDurationMs = 0;
    const allWarnings = [];
    for (const entry of cases) {
        totalCommandsRun = totalCommandsRun + entry.commandsRun;
        totalDurationMs = totalDurationMs + entry.durationMsRaw + entry.durationMsMyDevKit;
        allWarnings.push(...entry.warnings);
    }

    return {
        cases,
        summary: {
            caseCount: cases.length,
            completedCaseCount: completed.length,
            skippedCaseCount: cases.length - completed.length,
            averageRawTokens: meanCompleted((entry) => entry.rawEstimatedTokens),
            averageMyDevKitTokens: meanCompleted((entry) => entry.myDevKitEstimatedTokens),
            averageTokensSaved: meanCompleted((entry) => entry.tokensSaved),
            averagePercentSaved: meanCompleted((entry) => entry.percentSaved),
            totalRawTokens: sumCompleted((entry) => entry.rawEstimatedTokens),
            totalMyDevKitTokens: sumCompleted((entry) => entry.myDevKitEstimatedTokens),
            totalTokensSaved: sumCompleted((entry) => entry.tokensSaved),
            totalCommandsRun,
            totalDurationMs,
            tokenCountMethod,
            warnings: allWarnings
        }
    };
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
export * from "./types.js";
|
|
2
|
+
export * from "./readEvaluationCases.js";
|
|
3
|
+
export * from "./runRawFullFileBaseline.js";
|
|
4
|
+
export * from "./runMyDevKitRetrieval.js";
|
|
5
|
+
export * from "./compareTokenSavings.js";
|
|
6
|
+
export * from "./writeTokenSavingsArtifacts.js";
|
|
7
|
+
export * from "./renderTokenSavingsReportInput.js";
|
|
8
|
+
export * from "./projectFileTree.js";
|
|
9
|
+
export * from "./projectComplexity.js";
|
|
10
|
+
export * from "./benchmarkMetadata.js";
|
|
11
|
+
export * from "./controlledExperimentTypes.js";
|
|
12
|
+
export * from "./buildExperimentMatrix.js";
|
|
13
|
+
export * from "./classifyAgentRunOutcome.js";
|
|
14
|
+
export * from "./parseAgentAnswer.js";
|
|
15
|
+
export * from "./scoreCorrectness.js";
|
|
16
|
+
export * from "./compareExperimentRuns.js";
|
|
17
|
+
export * from "./runControlledExperiment.js";
|
|
18
|
+
export * from "./writeExperimentArtifacts.js";
|