@dailephd/my-dev-kit-lab 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -0
- package/benchmarks/contracts/benchmark-project-profiles.json +1199 -0
- package/benchmarks/contracts/todo-behavior.md +70 -0
- package/benchmarks/contracts/todo-benchmark-case.json +227 -0
- package/benchmarks/projects/README.md +34 -0
- package/benchmarks/projects/task-analytics-large-mixed/README.md +1 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/__init__.py +3 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/fixtures.py +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/metrics.py +29 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/models.py +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/parser.py +16 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/pipeline.py +9 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/quality.py +8 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/reporting.py +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_metrics.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_parser.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_quality.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_reporting.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/package.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/index.ts +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/analyticsSnapshot.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/project.ts +5 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/task.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/buildProjectLeaderboard.ts +7 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/formatTaskHealthReport.ts +13 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/buildAnalyticsSnapshot.ts +39 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/completeTask.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/createTask.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/listTasksByProject.ts +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/projectStore.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/taskStore.ts +44 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/projectValidation.ts +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/taskValidation.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/buildAnalyticsSnapshot.test.ts +48 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/completeTask.test.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/createTask.test.ts +31 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/listTasksByProject.test.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/reporting.test.ts +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/vitest.config.ts +5 -0
- package/benchmarks/projects/task-workflow-medium-ts/README.md +1 -0
- package/benchmarks/projects/task-workflow-medium-ts/package.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/index.ts +9 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/project.ts +6 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/task.ts +39 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/completeTask.ts +15 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/createTask.ts +26 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/filterTasks.ts +17 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/importTasks.ts +33 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/summarizeTasks.ts +30 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/store/taskStore.ts +76 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/utils/deterministicId.ts +3 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/validation/taskValidation.ts +45 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/completeTask.test.ts +16 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/createTask.test.ts +21 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/filterTasks.test.ts +18 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/importTasks.test.ts +22 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/summarizeTasks.test.ts +29 -0
- package/benchmarks/projects/task-workflow-medium-ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-js/README.md +3 -0
- package/benchmarks/projects/todo-js/package.json +11 -0
- package/benchmarks/projects/todo-js/src/index.js +2 -0
- package/benchmarks/projects/todo-js/src/taskService.js +37 -0
- package/benchmarks/projects/todo-js/src/taskStore.js +28 -0
- package/benchmarks/projects/todo-js/tests/taskService.test.js +45 -0
- package/benchmarks/projects/todo-js/vitest.config.js +5 -0
- package/benchmarks/projects/todo-mixed-ts-py/README.md +3 -0
- package/benchmarks/projects/todo-mixed-ts-py/package.json +13 -0
- package/benchmarks/projects/todo-mixed-ts-py/python/task_service.py +76 -0
- package/benchmarks/projects/todo-mixed-ts-py/src/taskCli.ts +38 -0
- package/benchmarks/projects/todo-mixed-ts-py/tests/mixedBoundary.test.ts +18 -0
- package/benchmarks/projects/todo-mixed-ts-py/tsconfig.json +12 -0
- package/benchmarks/projects/todo-mixed-ts-py/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-python/README.md +3 -0
- package/benchmarks/projects/todo-python/src/__init__.py +4 -0
- package/benchmarks/projects/todo-python/src/task_service.py +32 -0
- package/benchmarks/projects/todo-python/src/task_store.py +28 -0
- package/benchmarks/projects/todo-python/tests/test_task_service.py +52 -0
- package/benchmarks/projects/todo-ts/README.md +3 -0
- package/benchmarks/projects/todo-ts/package.json +12 -0
- package/benchmarks/projects/todo-ts/src/index.ts +2 -0
- package/benchmarks/projects/todo-ts/src/taskService.ts +41 -0
- package/benchmarks/projects/todo-ts/src/taskStore.ts +34 -0
- package/benchmarks/projects/todo-ts/tests/taskService.test.ts +45 -0
- package/benchmarks/projects/todo-ts/tsconfig.json +12 -0
- package/benchmarks/projects/todo-ts/vitest.config.ts +5 -0
- package/dist/scripts/build-gallery.js +3 -0
- package/dist/scripts/capture-demo-report.js +3 -0
- package/dist/scripts/evaluate-token-savings.js +2 -0
- package/dist/scripts/experiments/describeExperiment.js +143 -0
- package/dist/scripts/experiments/listExperiments.js +44 -0
- package/dist/scripts/experiments/runExperiment.js +199 -0
- package/dist/scripts/generate-experiment-plots.js +3 -0
- package/dist/scripts/generate-prompt-variants.js +2 -0
- package/dist/scripts/render-experiment-report.js +2 -0
- package/dist/scripts/run-agent-prompt.js +2 -0
- package/dist/scripts/run-controlled-experiment.js +2 -0
- package/dist/scripts/run-final-demo.js +3 -0
- package/dist/scripts/run-lab-demo.js +5 -0
- package/dist/scripts/run-visualization-demos.js +3 -0
- package/dist/scripts/security/runCodeql.js +57 -0
- package/dist/scripts/security/runDependencyChecks.js +57 -0
- package/dist/scripts/security/runFuzzSmoke.js +29 -0
- package/dist/scripts/security/runPackageChecks.js +56 -0
- package/dist/scripts/security/runSemgrep.js +63 -0
- package/dist/scripts/security/validate.js +117 -0
- package/dist/scripts/verify-benchmarks.js +202 -0
- package/dist/src/agents/adapters/claudeAdapter.js +37 -0
- package/dist/src/agents/adapters/codexAdapter.js +110 -0
- package/dist/src/agents/adapters/fakeAgentAdapter.js +101 -0
- package/dist/src/agents/agentRegistry.js +21 -0
- package/dist/src/agents/index.js +7 -0
- package/dist/src/agents/parseAgentTokenUsage.js +137 -0
- package/dist/src/agents/runAgentPrompt.js +38 -0
- package/dist/src/agents/types.js +1 -0
- package/dist/src/commands/buildGalleryCommand.js +56 -0
- package/dist/src/commands/captureDemoReport.js +116 -0
- package/dist/src/commands/evaluateTokenSavings.js +175 -0
- package/dist/src/commands/generateExperimentPlotsCommand.js +38 -0
- package/dist/src/commands/generatePromptVariants.js +67 -0
- package/dist/src/commands/renderExperimentReportCommand.js +131 -0
- package/dist/src/commands/runAgentPromptCommand.js +132 -0
- package/dist/src/commands/runControlledExperimentCommand.js +174 -0
- package/dist/src/commands/runFinalDemoCommand.js +123 -0
- package/dist/src/commands/runLabDemo.js +62 -0
- package/dist/src/commands/runVisualizationDemosCommand.js +67 -0
- package/dist/src/core/commandLine.js +59 -0
- package/dist/src/core/countTokens.js +8 -0
- package/dist/src/core/fileGlobs.js +100 -0
- package/dist/src/core/localProjectTarget.js +75 -0
- package/dist/src/core/pathSafety.js +19 -0
- package/dist/src/core/pythonCommand.js +30 -0
- package/dist/src/core/resolveCommand.js +110 -0
- package/dist/src/core/runMeasuredCommand.js +143 -0
- package/dist/src/evaluation/benchmarkMetadata.js +207 -0
- package/dist/src/evaluation/buildExperimentMatrix.js +75 -0
- package/dist/src/evaluation/classifyAgentRunOutcome.js +40 -0
- package/dist/src/evaluation/compareExperimentRuns.js +79 -0
- package/dist/src/evaluation/compareTokenSavings.js +47 -0
- package/dist/src/evaluation/controlledExperimentTypes.js +1 -0
- package/dist/src/evaluation/index.js +18 -0
- package/dist/src/evaluation/parseAgentAnswer.js +230 -0
- package/dist/src/evaluation/projectComplexity.js +126 -0
- package/dist/src/evaluation/projectFileTree.js +83 -0
- package/dist/src/evaluation/readEvaluationCases.js +59 -0
- package/dist/src/evaluation/renderTokenSavingsReportInput.js +55 -0
- package/dist/src/evaluation/runControlledExperiment.js +158 -0
- package/dist/src/evaluation/runMyDevKitRetrieval.js +197 -0
- package/dist/src/evaluation/runRawFullFileBaseline.js +31 -0
- package/dist/src/evaluation/scoreCorrectness.js +127 -0
- package/dist/src/evaluation/types.js +1 -0
- package/dist/src/evaluation/writeExperimentArtifacts.js +104 -0
- package/dist/src/evaluation/writeTokenSavingsArtifacts.js +57 -0
- package/dist/src/experiments/config.js +24 -0
- package/dist/src/experiments/defaultRegistry.js +7 -0
- package/dist/src/experiments/errors.js +18 -0
- package/dist/src/experiments/index.js +9 -0
- package/dist/src/experiments/outputPaths.js +25 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/config.js +37 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/index.js +3 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/plugin.js +83 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/resultMapping.js +260 -0
- package/dist/src/experiments/plugins/index.js +1 -0
- package/dist/src/experiments/registry.js +43 -0
- package/dist/src/experiments/results.js +48 -0
- package/dist/src/experiments/runner.js +181 -0
- package/dist/src/experiments/target.js +8 -0
- package/dist/src/experiments/types.js +1 -0
- package/dist/src/gallery/index.js +2 -0
- package/dist/src/gallery/types.js +1 -0
- package/dist/src/gallery/writeGalleryManifest.js +214 -0
- package/dist/src/index.js +12 -0
- package/dist/src/plots/buildExperimentPlotData.js +137 -0
- package/dist/src/plots/index.js +4 -0
- package/dist/src/plots/renderSvgChart.js +82 -0
- package/dist/src/plots/types.js +1 -0
- package/dist/src/plots/writePlotArtifacts.js +46 -0
- package/dist/src/prompts/buildPromptContext.js +68 -0
- package/dist/src/prompts/generateMyDevKitPrompt.js +106 -0
- package/dist/src/prompts/generatePromptVariants.js +36 -0
- package/dist/src/prompts/generateRawFullFilePrompt.js +97 -0
- package/dist/src/prompts/index.js +7 -0
- package/dist/src/prompts/measurePromptComplexity.js +41 -0
- package/dist/src/prompts/types.js +1 -0
- package/dist/src/prompts/writePromptArtifacts.js +43 -0
- package/dist/src/report/buildExperimentReportInput.js +339 -0
- package/dist/src/report/experimentReportTypes.js +1 -0
- package/dist/src/report/experiments/buildPluginExperimentReport.js +153 -0
- package/dist/src/report/experiments/experimentReportModel.js +1 -0
- package/dist/src/report/experiments/index.js +4 -0
- package/dist/src/report/experiments/renderPluginExperimentReportHtml.js +133 -0
- package/dist/src/report/experiments/writePluginExperimentReports.js +30 -0
- package/dist/src/report/index.js +8 -0
- package/dist/src/report/renderExperimentHtmlReport.js +354 -0
- package/dist/src/report/renderHtmlReport.js +103 -0
- package/dist/src/report/types.js +10 -0
- package/dist/src/report/writeExperimentReportArtifacts.js +38 -0
- package/dist/src/report/writeReportArtifacts.js +39 -0
- package/dist/src/screenshot/captureReportScreenshot.js +75 -0
- package/dist/src/screenshot/index.js +2 -0
- package/dist/src/screenshot/types.js +1 -0
- package/dist/src/securityValidation/artifacts.js +15 -0
- package/dist/src/securityValidation/cliAdversarial/adversarialCliConfig.js +38 -0
- package/dist/src/securityValidation/cliAdversarial/dataVolumeChecks.js +194 -0
- package/dist/src/securityValidation/cliAdversarial/jsonStdoutChecks.js +359 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactChecks.js +284 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactFixtures.js +79 -0
- package/dist/src/securityValidation/cliAdversarial/pathBoundaryChecks.js +431 -0
- package/dist/src/securityValidation/cliAdversarial/pathCases.js +144 -0
- package/dist/src/securityValidation/cliAdversarial/readOnlyBoundaryChecks.js +294 -0
- package/dist/src/securityValidation/cliAdversarial/runAdversarialCheck.js +149 -0
- package/dist/src/securityValidation/cliAdversarial/subprocessSafetyChecks.js +214 -0
- package/dist/src/securityValidation/cliAdversarial/tempWorkspace.js +160 -0
- package/dist/src/securityValidation/commandRunner.js +136 -0
- package/dist/src/securityValidation/config.js +39 -0
- package/dist/src/securityValidation/dependencies/parseNpmAudit.js +115 -0
- package/dist/src/securityValidation/dependencies/parseNpmLs.js +71 -0
- package/dist/src/securityValidation/dependencies/parseNpmOutdated.js +41 -0
- package/dist/src/securityValidation/dependencies/runDependencyChecks.js +239 -0
- package/dist/src/securityValidation/dependencies/runOsvScanner.js +43 -0
- package/dist/src/securityValidation/fuzz/fuzzHarness.js +61 -0
- package/dist/src/securityValidation/fuzz/fuzzTargets.js +204 -0
- package/dist/src/securityValidation/fuzz/randomInput.js +0 -0
- package/dist/src/securityValidation/index.js +34 -0
- package/dist/src/securityValidation/packageChecks/forbiddenPackageContents.js +67 -0
- package/dist/src/securityValidation/packageChecks/parseNpmPackDryRun.js +56 -0
- package/dist/src/securityValidation/packageChecks/runPackageChecks.js +88 -0
- package/dist/src/securityValidation/report/renderSecurityReport.js +248 -0
- package/dist/src/securityValidation/report/securityReportTypes.js +1 -0
- package/dist/src/securityValidation/staticScans/codeql.js +66 -0
- package/dist/src/securityValidation/staticScans/semgrep.js +180 -0
- package/dist/src/securityValidation/testMatrix.js +535 -0
- package/dist/src/securityValidation/types.js +34 -0
- package/dist/src/securityValidation/validate/resolveTarget.js +32 -0
- package/dist/src/securityValidation/validate/runSecurityValidation.js +169 -0
- package/dist/src/securityValidation/validate/verdict.js +73 -0
- package/dist/src/visualizationDemos/buildMyDevKitVisualizationCommands.js +59 -0
- package/dist/src/visualizationDemos/index.js +4 -0
- package/dist/src/visualizationDemos/runVisualizationDemos.js +82 -0
- package/dist/src/visualizationDemos/types.js +1 -0
- package/dist/src/visualizationDemos/writeVisualizationDemoArtifacts.js +25 -0
- package/docs/METRICS.md +286 -0
- package/examples/demo-report-input.json +78 -0
- package/examples/lab-demo-cases.json +35 -0
- package/examples/real-agent-campaign-cases.json +118 -0
- package/examples/token-savings-cases.json +122 -0
- package/package.json +91 -0
- package/tests/fixtures/fake-adversarial-cli.js +152 -0
- package/tests/fixtures/fake-my-dev-kit-cli.js +83 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { runPackageChecks } from "../../src/securityValidation/index.js";
|
|
4
|
+
import { DEFAULT_SECURITY_CONFIG } from "../../src/securityValidation/index.js";
|
|
5
|
+
import { resolveValidationTarget } from "../../src/securityValidation/validate/resolveTarget.js";
|
|
6
|
+
// Entry point for the standalone package-content check.
// Flow: read CLI options -> resolve the validation target -> run the checks
// -> print a summary -> set the process exit code from the finding severities.
const cliOptions = parseArgs(process.argv.slice(2));
const toolRoot = process.cwd();

// Resolve the target before doing any work so a bad --target fails immediately.
let targetRoot;
try {
    const resolved = resolveValidationTarget(cliOptions.target, toolRoot);
    targetRoot = resolved.targetRoot;
    // The target line is only printed when validating something other than the tool itself.
    if (!resolved.isSelf) {
        console.log(`Target: ${targetRoot}`);
    }
}
catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`ERROR: ${message}`);
    process.exitCode = 1;
    process.exit(1);
}

// Default configuration with both output directories anchored at the tool root.
const config = {
    ...DEFAULT_SECURITY_CONFIG,
    reportDir: path.join(toolRoot, DEFAULT_SECURITY_CONFIG.reportDir),
    rawOutputDir: path.join(toolRoot, DEFAULT_SECURITY_CONFIG.rawOutputDir),
};

console.log("Running package content checks...");
console.log(`Report directory: ${config.reportDir}`);

const output = await runPackageChecks({ cwd: targetRoot, config });

// Number of checks that finished with the given status.
const countChecks = (status) => output.checks.filter((check) => check.status === status).length;
const passed = countChecks("passed");
const failed = countChecks("failed");
const warned = countChecks("warning");

console.log(`\nPackage checks complete:`);
console.log(` Passed: ${passed}`);
console.log(` Warned: ${warned}`);
console.log(` Failed: ${failed}`);
console.log(` Findings: ${output.findings.length}`);

if (output.findings.length > 0) {
    console.log("\nFindings:");
    for (const finding of output.findings) {
        console.log(` [${finding.severity.toUpperCase()}] ${finding.title}`);
    }
}

console.log(`\nResults written to ${config.reportDir}`);

// Only "blocker" and "major" findings produce a non-zero exit code.
const isReleaseBlocking = output.findings.some((finding) => finding.severity === "blocker" || finding.severity === "major");
process.exitCode = isReleaseBlocking ? 1 : 0;
|
|
48
|
+
/**
 * Parse the command-line options understood by this script.
 *
 * Accepted spellings: `--target <path>`, `-t <path>` and `--target=<path>`.
 * Unrecognised arguments are ignored, as is a trailing `--target` / `-t`
 * that has no value after it. If the option is repeated, the last one wins.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {{ target?: string }} Parsed options; `target` is absent when not supplied.
 */
function parseArgs(argv) {
    const inlinePrefix = "--target=";
    const result = {};
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (arg.startsWith(inlinePrefix)) {
            // Inline form: everything after the "=" is the value.
            result.target = arg.slice(inlinePrefix.length);
        }
        else if ((arg === "--target" || arg === "-t") && i + 1 < argv.length) {
            // Separated form: consume the following token as the value.
            result.target = argv[++i];
        }
    }
    return result;
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { runSemgrepCheck } from "../../src/securityValidation/staticScans/semgrep.js";
|
|
4
|
+
import { resolveValidationTarget } from "../../src/securityValidation/validate/resolveTarget.js";
|
|
5
|
+
// Entry point for the standalone Semgrep check.
// Flow: read CLI options -> resolve the validation target -> run Semgrep
// -> print status and findings -> set the process exit code.
const cliOptions = parseArgs(process.argv.slice(2));
const toolRoot = process.cwd();

// Resolve the target before doing any work so a bad --target fails immediately.
let targetRoot;
try {
    const resolved = resolveValidationTarget(cliOptions.target, toolRoot);
    targetRoot = resolved.targetRoot;
    // The target line is only printed when validating something other than the tool itself.
    if (!resolved.isSelf) {
        console.log(`Target: ${targetRoot}`);
    }
}
catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`ERROR: ${message}`);
    process.exitCode = 1;
    process.exit(1);
}

// The rule file is always taken from the tool root, not from the target.
const semgrepConfigPath = path.join(toolRoot, ".semgrep.yml");

console.log("Running Semgrep static analysis check...");
console.log(`Config: ${semgrepConfigPath}`);

const result = await runSemgrepCheck({
    targetRoot,
    toolRoot,
    configPath: semgrepConfigPath,
    timeoutMs: 120_000,
});

const wasSkipped = result.status === "skipped";

// A skipped run shows its reason; any other status is shown upper-cased.
let label;
if (wasSkipped) {
    label = `SKIPPED — ${result.skippedReason ?? "tool unavailable"}`;
}
else {
    label = result.status.toUpperCase();
}
console.log(`\nStatus: ${label}`);

if (result.findings.length > 0) {
    console.log("\nFindings:");
    for (const finding of result.findings) {
        console.log(` [${finding.severity.toUpperCase()}] ${finding.title}`);
        // Only the first affected file is shown.
        if (finding.affectedFiles && finding.affectedFiles.length > 0) {
            console.log(` Location: ${finding.affectedFiles[0]}`);
        }
        // Descriptions are cut to 120 characters for console output.
        if (finding.description) {
            console.log(` ${finding.description.slice(0, 120)}`);
        }
    }
}

console.log(`\nDuration: ${result.durationMs}ms`);

if (wasSkipped) {
    console.log("\nSemgrep is optional. Absence does not block release.");
}
// Only an explicit "failed" status yields a non-zero exit code.
process.exitCode = result.status === "failed" ? 1 : 0;
|
|
55
|
+
/**
 * Parse the command-line options understood by this script.
 *
 * Accepted spellings: `--target <path>`, `-t <path>` and `--target=<path>`.
 * Unrecognised arguments are ignored, as is a trailing `--target` / `-t`
 * that has no value after it. If the option is repeated, the last one wins.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {{ target?: string }} Parsed options; `target` is absent when not supplied.
 */
function parseArgs(argv) {
    const inlinePrefix = "--target=";
    const result = {};
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (arg.startsWith(inlinePrefix)) {
            // Inline form: everything after the "=" is the value.
            result.target = arg.slice(inlinePrefix.length);
        }
        else if ((arg === "--target" || arg === "-t") && i + 1 < argv.length) {
            // Separated form: consume the following token as the value.
            result.target = argv[++i];
        }
    }
    return result;
}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import fs from "node:fs";
|
|
4
|
+
import { runSecurityValidation } from "../../src/securityValidation/validate/runSecurityValidation.js";
|
|
5
|
+
import { resolveValidationTarget, reportFilenamePrefix } from "../../src/securityValidation/validate/resolveTarget.js";
|
|
6
|
+
import { renderTextReport, renderJsonReport } from "../../src/securityValidation/report/renderSecurityReport.js";
|
|
7
|
+
// Entry point for the full security validation run.
// Flow: read CLI options -> resolve the target -> run the validation
// -> render and write text/JSON reports -> set the exit code from the verdict.

// Parse CLI arguments from process.argv (after the node/tsx and script path).
const rawArgs = process.argv.slice(2);
const args = parseArgs(rawArgs);
const toolRoot = process.cwd();

/**
 * Read an integer setting from an environment variable.
 *
 * An unset or blank variable yields `fallback`. A value that `parseInt`
 * cannot read in the given radix also yields `fallback`, with a warning on
 * stderr, so that NaN is never handed to the validation run.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, blank, or invalid.
 * @param {number} radix - Radix passed to `parseInt`.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readIntegerEnv(name, fallback, radix) {
    const raw = process.env[name];
    if (raw === undefined || raw.trim() === "") {
        return fallback;
    }
    const parsed = Number.parseInt(raw, radix);
    if (Number.isNaN(parsed)) {
        console.warn(`WARNING: ignoring invalid ${name}="${raw}"; using default ${fallback}.`);
        return fallback;
    }
    return parsed;
}

// Resolve and validate target early so we can fail fast with a clean error.
let target;
try {
    target = resolveValidationTarget(args.target, toolRoot);
}
catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    console.error(`\nERROR: ${msg}`);
    console.error("Usage: npm run security:validate -- [--target <path>] [--out <dir>] [--report-prefix <name>]");
    process.exit(1);
}

console.log("=".repeat(60));
console.log("my-dev-kit-lab security:validate");
console.log("=".repeat(60));
console.log(`Tool root : ${toolRoot}`);
if (!target.isSelf) {
    console.log(`Target : ${target.targetRoot}`);
    if (target.packageName) {
        console.log(`Package : ${target.packageName}${target.packageVersion ? `@${target.packageVersion}` : ""}`);
    }
}
else {
    console.log(`Mode : self-validation`);
}
console.log("");

const summary = await runSecurityValidation({
    cwd: toolRoot,
    targetPath: args.target,
    // FUZZ_ITERATIONS is read as decimal; the default is 50.
    fuzzIterations: readIntegerEnv("FUZZ_ITERATIONS", 50, 10),
    // FUZZ_SEED is read as hexadecimal (an optional "0x" prefix is accepted); the default is 0xDEADBEEF.
    fuzzSeed: readIntegerEnv("FUZZ_SEED", 0xdeadbeef, 16),
});

// Build report object
const report = {
    metadata: {
        toolRoot: summary.toolRoot,
        toolPackageName: summary.toolPackageName,
        toolPackageVersion: summary.toolPackageVersion,
        targetRoot: summary.targetRoot,
        targetDescription: summary.targetDescription,
        packageName: summary.packageName,
        packageVersion: summary.packageVersion,
        branch: summary.auditedBranch,
        commit: summary.auditedCommit,
        isSelf: summary.isSelf,
        generatedAt: summary.finishedAt,
        // Wall-clock duration derived from the run's start and finish timestamps.
        totalDurationMs: new Date(summary.finishedAt).getTime() - new Date(summary.startedAt).getTime(),
    },
    sections: [],
    allChecks: summary.checks,
    allFindings: summary.findings,
    verdict: summary.verdict,
    recommendedNextStep: summary.recommendedNextStep,
};
const textReport = renderTextReport(report);
const jsonReport = renderJsonReport(report);

// Determine output directory: --out (resolved against the cwd) or <toolRoot>/reports/security.
const reportsDir = args.out
    ? path.resolve(args.out)
    : path.join(toolRoot, "reports", "security");
// With `recursive: true`, mkdirSync does not fail when the directory already exists.
fs.mkdirSync(reportsDir, { recursive: true });

// Determine report filename prefix
const prefix = args.reportPrefix ?? reportFilenamePrefix(target);
const txtPath = path.join(reportsDir, `${prefix}-security-validation.txt`);
const jsonPath = path.join(reportsDir, `${prefix}-security-validation.json`);
fs.writeFileSync(txtPath, textReport, "utf8");
fs.writeFileSync(jsonPath, jsonReport, "utf8");

// Print report to stdout
console.log(textReport);
console.log(`\nReports written:`);
console.log(` ${txtPath}`);
console.log(` ${jsonPath}`);

// Exit code based on verdict: 1 = blocker remains, 2 = audit environment incomplete, 0 = otherwise.
const blockerExists = summary.verdict === "not-ready-security-blocker-remains";
const inconclusive = summary.verdict === "inconclusive-audit-environment-incomplete";
if (blockerExists) {
    console.error("\nExit 1 — security blocker remains.");
    process.exitCode = 1;
}
else if (inconclusive) {
    console.warn("\nExit 2 — audit environment incomplete.");
    process.exitCode = 2;
}
else {
    console.log("\nExit 0 — validation completed.");
    process.exitCode = 0;
}
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
// Argument parser
|
|
101
|
+
// ---------------------------------------------------------------------------
|
|
102
|
+
/**
 * Parse the command-line options understood by this script.
 *
 * Recognised options:
 *   --target <path> | -t <path> | --target=<path>   -> result.target
 *   --out <dir>     | --out=<dir>                    -> result.out
 *   --report-prefix <name> | --report-prefix=<name>  -> result.reportPrefix
 *
 * Unrecognised arguments are ignored, as is a trailing option that has no
 * value after it. If an option is repeated, the last occurrence wins.
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {{ target?: string, out?: string, reportPrefix?: string }} Parsed options.
 */
function parseArgs(argv) {
    // Maps every accepted flag spelling to the result property it fills.
    const optionKeys = new Map([
        ["--target", "target"],
        ["-t", "target"],
        ["--out", "out"],
        ["--report-prefix", "reportPrefix"],
    ]);
    const result = {};
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        // Inline `--name=value` form (long options only).
        const equalsAt = arg.startsWith("--") ? arg.indexOf("=") : -1;
        if (equalsAt !== -1) {
            const inlineKey = optionKeys.get(arg.slice(0, equalsAt));
            if (inlineKey !== undefined) {
                result[inlineKey] = arg.slice(equalsAt + 1);
            }
            continue;
        }
        // Separated `--name value` form: consume the following token as the value.
        const key = optionKeys.get(arg);
        if (key !== undefined && i + 1 < argv.length) {
            result[key] = argv[++i];
        }
    }
    return result;
}
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
import { REQUIRED_BENCHMARK_PROJECT_IDS, parseBenchmarkProjectProfiles, validateAnswerKey, validateBenchmarkProjectProfiles } from "../src/evaluation/benchmarkMetadata.js";
|
|
5
|
+
// Benchmark project ids that are looked up under benchmarks/projects.
const requiredProjects = REQUIRED_BENCHMARK_PROJECT_IDS;

// Files that must exist inside each benchmark project.
// Keys are project ids; values are paths relative to that project's directory.
const projectRequiredPaths = {
    "todo-ts": [
        "README.md", "package.json", "tsconfig.json",
        "src/taskStore.ts", "src/taskService.ts", "src/index.ts",
        "tests/taskService.test.ts",
    ],
    "todo-python": [
        "README.md",
        "src/task_store.py", "src/task_service.py", "src/__init__.py",
        "tests/test_task_service.py",
    ],
    "todo-js": [
        "README.md", "package.json",
        "src/taskStore.js", "src/taskService.js", "src/index.js",
        "tests/taskService.test.js",
    ],
    "todo-mixed-ts-py": [
        "README.md", "package.json", "tsconfig.json",
        "src/taskCli.ts",
        "python/task_service.py",
        "tests/mixedBoundary.test.ts",
    ],
    "task-workflow-medium-ts": [
        "README.md", "package.json", "tsconfig.json",
        "src/store/taskStore.ts",
        "src/services/createTask.ts", "src/services/importTasks.ts", "src/services/summarizeTasks.ts",
        "tests/importTasks.test.ts",
    ],
    "task-analytics-large-mixed": [
        "README.md",
        "ts/package.json", "ts/tsconfig.json",
        "ts/src/services/buildAnalyticsSnapshot.ts",
        "ts/src/reporting/formatTaskHealthReport.ts",
        "ts/tests/buildAnalyticsSnapshot.test.ts",
        "py/task_analytics/metrics.py", "py/task_analytics/quality.py",
        "py/tests/test_reporting.py",
    ],
};
|
|
61
|
+
/**
 * Recursively list every non-directory entry beneath `dir`.
 *
 * Entries are visited in the order `readdirSync` returns them, and a
 * directory's contents appear at the position of that directory. Results go
 * into one shared array instead of being spread into `push(...)` per
 * directory, because spreading a very large subtree as call arguments can
 * throw a RangeError.
 *
 * @param {string} dir - Directory to walk.
 * @returns {string[]} Paths of all non-directory entries, each built by joining onto `dir`.
 * @throws Whatever `readdirSync` throws when a directory cannot be read.
 */
function walk(dir) {
    const files = [];
    const visit = (currentDir) => {
        for (const entry of readdirSync(currentDir, { withFileTypes: true })) {
            const fullPath = path.join(currentDir, entry.name);
            if (entry.isDirectory()) {
                visit(fullPath);
            }
            else {
                // Anything that is not a directory is reported as a file.
                files.push(fullPath);
            }
        }
    };
    visit(dir);
    return files;
}
|
|
75
|
+
/**
 * Validates the benchmark contracts and fixture projects under `rootDir`.
 *
 * The function never throws for a malformed benchmark tree it knows how to
 * describe; problems are collected as human-readable strings instead.
 * It checks, in order:
 *   1. the three contract files in `benchmarks/contracts` exist and parse;
 *   2. benchmark case ids are unique;
 *   3. every required project directory and its required files exist;
 *   4. every case defines `answerKey`, `expectedSymbols`, and only references
 *      known projects and existing files;
 *   5. no project contains generated output directories
 *      (`node_modules`, `dist`, `build`, `coverage`, `lab-output`).
 *
 * @param {string} [rootDir=process.cwd()] - Repository root that contains the
 *   `benchmarks` directory.
 * @returns {{ ok: boolean, errors: string[], checks: string[] }} `ok` is true
 *   only when `errors` is empty; `checks` lists the validations that passed.
 */
export function validateBenchmarks(rootDir = process.cwd()) {
    const checks = [];
    const errors = [];
    const contractsDir = path.join(rootDir, "benchmarks", "contracts");
    const projectsDir = path.join(rootDir, "benchmarks", "projects");
    const behaviorPath = path.join(contractsDir, "todo-behavior.md");
    const casesPath = path.join(contractsDir, "todo-benchmark-case.json");
    const profilesPath = path.join(contractsDir, "benchmark-project-profiles.json");
    // A project only counts when its path exists AND is a directory. Both
    // project loops below share this test so a stray file at a project path
    // is reported as missing instead of crashing the directory walk.
    const isDirectory = (candidate) => existsSync(candidate) && statSync(candidate).isDirectory();

    if (!existsSync(behaviorPath)) {
        errors.push("Missing contract file: benchmarks/contracts/todo-behavior.md");
    }
    else {
        checks.push("found todo-behavior.md");
    }

    let cases = [];
    if (!existsSync(casesPath)) {
        errors.push("Missing contract file: benchmarks/contracts/todo-benchmark-case.json");
    }
    else {
        try {
            const parsedCases = JSON.parse(readFileSync(casesPath, "utf8"));
            if (Array.isArray(parsedCases)) {
                cases = parsedCases;
                checks.push("parsed todo-benchmark-case.json");
            }
            else {
                // Valid JSON that is not an array would make the loops below
                // throw (or iterate nonsense); report it and validate nothing.
                errors.push("Invalid todo-benchmark-case.json: expected a JSON array of benchmark cases");
            }
        }
        catch (error) {
            errors.push(`Invalid JSON in todo-benchmark-case.json: ${error.message}`);
        }
    }

    if (!existsSync(profilesPath)) {
        errors.push("Missing contract file: benchmarks/contracts/benchmark-project-profiles.json");
    }
    else {
        try {
            const profiles = parseBenchmarkProjectProfiles(JSON.parse(readFileSync(profilesPath, "utf8")));
            const profileErrors = validateBenchmarkProjectProfiles(profiles, rootDir);
            errors.push(...profileErrors);
            if (profileErrors.length === 0) {
                checks.push("validated benchmark-project-profiles.json");
            }
        }
        catch (error) {
            // Covers both JSON syntax errors and anything thrown while parsing
            // or validating the profiles.
            errors.push(`Invalid benchmark-project-profiles.json: ${error.message}`);
        }
    }

    const ids = new Set();
    for (const benchmarkCase of cases) {
        if (ids.has(benchmarkCase.id)) {
            errors.push(`Duplicate benchmark case id: ${benchmarkCase.id}`);
        }
        ids.add(benchmarkCase.id);
    }
    if (cases.length > 0 && errors.every((error) => !error.startsWith("Duplicate benchmark case id:"))) {
        checks.push("benchmark case ids are unique");
    }

    for (const project of requiredProjects) {
        const projectDir = path.join(projectsDir, project);
        if (!isDirectory(projectDir)) {
            errors.push(`Missing benchmark project: benchmarks/projects/${project}`);
            continue;
        }
        checks.push(`found benchmark project ${project}`);
        for (const relPath of projectRequiredPaths[project]) {
            const fullPath = path.join(projectDir, relPath);
            if (!existsSync(fullPath)) {
                errors.push(`Missing required file for ${project}: benchmarks/projects/${project}/${relPath}`);
            }
        }
    }

    for (const benchmarkCase of cases) {
        if (!benchmarkCase.answerKey) {
            errors.push(`Case ${benchmarkCase.id} does not define answerKey`);
        }
        else {
            errors.push(...validateAnswerKey(benchmarkCase.answerKey, `Case ${benchmarkCase.id}`));
        }
        if (!Array.isArray(benchmarkCase.expectedSymbols) || benchmarkCase.expectedSymbols.length === 0) {
            errors.push(`Case ${benchmarkCase.id} does not define expectedSymbols`);
        }
        // A case without expectedFilesByProject simply contributes no file checks.
        for (const [project, expectedFiles] of Object.entries(benchmarkCase.expectedFilesByProject ?? {})) {
            if (!requiredProjects.includes(project)) {
                errors.push(`Case ${benchmarkCase.id} references unknown project id: ${project}`);
                continue;
            }
            if (!Array.isArray(expectedFiles) || expectedFiles.length === 0) {
                errors.push(`Case ${benchmarkCase.id} does not define expected files for ${project}`);
                continue;
            }
            for (const expectedFile of expectedFiles) {
                const fullPath = path.join(rootDir, "benchmarks", "projects", project, expectedFile);
                if (!existsSync(fullPath)) {
                    errors.push(`Case ${benchmarkCase.id} references missing file: benchmarks/projects/${project}/${expectedFile}`);
                }
            }
        }
    }

    for (const project of requiredProjects) {
        const projectDir = path.join(projectsDir, project);
        // Missing (or non-directory) projects were already reported above;
        // walking them would throw.
        if (!isDirectory(projectDir)) {
            continue;
        }
        const forbidden = walk(projectDir).filter((fullPath) => {
            // Normalise Windows separators so one pattern covers every platform.
            const rel = path.relative(projectDir, fullPath).replace(/\\/g, "/");
            return /(^|\/)(node_modules|dist|build|coverage|lab-output)(\/|$)/.test(rel);
        });
        if (forbidden.length > 0) {
            // Only the first offending path is reported per project.
            errors.push(`Forbidden generated output found in ${project}: ${forbidden[0]}`);
        }
    }

    return { ok: errors.length === 0, errors, checks };
}
|
|
184
|
+
/**
 * Writes a short human-readable report of a validation result to stdout.
 *
 * @param {{ ok: boolean, checks: unknown[], errors: string[] }} result -
 *   Result object as returned by `validateBenchmarks`.
 * @returns {void}
 */
function printSummary(result) {
    const verdict = result.ok ? "passed" : "failed";
    console.log(`Benchmark verification ${verdict}.`);
    console.log(`Checks: ${result.checks.length}`);

    const problems = result.errors;
    if (problems.length === 0) {
        // Nothing to list: the error section is omitted entirely.
        return;
    }
    console.log(`Errors: ${problems.length}`);
    for (const problem of problems) {
        console.log(`- ${problem}`);
    }
}
|
|
194
|
+
// CLI entry point: run the validation only when this module is the script
// Node was started with, so importing it elsewhere has no side effects.
const currentFile = fileURLToPath(import.meta.url);
// process.argv[1] is absent in some launch modes; fall back to an empty
// string, which can never equal a real module path.
const invokedPath = process.argv[1] ? path.resolve(process.argv[1]) : "";
if (currentFile === invokedPath) {
    const outcome = validateBenchmarks();
    printSummary(outcome);
    // Signal failure through the exit code without cutting off pending output.
    if (!outcome.ok) {
        process.exitCode = 1;
    }
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { runCliAgent } from "./codexAdapter.js";
|
|
2
|
+
import { applyPromptToCommandTemplate } from "../runAgentPrompt.js";
|
|
3
|
+
import { parseAgentTokenUsage } from "../parseAgentTokenUsage.js";
|
|
4
|
+
import { runMeasuredCommand } from "../../core/runMeasuredCommand.js";
|
|
5
|
+
/**
 * Agent adapter for the `claude` command-line tool.
 *
 * Availability probing and prompt execution are delegated to
 * `runMeasuredCommand` and `runCliAgent`; token usage parsing is delegated to
 * `parseAgentTokenUsage`.
 */
export const claudeAdapter = {
    id: "claude",
    displayName: "Claude",
    surface: "cli",

    /**
     * Reports whether the agent can be run for this request.
     *
     * @param {object} request - Run request; reads `commandTemplate`, `cwd`,
     *   `outDir` and `env`.
     * @returns {Promise<boolean>} `true` immediately when a command template is
     *   supplied, otherwise the `ok` flag of a `claude --version` probe.
     */
    async isAvailable(request) {
        // A caller-supplied command template is accepted without probing.
        if (request.commandTemplate) {
            return true;
        }
        const { ok } = await runMeasuredCommand({
            commandId: "claude-availability",
            commandString: "claude",
            extraArgs: ["--version"],
            cwd: request.cwd,
            outDir: request.outDir,
            env: request.env
        });
        return ok;
    },

    /**
     * Builds the command used to send the prompt to the agent.
     *
     * @param {object} request - Run request; reads `commandTemplate` and
     *   `promptText`.
     * @returns {object} The result of applying the prompt to the template when
     *   one is given, otherwise `{ command: "claude", args: ["-p", promptText] }`.
     */
    buildCommand(request) {
        const { commandTemplate, promptText } = request;
        return commandTemplate
            ? applyPromptToCommandTemplate(commandTemplate, promptText)
            : { command: "claude", args: ["-p", promptText] };
    },

    /**
     * Runs the prompt through the shared CLI runner, passing this adapter.
     *
     * @param {object} request - Run request forwarded unchanged.
     * @returns {Promise<object>} Whatever `runCliAgent` resolves to.
     */
    async runPrompt(request) {
        return runCliAgent(request, this);
    },

    parseTokenUsage: parseAgentTokenUsage,

    /**
     * Treats the whole trimmed output as the final answer.
     *
     * @param {string} text - Raw agent output.
     * @returns {{ finalAnswerText: string, finalAnswerParseStatus: string }}
     *   Status is "empty" when nothing but whitespace was produced, else "parsed".
     */
    parseFinalAnswer(text) {
        const finalAnswerText = text.trim();
        const finalAnswerParseStatus = finalAnswerText === "" ? "empty" : "parsed";
        return { finalAnswerText, finalAnswerParseStatus };
    }
};
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import { runMeasuredCommand } from "../../core/runMeasuredCommand.js";
|
|
2
|
+
import { applyPromptToCommandTemplate } from "../runAgentPrompt.js";
|
|
3
|
+
import { parseAgentTokenUsage } from "../parseAgentTokenUsage.js";
|
|
4
|
+
/**
 * Agent adapter for the `codex` command-line tool.
 *
 * Availability probing and prompt execution are delegated to
 * `runMeasuredCommand` and `runCliAgent`; token usage parsing is delegated to
 * `parseAgentTokenUsage`.
 */
export const codexAdapter = {
    id: "codex",
    displayName: "Codex",
    surface: "cli",

    /**
     * Reports whether the agent can be run for this request.
     *
     * @param {object} request - Run request; reads `commandTemplate`, `cwd`,
     *   `outDir` and `env`.
     * @returns {Promise<boolean>} `true` immediately when a command template is
     *   supplied, otherwise the `ok` flag of a `codex --version` probe.
     */
    async isAvailable(request) {
        const { commandTemplate, cwd, outDir, env } = request;
        // A caller-supplied command template is accepted without probing.
        if (commandTemplate) {
            return true;
        }
        const probe = await runMeasuredCommand({
            commandId: "codex-availability",
            commandString: "codex",
            extraArgs: ["--version"],
            cwd,
            outDir,
            env
        });
        return probe.ok;
    },

    /**
     * Builds the command used to send the prompt to the agent.
     *
     * @param {object} request - Run request; reads `commandTemplate` and
     *   `promptText`.
     * @returns {object} The result of applying the prompt to the template when
     *   one is given, otherwise
     *   `{ command: "codex", args: ["exec", "--json", promptText] }`.
     */
    buildCommand(request) {
        const { commandTemplate, promptText } = request;
        if (!commandTemplate) {
            return { command: "codex", args: ["exec", "--json", promptText] };
        }
        return applyPromptToCommandTemplate(commandTemplate, promptText);
    },

    /**
     * Runs the prompt through the shared CLI runner, passing this adapter.
     *
     * @param {object} request - Run request forwarded unchanged.
     * @returns {Promise<object>} Whatever `runCliAgent` resolves to.
     */
    async runPrompt(request) {
        return runCliAgent(request, this);
    },

    parseTokenUsage: parseAgentTokenUsage,

    /**
     * Treats the whole trimmed output as the final answer.
     *
     * @param {string} text - Raw agent output.
     * @returns {{ finalAnswerText: string, finalAnswerParseStatus: string }}
     *   Status is "empty" when nothing but whitespace was produced, else "parsed".
     */
    parseFinalAnswer(text) {
        const finalAnswerText = text.trim();
        const finalAnswerParseStatus = finalAnswerText === "" ? "empty" : "parsed";
        return { finalAnswerText, finalAnswerParseStatus };
    }
};
|
|
37
|
+
/**
 * Runs one prompt through a CLI-based agent adapter and returns a run record.
 *
 * The command is built first, then availability is checked. When the adapter
 * is unavailable no command is executed: the record's status is "failed" if
 * `request.requireAvailable` is truthy and "skipped" otherwise, with the
 * explanation placed in `errors` or `warnings` respectively. Otherwise the
 * command runs through `runMeasuredCommand` and its output is handed to the
 * adapter's `parseFinalAnswer` and `parseTokenUsage`.
 *
 * @param {object} request - Run request; reads `runId`, `promptVariant`
 *   (`id`, `strategy`, `complexityLevel`), `cwd`, `outDir`, `env`,
 *   `timeoutMs`, `requireAvailable` and `commandTemplate?.cwd`.
 * @param {object} adapter - Adapter providing `id`, `displayName`, `surface`,
 *   `buildCommand`, `isAvailable`, `parseFinalAnswer` and `parseTokenUsage`.
 * @returns {Promise<object>} The run record.
 */
export async function runCliAgent(request, adapter) {
    const started = Date.now();
    const command = adapter.buildCommand(request);

    // Identity and timing fields common to every outcome. Built on demand so
    // the end timestamp of the specific outcome can be supplied.
    const describeRun = (ended) => ({
        runId: request.runId,
        agentId: adapter.id,
        displayName: adapter.displayName,
        surface: adapter.surface,
        promptVariantId: request.promptVariant.id,
        promptStrategy: request.promptVariant.strategy,
        promptComplexityLevel: request.promptVariant.complexityLevel,
        startedAt: new Date(started).toISOString(),
        endedAt: new Date(ended).toISOString(),
        durationMs: ended - started
    });

    if (!(await adapter.isAvailable(request))) {
        const ended = Date.now();
        const mustRun = Boolean(request.requireAvailable);
        const message = `${adapter.displayName} CLI was not available.`;
        return {
            ...describeRun(ended),
            status: mustRun ? "failed" : "skipped",
            exitCode: null,
            command: command.command,
            args: command.args,
            cwd: request.cwd,
            finalAnswerText: "",
            finalAnswerParseStatus: "empty",
            tokenUsage: { source: "unavailable" },
            tokenUsageSource: "unavailable",
            tokenUsageReliability: "unavailable",
            // The same message is a warning for a skip and an error for a failure.
            warnings: mustRun ? [] : [message],
            errors: mustRun ? [message] : []
        };
    }

    // A command template may pin its own working directory.
    const runCwd = request.commandTemplate?.cwd ?? request.cwd;
    const measured = await runMeasuredCommand({
        commandId: `${adapter.id}-agent-run`,
        commandString: command.command,
        extraArgs: command.args,
        cwd: runCwd,
        outDir: request.outDir,
        env: request.env,
        timeoutMs: request.timeoutMs
    });
    const ended = Date.now();

    // The answer comes from stdout, falling back to stderr when stdout is
    // empty; token usage is searched for in both streams together.
    const answer = adapter.parseFinalAnswer(measured.stdout || measured.stderr);
    const usage = adapter.parseTokenUsage(`${measured.stdout}\n${measured.stderr}`);

    return {
        ...describeRun(ended),
        status: measured.ok ? "completed" : "failed",
        exitCode: measured.exitCode,
        command: measured.executable,
        args: measured.args,
        cwd: runCwd,
        stdoutPath: measured.stdoutPath,
        stderrPath: measured.stderrPath,
        telemetryPath: measured.telemetryPath,
        finalAnswerText: answer.finalAnswerText,
        finalAnswerParseStatus: answer.finalAnswerParseStatus,
        tokenUsage: usage.tokenUsage,
        tokenUsageSource: usage.tokenUsageSource,
        tokenUsageReliability: usage.tokenUsageReliability,
        warnings: usage.warnings,
        errors: measured.ok ? [] : [measured.error ?? "Agent command failed."]
    };
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
/**
 * Simulated agent adapter that produces deterministic output without running
 * any external command.
 *
 * `runPrompt` is steered by `request.env.FAKE_AGENT_MODE`:
 *   - "failure"             -> status "failed", exit code 1, stderr message;
 *   - "missing-token-usage" -> token usage reported as unavailable;
 *   - "invalid-output"      -> unstructured answer text, parse status "empty";
 *   - anything else / unset -> a successful, fully populated run.
 */
export const fakeAgentAdapter = {
    id: "fake-agent",
    displayName: "Fake Agent",
    surface: "simulated",

    /** @returns {Promise<boolean>} Always `true`. */
    async isAvailable() {
        return true;
    },

    /** @returns {{ command: string, args: string[] }} A fixed placeholder command. */
    buildCommand() {
        return { command: "fake-agent", args: [] };
    },

    /**
     * Simulates one agent run and writes stdout, stderr and telemetry files
     * into `request.outDir` (created if needed).
     *
     * @param {object} request - Run request; reads `env`, `outDir`, `runId`,
     *   `cwd` and `promptVariant`.
     * @returns {Promise<object>} The simulated run record.
     */
    async runPrompt(request) {
        const started = Date.now();
        const mode = request.env?.FAKE_AGENT_MODE ?? "success";
        await mkdir(request.outDir, { recursive: true });

        const artifactPath = (suffix) => path.join(request.outDir, `fake-agent.${suffix}`);
        const stdoutPath = artifactPath("stdout.txt");
        const stderrPath = artifactPath("stderr.txt");
        const telemetryPath = artifactPath("telemetry.json");

        const failed = mode === "failure";
        const missingUsage = mode === "missing-token-usage";
        const invalidOutput = mode === "invalid-output";

        let finalAnswerText;
        if (invalidOutput) {
            finalAnswerText = "Simulated unstructured output without scoreable fields.";
        }
        else {
            finalAnswerText = buildFakeAnswer(request, missingUsage);
        }

        await writeFile(stdoutPath, `${finalAnswerText}\n`, "utf8");
        await writeFile(stderrPath, failed ? "Simulated fake-agent failure.\n" : "", "utf8");
        const ended = Date.now();

        const { promptVariant } = request;
        let tokenUsage;
        if (missingUsage) {
            tokenUsage = { source: "unavailable", rawText: finalAnswerText };
        }
        else {
            // Input tokens mirror the prompt estimate; output is a fixed 128.
            const inputTokens = promptVariant.promptMetrics.promptEstimatedTokens;
            tokenUsage = {
                inputTokens,
                outputTokens: 128,
                totalTokens: inputTokens + 128,
                source: "agent-reported",
                rawText: finalAnswerText
            };
        }

        const result = {
            runId: request.runId,
            agentId: "fake-agent",
            displayName: "Fake Agent",
            surface: "simulated",
            promptVariantId: promptVariant.id,
            promptStrategy: promptVariant.strategy,
            promptComplexityLevel: promptVariant.complexityLevel,
            startedAt: new Date(started).toISOString(),
            endedAt: new Date(ended).toISOString(),
            durationMs: ended - started,
            status: failed ? "failed" : "completed",
            exitCode: failed ? 1 : 0,
            command: "fake-agent",
            args: [],
            cwd: request.cwd,
            stdoutPath,
            stderrPath,
            telemetryPath,
            finalAnswerText,
            finalAnswerParseStatus: invalidOutput ? "empty" : "parsed",
            tokenUsage,
            tokenUsageSource: missingUsage ? "unavailable" : "agent-reported",
            tokenUsageReliability: missingUsage ? "unavailable" : "high",
            warnings: missingUsage ? ["Token usage was intentionally omitted by fake-agent mode."] : [],
            errors: failed ? ["Simulated fake-agent failure."] : []
        };

        const telemetry = {
            commandId: "fake-agent",
            exitCode: result.exitCode,
            durationMs: result.durationMs
        };
        await writeFile(telemetryPath, `${JSON.stringify(telemetry, null, 2)}\n`, "utf8");
        return result;
    },

    /** @returns {object} A fixed "agent-reported" / "high" reliability usage record. */
    parseTokenUsage() {
        return {
            tokenUsage: { source: "agent-reported" },
            tokenUsageSource: "agent-reported",
            tokenUsageReliability: "high",
            warnings: []
        };
    },

    /**
     * Treats the whole trimmed output as the final answer.
     *
     * @param {string} text - Raw agent output.
     * @returns {{ finalAnswerText: string, finalAnswerParseStatus: string }}
     *   Status is "empty" when nothing but whitespace was produced, else "parsed".
     */
    parseFinalAnswer(text) {
        const finalAnswerText = text.trim();
        const finalAnswerParseStatus = finalAnswerText === "" ? "empty" : "parsed";
        return { finalAnswerText, finalAnswerParseStatus };
    }
};
|
|
81
|
+
/**
 * Builds the deterministic, line-oriented answer text emitted by the fake agent.
 *
 * @param {object} request - Run request; reads `promptVariant.expectedAnswerKey`
 *   (`expectedFacts`, `expectedFiles`, `expectedSymbols`),
 *   `promptVariant.promptMetrics.promptEstimatedTokens` and
 *   `promptVariant.complexityLevel`.
 * @param {boolean} missingUsage - When truthy, the two token usage lines are
 *   left out of the answer.
 * @returns {string} Newline-separated `key: value` lines.
 */
function buildFakeAnswer(request, missingUsage) {
    const { promptVariant } = request;
    const answerKey = promptVariant.expectedAnswerKey;
    // Only the first two expected facts are reported as found.
    const factIds = answerKey.expectedFacts.slice(0, 2).map(({ id }) => id);

    const usageLines = [];
    if (!missingUsage) {
        const inputTokens = promptVariant.promptMetrics.promptEstimatedTokens;
        usageLines.push(
            `tokenUsage: inputTokens=${inputTokens}, outputTokens=128, totalTokens=${inputTokens + 128}`,
            "tokenUsageSource: agent-reported"
        );
    }

    return [
        "answer: Simulated benchmark answer from fake-agent.",
        `relevantFiles: ${answerKey.expectedFiles.join(", ")}`,
        `relevantSymbols: ${answerKey.expectedSymbols.join(", ")}`,
        `expectedFactsFound: ${factIds.join(", ")}`,
        "confidence: high",
        ...usageLines,
        `executionTime: simulated-${promptVariant.complexityLevel}`,
        "notes: Deterministic fake-agent output for tests."
    ].join("\n");
}
|