@dailephd/my-dev-kit-lab 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -0
- package/benchmarks/contracts/benchmark-project-profiles.json +1199 -0
- package/benchmarks/contracts/todo-behavior.md +70 -0
- package/benchmarks/contracts/todo-benchmark-case.json +227 -0
- package/benchmarks/projects/README.md +34 -0
- package/benchmarks/projects/task-analytics-large-mixed/README.md +1 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/__init__.py +3 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/fixtures.py +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/metrics.py +29 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/models.py +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/parser.py +16 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/pipeline.py +9 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/quality.py +8 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/task_analytics/reporting.py +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_metrics.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_parser.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_quality.py +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/py/tests/test_reporting.py +15 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/package.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/index.ts +11 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/analyticsSnapshot.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/project.ts +5 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/models/task.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/buildProjectLeaderboard.ts +7 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/reporting/formatTaskHealthReport.ts +13 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/buildAnalyticsSnapshot.ts +39 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/completeTask.ts +10 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/createTask.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/services/listTasksByProject.ts +6 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/projectStore.ts +20 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/store/taskStore.ts +44 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/projectValidation.ts +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/src/validation/taskValidation.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/buildAnalyticsSnapshot.test.ts +48 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/completeTask.test.ts +21 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/createTask.test.ts +31 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/listTasksByProject.test.ts +18 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tests/reporting.test.ts +19 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-analytics-large-mixed/ts/vitest.config.ts +5 -0
- package/benchmarks/projects/task-workflow-medium-ts/README.md +1 -0
- package/benchmarks/projects/task-workflow-medium-ts/package.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/index.ts +9 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/project.ts +6 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/models/task.ts +39 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/completeTask.ts +15 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/createTask.ts +26 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/filterTasks.ts +17 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/importTasks.ts +33 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/services/summarizeTasks.ts +30 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/store/taskStore.ts +76 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/utils/deterministicId.ts +3 -0
- package/benchmarks/projects/task-workflow-medium-ts/src/validation/taskValidation.ts +45 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/completeTask.test.ts +16 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/createTask.test.ts +21 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/filterTasks.test.ts +18 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/importTasks.test.ts +22 -0
- package/benchmarks/projects/task-workflow-medium-ts/tests/summarizeTasks.test.ts +29 -0
- package/benchmarks/projects/task-workflow-medium-ts/tsconfig.json +12 -0
- package/benchmarks/projects/task-workflow-medium-ts/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-js/README.md +3 -0
- package/benchmarks/projects/todo-js/package.json +11 -0
- package/benchmarks/projects/todo-js/src/index.js +2 -0
- package/benchmarks/projects/todo-js/src/taskService.js +37 -0
- package/benchmarks/projects/todo-js/src/taskStore.js +28 -0
- package/benchmarks/projects/todo-js/tests/taskService.test.js +45 -0
- package/benchmarks/projects/todo-js/vitest.config.js +5 -0
- package/benchmarks/projects/todo-mixed-ts-py/README.md +3 -0
- package/benchmarks/projects/todo-mixed-ts-py/package.json +13 -0
- package/benchmarks/projects/todo-mixed-ts-py/python/task_service.py +76 -0
- package/benchmarks/projects/todo-mixed-ts-py/src/taskCli.ts +38 -0
- package/benchmarks/projects/todo-mixed-ts-py/tests/mixedBoundary.test.ts +18 -0
- package/benchmarks/projects/todo-mixed-ts-py/tsconfig.json +12 -0
- package/benchmarks/projects/todo-mixed-ts-py/vitest.config.ts +5 -0
- package/benchmarks/projects/todo-python/README.md +3 -0
- package/benchmarks/projects/todo-python/src/__init__.py +4 -0
- package/benchmarks/projects/todo-python/src/task_service.py +32 -0
- package/benchmarks/projects/todo-python/src/task_store.py +28 -0
- package/benchmarks/projects/todo-python/tests/test_task_service.py +52 -0
- package/benchmarks/projects/todo-ts/README.md +3 -0
- package/benchmarks/projects/todo-ts/package.json +12 -0
- package/benchmarks/projects/todo-ts/src/index.ts +2 -0
- package/benchmarks/projects/todo-ts/src/taskService.ts +41 -0
- package/benchmarks/projects/todo-ts/src/taskStore.ts +34 -0
- package/benchmarks/projects/todo-ts/tests/taskService.test.ts +45 -0
- package/benchmarks/projects/todo-ts/tsconfig.json +12 -0
- package/benchmarks/projects/todo-ts/vitest.config.ts +5 -0
- package/dist/scripts/build-gallery.js +3 -0
- package/dist/scripts/capture-demo-report.js +3 -0
- package/dist/scripts/evaluate-token-savings.js +2 -0
- package/dist/scripts/experiments/describeExperiment.js +143 -0
- package/dist/scripts/experiments/listExperiments.js +44 -0
- package/dist/scripts/experiments/runExperiment.js +199 -0
- package/dist/scripts/generate-experiment-plots.js +3 -0
- package/dist/scripts/generate-prompt-variants.js +2 -0
- package/dist/scripts/render-experiment-report.js +2 -0
- package/dist/scripts/run-agent-prompt.js +2 -0
- package/dist/scripts/run-controlled-experiment.js +2 -0
- package/dist/scripts/run-final-demo.js +3 -0
- package/dist/scripts/run-lab-demo.js +5 -0
- package/dist/scripts/run-visualization-demos.js +3 -0
- package/dist/scripts/security/runCodeql.js +57 -0
- package/dist/scripts/security/runDependencyChecks.js +57 -0
- package/dist/scripts/security/runFuzzSmoke.js +29 -0
- package/dist/scripts/security/runPackageChecks.js +56 -0
- package/dist/scripts/security/runSemgrep.js +63 -0
- package/dist/scripts/security/validate.js +117 -0
- package/dist/scripts/verify-benchmarks.js +202 -0
- package/dist/src/agents/adapters/claudeAdapter.js +37 -0
- package/dist/src/agents/adapters/codexAdapter.js +110 -0
- package/dist/src/agents/adapters/fakeAgentAdapter.js +101 -0
- package/dist/src/agents/agentRegistry.js +21 -0
- package/dist/src/agents/index.js +7 -0
- package/dist/src/agents/parseAgentTokenUsage.js +137 -0
- package/dist/src/agents/runAgentPrompt.js +38 -0
- package/dist/src/agents/types.js +1 -0
- package/dist/src/commands/buildGalleryCommand.js +56 -0
- package/dist/src/commands/captureDemoReport.js +116 -0
- package/dist/src/commands/evaluateTokenSavings.js +175 -0
- package/dist/src/commands/generateExperimentPlotsCommand.js +38 -0
- package/dist/src/commands/generatePromptVariants.js +67 -0
- package/dist/src/commands/renderExperimentReportCommand.js +131 -0
- package/dist/src/commands/runAgentPromptCommand.js +132 -0
- package/dist/src/commands/runControlledExperimentCommand.js +174 -0
- package/dist/src/commands/runFinalDemoCommand.js +123 -0
- package/dist/src/commands/runLabDemo.js +62 -0
- package/dist/src/commands/runVisualizationDemosCommand.js +67 -0
- package/dist/src/core/commandLine.js +59 -0
- package/dist/src/core/countTokens.js +8 -0
- package/dist/src/core/fileGlobs.js +100 -0
- package/dist/src/core/localProjectTarget.js +75 -0
- package/dist/src/core/pathSafety.js +19 -0
- package/dist/src/core/pythonCommand.js +30 -0
- package/dist/src/core/resolveCommand.js +110 -0
- package/dist/src/core/runMeasuredCommand.js +143 -0
- package/dist/src/evaluation/benchmarkMetadata.js +207 -0
- package/dist/src/evaluation/buildExperimentMatrix.js +75 -0
- package/dist/src/evaluation/classifyAgentRunOutcome.js +40 -0
- package/dist/src/evaluation/compareExperimentRuns.js +79 -0
- package/dist/src/evaluation/compareTokenSavings.js +47 -0
- package/dist/src/evaluation/controlledExperimentTypes.js +1 -0
- package/dist/src/evaluation/index.js +18 -0
- package/dist/src/evaluation/parseAgentAnswer.js +230 -0
- package/dist/src/evaluation/projectComplexity.js +126 -0
- package/dist/src/evaluation/projectFileTree.js +83 -0
- package/dist/src/evaluation/readEvaluationCases.js +59 -0
- package/dist/src/evaluation/renderTokenSavingsReportInput.js +55 -0
- package/dist/src/evaluation/runControlledExperiment.js +158 -0
- package/dist/src/evaluation/runMyDevKitRetrieval.js +197 -0
- package/dist/src/evaluation/runRawFullFileBaseline.js +31 -0
- package/dist/src/evaluation/scoreCorrectness.js +127 -0
- package/dist/src/evaluation/types.js +1 -0
- package/dist/src/evaluation/writeExperimentArtifacts.js +104 -0
- package/dist/src/evaluation/writeTokenSavingsArtifacts.js +57 -0
- package/dist/src/experiments/config.js +24 -0
- package/dist/src/experiments/defaultRegistry.js +7 -0
- package/dist/src/experiments/errors.js +18 -0
- package/dist/src/experiments/index.js +9 -0
- package/dist/src/experiments/outputPaths.js +25 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/config.js +37 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/index.js +3 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/plugin.js +83 -0
- package/dist/src/experiments/plugins/contextStrategyComparison/resultMapping.js +260 -0
- package/dist/src/experiments/plugins/index.js +1 -0
- package/dist/src/experiments/registry.js +43 -0
- package/dist/src/experiments/results.js +48 -0
- package/dist/src/experiments/runner.js +181 -0
- package/dist/src/experiments/target.js +8 -0
- package/dist/src/experiments/types.js +1 -0
- package/dist/src/gallery/index.js +2 -0
- package/dist/src/gallery/types.js +1 -0
- package/dist/src/gallery/writeGalleryManifest.js +214 -0
- package/dist/src/index.js +12 -0
- package/dist/src/plots/buildExperimentPlotData.js +137 -0
- package/dist/src/plots/index.js +4 -0
- package/dist/src/plots/renderSvgChart.js +82 -0
- package/dist/src/plots/types.js +1 -0
- package/dist/src/plots/writePlotArtifacts.js +46 -0
- package/dist/src/prompts/buildPromptContext.js +68 -0
- package/dist/src/prompts/generateMyDevKitPrompt.js +106 -0
- package/dist/src/prompts/generatePromptVariants.js +36 -0
- package/dist/src/prompts/generateRawFullFilePrompt.js +97 -0
- package/dist/src/prompts/index.js +7 -0
- package/dist/src/prompts/measurePromptComplexity.js +41 -0
- package/dist/src/prompts/types.js +1 -0
- package/dist/src/prompts/writePromptArtifacts.js +43 -0
- package/dist/src/report/buildExperimentReportInput.js +339 -0
- package/dist/src/report/experimentReportTypes.js +1 -0
- package/dist/src/report/experiments/buildPluginExperimentReport.js +153 -0
- package/dist/src/report/experiments/experimentReportModel.js +1 -0
- package/dist/src/report/experiments/index.js +4 -0
- package/dist/src/report/experiments/renderPluginExperimentReportHtml.js +133 -0
- package/dist/src/report/experiments/writePluginExperimentReports.js +30 -0
- package/dist/src/report/index.js +8 -0
- package/dist/src/report/renderExperimentHtmlReport.js +354 -0
- package/dist/src/report/renderHtmlReport.js +103 -0
- package/dist/src/report/types.js +10 -0
- package/dist/src/report/writeExperimentReportArtifacts.js +38 -0
- package/dist/src/report/writeReportArtifacts.js +39 -0
- package/dist/src/screenshot/captureReportScreenshot.js +75 -0
- package/dist/src/screenshot/index.js +2 -0
- package/dist/src/screenshot/types.js +1 -0
- package/dist/src/securityValidation/artifacts.js +15 -0
- package/dist/src/securityValidation/cliAdversarial/adversarialCliConfig.js +38 -0
- package/dist/src/securityValidation/cliAdversarial/dataVolumeChecks.js +194 -0
- package/dist/src/securityValidation/cliAdversarial/jsonStdoutChecks.js +359 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactChecks.js +284 -0
- package/dist/src/securityValidation/cliAdversarial/malformedArtifactFixtures.js +79 -0
- package/dist/src/securityValidation/cliAdversarial/pathBoundaryChecks.js +431 -0
- package/dist/src/securityValidation/cliAdversarial/pathCases.js +144 -0
- package/dist/src/securityValidation/cliAdversarial/readOnlyBoundaryChecks.js +294 -0
- package/dist/src/securityValidation/cliAdversarial/runAdversarialCheck.js +149 -0
- package/dist/src/securityValidation/cliAdversarial/subprocessSafetyChecks.js +214 -0
- package/dist/src/securityValidation/cliAdversarial/tempWorkspace.js +160 -0
- package/dist/src/securityValidation/commandRunner.js +136 -0
- package/dist/src/securityValidation/config.js +39 -0
- package/dist/src/securityValidation/dependencies/parseNpmAudit.js +115 -0
- package/dist/src/securityValidation/dependencies/parseNpmLs.js +71 -0
- package/dist/src/securityValidation/dependencies/parseNpmOutdated.js +41 -0
- package/dist/src/securityValidation/dependencies/runDependencyChecks.js +239 -0
- package/dist/src/securityValidation/dependencies/runOsvScanner.js +43 -0
- package/dist/src/securityValidation/fuzz/fuzzHarness.js +61 -0
- package/dist/src/securityValidation/fuzz/fuzzTargets.js +204 -0
- package/dist/src/securityValidation/fuzz/randomInput.js +0 -0
- package/dist/src/securityValidation/index.js +34 -0
- package/dist/src/securityValidation/packageChecks/forbiddenPackageContents.js +67 -0
- package/dist/src/securityValidation/packageChecks/parseNpmPackDryRun.js +56 -0
- package/dist/src/securityValidation/packageChecks/runPackageChecks.js +88 -0
- package/dist/src/securityValidation/report/renderSecurityReport.js +248 -0
- package/dist/src/securityValidation/report/securityReportTypes.js +1 -0
- package/dist/src/securityValidation/staticScans/codeql.js +66 -0
- package/dist/src/securityValidation/staticScans/semgrep.js +180 -0
- package/dist/src/securityValidation/testMatrix.js +535 -0
- package/dist/src/securityValidation/types.js +34 -0
- package/dist/src/securityValidation/validate/resolveTarget.js +32 -0
- package/dist/src/securityValidation/validate/runSecurityValidation.js +169 -0
- package/dist/src/securityValidation/validate/verdict.js +73 -0
- package/dist/src/visualizationDemos/buildMyDevKitVisualizationCommands.js +59 -0
- package/dist/src/visualizationDemos/index.js +4 -0
- package/dist/src/visualizationDemos/runVisualizationDemos.js +82 -0
- package/dist/src/visualizationDemos/types.js +1 -0
- package/dist/src/visualizationDemos/writeVisualizationDemoArtifacts.js +25 -0
- package/docs/METRICS.md +286 -0
- package/examples/demo-report-input.json +78 -0
- package/examples/lab-demo-cases.json +35 -0
- package/examples/real-agent-campaign-cases.json +118 -0
- package/examples/token-savings-cases.json +122 -0
- package/package.json +91 -0
- package/tests/fixtures/fake-adversarial-cli.js +152 -0
- package/tests/fixtures/fake-my-dev-kit-cli.js +83 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { claudeAdapter } from "./adapters/claudeAdapter.js";
|
|
2
|
+
import { codexAdapter } from "./adapters/codexAdapter.js";
|
|
3
|
+
import { fakeAgentAdapter } from "./adapters/fakeAgentAdapter.js";
|
|
4
|
+
/**
 * Registry of agent adapters keyed by agent id.
 *
 * The keys are the same ids that `parseAgentId` accepts, and
 * `getAgentAdapter` resolves adapters by reading from this object.
 */
export const agentAdapters = {
    "codex": codexAdapter,
    "claude": claudeAdapter,
    "fake-agent": fakeAgentAdapter,
};
|
|
9
|
+
/**
 * Looks up the adapter registered under an agent id.
 *
 * Only the registry's own keys are honoured. A plain property read would also
 * resolve names inherited from Object.prototype (for example "constructor" or
 * "toString") and hand back a function that is not an adapter.
 *
 * @param {string} agentId - Key to look up in `agentAdapters`.
 * @returns {object} The value stored under `agentId` in `agentAdapters`.
 * @throws {Error} When `agentId` is not an own key of the registry, or the
 *   registered value is falsy.
 */
export function getAgentAdapter(agentId) {
    const adapter = Object.hasOwn(agentAdapters, agentId) ? agentAdapters[agentId] : undefined;
    if (!adapter) {
        throw new Error(`Unknown agent adapter: ${agentId}`);
    }
    return adapter;
}
|
|
16
|
+
/**
 * Validates that a value is one of the supported agent ids.
 *
 * @param {unknown} value - Candidate agent id.
 * @returns {string} The same value, when it is "codex", "claude" or "fake-agent".
 * @throws {Error} For any other value.
 */
export function parseAgentId(value) {
    const supportedIds = ["codex", "claude", "fake-agent"];
    if (!supportedIds.includes(value)) {
        throw new Error(`Invalid agent id: ${value}`);
    }
    return value;
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
export * from "./types.js";
|
|
2
|
+
export * from "./agentRegistry.js";
|
|
3
|
+
export * from "./runAgentPrompt.js";
|
|
4
|
+
export * from "./parseAgentTokenUsage.js";
|
|
5
|
+
export * from "./adapters/fakeAgentAdapter.js";
|
|
6
|
+
export * from "./adapters/codexAdapter.js";
|
|
7
|
+
export * from "./adapters/claudeAdapter.js";
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/**
 * Maps each normalized usage key to the raw field names accepted for it when
 * scanning parsed JSON output. Both snake_case and camelCase spellings are
 * listed because the producing tool is not known in advance.
 */
const numberFields = {
    inputTokens: ["input_tokens", "inputTokens", "prompt_tokens", "promptTokens"],
    outputTokens: ["output_tokens", "outputTokens", "completion_tokens", "completionTokens"],
    cachedInputTokens: [
        "cached_input_tokens",
        "cachedInputTokens",
        // snake_case counterpart of cacheReadInputTokens; without it the
        // cache-read counter was only recognized in its camelCase form.
        "cache_read_input_tokens",
        "cacheReadInputTokens",
        "cache_creation_input_tokens",
        "cacheCreationInputTokens"
    ],
    reasoningTokens: ["reasoning_tokens", "reasoningTokens"],
    totalTokens: ["total_tokens", "totalTokens"]
};
|
|
14
|
+
/**
 * Extracts token usage from raw agent output.
 *
 * JSON usage is tried first and reported with source "cli-json"; if none is
 * found, plain-text usage is tried and reported with source "agent-reported".
 * When neither yields usage, an "unavailable" result carrying a warning is
 * returned.
 *
 * @param {string} text - Raw agent output.
 * @returns {{tokenUsage: object, tokenUsageSource: string, tokenUsageReliability: string, warnings: string[]}}
 */
export function parseAgentTokenUsage(text) {
    const warnings = [];
    // Ordered by preference; a later parser only runs if the earlier one found nothing.
    const strategies = [
        [parseJsonUsage, "cli-json"],
        [parsePlainTextUsage, "agent-reported"]
    ];
    for (const [parse, source] of strategies) {
        const attempt = parse(text);
        if (attempt.found) {
            return buildResult(attempt.usage, source, text, warnings);
        }
    }
    warnings.push("Token usage was not found in agent output.");
    const unavailable = "unavailable";
    return {
        tokenUsage: { source: unavailable, rawText: text },
        tokenUsageSource: unavailable,
        tokenUsageReliability: unavailable,
        warnings
    };
}
|
|
32
|
+
/**
 * Collects usage figures from every JSON candidate found in the text.
 *
 * @param {string} text - Raw agent output.
 * @returns {{found: boolean, usage: object}} Merged usage and whether any
 *   numeric usage value was collected.
 */
function parseJsonUsage(text) {
    const merged = {};
    collectJsonCandidates(text).forEach((candidate) => {
        try {
            const parsed = JSON.parse(candidate);
            mergeUsage(merged, extractUsage(parsed));
        }
        catch {
            // Agent output is often mixed JSONL plus text; malformed lines are ignored.
        }
    });
    const found = hasAnyUsage(merged);
    return { found, usage: merged };
}
|
|
44
|
+
/**
 * Lists the substrings of the text that might be JSON documents.
 *
 * The whole trimmed text is offered first (to cover pretty-printed,
 * multi-line JSON), followed by every trimmed line; a candidate must start
 * with "{" or "[". Duplicates are dropped while keeping first-seen order.
 *
 * @param {string} text - Raw agent output.
 * @returns {string[]} Unique candidate strings.
 */
function collectJsonCandidates(text) {
    const startsLikeJson = (chunk) => chunk.startsWith("{") || chunk.startsWith("[");
    const wholeText = text.trim();
    const lineCandidates = text
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter(startsLikeJson);
    const ordered = startsLikeJson(wholeText) ? [wholeText, ...lineCandidates] : lineCandidates;
    return Array.from(new Set(ordered));
}
|
|
58
|
+
/**
 * Gathers usage figures from a parsed JSON value.
 *
 * @param {unknown} value - Parsed JSON value to scan.
 * @returns {object} A fresh usage object populated by `visit`.
 */
function extractUsage(value) {
    const collected = {};
    visit(value, collected);
    return collected;
}
|
|
63
|
+
/**
 * Recursively walks a parsed JSON value and records usage figures.
 *
 * For every object encountered, each alias listed in `numberFields` is read;
 * finite numeric values are folded into `usage` under the normalized key,
 * keeping the largest value seen so far (with 0 as the floor). Arrays and
 * nested objects are traversed; primitives and null are ignored.
 *
 * @param {unknown} value - Current node of the parsed JSON.
 * @param {object} usage - Accumulator that is mutated in place.
 * @returns {void}
 */
function visit(value, usage) {
    if (value === null || typeof value !== "object") {
        return;
    }
    if (Array.isArray(value)) {
        for (const element of value) {
            visit(element, usage);
        }
        return;
    }
    for (const usageKey of Object.keys(numberFields)) {
        for (const alias of numberFields[usageKey]) {
            const candidate = value[alias];
            if (typeof candidate !== "number" || !Number.isFinite(candidate)) {
                continue;
            }
            const best = usage[usageKey] ?? 0;
            usage[usageKey] = Math.max(best, candidate);
        }
    }
    for (const property of Object.keys(value)) {
        visit(value[property], usage);
    }
}
|
|
84
|
+
/**
 * Extracts token counts from free-form text such as "Input tokens: 120".
 *
 * Each usage key has one pattern; the first match in the text wins. Two
 * details matter for correctness:
 *  - the input pattern refuses a match that is directly preceded by "cached",
 *    so "cached input tokens: 50" is not misread as the input count;
 *  - counts may use comma thousands separators ("12,345"), which are stripped
 *    before conversion instead of truncating the number at the first comma.
 *
 * @param {string} text - Raw agent output.
 * @returns {{found: boolean, usage: object}} Parsed counts and whether any
 *   count was found.
 */
function parsePlainTextUsage(text) {
    const usage = {};
    const patterns = [
        ["inputTokens", /(?<!cached\s+)(?:input|prompt)\s+tokens?\s*[:=]\s*(\d{1,3}(?:,\d{3})+(?!\d)|\d+)/i],
        ["outputTokens", /(?:output|completion)\s+tokens?\s*[:=]\s*(\d{1,3}(?:,\d{3})+(?!\d)|\d+)/i],
        ["cachedInputTokens", /cached\s+input\s+tokens?\s*[:=]\s*(\d{1,3}(?:,\d{3})+(?!\d)|\d+)/i],
        ["reasoningTokens", /reasoning\s+tokens?\s*[:=]\s*(\d{1,3}(?:,\d{3})+(?!\d)|\d+)/i],
        ["totalTokens", /total\s+tokens?\s*[:=]\s*(\d{1,3}(?:,\d{3})+(?!\d)|\d+)/i]
    ];
    for (const [key, pattern] of patterns) {
        const match = text.match(pattern);
        if (match?.[1]) {
            // Drop thousands separators so "12,345" becomes 12345.
            usage[key] = Number(match[1].replaceAll(",", ""));
        }
    }
    return { found: hasAnyUsage(usage), usage };
}
|
|
101
|
+
/**
 * Folds one usage object into another, keeping the larger value per key.
 *
 * Only the keys declared in `numberFields` are considered, and keys that are
 * undefined in `source` are left untouched in `target`.
 *
 * @param {object} target - Accumulator, mutated in place.
 * @param {object} source - Usage values to merge in.
 * @returns {void}
 */
function mergeUsage(target, source) {
    Object.keys(numberFields)
        .filter((key) => source[key] !== undefined)
        .forEach((key) => {
            const current = target[key] ?? 0;
            target[key] = Math.max(current, source[key]);
        });
}
|
|
108
|
+
/**
 * Reports whether a usage object holds at least one numeric value.
 *
 * @param {object} usage - Usage object to inspect.
 * @returns {boolean} True when any own enumerable value has type "number".
 */
function hasAnyUsage(usage) {
    for (const entry of Object.values(usage)) {
        if (typeof entry === "number") {
            return true;
        }
    }
    return false;
}
|
|
111
|
+
/**
 * Assembles the parse result for usage that was found in agent output.
 *
 * When no total was reported but an input or output count was, the total is
 * derived as input + output + reasoning (missing parts count as 0). The
 * derivation is done on a copy so the caller's `usage` object is never
 * modified (and a frozen object can be passed safely).
 *
 * @param {object} usage - Parsed usage counts; not mutated.
 * @param {string} source - Where the usage came from, e.g. "cli-json".
 * @param {string} rawText - Agent output the usage was parsed from.
 * @param {string[]} warnings - Warning list; a note is appended in place when
 *   the reliability is anything other than "high".
 * @returns {{tokenUsage: object, tokenUsageSource: string, tokenUsageReliability: string, warnings: string[]}}
 */
function buildResult(usage, source, rawText, warnings) {
    const completed = { ...usage };
    const hasInputOrOutput = completed.inputTokens !== undefined || completed.outputTokens !== undefined;
    if (completed.totalTokens === undefined && hasInputOrOutput) {
        completed.totalTokens = (completed.inputTokens ?? 0) + (completed.outputTokens ?? 0) + (completed.reasoningTokens ?? 0);
    }
    const reliability = determineReliability(completed, source);
    if (reliability !== "high") {
        warnings.push("Token usage was partial in agent output.");
    }
    // `source` and `rawText` are spread last so they win over same-named usage keys.
    const tokenUsage = {
        ...completed,
        source,
        rawText
    };
    return {
        tokenUsage,
        tokenUsageSource: source,
        tokenUsageReliability: reliability,
        warnings
    };
}
|
|
131
|
+
/**
 * Grades how trustworthy a parsed usage object is.
 *
 * Usage counts as complete when it has a total and at least one of the input
 * or output counts. "cli-json" usage is graded "high" when complete and
 * "medium" otherwise; any other source is graded "medium" when complete and
 * "low" otherwise.
 *
 * @param {object} usage - Parsed usage counts.
 * @param {string} source - Where the usage came from.
 * @returns {"high" | "medium" | "low"}
 */
function determineReliability(usage, source) {
    const hasTotal = usage.totalTokens !== undefined;
    const hasBreakdown = usage.inputTokens !== undefined || usage.outputTokens !== undefined;
    const [whenComplete, whenPartial] = source === "cli-json" ? ["high", "medium"] : ["medium", "low"];
    return hasTotal && hasBreakdown ? whenComplete : whenPartial;
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { parseCommandString } from "../core/commandLine.js";
|
|
4
|
+
import { getAgentAdapter } from "./agentRegistry.js";
|
|
5
|
+
/**
 * Runs a prompt through the adapter selected by `options.agentId` and records
 * the prompt and the result on disk.
 *
 * `options.outDir` is created if needed; the prompt is written to
 * "prompt.txt" before the adapter runs, and the adapter's result is written
 * as pretty-printed JSON to "agent-run-result.json" afterwards.
 *
 * @param {object} options - Must provide `outDir`, `promptText` and `agentId`;
 *   the whole object is forwarded to the adapter's `runPrompt`.
 * @returns {Promise<unknown>} Whatever the adapter's `runPrompt` resolved to.
 */
export async function runAgentPrompt(options) {
    await mkdir(options.outDir, { recursive: true });
    const promptFile = path.join(options.outDir, "prompt.txt");
    await writeFile(promptFile, options.promptText, "utf8");
    const result = await getAgentAdapter(options.agentId).runPrompt(options);
    const resultFile = path.join(options.outDir, "agent-run-result.json");
    const serialized = `${JSON.stringify(result, null, 2)}\n`;
    await writeFile(resultFile, serialized, "utf8");
    return result;
}
|
|
13
|
+
/**
 * Turns a command-line template string into a structured command template.
 *
 * @param {string} template - Command string handed to `parseCommandString`.
 * @returns {{command: string, args: string[], promptPlaceholder: string}}
 *   The parsed executable and arguments, with the placeholder fixed to "{prompt}".
 */
export function parseAgentCommandTemplate(template) {
    const { executable, args } = parseCommandString(template);
    return { command: executable, args, promptPlaceholder: "{prompt}" };
}
|
|
21
|
+
/**
 * Substitutes the prompt text into a command template's arguments.
 *
 * For each argument, the placeholder wrapped in an extra pair of braces
 * (e.g. "{{prompt}}") is preferred; otherwise the bare placeholder is
 * replaced. If no argument contains either form, the prompt is appended as a
 * final argument.
 *
 * The prompt is inserted verbatim: a replacer function is used because a
 * replacement *string* would have sequences such as "$&", "$$", "$`" and "$'"
 * interpreted as substitution patterns, corrupting prompts that contain them.
 *
 * @param {{command: string, args: string[], promptPlaceholder: string}} template
 * @param {string} promptText - Prompt to insert.
 * @returns {{command: string, args: string[]}} Command with the prompt applied;
 *   `template.args` itself is not modified.
 */
export function applyPromptToCommandTemplate(template, promptText) {
    const placeholder = template.promptPlaceholder;
    const wrappedPlaceholder = `{${placeholder}}`;
    const insertPrompt = () => promptText;
    let replaced = false;
    const args = template.args.map((arg) => {
        // The wrapped form is checked first so "{{prompt}}" is consumed whole.
        const token = [wrappedPlaceholder, placeholder].find((candidate) => arg.includes(candidate));
        if (token === undefined) {
            return arg;
        }
        replaced = true;
        return arg.replaceAll(token, insertPrompt);
    });
    if (!replaced) {
        args.push(promptText);
    }
    return { command: template.command, args };
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { writeExperimentGalleryManifest } from "../gallery/index.js";
|
|
3
|
+
/**
 * Parses the command-line arguments of the gallery build command.
 *
 * Recognized flags each take the following argument as their value (an empty
 * string if the flag is last); when a flag repeats, the last value wins.
 * Unrecognized arguments are skipped.
 *
 * @param {string[]} argv - Arguments to parse.
 * @returns {{outDir: string, reportDir: (string|undefined), plotsDir: (string|undefined), visualizationsDir: (string|undefined), experimentDir: (string|undefined)}}
 * @throws {Error} With the usage text when `--out` is missing or empty.
 */
export function parseBuildGalleryArgs(argv) {
    const flagToOption = new Map([
        ["--report", "reportDir"],
        ["--plots", "plotsDir"],
        ["--visualizations", "visualizationsDir"],
        ["--experiment", "experimentDir"],
        ["--out", "outDir"]
    ]);
    const parsed = {
        outDir: "",
        reportDir: undefined,
        plotsDir: undefined,
        visualizationsDir: undefined,
        experimentDir: undefined
    };
    let cursor = 0;
    while (cursor < argv.length) {
        const option = flagToOption.get(argv[cursor]);
        if (option === undefined) {
            cursor += 1;
            continue;
        }
        parsed[option] = argv[cursor + 1] ?? "";
        // Skip the flag and the value it consumed.
        cursor += 2;
    }
    if (!parsed.outDir) {
        throw new Error("Usage: --out <dir> [--report <dir>] [--plots <dir>] [--visualizations <dir>] [--experiment <dir>]");
    }
    return parsed;
}
|
|
36
|
+
/**
 * Writes the experiment gallery manifest for already-parsed arguments.
 *
 * Every directory is resolved against `repoRoot`; optional directories that
 * are missing or empty are passed on as `undefined`.
 *
 * @param {{outDir: string, reportDir?: string, plotsDir?: string, visualizationsDir?: string, experimentDir?: string}} args
 * @param {string} [repoRoot=process.cwd()] - Base directory for resolution.
 * @returns {Promise<unknown>} The result of `writeExperimentGalleryManifest`.
 */
export async function runBuildGalleryFromArgs(args, repoRoot = process.cwd()) {
    const resolveOptional = (dir) => (dir ? path.resolve(repoRoot, dir) : undefined);
    const manifestOptions = {
        outDir: path.resolve(repoRoot, args.outDir),
        reportDir: resolveOptional(args.reportDir),
        plotsDir: resolveOptional(args.plotsDir),
        visualizationsDir: resolveOptional(args.visualizationsDir),
        experimentDir: resolveOptional(args.experimentDir)
    };
    return writeExperimentGalleryManifest(manifestOptions);
}
|
|
45
|
+
/**
 * CLI entry point for building the gallery.
 *
 * Parses `argv`, writes the gallery, and prints the item count plus the
 * manifest and index paths. Any failure is reported on stderr instead of
 * being thrown.
 *
 * @param {string[]} argv - Command-line arguments.
 * @returns {Promise<number>} 0 on success, 1 on any error.
 */
export async function runBuildGalleryCommand(argv) {
    try {
        const parsedArgs = parseBuildGalleryArgs(argv);
        const { manifest, manifestPath, indexPath } = await runBuildGalleryFromArgs(parsedArgs);
        console.log(`Items: ${manifest.items.length}\nManifest: ${manifestPath}\nIndex: ${indexPath}`);
        return 0;
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(message);
        return 1;
    }
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { captureReportScreenshot, SCREENSHOT_SKIP_WARNING } from "../screenshot/index.js";
|
|
4
|
+
import { getReportArtifactPaths, normalizeLabReport, writeReportArtifacts } from "../report/index.js";
|
|
5
|
+
/**
 * Parses the arguments of the capture-demo-report command.
 *
 * `--input` and `--out` take the following argument as their value (an empty
 * string if the flag is last); `--no-screenshot` is a bare switch. Any other
 * argument is skipped.
 *
 * @param {string[]} argv - Arguments to parse.
 * @returns {{inputPath: string, outDir: string, noScreenshot: boolean}}
 * @throws {Error} With the usage text when `--input` or `--out` is missing or empty.
 */
function parseArgs(argv) {
    const parsed = { inputPath: "", outDir: "", noScreenshot: false };
    let cursor = 0;
    while (cursor < argv.length) {
        switch (argv[cursor]) {
            case "--input":
                parsed.inputPath = argv[cursor + 1] ?? "";
                cursor += 2;
                break;
            case "--out":
                parsed.outDir = argv[cursor + 1] ?? "";
                cursor += 2;
                break;
            case "--no-screenshot":
                parsed.noScreenshot = true;
                cursor += 1;
                break;
            default:
                cursor += 1;
        }
    }
    if (!(parsed.inputPath && parsed.outDir)) {
        throw new Error("Usage: --input <path> --out <directory> [--no-screenshot]");
    }
    return parsed;
}
|
|
28
|
+
/**
 * Checks that parsed report input has the fields the report pipeline needs.
 *
 * The input must be a non-array object. Arrays are rejected up front with the
 * "must be a JSON object" message; previously they slipped past the object
 * check and produced a misleading "Missing required field" error.
 *
 * @param {unknown} value - Parsed JSON to validate.
 * @returns {void}
 * @throws {Error} "Report input must be a JSON object." when `value` is not a
 *   plain object; "Missing required field: <name>" when a required string
 *   field is absent, not a string, or empty, or a required array field is not
 *   an array.
 */
function validateReportInput(value) {
    if (!value || typeof value !== "object" || Array.isArray(value)) {
        throw new Error("Report input must be a JSON object.");
    }
    const report = value;
    const requiredStringFields = ["reportId", "title", "projectName", "benchmarkProject", "workflowName", "summary"];
    for (const field of requiredStringFields) {
        if (typeof report[field] !== "string" || report[field] === "") {
            throw new Error(`Missing required field: ${field}`);
        }
    }
    const requiredArrayFields = ["steps", "metrics", "artifacts", "warnings"];
    for (const field of requiredArrayFields) {
        if (!Array.isArray(report[field])) {
            throw new Error(`Missing required field: ${field}`);
        }
    }
}
|
|
46
|
+
/**
 * Builds the multi-line console summary for a written report.
 *
 * The report id and the HTML and JSON paths are always listed. One extra line
 * may follow depending on the screenshot status: the PNG path when captured,
 * the warning when skipped (if one is set), or the failure message when
 * failed (if an error is set).
 *
 * @param {{report: {reportId: string}, outputPaths: {htmlPath: string, jsonPath: string, pngPath: string}, screenshot: {status: string, warning?: string, error?: string}}} result
 * @returns {string} Summary lines joined with "\n".
 */
function summarizeResult(result) {
    const { report, outputPaths, screenshot } = result;
    const summary = [
        `Report ID: ${report.reportId}`,
        `HTML: ${outputPaths.htmlPath}`,
        `JSON: ${outputPaths.jsonPath}`
    ];
    switch (screenshot.status) {
        case "captured":
            summary.push(`PNG: ${outputPaths.pngPath}`);
            break;
        case "skipped":
            if (screenshot.warning) {
                summary.push(screenshot.warning);
            }
            break;
        case "failed":
            if (screenshot.error) {
                summary.push(`PNG screenshot failed: ${screenshot.error}`);
            }
            break;
        default:
            break;
    }
    return summary.join("\n");
}
|
|
63
|
+
/**
 * CLI entry point: reads a report input file, writes the report artifacts and
 * optionally captures a PNG screenshot of the HTML report.
 *
 * Flow: parse arguments, read and JSON-parse the input file, validate and
 * normalize it, then write artifacts. Unless `--no-screenshot` was given, the
 * artifacts are written once before the capture and once more afterwards with
 * the capture outcome. All errors are reported on stderr rather than thrown.
 *
 * @param {string[]} argv - Command-line arguments.
 * @returns {Promise<number>} 0 on success; 1 when an error occurred or the
 *   screenshot status is "failed".
 */
export async function runCaptureDemoReportCommand(argv) {
    try {
        const args = parseArgs(argv);
        let rawInput;
        try {
            rawInput = await readFile(path.resolve(args.inputPath), "utf8");
        }
        catch (readError) {
            // A missing file gets a friendlier message; other failures pass through unchanged.
            if (readError.code === "ENOENT") {
                throw new Error(`Input file not found: ${args.inputPath}`);
            }
            throw readError;
        }
        let parsed;
        try {
            parsed = JSON.parse(rawInput);
        }
        catch (parseError) {
            throw new Error(`Invalid JSON input: ${parseError.message}`);
        }
        validateReportInput(parsed);
        const report = normalizeLabReport(parsed);
        const outDir = path.resolve(args.outDir);
        const outputPaths = getReportArtifactPaths(outDir, report.reportId);
        const { htmlPath, pngPath } = outputPaths;
        let screenshot;
        if (!args.noScreenshot) {
            // First pass: write the artifacts with a placeholder "skipped" status before capturing.
            await writeReportArtifacts({
                report,
                outDir,
                screenshot: { status: "skipped", htmlPath, pngPath }
            });
            screenshot = await captureReportScreenshot(htmlPath, pngPath);
            if (screenshot.status === "skipped" && !screenshot.warning) {
                screenshot.warning = SCREENSHOT_SKIP_WARNING;
            }
        }
        else {
            screenshot = {
                status: "skipped",
                htmlPath,
                pngPath,
                warning: "PNG screenshot skipped because --no-screenshot was provided."
            };
        }
        const result = await writeReportArtifacts({ report, outDir, screenshot });
        console.log(summarizeResult(result));
        return screenshot.status === "failed" ? 1 : 0;
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(message);
        return 1;
    }
}
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { readEvaluationCases, runRawFullFileBaseline, runMyDevKitRetrieval, compareTokenSavings, writeTokenSavingsArtifacts } from "../evaluation/index.js";
|
|
3
|
+
import { captureReportScreenshot, SCREENSHOT_SKIP_WARNING } from "../screenshot/index.js";
|
|
4
|
+
/**
 * Parses the command-line arguments of the token-savings evaluation command.
 *
 * Recognized options are `--cases <path>`, `--kit-command <command>` and
 * `--out <directory>`, plus the boolean flags `--require-kit` and
 * `--no-screenshot`. Unrecognized tokens are ignored.
 *
 * A value option whose next token is absent or is itself a `--flag` is treated
 * as having no value. The following flag is therefore still parsed on its own
 * instead of being swallowed as the option's value, and the missing value
 * surfaces as the usage error below.
 *
 * @param {string[]} argv - Raw argument tokens.
 * @returns {{casesPath: string, kitCommand: string, outDir: string, requireKit: boolean, noScreenshot: boolean}}
 * @throws {Error} Usage message when `--cases`, `--kit-command` or `--out` is
 *   missing or empty.
 */
export function parseEvaluateTokenSavingsArgs(argv) {
    const parsed = {
        casesPath: "",
        kitCommand: "",
        outDir: "",
        requireKit: false,
        noScreenshot: false
    };
    // Maps are used (rather than plain objects) so tokens such as "constructor"
    // can never match an inherited property.
    const valueOptions = new Map([
        ["--cases", "casesPath"],
        ["--kit-command", "kitCommand"],
        ["--out", "outDir"]
    ]);
    const booleanFlags = new Map([
        ["--require-kit", "requireKit"],
        ["--no-screenshot", "noScreenshot"]
    ]);
    for (let index = 0; index < argv.length; index += 1) {
        const arg = argv[index];
        if (valueOptions.has(arg)) {
            const next = argv[index + 1];
            const hasValue = typeof next === "string" && !next.startsWith("--");
            parsed[valueOptions.get(arg)] = hasValue ? next : "";
            if (hasValue) {
                // Only skip the next token when it really was this option's value.
                index += 1;
            }
        }
        else if (booleanFlags.has(arg)) {
            parsed[booleanFlags.get(arg)] = true;
        }
    }
    if (!parsed.casesPath || !parsed.kitCommand || !parsed.outDir) {
        throw new Error("Usage: --cases <path> --kit-command <command> --out <directory> [--require-kit] [--no-screenshot]");
    }
    return parsed;
}
|
|
36
|
+
/**
 * Formats the evaluation summary shown on the console.
 *
 * @param {object} summary - Object with `caseCount`, `completedCaseCount`,
 *   `skippedCaseCount` and `totalTokensSaved`.
 * @param {string} outDir - Output location printed on the last line.
 * @returns {string} Five labelled lines joined with "\n".
 */
function summarize(summary, outDir) {
    const { caseCount, completedCaseCount, skippedCaseCount, totalTokensSaved } = summary;
    const report = [];
    report.push(`Cases: ${caseCount}`);
    report.push(`Completed: ${completedCaseCount}`);
    report.push(`Skipped: ${skippedCaseCount}`);
    report.push(`Total estimated tokens saved: ${totalTokensSaved}`);
    report.push(`Output: ${outDir}`);
    return report.join("\n");
}
|
|
45
|
+
/**
 * Runs the token-savings evaluation for every case and writes the artifacts.
 *
 * For each case the raw full-file baseline and the kit retrieval are executed
 * one after the other, then all results are compared. Artifacts are written
 * twice: once with a placeholder screenshot entry, and once more after the
 * screenshot has been captured or skipped, so the final artifacts carry the
 * real screenshot outcome.
 *
 * @param {object} args - Parsed options (`casesPath`, `kitCommand`, `outDir`,
 *   `requireKit`, `noScreenshot`).
 * @param {string} [repoRoot=process.cwd()] - Base directory used to resolve
 *   `casesPath` and `outDir`.
 * @returns {Promise<object>} `{ commandConfig, cases, runs, comparison, artifacts }`.
 */
export async function runTokenSavingsEvaluation(args, repoRoot = process.cwd()) {
    // Copies the listed keys, in order, into a fresh object (missing keys become undefined).
    const pick = (source, keys) => Object.fromEntries(keys.map((key) => [key, source[key]]));
    const caseKeys = [
        "id", "title", "benchmarkProject", "targetRoot", "sourceRoots",
        "query", "expectedFiles", "expectedSymbols", "rawIncludeGlobs", "notes"
    ];
    const baselineKeys = [
        "caseId", "targetRoot", "filesIncluded", "totalFiles",
        "totalChars", "totalEstimatedTokens", "tokenCountMethod", "durationMs"
    ];
    const kitKeys = [
        "caseId", "skipped", "warnings", "totalChars", "totalEstimatedTokens",
        "tokenCountMethod", "filesRead", "commands", "selectedNodeId",
        "selectedFile", "selectedSymbol", "durationMs"
    ];
    const telemetryKeys = ["commandId", "stdoutPath", "stderrPath", "telemetryPath", "exitCode", "ok"];

    const resolvedCasesPath = path.resolve(repoRoot, args.casesPath);
    const cases = await readEvaluationCases(resolvedCasesPath, repoRoot);
    const outputDir = path.resolve(repoRoot, args.outDir);
    const commandConfig = {
        casesPath: resolvedCasesPath,
        kitCommand: args.kitCommand,
        requireKit: args.requireKit,
        noScreenshot: args.noScreenshot,
        outputDir
    };

    // Cases are processed strictly one at a time, baseline first.
    const evaluations = [];
    for (const evaluationCase of cases) {
        const rawBaseline = await runRawFullFileBaseline(evaluationCase);
        const myDevKit = await runMyDevKitRetrieval({
            evaluationCase,
            kitCommand: args.kitCommand,
            outputDir,
            requireKit: args.requireKit
        });
        evaluations.push({ evaluationCase, rawBaseline, myDevKit });
    }
    const comparison = compareTokenSavings(evaluations);

    const placeholderScreenshot = {
        status: "skipped",
        htmlPath: path.join(outputDir, "token-savings-report.html"),
        pngPath: path.join(outputDir, "token-savings-report.png")
    };
    const runs = evaluations.map((entry, index) => ({
        case: pick(entry.evaluationCase, caseKeys),
        rawBaseline: pick(entry.rawBaseline, baselineKeys),
        myDevKit: {
            ...pick(entry.myDevKit, kitKeys),
            commandTelemetry: entry.myDevKit.commands.map((command) => pick(command, telemetryKeys))
        },
        comparison: comparison.cases[index]
    }));

    const persist = (screenshot) => writeTokenSavingsArtifacts({
        outDir: outputDir,
        summary: comparison.summary,
        runs,
        comparisonCases: comparison.cases,
        commandConfig,
        screenshot
    });

    // First write uses the placeholder; its artifact paths feed the screenshot step.
    const initialArtifacts = await persist(placeholderScreenshot);
    let screenshot;
    if (args.noScreenshot) {
        screenshot = {
            status: "skipped",
            htmlPath: initialArtifacts.artifactPaths.htmlPath,
            pngPath: initialArtifacts.artifactPaths.pngPath,
            warning: "PNG screenshot skipped because --no-screenshot was provided."
        };
    }
    else {
        screenshot = await captureReportScreenshot(initialArtifacts.artifactPaths.htmlPath, initialArtifacts.artifactPaths.pngPath);
        const needsDefaultWarning = screenshot.status === "skipped" && !screenshot.warning;
        if (needsDefaultWarning) {
            screenshot.warning = SCREENSHOT_SKIP_WARNING;
        }
    }
    // Second write records the actual screenshot outcome.
    const artifacts = await persist(screenshot);
    return { commandConfig, cases, runs, comparison, artifacts };
}
|
|
157
|
+
/**
 * CLI entry point for the token-savings evaluation.
 *
 * Parses `argv`, runs the evaluation and prints the summary. With
 * `--require-kit`, the command fails when no case completed or when the
 * screenshot status is "failed". Any thrown error is printed to stderr
 * (message only) and also yields a failure.
 *
 * @param {string[]} argv - Raw command-line argument tokens.
 * @returns {Promise<number>} Process exit code: 0 on success, 1 on failure.
 */
export async function runEvaluateTokenSavingsCommand(argv) {
    try {
        const args = parseEvaluateTokenSavingsArgs(argv);
        const { artifacts, commandConfig } = await runTokenSavingsEvaluation(args);
        console.log(summarize(artifacts.summary, commandConfig.outputDir));
        const strictFailure = (args.requireKit && artifacts.summary.completedCaseCount === 0)
            || (artifacts.screenshot.status === "failed" && args.requireKit);
        return strictFailure ? 1 : 0;
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(message);
        return 1;
    }
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { writePlotArtifacts } from "../plots/index.js";
|
|
3
|
+
/**
 * Parses the command-line arguments of the experiment-plot generator.
 *
 * Recognized options are `--experiment <dir>` and `--out <dir>`; other tokens
 * are ignored. An option whose next token is absent or is itself a `--flag`
 * is treated as having no value, so a following option is parsed on its own
 * rather than being taken as a directory name.
 *
 * @param {string[]} argv - Raw argument tokens.
 * @returns {{experimentDir: string, outDir: string}}
 * @throws {Error} Usage message when either option is missing or empty.
 */
export function parseGenerateExperimentPlotsArgs(argv) {
    const parsed = { experimentDir: "", outDir: "" };
    const optionTargets = new Map([
        ["--experiment", "experimentDir"],
        ["--out", "outDir"]
    ]);
    for (let index = 0; index < argv.length; index += 1) {
        const target = optionTargets.get(argv[index]);
        if (target === undefined) {
            continue;
        }
        const next = argv[index + 1];
        const hasValue = typeof next === "string" && !next.startsWith("--");
        parsed[target] = hasValue ? next : "";
        if (hasValue) {
            // Only skip the next token when it really was this option's value.
            index += 1;
        }
    }
    if (!parsed.experimentDir || !parsed.outDir) {
        throw new Error("Usage: --experiment <dir> --out <dir>");
    }
    return parsed;
}
|
|
20
|
+
/**
 * Resolves the experiment and output directories against `repoRoot` and
 * delegates to `writePlotArtifacts`.
 *
 * @param {{experimentDir: string, outDir: string}} args - Parsed options.
 * @param {string} [repoRoot=process.cwd()] - Base directory for both paths.
 * @returns {Promise<object>} Whatever `writePlotArtifacts` resolves to.
 */
export async function runGenerateExperimentPlotsFromArgs(args, repoRoot = process.cwd()) {
    const experimentDir = path.resolve(repoRoot, args.experimentDir);
    const outDir = path.resolve(repoRoot, args.outDir);
    return writePlotArtifacts({ experimentDir, outDir, repoRoot });
}
|
|
27
|
+
/**
 * CLI entry point for experiment-plot generation.
 *
 * Parses `argv`, writes the plot artifacts and prints the chart count, the
 * skipped-point count and the resolved output directory. Any thrown error is
 * printed to stderr (message only).
 *
 * @param {string[]} argv - Raw command-line argument tokens.
 * @returns {Promise<number>} 0 on success, 1 when an error was thrown.
 */
export async function runGenerateExperimentPlotsCommand(argv) {
    try {
        const args = parseGenerateExperimentPlotsArgs(argv);
        const { summary } = await runGenerateExperimentPlotsFromArgs(args);
        const report = [
            `Charts: ${summary.chartCount}`,
            `Skipped points: ${summary.skippedPointCount}`,
            `Output: ${path.resolve(args.outDir)}`
        ];
        console.log(report.join("\n"));
        return 0;
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(message);
        return 1;
    }
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { readBenchmarkProjectProfiles, readEvaluationCases } from "../evaluation/index.js";
|
|
3
|
+
import { generatePromptVariants, parsePromptComplexityLevel, parsePromptStrategy, writePromptArtifacts } from "../prompts/index.js";
|
|
4
|
+
/**
 * Parses the command-line arguments of the prompt-variant generator.
 *
 * Path options: `--cases <path>`, `--out <directory>` and
 * `--project-profiles <path>` (defaults to
 * "benchmarks/contracts/benchmark-project-profiles.json"). A path option whose
 * next token is absent or is itself a `--flag` is treated as having no value,
 * so the following option is parsed on its own instead of being taken as a
 * path.
 *
 * `--strategy` and `--complexity` hand their next token (or "" when absent) to
 * `parsePromptStrategy` / `parsePromptComplexityLevel` and always consume it.
 * Unrecognized tokens are ignored.
 *
 * @param {string[]} argv - Raw argument tokens.
 * @returns {{casesPath: string, outDir: string, projectProfilesPath: string, strategy: *, complexity: *}}
 *   `strategy` and `complexity` stay `undefined` when their option is not given.
 * @throws {Error} Usage message when `casesPath`, `outDir` or
 *   `projectProfilesPath` ends up empty.
 */
export function parseGeneratePromptVariantsArgs(argv) {
    const parsed = {
        casesPath: "",
        outDir: "",
        projectProfilesPath: "benchmarks/contracts/benchmark-project-profiles.json",
        strategy: undefined,
        complexity: undefined
    };
    const pathOptions = new Map([
        ["--cases", "casesPath"],
        ["--out", "outDir"],
        ["--project-profiles", "projectProfilesPath"]
    ]);
    for (let index = 0; index < argv.length; index += 1) {
        const arg = argv[index];
        if (pathOptions.has(arg)) {
            const next = argv[index + 1];
            const hasValue = typeof next === "string" && !next.startsWith("--");
            parsed[pathOptions.get(arg)] = hasValue ? next : "";
            if (hasValue) {
                // Only skip the next token when it really was this option's value.
                index += 1;
            }
        }
        else if (arg === "--strategy") {
            parsed.strategy = parsePromptStrategy(argv[index + 1] ?? "");
            index += 1;
        }
        else if (arg === "--complexity") {
            parsed.complexity = parsePromptComplexityLevel(argv[index + 1] ?? "");
            index += 1;
        }
    }
    if (!parsed.casesPath || !parsed.outDir || !parsed.projectProfilesPath) {
        throw new Error("Usage: --cases <path> --out <directory> [--project-profiles <path>] [--strategy <raw-full-file|my-dev-kit-guided>] [--complexity <short|medium|long|multi-step>]");
    }
    return parsed;
}
|
|
38
|
+
/**
 * Loads the project profiles and evaluation cases, generates the prompt
 * variants and writes the prompt artifacts.
 *
 * When `args.strategy` or `args.complexity` is set, generation is limited to
 * that single value; otherwise `undefined` is passed for that filter.
 *
 * @param {object} args - Parsed options (`projectProfilesPath`, `casesPath`,
 *   `outDir`, optional `strategy` and `complexity`).
 * @param {string} [repoRoot=process.cwd()] - Base directory for all paths.
 * @returns {Promise<object>} `{ cases, projectProfiles, variants, summary }`.
 */
export async function runGeneratePromptVariants(args, repoRoot = process.cwd()) {
    const fromRoot = (target) => path.resolve(repoRoot, target);
    const projectProfiles = await readBenchmarkProjectProfiles(fromRoot(args.projectProfilesPath), repoRoot);
    const caseOptions = { projectProfiles, requireProjectProfileRef: true };
    const cases = await readEvaluationCases(fromRoot(args.casesPath), repoRoot, caseOptions);
    let strategies;
    if (args.strategy) {
        strategies = [args.strategy];
    }
    let complexityLevels;
    if (args.complexity) {
        complexityLevels = [args.complexity];
    }
    const variants = generatePromptVariants({ cases, projectProfiles, strategies, complexityLevels });
    const summary = await writePromptArtifacts({ outDir: fromRoot(args.outDir), variants });
    return { cases, projectProfiles, variants, summary };
}
|
|
56
|
+
/**
 * CLI entry point for prompt-variant generation.
 *
 * Parses `argv`, generates the variants and prints the case count, the prompt
 * count and the resolved output directory. Any thrown error is printed to
 * stderr (message only).
 *
 * @param {string[]} argv - Raw command-line argument tokens.
 * @returns {Promise<number>} 0 on success, 1 when an error was thrown.
 */
export async function runGeneratePromptVariantsCommand(argv) {
    try {
        const args = parseGeneratePromptVariantsArgs(argv);
        const { summary } = await runGeneratePromptVariants(args);
        const report = [
            `Cases: ${summary.caseCount}`,
            `Prompts: ${summary.promptCount}`,
            `Output: ${path.resolve(args.outDir)}`
        ];
        console.log(report.join("\n"));
        return 0;
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(message);
        return 1;
    }
}
|