npm - @dailephd/my-dev-kit-lab - Versions diffs - 0.2.0 - Mend

@dailephd/my-dev-kit-lab 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (250) hide show

package/dist/src/evaluation/runControlledExperiment.js ADDED Viewed

@@ -0,0 +1,158 @@
+import path from "node:path";
+import { runAgentPrompt } from "../agents/index.js";
+import { generatePromptVariants } from "../prompts/index.js";
+import { buildExperimentMatrix } from "./buildExperimentMatrix.js";
+import { classifyAgentRunOutcome } from "./classifyAgentRunOutcome.js";
+import { compareExperimentRuns } from "./compareExperimentRuns.js";
+import { parseAgentAnswer } from "./parseAgentAnswer.js";
+import { scoreCorrectness } from "./scoreCorrectness.js";
+import { buildExperimentSummary, writeExperimentArtifacts } from "./writeExperimentArtifacts.js";
+export async function runControlledExperiment(args) {
+    const repoRoot = args.repoRoot ?? process.cwd();
+    const config = {
+        ...args.config,
+        agents: args.config.agents ?? ["fake-agent"],
+        strategies: args.config.strategies ?? ["raw-full-file", "my-dev-kit-guided"],
+        complexityLevels: args.config.complexityLevels ?? ["short"],
+        continueOnFailure: args.config.continueOnFailure ?? true,
+        includeRealAgents: args.config.includeRealAgents ?? false
+    };
+    const matrix = buildExperimentMatrix({ cases: args.cases, config });
+    const runs = [];
+    for (const cell of matrix) {
+        const evaluationCase = args.cases.find((candidate) => candidate.id === cell.caseId);
+        if (!evaluationCase) {
+            throw new Error(`Evaluation case not found while running matrix: ${cell.caseId}`);
+        }
+        const promptVariant = buildPromptVariant({
+            evaluationCase,
+            projectProfiles: args.projectProfiles,
+            strategy: cell.strategy,
+            complexityLevel: cell.complexityLevel
+        });
+        const runDir = path.join(path.resolve(repoRoot, config.outDir), "runs", cell.runId);
+        let run = await executeExperimentCell({
+            runId: cell.runId,
+            agentId: cell.agentId,
+            promptVariant,
+            repoRoot,
+            runDir,
+            timeoutMs: config.timeoutMs,
+            requireAgents: config.requireAgents ?? false,
+            commandTemplate: cell.agentId === "codex" || cell.agentId === "claude" ? config.commandTemplates?.[cell.agentId] : undefined,
+            env: args.env ?? process.env
+        });
+        runs.push(run);
+        if (run.status !== "completed" && config.continueOnFailure === false) {
+            break;
+        }
+    }
+    const comparisons = compareExperimentRuns(runs);
+    const summary = buildExperimentSummary({ config, runs, comparisons });
+    return writeExperimentArtifacts({
+        outDir: path.resolve(repoRoot, config.outDir),
+        config,
+        runs,
+        comparisons,
+        summary
+    });
+}
+async function executeExperimentCell(args) {
+    let agentRunResult;
+    try {
+        agentRunResult = await runAgentPrompt({
+            runId: args.runId,
+            agentId: args.agentId,
+            promptVariant: args.promptVariant,
+            promptText: args.promptVariant.promptText,
+            cwd: args.repoRoot,
+            outDir: args.runDir,
+            timeoutMs: args.timeoutMs,
+            requireAvailable: args.requireAgents,
+            commandTemplate: args.commandTemplate,
+            env: args.env
+        });
+    }
+    catch (error) {
+        agentRunResult = buildSyntheticFailureResult(args, error);
+    }
+    const parsedAnswer = parseAgentAnswer({
+        text: agentRunResult.finalAnswerText,
+        answerKey: args.promptVariant.expectedAnswerKey,
+        tokenUsage: agentRunResult.tokenUsage
+    });
+    const classification = classifyAgentRunOutcome({ agentRunResult, parsedAnswer });
+    const correctness = scoreCorrectness({
+        caseId: args.promptVariant.caseId,
+        answerKey: args.promptVariant.expectedAnswerKey,
+        parsedAnswer,
+        status: classification.status
+    });
+    return {
+        runId: args.runId,
+        caseId: args.promptVariant.caseId,
+        benchmarkProject: args.promptVariant.benchmarkProject,
+        agentId: args.agentId,
+        promptStrategy: args.promptVariant.strategy,
+        promptComplexityLevel: args.promptVariant.complexityLevel,
+        promptVariantId: args.promptVariant.id,
+        promptTextForArtifact: args.promptVariant.promptText,
+        projectComplexityLevel: args.promptVariant.projectProfile.complexityLevel,
+        projectComplexityScore: args.promptVariant.projectProfile.complexityScore,
+        promptMetrics: args.promptVariant.promptMetrics,
+        agentRunResult,
+        parsedAnswer,
+        correctness,
+        status: classification.status,
+        statusReason: classification.statusReason,
+        startedAt: agentRunResult.startedAt,
+        endedAt: agentRunResult.endedAt,
+        durationMs: agentRunResult.durationMs,
+        tokenUsage: agentRunResult.tokenUsage,
+        tokenUsageSource: agentRunResult.tokenUsageSource,
+        tokenUsageReliability: agentRunResult.tokenUsageReliability,
+        warnings: classification.warnings,
+        errors: classification.errors,
+        artifactPaths: {}
+    };
+}
+function buildPromptVariant(args) {
+    const [variant] = generatePromptVariants({
+        cases: [args.evaluationCase],
+        projectProfiles: args.projectProfiles,
+        strategies: [args.strategy],
+        complexityLevels: [args.complexityLevel]
+    });
+    if (!variant) {
+        throw new Error(`Failed to generate prompt variant for case: ${args.evaluationCase.id}`);
+    }
+    return variant;
+}
+function buildSyntheticFailureResult(args, error) {
+    const now = new Date().toISOString();
+    const message = error instanceof Error ? error.message : String(error);
+    return {
+        runId: args.runId,
+        agentId: args.agentId,
+        displayName: args.agentId,
+        surface: args.agentId === "fake-agent" ? "simulated" : "cli",
+        promptVariantId: args.promptVariant.id,
+        promptStrategy: args.promptVariant.strategy,
+        promptComplexityLevel: args.promptVariant.complexityLevel,
+        startedAt: now,
+        endedAt: now,
+        durationMs: 0,
+        status: "failed",
+        exitCode: null,
+        command: args.agentId,
+        args: [],
+        cwd: args.repoRoot,
+        finalAnswerText: "",
+        finalAnswerParseStatus: "empty",
+        tokenUsage: { source: "unavailable" },
+        tokenUsageSource: "unavailable",
+        tokenUsageReliability: "unavailable",
+        warnings: [],
+        errors: [message]
+    };
+}

package/dist/src/evaluation/runMyDevKitRetrieval.js ADDED Viewed

@@ -0,0 +1,197 @@
+import path from "node:path";
+import { countEstimatedTokens, countTextChars, tokenCountMethod } from "../core/countTokens.js";
+import { runMeasuredCommand } from "../core/runMeasuredCommand.js";
+function parseJsonIfPossible(text) {
+    try {
+        return JSON.parse(text);
+    }
+    catch {
+        return undefined;
+    }
+}
+function readSearchResults(payload) {
+    if (Array.isArray(payload)) {
+        return payload.filter((item) => !!item && typeof item === "object");
+    }
+    if (!payload || typeof payload !== "object") {
+        return [];
+    }
+    const record = payload;
+    for (const key of ["results", "matches", "items", "data"]) {
+        if (Array.isArray(record[key])) {
+            return record[key].filter((item) => !!item && typeof item === "object");
+        }
+    }
+    return [];
+}
+function pickCandidateFields(candidate) {
+    const readString = (...keys) => {
+        for (const key of keys) {
+            if (typeof candidate[key] === "string" && candidate[key]) {
+                return candidate[key];
+            }
+        }
+        return undefined;
+    };
+    return {
+        nodeId: readString("nodeId", "id", "node", "symbolId"),
+        file: readString("file", "path", "filePath"),
+        symbol: readString("symbol", "name", "label")
+    };
+}
+export async function runMyDevKitRetrieval(options) {
+    const started = Date.now();
+    const warnings = [];
+    const commandsDir = path.join(options.outputDir, "commands", options.evaluationCase.id);
+    const indexesDir = path.join(options.outputDir, "indexes", options.evaluationCase.id);
+    const commands = [];
+    const indexCommand = await runMeasuredCommand({
+        commandId: "index",
+        commandString: options.kitCommand,
+        cwd: process.cwd(),
+        outDir: commandsDir,
+        extraArgs: [
+            "index",
+            "--root",
+            options.evaluationCase.absoluteTargetRoot,
+            ...options.evaluationCase.sourceRoots.flatMap((sourceRoot) => ["--src", sourceRoot]),
+            "--out",
+            indexesDir,
+            "--json"
+        ]
+    });
+    commands.push(indexCommand);
+    if (!indexCommand.ok) {
+        if (options.requireKit) {
+            throw new Error(indexCommand.error || `my-dev-kit index failed with exit code ${indexCommand.exitCode}`);
+        }
+        warnings.push("my-dev-kit index command was unavailable or failed.");
+        return {
+            caseId: options.evaluationCase.id,
+            skipped: true,
+            warnings,
+            totalChars: 0,
+            totalEstimatedTokens: 0,
+            tokenCountMethod,
+            contextText: "",
+            filesRead: [],
+            commands,
+            durationMs: Date.now() - started
+        };
+    }
+    const searchCommand = await runMeasuredCommand({
+        commandId: "search",
+        commandString: options.kitCommand,
+        cwd: process.cwd(),
+        outDir: commandsDir,
+        extraArgs: ["search", "--index", indexesDir, "--query", options.evaluationCase.query, "--json"]
+    });
+    commands.push(searchCommand);
+    if (!searchCommand.ok) {
+        if (options.requireKit) {
+            throw new Error(searchCommand.error || `my-dev-kit search failed with exit code ${searchCommand.exitCode}`);
+        }
+        warnings.push("my-dev-kit search command failed.");
+        return {
+            caseId: options.evaluationCase.id,
+            skipped: true,
+            warnings,
+            totalChars: 0,
+            totalEstimatedTokens: 0,
+            tokenCountMethod,
+            contextText: "",
+            filesRead: [],
+            commands,
+            durationMs: Date.now() - started
+        };
+    }
+    const searchPayload = parseJsonIfPossible(searchCommand.stdout);
+    const candidates = readSearchResults(searchPayload);
+    const selected = candidates[0];
+    if (!selected) {
+        warnings.push("No my-dev-kit search candidate was found.");
+        return {
+            caseId: options.evaluationCase.id,
+            skipped: true,
+            warnings,
+            totalChars: 0,
+            totalEstimatedTokens: 0,
+            tokenCountMethod,
+            contextText: "",
+            filesRead: [],
+            commands,
+            durationMs: Date.now() - started
+        };
+    }
+    const candidate = pickCandidateFields(selected);
+    const selectedNodeId = candidate.nodeId;
+    const selectedFile = candidate.file;
+    const selectedSymbol = candidate.symbol;
+    let lookupOutput = "";
+    let sliceOutput = "";
+    let sourceOutput = "";
+    if (selectedNodeId) {
+        const lookupCommand = await runMeasuredCommand({
+            commandId: "lookup",
+            commandString: options.kitCommand,
+            cwd: process.cwd(),
+            outDir: commandsDir,
+            extraArgs: ["lookup", "--index", indexesDir, "--node", selectedNodeId, "--json"]
+        });
+        commands.push(lookupCommand);
+        if (lookupCommand.ok) {
+            lookupOutput = lookupCommand.stdout;
+        }
+        else {
+            warnings.push("my-dev-kit lookup command failed.");
+        }
+        const sliceCommand = await runMeasuredCommand({
+            commandId: "slice",
+            commandString: options.kitCommand,
+            cwd: process.cwd(),
+            outDir: commandsDir,
+            extraArgs: ["slice", "--index", indexesDir, "--node", selectedNodeId, "--json"]
+        });
+        commands.push(sliceCommand);
+        if (sliceCommand.ok) {
+            sliceOutput = sliceCommand.stdout;
+        }
+        else {
+            warnings.push("my-dev-kit slice command failed.");
+        }
+        const sourceCommand = await runMeasuredCommand({
+            commandId: "source",
+            commandString: options.kitCommand,
+            cwd: process.cwd(),
+            outDir: commandsDir,
+            extraArgs: ["source", "--index", indexesDir, "--node", selectedNodeId, "--max-lines", "160", "--format", "numbered"]
+        });
+        commands.push(sourceCommand);
+        if (sourceCommand.ok) {
+            sourceOutput = sourceCommand.stdout;
+        }
+        else {
+            warnings.push("my-dev-kit source command failed.");
+        }
+    }
+    else {
+        warnings.push("No my-dev-kit node id was available after search.");
+    }
+    const contextText = [sourceOutput, sliceOutput, lookupOutput, searchCommand.stdout].find((text) => text && text.trim().length > 0) ?? "";
+    const filesRead = selectedFile ? [selectedFile] : [];
+    return {
+        caseId: options.evaluationCase.id,
+        skipped: contextText.length === 0,
+        warnings,
+        totalChars: countTextChars(contextText),
+        totalEstimatedTokens: countEstimatedTokens(contextText),
+        tokenCountMethod,
+        contextText,
+        filesRead,
+        commands,
+        selectedNodeId,
+        selectedFile,
+        selectedSymbol,
+        durationMs: Date.now() - started
+    };
+}

package/dist/src/evaluation/runRawFullFileBaseline.js ADDED Viewed

@@ -0,0 +1,31 @@
+import { readFileSync, statSync } from "node:fs";
+import { collectFilesForGlobs } from "../core/fileGlobs.js";
+import { countEstimatedTokens, countTextChars, tokenCountMethod } from "../core/countTokens.js";
+export async function runRawFullFileBaseline(evaluationCase) {
+    const started = Date.now();
+    let stats;
+    try {
+        stats = statSync(evaluationCase.absoluteTargetRoot);
+    }
+    catch {
+        throw new Error(`Target root does not exist: ${evaluationCase.targetRoot}`);
+    }
+    if (!stats.isDirectory()) {
+        throw new Error(`Target root is not a directory: ${evaluationCase.targetRoot}`);
+    }
+    const files = collectFilesForGlobs(evaluationCase.absoluteTargetRoot, evaluationCase.rawIncludeGlobs);
+    const contextText = files
+        .map(({ absolutePath, relativePath }) => `=== FILE: ${relativePath} ===\n${readFileSync(absolutePath, "utf8")}\n`)
+        .join("\n");
+    return {
+        caseId: evaluationCase.id,
+        targetRoot: evaluationCase.absoluteTargetRoot,
+        filesIncluded: files.map((file) => file.relativePath),
+        totalFiles: files.length,
+        totalChars: countTextChars(contextText),
+        totalEstimatedTokens: countEstimatedTokens(contextText),
+        tokenCountMethod,
+        contextText,
+        durationMs: Date.now() - started
+    };
+}

package/dist/src/evaluation/scoreCorrectness.js ADDED Viewed

@@ -0,0 +1,127 @@
+export const CORRECTNESS_FORMULA = "correctnessScore = 0.25 * fileMatchScore + 0.25 * symbolMatchScore + 0.50 * factMatchScore; empty file or symbol categories are neutral at 1.0.";
+export function scoreCorrectness(args) {
+    const failureReasons = [];
+    if (args.status && args.status !== "completed" && args.status !== "invalid-output") {
+        failureReasons.push(statusFailureReason(args.status));
+    }
+    if (args.parsedAnswer.parseStatus === "failed") {
+        failureReasons.push("invalid output");
+    }
+    const expectedFilesFound = countMatches(args.answerKey.expectedFiles, args.parsedAnswer.relevantFiles);
+    const expectedSymbolsFound = countMatches(args.answerKey.expectedSymbols, args.parsedAnswer.relevantSymbols);
+    const factMatches = matchFacts(args.answerKey, args.parsedAnswer.expectedFactsFound, args.parsedAnswer.answerText);
+    const requiredFactsTotal = args.answerKey.expectedFacts.filter((fact) => fact.required).length;
+    const requiredFactsFound = factMatches.filter((fact) => fact.required).length;
+    const optionalFactsTotal = args.answerKey.expectedFacts.length - requiredFactsTotal;
+    const optionalFactsFound = factMatches.filter((fact) => !fact.required).length;
+    const fileMatchScore = categoryScore(expectedFilesFound, args.answerKey.expectedFiles.length);
+    const symbolMatchScore = categoryScore(expectedSymbolsFound, args.answerKey.expectedSymbols.length);
+    const factMatchScore = weightedFactScore(args.answerKey, factMatches.map((fact) => fact.id));
+    const correctnessScore = round(0.25 * fileMatchScore + 0.25 * symbolMatchScore + 0.5 * factMatchScore);
+    const foundFactCount = factMatches.length;
+    for (const fact of args.answerKey.expectedFacts.filter((fact) => fact.required)) {
+        if (!factMatches.some((match) => match.id === fact.id)) {
+            failureReasons.push(`missing required fact: ${fact.id}`);
+        }
+    }
+    if (foundFactCount < args.answerKey.minimumCorrectFacts) {
+        failureReasons.push(`too few facts found: ${foundFactCount}/${args.answerKey.minimumCorrectFacts}`);
+    }
+    if (args.answerKey.expectedFiles.length > 0 && expectedFilesFound < args.answerKey.expectedFiles.length) {
+        failureReasons.push("missing expected file");
+    }
+    if (args.answerKey.expectedSymbols.length > 0 && expectedSymbolsFound < args.answerKey.expectedSymbols.length) {
+        failureReasons.push("missing expected symbol");
+    }
+    if (correctnessScore < 0.7) {
+        failureReasons.push("score below threshold");
+    }
+    const passed = failureReasons.length === 0 &&
+        requiredFactsFound === requiredFactsTotal &&
+        foundFactCount >= args.answerKey.minimumCorrectFacts &&
+        correctnessScore >= 0.7;
+    return {
+        caseId: args.caseId,
+        fileMatchScore,
+        symbolMatchScore,
+        factMatchScore,
+        correctnessScore,
+        requiredFactsFound,
+        requiredFactsTotal,
+        optionalFactsFound,
+        optionalFactsTotal,
+        expectedFilesFound,
+        expectedFilesTotal: args.answerKey.expectedFiles.length,
+        expectedSymbolsFound,
+        expectedSymbolsTotal: args.answerKey.expectedSymbols.length,
+        passed,
+        failureReasons: unique(failureReasons),
+        formula: CORRECTNESS_FORMULA
+    };
+}
+function statusFailureReason(status) {
+    if (status === "agent-limit-reached")
+        return "agent limit reached";
+    if (status === "agent-unavailable")
+        return "agent unavailable";
+    if (status === "timeout")
+        return "timeout";
+    if (status === "failed")
+        return "agent run failed";
+    if (status === "skipped")
+        return "agent run skipped";
+    if (status === "invalid-output")
+        return "invalid output";
+    return status;
+}
+function countMatches(expected, actual) {
+    return expected.filter((item) => {
+        const expectedNormalized = normalize(item);
+        const expectedPath = normalizePath(item);
+        return actual.some((candidate) => {
+            const actualNormalized = normalize(candidate);
+            const actualPath = normalizePath(candidate);
+            return (actualNormalized === expectedNormalized ||
+                actualNormalized.includes(expectedNormalized) ||
+                expectedNormalized.includes(actualNormalized) ||
+                actualPath === expectedPath ||
+                actualPath.endsWith(`/${expectedPath}`) ||
+                expectedPath.endsWith(`/${actualPath}`));
+        });
+    }).length;
+}
+function matchFacts(answerKey, expectedFactsFound, answerText) {
+    const found = new Set(expectedFactsFound.map(normalize));
+    const normalizedAnswer = normalize(answerText);
+    return answerKey.expectedFacts.filter((fact) => {
+        const factId = normalize(fact.id);
+        const factText = normalize(fact.text);
+        return found.has(factId) || found.has(factText) || normalizedAnswer.includes(factId) || normalizedAnswer.includes(factText);
+    });
+}
+function weightedFactScore(answerKey, factIds) {
+    if (answerKey.expectedFacts.length === 0) {
+        return 1;
+    }
+    const found = new Set(factIds);
+    const totalWeight = answerKey.expectedFacts.reduce((sum, fact) => sum + fact.weight, 0);
+    if (totalWeight === 0) {
+        return 1;
+    }
+    return round(answerKey.expectedFacts.filter((fact) => found.has(fact.id)).reduce((sum, fact) => sum + fact.weight, 0) / totalWeight);
+}
+function categoryScore(found, total) {
+    return total === 0 ? 1 : round(found / total);
+}
+function normalize(value) {
+    return value.toLowerCase().replace(/[^a-z0-9]+/g, " ").trim();
+}
+function normalizePath(value) {
+    return value.toLowerCase().replace(/\\/g, "/").replace(/^\.?\//, "").trim();
+}
+function round(value) {
+    return Math.round(value * 10000) / 10000;
+}
+function unique(values) {
+    return [...new Set(values)];
+}

package/dist/src/evaluation/types.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ export {};

package/dist/src/evaluation/writeExperimentArtifacts.js ADDED Viewed

@@ -0,0 +1,104 @@
+import { mkdir, writeFile } from "node:fs/promises";
+import path from "node:path";
+export async function writeExperimentArtifacts(args) {
+    const outDir = path.resolve(args.outDir);
+    const runsDir = path.join(outDir, "runs");
+    await mkdir(runsDir, { recursive: true });
+    for (const run of args.runs) {
+        const runDir = path.join(runsDir, run.runId);
+        await mkdir(runDir, { recursive: true });
+        const promptPath = path.join(runDir, "prompt.txt");
+        const agentRunResultPath = path.join(runDir, "agent-run-result.json");
+        const parsedAnswerPath = path.join(runDir, "parsed-answer.json");
+        const correctnessScorePath = path.join(runDir, "correctness-score.json");
+        await writeFile(promptPath, getPromptTextFromRun(run), "utf8");
+        await writeFile(agentRunResultPath, `${JSON.stringify(run.agentRunResult, null, 2)}\n`, "utf8");
+        await writeFile(parsedAnswerPath, `${JSON.stringify(run.parsedAnswer, null, 2)}\n`, "utf8");
+        await writeFile(correctnessScorePath, `${JSON.stringify(run.correctness, null, 2)}\n`, "utf8");
+        run.artifactPaths = {
+            promptPath,
+            agentRunResultPath,
+            parsedAnswerPath,
+            correctnessScorePath
+        };
+    }
+    const artifactPaths = {
+        summaryPath: path.join(outDir, "experiment-summary.json"),
+        runsPath: path.join(outDir, "experiment-runs.json"),
+        comparisonsPath: path.join(outDir, "experiment-comparisons.json"),
+        configPath: path.join(outDir, "experiment-config.json"),
+        runsDir
+    };
+    await writeFile(artifactPaths.summaryPath, `${JSON.stringify(args.summary, null, 2)}\n`, "utf8");
+    await writeFile(artifactPaths.runsPath, `${JSON.stringify({ generatedAt: args.summary.generatedAt, runs: args.runs }, null, 2)}\n`, "utf8");
+    await writeFile(artifactPaths.comparisonsPath, `${JSON.stringify({ generatedAt: args.summary.generatedAt, comparisons: args.comparisons }, null, 2)}\n`, "utf8");
+    await writeFile(artifactPaths.configPath, `${JSON.stringify(sanitizeConfig(args.config), null, 2)}\n`, "utf8");
+    return {
+        summary: args.summary,
+        runs: args.runs,
+        comparisons: args.comparisons,
+        artifactPaths,
+        warnings: args.summary.warnings
+    };
+}
+export function buildExperimentSummary(args) {
+    const runs = args.runs;
+    const comparisons = args.comparisons;
+    const tokenSavings = comparisons
+        .map((comparison) => comparison.tokenSavingsPercent)
+        .filter((value) => typeof value === "number" && Number.isFinite(value));
+    const durationReductions = comparisons
+        .map((comparison) => comparison.durationReductionPercent)
+        .filter((value) => typeof value === "number" && Number.isFinite(value));
+    const correctnessDeltas = comparisons
+        .map((comparison) => comparison.correctnessDelta)
+        .filter((value) => typeof value === "number" && Number.isFinite(value));
+    const completedComparisons = comparisons.filter((comparison) => comparison.rawStatus === "completed" && comparison.myDevKitStatus === "completed");
+    return {
+        generatedAt: args.generatedAt ?? new Date().toISOString(),
+        casesPath: args.config.casesPath,
+        projectProfilesPath: args.config.projectProfilesPath,
+        agents: [...new Set(runs.map((run) => run.agentId))].sort(),
+        strategies: [...new Set(runs.map((run) => run.promptStrategy))].sort(),
+        complexityLevels: [...new Set(runs.map((run) => run.promptComplexityLevel))].sort(),
+        totalRuns: runs.length,
+        completedRuns: countStatus(runs, "completed"),
+        failedRuns: countStatus(runs, "failed"),
+        skippedRuns: countStatus(runs, "skipped"),
+        unavailableRuns: countStatus(runs, "agent-unavailable"),
+        limitReachedRuns: countStatus(runs, "agent-limit-reached"),
+        timeoutRuns: countStatus(runs, "timeout"),
+        invalidOutputRuns: countStatus(runs, "invalid-output"),
+        totalComparisons: comparisons.length,
+        averageTokenSavingsPercent: averageOrNull(tokenSavings),
+        averageDurationReductionPercent: averageOrNull(durationReductions),
+        averageCorrectnessDelta: averageOrNull(correctnessDeltas),
+        answerDoesMyDevKitSaveTokens: tokenSavings.length === 0 ? null : averageOrNull(tokenSavings) > 0,
+        answerDoesMyDevKitPreserveCorrectness: completedComparisons.length === 0 ? null : completedComparisons.every((comparison) => comparison.sameCorrectnessPass),
+        answerDoesMyDevKitReduceExecutionTime: durationReductions.length === 0 ? null : averageOrNull(durationReductions) > 0,
+        warnings: [
+            ...runs.flatMap((run) => run.warnings),
+            ...comparisons.flatMap((comparison) => comparison.warnings)
+        ]
+    };
+}
+function getPromptTextFromRun(run) {
+    return run.promptTextForArtifact ?? "";
+}
+function sanitizeConfig(config) {
+    return {
+        ...config,
+        commandTemplates: config.commandTemplates
+            ? Object.fromEntries(Object.entries(config.commandTemplates).map(([key, value]) => [key, value ? { ...value, args: value.args } : value]))
+            : undefined
+    };
+}
+function countStatus(runs, status) {
+    return runs.filter((run) => run.status === status).length;
+}
+function averageOrNull(values) {
+    if (values.length === 0) {
+        return null;
+    }
+    return values.reduce((sum, value) => sum + value, 0) / values.length;
+}

package/dist/src/evaluation/writeTokenSavingsArtifacts.js ADDED Viewed

@@ -0,0 +1,57 @@
+import { mkdir, writeFile } from "node:fs/promises";
+import path from "node:path";
+import { normalizeLabReport } from "../report/types.js";
+import { renderHtmlReport } from "../report/renderHtmlReport.js";
+import { renderTokenSavingsReportInput } from "./renderTokenSavingsReportInput.js";
+export async function writeTokenSavingsArtifacts(options) {
+    const outDir = path.resolve(options.outDir);
+    await mkdir(outDir, { recursive: true });
+    const artifactPaths = {
+        summaryPath: path.join(outDir, "token-savings-summary.json"),
+        runsPath: path.join(outDir, "token-savings-runs.json"),
+        htmlPath: path.join(outDir, "token-savings-report.html"),
+        pngPath: path.join(outDir, "token-savings-report.png")
+    };
+    const warnings = [...options.summary.warnings];
+    if (options.screenshot.warning) {
+        warnings.push(options.screenshot.warning);
+    }
+    if (options.screenshot.status === "failed" && options.screenshot.error) {
+        warnings.push(`PNG screenshot capture failed: ${options.screenshot.error}`);
+    }
+    const report = renderTokenSavingsReportInput({
+        summary: options.summary,
+        cases: options.comparisonCases,
+        commandConfig: options.commandConfig,
+        artifactPaths: {
+            summaryPath: artifactPaths.summaryPath,
+            runsPath: artifactPaths.runsPath,
+            htmlPath: artifactPaths.htmlPath
+        },
+        warnings
+    });
+    const normalizedReport = normalizeLabReport(report, options.generatedAt);
+    await writeFile(artifactPaths.summaryPath, JSON.stringify({
+        summary: options.summary,
+        tokenCountMethod: options.summary.tokenCountMethod,
+        generatedAt: normalizedReport.generatedAt,
+        commandConfiguration: options.commandConfig,
+        warnings,
+        screenshot: options.screenshot,
+        artifactPaths
+    }, null, 2), "utf8");
+    await writeFile(artifactPaths.runsPath, JSON.stringify({
+        generatedAt: normalizedReport.generatedAt,
+        tokenCountMethod: options.summary.tokenCountMethod,
+        runs: options.runs
+    }, null, 2), "utf8");
+    await writeFile(artifactPaths.htmlPath, renderHtmlReport(normalizedReport), "utf8");
+    return {
+        summary: options.summary,
+        runs: options.runs,
+        report: normalizedReport,
+        screenshot: options.screenshot,
+        artifactPaths,
+        warnings
+    };
+}