npm - @sebastiantuyu/agest - Versions diffs - 0.3.0 → 0.3.1 - Mend

@sebastiantuyu/agest 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/dist/reporter.js CHANGED Viewed

@@ -23,6 +23,52 @@ export function formatReport(report) {
             if (reason) {
                 lines.push(`          reason: "${reason}"`);
             }
+            const result = report.results.find((r) => r.prompt === c);
+            if (result?.response.text) {
+                const escaped = result.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
+                lines.push(`          response: "${escaped}"`);
+            }
+        }
+    }
+    // Suite breakdown
+    const suites = new Set(report.results.map((r) => r.suite).filter(Boolean));
+    if (suites.size > 0) {
+        lines.push(`    suites:`);
+        for (const s of suites) {
+            const suiteResults = report.results.filter((r) => r.suite === s);
+            const suitePassed = suiteResults.filter((r) => r.passed).length;
+            const suiteRate = suiteResults.length > 0
+                ? Number((suitePassed / suiteResults.length).toFixed(2))
+                : 0;
+            lines.push(`        - name: "${s}"`);
+            lines.push(`          success_rate: ${suiteRate}`);
+            lines.push(`          total_cases: ${suiteResults.length}`);
+            lines.push(`          failed_cases_count: ${suiteResults.length - suitePassed}`);
+            if (suitePassed < suiteResults.length) {
+                lines.push(`          failed_cases:`);
+                for (const r of suiteResults.filter((r) => !r.passed)) {
+                    lines.push(`              - "${r.prompt}"`);
+                    if (r.error) {
+                        lines.push(`                reason: "${r.error}"`);
+                    }
+                    if (r.response.text) {
+                        const escaped = r.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
+                        lines.push(`                response: "${escaped}"`);
+                    }
+                }
+            }
+        }
+    }
+    // Statistical runs summary
+    const withRuns = report.results.filter((r) => r.runs && r.runs.length > 1);
+    if (withRuns.length > 0) {
+        lines.push(`    statistical_runs:`);
+        for (const r of withRuns) {
+            const label = r.prompt.length > 50 ? r.prompt.slice(0, 47) + "..." : r.prompt;
+            lines.push(`        - "${label}"`);
+            lines.push(`          runs: ${r.runs.length}`);
+            lines.push(`          pass_rate: ${((r.passRate ?? 0) * 100).toFixed(1)}%`);
+            lines.push(`          significance: ${((r.statisticalSignificance ?? 0) * 100).toFixed(1)}%`);
         }
     }
     lines.push(`    timestamp: "${report.timestamp}"`, `    duration: ${report.duration}`, `    total_cases: ${report.totalCases}`);

package/dist/reports.d.ts CHANGED Viewed

@@ -1,3 +1,14 @@
/**
 * One entry of the `suites:` section of a parsed report.
 */
export interface ParsedSuiteResult {
    /** Suite name as written in the report's `- name: "..."` line. */
    name: string;
    /** Value of the suite's `success_rate` field. */
    successRate: number;
    /** Value of the suite's `total_cases` field. */
    totalCases: number;
    /** Value of the suite's `failed_cases_count` field. */
    failedCasesCount: number;
    /** Failed cases listed under the suite, each with its optional details. */
    failedCases: {
        prompt: string;
        reason?: string;
        response?: string;
    }[];
}
 export interface ParsedReport {
     name?: string;
     systemPromptHash?: string;
@@ -11,11 +22,13 @@ export interface ParsedReport {
     failedCases: Array<{
         prompt: string;
         reason?: string;
+        response?: string;
     }>;
     duration: number;
     timestamp: string;
     averageInputTokensPerCase?: number;
     averageOutputTokensPerCase?: number;
+    suites?: ParsedSuiteResult[];
     source: string;
 }
 export interface DiffEntry {
@@ -27,8 +40,10 @@ export declare function extractField(content: string, key: string): string | und
 export declare function parseFailedCases(content: string): Array<{
     prompt: string;
     reason?: string;
+    response?: string;
 }>;
 export declare function parseDimensions(content: string): Record<string, string> | undefined;
+export declare function parseSuites(content: string): ParsedSuiteResult[] | undefined;
 export declare function parseReport(content: string, source: string): ParsedReport;
 export declare function findReports(dir: string, depth?: number): Promise<string[]>;
 export declare function loadDiffEntry(hash: string): Promise<DiffEntry | null>;
@@ -75,4 +90,9 @@ export declare function findVaryingDimensions(reports: ParsedReport[]): string[]
  * Group reports by the value of a specific dimension.
  */
 export declare function groupByDimension(reports: ParsedReport[], dimension: string): Map<string, ParsedReport[]>;
+/**
+ * Wilson score interval lower bound at 95% confidence.
+ * Gives a conservative success rate estimate that accounts for sample size.
+ */
+export declare function wilsonLowerBound(successRate: number, totalCases: number): number;
 export declare function formatDuration(ms: number): string;

package/dist/reports.js CHANGED Viewed

@@ -20,9 +20,21 @@ export function parseFailedCases(content) {
             break;
         const promptMatch = line.match(/^\s+- "(.+)"$/);
         if (promptMatch) {
-            const next = lines[i + 1];
-            const reasonMatch = next?.match(/^\s+reason: "(.+)"$/);
-            cases.push({ prompt: promptMatch[1], reason: reasonMatch?.[1] });
+            let reason;
+            let response;
+            // Look ahead for reason and response fields
+            for (let j = i + 1; j < Math.min(i + 3, lines.length); j++) {
+                const next = lines[j];
+                if (!next || !next.match(/^\s+(reason|response):/))
+                    break;
+                const reasonMatch = next.match(/^\s+reason: "(.+)"$/);
+                if (reasonMatch)
+                    reason = reasonMatch[1];
+                const responseMatch = next.match(/^\s+response: "(.+)"$/);
+                if (responseMatch)
+                    response = responseMatch[1].replace(/\\n/g, '\n').replace(/\\"/g, '"');
+            }
+            cases.push({ prompt: promptMatch[1], reason, response });
         }
     }
     return cases;
@@ -44,6 +56,74 @@ export function parseDimensions(content) {
     }
     return Object.keys(dims).length > 0 ? dims : undefined;
 }
+export function parseSuites(content) {
+    const lines = content.split("\n");
+    const startIdx = lines.findIndex((l) => l.trim() === "suites:");
+    if (startIdx === -1)
+        return undefined;
+    const suites = [];
+    let current = null;
+    let parsingFailedCases = false;
+    for (let i = startIdx + 1; i < lines.length; i++) {
+        const line = lines[i];
+        // Stop if we exit the suites indentation level
+        if (line.length > 0 && !line.startsWith("        "))
+            break;
+        if (line.trim() === "")
+            continue;
+        const nameMatch = line.match(/^\s+- name: "(.+)"$/);
+        if (nameMatch) {
+            if (current)
+                suites.push(current);
+            current = { name: nameMatch[1], failedCases: [], failedCasesCount: 0 };
+            parsingFailedCases = false;
+            continue;
+        }
+        if (!current)
+            continue;
+        const srMatch = line.match(/^\s+success_rate: (.+)$/);
+        if (srMatch) {
+            current.successRate = parseFloat(srMatch[1]);
+            continue;
+        }
+        const tcMatch = line.match(/^\s+total_cases: (.+)$/);
+        if (tcMatch) {
+            current.totalCases = parseInt(tcMatch[1], 10);
+            continue;
+        }
+        const fccMatch = line.match(/^\s+failed_cases_count: (.+)$/);
+        if (fccMatch) {
+            current.failedCasesCount = parseInt(fccMatch[1], 10);
+            continue;
+        }
+        if (line.trim() === "failed_cases:") {
+            parsingFailedCases = true;
+            continue;
+        }
+        if (parsingFailedCases) {
+            const promptMatch = line.match(/^\s+- "(.+)"$/);
+            if (promptMatch) {
+                let reason;
+                let response;
+                for (let j = i + 1; j < Math.min(i + 3, lines.length); j++) {
+                    const next = lines[j];
+                    if (!next || !next.match(/^\s+(reason|response):/))
+                        break;
+                    const reasonMatch = next.match(/^\s+reason: "(.+)"$/);
+                    if (reasonMatch)
+                        reason = reasonMatch[1];
+                    const responseMatch = next.match(/^\s+response: "(.+)"$/);
+                    if (responseMatch)
+                        response = responseMatch[1].replace(/\\n/g, '\n').replace(/\\"/g, '"');
+                }
+                current.failedCases.push({ prompt: promptMatch[1], reason, response });
+            }
+        }
+    }
+    if (current)
+        suites.push(current);
+    return suites.length > 0 ? suites : undefined;
+}
 export function parseReport(content, source) {
     const num = (key, fallback = 0) => parseFloat(extractField(content, key) ?? String(fallback));
     const avgIn = extractField(content, "average_input_tokens_per_case");
@@ -78,6 +158,7 @@ export function parseReport(content, source) {
         timestamp: extractField(content, "timestamp") ?? "",
         averageInputTokensPerCase: avgIn != null ? parseFloat(avgIn) : undefined,
         averageOutputTokensPerCase: avgOut != null ? parseFloat(avgOut) : undefined,
+        suites: parseSuites(content),
         source,
     };
 }
@@ -267,6 +348,21 @@ export function groupByDimension(reports, dimension) {
     }
     return groups;
 }
/**
 * Wilson score interval lower bound at 95% confidence.
 * Gives a conservative success rate estimate that accounts for sample size.
 *
 * @param successRate Observed fraction of successes; values outside [0, 1] are clamped.
 * @param totalCases Number of observations behind `successRate`.
 * @returns The lower bound in [0, 1], or 0 when there is no usable sample
 *   (non-positive or non-finite `totalCases`, or a non-finite `successRate`).
 */
export function wilsonLowerBound(successRate, totalCases) {
    // Without a positive, finite sample size the formula divides by zero or
    // takes the square root of a negative number, so report no confidence.
    if (!Number.isFinite(successRate) || !Number.isFinite(totalCases) || totalCases <= 0)
        return 0;
    const z = 1.96;
    // p * (1 - p) turns negative outside [0, 1], which would produce NaN below.
    const p = Math.min(1, Math.max(0, successRate));
    const denominator = 1 + (z * z) / totalCases;
    const centre = p + (z * z) / (2 * totalCases);
    const spread = z * Math.sqrt((p * (1 - p) + (z * z) / (4 * totalCases)) / totalCases);
    const lower = (centre - spread) / denominator;
    return Math.max(0, Math.min(1, lower));
}
 export function formatDuration(ms) {
     if (ms < 1000)
         return `${ms.toFixed(0)}ms`;

package/dist/runner.js CHANGED Viewed

@@ -13,11 +13,26 @@ export function extractField(response, field) {
             return response.metadata?.[field];
     }
 }
-export async function executeScene(executor, scene, globalTimeout, judgeConfig, globalTurns) {
/**
 * Compute the Wilson score interval lower bound for an observed pass count.
 * The result is a conservative estimate of the true pass rate: it grows with
 * both the observed rate and the number of runs.
 * z = 1.96 for 95% confidence level.
 *
 * @param passes Number of passing runs; the derived rate is clamped to [0, 1].
 * @param total Total number of runs.
 * @returns The lower bound in [0, 1], or 0 when `total` is not a positive
 *   finite number or `passes` is not finite.
 */
function wilsonSignificance(passes, total) {
    // A non-positive or non-finite sample size would divide by zero or take
    // the square root of a negative number.
    if (!Number.isFinite(passes) || !Number.isFinite(total) || total <= 0)
        return 0;
    const z = 1.96;
    // Clamp so p * (1 - p) can never be negative (e.g. passes > total).
    const p = Math.min(1, Math.max(0, passes / total));
    const denominator = 1 + (z * z) / total;
    const centre = p + (z * z) / (2 * total);
    const spread = z * Math.sqrt((p * (1 - p) + (z * z) / (4 * total)) / total);
    const lower = (centre - spread) / denominator;
    // Return the lower bound clamped to [0, 1]
    return Math.max(0, Math.min(1, lower));
}
+async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig) {
     let response = { text: "" };
     let duration;
-    const timeoutMs = scene.timeout ?? globalTimeout ?? DEFAULT_SCENE_TIMEOUT;
-    const turns = scene.turns ?? globalTurns ?? 1;
     try {
         const start = performance.now();
         let input = scene.prompt;
@@ -38,21 +53,14 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
     }
     catch (err) {
         return {
-            prompt: scene.prompt,
-            response: { text: "", executionError: err.message },
-            duration: 0,
             passed: false,
             error: err.message,
+            response: { text: "", executionError: err.message },
+            duration: 0,
         };
     }
     if (response.executionError) {
-        return {
-            prompt: scene.prompt,
-            response,
-            duration,
-            passed: false,
-            error: response.executionError,
-        };
+        return { passed: false, error: response.executionError, response, duration };
     }
     let passed = true;
     let error;
@@ -94,5 +102,52 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
             }
         }
     }
-    return { prompt: scene.prompt, response, duration, passed, error, judgement };
+    return { passed, error, response, duration, judgement };
+}
/**
 * Execute a scene once, or `scene.runs` times with aggregated statistics.
 *
 * With a single run the result mirrors that run and carries no statistics.
 * With several runs the scene passes only when strictly more than half of the
 * runs pass; the result then also carries every individual run, the pass rate
 * and the Wilson lower bound of that rate.
 *
 * @param executor Agent executor forwarded to each run.
 * @param scene Scene definition; `timeout`, `turns`, `runs` and `suite` are read here.
 * @param globalTimeout Fallback timeout used when the scene defines none.
 * @param judgeConfig Judge configuration forwarded to each run.
 * @param globalTurns Fallback turn count used when the scene defines none.
 * @returns The scene result, tagged with `scene.suite`.
 */
export async function executeScene(executor, scene, globalTimeout, judgeConfig, globalTurns) {
    const timeoutMs = scene.timeout ?? globalTimeout ?? DEFAULT_SCENE_TIMEOUT;
    const turns = scene.turns ?? globalTurns ?? 1;
    // Normalise the run count. NaN used to skip both the fast path and the
    // loop and then crash on an empty `runs` array; Infinity never finished.
    // Fractional counts keep their previous effective value (rounded up).
    const requestedRuns = Number(scene.runs ?? 1);
    const numRuns = Number.isFinite(requestedRuns) ? Math.ceil(requestedRuns) : 1;
    // Single run — original fast path
    if (numRuns <= 1) {
        const run = await executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig);
        return {
            prompt: scene.prompt,
            response: run.response,
            duration: run.duration,
            passed: run.passed,
            error: run.error,
            judgement: run.judgement,
            suite: scene.suite,
        };
    }
    // Multiple runs — execute N times sequentially and aggregate
    const runs = [];
    for (let i = 0; i < numRuns; i++) {
        runs.push(await executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig));
    }
    const passes = runs.filter((r) => r.passed).length;
    const passRate = passes / runs.length;
    const totalDuration = runs.reduce((sum, r) => sum + r.duration, 0);
    const statisticalSignificance = wilsonSignificance(passes, runs.length);
    // Use the last run's response as representative
    const lastRun = runs[runs.length - 1];
    // Overall pass = majority passed (> 50%); an exact tie counts as a failure
    const overallPassed = passRate > 0.5;
    const failedRuns = runs.filter((r) => !r.passed);
    // Surface the first failing run's error, falling back to a generic message
    const error = overallPassed
        ? undefined
        : failedRuns[0]?.error ?? "Majority of runs failed";
    return {
        prompt: scene.prompt,
        response: lastRun.response,
        duration: totalDuration,
        passed: overallPassed,
        error,
        judgement: lastRun.judgement,
        suite: scene.suite,
        runs,
        passRate,
        statisticalSignificance,
    };
}

package/dist/stats.js CHANGED Viewed

@@ -197,6 +197,8 @@ async function main() {
     const args = process.argv.slice(2);
     const agentFlagIdx = args.indexOf("--agent");
     const agentFilter = agentFlagIdx !== -1 ? args[agentFlagIdx + 1] : undefined;
+    const modelFlagIdx = args.indexOf("--model");
+    const modelFilter = modelFlagIdx !== -1 ? args[modelFlagIdx + 1] : undefined;
     if (args.includes("--purge")) {
         await purge(process.cwd());
         return;
@@ -220,8 +222,15 @@ async function main() {
             return;
         }
     }
+    if (modelFilter) {
+        reports = reports.filter((r) => r.model.toLowerCase() === modelFilter.toLowerCase());
+        if (reports.length === 0) {
+            console.log(`\n  No reports found for model "${modelFilter}".\n`);
+            return;
+        }
+    }
     console.log("\n" + "━".repeat(W));
-    const filterLabel = agentFilter ? `  ·  agent: ${agentFilter}` : "";
+    const filterLabel = (agentFilter ? `  ·  agent: ${agentFilter}` : "") + (modelFilter ? `  ·  model: ${modelFilter}` : "");
     console.log(`  AGEST STATS  ·  ${reports.length} report${reports.length !== 1 ? "s" : ""} found${filterLabel}`);
     console.log("━".repeat(W));
     // Aggregate by model
@@ -250,6 +259,29 @@ async function main() {
         value: a.avgSuccessRate,
         display: `${(a.avgSuccessRate * 100).toFixed(0).padStart(3)}%`,
     })), 1);
+    // Suite breakdown (aggregate across all reports that have suites)
+    const withSuites = reports.filter((r) => r.suites && r.suites.length > 0);
+    if (withSuites.length > 0) {
+        const suiteAgg = new Map();
+        for (const r of withSuites) {
+            for (const s of r.suites) {
+                const arr = suiteAgg.get(s.name) ?? [];
+                arr.push(s.successRate);
+                suiteAgg.set(s.name, arr);
+            }
+        }
+        const suiteRows = [...suiteAgg.entries()]
+            .map(([name, rates]) => {
+            const avgRate = rates.reduce((a, b) => a + b, 0) / rates.length;
+            return {
+                label: name,
+                value: avgRate,
+                display: `${(avgRate * 100).toFixed(0).padStart(3)}%`,
+            };
+        })
+            .sort((a, b) => b.value - a.value);
+        printSection("Suite Breakdown", suiteRows, 1);
+    }
     // Token charts (only when data is present)
     const withTokens = agg.filter((a) => a.avgInputTokens != null && a.avgOutputTokens != null);
     if (withTokens.length > 0) {

package/dist/types.d.ts CHANGED Viewed

@@ -14,6 +14,7 @@ export interface AgentResponse {
         [key: string]: unknown;
     };
 }
+export type HookFn = () => void | Promise<void>;
 export interface SceneDefinition {
     prompt: string;
     assertions: Array<{
@@ -22,6 +23,8 @@ export interface SceneDefinition {
     }>;
     timeout?: number;
     turns?: number;
+    runs?: number;
+    suite?: string;
 }
 export type JudgeVerdict = "pass" | "fail" | "partial";
 export interface JudgeResult {
@@ -29,6 +32,13 @@ export interface JudgeResult {
     reasoning: string;
     criteria: string;
 }
/**
 * Outcome of one execution of a scene.
 */
export interface RunResult {
    /** Whether this run passed. */
    passed: boolean;
    /** Failure message for this run, when there is one. */
    error?: string;
    /** Response the agent produced during this run. */
    response: AgentResponse;
    /** Duration recorded for this run. */
    duration: number;
    /** Judge result for this run, when one was produced. */
    judgement?: JudgeResult;
}
 export interface SceneResult {
     prompt: string;
     response: AgentResponse;
@@ -36,6 +46,10 @@ export interface SceneResult {
     passed: boolean;
     error?: string;
     judgement?: JudgeResult;
+    suite?: string;
+    runs?: RunResult[];
+    passRate?: number;
+    statisticalSignificance?: number;
 }
 export interface AgentReport {
     name?: string;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sebastiantuyu/agest",
-  "version": "0.3.0",
+  "version": "0.3.1",
   "description": "A testing library for agents",
   "repository": {
     "type": "git",