npm - @sebastiantuyu/agest - Versions diffs - 0.3.0 → 0.3.1 - Mend

@sebastiantuyu/agest 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/LICENSE ADDED

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Sebastian Tuyu
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md CHANGED

@@ -116,11 +116,13 @@ npx tsx examples/openrouter.test.ts
 - [x] Remote HTTP adapter for framework-agnostic testing
 - [x] Report persistence to `.reports/` with YAML format
 - [x] Stats CLI with multi-model comparison and dimension analysis
+- [x] Lifecycle hooks: `beforeEach`, `beforeAll`, `afterEach`, `afterAll` supporting sync/async functions
+- [x] Multiple test suites per agent via `suite()` to evaluate different aspects independently
+- [x] Statistical runs: `.runs(n)` per scene with pass rate and Wilson significance scoring
 ### Up next
 - [ ] Schema validation: `toBe.matchingSchema(zodSchema)`
 - [ ] Semantic similarity: `toBe.semanticallySimilarTo(text, threshold)`
-- [ ] Statistical runs: `.runs(n)` per scene with mean/stddev reporting
 - [ ] Vercel AI SDK adapter
 - [ ] Snapshot regression: diff current run against a saved baseline

package/dist/context.d.ts CHANGED

@@ -1,12 +1,17 @@
-import type { AgentExecutor, AgentReport, SceneDefinition } from "./types";
+import type { AgentExecutor, AgentReport, HookFn, SceneDefinition } from "./types";
 export declare class SceneBuilder {
     private _prompt;
     private _assertions;
     private _timeout?;
     private _turns?;
+    private _runs?;
+    private _suite?;
     constructor(_prompt: string);
     timeout(ms: number): SceneBuilder;
     turns(n: number): SceneBuilder;
+    runs(n: number): SceneBuilder;
+    /** @internal */
+    _setSuite(name: string): void;
     expect(field: string, fn: (value: any) => void): SceneBuilder;
     toDefinition(): SceneDefinition;
 }
@@ -14,7 +19,15 @@ export declare class AgentContext {
     private _executor;
     private _name?;
     private _scenes;
+    private _currentSuite?;
+    private _beforeAllHooks;
+    private _afterAllHooks;
+    private _beforeEachHooks;
+    private _afterEachHooks;
     constructor(_executor: AgentExecutor, _name?: string | undefined);
+    registerHook(type: "beforeAll" | "afterAll" | "beforeEach" | "afterEach", fn: HookFn): void;
+    setSuite(name: string): void;
+    clearSuite(): void;
     registerScene(prompt: string): SceneBuilder;
     execute(): Promise<AgentReport>;
 }

package/dist/context.js CHANGED

@@ -9,6 +9,8 @@ export class SceneBuilder {
     _assertions = [];
     _timeout;
     _turns;
+    _runs;
+    _suite;
     constructor(_prompt) {
         this._prompt = _prompt;
     }
@@ -20,24 +22,56 @@ export class SceneBuilder {
         this._turns = n;
         return this;
     }
+    runs(n) {
+        this._runs = Math.max(1, Math.round(n));
+        return this;
+    }
+    /** @internal */
+    _setSuite(name) {
+        this._suite = name;
+    }
     expect(field, fn) {
         this._assertions.push({ field, fn });
         return this;
     }
     toDefinition() {
-        return { prompt: this._prompt, assertions: [...this._assertions], timeout: this._timeout, turns: this._turns };
+        return {
+            prompt: this._prompt,
+            assertions: [...this._assertions],
+            timeout: this._timeout,
+            turns: this._turns,
+            runs: this._runs,
+            suite: this._suite,
+        };
     }
 }
 export class AgentContext {
     _executor;
     _name;
     _scenes = [];
+    _currentSuite;
+    _beforeAllHooks = [];
+    _afterAllHooks = [];
+    _beforeEachHooks = [];
+    _afterEachHooks = [];
     constructor(_executor, _name) {
         this._executor = _executor;
         this._name = _name;
     }
+    registerHook(type, fn) {
+        this[`_${type}Hooks`].push(fn);
+    }
+    setSuite(name) {
+        this._currentSuite = name;
+    }
+    clearSuite() {
+        this._currentSuite = undefined;
+    }
     registerScene(prompt) {
         const builder = new SceneBuilder(prompt);
+        if (this._currentSuite) {
+            builder._setSuite(this._currentSuite);
+        }
         this._scenes.push(builder);
         return builder;
     }
@@ -47,32 +81,82 @@ export class AgentContext {
         const definitions = this._scenes.map((s) => s.toDefinition());
         const orderedResults = new Array(definitions.length);
         const total = definitions.length;
-        logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}${parallelism > 1 ? c.dim(` (parallelism: ${parallelism})`) : ""}...\n`));
-        const tasks = definitions.map((scene, i) => async () => {
+        // Group scenes by suite for organized output
+        const suiteNames = [...new Set(definitions.map((d) => d.suite).filter(Boolean))];
+        const hasSuites = suiteNames.length > 0;
+        const suiteCount = hasSuites ? ` (${suiteNames.length} suite${suiteNames.length !== 1 ? "s" : ""})` : "";
+        logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}${suiteCount}${parallelism > 1 ? c.dim(` (parallelism: ${parallelism})`) : ""}...\n`));
+        // Run beforeAll hooks
+        for (const hook of this._beforeAllHooks) {
+            await hook();
+        }
+        const buildTask = (scene, i) => async () => {
             const label = scene.prompt.length > 60
                 ? scene.prompt.slice(0, 57) + "..."
                 : scene.prompt;
+            // Run beforeEach hooks
+            for (const hook of this._beforeEachHooks) {
+                await hook();
+            }
             const result = await executeScene(this._executor, scene, config.timeout, config.judge, config.turns);
             orderedResults[i] = result;
+            // Run afterEach hooks
+            for (const hook of this._afterEachHooks) {
+                await hook();
+            }
             const ms = result.duration.toFixed(0);
+            const runsLabel = result.runs ? c.dim(` [${result.runs.filter(r => r.passed).length}/${result.runs.length} passed]`) : "";
+            const indent = hasSuites ? "    " : "  ";
             if (result.passed) {
-                logger.info(`  ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.green("PASS")}${c.dim(` (${ms}ms)`)}`);
+                logger.info(`${indent}${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.green("PASS")}${c.dim(` (${ms}ms)`)}${runsLabel}`);
             }
             else if (result.judgement?.verdict === "partial") {
-                logger.info(`  ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.yellow("PARTIAL")}${c.dim(` (${ms}ms)`)}`);
+                logger.info(`${indent}${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.yellow("PARTIAL")}${c.dim(` (${ms}ms)`)}${runsLabel}`);
                 if (result.error) {
-                    logger.info(`         ${c.yellow(result.error)}`);
+                    logger.info(`${indent}       ${c.yellow(result.error)}`);
                 }
             }
             else {
-                logger.info(`  ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.red("FAIL")}${c.dim(` (${ms}ms)`)}`);
+                logger.info(`${indent}${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.red("FAIL")}${c.dim(` (${ms}ms)`)}${runsLabel}`);
                 if (result.error) {
-                    logger.info(`         ${c.red(result.error)}`);
+                    logger.info(`${indent}       ${c.red(result.error)}`);
                 }
             }
-            logger.debug(`         response: ${result.response.text?.slice(0, 120)}`);
-        });
-        await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
+            if (result.statisticalSignificance != null) {
+                const sig = result.statisticalSignificance;
+                const sigColor = sig >= 0.95 ? c.green : sig >= 0.80 ? c.yellow : c.red;
+                logger.info(`${indent}       ${c.dim("significance:")} ${sigColor(`${(sig * 100).toFixed(1)}%`)} ${c.dim(`(pass rate: ${((result.passRate ?? 0) * 100).toFixed(1)}%)`)}`);
+            }
+            logger.debug(`${indent}       response: ${result.response.text?.slice(0, 120)}`);
+        };
+        if (hasSuites) {
+            // Execute suite by suite — print header once, then run all scenes in that suite
+            for (const suiteName of suiteNames) {
+                const suiteIndices = definitions
+                    .map((d, i) => d.suite === suiteName ? i : -1)
+                    .filter((i) => i !== -1);
+                logger.info(`  ${c.bold(c.cyan(`▸ ${suiteName}`))} ${c.dim(`(${suiteIndices.length} scene${suiteIndices.length !== 1 ? "s" : ""})`)}`);
+                const tasks = suiteIndices.map((i) => buildTask(definitions[i], i));
+                await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
+                logger.info("");
+            }
+            // Run any scenes not in a suite
+            const unsuitedIndices = definitions
+                .map((d, i) => d.suite ? -1 : i)
+                .filter((i) => i !== -1);
+            if (unsuitedIndices.length > 0) {
+                const tasks = unsuitedIndices.map((i) => buildTask(definitions[i], i));
+                await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
+            }
+        }
+        else {
+            const tasks = definitions.map((scene, i) => buildTask(scene, i));
+            await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
+        }
+        // Run afterAll hooks
+        for (const hook of this._afterAllHooks) {
+            await hook();
+        }
         const results = orderedResults;
         let totalDuration = results.reduce((sum, r) => sum + r.duration, 0);
         logger.info("");

package/dist/index.d.ts CHANGED

@@ -1,4 +1,4 @@
-import type { AgentExecutor, AgentReport } from "./types";
+import type { AgentExecutor, AgentReport, HookFn } from "./types";
 import { SceneBuilder } from "./context";
 export { expect } from "./assertions";
 export { logger } from "./logger";
@@ -7,11 +7,16 @@ export type { AgestConfig, JudgeConfig, JudgeExecutor } from "./config";
 export type { LogLevel } from "./logger";
 export type { AgentExpectation, AgentMatchers } from "./assertions";
 export type { JudgeCriteria } from "./judge";
-export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, JudgeVerdict, JudgeResult, } from "./types";
+export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, RunResult, JudgeVerdict, JudgeResult, HookFn, } from "./types";
 export interface AgentOptions {
     name?: string;
 }
 export declare function scene(prompt: string): SceneBuilder;
+export declare function beforeAll(fn: HookFn): void;
+export declare function afterAll(fn: HookFn): void;
+export declare function beforeEach(fn: HookFn): void;
+export declare function afterEach(fn: HookFn): void;
+export declare function suite(name: string, fn: () => void): void;
 /** @internal reset auto-run state between tests */
 export declare function _resetAutoRun(): void;
 export declare function agent(executor: AgentExecutor, fn: () => void, options?: AgentOptions): Promise<AgentReport>;

package/dist/index.js CHANGED

@@ -5,12 +5,36 @@ export { defineConfig } from "./config";
 export function scene(prompt) {
     return getContext().registerScene(prompt);
 }
+export function beforeAll(fn) {
+    getContext().registerHook("beforeAll", fn);
+}
+export function afterAll(fn) {
+    getContext().registerHook("afterAll", fn);
+}
+export function beforeEach(fn) {
+    getContext().registerHook("beforeEach", fn);
+}
+export function afterEach(fn) {
+    getContext().registerHook("afterEach", fn);
+}
+export function suite(name, fn) {
+    const ctx = getContext();
+    ctx.setSuite(name);
+    try {
+        fn();
+    }
+    finally {
+        ctx.clearSuite();
+    }
+}
 const pendingAgents = [];
 let autoRunScheduled = false;
+let executionChain = Promise.resolve();
 /** @internal reset auto-run state between tests */
 export function _resetAutoRun() {
     pendingAgents.length = 0;
     autoRunScheduled = false;
+    executionChain = Promise.resolve();
 }
 export function agent(executor, fn, options) {
     const ctx = new AgentContext(executor, options?.name);
@@ -23,7 +47,8 @@ export function agent(executor, fn, options) {
         return Promise.reject(err);
     }
     setContext(null);
-    const promise = ctx.execute();
+    const promise = executionChain.then(() => ctx.execute());
+    executionChain = promise.then(() => { }, () => { });
     pendingAgents.push(promise);
     if (!autoRunScheduled) {
         autoRunScheduled = true;