npm - @sebastiantuyu/agest - Versions diffs - 0.3.2 → 0.3.3-next.10 - Mend

@sebastiantuyu/agest 0.3.2 → 0.3.3-next.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

package/README.md +158 -1
package/dist/adapters/index.d.ts +2 -0
package/dist/adapters/index.js +1 -0
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/langchain.js +80 -11
package/dist/adapters/remote.d.ts +1 -1
package/dist/adapters/remote.js +3 -2
package/dist/adapters/tracing.d.ts +73 -0
package/dist/adapters/tracing.js +338 -0
package/dist/assertions.d.ts +57 -2
package/dist/assertions.js +119 -33
package/dist/cli.d.ts +15 -1
package/dist/cli.js +97 -18
package/dist/config.d.ts +9 -0
package/dist/context.d.ts +32 -11
package/dist/context.js +84 -10
package/dist/discover.d.ts +16 -0
package/dist/discover.js +62 -0
package/dist/index.d.ts +20 -2
package/dist/index.js +10 -3
package/dist/match.d.ts +28 -0
package/dist/match.js +57 -0
package/dist/preview.js +93 -0
package/dist/pricing/index.d.ts +32 -0
package/dist/pricing/index.js +48 -0
package/dist/pricing/models.json +21 -0
package/dist/reporter.d.ts +1 -1
package/dist/reporter.js +77 -4
package/dist/reports.d.ts +37 -0
package/dist/reports.js +126 -0
package/dist/resolve.d.ts +25 -0
package/dist/resolve.js +62 -0
package/dist/runner.d.ts +11 -2
package/dist/runner.js +97 -11
package/dist/schema.d.ts +63 -0
package/dist/schema.js +61 -0
package/dist/types.d.ts +84 -9
package/dist/waterfall.d.ts +11 -0
package/dist/waterfall.js +46 -0
package/package.json +24 -15

package/dist/runner.js CHANGED Viewed

@@ -1,16 +1,34 @@
 import { collectPendingJudgements } from "./assertions";
 import { callJudge, resolveJudgeExecutor } from "./judge";
+import { resolveValue, resolveText, serializeValue, navigatePath } from "./resolve";
+import { validateAgainstSchema } from "./schema";
 const DEFAULT_SCENE_TIMEOUT = 10_000;
+/**
+ * Extract a named field from an agent response for assertion.
+ * - "response" / "value" → the native structured value (deterministic matchers)
+ * - "text"               → the serialized/judge view (lazy; text matchers)
+ * - "metadata"/"refusal" → the corresponding response property
+ * - dot-path             → navigated into the structured value first
+ *                          (e.g. "plan_items.0.options"), falling back to
+ *                          metadata so existing metadata paths keep resolving.
+ */
 export function extractField(response, field) {
     switch (field) {
         case "response":
-            return response.text;
+        case "value":
+            return resolveValue(response);
+        case "text":
+            return resolveText(response);
         case "metadata":
             return response.metadata;
         case "refusal":
             return response.refusal;
-        default:
-            return response.metadata?.[field];
+        default: {
+            const fromValue = navigatePath(resolveValue(response), field);
+            if (fromValue !== undefined)
+                return fromValue;
+            return navigatePath(response.metadata ?? {}, field);
+        }
     }
 }
 /**
@@ -31,19 +49,29 @@ function wilsonSignificance(passes, total) {
     return Math.max(0, Math.min(1, lower));
 }
 async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig) {
+    // The empty sentinel uses the `text` branch of the union so it is a valid
+    // AgentResponse<T> for ANY T (there is no native value yet — the executor
+    // hasn't run). Using `{ value: "" }` would wrongly assume T = string.
     let response = { text: "" };
     let duration;
     try {
         const start = performance.now();
         const input = scene.prompt;
         for (let t = 0; t < turns; t++) {
-            let timer;
-            response = await Promise.race([
-                executor(input).finally(() => clearTimeout(timer)),
-                new Promise((_, reject) => {
-                    timer = setTimeout(() => reject(new Error(`Scene timed out after ${timeoutMs}ms`)), timeoutMs);
-                }),
-            ]);
+            const controller = new AbortController();
+            const timer = setTimeout(() => controller.abort(), timeoutMs);
+            try {
+                response = await executor(input, { signal: controller.signal });
+            }
+            catch (err) {
+                if (err.name === "AbortError" || controller.signal.aborted) {
+                    throw new Error(`Scene timed out after ${timeoutMs}ms`);
+                }
+                throw err;
+            }
+            finally {
+                clearTimeout(timer);
+            }
             if (response.executionError)
                 break;
         }
@@ -63,7 +91,21 @@ async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig)
     let passed = true;
     let error;
     let judgement;
+    // Schema validation runs first — a structural failure is the headline. Skip
+    // refusals (which legitimately won't match the output shape) and empty values.
+    if (scene.schema && !response.refusal) {
+        const value = resolveValue(response);
+        if (value !== undefined) {
+            const outcome = await validateAgainstSchema(scene.schema, value);
+            if (!outcome.ok) {
+                passed = false;
+                error = `Schema validation failed — ${outcome.message}`;
+            }
+        }
+    }
     for (const assertion of scene.assertions) {
+        if (!passed)
+            break;
         try {
             const value = extractField(response, assertion.field);
             assertion.fn(value);
@@ -84,7 +126,9 @@ async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig)
             const judgeExecutor = resolveJudgeExecutor(judgeConfig);
             for (const p of pending) {
                 try {
-                    const result = await callJudge(String(p.value), p.criteria, judgeExecutor);
+                    // Hand the judge the serialized text view — NOT String(value),
+                    // which would render a structured value as "[object Object]".
+                    const result = await callJudge(serializeValue(p.value), p.criteria, judgeExecutor);
                     judgement = result;
                     if (result.verdict === "fail" || result.verdict === "partial") {
                         passed = false;
@@ -109,6 +153,9 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
     // Single run — original fast path
     if (numRuns <= 1) {
         const run = await executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig);
+        const tokens = run.response.metadata?.tokens;
+        const cost = run.response.metadata?.cost;
+        const events = run.response.metadata?.events;
         return {
             prompt: scene.prompt,
             response: run.response,
@@ -117,6 +164,10 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
             error: run.error,
             judgement: run.judgement,
             suite: scene.suite,
+            tokens: tokens ? { input: tokens.input, output: tokens.output } : undefined,
+            costUsd: cost?.totalUsd,
+            costSource: cost?.source,
+            events: events && events.length ? events : undefined,
         };
     }
     // Multiple runs — execute N times and aggregate
@@ -136,6 +187,37 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
     const error = overallPassed
         ? undefined
         : failedRuns[0]?.error ?? "Majority of runs failed";
+    // Aggregate tokens, cost, events across runs
+    let inputTokens = 0;
+    let outputTokens = 0;
+    let hasTokens = false;
+    let costTotal = 0;
+    let hasCost = false;
+    let costSource;
+    const allEvents = [];
+    runs.forEach((r, runIndex) => {
+        const meta = r.response.metadata;
+        if (meta?.tokens) {
+            hasTokens = true;
+            inputTokens += meta.tokens.input;
+            outputTokens += meta.tokens.output;
+        }
+        if (meta?.cost?.totalUsd != null) {
+            hasCost = true;
+            costTotal += meta.cost.totalUsd;
+            // Promote weakest source: provider > table > unavailable
+            if (costSource !== "table")
+                costSource = meta.cost.source;
+            if (meta.cost.source === "table" && costSource !== "table") {
+                costSource = "table";
+            }
+        }
+        if (meta?.events?.length) {
+            for (const e of meta.events) {
+                allEvents.push({ ...e, runIndex });
+            }
+        }
+    });
     return {
         prompt: scene.prompt,
         response: lastRun.response,
@@ -147,5 +229,9 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
         runs,
         passRate,
         statisticalSignificance,
+        tokens: hasTokens ? { input: inputTokens, output: outputTokens } : undefined,
+        costUsd: hasCost ? costTotal : undefined,
+        costSource,
+        events: allEvents.length ? allEvents : undefined,
     };
 }

package/dist/schema.d.ts ADDED Viewed

@@ -0,0 +1,63 @@
+/**
+ * Schema validation built on the Standard Schema v1 spec
+ * (https://standardschema.dev). Agest never imports a schema library — it talks
+ * to whatever the consumer brings (zod 4, valibot, arktype, …) through the
+ * `~standard` contract every compliant library exposes. zod is the documented,
+ * blessed choice but is not a runtime or peer dependency.
+ */
+/** The minimal Standard Schema v1 interface, vendored from the spec. */
+export interface StandardSchemaV1<Input = unknown, Output = Input> {
+    readonly "~standard": StandardSchemaV1.Props<Input, Output>;
+}
+export declare namespace StandardSchemaV1 {
+    interface Props<Input = unknown, Output = Input> {
+        readonly version: 1;
+        readonly vendor: string;
+        readonly validate: (value: unknown) => Result<Output> | Promise<Result<Output>>;
+        readonly types?: Types<Input, Output>;
+    }
+    type Result<Output> = SuccessResult<Output> | FailureResult;
+    interface SuccessResult<Output> {
+        readonly value: Output;
+        readonly issues?: undefined;
+    }
+    interface FailureResult {
+        readonly issues: ReadonlyArray<Issue>;
+    }
+    interface Issue {
+        readonly message: string;
+        readonly path?: ReadonlyArray<PropertyKey | PathSegment>;
+    }
+    interface PathSegment {
+        readonly key: PropertyKey;
+    }
+    interface Types<Input = unknown, Output = Input> {
+        readonly input: Input;
+        readonly output: Output;
+    }
+}
+/** The inferred output type of a Standard Schema (e.g. `z.infer<typeof S>`). */
+export type InferOutput<S extends StandardSchemaV1> = NonNullable<S["~standard"]["types"]>["output"];
+/** Structural duck-type check so any Standard-Schema library is accepted. */
+export declare function isStandardSchema(value: unknown): value is StandardSchemaV1;
+/** Render Standard Schema failure issues into a readable multi-line message. */
+export declare function formatIssues(issues: ReadonlyArray<StandardSchemaV1.Issue>): string;
+export type ValidationOutcome = {
+    ok: true;
+} | {
+    ok: false;
+    message: string;
+};
+/**
+ * Validate a value against a schema, awaiting the result. Supports both
+ * synchronous and asynchronous (`refine`-style) schemas — used by the runner,
+ * which is already async.
+ */
+export declare function validateAgainstSchema(schema: StandardSchemaV1, value: unknown): Promise<ValidationOutcome>;
+/**
+ * Synchronous validation for the `matchingSchema` matcher (matchers run inside
+ * a sync assertion callback). Throws a directive error if the schema needs to
+ * resolve asynchronously — declare such schemas at the agent/scene level, where
+ * validation is awaited.
+ */
+export declare function validateSync(schema: StandardSchemaV1, value: unknown): ValidationOutcome;

package/dist/schema.js ADDED Viewed

@@ -0,0 +1,61 @@
+/**
+ * Schema validation built on the Standard Schema v1 spec
+ * (https://standardschema.dev). Agest never imports a schema library — it talks
+ * to whatever the consumer brings (zod 4, valibot, arktype, …) through the
+ * `~standard` contract every compliant library exposes. zod is the documented,
+ * blessed choice but is not a runtime or peer dependency.
+ */
/**
 * Structural duck-type check so any Standard-Schema library is accepted.
 *
 * A candidate qualifies when it is a non-null object OR a function that
 * carries a `~standard` property whose `validate` member is a function.
 * Functions are accepted because a schema may itself be callable
 * (NOTE(review): arktype types are understood to be invocable functions —
 * confirm against the library); rejecting them on `typeof` alone would
 * exclude otherwise compliant schemas.
 *
 * @param {unknown} value - Candidate to inspect.
 * @returns {boolean} `true` when `value` exposes a callable
 *   `~standard.validate`, `false` otherwise (including for primitives,
 *   `null`, and `undefined`).
 */
export function isStandardSchema(value) {
    const kind = typeof value;
    // The `in` operator throws on primitives and null, so filter those first.
    if (value === null || (kind !== "object" && kind !== "function")) {
        return false;
    }
    return "~standard" in value && typeof value["~standard"]?.validate === "function";
}
/**
 * Report whether `value` is a non-null object exposing a callable `then`.
 * Function values are not treated as thenables by this check.
 *
 * @param {unknown} value - Result to inspect.
 * @returns {boolean} `true` for promise-like objects, `false` otherwise.
 */
function isThenable(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    return typeof value.then === "function";
}
/**
 * Normalise one issue path segment (`PropertyKey | { key }`) to a string.
 *
 * Object segments are rendered from their `key` member; everything else is
 * stringified directly. `null` is guarded explicitly because
 * `typeof null === "object"`: without the guard a malformed segment would
 * throw while a validation failure is being formatted, hiding the real
 * failure behind a TypeError.
 *
 * @param {unknown} seg - A path segment taken from an issue's `path`.
 * @returns {string} The segment rendered as text.
 */
function renderSegment(seg) {
    if (seg !== null && typeof seg === "object") {
        return String(seg.key);
    }
    return String(seg);
}
/**
 * Render Standard Schema failure issues into a readable multi-line message.
 *
 * The first line is a count header ("1 issue:" / "N issues:"); each issue then
 * gets one bullet line. An issue whose path renders to a non-empty dotted
 * string is prefixed with that path; otherwise only the message is shown.
 *
 * @param {ReadonlyArray<{ message: string, path?: ReadonlyArray<unknown> }>} issues
 *   Issues reported by a failed validation.
 * @returns {string} The header followed by one bullet per issue.
 */
export function formatIssues(issues) {
    const bullets = issues.map(({ path, message }) => {
        const location = path?.map(renderSegment).join(".");
        if (location) {
            return `  • ${location}: ${message}`;
        }
        return `  • ${message}`;
    });
    const total = issues.length;
    const noun = total === 1 ? "issue" : "issues";
    return `${total} ${noun}:\n${bullets.join("\n")}`;
}
/**
 * Validate `value` against `schema` and await the verdict, so schemas whose
 * `validate` returns a promise work as well as synchronous ones.
 *
 * @param {object} schema - Object exposing `~standard.validate`.
 * @param {unknown} value - The value to validate.
 * @returns {Promise<{ ok: true } | { ok: false, message: string }>}
 *   `{ ok: true }` when the result carries no issues, otherwise
 *   `{ ok: false }` with the issues formatted into `message`.
 */
export async function validateAgainstSchema(schema, value) {
    const { issues } = await schema["~standard"].validate(value);
    return issues
        ? { ok: false, message: formatIssues(issues) }
        : { ok: true };
}
/**
 * Synchronous counterpart of schema validation, for callers that cannot
 * await (the error text names the `matchingSchema()` matcher as that caller).
 *
 * @param {object} schema - Object exposing `~standard.validate`.
 * @param {unknown} value - The value to validate.
 * @returns {{ ok: true } | { ok: false, message: string }} The outcome, with
 *   issues formatted into `message` on failure.
 * @throws {Error} When `validate` hands back a thenable, i.e. the schema can
 *   only be resolved asynchronously.
 */
export function validateSync(schema, value) {
    const outcome = schema["~standard"].validate(value);
    if (isThenable(outcome)) {
        const guidance = "matchingSchema() cannot validate an async schema. Declare the schema at " +
            "the agent() or scene().expectSchema() level, where validation is awaited.";
        throw new Error(guidance);
    }
    const { issues } = outcome;
    return issues
        ? { ok: false, message: formatIssues(issues) }
        : { ok: true };
}

package/dist/types.d.ts CHANGED Viewed

@@ -1,6 +1,63 @@
-export type AgentExecutor = (input: string) => Promise<AgentResponse>;
-export interface AgentResponse {
+import type { StandardSchemaV1 } from "./schema";
+export interface ExecutorOptions {
+    signal?: AbortSignal;
+}
+export type AgentExecutor<T = string> = (input: string, options?: ExecutorOptions) => Promise<AgentResponse<T>>;
+export type CostSource = "provider" | "table" | "unavailable";
+export interface CostBreakdown {
+    inputUsd?: number;
+    outputUsd?: number;
+    totalUsd?: number;
+    source: CostSource;
+}
+export type TimelineEventKind = "model" | "tool";
+export interface TimelineEvent {
+    kind: TimelineEventKind;
+    name: string;
+    /** ms relative to the scene start */
+    startMs: number;
+    endMs: number;
+    durationMs: number;
+    tokens?: {
+        input: number;
+        output: number;
+    };
+    /** Prompt-cache-hit input tokens (subset of tokens.input), when reported by the provider */
+    cachedInputTokens?: number;
+    cost?: CostBreakdown;
+    /** Index of the run this event belongs to (only set when aggregating across multi-run scenes) */
+    runIndex?: number;
+    error?: string;
+}
+/**
+ * The result an executor hands back. AT LEAST ONE of `value` / `text` is
+ * required (both may be present); the rest are optional.
+ *
+ * `value` is the agent's NATIVE output and the source of truth for
+ * deterministic, structural assertions — a string for a chat agent, an object
+ * for a structured agent (a plan, a tool-call payload, parsed JSON). It is
+ * never coerced to a string before a matcher asks for text.
+ *
+ * `text` is a pre-serialized projection for the judge model and the text
+ * matchers (`containing`, `matchingPattern`, `refusal`). A string-producing
+ * agent can return ONLY `text` (the legacy/common case) — it is then also used
+ * as `value`. A structured agent returns `value` and, optionally, an enriched
+ * `text` when the judge needs a view the raw value can't give cheaply (e.g.
+ * resolving opaque ids to names). When `text` is omitted, agest serializes
+ * `value` lazily (string passthrough, else JSON). See `resolve.ts`.
+ *
+ * The generic defaults to `string`, so the common chat case stays
+ * `{ text: "..." }` or `{ value: "..." }` with no type ceremony.
+ */
+export type AgentResponse<T = string> = AgentResponseBase<T> & ({
+    value: T;
+} | {
     text: string;
+});
+interface AgentResponseBase<T = string> {
+    value?: T;
+    /** Pre-serialized view for the judge / text matchers. */
+    text?: string;
     refusal?: boolean;
     executionError?: string;
     metadata?: {
@@ -11,6 +68,8 @@ export interface AgentResponse {
         };
         tools?: string[];
         systemPrompt?: string;
+        events?: TimelineEvent[];
+        cost?: CostBreakdown;
         [key: string]: unknown;
     };
 }
@@ -25,6 +84,8 @@ export interface SceneDefinition {
     turns?: number;
     runs?: number;
     suite?: string;
+    /** Standard Schema validated against the native value before user assertions. */
+    schema?: StandardSchemaV1;
 }
 export type JudgeVerdict = "pass" | "fail" | "partial";
 export interface JudgeResult {
@@ -32,26 +93,36 @@ export interface JudgeResult {
     reasoning: string;
     criteria: string;
 }
-export interface RunResult {
+export interface RunResult<T = string> {
     passed: boolean;
     error?: string;
-    response: AgentResponse;
+    response: AgentResponse<T>;
     duration: number;
     judgement?: JudgeResult;
 }
-export interface SceneResult {
+export interface SceneResult<T = string> {
     prompt: string;
-    response: AgentResponse;
+    response: AgentResponse<T>;
     duration: number;
     passed: boolean;
     error?: string;
     judgement?: JudgeResult;
     suite?: string;
-    runs?: RunResult[];
+    runs?: RunResult<T>[];
     passRate?: number;
     statisticalSignificance?: number;
+    /** Aggregate tokens across all runs of this scene */
+    tokens?: {
+        input: number;
+        output: number;
+    };
+    /** Aggregate USD cost across all runs of this scene */
+    costUsd?: number;
+    costSource?: CostSource;
+    /** Ordered timeline events from every run of the scene */
+    events?: TimelineEvent[];
 }
-export interface AgentReport {
+export interface AgentReport<T = string> {
     name?: string;
     model?: string;
     systemPromptHash?: string;
@@ -66,5 +137,9 @@ export interface AgentReport {
     totalCases: number;
     averageInputTokensPerCase?: number;
     averageOutputTokensPerCase?: number;
-    results: SceneResult[];
+    totalInputTokens?: number;
+    totalOutputTokens?: number;
+    totalCostUsd?: number;
+    results: SceneResult<T>[];
 }
+export {};

package/dist/waterfall.d.ts ADDED Viewed

@@ -0,0 +1,11 @@
+import type { TimelineEvent } from "./types";
+/**
+ * Render a Chrome-DevTools-style waterfall of timeline events as colored
+ * terminal lines. Bars are positioned by `startMs` and sized by `durationMs`
+ * relative to the full span of the scene. Returns one string per event row
+ * (already indented), or `[]` when there's nothing to draw.
+ */
+export declare function renderTerminalWaterfall(events: TimelineEvent[], opts?: {
+    width?: number;
+    indent?: string;
+}): string[];

package/dist/waterfall.js ADDED Viewed

@@ -0,0 +1,46 @@
+import { c } from "./logger";
+const BLOCK = "█";
+const THIN = "▏";
/**
 * Shorten `s` to at most `n` UTF-16 code units, marking the cut with "…".
 *
 * Strings that already fit are returned unchanged. A non-positive limit
 * yields an empty string: there is no room even for the ellipsis, and
 * `slice(0, n - 1)` with a negative end would otherwise return text longer
 * than the limit.
 *
 * @param {string} s - Text to shorten.
 * @param {number} n - Maximum length of the result.
 * @returns {string} `s` itself, or its first `n - 1` units followed by "…".
 */
function truncate(s, n) {
    if (s.length <= n) {
        return s;
    }
    if (n <= 0) {
        return "";
    }
    return s.slice(0, n - 1) + "…";
}
/**
 * Format a USD amount for display, rounded to four decimals with trailing
 * zeros dropped (e.g. 1.5 → "$1.5").
 *
 * An exact zero renders as "$0". A positive amount too small to survive the
 * four-decimal rounding renders as "<$0.0001" rather than "$0", so a tiny but
 * real cost cannot be mistaken for no cost at all.
 *
 * @param {number} n - Amount in US dollars.
 * @returns {string} The formatted amount.
 */
function fmtUsd(n) {
    if (n === 0) {
        return "$0";
    }
    const rounded = Number(n.toFixed(4));
    if (rounded === 0 && n > 0) {
        return "<$0.0001";
    }
    return "$" + rounded.toString();
}
/**
 * Draw timeline events as a waterfall, one text row per event.
 *
 * Each row's bar starts at a column proportional to the event's `startMs`
 * offset from the earliest event and is sized by `durationMs`, both scaled
 * against the span between the earliest start and the latest end. A
 * zero-duration event is drawn as a single thin marker. Rows also show the
 * event kind, its name (truncated to 16 characters), the rounded duration and,
 * when present, cost, cached-token count and error text.
 *
 * Styling is delegated to `c` from "./logger" — presumably terminal colour
 * wrappers; confirm against that module.
 *
 * @param {Array<object>} events - Timeline events to draw.
 * @param {{ width?: number, indent?: string }} [opts] - Bar width in columns
 *   (default 28) and a prefix for every row (default "").
 * @returns {string[]} One row per event, or `[]` when there is nothing to draw.
 */
export function renderTerminalWaterfall(events, opts = {}) {
    if (!events || events.length === 0) {
        return [];
    }
    const nameColumns = 16;
    const width = opts.width ?? 28;
    const indent = opts.indent ?? "";
    const earliest = Math.min(...events.map(({ startMs }) => startMs));
    const latest = Math.max(...events.map(({ endMs }) => endMs));
    // Clamp to 1ms so the scaling below never divides by zero.
    const span = Math.max(1, latest - earliest);
    const toColumns = (ms) => Math.round((ms / span) * width);
    return events.map((event) => {
        // Keep the bar's first cell inside the drawable area.
        const offset = Math.min(width - 1, toColumns(event.startMs - earliest));
        let glyphs;
        if (event.durationMs === 0) {
            glyphs = THIN;
        }
        else {
            const wanted = Math.max(1, toColumns(event.durationMs));
            // Never let the bar run past the right edge.
            glyphs = BLOCK.repeat(Math.min(wanted, width - offset));
        }
        const cells = new Array(width).fill(" ");
        for (let i = 0; i < glyphs.length; i++) {
            const column = offset + i;
            if (column >= width) {
                break;
            }
            cells[column] = glyphs[i];
        }
        // Errors take precedence over the per-kind colour.
        let paint;
        if (event.error) {
            paint = c.red;
        }
        else if (event.kind === "model") {
            paint = c.cyan;
        }
        else {
            paint = c.yellow;
        }
        const bar = paint(cells.join(""));
        const kindLabel = (event.kind === "model" ? "model" : "tool ").padEnd(5);
        const nameLabel = truncate(event.name, nameColumns).padEnd(nameColumns);
        const durationLabel = `${Math.round(event.durationMs)}ms`.padStart(7);
        let costLabel = "";
        if (event.cost?.totalUsd != null) {
            costLabel = `  ${fmtUsd(event.cost.totalUsd)}`;
        }
        let cachedLabel = "";
        if (event.cachedInputTokens) {
            cachedLabel = `  ${c.dim(`(${event.cachedInputTokens} cached)`)}`;
        }
        let errorLabel = "";
        if (event.error) {
            errorLabel = `  ${c.red("✗ " + truncate(event.error, 40))}`;
        }
        const columns = [c.dim(kindLabel), nameLabel, bar, c.dim(durationLabel)];
        return `${indent}${columns.join(" ")}${c.dim(costLabel)}${cachedLabel}${errorLabel}`;
    });
}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sebastiantuyu/agest",
-  "version": "0.3.2",
+  "version": "0.3.3-next.10",
   "description": "A testing library for agents",
   "repository": {
     "type": "git",
@@ -26,7 +26,7 @@
     }
   },
   "scripts": {
-    "build": "tsc -p tsconfig.build.json",
+    "build": "tsc -p tsconfig.build.json && mkdir -p dist/pricing && cp src/pricing/models.json dist/pricing/models.json",
     "test": "vitest run",
     "test:watch": "vitest",
     "test:coverage": "vitest run --coverage",
@@ -37,25 +37,34 @@
     "site:preview": "npx serve site -p 3000",
     "release:patch": "npm version patch && git push && git push --tags",
     "release:minor": "npm version minor && git push && git push --tags",
-    "release:major": "npm version major && git push && git push --tags"
+    "release:major": "npm version major && git push && git push --tags",
+    "release:next": "npm version prerelease --preid=next && git push && git push --tags"
   },
   "engines": {
     "node": ">=22.0.0"
   },
   "devDependencies": {
-    "@langchain/core": "^1.1.39",
-    "@langchain/langgraph": "^1.2.8",
-    "@langchain/openai": "^1.4.4",
-    "@types/node": "^22.0.0",
-    "@vitest/coverage-v8": "^3",
-    "dotenv": "^17.4.1",
-    "langchain": "^1.3.1",
-    "tsx": "^4.21.0",
-    "typescript": "^5.4.0",
-    "vitest": "^3",
-    "zod": "^4.3.6"
+    "@langchain/core": "1.1.39",
+    "@langchain/langgraph": "1.2.8",
+    "@langchain/openai": "1.4.4",
+    "@types/node": "22.19.17",
+    "@vitest/coverage-v8": "3.2.4",
+    "dotenv": "17.4.1",
+    "langchain": "1.3.1",
+    "tsx": "4.21.0",
+    "typescript": "5.9.3",
+    "vitest": "3.2.4",
+    "zod": "4.3.6"
   },
   "dependencies": {
-    "@supercharge/promise-pool": "^3.3.0"
+    "@supercharge/promise-pool": "3.3.0"
+  },
+  "peerDependencies": {
+    "@langchain/core": ">=0.3.0 <2.0.0"
+  },
+  "peerDependenciesMeta": {
+    "@langchain/core": {
+      "optional": true
+    }
   }
 }