npm - @wix/eval-assertions - Versions diffs - 0.1.0 - Mend

@wix/eval-assertions 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/README.md +215 -0
package/build/index.js +543 -0
package/build/index.js.map +7 -0
package/build/index.mjs +495 -0
package/build/index.mjs.map +7 -0
package/build/types/evaluators/assertion-evaluator.d.ts +44 -0
package/build/types/evaluators/build-passed-evaluator.d.ts +13 -0
package/build/types/evaluators/index.d.ts +32 -0
package/build/types/evaluators/llm-judge-evaluator.d.ts +24 -0
package/build/types/evaluators/skill-was-called-evaluator.d.ts +11 -0
package/build/types/index.d.ts +8 -0
package/build/types/types/assertions.d.ts +58 -0
package/build/types/types/index.d.ts +4 -0
package/build/types/types/input.d.ts +20 -0
package/build/types/types/result.d.ts +47 -0
package/build/types/types/trace.d.ts +132 -0
package/package.json +64 -0

package/build/types/evaluators/llm-judge-evaluator.d.ts ADDED Viewed

@@ -0,0 +1,24 @@
+import type { LlmJudgeAssertion, AssertionResult, LLMTrace, EvaluationInput } from "../types/index.js";
+import type { AssertionContext } from "./assertion-evaluator.js";
+import { AssertionEvaluator } from "./assertion-evaluator.js";
+export interface JudgeResult { // result shape returned by validateJudgeResult
+    text: string; // presumably the judge's free-text answer — TODO confirm against implementation
+    score: number; // judge score; the llm_judge docs describe a 0-100 range compared against minScore
+    scoreReasoning: string; // presumably the judge's explanation for the score — TODO confirm
+}
+/**
+ * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview). `llmTrace` may be `undefined`; the text produced in that case is not visible from this declaration.
+ */
+export declare function formatTraceForJudge(llmTrace: LLMTrace | undefined): string;
+export declare function replacePlaceholders(str: string, output: string, cwd: string, changedFiles: string, trace: string): string; // presumably substitutes the {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} placeholders in `str` — TODO confirm against implementation
+export declare function validateJudgeResult(parsed: unknown): JudgeResult; // takes an unknown value and returns a JudgeResult; behavior on invalid input (presumably a throw) is not visible here — TODO confirm
+/**
+ * Evaluator for "llm_judge" assertion: an LLM judges the scenario output
+ * (prompt with {{output}}, {{cwd}}, {{changedFiles}}, {{trace}}) and returns a score 0-100.
+ * Passes if score >= minScore. `evaluate` is asynchronous and resolves to an AssertionResult; `context` is optional.
+ */
+export declare class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {
+    readonly type: "llm_judge";
+    evaluate(assertion: LlmJudgeAssertion, input: EvaluationInput, context?: AssertionContext): Promise<AssertionResult>;
+    private callGenerateText; // private member: its signature is omitted from the emitted declaration
+}

package/build/types/evaluators/skill-was-called-evaluator.d.ts ADDED Viewed

@@ -0,0 +1,11 @@
+import type { SkillWasCalledAssertion, AssertionResult, EvaluationInput } from "../types/index.js";
+import type { AssertionContext } from "./assertion-evaluator.js";
+import { AssertionEvaluator } from "./assertion-evaluator.js";
+/**
+ * Evaluator for "skill_was_called" assertion: the LLM trace must contain a step
+ * where the "Skill" tool was used with the expected skill (by name). `evaluate` is synchronous: it returns an AssertionResult directly, not a Promise.
+ */
+export declare class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {
+    readonly type: "skill_was_called";
+    evaluate(assertion: SkillWasCalledAssertion, input: EvaluationInput, _context?: AssertionContext): AssertionResult; // `_context` is presumably unused (underscore prefix) — confirm against implementation
+}

package/build/types/index.d.ts ADDED Viewed

@@ -0,0 +1,8 @@
+/**
+ * @wix/eval-assertions
+ *
+ * Assertion framework for AI agent evaluations.
+ * Supports skill invocation checks, build validation, and LLM-based judging.
+ */
+export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type LlmJudgeAssertion, LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, AssertionResultSchema, AssertionResultStatus, type AssertionResult, type EvaluationInput, type FileDiff, } from "./types/index.js";
+export { evaluateAssertions, registerEvaluator, getEvaluator, AssertionEvaluator, SkillWasCalledEvaluator, BuildPassedEvaluator, LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, validateJudgeResult, type AssertionContext, type LlmConfig, type LlmJudgeGenerateTextOptions, type JudgeResult, } from "./evaluators/index.js";

package/build/types/types/assertions.d.ts ADDED Viewed

@@ -0,0 +1,58 @@
+import { z } from "zod";
+/**
+ * Assertion: the agent must have invoked a specific skill during the run.
+ * Checked by inspecting the LLM trace for a "Skill" tool use with the given skill.
+ * Data: skillName (the skill that must have been called).
+ */
+export declare const SkillWasCalledAssertionSchema: z.ZodObject<{
+    type: z.ZodLiteral<"skill_was_called">;
+    skillName: z.ZodString;
+}, z.core.$strip>;
+export type SkillWasCalledAssertion = z.infer<typeof SkillWasCalledAssertionSchema>;
+/**
+ * Assertion: a build command must exit with the expected code (default 0).
+ * Runs the command in the scenario working directory. Both `command` and `expectedExitCode` are optional in the schema.
+ */
+export declare const BuildPassedAssertionSchema: z.ZodObject<{
+    type: z.ZodLiteral<"build_passed">;
+    command: z.ZodOptional<z.ZodString>;
+    expectedExitCode: z.ZodOptional<z.ZodNumber>;
+}, z.core.$strip>;
+export type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;
+/**
+ * Assertion: an LLM judges the scenario output (score 0-100).
+ * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{trace}}.
+ * Passes if judge score >= minScore. Only `type` and `prompt` are required; every other field is optional.
+ */
+export declare const LlmJudgeAssertionSchema: z.ZodObject<{
+    type: z.ZodLiteral<"llm_judge">;
+    prompt: z.ZodString;
+    systemPrompt: z.ZodOptional<z.ZodString>;
+    minScore: z.ZodOptional<z.ZodNumber>;
+    model: z.ZodOptional<z.ZodString>;
+    maxTokens: z.ZodOptional<z.ZodNumber>;
+    temperature: z.ZodOptional<z.ZodNumber>;
+}, z.core.$strip>;
+export type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;
+/**
+ * Union of all assertion types: "skill_was_called", "build_passed" and "llm_judge".
+ * Each assertion has a type and type-specific data.
+ * Uses z.union (not z.discriminatedUnion) for Zod v4 compatibility when used as an array element.
+ */
+export declare const AssertionSchema: z.ZodUnion<readonly [z.ZodObject<{
+    type: z.ZodLiteral<"skill_was_called">;
+    skillName: z.ZodString;
+}, z.core.$strip>, z.ZodObject<{
+    type: z.ZodLiteral<"build_passed">;
+    command: z.ZodOptional<z.ZodString>;
+    expectedExitCode: z.ZodOptional<z.ZodNumber>;
+}, z.core.$strip>, z.ZodObject<{
+    type: z.ZodLiteral<"llm_judge">;
+    prompt: z.ZodString;
+    systemPrompt: z.ZodOptional<z.ZodString>;
+    minScore: z.ZodOptional<z.ZodNumber>;
+    model: z.ZodOptional<z.ZodString>;
+    maxTokens: z.ZodOptional<z.ZodNumber>;
+    temperature: z.ZodOptional<z.ZodNumber>;
+}, z.core.$strip>]>;
+export type Assertion = z.infer<typeof AssertionSchema>;

package/build/types/types/index.d.ts ADDED Viewed

@@ -0,0 +1,4 @@
+export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type LlmJudgeAssertion, } from "./assertions.js";
+export { LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, } from "./trace.js";
+export { AssertionResultSchema, AssertionResultStatus, type AssertionResult, } from "./result.js";
+export { type EvaluationInput, type FileDiff } from "./input.js";

package/build/types/types/input.d.ts ADDED Viewed

@@ -0,0 +1,20 @@
+import type { LLMTrace } from "./trace.js";
+/**
+ * File diff information for evaluation context: a required file `path` plus optional `content`.
+ */
+export interface FileDiff {
+    path: string;
+    content?: string;
+}
+/**
+ * Input data for assertion evaluation. Every field is optional.
+ * This is a generic interface that can be adapted from any evaluation system.
+ */
+export interface EvaluationInput {
+    /** The agent's final output text */
+    outputText?: string;
+    /** LLM trace containing tool calls and completions */
+    llmTrace?: LLMTrace;
+    /** List of files that were modified during the evaluation */
+    fileDiffs?: FileDiff[];
+}

package/build/types/types/result.d.ts ADDED Viewed

@@ -0,0 +1,47 @@
+import { z } from "zod";
+/**
+ * Assertion result status enum (string-valued: "passed", "failed", "skipped", "error").
+ */
+export declare enum AssertionResultStatus {
+    PASSED = "passed",
+    FAILED = "failed",
+    SKIPPED = "skipped",
+    ERROR = "error"
+}
+/**
+ * Assertion result schema. Required: id, assertionId, assertionType, assertionName, status (an AssertionResultStatus value); all other fields are optional. Elements of `llmTraceSteps` declare the same fields as LLMTraceStepSchema in ./trace.js.
+ */
+export declare const AssertionResultSchema: z.ZodObject<{
+    id: z.ZodString;
+    assertionId: z.ZodString;
+    assertionType: z.ZodString;
+    assertionName: z.ZodString;
+    status: z.ZodEnum<typeof AssertionResultStatus>;
+    message: z.ZodOptional<z.ZodString>;
+    expected: z.ZodOptional<z.ZodString>;
+    actual: z.ZodOptional<z.ZodString>;
+    duration: z.ZodOptional<z.ZodNumber>;
+    details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
+    llmTraceSteps: z.ZodOptional<z.ZodArray<z.ZodObject<{
+        id: z.ZodString;
+        stepNumber: z.ZodNumber;
+        type: z.ZodEnum<typeof import("./trace.js").LLMStepType>;
+        model: z.ZodString;
+        provider: z.ZodString;
+        startedAt: z.ZodString;
+        durationMs: z.ZodNumber;
+        tokenUsage: z.ZodObject<{
+            prompt: z.ZodNumber;
+            completion: z.ZodNumber;
+            total: z.ZodNumber;
+        }, z.core.$strip>;
+        costUsd: z.ZodNumber;
+        toolName: z.ZodOptional<z.ZodString>;
+        toolArguments: z.ZodOptional<z.ZodString>;
+        inputPreview: z.ZodOptional<z.ZodString>;
+        outputPreview: z.ZodOptional<z.ZodString>;
+        success: z.ZodBoolean;
+        error: z.ZodOptional<z.ZodString>;
+    }, z.core.$strip>>>;
+}, z.core.$strip>;
+export type AssertionResult = z.infer<typeof AssertionResultSchema>;

package/build/types/types/trace.d.ts ADDED Viewed

@@ -0,0 +1,132 @@
+import { z } from "zod";
+/**
+ * Token usage schema: required numeric `prompt`, `completion` and `total` fields.
+ */
+export declare const TokenUsageSchema: z.ZodObject<{
+    prompt: z.ZodNumber;
+    completion: z.ZodNumber;
+    total: z.ZodNumber;
+}, z.core.$strip>;
+export type TokenUsage = z.infer<typeof TokenUsageSchema>;
+/**
+ * LLM step type enum (string-valued: "completion", "tool_use", "tool_result", "thinking").
+ */
+export declare enum LLMStepType {
+    COMPLETION = "completion",
+    TOOL_USE = "tool_use",
+    TOOL_RESULT = "tool_result",
+    THINKING = "thinking"
+}
+/**
+ * LLM trace step schema. Optional: toolName, toolArguments, inputPreview, outputPreview, error; every other field is required. Note that `startedAt` and `toolArguments` are typed as plain strings, and `type` is an LLMStepType value.
+ */
+export declare const LLMTraceStepSchema: z.ZodObject<{
+    id: z.ZodString;
+    stepNumber: z.ZodNumber;
+    type: z.ZodEnum<typeof LLMStepType>;
+    model: z.ZodString;
+    provider: z.ZodString;
+    startedAt: z.ZodString;
+    durationMs: z.ZodNumber;
+    tokenUsage: z.ZodObject<{
+        prompt: z.ZodNumber;
+        completion: z.ZodNumber;
+        total: z.ZodNumber;
+    }, z.core.$strip>;
+    costUsd: z.ZodNumber;
+    toolName: z.ZodOptional<z.ZodString>;
+    toolArguments: z.ZodOptional<z.ZodString>;
+    inputPreview: z.ZodOptional<z.ZodString>;
+    outputPreview: z.ZodOptional<z.ZodString>;
+    success: z.ZodBoolean;
+    error: z.ZodOptional<z.ZodString>;
+}, z.core.$strip>;
+export type LLMTraceStep = z.infer<typeof LLMTraceStepSchema>;
+/**
+ * LLM breakdown stats schema: required numeric `count`, `durationMs`, `tokens` and `costUsd` fields.
+ */
+export declare const LLMBreakdownStatsSchema: z.ZodObject<{
+    count: z.ZodNumber;
+    durationMs: z.ZodNumber;
+    tokens: z.ZodNumber;
+    costUsd: z.ZodNumber;
+}, z.core.$strip>;
+export type LLMBreakdownStats = z.infer<typeof LLMBreakdownStatsSchema>;
+/**
+ * LLM trace summary schema. `stepTypeBreakdown` (optional) and `modelBreakdown` (required) are string-keyed records whose values have the same fields as LLMBreakdownStatsSchema; `totalTokens` has the same fields as TokenUsageSchema.
+ */
+export declare const LLMTraceSummarySchema: z.ZodObject<{
+    totalSteps: z.ZodNumber;
+    totalDurationMs: z.ZodNumber;
+    totalTokens: z.ZodObject<{
+        prompt: z.ZodNumber;
+        completion: z.ZodNumber;
+        total: z.ZodNumber;
+    }, z.core.$strip>;
+    totalCostUsd: z.ZodNumber;
+    stepTypeBreakdown: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodObject<{
+        count: z.ZodNumber;
+        durationMs: z.ZodNumber;
+        tokens: z.ZodNumber;
+        costUsd: z.ZodNumber;
+    }, z.core.$strip>>>;
+    modelBreakdown: z.ZodRecord<z.ZodString, z.ZodObject<{
+        count: z.ZodNumber;
+        durationMs: z.ZodNumber;
+        tokens: z.ZodNumber;
+        costUsd: z.ZodNumber;
+    }, z.core.$strip>>;
+    modelsUsed: z.ZodArray<z.ZodString>;
+}, z.core.$strip>;
+export type LLMTraceSummary = z.infer<typeof LLMTraceSummarySchema>;
+/**
+ * LLM trace schema: an `id`, an array of `steps` (same fields as LLMTraceStepSchema) and a `summary` (same fields as LLMTraceSummarySchema). All three are required.
+ */
+export declare const LLMTraceSchema: z.ZodObject<{
+    id: z.ZodString;
+    steps: z.ZodArray<z.ZodObject<{
+        id: z.ZodString;
+        stepNumber: z.ZodNumber;
+        type: z.ZodEnum<typeof LLMStepType>;
+        model: z.ZodString;
+        provider: z.ZodString;
+        startedAt: z.ZodString;
+        durationMs: z.ZodNumber;
+        tokenUsage: z.ZodObject<{
+            prompt: z.ZodNumber;
+            completion: z.ZodNumber;
+            total: z.ZodNumber;
+        }, z.core.$strip>;
+        costUsd: z.ZodNumber;
+        toolName: z.ZodOptional<z.ZodString>;
+        toolArguments: z.ZodOptional<z.ZodString>;
+        inputPreview: z.ZodOptional<z.ZodString>;
+        outputPreview: z.ZodOptional<z.ZodString>;
+        success: z.ZodBoolean;
+        error: z.ZodOptional<z.ZodString>;
+    }, z.core.$strip>>;
+    summary: z.ZodObject<{
+        totalSteps: z.ZodNumber;
+        totalDurationMs: z.ZodNumber;
+        totalTokens: z.ZodObject<{
+            prompt: z.ZodNumber;
+            completion: z.ZodNumber;
+            total: z.ZodNumber;
+        }, z.core.$strip>;
+        totalCostUsd: z.ZodNumber;
+        stepTypeBreakdown: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodObject<{
+            count: z.ZodNumber;
+            durationMs: z.ZodNumber;
+            tokens: z.ZodNumber;
+            costUsd: z.ZodNumber;
+        }, z.core.$strip>>>;
+        modelBreakdown: z.ZodRecord<z.ZodString, z.ZodObject<{
+            count: z.ZodNumber;
+            durationMs: z.ZodNumber;
+            tokens: z.ZodNumber;
+            costUsd: z.ZodNumber;
+        }, z.core.$strip>>;
+        modelsUsed: z.ZodArray<z.ZodString>;
+    }, z.core.$strip>;
+}, z.core.$strip>;
+export type LLMTrace = z.infer<typeof LLMTraceSchema>;

package/package.json ADDED Viewed

@@ -0,0 +1,64 @@
+{
+  "name": "@wix/eval-assertions",
+  "version": "0.1.0",
+  "description": "Assertion framework for AI agent evaluations - supports skill invocation checks, build validation, and LLM-based judging",
+  "files": [
+    "build"
+  ],
+  "scripts": {
+    "clean": "rm -rf build",
+    "build:cjs": "esbuild src/index.ts --bundle --platform=node --outfile=build/index.js --format=cjs --sourcemap --packages=external",
+    "build:esm": "esbuild src/index.ts --bundle --platform=node --outfile=build/index.mjs --format=esm --sourcemap --packages=external",
+    "build:types": "tsc --emitDeclarationOnly --outDir ./build/types",
+    "build": "yarn run clean && yarn run build:cjs && yarn run build:esm && yarn run build:types",
+    "lint": "eslint .",
+    "typecheck": "tsc --noEmit",
+    "test": "node --import tsx --test tests/**/*.test.ts"
+  },
+  "dependencies": {
+    "@ai-sdk/anthropic": "^3.0.2",
+    "ai": "^6.0.6",
+    "zod": "^4.3.5"
+  },
+  "devDependencies": {
+    "@eslint/js": "^9.39.2",
+    "@types/node": "^22.19.3",
+    "esbuild": "^0.27.2",
+    "eslint": "^9.39.2",
+    "eslint-config-prettier": "^10.1.8",
+    "eslint-plugin-prettier": "^5.5.4",
+    "prettier": "^3.7.4",
+    "tsx": "^4.21.0",
+    "typescript": "^5.9.3",
+    "typescript-eslint": "^8.51.0"
+  },
+  "exports": {
+    ".": {
+      "types": "./build/types/index.d.ts",
+      "import": "./build/index.mjs",
+      "require": "./build/index.js"
+    },
+    "./package.json": "./package.json"
+  },
+  "publishConfig": {
+    "registry": "https://registry.npmjs.org/",
+    "access": "public"
+  },
+  "wix": {
+    "artifact": {
+      "groupId": "com.wixpress",
+      "artifactId": "eval-assertions"
+    }
+  },
+  "keywords": [
+    "ai",
+    "evaluation",
+    "assertions",
+    "testing",
+    "llm",
+    "agent"
+  ],
+  "license": "MIT",
+  "author": "Wix",
+  "falconPackageHash": "20902c570369f86bc5193d4c20ccdd3a702068beb5ea509f142fb2f5"
+}