npm - @wix/eval-assertions - Versions diffs - 0.16.0 → 0.18.0 - Mend

@wix/eval-assertions 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/README.md +3 -2
package/build/index.js +181 -103
package/build/index.js.map +4 -4
package/build/index.mjs +172 -102
package/build/index.mjs.map +4 -4
package/build/types/evaluators/assertion-evaluator.d.ts +4 -17
package/build/types/evaluators/cost-evaluator.d.ts +10 -0
package/build/types/evaluators/index.d.ts +3 -2
package/build/types/evaluators/llm-judge-evaluator.d.ts +11 -0
package/build/types/index.d.ts +3 -2
package/build/types/tools/index.d.ts +1 -0
package/build/types/tools/read-file-tool.d.ts +10 -0
package/build/types/types/assertions.d.ts +12 -0
package/build/types/types/index.d.ts +1 -1
package/package.json +4 -3

package/build/types/evaluators/assertion-evaluator.d.ts CHANGED Viewed

@@ -1,13 +1,5 @@
 import type { Assertion, AssertionResult, EvaluationInput } from "../types/index.js";
-/**
- * Options passed to the LLM for llm_judge. Used by the optional stub for testing.
- */
-export interface LlmJudgeGenerateTextOptions {
-    prompt: string;
-    system: string;
-    maxOutputTokens: number;
-    temperature: number;
-}
+import type { LanguageModel } from "ai";
 /**
  * Configuration for LLM calls (used by llm_judge assertion).
  */
@@ -25,15 +17,10 @@ export interface AssertionContext {
     workDir?: string;
     /** LLM configuration (used by llm_judge) */
     llmConfig?: LlmConfig;
-    /** Default model for llm_judge when assertion.model is not set. Caller provides this. */
+    /** Default model for llm_judge when assertion.model is not set */
     defaultJudgeModel?: string;
-    /**
-     * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.
-     * Used only in tests to avoid hitting the API.
-     */
-    generateTextForLlmJudge?: (options: LlmJudgeGenerateTextOptions) => Promise<{
-        text: string;
-    }>;
+    /** Optional model override — when provided, used instead of creating from llmConfig + modelId */
+    model?: LanguageModel;
 }
 /**
  * Abstract base for assertion evaluators.

package/build/types/evaluators/cost-evaluator.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+import type { CostAssertion, AssertionResult, EvaluationInput } from "../types/index.js";
+import { AssertionEvaluator } from "./assertion-evaluator.js";
+/**
+ * Evaluator for "cost" assertion: checks that the scenario's LLM execution cost
+ * stays within a configured USD threshold by reading llmTrace.summary.totalCostUsd.
+ */
+export declare class CostEvaluator extends AssertionEvaluator<CostAssertion> {
+    readonly type: "cost";
+    evaluate(assertion: CostAssertion, input: EvaluationInput): AssertionResult;
+}

package/build/types/evaluators/index.d.ts CHANGED Viewed

@@ -26,8 +26,9 @@ export declare function getEvaluator(type: string): AssertionEvaluator | undefin
  */
 export declare function evaluateAssertions(input: EvaluationInput, assertions: Assertion[], context?: AssertionContext): Promise<AssertionResult[]>;
 export { AssertionEvaluator } from "./assertion-evaluator.js";
-export type { AssertionContext, LlmConfig, LlmJudgeGenerateTextOptions, } from "./assertion-evaluator.js";
+export type { AssertionContext, LlmConfig } from "./assertion-evaluator.js";
 export { SkillWasCalledEvaluator } from "./skill-was-called-evaluator.js";
 export { BuildPassedEvaluator } from "./build-passed-evaluator.js";
 export { TimeEvaluator } from "./time-evaluator.js";
-export { LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type JudgeResult, } from "./llm-judge-evaluator.js";
+export { CostEvaluator } from "./cost-evaluator.js";
+export { LlmJudgeEvaluator, JudgeResultSchema, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type JudgeResult, } from "./llm-judge-evaluator.js";

package/build/types/evaluators/llm-judge-evaluator.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import type { LlmJudgeAssertion, AssertionResult, LLMTrace, EvaluationInput } from "../types/index.js";
+import { z } from "zod";
 import type { AssertionContext } from "./assertion-evaluator.js";
 import { AssertionEvaluator } from "./assertion-evaluator.js";
 export interface JudgeResult {
@@ -6,6 +7,11 @@ export interface JudgeResult {
     score: number;
     scoreReasoning: string;
 }
+export declare const JudgeResultSchema: z.ZodObject<{
+    text: z.ZodString;
+    score: z.ZodNumber;
+    scoreReasoning: z.ZodString;
+}, z.core.$strip>;
 /**
  * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).
  */
@@ -36,5 +42,10 @@ export declare function validateJudgeResult(parsed: unknown): JudgeResult;
 export declare class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {
     readonly type: "llm_judge";
     evaluate(assertion: LlmJudgeAssertion, input: EvaluationInput, context?: AssertionContext): Promise<AssertionResult>;
+    /**
+     * Resolve the LanguageModel to use: context.model (injected mock/override)
+     * takes precedence, otherwise create from llmConfig + modelId.
+     */
+    private resolveModel;
     private callGenerateText;
 }

package/build/types/index.d.ts CHANGED Viewed

@@ -4,5 +4,6 @@
  * Assertion framework for AI agent evaluations.
  * Supports skill invocation checks, build validation, and LLM-based judging.
  */
-export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type LlmJudgeAssertion, LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, AssertionResultSchema, AssertionResultStatus, type AssertionResult, type EvaluationInput, type FileDiff, } from "./types/index.js";
-export { evaluateAssertions, registerEvaluator, getEvaluator, AssertionEvaluator, SkillWasCalledEvaluator, BuildPassedEvaluator, TimeEvaluator, LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type AssertionContext, type LlmConfig, type LlmJudgeGenerateTextOptions, type JudgeResult, } from "./evaluators/index.js";
+export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, CostAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type CostAssertion, type LlmJudgeAssertion, LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, AssertionResultSchema, AssertionResultStatus, type AssertionResult, type EvaluationInput, type FileDiff, } from "./types/index.js";
+export { evaluateAssertions, registerEvaluator, getEvaluator, AssertionEvaluator, SkillWasCalledEvaluator, BuildPassedEvaluator, TimeEvaluator, CostEvaluator, LlmJudgeEvaluator, JudgeResultSchema, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type AssertionContext, type LlmConfig, type JudgeResult, } from "./evaluators/index.js";
+export { createReadFileTool } from "./tools/index.js";

package/build/types/tools/index.d.ts ADDED Viewed

@@ -0,0 +1 @@
+export { createReadFileTool } from "./read-file-tool.js";

package/build/types/tools/read-file-tool.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+import { type Tool } from "ai";
+export type ReadFileResult = {
+    path: string;
+    content: string;
+} | {
+    error: string;
+};
+export declare function createReadFileTool(workDir: string): Tool<{
+    path: string;
+}, ReadFileResult>;

package/build/types/types/assertions.d.ts CHANGED Viewed

@@ -20,6 +20,15 @@ export declare const BuildPassedAssertionSchema: z.ZodObject<{
     expectedExitCode: z.ZodOptional<z.ZodNumber>;
 }, z.core.$strip>;
 export type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;
+/**
+ * Assertion: the scenario LLM execution cost must stay within a USD threshold.
+ * Checked by reading llmTrace.summary.totalCostUsd.
+ */
+export declare const CostAssertionSchema: z.ZodObject<{
+    type: z.ZodLiteral<"cost">;
+    maxCostUsd: z.ZodNumber;
+}, z.core.$strip>;
+export type CostAssertion = z.infer<typeof CostAssertionSchema>;
 /**
  * Assertion: an LLM judges the scenario output (score 0-100).
  * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.
@@ -59,6 +68,9 @@ export declare const AssertionSchema: z.ZodUnion<readonly [z.ZodObject<{
 }, z.core.$strip>, z.ZodObject<{
     type: z.ZodLiteral<"time_limit">;
     maxDurationMs: z.ZodNumber;
+}, z.core.$strip>, z.ZodObject<{
+    type: z.ZodLiteral<"cost">;
+    maxCostUsd: z.ZodNumber;
 }, z.core.$strip>, z.ZodObject<{
     type: z.ZodLiteral<"llm_judge">;
     prompt: z.ZodString;

package/build/types/types/index.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type LlmJudgeAssertion, } from "./assertions.js";
+export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, CostAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type CostAssertion, type LlmJudgeAssertion, } from "./assertions.js";
 export { LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, } from "./trace.js";
 export { AssertionResultSchema, AssertionResultStatus, type AssertionResult, } from "./result.js";
 export { type EvaluationInput, type FileDiff, type FileStatus, } from "./input.js";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@wix/eval-assertions",
-  "version": "0.16.0",
+  "version": "0.18.0",
   "description": "Assertion framework for AI agent evaluations - supports skill invocation checks, build validation, and LLM-based judging",
   "files": [
     "build"
@@ -13,7 +13,8 @@
     "build": "yarn run clean && yarn run build:cjs && yarn run build:esm && yarn run build:types",
     "lint": "eslint .",
     "typecheck": "tsc --noEmit",
-    "test": "node --import tsx --test tests/**/*.test.ts"
+    "test": "node --import tsx --test tests/**/*.test.ts",
+    "test:agent": "node --import tsx --test tests/**/*.agent-test.ts"
   },
   "dependencies": {
     "@ai-sdk/anthropic": "^3.0.2",
@@ -60,5 +61,5 @@
   ],
   "license": "MIT",
   "author": "Wix",
-  "falconPackageHash": "9d0a90b3fcf13f9ce2aa735a208d8e96d027956486330ac73dba07d1"
+  "falconPackageHash": "2862b952ba60dd90bbde91927c092b33ea729c4ddaf92ff2784d03c5"
 }