@wix/eval-assertions 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/build/index.js +181 -103
- package/build/index.js.map +4 -4
- package/build/index.mjs +172 -102
- package/build/index.mjs.map +4 -4
- package/build/types/evaluators/assertion-evaluator.d.ts +4 -17
- package/build/types/evaluators/cost-evaluator.d.ts +10 -0
- package/build/types/evaluators/index.d.ts +3 -2
- package/build/types/evaluators/llm-judge-evaluator.d.ts +11 -0
- package/build/types/index.d.ts +3 -2
- package/build/types/tools/index.d.ts +1 -0
- package/build/types/tools/read-file-tool.d.ts +10 -0
- package/build/types/types/assertions.d.ts +12 -0
- package/build/types/types/index.d.ts +1 -1
- package/package.json +4 -3
|
@@ -1,13 +1,5 @@
|
|
|
1
1
|
import type { Assertion, AssertionResult, EvaluationInput } from "../types/index.js";
|
|
2
|
-
|
|
3
|
-
* Options passed to the LLM for llm_judge. Used by the optional stub for testing.
|
|
4
|
-
*/
|
|
5
|
-
export interface LlmJudgeGenerateTextOptions {
|
|
6
|
-
prompt: string;
|
|
7
|
-
system: string;
|
|
8
|
-
maxOutputTokens: number;
|
|
9
|
-
temperature: number;
|
|
10
|
-
}
|
|
2
|
+
import type { LanguageModel } from "ai";
|
|
11
3
|
/**
|
|
12
4
|
* Configuration for LLM calls (used by llm_judge assertion).
|
|
13
5
|
*/
|
|
@@ -25,15 +17,10 @@ export interface AssertionContext {
|
|
|
25
17
|
workDir?: string;
|
|
26
18
|
/** LLM configuration (used by llm_judge) */
|
|
27
19
|
llmConfig?: LlmConfig;
|
|
28
|
-
/** Default model for llm_judge when assertion.model is not set
|
|
20
|
+
/** Default model for llm_judge when assertion.model is not set */
|
|
29
21
|
defaultJudgeModel?: string;
|
|
30
|
-
/**
|
|
31
|
-
|
|
32
|
-
* Used only in tests to avoid hitting the API.
|
|
33
|
-
*/
|
|
34
|
-
generateTextForLlmJudge?: (options: LlmJudgeGenerateTextOptions) => Promise<{
|
|
35
|
-
text: string;
|
|
36
|
-
}>;
|
|
22
|
+
/** Optional model override — when provided, used instead of creating from llmConfig + modelId */
|
|
23
|
+
model?: LanguageModel;
|
|
37
24
|
}
|
|
38
25
|
/**
|
|
39
26
|
* Abstract base for assertion evaluators.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { CostAssertion, AssertionResult, EvaluationInput } from "../types/index.js";
|
|
2
|
+
import { AssertionEvaluator } from "./assertion-evaluator.js";
|
|
3
|
+
/**
|
|
4
|
+
* Evaluator for "cost" assertion: checks that the scenario's LLM execution cost
|
|
5
|
+
* stays within a configured USD threshold by reading llmTrace.summary.totalCostUsd.
|
|
6
|
+
*/
|
|
7
|
+
export declare class CostEvaluator extends AssertionEvaluator<CostAssertion> {
|
|
8
|
+
readonly type: "cost";
|
|
9
|
+
evaluate(assertion: CostAssertion, input: EvaluationInput): AssertionResult;
|
|
10
|
+
}
|
|
@@ -26,8 +26,9 @@ export declare function getEvaluator(type: string): AssertionEvaluator | undefin
|
|
|
26
26
|
*/
|
|
27
27
|
export declare function evaluateAssertions(input: EvaluationInput, assertions: Assertion[], context?: AssertionContext): Promise<AssertionResult[]>;
|
|
28
28
|
export { AssertionEvaluator } from "./assertion-evaluator.js";
|
|
29
|
-
export type { AssertionContext, LlmConfig, LlmJudgeGenerateTextOptions } from "./assertion-evaluator.js";
|
|
29
|
+
export type { AssertionContext, LlmConfig } from "./assertion-evaluator.js";
|
|
30
30
|
export { SkillWasCalledEvaluator } from "./skill-was-called-evaluator.js";
|
|
31
31
|
export { BuildPassedEvaluator } from "./build-passed-evaluator.js";
|
|
32
32
|
export { TimeEvaluator } from "./time-evaluator.js";
|
|
33
|
-
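export { LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type JudgeResult, } from "./llm-judge-evaluator.js";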
|
|
33
|
+
export { CostEvaluator } from "./cost-evaluator.js";
|
|
34
|
+
export { LlmJudgeEvaluator, JudgeResultSchema, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type JudgeResult, } from "./llm-judge-evaluator.js";
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { LlmJudgeAssertion, AssertionResult, LLMTrace, EvaluationInput } from "../types/index.js";
|
|
2
|
+
import { z } from "zod";
|
|
2
3
|
import type { AssertionContext } from "./assertion-evaluator.js";
|
|
3
4
|
import { AssertionEvaluator } from "./assertion-evaluator.js";
|
|
4
5
|
export interface JudgeResult {
|
|
@@ -6,6 +7,11 @@ export interface JudgeResult {
|
|
|
6
7
|
score: number;
|
|
7
8
|
scoreReasoning: string;
|
|
8
9
|
}
|
|
10
|
+
export declare const JudgeResultSchema: z.ZodObject<{
|
|
11
|
+
text: z.ZodString;
|
|
12
|
+
score: z.ZodNumber;
|
|
13
|
+
scoreReasoning: z.ZodString;
|
|
14
|
+
}, z.core.$strip>;
|
|
9
15
|
/**
|
|
10
16
|
* Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).
|
|
11
17
|
*/
|
|
@@ -36,5 +42,10 @@ export declare function validateJudgeResult(parsed: unknown): JudgeResult;
|
|
|
36
42
|
export declare class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {
|
|
37
43
|
readonly type: "llm_judge";
|
|
38
44
|
evaluate(assertion: LlmJudgeAssertion, input: EvaluationInput, context?: AssertionContext): Promise<AssertionResult>;
|
|
45
|
+
/**
|
|
46
|
+
* Resolve the LanguageModel to use: context.model (injected mock/override)
|
|
47
|
+
* takes precedence, otherwise create from llmConfig + modelId.
|
|
48
|
+
*/
|
|
49
|
+
private resolveModel;
|
|
39
50
|
private callGenerateText;
|
|
40
51
|
}
|
package/build/types/index.d.ts
CHANGED
|
@@ -4,5 +4,6 @@
|
|
|
4
4
|
* Assertion framework for AI agent evaluations.
|
|
5
5
|
* Supports skill invocation checks, build validation, and LLM-based judging.
|
|
6
6
|
*/
|
|
7
|
-
export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type LlmJudgeAssertion, LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, AssertionResultSchema, AssertionResultStatus, type AssertionResult, type EvaluationInput, type FileDiff, } from "./types/index.js";
|
|
8
|
-
export { evaluateAssertions, registerEvaluator, getEvaluator, AssertionEvaluator, SkillWasCalledEvaluator, BuildPassedEvaluator, TimeEvaluator, LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type AssertionContext, type LlmConfig, type LlmJudgeGenerateTextOptions, type JudgeResult, } from "./evaluators/index.js";
|
|
7
|
+
export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, CostAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type CostAssertion, type LlmJudgeAssertion, LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, AssertionResultSchema, AssertionResultStatus, type AssertionResult, type EvaluationInput, type FileDiff, } from "./types/index.js";
|
|
8
|
+
export { evaluateAssertions, registerEvaluator, getEvaluator, AssertionEvaluator, SkillWasCalledEvaluator, BuildPassedEvaluator, TimeEvaluator, CostEvaluator, LlmJudgeEvaluator, JudgeResultSchema, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type AssertionContext, type LlmConfig, type JudgeResult, } from "./evaluators/index.js";
|
|
9
|
+
export { createReadFileTool } from "./tools/index.js";
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { createReadFileTool } from "./read-file-tool.js";
|
|
@@ -20,6 +20,15 @@ export declare const BuildPassedAssertionSchema: z.ZodObject<{
|
|
|
20
20
|
expectedExitCode: z.ZodOptional<z.ZodNumber>;
|
|
21
21
|
}, z.core.$strip>;
|
|
22
22
|
export type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;
|
|
23
|
+
/**
|
|
24
|
+
* Assertion: the scenario LLM execution cost must stay within a USD threshold.
|
|
25
|
+
* Checked by reading llmTrace.summary.totalCostUsd.
|
|
26
|
+
*/
|
|
27
|
+
export declare const CostAssertionSchema: z.ZodObject<{
|
|
28
|
+
type: z.ZodLiteral<"cost">;
|
|
29
|
+
maxCostUsd: z.ZodNumber;
|
|
30
|
+
}, z.core.$strip>;
|
|
31
|
+
export type CostAssertion = z.infer<typeof CostAssertionSchema>;
|
|
23
32
|
/**
|
|
24
33
|
* Assertion: an LLM judges the scenario output (score 0-100).
|
|
25
34
|
* Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.
|
|
@@ -59,6 +68,9 @@ export declare const AssertionSchema: z.ZodUnion<readonly [z.ZodObject<{
|
|
|
59
68
|
}, z.core.$strip>, z.ZodObject<{
|
|
60
69
|
type: z.ZodLiteral<"time_limit">;
|
|
61
70
|
maxDurationMs: z.ZodNumber;
|
|
71
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
72
|
+
type: z.ZodLiteral<"cost">;
|
|
73
|
+
maxCostUsd: z.ZodNumber;
|
|
62
74
|
}, z.core.$strip>, z.ZodObject<{
|
|
63
75
|
type: z.ZodLiteral<"llm_judge">;
|
|
64
76
|
prompt: z.ZodString;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type LlmJudgeAssertion, } from "./assertions.js";
|
|
1
|
+
export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, CostAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type CostAssertion, type LlmJudgeAssertion, } from "./assertions.js";
|
|
2
2
|
export { LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, } from "./trace.js";
|
|
3
3
|
export { AssertionResultSchema, AssertionResultStatus, type AssertionResult, } from "./result.js";
|
|
4
4
|
export { type EvaluationInput, type FileDiff, type FileStatus, } from "./input.js";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@wix/eval-assertions",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.18.0",
|
|
4
4
|
"description": "Assertion framework for AI agent evaluations - supports skill invocation checks, build validation, and LLM-based judging",
|
|
5
5
|
"files": [
|
|
6
6
|
"build"
|
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
"build": "yarn run clean && yarn run build:cjs && yarn run build:esm && yarn run build:types",
|
|
14
14
|
"lint": "eslint .",
|
|
15
15
|
"typecheck": "tsc --noEmit",
|
|
16
|
-
"test": "node --import tsx --test tests/**/*.test.ts"
|
|
16
|
+
"test": "node --import tsx --test tests/**/*.test.ts",
|
|
17
|
+
"test:agent": "node --import tsx --test tests/**/*.agent-test.ts"
|
|
17
18
|
},
|
|
18
19
|
"dependencies": {
|
|
19
20
|
"@ai-sdk/anthropic": "^3.0.2",
|
|
@@ -60,5 +61,5 @@
|
|
|
60
61
|
],
|
|
61
62
|
"license": "MIT",
|
|
62
63
|
"author": "Wix",
|
|
63
|
-
"falconPackageHash": "
|
|
64
|
+
"falconPackageHash": "2862b952ba60dd90bbde91927c092b33ea729c4ddaf92ff2784d03c5"
|
|
64
65
|
}
|