@wix/evalforge-evaluator 0.35.0 → 0.36.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +35413 -35316
- package/build/index.js.map +4 -4
- package/build/index.mjs +35289 -35192
- package/build/index.mjs.map +4 -4
- package/package.json +5 -4
- package/build/types/run-scenario/assertions/assertionEvaluator.d.ts +0 -37
- package/build/types/run-scenario/assertions/buildPassedEvaluator.d.ts +0 -12
- package/build/types/run-scenario/assertions/index.d.ts +0 -13
- package/build/types/run-scenario/assertions/llmJudgeEvaluator.d.ts +0 -25
- package/build/types/run-scenario/assertions/skillWasCalledEvaluator.d.ts +0 -12
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@wix/evalforge-evaluator",
|
|
3
|
-
"version": "0.35.0",
|
|
3
|
+
"version": "0.36.0",
|
|
4
4
|
"description": "EvalForge Evaluator",
|
|
5
5
|
"bin": "./build/index.js",
|
|
6
6
|
"files": [
|
|
@@ -13,13 +13,14 @@
|
|
|
13
13
|
"build:types": "tsc --emitDeclarationOnly --outDir ./build/types",
|
|
14
14
|
"build": "yarn run clean && yarn run build:cjs && yarn run build:esm && yarn run build:types",
|
|
15
15
|
"lint": "eslint .",
|
|
16
|
-
"test": "
|
|
16
|
+
"test": "echo 'Tests moved to @wix/eval-assertions package' && exit 0"
|
|
17
17
|
},
|
|
18
18
|
"dependencies": {
|
|
19
19
|
"@ai-sdk/anthropic": "^3.0.2",
|
|
20
20
|
"@anthropic-ai/claude-agent-sdk": "^0.2.12",
|
|
21
21
|
"@anthropic-ai/claude-code": "^2.0.76",
|
|
22
|
-
"@wix/
|
|
22
|
+
"@wix/eval-assertions": "0.1.0",
|
|
23
|
+
"@wix/evalforge-types": "0.10.0",
|
|
23
24
|
"ai": "^6.0.6",
|
|
24
25
|
"tar": "^7.5.3",
|
|
25
26
|
"zod": "^4.3.5"
|
|
@@ -56,5 +57,5 @@
|
|
|
56
57
|
"artifactId": "evalforge-evaluator"
|
|
57
58
|
}
|
|
58
59
|
},
|
|
59
|
-
"falconPackageHash": "
|
|
60
|
+
"falconPackageHash": "15d351fd4bdbc1ded924fcc82efa1cf47c410837b12f78bd9fdee310"
|
|
60
61
|
}
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import type { Assertion, AssertionResult } from '@wix/evalforge-types';
|
|
2
|
-
import type { EvaluatorConfig } from '../../config.js';
|
|
3
|
-
import type { PartialEvalRunResult } from '../types.js';
|
|
4
|
-
/**
|
|
5
|
-
* Options passed to the LLM for llm_judge. Used by the optional stub for testing.
|
|
6
|
-
*/
|
|
7
|
-
export interface LlmJudgeGenerateTextOptions {
|
|
8
|
-
prompt: string;
|
|
9
|
-
system: string;
|
|
10
|
-
maxOutputTokens: number;
|
|
11
|
-
temperature: number;
|
|
12
|
-
}
|
|
13
|
-
/**
|
|
14
|
-
* Optional context passed when evaluating assertions (e.g. workDir for build_passed, config for llm_judge).
|
|
15
|
-
*/
|
|
16
|
-
export interface AssertionContext {
|
|
17
|
-
/** Working directory for the scenario (used by build_passed) */
|
|
18
|
-
workDir?: string;
|
|
19
|
-
/** Evaluator config (used by llm_judge for AI gateway) */
|
|
20
|
-
config?: EvaluatorConfig;
|
|
21
|
-
/**
|
|
22
|
-
* Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.
|
|
23
|
-
* Used only in tests to avoid hitting the API.
|
|
24
|
-
*/
|
|
25
|
-
generateTextForLlmJudge?: (options: LlmJudgeGenerateTextOptions) => Promise<{
|
|
26
|
-
text: string;
|
|
27
|
-
}>;
|
|
28
|
-
}
|
|
29
|
-
/**
|
|
30
|
-
* Abstract base for assertion evaluators.
|
|
31
|
-
* Each assertion type has a concrete class that implements evaluate().
|
|
32
|
-
* evaluate() may return a Promise for async assertions (e.g. llm_judge).
|
|
33
|
-
*/
|
|
34
|
-
export declare abstract class AssertionEvaluator<T extends Assertion = Assertion> {
|
|
35
|
-
abstract readonly type: T['type'];
|
|
36
|
-
abstract evaluate(assertion: T, partialResult: PartialEvalRunResult, context?: AssertionContext): AssertionResult | Promise<AssertionResult>;
|
|
37
|
-
}
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
import type { BuildPassedAssertion, AssertionResult } from '@wix/evalforge-types';
|
|
2
|
-
import type { PartialEvalRunResult } from '../types.js';
|
|
3
|
-
import type { AssertionContext } from './assertionEvaluator.js';
|
|
4
|
-
import { AssertionEvaluator } from './assertionEvaluator.js';
|
|
5
|
-
/**
|
|
6
|
-
* Evaluator for "build_passed" assertion: runs a build command in the scenario
|
|
7
|
-
* working directory and passes if the command exits with the expected code (default 0).
|
|
8
|
-
*/
|
|
9
|
-
export declare class BuildPassedEvaluator extends AssertionEvaluator<BuildPassedAssertion> {
|
|
10
|
-
readonly type: "build_passed";
|
|
11
|
-
evaluate(assertion: BuildPassedAssertion, partialResult: PartialEvalRunResult, context?: AssertionContext): AssertionResult;
|
|
12
|
-
}
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
import type { Assertion, AssertionResult } from '@wix/evalforge-types';
|
|
2
|
-
import type { PartialEvalRunResult } from '../types.js';
|
|
3
|
-
import type { AssertionContext } from './assertionEvaluator.js';
|
|
4
|
-
/**
|
|
5
|
-
* Evaluate all assertions against the partial result.
|
|
6
|
-
*
|
|
7
|
-
* @param partialResult - Result from callSkill/callAgent (includes llmTrace)
|
|
8
|
-
* @param assertions - Optional list of assertions to evaluate
|
|
9
|
-
* @param context - Optional context (e.g. workDir for build_passed, config for llm_judge)
|
|
10
|
-
* @returns Array of assertion results; empty if no assertions
|
|
11
|
-
*/
|
|
12
|
-
export declare function evaluateAssertions(partialResult: PartialEvalRunResult, assertions: Assertion[], context?: AssertionContext): Promise<AssertionResult[]>;
|
|
13
|
-
export { AssertionEvaluator } from './assertionEvaluator.js';
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
import type { LlmJudgeAssertion, AssertionResult, LLMTrace } from '@wix/evalforge-types';
|
|
2
|
-
import type { PartialEvalRunResult } from '../types.js';
|
|
3
|
-
import type { AssertionContext } from './assertionEvaluator.js';
|
|
4
|
-
import { AssertionEvaluator } from './assertionEvaluator.js';
|
|
5
|
-
export interface JudgeResult {
|
|
6
|
-
text: string;
|
|
7
|
-
score: number;
|
|
8
|
-
scoreReasoning: string;
|
|
9
|
-
}
|
|
10
|
-
/**
|
|
11
|
-
* Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).
|
|
12
|
-
*/
|
|
13
|
-
export declare function formatTraceForJudge(llmTrace: LLMTrace | undefined): string;
|
|
14
|
-
export declare function replacePlaceholders(str: string, output: string, cwd: string, changedFiles: string, trace: string): string;
|
|
15
|
-
export declare function validateJudgeResult(parsed: unknown): JudgeResult;
|
|
16
|
-
/**
|
|
17
|
-
* Evaluator for "llm_judge" assertion: an LLM judges the scenario output
|
|
18
|
-
* (prompt with {{output}}, {{cwd}}, {{changedFiles}}, {{trace}}) and returns a score 0–100.
|
|
19
|
-
* Passes if score >= minScore.
|
|
20
|
-
*/
|
|
21
|
-
export declare class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {
|
|
22
|
-
readonly type: "llm_judge";
|
|
23
|
-
evaluate(assertion: LlmJudgeAssertion, partialResult: PartialEvalRunResult, context?: AssertionContext): Promise<AssertionResult>;
|
|
24
|
-
private callGenerateText;
|
|
25
|
-
}
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
import type { SkillWasCalledAssertion, AssertionResult } from '@wix/evalforge-types';
|
|
2
|
-
import type { PartialEvalRunResult } from '../types.js';
|
|
3
|
-
import type { AssertionContext } from './assertionEvaluator.js';
|
|
4
|
-
import { AssertionEvaluator } from './assertionEvaluator.js';
|
|
5
|
-
/**
|
|
6
|
-
* Evaluator for "skill_was_called" assertion: the LLM trace must contain a step
|
|
7
|
-
* where the "Skill" tool was used with the expected skill (by name).
|
|
8
|
-
*/
|
|
9
|
-
export declare class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {
|
|
10
|
-
readonly type: "skill_was_called";
|
|
11
|
-
evaluate(assertion: SkillWasCalledAssertion, partialResult: PartialEvalRunResult, _context?: AssertionContext): AssertionResult;
|
|
12
|
-
}
|