@wix/eval-assertions 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +215 -0
- package/build/index.js +543 -0
- package/build/index.js.map +7 -0
- package/build/index.mjs +495 -0
- package/build/index.mjs.map +7 -0
- package/build/types/evaluators/assertion-evaluator.d.ts +44 -0
- package/build/types/evaluators/build-passed-evaluator.d.ts +13 -0
- package/build/types/evaluators/index.d.ts +32 -0
- package/build/types/evaluators/llm-judge-evaluator.d.ts +24 -0
- package/build/types/evaluators/skill-was-called-evaluator.d.ts +11 -0
- package/build/types/index.d.ts +8 -0
- package/build/types/types/assertions.d.ts +58 -0
- package/build/types/types/index.d.ts +4 -0
- package/build/types/types/input.d.ts +20 -0
- package/build/types/types/result.d.ts +47 -0
- package/build/types/types/trace.d.ts +132 -0
- package/package.json +64 -0
package/README.md
ADDED
|
@@ -0,0 +1,215 @@
# @wix/eval-assertions

A framework for evaluating AI agent outputs through configurable assertions. Supports skill invocation checks, build validation, and LLM-based judging.

## Installation

```bash
npm install @wix/eval-assertions
# or
yarn add @wix/eval-assertions
```

## Features

- **Skill Was Called**: Verify that specific skills were invoked during agent execution
- **Build Passed**: Run build commands and verify exit codes
- **LLM Judge**: Use an LLM to evaluate agent outputs with customizable prompts and scoring

## Quick Start

```typescript
import {
  evaluateAssertions,
  AssertionResultStatus,
  type Assertion,
  type AssertionContext,
  type EvaluationInput
} from '@wix/eval-assertions';

// Define your assertions
const assertions: Assertion[] = [
  {
    type: 'skill_was_called',
    skillName: 'my-skill'
  },
  {
    type: 'build_passed',
    command: 'npm test',
    expectedExitCode: 0
  },
  {
    type: 'llm_judge',
    prompt: 'Evaluate if the output correctly implements the requested feature:\n\n{{output}}',
    minScore: 70
  }
];

// Prepare your evaluation input
const input: EvaluationInput = {
  outputText: 'Agent output here...',
  llmTrace: {
    id: 'trace-1',
    steps: [...],
    summary: {...}
  },
  fileDiffs: [...]
};

// Set up context for assertions that need it
const context: AssertionContext = {
  workDir: '/path/to/working/directory',
  llmConfig: {
    baseUrl: 'https://api.anthropic.com',
    headers: { 'x-api-key': 'your-key' }
  }
};

// Run assertions
const results = await evaluateAssertions(input, assertions, context);

// Check results
for (const result of results) {
  console.log(`${result.assertionName}: ${result.status}`);
  if (result.status === AssertionResultStatus.FAILED) {
    console.log(`  Message: ${result.message}`);
  }
}
```

## Assertion Types

### skill_was_called

Checks if a specific skill was invoked by examining the LLM trace.

```typescript
{
  type: 'skill_was_called',
  skillName: 'commit' // Name of the skill that must have been called
}
```

### build_passed

Runs a command in the working directory and checks the exit code.

```typescript
{
  type: 'build_passed',
  command: 'yarn build', // Command to run (default: 'yarn build')
  expectedExitCode: 0 // Expected exit code (default: 0)
}
```

### llm_judge

Uses an LLM to evaluate the output with a customizable prompt.

```typescript
{
  type: 'llm_judge',
  prompt: 'Evaluate the quality of this code:\n\n{{output}}',
  systemPrompt: 'You are a code reviewer...', // Optional custom system prompt
  minScore: 70, // Minimum passing score (0-100, default: 70)
  model: 'claude-3-5-haiku-20241022', // Model to use
  maxTokens: 1024, // Max output tokens
  temperature: 0 // Temperature (0-1)
}
```

**Available placeholders in prompts:**
- `{{output}}` - The agent's final output text
- `{{cwd}}` - Working directory path
- `{{changedFiles}}` - List of files that were modified
- `{{trace}}` - Formatted LLM trace showing tool calls and completions

## Types

### EvaluationInput

The input data for assertion evaluation:

```typescript
interface EvaluationInput {
  outputText?: string;
  llmTrace?: LLMTrace;
  fileDiffs?: Array<{ path: string }>;
}
```

### AssertionContext

Optional context for assertions:

```typescript
interface AssertionContext {
  workDir?: string; // For build_passed
  llmConfig?: { // For llm_judge
    baseUrl: string;
    headers: Record<string, string>;
  };
  generateTextForLlmJudge?: (options) => Promise<{ text: string }>; // For testing
}
```

### AssertionResult

The result of evaluating an assertion:

```typescript
interface AssertionResult {
  id: string;
  assertionId: string;
  assertionType: string;
  assertionName: string;
  status: AssertionResultStatus; // 'passed' | 'failed' | 'skipped' | 'error'
  message?: string;
  expected?: string;
  actual?: string;
  duration?: number;
  details?: Record<string, unknown>;
}
```

## Creating Custom Evaluators

You can extend the framework with custom assertion types:

```typescript
import { AssertionEvaluator, type AssertionResult, AssertionResultStatus } from '@wix/eval-assertions';
import { z } from 'zod';

// Define your assertion schema
export const MyAssertionSchema = z.object({
  type: z.literal('my_assertion'),
  customField: z.string()
});

export type MyAssertion = z.infer<typeof MyAssertionSchema>;

// Implement the evaluator
export class MyAssertionEvaluator extends AssertionEvaluator<MyAssertion> {
  readonly type = 'my_assertion' as const;

  evaluate(assertion, input, context): AssertionResult {
    // Your evaluation logic here
    return {
      id: crypto.randomUUID(),
      assertionId: crypto.randomUUID(),
      assertionType: 'my_assertion',
      assertionName: 'My Custom Assertion',
      status: AssertionResultStatus.PASSED,
      message: 'Assertion passed!'
    };
  }
}

// Register your evaluator
import { registerEvaluator } from '@wix/eval-assertions';
registerEvaluator('my_assertion', new MyAssertionEvaluator());
```

## License

MIT