@wix/eval-assertions 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,215 @@
1
+ # @wix/eval-assertions
2
+
3
+ A framework for evaluating AI agent outputs through configurable assertions. Supports skill invocation checks, build validation, and LLM-based judging.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install @wix/eval-assertions
9
+ # or
10
+ yarn add @wix/eval-assertions
11
+ ```
12
+
13
+ ## Features
14
+
15
+ - **Skill Was Called**: Verify that specific skills were invoked during agent execution
16
+ - **Build Passed**: Run build commands and verify exit codes
17
+ - **LLM Judge**: Use an LLM to evaluate agent outputs with customizable prompts and scoring
18
+
19
+ ## Quick Start
20
+
21
+ ```typescript
22
+ import {
23
+ evaluateAssertions,
24
+ AssertionResultStatus,
25
+ type Assertion,
26
+ type AssertionContext,
27
+ type EvaluationInput
28
+ } from '@wix/eval-assertions';
29
+
30
+ // Define your assertions
31
+ const assertions: Assertion[] = [
32
+ {
33
+ type: 'skill_was_called',
34
+ skillName: 'my-skill'
35
+ },
36
+ {
37
+ type: 'build_passed',
38
+ command: 'npm test',
39
+ expectedExitCode: 0
40
+ },
41
+ {
42
+ type: 'llm_judge',
43
+ prompt: 'Evaluate if the output correctly implements the requested feature:\n\n{{output}}',
44
+ minScore: 70
45
+ }
46
+ ];
47
+
48
+ // Prepare your evaluation input
49
+ const input: EvaluationInput = {
50
+ outputText: 'Agent output here...',
51
+ llmTrace: {
52
+ id: 'trace-1',
53
+ steps: [...],
54
+ summary: {...}
55
+ },
56
+ fileDiffs: [...]
57
+ };
58
+
59
+ // Set up context for assertions that need it
60
+ const context: AssertionContext = {
61
+ workDir: '/path/to/working/directory',
62
+ llmConfig: {
63
+ baseUrl: 'https://api.anthropic.com',
64
+ headers: { 'x-api-key': 'your-key' }
65
+ }
66
+ };
67
+
68
+ // Run assertions
69
+ const results = await evaluateAssertions(input, assertions, context);
70
+
71
+ // Check results
72
+ for (const result of results) {
73
+ console.log(`${result.assertionName}: ${result.status}`);
74
+ if (result.status === AssertionResultStatus.FAILED) {
75
+ console.log(` Message: ${result.message}`);
76
+ }
77
+ }
78
+ ```
79
+
80
+ ## Assertion Types
81
+
82
+ ### skill_was_called
83
+
84
+ Checks if a specific skill was invoked by examining the LLM trace.
85
+
86
+ ```typescript
87
+ {
88
+ type: 'skill_was_called',
89
+ skillName: 'commit' // Name of the skill that must have been called
90
+ }
91
+ ```
92
+
93
+ ### build_passed
94
+
95
+ Runs a command in the working directory and checks the exit code.
96
+
97
+ ```typescript
98
+ {
99
+ type: 'build_passed',
100
+ command: 'yarn build', // Command to run (default: 'yarn build')
101
+ expectedExitCode: 0 // Expected exit code (default: 0)
102
+ }
103
+ ```
104
+
105
+ ### llm_judge
106
+
107
+ Uses an LLM to evaluate the output with a customizable prompt.
108
+
109
+ ```typescript
110
+ {
111
+ type: 'llm_judge',
112
+ prompt: 'Evaluate the quality of this code:\n\n{{output}}',
113
+ systemPrompt: 'You are a code reviewer...', // Optional custom system prompt
114
+ minScore: 70, // Minimum passing score (0-100, default: 70)
115
+ model: 'claude-3-5-haiku-20241022', // Model to use
116
+ maxTokens: 1024, // Max output tokens
117
+ temperature: 0 // Temperature (0-1)
118
+ }
119
+ ```
120
+
121
+ **Available placeholders in prompts:**
122
+ - `{{output}}` - The agent's final output text
123
+ - `{{cwd}}` - Working directory path
124
+ - `{{changedFiles}}` - List of files that were modified
125
+ - `{{trace}}` - Formatted LLM trace showing tool calls and completions
126
+
127
+ ## Types
128
+
129
+ ### EvaluationInput
130
+
131
+ The input data for assertion evaluation:
132
+
133
+ ```typescript
134
+ interface EvaluationInput {
135
+ outputText?: string;
136
+ llmTrace?: LLMTrace;
137
+ fileDiffs?: Array<{ path: string }>;
138
+ }
139
+ ```
140
+
141
+ ### AssertionContext
142
+
143
+ Optional context for assertions:
144
+
145
+ ```typescript
146
+ interface AssertionContext {
147
+ workDir?: string; // For build_passed
148
+ llmConfig?: { // For llm_judge
149
+ baseUrl: string;
150
+ headers: Record<string, string>;
151
+ };
152
+ generateTextForLlmJudge?: (options) => Promise<{ text: string }>; // For testing
153
+ }
154
+ ```
155
+
156
+ ### AssertionResult
157
+
158
+ The result of evaluating an assertion:
159
+
160
+ ```typescript
161
+ interface AssertionResult {
162
+ id: string;
163
+ assertionId: string;
164
+ assertionType: string;
165
+ assertionName: string;
166
+ status: AssertionResultStatus; // 'passed' | 'failed' | 'skipped' | 'error'
167
+ message?: string;
168
+ expected?: string;
169
+ actual?: string;
170
+ duration?: number;
171
+ details?: Record<string, unknown>;
172
+ }
173
+ ```
174
+
175
+ ## Creating Custom Evaluators
176
+
177
+ You can extend the framework with custom assertion types:
178
+
179
+ ```typescript
180
+ import { AssertionEvaluator, type AssertionResult, AssertionResultStatus, type EvaluationInput, type AssertionContext } from '@wix/eval-assertions';
181
+ import { z } from 'zod';
182
+
183
+ // Define your assertion schema
184
+ export const MyAssertionSchema = z.object({
185
+ type: z.literal('my_assertion'),
186
+ customField: z.string()
187
+ });
188
+
189
+ export type MyAssertion = z.infer<typeof MyAssertionSchema>;
190
+
191
+ // Implement the evaluator
192
+ export class MyAssertionEvaluator extends AssertionEvaluator<MyAssertion> {
193
+ readonly type = 'my_assertion' as const;
194
+
195
+ evaluate(assertion: MyAssertion, input: EvaluationInput, context: AssertionContext): AssertionResult {
196
+ // Your evaluation logic here
197
+ return {
198
+ id: crypto.randomUUID(),
199
+ assertionId: crypto.randomUUID(),
200
+ assertionType: 'my_assertion',
201
+ assertionName: 'My Custom Assertion',
202
+ status: AssertionResultStatus.PASSED,
203
+ message: 'Assertion passed!'
204
+ };
205
+ }
206
+ }
207
+
208
+ // Register your evaluator
209
+ import { registerEvaluator } from '@wix/eval-assertions';
210
+ registerEvaluator('my_assertion', new MyAssertionEvaluator());
211
+ ```
212
+
213
+ ## License
214
+
215
+ MIT