judgeval 0.1.42 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +105 -73
- package/dist/cjs/common/integrations/langgraph.js +141 -367
- package/dist/cjs/common/integrations/langgraph.js.map +1 -1
- package/dist/cjs/common/logger-instance.js +17 -19
- package/dist/cjs/common/logger-instance.js.map +1 -1
- package/dist/cjs/common/tracer.js +446 -379
- package/dist/cjs/common/tracer.js.map +1 -1
- package/dist/cjs/constants.js +3 -2
- package/dist/cjs/constants.js.map +1 -1
- package/dist/cjs/index.js +1 -3
- package/dist/cjs/index.js.map +1 -1
- package/dist/cjs/judgment-client.js +45 -122
- package/dist/cjs/judgment-client.js.map +1 -1
- package/dist/cjs/rules.js +6 -6
- package/dist/cjs/rules.js.map +1 -1
- package/dist/cjs/scorers/api-scorer.js +56 -48
- package/dist/cjs/scorers/api-scorer.js.map +1 -1
- package/dist/cjs/scorers/base-scorer.js +66 -11
- package/dist/cjs/scorers/base-scorer.js.map +1 -1
- package/dist/esm/common/integrations/langgraph.js +142 -371
- package/dist/esm/common/integrations/langgraph.js.map +1 -1
- package/dist/esm/common/logger-instance.js +17 -19
- package/dist/esm/common/logger-instance.js.map +1 -1
- package/dist/esm/common/tracer.js +447 -378
- package/dist/esm/common/tracer.js.map +1 -1
- package/dist/esm/constants.js +2 -1
- package/dist/esm/constants.js.map +1 -1
- package/dist/esm/index.js +0 -1
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/judgment-client.js +45 -122
- package/dist/esm/judgment-client.js.map +1 -1
- package/dist/esm/rules.js +6 -6
- package/dist/esm/rules.js.map +1 -1
- package/dist/esm/scorers/api-scorer.js +56 -48
- package/dist/esm/scorers/api-scorer.js.map +1 -1
- package/dist/esm/scorers/base-scorer.js +66 -11
- package/dist/esm/scorers/base-scorer.js.map +1 -1
- package/dist/types/common/integrations/langgraph.d.ts +21 -29
- package/dist/types/common/tracer.d.ts +49 -32
- package/dist/types/constants.d.ts +2 -1
- package/dist/types/index.d.ts +0 -1
- package/dist/types/judgment-client.d.ts +0 -22
- package/dist/types/rules.d.ts +2 -2
- package/dist/types/scorers/api-scorer.d.ts +15 -15
- package/dist/types/scorers/base-scorer.d.ts +53 -10
- package/package.json +15 -5
- package/dist/cjs/scorers/exact-match-scorer.js +0 -84
- package/dist/cjs/scorers/exact-match-scorer.js.map +0 -1
- package/dist/esm/scorers/exact-match-scorer.js +0 -80
- package/dist/esm/scorers/exact-match-scorer.js.map +0 -1
- package/dist/types/scorers/exact-match-scorer.d.ts +0 -10
package/README.md
CHANGED
|
@@ -131,12 +131,17 @@ const tracer = Tracer.getInstance({
|
|
|
131
131
|
enableEvaluations: true
|
|
132
132
|
});
|
|
133
133
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
134
|
+
// Analogous to the Python SDK's `with` statement, e.g.:
|
|
135
|
+
//
|
|
136
|
+
// with tracer.trace("my-trace") as trace:
|
|
137
|
+
// with trace.span("operation") as span:
|
|
138
|
+
// # Perform operations
|
|
139
|
+
//
|
|
140
|
+
for (const trace of tracer.trace("my-trace")) {
|
|
141
|
+
for (const span of trace.span("operation")) {
|
|
137
142
|
// Perform operations
|
|
138
|
-
}
|
|
139
|
-
}
|
|
143
|
+
}
|
|
144
|
+
}
|
|
140
145
|
```
|
|
141
146
|
|
|
142
147
|
## Result Retrieval
|
|
@@ -147,21 +152,12 @@ You can retrieve past evaluation results using several methods:
|
|
|
147
152
|
// Initialize the JudgmentClient
|
|
148
153
|
const client = JudgmentClient.getInstance();
|
|
149
154
|
|
|
150
|
-
//
|
|
155
|
+
// Using pullEval
|
|
151
156
|
const results = await client.pullEval('my-project', 'my-eval-run');
|
|
152
157
|
|
|
153
|
-
//
|
|
154
|
-
const
|
|
155
|
-
|
|
156
|
-
// List all evaluation runs for a project
|
|
157
|
-
const evalRuns = await client.listEvalRuns('my-project', 100, 0); // limit=100, offset=0
|
|
158
|
-
|
|
159
|
-
// Get statistics for an evaluation run
|
|
160
|
-
const stats = await client.getEvalRunStats('my-project', 'my-eval-run');
|
|
161
|
-
|
|
162
|
-
// Export evaluation results to JSON or CSV
|
|
163
|
-
const jsonExport = await client.exportEvalResults('my-project', 'my-eval-run', 'json');
|
|
164
|
-
const csvExport = await client.exportEvalResults('my-project', 'my-eval-run', 'csv');
|
|
158
|
+
// Export evaluation results to different formats
|
|
159
|
+
const jsonData = await client.exportEvalResults('my-project', 'my-eval-run', 'json');
|
|
160
|
+
const csvData = await client.exportEvalResults('my-project', 'my-eval-run', 'csv');
|
|
165
161
|
```
|
|
166
162
|
|
|
167
163
|
The returned results include the evaluation run ID and a list of scoring results:
|
|
@@ -183,34 +179,49 @@ For a complete example of retrieving evaluation results, see `src/examples/resul
|
|
|
183
179
|
|
|
184
180
|
## Custom Scorers
|
|
185
181
|
|
|
186
|
-
You can create custom scorers by extending the `JudgevalScorer` class.
|
|
182
|
+
You can create custom scorers by extending the `JudgevalScorer` class. This implementation mirrors the Python SDK's approach, making it easy to port scorers between languages.
|
|
183
|
+
|
|
184
|
+
### Creating a Custom Scorer
|
|
185
|
+
|
|
186
|
+
To create a custom scorer:
|
|
187
|
+
|
|
188
|
+
1. **Extend the JudgevalScorer class**:
|
|
187
189
|
|
|
188
190
|
```typescript
|
|
189
|
-
import { Example } from '
|
|
190
|
-
import { JudgevalScorer } from '
|
|
191
|
-
import { ScorerData } from '
|
|
191
|
+
import { Example } from 'judgeval/data/example';
|
|
192
|
+
import { JudgevalScorer } from 'judgeval/scorers/base-scorer';
|
|
193
|
+
import { ScorerData } from 'judgeval/data/result';
|
|
192
194
|
|
|
193
|
-
/**
|
|
194
|
-
* ExactMatchScorer - A custom scorer that checks if the actual output exactly matches the expected output
|
|
195
|
-
*/
|
|
196
195
|
class ExactMatchScorer extends JudgevalScorer {
|
|
197
|
-
constructor(
|
|
198
|
-
|
|
196
|
+
constructor(
|
|
197
|
+
threshold: number = 1.0,
|
|
198
|
+
additional_metadata?: Record<string, any>,
|
|
199
|
+
include_reason: boolean = true,
|
|
200
|
+
async_mode: boolean = true,
|
|
201
|
+
strict_mode: boolean = false,
|
|
202
|
+
verbose_mode: boolean = true
|
|
203
|
+
) {
|
|
204
|
+
super('exact_match', threshold, additional_metadata, include_reason, async_mode, strict_mode, verbose_mode);
|
|
199
205
|
}
|
|
200
206
|
|
|
201
207
|
async scoreExample(example: Example): Promise<ScorerData> {
|
|
202
208
|
try {
|
|
203
209
|
// Check if the example has expected output
|
|
204
210
|
if (!example.expectedOutput) {
|
|
211
|
+
this.error = "Missing expected output";
|
|
212
|
+
this.score = 0;
|
|
213
|
+
this.success = false;
|
|
214
|
+
this.reason = "Expected output is required for exact match scoring";
|
|
215
|
+
|
|
205
216
|
return {
|
|
206
217
|
name: this.type,
|
|
207
218
|
threshold: this.threshold,
|
|
208
219
|
success: false,
|
|
209
220
|
score: 0,
|
|
210
|
-
reason:
|
|
211
|
-
strict_mode:
|
|
221
|
+
reason: this.reason,
|
|
222
|
+
strict_mode: this.strict_mode,
|
|
212
223
|
evaluation_model: "exact-match",
|
|
213
|
-
error:
|
|
224
|
+
error: this.error,
|
|
214
225
|
evaluation_cost: null,
|
|
215
226
|
verbose_logs: null,
|
|
216
227
|
additional_metadata: this.additional_metadata || {}
|
|
@@ -226,35 +237,48 @@ class ExactMatchScorer extends JudgevalScorer {
|
|
|
226
237
|
this.score = isMatch ? 1 : 0;
|
|
227
238
|
|
|
228
239
|
// Generate a reason for the score
|
|
229
|
-
|
|
240
|
+
this.reason = isMatch
|
|
230
241
|
? "The actual output exactly matches the expected output."
|
|
231
242
|
: `The actual output "${actualOutput}" does not match the expected output "${expectedOutput}".`;
|
|
232
243
|
|
|
244
|
+
// Set success based on the score and threshold
|
|
245
|
+
this.success = this._successCheck();
|
|
246
|
+
|
|
247
|
+
// Generate verbose logs if verbose mode is enabled
|
|
248
|
+
if (this.verbose_mode) {
|
|
249
|
+
this.verbose_logs = `Comparing: "${actualOutput}" with "${expectedOutput}"`;
|
|
250
|
+
}
|
|
251
|
+
|
|
233
252
|
// Return the scorer data
|
|
234
253
|
return {
|
|
235
254
|
name: this.type,
|
|
236
255
|
threshold: this.threshold,
|
|
237
|
-
success: this.
|
|
256
|
+
success: this.success,
|
|
238
257
|
score: this.score,
|
|
239
|
-
reason: reason,
|
|
240
|
-
strict_mode:
|
|
258
|
+
reason: this.include_reason ? this.reason : null,
|
|
259
|
+
strict_mode: this.strict_mode,
|
|
241
260
|
evaluation_model: "exact-match",
|
|
242
261
|
error: null,
|
|
243
262
|
evaluation_cost: null,
|
|
244
|
-
verbose_logs: this.
|
|
263
|
+
verbose_logs: this.verbose_mode ? this.verbose_logs : null,
|
|
245
264
|
additional_metadata: this.additional_metadata || {}
|
|
246
265
|
};
|
|
247
266
|
} catch (error) {
|
|
248
267
|
// Handle any errors during scoring
|
|
249
268
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
250
269
|
|
|
270
|
+
this.error = errorMessage;
|
|
271
|
+
this.score = 0;
|
|
272
|
+
this.success = false;
|
|
273
|
+
this.reason = `Error during scoring: ${errorMessage}`;
|
|
274
|
+
|
|
251
275
|
return {
|
|
252
276
|
name: this.type,
|
|
253
277
|
threshold: this.threshold,
|
|
254
278
|
success: false,
|
|
255
279
|
score: 0,
|
|
256
|
-
reason:
|
|
257
|
-
strict_mode:
|
|
280
|
+
reason: this.reason,
|
|
281
|
+
strict_mode: this.strict_mode,
|
|
258
282
|
evaluation_model: "exact-match",
|
|
259
283
|
error: errorMessage,
|
|
260
284
|
evaluation_cost: null,
|
|
@@ -263,9 +287,30 @@ class ExactMatchScorer extends JudgevalScorer {
|
|
|
263
287
|
};
|
|
264
288
|
}
|
|
265
289
|
}
|
|
290
|
+
|
|
291
|
+
/**
|
|
292
|
+
* Get the name of the scorer
|
|
293
|
+
* This is equivalent to Python's __name__ property
|
|
294
|
+
*/
|
|
295
|
+
get name(): string {
|
|
296
|
+
return "Exact Match Scorer";
|
|
297
|
+
}
|
|
266
298
|
}
|
|
267
299
|
```
|
|
268
300
|
|
|
301
|
+
2. **Implement required methods**:
|
|
302
|
+
|
|
303
|
+
- `scoreExample(example: Example)`: The core method that evaluates an example and returns a score
|
|
304
|
+
- `name`: A getter property that returns the human-readable name of your scorer
|
|
305
|
+
|
|
306
|
+
3. **Set internal state**:
|
|
307
|
+
|
|
308
|
+
Your implementation should set these internal properties:
|
|
309
|
+
- `this.score`: The numerical score (typically between 0 and 1)
|
|
310
|
+
- `this.success`: Whether the example passed the evaluation
|
|
311
|
+
- `this.reason`: A human-readable explanation of the score
|
|
312
|
+
- `this.error`: Any error that occurred during scoring
|
|
313
|
+
|
|
269
314
|
### Using Custom Scorers
|
|
270
315
|
|
|
271
316
|
You can use custom scorers with the JudgmentClient just like any other scorer:
|
|
@@ -277,53 +322,40 @@ const examples = [
|
|
|
277
322
|
.input("What is the capital of France?")
|
|
278
323
|
.actualOutput("Paris is the capital of France.")
|
|
279
324
|
.expectedOutput("Paris is the capital of France.")
|
|
280
|
-
.exampleIndex(0)
|
|
281
325
|
.build(),
|
|
282
326
|
// Add more examples...
|
|
283
327
|
];
|
|
284
328
|
|
|
285
329
|
// Create a custom scorer
|
|
286
|
-
const exactMatchScorer = new ExactMatchScorer(
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
//
|
|
292
|
-
|
|
293
|
-
examples,
|
|
294
|
-
[exactMatchScorer],
|
|
295
|
-
"gpt-3.5-turbo", // Specify a valid model name
|
|
296
|
-
"my-project",
|
|
297
|
-
{
|
|
298
|
-
evalRunName: "custom-scorer-test",
|
|
299
|
-
logResults: true
|
|
300
|
-
}
|
|
330
|
+
const exactMatchScorer = new ExactMatchScorer(
|
|
331
|
+
1.0,
|
|
332
|
+
{ description: "Checks for exact string match" },
|
|
333
|
+
true, // include_reason
|
|
334
|
+
true, // async_mode
|
|
335
|
+
false, // strict_mode
|
|
336
|
+
true // verbose_mode
|
|
301
337
|
);
|
|
302
|
-
```
|
|
303
|
-
|
|
304
|
-
### Viewing Results
|
|
305
338
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
339
|
+
// Run evaluation with the custom scorer
|
|
340
|
+
const results = await client.runEvaluation({
|
|
341
|
+
examples: examples,
|
|
342
|
+
scorers: [exactMatchScorer],
|
|
343
|
+
projectName: "my-project",
|
|
344
|
+
evalRunName: "custom-scorer-test",
|
|
345
|
+
useJudgment: false // Run locally, don't use Judgment API
|
|
346
|
+
});
|
|
310
347
|
```
|
|
311
348
|
|
|
312
|
-
|
|
349
|
+
### Custom Scorer Parameters
|
|
313
350
|
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
return r.scorersData?.every(s => s.success) ?? false;
|
|
321
|
-
}).length;
|
|
322
|
-
|
|
323
|
-
console.log(`Success rate: ${successCount}/${examples.length} (${(successCount/examples.length*100).toFixed(2)}%)`);
|
|
324
|
-
```
|
|
351
|
+
- `threshold`: The minimum score required for success (0-1 for most scorers)
|
|
352
|
+
- `additional_metadata`: Extra information to include with results
|
|
353
|
+
- `include_reason`: Whether to include a reason for the score
|
|
354
|
+
- `async_mode`: Whether to run the scorer asynchronously
|
|
355
|
+
- `strict_mode`: If true, sets threshold to 1.0 for strict evaluation
|
|
356
|
+
- `verbose_mode`: Whether to include detailed logs
|
|
325
357
|
|
|
326
|
-
For a complete example of using custom scorers, see `src/examples/custom-scorer.ts`.
|
|
358
|
+
For a complete example of creating and using custom scorers, see `src/examples/custom-scorer.ts`.
|
|
327
359
|
|
|
328
360
|
## Examples
|
|
329
361
|
|