judgeval 0.2.0 → 0.2.2
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- package/README.md +95 -68
- package/dist/cjs/common/tracer.js +235 -143
- package/dist/cjs/common/tracer.js.map +1 -1
- package/dist/cjs/constants.js +8 -5
- package/dist/cjs/constants.js.map +1 -1
- package/dist/cjs/data/datasets/eval-dataset-client.js +349 -0
- package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -0
- package/dist/cjs/data/datasets/eval-dataset.js +405 -0
- package/dist/cjs/data/datasets/eval-dataset.js.map +1 -0
- package/dist/cjs/data/example.js +22 -1
- package/dist/cjs/data/example.js.map +1 -1
- package/dist/cjs/e2etests/eval-operations.test.js +282 -0
- package/dist/cjs/e2etests/eval-operations.test.js.map +1 -0
- package/dist/cjs/e2etests/judgee-traces.test.js +278 -0
- package/dist/cjs/e2etests/judgee-traces.test.js.map +1 -0
- package/dist/cjs/index.js +1 -3
- package/dist/cjs/index.js.map +1 -1
- package/dist/cjs/judgment-client.js +326 -645
- package/dist/cjs/judgment-client.js.map +1 -1
- package/dist/cjs/scorers/api-scorer.js +56 -48
- package/dist/cjs/scorers/api-scorer.js.map +1 -1
- package/dist/cjs/scorers/base-scorer.js +66 -11
- package/dist/cjs/scorers/base-scorer.js.map +1 -1
- package/dist/esm/common/tracer.js +236 -144
- package/dist/esm/common/tracer.js.map +1 -1
- package/dist/esm/constants.js +7 -4
- package/dist/esm/constants.js.map +1 -1
- package/dist/esm/data/datasets/eval-dataset-client.js +342 -0
- package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -0
- package/dist/esm/data/datasets/eval-dataset.js +375 -0
- package/dist/esm/data/datasets/eval-dataset.js.map +1 -0
- package/dist/esm/data/example.js +22 -1
- package/dist/esm/data/example.js.map +1 -1
- package/dist/esm/e2etests/eval-operations.test.js +254 -0
- package/dist/esm/e2etests/eval-operations.test.js.map +1 -0
- package/dist/esm/e2etests/judgee-traces.test.js +253 -0
- package/dist/esm/e2etests/judgee-traces.test.js.map +1 -0
- package/dist/esm/index.js +0 -1
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/judgment-client.js +328 -647
- package/dist/esm/judgment-client.js.map +1 -1
- package/dist/esm/scorers/api-scorer.js +56 -48
- package/dist/esm/scorers/api-scorer.js.map +1 -1
- package/dist/esm/scorers/base-scorer.js +66 -11
- package/dist/esm/scorers/base-scorer.js.map +1 -1
- package/dist/types/common/tracer.d.ts +27 -14
- package/dist/types/constants.d.ts +4 -4
- package/dist/types/data/datasets/eval-dataset-client.d.ts +39 -0
- package/dist/types/data/datasets/eval-dataset.d.ts +45 -0
- package/dist/types/data/example.d.ts +24 -12
- package/dist/types/e2etests/eval-operations.test.d.ts +5 -0
- package/dist/types/e2etests/judgee-traces.test.d.ts +5 -0
- package/dist/types/index.d.ts +0 -1
- package/dist/types/judgment-client.d.ts +3 -47
- package/dist/types/scorers/api-scorer.d.ts +15 -15
- package/dist/types/scorers/base-scorer.d.ts +53 -10
- package/package.json +2 -1
- package/dist/cjs/scorers/exact-match-scorer.js +0 -84
- package/dist/cjs/scorers/exact-match-scorer.js.map +0 -1
- package/dist/esm/scorers/exact-match-scorer.js +0 -80
- package/dist/esm/scorers/exact-match-scorer.js.map +0 -1
- package/dist/types/scorers/exact-match-scorer.d.ts +0 -10
package/README.md
CHANGED
|
@@ -152,21 +152,12 @@ You can retrieve past evaluation results using several methods:
|
|
|
152
152
|
// Initialize the JudgmentClient
|
|
153
153
|
const client = JudgmentClient.getInstance();
|
|
154
154
|
|
|
155
|
-
//
|
|
155
|
+
// Using pullEval
|
|
156
156
|
const results = await client.pullEval('my-project', 'my-eval-run');
|
|
157
157
|
|
|
158
|
-
//
|
|
159
|
-
const
|
|
160
|
-
|
|
161
|
-
// List all evaluation runs for a project
|
|
162
|
-
const evalRuns = await client.listEvalRuns('my-project', 100, 0); // limit=100, offset=0
|
|
163
|
-
|
|
164
|
-
// Get statistics for an evaluation run
|
|
165
|
-
const stats = await client.getEvalRunStats('my-project', 'my-eval-run');
|
|
166
|
-
|
|
167
|
-
// Export evaluation results to JSON or CSV
|
|
168
|
-
const jsonExport = await client.exportEvalResults('my-project', 'my-eval-run', 'json');
|
|
169
|
-
const csvExport = await client.exportEvalResults('my-project', 'my-eval-run', 'csv');
|
|
158
|
+
// Export evaluation results to different formats
|
|
159
|
+
const jsonData = await client.exportEvalResults('my-project', 'my-eval-run', 'json');
|
|
160
|
+
const csvData = await client.exportEvalResults('my-project', 'my-eval-run', 'csv');
|
|
170
161
|
```
|
|
171
162
|
|
|
172
163
|
The returned results include the evaluation run ID and a list of scoring results:
|
|
@@ -188,34 +179,49 @@ For a complete example of retrieving evaluation results, see `src/examples/resul
|
|
|
188
179
|
|
|
189
180
|
## Custom Scorers
|
|
190
181
|
|
|
191
|
-
You can create custom scorers by extending the `JudgevalScorer` class.
|
|
182
|
+
You can create custom scorers by extending the `JudgevalScorer` class. This implementation aligns with the Python SDK approach, making it easy to port scorers between languages.
|
|
183
|
+
|
|
184
|
+
### Creating a Custom Scorer
|
|
185
|
+
|
|
186
|
+
To create a custom scorer:
|
|
187
|
+
|
|
188
|
+
1. **Extend the JudgevalScorer class**:
|
|
192
189
|
|
|
193
190
|
```typescript
|
|
194
|
-
import { Example } from '
|
|
195
|
-
import { JudgevalScorer } from '
|
|
196
|
-
import { ScorerData } from '
|
|
191
|
+
import { Example } from 'judgeval/data/example';
|
|
192
|
+
import { JudgevalScorer } from 'judgeval/scorers/base-scorer';
|
|
193
|
+
import { ScorerData } from 'judgeval/data/result';
|
|
197
194
|
|
|
198
|
-
/**
|
|
199
|
-
* ExactMatchScorer - A custom scorer that checks if the actual output exactly matches the expected output
|
|
200
|
-
*/
|
|
201
195
|
class ExactMatchScorer extends JudgevalScorer {
|
|
202
|
-
constructor(
|
|
203
|
-
|
|
196
|
+
constructor(
|
|
197
|
+
threshold: number = 1.0,
|
|
198
|
+
additional_metadata?: Record<string, any>,
|
|
199
|
+
include_reason: boolean = true,
|
|
200
|
+
async_mode: boolean = true,
|
|
201
|
+
strict_mode: boolean = false,
|
|
202
|
+
verbose_mode: boolean = true
|
|
203
|
+
) {
|
|
204
|
+
super('exact_match', threshold, additional_metadata, include_reason, async_mode, strict_mode, verbose_mode);
|
|
204
205
|
}
|
|
205
206
|
|
|
206
207
|
async scoreExample(example: Example): Promise<ScorerData> {
|
|
207
208
|
try {
|
|
208
209
|
// Check if the example has expected output
|
|
209
210
|
if (!example.expectedOutput) {
|
|
211
|
+
this.error = "Missing expected output";
|
|
212
|
+
this.score = 0;
|
|
213
|
+
this.success = false;
|
|
214
|
+
this.reason = "Expected output is required for exact match scoring";
|
|
215
|
+
|
|
210
216
|
return {
|
|
211
217
|
name: this.type,
|
|
212
218
|
threshold: this.threshold,
|
|
213
219
|
success: false,
|
|
214
220
|
score: 0,
|
|
215
|
-
reason:
|
|
216
|
-
strict_mode:
|
|
221
|
+
reason: this.reason,
|
|
222
|
+
strict_mode: this.strict_mode,
|
|
217
223
|
evaluation_model: "exact-match",
|
|
218
|
-
error:
|
|
224
|
+
error: this.error,
|
|
219
225
|
evaluation_cost: null,
|
|
220
226
|
verbose_logs: null,
|
|
221
227
|
additional_metadata: this.additional_metadata || {}
|
|
@@ -231,35 +237,48 @@ class ExactMatchScorer extends JudgevalScorer {
|
|
|
231
237
|
this.score = isMatch ? 1 : 0;
|
|
232
238
|
|
|
233
239
|
// Generate a reason for the score
|
|
234
|
-
|
|
240
|
+
this.reason = isMatch
|
|
235
241
|
? "The actual output exactly matches the expected output."
|
|
236
242
|
: `The actual output "${actualOutput}" does not match the expected output "${expectedOutput}".`;
|
|
237
243
|
|
|
244
|
+
// Set success based on the score and threshold
|
|
245
|
+
this.success = this._successCheck();
|
|
246
|
+
|
|
247
|
+
// Generate verbose logs if verbose mode is enabled
|
|
248
|
+
if (this.verbose_mode) {
|
|
249
|
+
this.verbose_logs = `Comparing: "${actualOutput}" with "${expectedOutput}"`;
|
|
250
|
+
}
|
|
251
|
+
|
|
238
252
|
// Return the scorer data
|
|
239
253
|
return {
|
|
240
254
|
name: this.type,
|
|
241
255
|
threshold: this.threshold,
|
|
242
|
-
success: this.
|
|
256
|
+
success: this.success,
|
|
243
257
|
score: this.score,
|
|
244
|
-
reason: reason,
|
|
245
|
-
strict_mode:
|
|
258
|
+
reason: this.include_reason ? this.reason : null,
|
|
259
|
+
strict_mode: this.strict_mode,
|
|
246
260
|
evaluation_model: "exact-match",
|
|
247
261
|
error: null,
|
|
248
262
|
evaluation_cost: null,
|
|
249
|
-
verbose_logs: this.
|
|
263
|
+
verbose_logs: this.verbose_mode ? this.verbose_logs : null,
|
|
250
264
|
additional_metadata: this.additional_metadata || {}
|
|
251
265
|
};
|
|
252
266
|
} catch (error) {
|
|
253
267
|
// Handle any errors during scoring
|
|
254
268
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
255
269
|
|
|
270
|
+
this.error = errorMessage;
|
|
271
|
+
this.score = 0;
|
|
272
|
+
this.success = false;
|
|
273
|
+
this.reason = `Error during scoring: ${errorMessage}`;
|
|
274
|
+
|
|
256
275
|
return {
|
|
257
276
|
name: this.type,
|
|
258
277
|
threshold: this.threshold,
|
|
259
278
|
success: false,
|
|
260
279
|
score: 0,
|
|
261
|
-
reason:
|
|
262
|
-
strict_mode:
|
|
280
|
+
reason: this.reason,
|
|
281
|
+
strict_mode: this.strict_mode,
|
|
263
282
|
evaluation_model: "exact-match",
|
|
264
283
|
error: errorMessage,
|
|
265
284
|
evaluation_cost: null,
|
|
@@ -268,9 +287,30 @@ class ExactMatchScorer extends JudgevalScorer {
|
|
|
268
287
|
};
|
|
269
288
|
}
|
|
270
289
|
}
|
|
290
|
+
|
|
291
|
+
/**
|
|
292
|
+
* Get the name of the scorer
|
|
293
|
+
* This is equivalent to Python's __name__ property
|
|
294
|
+
*/
|
|
295
|
+
get name(): string {
|
|
296
|
+
return "Exact Match Scorer";
|
|
297
|
+
}
|
|
271
298
|
}
|
|
272
299
|
```
|
|
273
300
|
|
|
301
|
+
2. **Implement required methods**:
|
|
302
|
+
|
|
303
|
+
- `scoreExample(example: Example)`: The core method that evaluates an example and returns a score
|
|
304
|
+
- `name`: A getter property that returns the human-readable name of your scorer
|
|
305
|
+
|
|
306
|
+
3. **Set internal state**:
|
|
307
|
+
|
|
308
|
+
Your implementation should set these internal properties:
|
|
309
|
+
- `this.score`: The numerical score (typically between 0 and 1)
|
|
310
|
+
- `this.success`: Whether the example passed the evaluation
|
|
311
|
+
- `this.reason`: A human-readable explanation of the score
|
|
312
|
+
- `this.error`: Any error that occurred during scoring
|
|
313
|
+
|
|
274
314
|
### Using Custom Scorers
|
|
275
315
|
|
|
276
316
|
You can use custom scorers with the JudgmentClient just like any other scorer:
|
|
@@ -282,53 +322,40 @@ const examples = [
|
|
|
282
322
|
.input("What is the capital of France?")
|
|
283
323
|
.actualOutput("Paris is the capital of France.")
|
|
284
324
|
.expectedOutput("Paris is the capital of France.")
|
|
285
|
-
.exampleIndex(0)
|
|
286
325
|
.build(),
|
|
287
326
|
// Add more examples...
|
|
288
327
|
];
|
|
289
328
|
|
|
290
329
|
// Create a custom scorer
|
|
291
|
-
const exactMatchScorer = new ExactMatchScorer(
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
//
|
|
297
|
-
|
|
298
|
-
examples,
|
|
299
|
-
[exactMatchScorer],
|
|
300
|
-
"gpt-3.5-turbo", // Specify a valid model name
|
|
301
|
-
"my-project",
|
|
302
|
-
{
|
|
303
|
-
evalRunName: "custom-scorer-test",
|
|
304
|
-
logResults: true
|
|
305
|
-
}
|
|
330
|
+
const exactMatchScorer = new ExactMatchScorer(
|
|
331
|
+
1.0,
|
|
332
|
+
{ description: "Checks for exact string match" },
|
|
333
|
+
true, // include_reason
|
|
334
|
+
true, // async_mode
|
|
335
|
+
false, // strict_mode
|
|
336
|
+
true // verbose_mode
|
|
306
337
|
);
|
|
307
|
-
```
|
|
308
|
-
|
|
309
|
-
### Viewing Results
|
|
310
|
-
|
|
311
|
-
After running an evaluation with custom scorers, you can view the results in the Judgment platform:
|
|
312
338
|
|
|
313
|
-
|
|
314
|
-
|
|
339
|
+
// Run evaluation with the custom scorer
|
|
340
|
+
const results = await client.runEvaluation({
|
|
341
|
+
examples: examples,
|
|
342
|
+
scorers: [exactMatchScorer],
|
|
343
|
+
projectName: "my-project",
|
|
344
|
+
evalRunName: "custom-scorer-test",
|
|
345
|
+
useJudgment: false // Run locally, don't use Judgment API
|
|
346
|
+
});
|
|
315
347
|
```
|
|
316
348
|
|
|
317
|
-
|
|
349
|
+
### Custom Scorer Parameters
|
|
318
350
|
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
return r.scorersData?.every(s => s.success) ?? false;
|
|
326
|
-
}).length;
|
|
327
|
-
|
|
328
|
-
console.log(`Success rate: ${successCount}/${examples.length} (${(successCount/examples.length*100).toFixed(2)}%)`);
|
|
329
|
-
```
|
|
351
|
+
- `threshold`: The minimum score required for success (0-1 for most scorers)
|
|
352
|
+
- `additional_metadata`: Extra information to include with results
|
|
353
|
+
- `include_reason`: Whether to include a reason for the score
|
|
354
|
+
- `async_mode`: Whether to run the scorer asynchronously
|
|
355
|
+
- `strict_mode`: If true, sets threshold to 1.0 for strict evaluation
|
|
356
|
+
- `verbose_mode`: Whether to include detailed logs
|
|
330
357
|
|
|
331
|
-
For a complete example of using custom scorers, see `src/examples/custom-scorer.ts`.
|
|
358
|
+
For a complete example of creating and using custom scorers, see `src/examples/custom-scorer.ts`.
|
|
332
359
|
|
|
333
360
|
## Examples
|
|
334
361
|
|