@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-EVBNIL5M.js +606 -0
- package/dist/chunk-EVBNIL5M.js.map +1 -0
- package/dist/chunk-XRUR5PBK.cjs +632 -0
- package/dist/chunk-XRUR5PBK.cjs.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +147 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +638 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +578 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +171 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
# Prompt
|
|
1
|
+
# Prompt alignment scorer
|
|
2
2
|
|
|
3
3
|
The `createPromptAlignmentScorerLLM()` function creates a scorer that evaluates how well agent responses align with user prompts across multiple dimensions: intent understanding, requirement fulfillment, response completeness, and format appropriateness.
|
|
4
4
|
|
|
5
5
|
## Parameters
|
|
6
6
|
|
|
7
|
-
**model
|
|
7
|
+
**model** (`MastraModelConfig`): The language model to use for evaluating prompt-response alignment
|
|
8
8
|
|
|
9
|
-
**options
|
|
9
|
+
**options** (`PromptAlignmentOptions`): Configuration options for the scorer
|
|
10
10
|
|
|
11
|
-
##
|
|
11
|
+
## `.run()` returns
|
|
12
12
|
|
|
13
|
-
**score
|
|
13
|
+
**score** (`number`): Multi-dimensional alignment score between 0 and scale (default 0-1)
|
|
14
14
|
|
|
15
|
-
**reason
|
|
15
|
+
**reason** (`string`): Human-readable explanation of the prompt alignment evaluation with detailed breakdown
|
|
16
16
|
|
|
17
17
|
`.run()` returns a result in the following shape:
|
|
18
18
|
|
|
@@ -52,7 +52,7 @@ The `createPromptAlignmentScorerLLM()` function creates a scorer that evaluates
|
|
|
52
52
|
}
|
|
53
53
|
```
|
|
54
54
|
|
|
55
|
-
## Scoring
|
|
55
|
+
## Scoring details
|
|
56
56
|
|
|
57
57
|
### Scorer configuration
|
|
58
58
|
|
|
@@ -60,12 +60,12 @@ You can customize the Prompt Alignment Scorer by adjusting the scale parameter a
|
|
|
60
60
|
|
|
61
61
|
```typescript
|
|
62
62
|
const scorer = createPromptAlignmentScorerLLM({
|
|
63
|
-
model:
|
|
63
|
+
model: 'openai/gpt-5.4',
|
|
64
64
|
options: {
|
|
65
65
|
scale: 10, // Score from 0-10 instead of 0-1
|
|
66
|
-
evaluationMode:
|
|
66
|
+
evaluationMode: 'both', // 'user', 'system', or 'both' (default)
|
|
67
67
|
},
|
|
68
|
-
})
|
|
68
|
+
})
|
|
69
69
|
```
|
|
70
70
|
|
|
71
71
|
### Multi-Dimensional Analysis
|
|
@@ -163,7 +163,7 @@ Final Score = Weighted Score × scale
|
|
|
163
163
|
- Production monitoring where both user and system requirements matter
|
|
164
164
|
- Holistic assessment of prompt-response alignment
|
|
165
165
|
|
|
166
|
-
## Common
|
|
166
|
+
## Common use cases
|
|
167
167
|
|
|
168
168
|
### Code Generation Evaluation
|
|
169
169
|
|
|
@@ -176,8 +176,7 @@ Ideal for evaluating:
|
|
|
176
176
|
|
|
177
177
|
```typescript
|
|
178
178
|
// Example: API endpoint creation
|
|
179
|
-
const codePrompt =
|
|
180
|
-
"Create a REST API endpoint with authentication and rate limiting";
|
|
179
|
+
const codePrompt = 'Create a REST API endpoint with authentication and rate limiting'
|
|
181
180
|
// Scorer evaluates: intent (API creation), requirements (auth + rate limiting),
|
|
182
181
|
// completeness (full implementation), format (code structure)
|
|
183
182
|
```
|
|
@@ -194,7 +193,7 @@ Perfect for:
|
|
|
194
193
|
```typescript
|
|
195
194
|
// Example: Multi-requirement task
|
|
196
195
|
const taskPrompt =
|
|
197
|
-
|
|
196
|
+
'Write a Python class with initialization, validation, error handling, and documentation'
|
|
198
197
|
// Scorer tracks each requirement individually and provides detailed breakdown
|
|
199
198
|
```
|
|
200
199
|
|
|
@@ -210,7 +209,7 @@ Useful for:
|
|
|
210
209
|
```typescript
|
|
211
210
|
// Example: Structured output
|
|
212
211
|
const formatPrompt =
|
|
213
|
-
|
|
212
|
+
'Explain the differences between let and const in JavaScript using bullet points'
|
|
214
213
|
// Scorer evaluates content accuracy AND format compliance
|
|
215
214
|
```
|
|
216
215
|
|
|
@@ -220,31 +219,30 @@ Measure how well your AI agents follow user instructions:
|
|
|
220
219
|
|
|
221
220
|
```typescript
|
|
222
221
|
const agent = new Agent({
|
|
223
|
-
name:
|
|
224
|
-
instructions:
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
});
|
|
222
|
+
name: 'CodingAssistant',
|
|
223
|
+
instructions: 'You are a helpful coding assistant. Always provide working code examples.',
|
|
224
|
+
model: 'openai/gpt-5.4',
|
|
225
|
+
})
|
|
228
226
|
|
|
229
227
|
// Evaluate comprehensive alignment (default)
|
|
230
228
|
const scorer = createPromptAlignmentScorerLLM({
|
|
231
|
-
model:
|
|
232
|
-
options: { evaluationMode:
|
|
233
|
-
})
|
|
229
|
+
model: 'openai/gpt-5.4',
|
|
230
|
+
options: { evaluationMode: 'both' }, // Evaluates both user intent and system guidelines
|
|
231
|
+
})
|
|
234
232
|
|
|
235
233
|
// Evaluate just user satisfaction
|
|
236
234
|
const userScorer = createPromptAlignmentScorerLLM({
|
|
237
|
-
model:
|
|
238
|
-
options: { evaluationMode:
|
|
239
|
-
})
|
|
235
|
+
model: 'openai/gpt-5.4',
|
|
236
|
+
options: { evaluationMode: 'user' }, // Focus only on user request fulfillment
|
|
237
|
+
})
|
|
240
238
|
|
|
241
239
|
// Evaluate system compliance
|
|
242
240
|
const systemScorer = createPromptAlignmentScorerLLM({
|
|
243
|
-
model:
|
|
244
|
-
options: { evaluationMode:
|
|
245
|
-
})
|
|
241
|
+
model: 'openai/gpt-5.4',
|
|
242
|
+
options: { evaluationMode: 'system' }, // Check adherence to system instructions
|
|
243
|
+
})
|
|
246
244
|
|
|
247
|
-
const result = await scorer.run(agentRun)
|
|
245
|
+
const result = await scorer.run(agentRun)
|
|
248
246
|
```
|
|
249
247
|
|
|
250
248
|
### Prompt Engineering Optimization
|
|
@@ -253,15 +251,15 @@ Test different prompts to improve alignment:
|
|
|
253
251
|
|
|
254
252
|
```typescript
|
|
255
253
|
const prompts = [
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
]
|
|
254
|
+
'Write a function to calculate factorial',
|
|
255
|
+
'Create a Python function that calculates factorial with error handling for negative inputs',
|
|
256
|
+
'Implement a factorial calculator in Python with: input validation, error handling, and docstring',
|
|
257
|
+
]
|
|
260
258
|
|
|
261
259
|
// Compare alignment scores to find the best prompt
|
|
262
260
|
for (const prompt of prompts) {
|
|
263
|
-
const result = await scorer.run(createTestRun(prompt, response))
|
|
264
|
-
console.log(`Prompt alignment: ${result.score}`)
|
|
261
|
+
const result = await scorer.run(createTestRun(prompt, response))
|
|
262
|
+
console.log(`Prompt alignment: ${result.score}`)
|
|
265
263
|
}
|
|
266
264
|
```
|
|
267
265
|
|
|
@@ -289,23 +287,22 @@ for (const agent of agents) {
|
|
|
289
287
|
### Basic Configuration
|
|
290
288
|
|
|
291
289
|
```typescript
|
|
292
|
-
import { createPromptAlignmentScorerLLM } from
|
|
290
|
+
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
293
291
|
|
|
294
292
|
const scorer = createPromptAlignmentScorerLLM({
|
|
295
|
-
model:
|
|
296
|
-
})
|
|
293
|
+
model: 'openai/gpt-5.4',
|
|
294
|
+
})
|
|
297
295
|
|
|
298
296
|
// Evaluate a code generation task
|
|
299
297
|
const result = await scorer.run({
|
|
300
298
|
input: [
|
|
301
299
|
{
|
|
302
|
-
role:
|
|
303
|
-
content:
|
|
304
|
-
"Write a Python function to calculate factorial with error handling",
|
|
300
|
+
role: 'user',
|
|
301
|
+
content: 'Write a Python function to calculate factorial with error handling',
|
|
305
302
|
},
|
|
306
303
|
],
|
|
307
304
|
output: {
|
|
308
|
-
role:
|
|
305
|
+
role: 'assistant',
|
|
309
306
|
text: `def factorial(n):
|
|
310
307
|
if n < 0:
|
|
311
308
|
raise ValueError("Factorial not defined for negative numbers")
|
|
@@ -313,7 +310,7 @@ const result = await scorer.run({
|
|
|
313
310
|
return 1
|
|
314
311
|
return n * factorial(n-1)`,
|
|
315
312
|
},
|
|
316
|
-
})
|
|
313
|
+
})
|
|
317
314
|
// Result: { score: 0.95, reason: "Excellent alignment - function addresses intent, includes error handling..." }
|
|
318
315
|
```
|
|
319
316
|
|
|
@@ -322,26 +319,26 @@ const result = await scorer.run({
|
|
|
322
319
|
```typescript
|
|
323
320
|
// Configure scale and evaluation mode
|
|
324
321
|
const scorer = createPromptAlignmentScorerLLM({
|
|
325
|
-
model:
|
|
322
|
+
model: 'openai/gpt-5.4',
|
|
326
323
|
options: {
|
|
327
324
|
scale: 10, // Score from 0-10 instead of 0-1
|
|
328
|
-
evaluationMode:
|
|
325
|
+
evaluationMode: 'both', // 'user', 'system', or 'both' (default)
|
|
329
326
|
},
|
|
330
|
-
})
|
|
327
|
+
})
|
|
331
328
|
|
|
332
329
|
// User-only evaluation - focus on user satisfaction
|
|
333
330
|
const userScorer = createPromptAlignmentScorerLLM({
|
|
334
|
-
model:
|
|
335
|
-
options: { evaluationMode:
|
|
336
|
-
})
|
|
331
|
+
model: 'openai/gpt-5.4',
|
|
332
|
+
options: { evaluationMode: 'user' },
|
|
333
|
+
})
|
|
337
334
|
|
|
338
335
|
// System-only evaluation - focus on compliance
|
|
339
336
|
const systemScorer = createPromptAlignmentScorerLLM({
|
|
340
|
-
model:
|
|
341
|
-
options: { evaluationMode:
|
|
342
|
-
})
|
|
337
|
+
model: 'openai/gpt-5.4',
|
|
338
|
+
options: { evaluationMode: 'system' },
|
|
339
|
+
})
|
|
343
340
|
|
|
344
|
-
const result = await scorer.run(testRun)
|
|
341
|
+
const result = await scorer.run(testRun)
|
|
345
342
|
// Result: { score: 8.5, reason: "Score: 8.5 out of 10 - Good alignment with both user intent and system guidelines..." }
|
|
346
343
|
```
|
|
347
344
|
|
|
@@ -352,15 +349,15 @@ const result = await scorer.run(testRun);
|
|
|
352
349
|
const result = await scorer.run({
|
|
353
350
|
input: [
|
|
354
351
|
{
|
|
355
|
-
role:
|
|
356
|
-
content:
|
|
352
|
+
role: 'user',
|
|
353
|
+
content: 'List the benefits of TypeScript in bullet points',
|
|
357
354
|
},
|
|
358
355
|
],
|
|
359
356
|
output: {
|
|
360
|
-
role:
|
|
361
|
-
text:
|
|
357
|
+
role: 'assistant',
|
|
358
|
+
text: 'TypeScript provides static typing, better IDE support, and enhanced code reliability.',
|
|
362
359
|
},
|
|
363
|
-
})
|
|
360
|
+
})
|
|
364
361
|
// Result: Lower appropriateness score due to format mismatch (paragraph vs bullet points)
|
|
365
362
|
```
|
|
366
363
|
|
|
@@ -369,19 +366,19 @@ const result = await scorer.run({
|
|
|
369
366
|
In this example, the response fully addresses the user's prompt with all requirements met.
|
|
370
367
|
|
|
371
368
|
```typescript
|
|
372
|
-
import { createPromptAlignmentScorerLLM } from
|
|
369
|
+
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
373
370
|
|
|
374
371
|
const scorer = createPromptAlignmentScorerLLM({
|
|
375
|
-
model:
|
|
376
|
-
})
|
|
372
|
+
model: 'openai/gpt-5.4',
|
|
373
|
+
})
|
|
377
374
|
|
|
378
375
|
const inputMessages = [
|
|
379
376
|
{
|
|
380
|
-
role:
|
|
377
|
+
role: 'user',
|
|
381
378
|
content:
|
|
382
|
-
|
|
379
|
+
'Write a Python function to calculate factorial with error handling for negative numbers',
|
|
383
380
|
},
|
|
384
|
-
]
|
|
381
|
+
]
|
|
385
382
|
|
|
386
383
|
const outputMessage = {
|
|
387
384
|
text: `def factorial(n):
|
|
@@ -391,14 +388,14 @@ const outputMessage = {
|
|
|
391
388
|
if n == 0 or n == 1:
|
|
392
389
|
return 1
|
|
393
390
|
return n * factorial(n - 1)`,
|
|
394
|
-
}
|
|
391
|
+
}
|
|
395
392
|
|
|
396
393
|
const result = await scorer.run({
|
|
397
394
|
input: inputMessages,
|
|
398
395
|
output: outputMessage,
|
|
399
|
-
})
|
|
396
|
+
})
|
|
400
397
|
|
|
401
|
-
console.log(result)
|
|
398
|
+
console.log(result)
|
|
402
399
|
```
|
|
403
400
|
|
|
404
401
|
### Excellent alignment output
|
|
@@ -417,29 +414,29 @@ The output receives a high score because it perfectly addresses the intent, fulf
|
|
|
417
414
|
In this example, the response addresses the core intent but misses some requirements or has format issues.
|
|
418
415
|
|
|
419
416
|
```typescript
|
|
420
|
-
import { createPromptAlignmentScorerLLM } from
|
|
417
|
+
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
421
418
|
|
|
422
419
|
const scorer = createPromptAlignmentScorerLLM({
|
|
423
|
-
model:
|
|
424
|
-
})
|
|
420
|
+
model: 'openai/gpt-5.4',
|
|
421
|
+
})
|
|
425
422
|
|
|
426
423
|
const inputMessages = [
|
|
427
424
|
{
|
|
428
|
-
role:
|
|
429
|
-
content:
|
|
425
|
+
role: 'user',
|
|
426
|
+
content: 'List the benefits of TypeScript in bullet points',
|
|
430
427
|
},
|
|
431
|
-
]
|
|
428
|
+
]
|
|
432
429
|
|
|
433
430
|
const outputMessage = {
|
|
434
|
-
text:
|
|
435
|
-
}
|
|
431
|
+
text: 'TypeScript provides static typing, better IDE support, and enhanced code reliability through compile-time error checking.',
|
|
432
|
+
}
|
|
436
433
|
|
|
437
434
|
const result = await scorer.run({
|
|
438
435
|
input: inputMessages,
|
|
439
436
|
output: outputMessage,
|
|
440
|
-
})
|
|
437
|
+
})
|
|
441
438
|
|
|
442
|
-
console.log(result)
|
|
439
|
+
console.log(result)
|
|
443
440
|
```
|
|
444
441
|
|
|
445
442
|
### Partial alignment output
|
|
@@ -458,32 +455,32 @@ The output receives a lower score because while the content is accurate, it does
|
|
|
458
455
|
In this example, the response fails to address the user's specific requirements.
|
|
459
456
|
|
|
460
457
|
```typescript
|
|
461
|
-
import { createPromptAlignmentScorerLLM } from
|
|
458
|
+
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
462
459
|
|
|
463
460
|
const scorer = createPromptAlignmentScorerLLM({
|
|
464
|
-
model:
|
|
465
|
-
})
|
|
461
|
+
model: 'openai/gpt-5.4',
|
|
462
|
+
})
|
|
466
463
|
|
|
467
464
|
const inputMessages = [
|
|
468
465
|
{
|
|
469
|
-
role:
|
|
466
|
+
role: 'user',
|
|
470
467
|
content:
|
|
471
|
-
|
|
468
|
+
'Write a Python class with initialization, validation, error handling, and documentation',
|
|
472
469
|
},
|
|
473
|
-
]
|
|
470
|
+
]
|
|
474
471
|
|
|
475
472
|
const outputMessage = {
|
|
476
473
|
text: `class Example:
|
|
477
474
|
def __init__(self, value):
|
|
478
475
|
self.value = value`,
|
|
479
|
-
}
|
|
476
|
+
}
|
|
480
477
|
|
|
481
478
|
const result = await scorer.run({
|
|
482
479
|
input: inputMessages,
|
|
483
480
|
output: outputMessage,
|
|
484
|
-
})
|
|
481
|
+
})
|
|
485
482
|
|
|
486
|
-
console.log(result)
|
|
483
|
+
console.log(result)
|
|
487
484
|
```
|
|
488
485
|
|
|
489
486
|
### Poor alignment output
|
|
@@ -505,29 +502,29 @@ Evaluates how well the response addresses the user's request, ignoring system in
|
|
|
505
502
|
|
|
506
503
|
```typescript
|
|
507
504
|
const scorer = createPromptAlignmentScorerLLM({
|
|
508
|
-
model:
|
|
509
|
-
options: { evaluationMode:
|
|
510
|
-
})
|
|
505
|
+
model: 'openai/gpt-5.4',
|
|
506
|
+
options: { evaluationMode: 'user' },
|
|
507
|
+
})
|
|
511
508
|
|
|
512
509
|
const result = await scorer.run({
|
|
513
510
|
input: {
|
|
514
511
|
inputMessages: [
|
|
515
512
|
{
|
|
516
|
-
role:
|
|
517
|
-
content:
|
|
513
|
+
role: 'user',
|
|
514
|
+
content: 'Explain recursion with an example',
|
|
518
515
|
},
|
|
519
516
|
],
|
|
520
517
|
systemMessages: [
|
|
521
518
|
{
|
|
522
|
-
role:
|
|
523
|
-
content:
|
|
519
|
+
role: 'system',
|
|
520
|
+
content: 'Always provide code examples in Python',
|
|
524
521
|
},
|
|
525
522
|
],
|
|
526
523
|
},
|
|
527
524
|
output: {
|
|
528
|
-
text:
|
|
525
|
+
text: 'Recursion is when a function calls itself. For example: factorial(5) = 5 * factorial(4)',
|
|
529
526
|
},
|
|
530
|
-
})
|
|
527
|
+
})
|
|
531
528
|
// Scores high for addressing user request, even without Python code
|
|
532
529
|
```
|
|
533
530
|
|
|
@@ -537,30 +534,29 @@ Evaluates compliance with system behavioral guidelines and constraints:
|
|
|
537
534
|
|
|
538
535
|
```typescript
|
|
539
536
|
const scorer = createPromptAlignmentScorerLLM({
|
|
540
|
-
model:
|
|
541
|
-
options: { evaluationMode:
|
|
542
|
-
})
|
|
537
|
+
model: 'openai/gpt-5.4',
|
|
538
|
+
options: { evaluationMode: 'system' },
|
|
539
|
+
})
|
|
543
540
|
|
|
544
541
|
const result = await scorer.run({
|
|
545
542
|
input: {
|
|
546
543
|
systemMessages: [
|
|
547
544
|
{
|
|
548
|
-
role:
|
|
549
|
-
content:
|
|
550
|
-
"You are a helpful assistant. Always be polite, concise, and provide examples.",
|
|
545
|
+
role: 'system',
|
|
546
|
+
content: 'You are a helpful assistant. Always be polite, concise, and provide examples.',
|
|
551
547
|
},
|
|
552
548
|
],
|
|
553
549
|
inputMessages: [
|
|
554
550
|
{
|
|
555
|
-
role:
|
|
556
|
-
content:
|
|
551
|
+
role: 'user',
|
|
552
|
+
content: 'What is machine learning?',
|
|
557
553
|
},
|
|
558
554
|
],
|
|
559
555
|
},
|
|
560
556
|
output: {
|
|
561
|
-
text:
|
|
557
|
+
text: 'Machine learning is a subset of AI where computers learn from data. For example, spam filters learn to identify unwanted emails by analyzing patterns in previously marked spam.',
|
|
562
558
|
},
|
|
563
|
-
})
|
|
559
|
+
})
|
|
564
560
|
// Evaluates politeness, conciseness, and example provision
|
|
565
561
|
```
|
|
566
562
|
|
|
@@ -570,23 +566,22 @@ Evaluates both user intent fulfillment and system compliance with weighted scori
|
|
|
570
566
|
|
|
571
567
|
```typescript
|
|
572
568
|
const scorer = createPromptAlignmentScorerLLM({
|
|
573
|
-
model:
|
|
574
|
-
options: { evaluationMode:
|
|
575
|
-
})
|
|
569
|
+
model: 'openai/gpt-5.4',
|
|
570
|
+
options: { evaluationMode: 'both' }, // This is the default
|
|
571
|
+
})
|
|
576
572
|
|
|
577
573
|
const result = await scorer.run({
|
|
578
574
|
input: {
|
|
579
575
|
systemMessages: [
|
|
580
576
|
{
|
|
581
|
-
role:
|
|
582
|
-
content:
|
|
583
|
-
"Always provide code examples when explaining programming concepts",
|
|
577
|
+
role: 'system',
|
|
578
|
+
content: 'Always provide code examples when explaining programming concepts',
|
|
584
579
|
},
|
|
585
580
|
],
|
|
586
581
|
inputMessages: [
|
|
587
582
|
{
|
|
588
|
-
role:
|
|
589
|
-
content:
|
|
583
|
+
role: 'user',
|
|
584
|
+
content: 'Explain how to reverse a string',
|
|
590
585
|
},
|
|
591
586
|
],
|
|
592
587
|
},
|
|
@@ -598,11 +593,11 @@ const result = await scorer.run({
|
|
|
598
593
|
|
|
599
594
|
# Usage: reverse_string("hello") returns "olleh"`,
|
|
600
595
|
},
|
|
601
|
-
})
|
|
596
|
+
})
|
|
602
597
|
// High score for both addressing the user's request AND following system guidelines
|
|
603
598
|
```
|
|
604
599
|
|
|
605
|
-
## Comparison with
|
|
600
|
+
## Comparison with other scorers
|
|
606
601
|
|
|
607
602
|
| Aspect | Prompt Alignment | Answer Relevancy | Faithfulness |
|
|
608
603
|
| -------------- | ------------------------------------------ | ---------------------------- | -------------------------------- |
|