@mastra/evals 0.1.0 → 0.1.1
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +17 -11
- package/.turbo/turbo-lint.log +4 -0
- package/CHANGELOG.md +28 -0
- package/dist/_tsup-dts-rollup.d.ts +45 -19
- package/dist/chunk-TXXJUIES.js +15 -0
- package/dist/{dist-56AYDN4X.js → dist-W3SXCXOT.js} +843 -471
- package/dist/index.js +2 -3
- package/dist/magic-string.es-5UDOWOAZ.js +20 -20
- package/dist/metrics/judge/index.d.ts +1 -0
- package/dist/metrics/judge/index.js +2 -0
- package/dist/metrics/llm/index.d.ts +1 -0
- package/dist/metrics/llm/index.js +243 -49
- package/dist/metrics/nlp/index.js +1 -1
- package/eslint.config.js +6 -0
- package/package.json +14 -5
- package/src/evaluation.ts +3 -2
- package/src/metrics/index.ts +1 -0
- package/src/metrics/judge/index.ts +1 -1
- package/src/metrics/llm/answer-relevancy/index.test.ts +2 -1
- package/src/metrics/llm/answer-relevancy/index.ts +3 -3
- package/src/metrics/llm/answer-relevancy/metricJudge.ts +9 -9
- package/src/metrics/llm/bias/index.test.ts +2 -1
- package/src/metrics/llm/bias/index.ts +5 -5
- package/src/metrics/llm/bias/metricJudge.ts +3 -3
- package/src/metrics/llm/context-position/index.test.ts +2 -1
- package/src/metrics/llm/context-position/index.ts +3 -3
- package/src/metrics/llm/context-position/metricJudge.ts +9 -9
- package/src/metrics/llm/context-precision/index.test.ts +1 -1
- package/src/metrics/llm/context-precision/index.ts +3 -3
- package/src/metrics/llm/context-precision/metricJudge.ts +9 -10
- package/src/metrics/llm/context-relevancy/index.test.ts +1 -1
- package/src/metrics/llm/context-relevancy/index.ts +2 -2
- package/src/metrics/llm/context-relevancy/metricJudge.ts +1 -1
- package/src/metrics/llm/contextual-recall/index.test.ts +1 -1
- package/src/metrics/llm/contextual-recall/index.ts +2 -2
- package/src/metrics/llm/contextual-recall/metricJudge.ts +1 -1
- package/src/metrics/llm/faithfulness/index.test.ts +1 -1
- package/src/metrics/llm/faithfulness/index.ts +2 -2
- package/src/metrics/llm/faithfulness/metricJudge.ts +1 -1
- package/src/metrics/llm/hallucination/index.test.ts +1 -1
- package/src/metrics/llm/hallucination/index.ts +2 -2
- package/src/metrics/llm/hallucination/metricJudge.ts +1 -1
- package/src/metrics/llm/index.ts +1 -0
- package/src/metrics/llm/prompt-alignment/index.test.ts +1 -1
- package/src/metrics/llm/prompt-alignment/index.ts +1 -1
- package/src/metrics/llm/prompt-alignment/metricJudge.ts +1 -1
- package/src/metrics/llm/summarization/index.test.ts +2 -1
- package/src/metrics/llm/summarization/index.ts +2 -2
- package/src/metrics/llm/summarization/metricJudge.ts +1 -1
- package/src/metrics/llm/toxicity/index.test.ts +1 -1
- package/src/metrics/llm/toxicity/index.ts +2 -2
- package/src/metrics/llm/toxicity/metricJudge.ts +3 -3
- package/src/metrics/llm/types.ts +1 -1
- package/src/metrics/nlp/completeness/index.ts +2 -1
- package/src/metrics/nlp/content-similarity/index.ts +2 -1
- package/src/metrics/nlp/keyword-coverage/index.ts +2 -1
- package/src/metrics/nlp/textual-difference/index.ts +2 -1
- package/src/metrics/nlp/tone/index.ts +2 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Metric } from '@mastra/core/eval';
|
|
2
|
-
import {
|
|
2
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { MetricResultWithReason } from '../types';
|
|
5
5
|
import { roundToTwoDecimals } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { BiasJudge } from './metricJudge';
|
|
@@ -24,10 +24,10 @@ export class BiasMetric extends Metric {
|
|
|
24
24
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|
|
25
25
|
const verdicts = await this.judge.evaluate(input, output);
|
|
26
26
|
const score = this.calculateScore(verdicts);
|
|
27
|
-
const reason = await this.judge.getReason(
|
|
27
|
+
const reason = await this.judge.getReason({
|
|
28
28
|
score,
|
|
29
|
-
verdicts.filter(Boolean).map(v => v.reason),
|
|
30
|
-
);
|
|
29
|
+
biases: verdicts.filter(Boolean).map(v => v.reason),
|
|
30
|
+
});
|
|
31
31
|
|
|
32
32
|
return {
|
|
33
33
|
score,
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
@@ -40,8 +40,8 @@ export class BiasJudge extends MastraAgentJudge {
|
|
|
40
40
|
return result.object.verdicts;
|
|
41
41
|
}
|
|
42
42
|
|
|
43
|
-
async getReason(score: number
|
|
44
|
-
const prompt = generateReasonPrompt(
|
|
43
|
+
async getReason(args: { score: number; biases: string[] }): Promise<string> {
|
|
44
|
+
const prompt = generateReasonPrompt(args);
|
|
45
45
|
const result = await this.agent.generate(prompt, {
|
|
46
46
|
output: z.object({
|
|
47
47
|
reason: z.string(),
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { createOpenAI } from '@ai-sdk/openai';
|
|
2
2
|
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { TestCaseWithContext } from '../utils';
|
|
5
|
+
import { isCloserTo } from '../utils';
|
|
5
6
|
|
|
6
7
|
import { ContextPositionMetric } from './index';
|
|
7
8
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Metric } from '@mastra/core/eval';
|
|
2
|
-
import {
|
|
2
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { MetricResultWithReason } from '../types';
|
|
5
5
|
import { roundToTwoDecimals } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { ContextPositionJudge } from './metricJudge';
|
|
@@ -27,7 +27,7 @@ export class ContextPositionMetric extends Metric {
|
|
|
27
27
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|
|
28
28
|
const verdicts = await this.judge.evaluate(input, output, this.context);
|
|
29
29
|
const score = this.calculateScore(verdicts);
|
|
30
|
-
const reason = await this.judge.getReason(input, output, score, this.scale, verdicts);
|
|
30
|
+
const reason = await this.judge.getReason({ input, output, score, scale: this.scale, verdicts });
|
|
31
31
|
|
|
32
32
|
return {
|
|
33
33
|
score,
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
@@ -34,17 +34,17 @@ export class ContextPositionJudge extends MastraAgentJudge {
|
|
|
34
34
|
return result.object.verdicts;
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
-
async getReason(
|
|
38
|
-
input: string
|
|
39
|
-
|
|
40
|
-
score: number
|
|
41
|
-
scale: number
|
|
37
|
+
async getReason(args: {
|
|
38
|
+
input: string;
|
|
39
|
+
output: string;
|
|
40
|
+
score: number;
|
|
41
|
+
scale: number;
|
|
42
42
|
verdicts: {
|
|
43
43
|
verdict: string;
|
|
44
44
|
reason: string;
|
|
45
|
-
}[]
|
|
46
|
-
): Promise<string> {
|
|
47
|
-
const prompt = generateReasonPrompt(
|
|
45
|
+
}[];
|
|
46
|
+
}): Promise<string> {
|
|
47
|
+
const prompt = generateReasonPrompt(args);
|
|
48
48
|
const result = await this.agent.generate(prompt, {
|
|
49
49
|
output: z.object({
|
|
50
50
|
reason: z.string(),
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Metric } from '@mastra/core/eval';
|
|
2
|
-
import {
|
|
2
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { MetricResultWithReason } from '../types';
|
|
5
5
|
import { roundToTwoDecimals } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { ContextPrecisionJudge } from './metricJudge';
|
|
@@ -27,7 +27,7 @@ export class ContextPrecisionMetric extends Metric {
|
|
|
27
27
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|
|
28
28
|
const verdicts = await this.judge.evaluate(input, output, this.context);
|
|
29
29
|
const score = this.calculateScore(verdicts);
|
|
30
|
-
const reason = await this.judge.getReason(input, output, score, this.scale, verdicts);
|
|
30
|
+
const reason = await this.judge.getReason({ input, output, score, scale: this.scale, verdicts });
|
|
31
31
|
|
|
32
32
|
return {
|
|
33
33
|
score,
|
|
@@ -1,9 +1,8 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
5
5
|
|
|
6
|
-
import './prompts';
|
|
7
6
|
import { CONTEXT_PRECISION_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
|
|
8
7
|
|
|
9
8
|
export class ContextPrecisionJudge extends MastraAgentJudge {
|
|
@@ -35,17 +34,17 @@ export class ContextPrecisionJudge extends MastraAgentJudge {
|
|
|
35
34
|
return result.object.verdicts;
|
|
36
35
|
}
|
|
37
36
|
|
|
38
|
-
async getReason(
|
|
39
|
-
input: string
|
|
40
|
-
|
|
41
|
-
score: number
|
|
42
|
-
scale: number
|
|
37
|
+
async getReason(args: {
|
|
38
|
+
input: string;
|
|
39
|
+
output: string;
|
|
40
|
+
score: number;
|
|
41
|
+
scale: number;
|
|
43
42
|
verdicts: {
|
|
44
43
|
verdict: string;
|
|
45
44
|
reason: string;
|
|
46
|
-
}[]
|
|
47
|
-
): Promise<string> {
|
|
48
|
-
const prompt = generateReasonPrompt(
|
|
45
|
+
}[];
|
|
46
|
+
}): Promise<string> {
|
|
47
|
+
const prompt = generateReasonPrompt(args);
|
|
49
48
|
const result = await this.agent.generate(prompt, {
|
|
50
49
|
output: z.object({
|
|
51
50
|
reason: z.string(),
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { createOpenAI } from '@ai-sdk/openai';
|
|
2
2
|
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
|
+
import type { TestCaseWithContext } from '../utils';
|
|
4
5
|
import { isCloserTo } from '../utils';
|
|
5
|
-
import { TestCaseWithContext } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { ContextRelevancyMetric } from './index';
|
|
8
8
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Metric } from '@mastra/core/eval';
|
|
2
|
-
import {
|
|
2
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { MetricResultWithReason } from '../types';
|
|
5
5
|
import { roundToTwoDecimals } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { ContextRelevancyJudge } from './metricJudge';
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { createOpenAI } from '@ai-sdk/openai';
|
|
2
2
|
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
|
+
import type { TestCaseWithContext } from '../utils';
|
|
4
5
|
import { isCloserTo } from '../utils';
|
|
5
|
-
import { TestCaseWithContext } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { ContextualRecallMetric } from './index';
|
|
8
8
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Metric } from '@mastra/core/eval';
|
|
2
|
-
import {
|
|
2
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { MetricResultWithReason } from '../types';
|
|
5
5
|
import { roundToTwoDecimals } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { ContextualRecallJudge } from './metricJudge';
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Metric } from '@mastra/core/eval';
|
|
2
|
-
import {
|
|
2
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { MetricResultWithReason } from '../types';
|
|
5
5
|
import { roundToTwoDecimals } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { FaithfulnessJudge } from './metricJudge';
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Metric } from '@mastra/core/eval';
|
|
2
|
-
import {
|
|
2
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { MetricResultWithReason } from '../types';
|
|
5
5
|
import { roundToTwoDecimals } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { HallucinationJudge } from './metricJudge';
|
package/src/metrics/llm/index.ts
CHANGED
|
@@ -2,6 +2,7 @@ export { AnswerRelevancyMetric } from './answer-relevancy';
|
|
|
2
2
|
export { ContextPositionMetric } from './context-position';
|
|
3
3
|
export { ContextPrecisionMetric } from './context-precision';
|
|
4
4
|
export { FaithfulnessMetric } from './faithfulness';
|
|
5
|
+
export { HallucinationMetric } from './hallucination';
|
|
5
6
|
export { PromptAlignmentMetric } from './prompt-alignment';
|
|
6
7
|
export { ToxicityMetric } from './toxicity';
|
|
7
8
|
export { ContextRelevancyMetric } from './context-relevancy';
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { createOpenAI } from '@ai-sdk/openai';
|
|
2
2
|
import { describe, it, expect, vi } from 'vitest';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { TestCase } from '../utils';
|
|
5
|
+
import { isCloserTo } from '../utils';
|
|
5
6
|
|
|
6
7
|
import { SummarizationMetric } from './index';
|
|
7
8
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Metric } from '@mastra/core/eval';
|
|
2
|
-
import {
|
|
2
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { MetricResultWithReason } from '../types';
|
|
5
5
|
import { roundToTwoDecimals } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { SummarizationJudge } from './metricJudge';
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Metric } from '@mastra/core/eval';
|
|
2
|
-
import {
|
|
2
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
3
3
|
|
|
4
|
-
import {
|
|
4
|
+
import type { MetricResultWithReason } from '../types';
|
|
5
5
|
import { roundToTwoDecimals } from '../utils';
|
|
6
6
|
|
|
7
7
|
import { ToxicityJudge } from './metricJudge';
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import type { LanguageModel } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
@@ -26,8 +26,8 @@ export class ToxicityJudge extends MastraAgentJudge {
|
|
|
26
26
|
return result.object.verdicts;
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
-
async getReason(
|
|
30
|
-
const prompt = getReasonPrompt(
|
|
29
|
+
async getReason(args: { score: number; toxics: string[] }): Promise<string> {
|
|
30
|
+
const prompt = getReasonPrompt(args);
|
|
31
31
|
const result = await this.agent.generate(prompt, {
|
|
32
32
|
output: z.object({
|
|
33
33
|
reason: z.string(),
|
package/src/metrics/llm/types.ts
CHANGED
|
package/src/metrics/nlp/content-similarity/index.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import type { MetricResult } from '@mastra/core/eval';
|
|
2
3
|
import stringSimilarity from 'string-similarity';
|
|
3
4
|
|
|
4
5
|
interface ContentSimilarityResult extends MetricResult {
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import type { MetricResult } from '@mastra/core/eval';
|
|
2
3
|
import keyword_extractor from 'keyword-extractor';
|
|
3
4
|
|
|
4
5
|
interface KeywordCoverageResult extends MetricResult {
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import type { MetricResult } from '@mastra/core/eval';
|
|
2
3
|
import { SequenceMatcher } from 'difflib';
|
|
3
4
|
|
|
4
5
|
interface TextualDifferenceResult extends MetricResult {
|