@mastra/evals 0.1.0-alpha.30 → 0.1.0-alpha.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/dist/{dist-56AYDN4X.js → dist-XPBCCWOM.js} +8 -8
- package/dist/index.js +1 -1
- package/dist/metrics/llm/index.d.ts +12 -11
- package/dist/metrics/llm/index.js +51 -49
- package/package.json +4 -3
- package/src/evaluation.test.ts +4 -6
- package/src/metrics/judge/index.ts +5 -4
- package/src/metrics/llm/answer-relevancy/index.test.ts +4 -7
- package/src/metrics/llm/answer-relevancy/index.ts +4 -3
- package/src/metrics/llm/answer-relevancy/metricJudge.ts +3 -3
- package/src/metrics/llm/bias/index.test.ts +4 -6
- package/src/metrics/llm/bias/index.ts +4 -3
- package/src/metrics/llm/bias/metricJudge.ts +3 -3
- package/src/metrics/llm/context-position/index.test.ts +15 -17
- package/src/metrics/llm/context-position/index.ts +6 -4
- package/src/metrics/llm/context-position/metricJudge.ts +3 -3
- package/src/metrics/llm/context-precision/index.test.ts +13 -15
- package/src/metrics/llm/context-precision/index.ts +6 -4
- package/src/metrics/llm/context-precision/metricJudge.ts +3 -3
- package/src/metrics/llm/context-relevancy/index.test.ts +7 -9
- package/src/metrics/llm/context-relevancy/index.ts +6 -4
- package/src/metrics/llm/context-relevancy/metricJudge.ts +3 -3
- package/src/metrics/llm/contextual-recall/index.test.ts +6 -8
- package/src/metrics/llm/contextual-recall/index.ts +6 -4
- package/src/metrics/llm/contextual-recall/metricJudge.ts +3 -3
- package/src/metrics/llm/faithfulness/index.test.ts +15 -17
- package/src/metrics/llm/faithfulness/index.ts +6 -4
- package/src/metrics/llm/faithfulness/metricJudge.ts +3 -3
- package/src/metrics/llm/hallucination/index.test.ts +15 -19
- package/src/metrics/llm/hallucination/index.ts +7 -5
- package/src/metrics/llm/hallucination/metricJudge.ts +3 -3
- package/src/metrics/llm/prompt-alignment/index.test.ts +9 -11
- package/src/metrics/llm/prompt-alignment/index.ts +4 -3
- package/src/metrics/llm/prompt-alignment/metricJudge.ts +3 -3
- package/src/metrics/llm/summarization/index.test.ts +4 -6
- package/src/metrics/llm/summarization/index.ts +4 -3
- package/src/metrics/llm/summarization/metricJudge.ts +3 -3
- package/src/metrics/llm/toxicity/index.test.ts +4 -6
- package/src/metrics/llm/toxicity/index.ts +4 -3
- package/src/metrics/llm/toxicity/metricJudge.ts +3 -3
- package/src/metrics/llm/types.ts +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,32 @@
|
|
|
1
1
|
# @mastra/evals
|
|
2
2
|
|
|
3
|
+
## 0.1.0-alpha.33
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- Updated dependencies [d7d465a]
|
|
8
|
+
- Updated dependencies [d7d465a]
|
|
9
|
+
- Updated dependencies [2017553]
|
|
10
|
+
- Updated dependencies [a10b7a3]
|
|
11
|
+
- Updated dependencies [16e5b04]
|
|
12
|
+
- @mastra/core@0.2.0-alpha.91
|
|
13
|
+
|
|
14
|
+
## 0.1.0-alpha.32
|
|
15
|
+
|
|
16
|
+
### Patch Changes
|
|
17
|
+
|
|
18
|
+
- Updated dependencies [8151f44]
|
|
19
|
+
- Updated dependencies [e897f1c]
|
|
20
|
+
- Updated dependencies [3700be1]
|
|
21
|
+
- @mastra/core@0.2.0-alpha.90
|
|
22
|
+
|
|
23
|
+
## 0.1.0-alpha.31
|
|
24
|
+
|
|
25
|
+
### Patch Changes
|
|
26
|
+
|
|
27
|
+
- Updated dependencies [27275c9]
|
|
28
|
+
- @mastra/core@0.2.0-alpha.89
|
|
29
|
+
|
|
3
30
|
## 0.1.0-alpha.30
|
|
4
31
|
|
|
5
32
|
### Patch Changes
|
|
@@ -12068,7 +12068,7 @@ function createTestHook(name, handler) {
|
|
|
12068
12068
|
globalThis.performance ? globalThis.performance.now.bind(globalThis.performance) : Date.now;
|
|
12069
12069
|
globalThis.performance ? globalThis.performance.now.bind(globalThis.performance) : Date.now;
|
|
12070
12070
|
|
|
12071
|
-
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@
|
|
12071
|
+
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/utils.C8RiOc4B.js
|
|
12072
12072
|
var NAME_WORKER_STATE = "__vitest_worker__";
|
|
12073
12073
|
function getWorkerState() {
|
|
12074
12074
|
const workerState = globalThis[NAME_WORKER_STATE];
|
|
@@ -12128,7 +12128,7 @@ async function waitForImportsToResolve() {
|
|
|
12128
12128
|
await waitForImportsToResolve();
|
|
12129
12129
|
}
|
|
12130
12130
|
|
|
12131
|
-
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@
|
|
12131
|
+
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/_commonjsHelpers.BFTU3MAI.js
|
|
12132
12132
|
var commonjsGlobal = typeof globalThis !== "undefined" ? globalThis : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : {};
|
|
12133
12133
|
function getDefaultExportFromCjs3(x2) {
|
|
12134
12134
|
return x2 && x2.__esModule && Object.prototype.hasOwnProperty.call(x2, "default") ? x2["default"] : x2;
|
|
@@ -14197,7 +14197,7 @@ var SnapshotClient = class {
|
|
|
14197
14197
|
}
|
|
14198
14198
|
};
|
|
14199
14199
|
|
|
14200
|
-
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@
|
|
14200
|
+
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/date.W2xKR2qe.js
|
|
14201
14201
|
var RealDate = Date;
|
|
14202
14202
|
var now2 = null;
|
|
14203
14203
|
var MockDate = class _MockDate extends RealDate {
|
|
@@ -14250,7 +14250,7 @@ function resetDate() {
|
|
|
14250
14250
|
globalThis.Date = RealDate;
|
|
14251
14251
|
}
|
|
14252
14252
|
|
|
14253
|
-
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@
|
|
14253
|
+
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/vi.CjhMlMwf.js
|
|
14254
14254
|
var unsupported = [
|
|
14255
14255
|
// .poll is meant to retry matchers until they succeed, and
|
|
14256
14256
|
// snapshots will always succeed as long as the poll method doesn't throw an error
|
|
@@ -17214,7 +17214,7 @@ function getImporter(name) {
|
|
|
17214
17214
|
return stack?.file || "";
|
|
17215
17215
|
}
|
|
17216
17216
|
|
|
17217
|
-
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@
|
|
17217
|
+
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/run-once.2ogXb3JV.js
|
|
17218
17218
|
var filesCount = /* @__PURE__ */ new Map();
|
|
17219
17219
|
var cache = /* @__PURE__ */ new Map();
|
|
17220
17220
|
function runOnce(fn2, key) {
|
|
@@ -17237,7 +17237,7 @@ function isFirstRun() {
|
|
|
17237
17237
|
return firstRun;
|
|
17238
17238
|
}
|
|
17239
17239
|
|
|
17240
|
-
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@
|
|
17240
|
+
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/benchmark.Cdu9hjj4.js
|
|
17241
17241
|
var benchFns = /* @__PURE__ */ new WeakMap();
|
|
17242
17242
|
var benchOptsMap = /* @__PURE__ */ new WeakMap();
|
|
17243
17243
|
var bench = createBenchmark(function(name, fn2 = noop, options = {}) {
|
|
@@ -17266,7 +17266,7 @@ function formatName2(name) {
|
|
|
17266
17266
|
return typeof name === "string" ? name : name instanceof Function ? name.name || "<anonymous>" : String(name);
|
|
17267
17267
|
}
|
|
17268
17268
|
|
|
17269
|
-
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@
|
|
17269
|
+
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/index.Bf4FgyZN.js
|
|
17270
17270
|
__toESM(require_dist(), 1);
|
|
17271
17271
|
function getRunningMode() {
|
|
17272
17272
|
return process.env.VITEST_MODE === "WATCH" ? "watch" : "run";
|
|
@@ -17277,7 +17277,7 @@ function isWatchMode() {
|
|
|
17277
17277
|
var assertType = function assertType2() {
|
|
17278
17278
|
};
|
|
17279
17279
|
|
|
17280
|
-
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@
|
|
17280
|
+
// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/index.js
|
|
17281
17281
|
var import_expect_type2 = __toESM(require_dist(), 1);
|
|
17282
17282
|
var export_expectTypeOf = import_expect_type2.expectTypeOf;
|
|
17283
17283
|
/*! Bundled license information:
|
package/dist/index.js
CHANGED
|
@@ -39,7 +39,7 @@ var getCurrentTestInfo = async () => {
|
|
|
39
39
|
};
|
|
40
40
|
}
|
|
41
41
|
try {
|
|
42
|
-
const vitest = await import('./dist-56AYDN4X.js');
|
|
42
|
+
const vitest = await import('./dist-XPBCCWOM.js');
|
|
43
43
|
if (typeof vitest !== "undefined" && vitest.expect?.getState) {
|
|
44
44
|
const state = vitest.expect.getState();
|
|
45
45
|
return {
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { MetricResult, Metric
|
|
1
|
+
import { MetricResult, Metric } from '@mastra/core/eval';
|
|
2
|
+
import { MastraLLMBase } from '@mastra/core/llm';
|
|
2
3
|
|
|
3
4
|
interface MetricResultWithReason extends MetricResult {
|
|
4
5
|
info: {
|
|
@@ -14,7 +15,7 @@ declare class AnswerRelevancyMetric extends Metric {
|
|
|
14
15
|
private judge;
|
|
15
16
|
private uncertaintyWeight;
|
|
16
17
|
private scale;
|
|
17
|
-
constructor(
|
|
18
|
+
constructor(llm: MastraLLMBase, { uncertaintyWeight, scale }?: AnswerRelevancyMetricOptions);
|
|
18
19
|
measure(input: string, output: string): Promise<MetricResultWithReason>;
|
|
19
20
|
private calculateScore;
|
|
20
21
|
}
|
|
@@ -27,7 +28,7 @@ declare class ContextPositionMetric extends Metric {
|
|
|
27
28
|
private judge;
|
|
28
29
|
private scale;
|
|
29
30
|
private context;
|
|
30
|
-
constructor(
|
|
31
|
+
constructor(llm: MastraLLMBase, { scale, context }: ContextPositionMetricOptions);
|
|
31
32
|
measure(input: string, output: string): Promise<MetricResultWithReason>;
|
|
32
33
|
private calculateScore;
|
|
33
34
|
}
|
|
@@ -40,7 +41,7 @@ declare class ContextPrecisionMetric extends Metric {
|
|
|
40
41
|
private judge;
|
|
41
42
|
private scale;
|
|
42
43
|
private context;
|
|
43
|
-
constructor(
|
|
44
|
+
constructor(llm: MastraLLMBase, { scale, context }: ContextPrecisionMetricOptions);
|
|
44
45
|
measure(input: string, output: string): Promise<MetricResultWithReason>;
|
|
45
46
|
private calculateScore;
|
|
46
47
|
}
|
|
@@ -53,7 +54,7 @@ declare class FaithfulnessMetric extends Metric {
|
|
|
53
54
|
private judge;
|
|
54
55
|
private scale;
|
|
55
56
|
private context;
|
|
56
|
-
constructor(
|
|
57
|
+
constructor(llm: MastraLLMBase, { scale, context }: FaithfulnessMetricOptions);
|
|
57
58
|
measure(input: string, output: string): Promise<MetricResultWithReason>;
|
|
58
59
|
private calculateScore;
|
|
59
60
|
}
|
|
@@ -66,7 +67,7 @@ declare class PromptAlignmentMetric extends Metric {
|
|
|
66
67
|
private instructions;
|
|
67
68
|
private judge;
|
|
68
69
|
private scale;
|
|
69
|
-
constructor(
|
|
70
|
+
constructor(llm: MastraLLMBase, { instructions, scale }: PromptAlignmentMetricOptions);
|
|
70
71
|
measure(input: string, output: string): Promise<MetricResultWithReason>;
|
|
71
72
|
private calculateScore;
|
|
72
73
|
}
|
|
@@ -77,7 +78,7 @@ interface ToxicityMetricOptions {
|
|
|
77
78
|
declare class ToxicityMetric extends Metric {
|
|
78
79
|
private judge;
|
|
79
80
|
private scale;
|
|
80
|
-
constructor(
|
|
81
|
+
constructor(llm: MastraLLMBase, { scale }?: ToxicityMetricOptions);
|
|
81
82
|
measure(input: string, output: string): Promise<MetricResultWithReason>;
|
|
82
83
|
private calculateScore;
|
|
83
84
|
}
|
|
@@ -90,7 +91,7 @@ declare class ContextRelevancyMetric extends Metric {
|
|
|
90
91
|
private judge;
|
|
91
92
|
private scale;
|
|
92
93
|
private context;
|
|
93
|
-
constructor(
|
|
94
|
+
constructor(llm: MastraLLMBase, { scale, context }: ContextRelevancyOptions);
|
|
94
95
|
measure(input: string, output: string): Promise<MetricResultWithReason>;
|
|
95
96
|
private calculateScore;
|
|
96
97
|
}
|
|
@@ -103,7 +104,7 @@ declare class ContextualRecallMetric extends Metric {
|
|
|
103
104
|
private judge;
|
|
104
105
|
private scale;
|
|
105
106
|
private context;
|
|
106
|
-
constructor(
|
|
107
|
+
constructor(llm: MastraLLMBase, { scale, context }: ContextualRecallMetricOptions);
|
|
107
108
|
measure(input: string, output: string): Promise<MetricResultWithReason>;
|
|
108
109
|
private calculateScore;
|
|
109
110
|
}
|
|
@@ -114,7 +115,7 @@ interface SummarizationMetricOptions {
|
|
|
114
115
|
declare class SummarizationMetric extends Metric {
|
|
115
116
|
private judge;
|
|
116
117
|
private scale;
|
|
117
|
-
constructor(
|
|
118
|
+
constructor(llm: MastraLLMBase, { scale }?: SummarizationMetricOptions);
|
|
118
119
|
measure(input: string, output: string): Promise<MetricResultWithReason & {
|
|
119
120
|
info: {
|
|
120
121
|
alignmentScore: number;
|
|
@@ -130,7 +131,7 @@ interface BiasMetricOptions {
|
|
|
130
131
|
declare class BiasMetric extends Metric {
|
|
131
132
|
private judge;
|
|
132
133
|
private scale;
|
|
133
|
-
constructor(
|
|
134
|
+
constructor(llm: MastraLLMBase, { scale }?: BiasMetricOptions);
|
|
134
135
|
measure(input: string, output: string): Promise<MetricResultWithReason>;
|
|
135
136
|
private calculateScore;
|
|
136
137
|
}
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import '../../chunk-4VNS5WPM.js';
|
|
2
|
-
import { Metric
|
|
2
|
+
import { Metric } from '@mastra/core/eval';
|
|
3
|
+
import '@mastra/core/llm';
|
|
3
4
|
import { z } from 'zod';
|
|
5
|
+
import { Agent } from '@mastra/core/agent';
|
|
4
6
|
|
|
5
7
|
// src/metrics/llm/utils.ts
|
|
6
8
|
var roundToTwoDecimals = (num) => {
|
|
@@ -8,11 +10,11 @@ var roundToTwoDecimals = (num) => {
|
|
|
8
10
|
};
|
|
9
11
|
var MastraAgentJudge = class {
|
|
10
12
|
agent;
|
|
11
|
-
constructor(name, instructions,
|
|
13
|
+
constructor(name, instructions, llm) {
|
|
12
14
|
this.agent = new Agent({
|
|
13
|
-
name: `Mastra Eval Judge ${
|
|
15
|
+
name: `Mastra Eval Judge ${llm.name} ${name}`,
|
|
14
16
|
instructions,
|
|
15
|
-
|
|
17
|
+
llm
|
|
16
18
|
});
|
|
17
19
|
}
|
|
18
20
|
};
|
|
@@ -226,8 +228,8 @@ function generateReasonPrompt({
|
|
|
226
228
|
|
|
227
229
|
// src/metrics/llm/answer-relevancy/metricJudge.ts
|
|
228
230
|
var AnswerRelevancyJudge = class extends MastraAgentJudge {
|
|
229
|
-
constructor(
|
|
230
|
-
super("Answer Relevancy", ANSWER_RELEVANCY_AGENT_INSTRUCTIONS,
|
|
231
|
+
constructor(llm) {
|
|
232
|
+
super("Answer Relevancy", ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, llm);
|
|
231
233
|
}
|
|
232
234
|
async evaluate(input, actualOutput) {
|
|
233
235
|
const statementPrompt = generateEvaluationStatementsPrompt({ output: actualOutput });
|
|
@@ -265,10 +267,10 @@ var AnswerRelevancyMetric = class extends Metric {
|
|
|
265
267
|
judge;
|
|
266
268
|
uncertaintyWeight;
|
|
267
269
|
scale;
|
|
268
|
-
constructor(
|
|
270
|
+
constructor(llm, { uncertaintyWeight = 0.3, scale = 1 } = {}) {
|
|
269
271
|
super();
|
|
270
272
|
this.uncertaintyWeight = uncertaintyWeight;
|
|
271
|
-
this.judge = new AnswerRelevancyJudge(
|
|
273
|
+
this.judge = new AnswerRelevancyJudge(llm);
|
|
272
274
|
this.scale = scale;
|
|
273
275
|
}
|
|
274
276
|
async measure(input, output) {
|
|
@@ -427,8 +429,8 @@ function generateReasonPrompt2({
|
|
|
427
429
|
|
|
428
430
|
// src/metrics/llm/context-position/metricJudge.ts
|
|
429
431
|
var ContextPositionJudge = class extends MastraAgentJudge {
|
|
430
|
-
constructor(
|
|
431
|
-
super("Context Position", CONTEXT_POSITION_AGENT_INSTRUCTIONS,
|
|
432
|
+
constructor(llm) {
|
|
433
|
+
super("Context Position", CONTEXT_POSITION_AGENT_INSTRUCTIONS, llm);
|
|
432
434
|
}
|
|
433
435
|
async evaluate(input, actualOutput, retrievalContext) {
|
|
434
436
|
const prompt = generateEvaluatePrompt2({
|
|
@@ -464,11 +466,11 @@ var ContextPositionMetric = class extends Metric {
|
|
|
464
466
|
judge;
|
|
465
467
|
scale;
|
|
466
468
|
context;
|
|
467
|
-
constructor(
|
|
469
|
+
constructor(llm, { scale = 1, context }) {
|
|
468
470
|
super();
|
|
469
|
-
this.judge = new ContextPositionJudge(model);
|
|
470
|
-
this.scale = scale;
|
|
471
471
|
this.context = context;
|
|
472
|
+
this.judge = new ContextPositionJudge(llm);
|
|
473
|
+
this.scale = scale;
|
|
472
474
|
}
|
|
473
475
|
async measure(input, output) {
|
|
474
476
|
const verdicts = await this.judge.evaluate(input, output, this.context);
|
|
@@ -635,8 +637,8 @@ JSON:
|
|
|
635
637
|
|
|
636
638
|
// src/metrics/llm/context-precision/metricJudge.ts
|
|
637
639
|
var ContextPrecisionJudge = class extends MastraAgentJudge {
|
|
638
|
-
constructor(
|
|
639
|
-
super("Context Precision", CONTEXT_PRECISION_AGENT_INSTRUCTIONS,
|
|
640
|
+
constructor(llm) {
|
|
641
|
+
super("Context Precision", CONTEXT_PRECISION_AGENT_INSTRUCTIONS, llm);
|
|
640
642
|
}
|
|
641
643
|
async evaluate(input, actualOutput, retrievalContext) {
|
|
642
644
|
const prompt = generateEvaluatePrompt3({
|
|
@@ -672,11 +674,11 @@ var ContextPrecisionMetric = class extends Metric {
|
|
|
672
674
|
judge;
|
|
673
675
|
scale;
|
|
674
676
|
context;
|
|
675
|
-
constructor(
|
|
677
|
+
constructor(llm, { scale = 1, context }) {
|
|
676
678
|
super();
|
|
677
|
-
this.judge = new ContextPrecisionJudge(model);
|
|
678
|
-
this.scale = scale;
|
|
679
679
|
this.context = context;
|
|
680
|
+
this.judge = new ContextPrecisionJudge(llm);
|
|
681
|
+
this.scale = scale;
|
|
680
682
|
}
|
|
681
683
|
async measure(input, output) {
|
|
682
684
|
const verdicts = await this.judge.evaluate(input, output, this.context);
|
|
@@ -872,8 +874,8 @@ Example Responses:
|
|
|
872
874
|
|
|
873
875
|
// src/metrics/llm/faithfulness/metricJudge.ts
|
|
874
876
|
var FaithfulnessJudge = class extends MastraAgentJudge {
|
|
875
|
-
constructor(
|
|
876
|
-
super("Faithfulness", FAITHFULNESS_AGENT_INSTRUCTIONS,
|
|
877
|
+
constructor(llm) {
|
|
878
|
+
super("Faithfulness", FAITHFULNESS_AGENT_INSTRUCTIONS, llm);
|
|
877
879
|
}
|
|
878
880
|
async evaluate(output, context) {
|
|
879
881
|
const claimsPrompt = generateClaimExtractionPrompt({ output });
|
|
@@ -915,11 +917,11 @@ var FaithfulnessMetric = class extends Metric {
|
|
|
915
917
|
judge;
|
|
916
918
|
scale;
|
|
917
919
|
context;
|
|
918
|
-
constructor(
|
|
920
|
+
constructor(llm, { scale = 1, context }) {
|
|
919
921
|
super();
|
|
920
|
-
this.scale = scale;
|
|
921
922
|
this.context = context;
|
|
922
|
-
this.judge = new FaithfulnessJudge(
|
|
923
|
+
this.judge = new FaithfulnessJudge(llm);
|
|
924
|
+
this.scale = scale;
|
|
923
925
|
}
|
|
924
926
|
async measure(input, output) {
|
|
925
927
|
const verdicts = await this.judge.evaluate(output, this.context);
|
|
@@ -1067,8 +1069,8 @@ function generateReasonPrompt5({
|
|
|
1067
1069
|
|
|
1068
1070
|
// src/metrics/llm/prompt-alignment/metricJudge.ts
|
|
1069
1071
|
var PromptAlignmentJudge = class extends MastraAgentJudge {
|
|
1070
|
-
constructor(
|
|
1071
|
-
super("Prompt Alignment", PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS,
|
|
1072
|
+
constructor(llm) {
|
|
1073
|
+
super("Prompt Alignment", PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, llm);
|
|
1072
1074
|
}
|
|
1073
1075
|
async evaluate(input, actualOutput, instructions) {
|
|
1074
1076
|
const prompt = generateEvaluatePrompt5({ input, output: actualOutput, instructions });
|
|
@@ -1096,10 +1098,10 @@ var PromptAlignmentMetric = class extends Metric {
|
|
|
1096
1098
|
instructions;
|
|
1097
1099
|
judge;
|
|
1098
1100
|
scale;
|
|
1099
|
-
constructor(
|
|
1101
|
+
constructor(llm, { instructions, scale = 1 }) {
|
|
1100
1102
|
super();
|
|
1101
1103
|
this.instructions = instructions;
|
|
1102
|
-
this.judge = new PromptAlignmentJudge(
|
|
1104
|
+
this.judge = new PromptAlignmentJudge(llm);
|
|
1103
1105
|
this.scale = scale;
|
|
1104
1106
|
}
|
|
1105
1107
|
async measure(input, output) {
|
|
@@ -1225,8 +1227,8 @@ ${toxics.join("\n")}`;
|
|
|
1225
1227
|
|
|
1226
1228
|
// src/metrics/llm/toxicity/metricJudge.ts
|
|
1227
1229
|
var ToxicityJudge = class extends MastraAgentJudge {
|
|
1228
|
-
constructor(
|
|
1229
|
-
super("Toxicity", TOXICITY_AGENT_INSTRUCTIONS,
|
|
1230
|
+
constructor(llm) {
|
|
1231
|
+
super("Toxicity", TOXICITY_AGENT_INSTRUCTIONS, llm);
|
|
1230
1232
|
}
|
|
1231
1233
|
async evaluate(input, actualOutput) {
|
|
1232
1234
|
const prompt = generateEvaluatePrompt6({ input, output: actualOutput });
|
|
@@ -1257,10 +1259,10 @@ var ToxicityJudge = class extends MastraAgentJudge {
|
|
|
1257
1259
|
var ToxicityMetric = class extends Metric {
|
|
1258
1260
|
judge;
|
|
1259
1261
|
scale;
|
|
1260
|
-
constructor(
|
|
1262
|
+
constructor(llm, { scale = 1 } = {}) {
|
|
1261
1263
|
super();
|
|
1264
|
+
this.judge = new ToxicityJudge(llm);
|
|
1262
1265
|
this.scale = scale;
|
|
1263
|
-
this.judge = new ToxicityJudge(model);
|
|
1264
1266
|
}
|
|
1265
1267
|
async measure(input, output) {
|
|
1266
1268
|
const verdicts = await this.judge.evaluate(input, output);
|
|
@@ -1376,8 +1378,8 @@ ${relevantStatements}`;
|
|
|
1376
1378
|
|
|
1377
1379
|
// src/metrics/llm/context-relevancy/metricJudge.ts
|
|
1378
1380
|
var ContextRelevancyJudge = class extends MastraAgentJudge {
|
|
1379
|
-
constructor(
|
|
1380
|
-
super("Context Relevancy", CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS,
|
|
1381
|
+
constructor(llm) {
|
|
1382
|
+
super("Context Relevancy", CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, llm);
|
|
1381
1383
|
}
|
|
1382
1384
|
async evaluate(input, actualOutput, retrievalContext) {
|
|
1383
1385
|
const prompt = generateEvaluatePrompt7({
|
|
@@ -1413,11 +1415,11 @@ var ContextRelevancyMetric = class extends Metric {
|
|
|
1413
1415
|
judge;
|
|
1414
1416
|
scale;
|
|
1415
1417
|
context;
|
|
1416
|
-
constructor(
|
|
1418
|
+
constructor(llm, { scale = 1, context }) {
|
|
1417
1419
|
super();
|
|
1418
|
-
this.judge = new ContextRelevancyJudge(model);
|
|
1419
|
-
this.scale = scale;
|
|
1420
1420
|
this.context = context;
|
|
1421
|
+
this.judge = new ContextRelevancyJudge(llm);
|
|
1422
|
+
this.scale = scale;
|
|
1421
1423
|
}
|
|
1422
1424
|
async measure(input, output) {
|
|
1423
1425
|
const verdicts = await this.judge.evaluate(input, output, this.context);
|
|
@@ -1523,8 +1525,8 @@ ${unsupportiveReasons.join("\n")}
|
|
|
1523
1525
|
|
|
1524
1526
|
// src/metrics/llm/contextual-recall/metricJudge.ts
|
|
1525
1527
|
var ContextualRecallJudge = class extends MastraAgentJudge {
|
|
1526
|
-
constructor(
|
|
1527
|
-
super("Contextual Recall", CONTEXT_RECALL_AGENT_INSTRUCTIONS,
|
|
1528
|
+
constructor(llm) {
|
|
1529
|
+
super("Contextual Recall", CONTEXT_RECALL_AGENT_INSTRUCTIONS, llm);
|
|
1528
1530
|
}
|
|
1529
1531
|
async evaluate(input, actualOutput, retrievalContext) {
|
|
1530
1532
|
const prompt = generateEvaluatePrompt8({
|
|
@@ -1560,11 +1562,11 @@ var ContextualRecallMetric = class extends Metric {
|
|
|
1560
1562
|
judge;
|
|
1561
1563
|
scale;
|
|
1562
1564
|
context;
|
|
1563
|
-
constructor(
|
|
1565
|
+
constructor(llm, { scale = 1, context }) {
|
|
1564
1566
|
super();
|
|
1565
|
-
this.judge = new ContextualRecallJudge(model);
|
|
1566
|
-
this.scale = scale;
|
|
1567
1567
|
this.context = context;
|
|
1568
|
+
this.judge = new ContextualRecallJudge(llm);
|
|
1569
|
+
this.scale = scale;
|
|
1568
1570
|
}
|
|
1569
1571
|
async measure(input, output) {
|
|
1570
1572
|
const verdicts = await this.judge.evaluate(input, output, this.context);
|
|
@@ -1829,8 +1831,8 @@ function generateReasonPrompt8({
|
|
|
1829
1831
|
|
|
1830
1832
|
// src/metrics/llm/summarization/metricJudge.ts
|
|
1831
1833
|
var SummarizationJudge = class extends MastraAgentJudge {
|
|
1832
|
-
constructor(
|
|
1833
|
-
super("Summarization", SUMMARIZATION_AGENT_INSTRUCTIONS,
|
|
1834
|
+
constructor(llm) {
|
|
1835
|
+
super("Summarization", SUMMARIZATION_AGENT_INSTRUCTIONS, llm);
|
|
1834
1836
|
}
|
|
1835
1837
|
async evaluateAlignment(originalText, summary) {
|
|
1836
1838
|
const claimsPrompt = generateClaimExtractionPrompt({ output: summary });
|
|
@@ -1894,9 +1896,9 @@ var SummarizationJudge = class extends MastraAgentJudge {
|
|
|
1894
1896
|
var SummarizationMetric = class extends Metric {
|
|
1895
1897
|
judge;
|
|
1896
1898
|
scale;
|
|
1897
|
-
constructor(
|
|
1899
|
+
constructor(llm, { scale = 1 } = {}) {
|
|
1898
1900
|
super();
|
|
1899
|
-
this.judge = new SummarizationJudge(
|
|
1901
|
+
this.judge = new SummarizationJudge(llm);
|
|
1900
1902
|
this.scale = scale;
|
|
1901
1903
|
}
|
|
1902
1904
|
async measure(input, output) {
|
|
@@ -2048,8 +2050,8 @@ ${biases.join("\n")}
|
|
|
2048
2050
|
|
|
2049
2051
|
// src/metrics/llm/bias/metricJudge.ts
|
|
2050
2052
|
var BiasJudge = class extends MastraAgentJudge {
|
|
2051
|
-
constructor(
|
|
2052
|
-
super("Bias", BIAS_AGENT_INSTRUCTIONS,
|
|
2053
|
+
constructor(llm) {
|
|
2054
|
+
super("Bias", BIAS_AGENT_INSTRUCTIONS, llm);
|
|
2053
2055
|
}
|
|
2054
2056
|
async evaluate(input, actualOutput) {
|
|
2055
2057
|
const opinionsPrompt = generateOpinionsPrompt({ input, output: actualOutput });
|
|
@@ -2086,10 +2088,10 @@ var BiasJudge = class extends MastraAgentJudge {
|
|
|
2086
2088
|
var BiasMetric = class extends Metric {
|
|
2087
2089
|
judge;
|
|
2088
2090
|
scale;
|
|
2089
|
-
constructor(
|
|
2091
|
+
constructor(llm, { scale = 1 } = {}) {
|
|
2090
2092
|
super();
|
|
2093
|
+
this.judge = new BiasJudge(llm);
|
|
2091
2094
|
this.scale = scale;
|
|
2092
|
-
this.judge = new BiasJudge(model);
|
|
2093
2095
|
}
|
|
2094
2096
|
async measure(input, output) {
|
|
2095
2097
|
const verdicts = await this.judge.evaluate(input, output);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/evals",
|
|
3
|
-
"version": "0.1.0-alpha.30",
|
|
3
|
+
"version": "0.1.0-alpha.33",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -38,7 +38,7 @@
|
|
|
38
38
|
"sentiment": "^5.0.2",
|
|
39
39
|
"string-similarity": "^4.0.4",
|
|
40
40
|
"zod": "^3.24.1",
|
|
41
|
-
"@mastra/core": "^0.2.0-alpha.
|
|
41
|
+
"@mastra/core": "^0.2.0-alpha.91"
|
|
42
42
|
},
|
|
43
43
|
"devDependencies": {
|
|
44
44
|
"@tsconfig/recommended": "^1.0.7",
|
|
@@ -50,7 +50,8 @@
|
|
|
50
50
|
"vitest": "^3.0.4"
|
|
51
51
|
},
|
|
52
52
|
"scripts": {
|
|
53
|
-
"
|
|
53
|
+
"check": "tsc --noEmit",
|
|
54
|
+
"build": "pnpm check && tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --clean --treeshake",
|
|
54
55
|
"dev": "tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --watch",
|
|
55
56
|
"test": "vitest"
|
|
56
57
|
}
|
package/src/evaluation.test.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { type ModelConfig } from '@mastra/core';
|
|
2
1
|
import { Agent } from '@mastra/core/agent';
|
|
3
2
|
import { Metric } from '@mastra/core/eval';
|
|
3
|
+
import { OpenAI } from '@mastra/core/llm/openai';
|
|
4
4
|
import { describe, expect, it } from 'vitest';
|
|
5
5
|
|
|
6
6
|
import { evaluate } from './evaluation';
|
|
@@ -14,18 +14,16 @@ class TestMetric extends Metric {
|
|
|
14
14
|
}
|
|
15
15
|
}
|
|
16
16
|
|
|
17
|
-
const
|
|
18
|
-
provider: 'OPEN_AI',
|
|
17
|
+
const llm = new OpenAI({
|
|
19
18
|
name: 'gpt-4o',
|
|
20
|
-
|
|
21
|
-
};
|
|
19
|
+
});
|
|
22
20
|
|
|
23
21
|
describe('evaluate', () => {
|
|
24
22
|
it('should get a text response from the agent', async () => {
|
|
25
23
|
const electionAgent = new Agent({
|
|
26
24
|
name: 'US Election agent',
|
|
27
25
|
instructions: 'You know about the past US elections',
|
|
28
|
-
|
|
26
|
+
llm,
|
|
29
27
|
});
|
|
30
28
|
|
|
31
29
|
const result = await evaluate(electionAgent, 'Who won the 2016 US presidential election?', new TestMetric());
|
|
@@ -1,13 +1,14 @@
|
|
|
1
|
-
import { Agent
|
|
1
|
+
import { Agent } from '@mastra/core/agent';
|
|
2
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
3
|
|
|
3
4
|
export abstract class MastraAgentJudge {
|
|
4
5
|
protected readonly agent: Agent;
|
|
5
6
|
|
|
6
|
-
constructor(name: string, instructions: string,
|
|
7
|
+
constructor(name: string, instructions: string, llm: MastraLLMBase) {
|
|
7
8
|
this.agent = new Agent({
|
|
8
|
-
name: `Mastra Eval Judge ${
|
|
9
|
+
name: `Mastra Eval Judge ${llm.name} ${name}`,
|
|
9
10
|
instructions: instructions,
|
|
10
|
-
|
|
11
|
+
llm,
|
|
11
12
|
});
|
|
12
13
|
}
|
|
13
14
|
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { OpenAI } from '@mastra/core/llm/openai';
|
|
2
2
|
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCase } from '../utils';
|
|
@@ -92,17 +92,14 @@ const testCases: TestCase[] = [
|
|
|
92
92
|
|
|
93
93
|
const SECONDS = 10000;
|
|
94
94
|
|
|
95
|
-
const
|
|
96
|
-
provider: 'OPEN_AI',
|
|
95
|
+
const llm = new OpenAI({
|
|
97
96
|
name: 'gpt-4o',
|
|
98
|
-
|
|
99
|
-
apiKey: process.env.OPENAI_API_KEY,
|
|
100
|
-
};
|
|
97
|
+
});
|
|
101
98
|
|
|
102
99
|
describe(
|
|
103
100
|
'AnswerRelevancyMetric',
|
|
104
101
|
() => {
|
|
105
|
-
const metric = new AnswerRelevancyMetric(
|
|
102
|
+
const metric = new AnswerRelevancyMetric(llm);
|
|
106
103
|
|
|
107
104
|
it('should be able to measure a prompt with perfect relevancy', async () => {
|
|
108
105
|
const result = await metric.measure(testCases[0].input, testCases[0].output);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
3
|
|
|
3
4
|
import { type MetricResultWithReason } from '../types';
|
|
4
5
|
import { roundToTwoDecimals } from '../utils';
|
|
@@ -15,11 +16,11 @@ export class AnswerRelevancyMetric extends Metric {
|
|
|
15
16
|
private uncertaintyWeight: number;
|
|
16
17
|
private scale: number;
|
|
17
18
|
|
|
18
|
-
constructor(
|
|
19
|
+
constructor(llm: MastraLLMBase, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
|
|
19
20
|
super();
|
|
20
21
|
|
|
21
22
|
this.uncertaintyWeight = uncertaintyWeight;
|
|
22
|
-
this.judge = new AnswerRelevancyJudge(
|
|
23
|
+
this.judge = new AnswerRelevancyJudge(llm);
|
|
23
24
|
this.scale = scale;
|
|
24
25
|
}
|
|
25
26
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { type
|
|
1
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
@@ -11,8 +11,8 @@ import {
|
|
|
11
11
|
} from './prompts';
|
|
12
12
|
|
|
13
13
|
export class AnswerRelevancyJudge extends MastraAgentJudge {
|
|
14
|
-
constructor(
|
|
15
|
-
super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS,
|
|
14
|
+
constructor(llm: MastraLLMBase) {
|
|
15
|
+
super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, llm);
|
|
16
16
|
}
|
|
17
17
|
|
|
18
18
|
async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { OpenAI } from '@mastra/core/llm/openai';
|
|
2
2
|
import { describe, it, expect, vi } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCase } from '../utils';
|
|
@@ -46,15 +46,13 @@ vi.setConfig({
|
|
|
46
46
|
testTimeout: 20 * SECONDS,
|
|
47
47
|
});
|
|
48
48
|
|
|
49
|
-
const
|
|
50
|
-
provider: 'OPEN_AI',
|
|
49
|
+
const llm = new OpenAI({
|
|
51
50
|
name: 'gpt-4o',
|
|
52
|
-
toolChoice: 'auto',
|
|
53
51
|
apiKey: process.env.OPENAI_API_KEY,
|
|
54
|
-
};
|
|
52
|
+
});
|
|
55
53
|
|
|
56
54
|
describe('BiasMetric', () => {
|
|
57
|
-
const metric = new BiasMetric(
|
|
55
|
+
const metric = new BiasMetric(llm);
|
|
58
56
|
|
|
59
57
|
it('should be able to measure a prompt that is biased', async () => {
|
|
60
58
|
const result = await metric.measure(testCases[0].input, testCases[0].output);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
3
|
|
|
3
4
|
import { type MetricResultWithReason } from '../types';
|
|
4
5
|
import { roundToTwoDecimals } from '../utils';
|
|
@@ -13,11 +14,11 @@ export class BiasMetric extends Metric {
|
|
|
13
14
|
private judge: BiasJudge;
|
|
14
15
|
private scale: number;
|
|
15
16
|
|
|
16
|
-
constructor(
|
|
17
|
+
constructor(llm: MastraLLMBase, { scale = 1 }: BiasMetricOptions = {}) {
|
|
17
18
|
super();
|
|
18
19
|
|
|
20
|
+
this.judge = new BiasJudge(llm);
|
|
19
21
|
this.scale = scale;
|
|
20
|
-
this.judge = new BiasJudge(model);
|
|
21
22
|
}
|
|
22
23
|
|
|
23
24
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|