@mastra/evals 0.0.1-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/CHANGELOG.md +10 -0
  2. package/LICENSE +44 -0
  3. package/dist/evals.cjs.development.js +1510 -0
  4. package/dist/evals.cjs.development.js.map +1 -0
  5. package/dist/evals.cjs.production.min.js +2 -0
  6. package/dist/evals.cjs.production.min.js.map +1 -0
  7. package/dist/evals.esm.js +1497 -0
  8. package/dist/evals.esm.js.map +1 -0
  9. package/dist/evaluation.d.ts +3 -0
  10. package/dist/evaluation.d.ts.map +1 -0
  11. package/dist/index.d.ts +3 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +8 -0
  14. package/dist/judge/index.d.ts +6 -0
  15. package/dist/judge/index.d.ts.map +1 -0
  16. package/dist/metrics/answer-relevancy/index.d.ts +17 -0
  17. package/dist/metrics/answer-relevancy/index.d.ts.map +1 -0
  18. package/dist/metrics/answer-relevancy/metricJudge.d.ts +11 -0
  19. package/dist/metrics/answer-relevancy/metricJudge.d.ts.map +1 -0
  20. package/dist/metrics/answer-relevancy/prompts.d.ts +15 -0
  21. package/dist/metrics/answer-relevancy/prompts.d.ts.map +1 -0
  22. package/dist/metrics/completeness/index.d.ts +12 -0
  23. package/dist/metrics/completeness/index.d.ts.map +1 -0
  24. package/dist/metrics/content-similarity/index.d.ts +11 -0
  25. package/dist/metrics/content-similarity/index.d.ts.map +1 -0
  26. package/dist/metrics/context-position/index.d.ts +15 -0
  27. package/dist/metrics/context-position/index.d.ts.map +1 -0
  28. package/dist/metrics/context-position/metricJudge.d.ts +14 -0
  29. package/dist/metrics/context-position/metricJudge.d.ts.map +1 -0
  30. package/dist/metrics/context-position/prompts.d.ts +16 -0
  31. package/dist/metrics/context-position/prompts.d.ts.map +1 -0
  32. package/dist/metrics/context-precision/index.d.ts +15 -0
  33. package/dist/metrics/context-precision/index.d.ts.map +1 -0
  34. package/dist/metrics/context-precision/metricJudge.d.ts +15 -0
  35. package/dist/metrics/context-precision/metricJudge.d.ts.map +1 -0
  36. package/dist/metrics/context-precision/prompts.d.ts +16 -0
  37. package/dist/metrics/context-precision/prompts.d.ts.map +1 -0
  38. package/dist/metrics/difference/index.d.ts +9 -0
  39. package/dist/metrics/difference/index.d.ts.map +1 -0
  40. package/dist/metrics/index.d.ts +10 -0
  41. package/dist/metrics/index.d.ts.map +1 -0
  42. package/dist/metrics/keyword-coverage/index.d.ts +9 -0
  43. package/dist/metrics/keyword-coverage/index.d.ts.map +1 -0
  44. package/dist/metrics/prompt-alignment/index.d.ts +17 -0
  45. package/dist/metrics/prompt-alignment/index.d.ts.map +1 -0
  46. package/dist/metrics/prompt-alignment/metricJudge.d.ts +11 -0
  47. package/dist/metrics/prompt-alignment/metricJudge.d.ts.map +1 -0
  48. package/dist/metrics/prompt-alignment/prompts.d.ts +13 -0
  49. package/dist/metrics/prompt-alignment/prompts.d.ts.map +1 -0
  50. package/dist/metrics/tone/index.d.ts +10 -0
  51. package/dist/metrics/tone/index.d.ts.map +1 -0
  52. package/dist/metrics/types.d.ts +12 -0
  53. package/dist/metrics/types.d.ts.map +1 -0
  54. package/jest.config.ts +19 -0
  55. package/package.json +51 -0
  56. package/src/evaluation.test.ts +32 -0
  57. package/src/evaluation.ts +20 -0
  58. package/src/index.ts +2 -0
  59. package/src/judge/index.ts +13 -0
  60. package/src/metrics/answer-relevancy/index.test.ts +193 -0
  61. package/src/metrics/answer-relevancy/index.ts +80 -0
  62. package/src/metrics/answer-relevancy/metricJudge.ts +49 -0
  63. package/src/metrics/answer-relevancy/prompts.ts +179 -0
  64. package/src/metrics/completeness/index.test.ts +96 -0
  65. package/src/metrics/completeness/index.ts +112 -0
  66. package/src/metrics/content-similarity/index.test.ts +107 -0
  67. package/src/metrics/content-similarity/index.ts +41 -0
  68. package/src/metrics/context-position/index.test.ts +292 -0
  69. package/src/metrics/context-position/index.ts +63 -0
  70. package/src/metrics/context-position/metricJudge.ts +54 -0
  71. package/src/metrics/context-position/prompts.ts +123 -0
  72. package/src/metrics/context-precision/index.test.ts +249 -0
  73. package/src/metrics/context-precision/index.ts +62 -0
  74. package/src/metrics/context-precision/metricJudge.ts +55 -0
  75. package/src/metrics/context-precision/prompts.ts +111 -0
  76. package/src/metrics/difference/index.test.ts +116 -0
  77. package/src/metrics/difference/index.ts +31 -0
  78. package/src/metrics/index.ts +9 -0
  79. package/src/metrics/keyword-coverage/index.test.ts +114 -0
  80. package/src/metrics/keyword-coverage/index.ts +47 -0
  81. package/src/metrics/prompt-alignment/index.test.ts +46 -0
  82. package/src/metrics/prompt-alignment/index.ts +66 -0
  83. package/src/metrics/prompt-alignment/metricJudge.ts +41 -0
  84. package/src/metrics/prompt-alignment/prompts.ts +102 -0
  85. package/src/metrics/tone/index.test.ts +123 -0
  86. package/src/metrics/tone/index.ts +47 -0
  87. package/src/metrics/types.ts +13 -0
  88. package/tsconfig.json +10 -0
@@ -0,0 +1,1510 @@
1
+ 'use strict';
2
+
3
+ Object.defineProperty(exports, '__esModule', { value: true });
4
+
5
+ var core = require('@mastra/core');
6
+ var zod = require('zod');
7
+ var nlp = require('compromise');
8
+ var stringSimilarity = require('string-similarity');
9
+ var difflib = require('difflib');
10
+ var keyword_extractor = require('keyword-extractor');
11
+ var Sentiment = require('sentiment');
12
+
13
+ function _arrayLikeToArray(r, a) {
14
+ (null == a || a > r.length) && (a = r.length);
15
+ for (var e = 0, n = Array(a); e < a; e++) n[e] = r[e];
16
+ return n;
17
+ }
18
+ function asyncGeneratorStep(n, t, e, r, o, a, c) {
19
+ try {
20
+ var i = n[a](c),
21
+ u = i.value;
22
+ } catch (n) {
23
+ return void e(n);
24
+ }
25
+ i.done ? t(u) : Promise.resolve(u).then(r, o);
26
+ }
27
+ function _asyncToGenerator(n) {
28
+ return function () {
29
+ var t = this,
30
+ e = arguments;
31
+ return new Promise(function (r, o) {
32
+ var a = n.apply(t, e);
33
+ function _next(n) {
34
+ asyncGeneratorStep(a, r, o, _next, _throw, "next", n);
35
+ }
36
+ function _throw(n) {
37
+ asyncGeneratorStep(a, r, o, _next, _throw, "throw", n);
38
+ }
39
+ _next(void 0);
40
+ });
41
+ };
42
+ }
43
+ function _createForOfIteratorHelperLoose(r, e) {
44
+ var t = "undefined" != typeof Symbol && r[Symbol.iterator] || r["@@iterator"];
45
+ if (t) return (t = t.call(r)).next.bind(t);
46
+ if (Array.isArray(r) || (t = _unsupportedIterableToArray(r)) || e && r && "number" == typeof r.length) {
47
+ t && (r = t);
48
+ var o = 0;
49
+ return function () {
50
+ return o >= r.length ? {
51
+ done: !0
52
+ } : {
53
+ done: !1,
54
+ value: r[o++]
55
+ };
56
+ };
57
+ }
58
+ throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.");
59
+ }
60
+ function _extends() {
61
+ return _extends = Object.assign ? Object.assign.bind() : function (n) {
62
+ for (var e = 1; e < arguments.length; e++) {
63
+ var t = arguments[e];
64
+ for (var r in t) ({}).hasOwnProperty.call(t, r) && (n[r] = t[r]);
65
+ }
66
+ return n;
67
+ }, _extends.apply(null, arguments);
68
+ }
69
+ function _inheritsLoose(t, o) {
70
+ t.prototype = Object.create(o.prototype), t.prototype.constructor = t, _setPrototypeOf(t, o);
71
+ }
72
+ function _regeneratorRuntime() {
73
+ _regeneratorRuntime = function () {
74
+ return e;
75
+ };
76
+ var t,
77
+ e = {},
78
+ r = Object.prototype,
79
+ n = r.hasOwnProperty,
80
+ o = Object.defineProperty || function (t, e, r) {
81
+ t[e] = r.value;
82
+ },
83
+ i = "function" == typeof Symbol ? Symbol : {},
84
+ a = i.iterator || "@@iterator",
85
+ c = i.asyncIterator || "@@asyncIterator",
86
+ u = i.toStringTag || "@@toStringTag";
87
+ function define(t, e, r) {
88
+ return Object.defineProperty(t, e, {
89
+ value: r,
90
+ enumerable: !0,
91
+ configurable: !0,
92
+ writable: !0
93
+ }), t[e];
94
+ }
95
+ try {
96
+ define({}, "");
97
+ } catch (t) {
98
+ define = function (t, e, r) {
99
+ return t[e] = r;
100
+ };
101
+ }
102
+ function wrap(t, e, r, n) {
103
+ var i = e && e.prototype instanceof Generator ? e : Generator,
104
+ a = Object.create(i.prototype),
105
+ c = new Context(n || []);
106
+ return o(a, "_invoke", {
107
+ value: makeInvokeMethod(t, r, c)
108
+ }), a;
109
+ }
110
+ function tryCatch(t, e, r) {
111
+ try {
112
+ return {
113
+ type: "normal",
114
+ arg: t.call(e, r)
115
+ };
116
+ } catch (t) {
117
+ return {
118
+ type: "throw",
119
+ arg: t
120
+ };
121
+ }
122
+ }
123
+ e.wrap = wrap;
124
+ var h = "suspendedStart",
125
+ l = "suspendedYield",
126
+ f = "executing",
127
+ s = "completed",
128
+ y = {};
129
+ function Generator() {}
130
+ function GeneratorFunction() {}
131
+ function GeneratorFunctionPrototype() {}
132
+ var p = {};
133
+ define(p, a, function () {
134
+ return this;
135
+ });
136
+ var d = Object.getPrototypeOf,
137
+ v = d && d(d(values([])));
138
+ v && v !== r && n.call(v, a) && (p = v);
139
+ var g = GeneratorFunctionPrototype.prototype = Generator.prototype = Object.create(p);
140
+ function defineIteratorMethods(t) {
141
+ ["next", "throw", "return"].forEach(function (e) {
142
+ define(t, e, function (t) {
143
+ return this._invoke(e, t);
144
+ });
145
+ });
146
+ }
147
+ function AsyncIterator(t, e) {
148
+ function invoke(r, o, i, a) {
149
+ var c = tryCatch(t[r], t, o);
150
+ if ("throw" !== c.type) {
151
+ var u = c.arg,
152
+ h = u.value;
153
+ return h && "object" == typeof h && n.call(h, "__await") ? e.resolve(h.__await).then(function (t) {
154
+ invoke("next", t, i, a);
155
+ }, function (t) {
156
+ invoke("throw", t, i, a);
157
+ }) : e.resolve(h).then(function (t) {
158
+ u.value = t, i(u);
159
+ }, function (t) {
160
+ return invoke("throw", t, i, a);
161
+ });
162
+ }
163
+ a(c.arg);
164
+ }
165
+ var r;
166
+ o(this, "_invoke", {
167
+ value: function (t, n) {
168
+ function callInvokeWithMethodAndArg() {
169
+ return new e(function (e, r) {
170
+ invoke(t, n, e, r);
171
+ });
172
+ }
173
+ return r = r ? r.then(callInvokeWithMethodAndArg, callInvokeWithMethodAndArg) : callInvokeWithMethodAndArg();
174
+ }
175
+ });
176
+ }
177
+ function makeInvokeMethod(e, r, n) {
178
+ var o = h;
179
+ return function (i, a) {
180
+ if (o === f) throw Error("Generator is already running");
181
+ if (o === s) {
182
+ if ("throw" === i) throw a;
183
+ return {
184
+ value: t,
185
+ done: !0
186
+ };
187
+ }
188
+ for (n.method = i, n.arg = a;;) {
189
+ var c = n.delegate;
190
+ if (c) {
191
+ var u = maybeInvokeDelegate(c, n);
192
+ if (u) {
193
+ if (u === y) continue;
194
+ return u;
195
+ }
196
+ }
197
+ if ("next" === n.method) n.sent = n._sent = n.arg;else if ("throw" === n.method) {
198
+ if (o === h) throw o = s, n.arg;
199
+ n.dispatchException(n.arg);
200
+ } else "return" === n.method && n.abrupt("return", n.arg);
201
+ o = f;
202
+ var p = tryCatch(e, r, n);
203
+ if ("normal" === p.type) {
204
+ if (o = n.done ? s : l, p.arg === y) continue;
205
+ return {
206
+ value: p.arg,
207
+ done: n.done
208
+ };
209
+ }
210
+ "throw" === p.type && (o = s, n.method = "throw", n.arg = p.arg);
211
+ }
212
+ };
213
+ }
214
+ function maybeInvokeDelegate(e, r) {
215
+ var n = r.method,
216
+ o = e.iterator[n];
217
+ if (o === t) return r.delegate = null, "throw" === n && e.iterator.return && (r.method = "return", r.arg = t, maybeInvokeDelegate(e, r), "throw" === r.method) || "return" !== n && (r.method = "throw", r.arg = new TypeError("The iterator does not provide a '" + n + "' method")), y;
218
+ var i = tryCatch(o, e.iterator, r.arg);
219
+ if ("throw" === i.type) return r.method = "throw", r.arg = i.arg, r.delegate = null, y;
220
+ var a = i.arg;
221
+ return a ? a.done ? (r[e.resultName] = a.value, r.next = e.nextLoc, "return" !== r.method && (r.method = "next", r.arg = t), r.delegate = null, y) : a : (r.method = "throw", r.arg = new TypeError("iterator result is not an object"), r.delegate = null, y);
222
+ }
223
+ function pushTryEntry(t) {
224
+ var e = {
225
+ tryLoc: t[0]
226
+ };
227
+ 1 in t && (e.catchLoc = t[1]), 2 in t && (e.finallyLoc = t[2], e.afterLoc = t[3]), this.tryEntries.push(e);
228
+ }
229
+ function resetTryEntry(t) {
230
+ var e = t.completion || {};
231
+ e.type = "normal", delete e.arg, t.completion = e;
232
+ }
233
+ function Context(t) {
234
+ this.tryEntries = [{
235
+ tryLoc: "root"
236
+ }], t.forEach(pushTryEntry, this), this.reset(!0);
237
+ }
238
+ function values(e) {
239
+ if (e || "" === e) {
240
+ var r = e[a];
241
+ if (r) return r.call(e);
242
+ if ("function" == typeof e.next) return e;
243
+ if (!isNaN(e.length)) {
244
+ var o = -1,
245
+ i = function next() {
246
+ for (; ++o < e.length;) if (n.call(e, o)) return next.value = e[o], next.done = !1, next;
247
+ return next.value = t, next.done = !0, next;
248
+ };
249
+ return i.next = i;
250
+ }
251
+ }
252
+ throw new TypeError(typeof e + " is not iterable");
253
+ }
254
+ return GeneratorFunction.prototype = GeneratorFunctionPrototype, o(g, "constructor", {
255
+ value: GeneratorFunctionPrototype,
256
+ configurable: !0
257
+ }), o(GeneratorFunctionPrototype, "constructor", {
258
+ value: GeneratorFunction,
259
+ configurable: !0
260
+ }), GeneratorFunction.displayName = define(GeneratorFunctionPrototype, u, "GeneratorFunction"), e.isGeneratorFunction = function (t) {
261
+ var e = "function" == typeof t && t.constructor;
262
+ return !!e && (e === GeneratorFunction || "GeneratorFunction" === (e.displayName || e.name));
263
+ }, e.mark = function (t) {
264
+ return Object.setPrototypeOf ? Object.setPrototypeOf(t, GeneratorFunctionPrototype) : (t.__proto__ = GeneratorFunctionPrototype, define(t, u, "GeneratorFunction")), t.prototype = Object.create(g), t;
265
+ }, e.awrap = function (t) {
266
+ return {
267
+ __await: t
268
+ };
269
+ }, defineIteratorMethods(AsyncIterator.prototype), define(AsyncIterator.prototype, c, function () {
270
+ return this;
271
+ }), e.AsyncIterator = AsyncIterator, e.async = function (t, r, n, o, i) {
272
+ void 0 === i && (i = Promise);
273
+ var a = new AsyncIterator(wrap(t, r, n, o), i);
274
+ return e.isGeneratorFunction(r) ? a : a.next().then(function (t) {
275
+ return t.done ? t.value : a.next();
276
+ });
277
+ }, defineIteratorMethods(g), define(g, u, "Generator"), define(g, a, function () {
278
+ return this;
279
+ }), define(g, "toString", function () {
280
+ return "[object Generator]";
281
+ }), e.keys = function (t) {
282
+ var e = Object(t),
283
+ r = [];
284
+ for (var n in e) r.push(n);
285
+ return r.reverse(), function next() {
286
+ for (; r.length;) {
287
+ var t = r.pop();
288
+ if (t in e) return next.value = t, next.done = !1, next;
289
+ }
290
+ return next.done = !0, next;
291
+ };
292
+ }, e.values = values, Context.prototype = {
293
+ constructor: Context,
294
+ reset: function (e) {
295
+ if (this.prev = 0, this.next = 0, this.sent = this._sent = t, this.done = !1, this.delegate = null, this.method = "next", this.arg = t, this.tryEntries.forEach(resetTryEntry), !e) for (var r in this) "t" === r.charAt(0) && n.call(this, r) && !isNaN(+r.slice(1)) && (this[r] = t);
296
+ },
297
+ stop: function () {
298
+ this.done = !0;
299
+ var t = this.tryEntries[0].completion;
300
+ if ("throw" === t.type) throw t.arg;
301
+ return this.rval;
302
+ },
303
+ dispatchException: function (e) {
304
+ if (this.done) throw e;
305
+ var r = this;
306
+ function handle(n, o) {
307
+ return a.type = "throw", a.arg = e, r.next = n, o && (r.method = "next", r.arg = t), !!o;
308
+ }
309
+ for (var o = this.tryEntries.length - 1; o >= 0; --o) {
310
+ var i = this.tryEntries[o],
311
+ a = i.completion;
312
+ if ("root" === i.tryLoc) return handle("end");
313
+ if (i.tryLoc <= this.prev) {
314
+ var c = n.call(i, "catchLoc"),
315
+ u = n.call(i, "finallyLoc");
316
+ if (c && u) {
317
+ if (this.prev < i.catchLoc) return handle(i.catchLoc, !0);
318
+ if (this.prev < i.finallyLoc) return handle(i.finallyLoc);
319
+ } else if (c) {
320
+ if (this.prev < i.catchLoc) return handle(i.catchLoc, !0);
321
+ } else {
322
+ if (!u) throw Error("try statement without catch or finally");
323
+ if (this.prev < i.finallyLoc) return handle(i.finallyLoc);
324
+ }
325
+ }
326
+ }
327
+ },
328
+ abrupt: function (t, e) {
329
+ for (var r = this.tryEntries.length - 1; r >= 0; --r) {
330
+ var o = this.tryEntries[r];
331
+ if (o.tryLoc <= this.prev && n.call(o, "finallyLoc") && this.prev < o.finallyLoc) {
332
+ var i = o;
333
+ break;
334
+ }
335
+ }
336
+ i && ("break" === t || "continue" === t) && i.tryLoc <= e && e <= i.finallyLoc && (i = null);
337
+ var a = i ? i.completion : {};
338
+ return a.type = t, a.arg = e, i ? (this.method = "next", this.next = i.finallyLoc, y) : this.complete(a);
339
+ },
340
+ complete: function (t, e) {
341
+ if ("throw" === t.type) throw t.arg;
342
+ return "break" === t.type || "continue" === t.type ? this.next = t.arg : "return" === t.type ? (this.rval = this.arg = t.arg, this.method = "return", this.next = "end") : "normal" === t.type && e && (this.next = e), y;
343
+ },
344
+ finish: function (t) {
345
+ for (var e = this.tryEntries.length - 1; e >= 0; --e) {
346
+ var r = this.tryEntries[e];
347
+ if (r.finallyLoc === t) return this.complete(r.completion, r.afterLoc), resetTryEntry(r), y;
348
+ }
349
+ },
350
+ catch: function (t) {
351
+ for (var e = this.tryEntries.length - 1; e >= 0; --e) {
352
+ var r = this.tryEntries[e];
353
+ if (r.tryLoc === t) {
354
+ var n = r.completion;
355
+ if ("throw" === n.type) {
356
+ var o = n.arg;
357
+ resetTryEntry(r);
358
+ }
359
+ return o;
360
+ }
361
+ }
362
+ throw Error("illegal catch attempt");
363
+ },
364
+ delegateYield: function (e, r, n) {
365
+ return this.delegate = {
366
+ iterator: values(e),
367
+ resultName: r,
368
+ nextLoc: n
369
+ }, "next" === this.method && (this.arg = t), y;
370
+ }
371
+ }, e;
372
+ }
373
+ function _setPrototypeOf(t, e) {
374
+ return _setPrototypeOf = Object.setPrototypeOf ? Object.setPrototypeOf.bind() : function (t, e) {
375
+ return t.__proto__ = e, t;
376
+ }, _setPrototypeOf(t, e);
377
+ }
378
+ function _unsupportedIterableToArray(r, a) {
379
+ if (r) {
380
+ if ("string" == typeof r) return _arrayLikeToArray(r, a);
381
+ var t = {}.toString.call(r).slice(8, -1);
382
+ return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? Array.from(r) : "Arguments" === t || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t) ? _arrayLikeToArray(r, a) : void 0;
383
+ }
384
+ }
385
+
386
+ function evaluate(_x, _x2, _x3) {
387
+ return _evaluate.apply(this, arguments);
388
+ }
389
+ function _evaluate() {
390
+ _evaluate = _asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(agent, input, metric) {
391
+ var agentOutput, metricResult;
392
+ return _regeneratorRuntime().wrap(function _callee$(_context) {
393
+ while (1) switch (_context.prev = _context.next) {
394
+ case 0:
395
+ _context.next = 2;
396
+ return agent.generate(input);
397
+ case 2:
398
+ agentOutput = _context.sent;
399
+ _context.next = 5;
400
+ return metric.measure({
401
+ input: input.toString(),
402
+ output: agentOutput.text
403
+ });
404
+ case 5:
405
+ metricResult = _context.sent;
406
+ // capture infomration about the evaluation
407
+ core.executeHook(core.AvailableHooks.ON_EVALUATION, {
408
+ input: input.toString(),
409
+ output: agentOutput.text,
410
+ result: metricResult
411
+ });
412
+ return _context.abrupt("return", metricResult);
413
+ case 8:
414
+ case "end":
415
+ return _context.stop();
416
+ }
417
+ }, _callee);
418
+ }));
419
+ return _evaluate.apply(this, arguments);
420
+ }
421
+
422
+ var MastraAgentJudge = function MastraAgentJudge(name, instructions, model) {
423
+ this.agent = void 0;
424
+ this.agent = new core.Agent({
425
+ name: "Mastra Eval Judge " + model.provider + " " + name,
426
+ instructions: instructions,
427
+ model: model
428
+ });
429
+ };
430
+
431
+ var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = "You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.\n\nKey Principles:\n1. Evaluate whether the output addresses what the input is asking for\n2. Consider both direct answers and related context\n3. Prioritize relevance to the input over correctness\n4. Recognize that responses can be partially relevant\n5. Empty inputs or error messages should always be marked as \"no\"";
432
+ function generateEvaluationStatementsPrompt(_ref) {
433
+ var output = _ref.output;
434
+ return "Given the text, break it down into meaningful statements while preserving context and relationships.\nDon't split too aggressively.\n\nSplit compound statements particularly when they:\n- Are joined by \"and\"\n- Contain multiple distinct facts or claims\n- Have multiple descriptive elements about the subject\n\n\nHandle special cases:\n- A single word answer should be treated as a complete statement\n- Error messages should be treated as a single statement\n- Empty strings should return an empty list\n- When splitting text, keep related information together\n\nExample:\nExample text: Look! A bird! Birds are an interesting animal.\n\n{{\n \"statements\": [\"Look!\", \"A bird!\", \"Birds are interesting animals.\"]\n}}\n\nPlease return only JSON format with \"statements\" array.\nReturn empty list for empty input.\n\nText:\n" + output + "\n\nJSON:\n";
435
+ }
436
+ function generateEvaluatePrompt$3(_ref2) {
437
+ var input = _ref2.input,
438
+ statements = _ref2.statements;
439
+ return "Evaluate each statement's relevance to the input question, considering direct answers, related context, and uncertain cases.\n\n Return JSON with array of verdict objects. Each verdict must include:\n - \"verdict\": \"yes\", \"no\", or \"unsure\"\n - \"reason\": Clear explanation of the verdict\n - Exact match between number of verdicts and statements\n\n Verdict Guidelines:\n - \"yes\": Statement explicitly and directly answers the input question\n * Contains specific answer to the question asked (e.g., \"The color of the sky is blue\")\n * States explicit relationship between key concepts (e.g., \"X is the CEO of company Y\")\n * Can stand alone as a complete answer\n * Contains appropriate question-type response (e.g., location for \"where\", person for \"who\")\n\n - \"unsure\": Statement shows partial relevance when it:\n * Contains topic-related administrative/governance terms without direct answer\n * Mentions locations or entities related to the answer without specifying their role\n * References functions or characteristics typically associated with the answer\n * Is incorrect but shows understanding of the question\n * Uses importance indicators (\"main\", \"primary\", \"major\") with relevant concepts\n * Includes indirect references to the answer (e.g., \"where the president works\")\n * Contains multiple relevant concepts but lacks explicit relationship between them\n * Demonstrates understanding of question domain without providing specific answer\n\n - \"no\": Statement lacks meaningful connection to question when it:\n * Contains no concepts related to the question type or domain\n * Only mentions the broader topic without relevant details (e.g., \"the country has nice weather\")\n * Provides general descriptions without addressing the specific question\n * Contains purely tangential information about the subject\n * Consists of empty or meaningless content\n * Discusses characteristics unrelated to the question type (e.g., describing 
cuisine when asked about geography)\n * Note: Assessment is about topical relationship, not factual accuracy\n\n REMEMBER: A statmenent does not have to be correct, it just has to be relevant.\n If the statement contains words or phrases that are relevant to the input, it is partially relevant.\n If the statement is a direct answer to the input, it is relevant.\n If the statement is completely unrelated to the input or contains nothing, it is not relevant.\n DO NOT MAKE A JUDGEMENT ON THE CORRECTNESS OF THE STATEMENT, JUST THE RELEVANCY.\n\n\n Example:\n Input: \"What color is the sky during daytime?\"\n Statements: [\n \"The sky is blue during daytime\",\n \"The sky is full of clouds\", \n \"I had breakfast today\",\n \"Blue is a beautiful color\",\n \"Many birds fly in the sky\",\n \"\",\n \"The sky is purple during daytime\",\n ]\n JSON:\n {{\n \"verdicts\": [\n {{\n \"verdict\": \"yes\",\n \"reason\": \"This statement explicitly answers what color the sky is during daytime\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement describes the sky but doesn't address its color\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement about breakfast is completely unrelated to the sky\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement about blue is related to color but doesn't address the sky\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement is about the sky but doesn't address its color\"\n }},\n {{\n \"verdict\": \"no\",\n \"reason\": \"This statement is empty\"\n }},\n {{\n \"verdict\": \"unsure\",\n \"reason\": \"This statement is incorrect but contains relevant information and still addresses the question\"\n }}\n ]\n }}\n\n Input:\n " + input + "\n\n Statements:\n " + statements.join('\n') + "\n\n JSON:\n ";
440
+ }
441
+ function generateReasonPrompt$3(_ref3) {
442
+ var score = _ref3.score,
443
+ reasons = _ref3.reasons,
444
+ input = _ref3.input,
445
+ output = _ref3.output;
446
+ return "Explain the irrelevancy score (0-10) for the LLM's response using this context:\n Context:\n Input: " + input + "\n Output: " + output + "\n Score: " + score + "\n Irrelevancy Reasons: " + reasons.join('\n') + "\n \n Rules:\n - Explain score based on mix of direct answers and related context\n - Consider both full and partial relevance\n - Keep explanation concise and focused\n - Use given score, don't recalculate\n - Don't judge factual correctness\n - Explain both relevant and irrelevant aspects\n - For mixed responses, explain the balance\n\n Format:\n {\n \"reason\": \"The score is {score} because {explanation of overall relevance}\"\n }\n\n Example Responses:\n {\n \"reason\": \"The score is 7 because while the first statement directly answers the question, the additional context is only partially relevant\"\n }\n {\n \"reason\": \"The score is 3 because while the answer discusses the right topic, it doesn't directly address the question\"\n }\n ";
447
+ }
448
+
449
+ var AnswerRelevancyJudge = /*#__PURE__*/function (_MastraAgentJudge) {
450
+ function AnswerRelevancyJudge(model) {
451
+ return _MastraAgentJudge.call(this, 'Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, model) || this;
452
+ }
453
+ _inheritsLoose(AnswerRelevancyJudge, _MastraAgentJudge);
454
+ var _proto = AnswerRelevancyJudge.prototype;
455
+ _proto.evaluate = /*#__PURE__*/function () {
456
+ var _evaluate = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(input, actualOutput) {
457
+ var statementPrompt, statements, prompt, result;
458
+ return _regeneratorRuntime().wrap(function _callee$(_context) {
459
+ while (1) switch (_context.prev = _context.next) {
460
+ case 0:
461
+ statementPrompt = generateEvaluationStatementsPrompt({
462
+ output: actualOutput
463
+ });
464
+ _context.next = 3;
465
+ return this.agent.generate(statementPrompt, {
466
+ output: zod.z.object({
467
+ statements: zod.z.array(zod.z.string())
468
+ })
469
+ });
470
+ case 3:
471
+ statements = _context.sent;
472
+ prompt = generateEvaluatePrompt$3({
473
+ input: input,
474
+ statements: statements.object.statements
475
+ });
476
+ _context.next = 7;
477
+ return this.agent.generate(prompt, {
478
+ output: zod.z.object({
479
+ verdicts: zod.z.array(zod.z.object({
480
+ verdict: zod.z.string(),
481
+ reason: zod.z.string()
482
+ }))
483
+ })
484
+ });
485
+ case 7:
486
+ result = _context.sent;
487
+ return _context.abrupt("return", result.object.verdicts);
488
+ case 9:
489
+ case "end":
490
+ return _context.stop();
491
+ }
492
+ }, _callee, this);
493
+ }));
494
+ function evaluate(_x, _x2) {
495
+ return _evaluate.apply(this, arguments);
496
+ }
497
+ return evaluate;
498
+ }();
499
+ _proto.getReason = /*#__PURE__*/function () {
500
+ var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, reasons) {
501
+ var prompt, result;
502
+ return _regeneratorRuntime().wrap(function _callee2$(_context2) {
503
+ while (1) switch (_context2.prev = _context2.next) {
504
+ case 0:
505
+ prompt = generateReasonPrompt$3({
506
+ input: input,
507
+ output: actualOutput,
508
+ reasons: reasons,
509
+ score: score
510
+ });
511
+ _context2.next = 3;
512
+ return this.agent.generate(prompt, {
513
+ output: zod.z.object({
514
+ reason: zod.z.string()
515
+ })
516
+ });
517
+ case 3:
518
+ result = _context2.sent;
519
+ return _context2.abrupt("return", result.object.reason);
520
+ case 5:
521
+ case "end":
522
+ return _context2.stop();
523
+ }
524
+ }, _callee2, this);
525
+ }));
526
+ function getReason(_x3, _x4, _x5, _x6) {
527
+ return _getReason.apply(this, arguments);
528
+ }
529
+ return getReason;
530
+ }();
531
+ return AnswerRelevancyJudge;
532
+ }(MastraAgentJudge);
533
+
534
+ var AnswerRelevancyMetric = /*#__PURE__*/function (_Metric) {
535
+ function AnswerRelevancyMetric(model, _temp) {
536
+ var _this;
537
+ var _ref = _temp === void 0 ? {
538
+ uncertaintyWeight: 0.3,
539
+ scale: 10
540
+ } : _temp,
541
+ uncertaintyWeight = _ref.uncertaintyWeight,
542
+ scale = _ref.scale;
543
+ _this = _Metric.call(this) || this;
544
+ _this.judge = void 0;
545
+ _this.uncertaintyWeight = void 0;
546
+ _this.scale = void 0;
547
+ _this.uncertaintyWeight = uncertaintyWeight;
548
+ _this.judge = new AnswerRelevancyJudge(model);
549
+ _this.scale = scale;
550
+ return _this;
551
+ }
552
+ _inheritsLoose(AnswerRelevancyMetric, _Metric);
553
+ var _proto = AnswerRelevancyMetric.prototype;
554
+ _proto.measure = /*#__PURE__*/function () {
555
+ var _measure = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(_ref2) {
556
+ var input, output, verdicts, score, reason;
557
+ return _regeneratorRuntime().wrap(function _callee$(_context) {
558
+ while (1) switch (_context.prev = _context.next) {
559
+ case 0:
560
+ input = _ref2.input, output = _ref2.output;
561
+ _context.next = 3;
562
+ return this.judge.evaluate(input, output);
563
+ case 3:
564
+ verdicts = _context.sent;
565
+ score = this.calculateScore(verdicts);
566
+ _context.next = 7;
567
+ return this.generateReason(input, output, score, verdicts);
568
+ case 7:
569
+ reason = _context.sent;
570
+ return _context.abrupt("return", {
571
+ score: score,
572
+ reason: reason
573
+ });
574
+ case 9:
575
+ case "end":
576
+ return _context.stop();
577
+ }
578
+ }, _callee, this);
579
+ }));
580
+ function measure(_x) {
581
+ return _measure.apply(this, arguments);
582
+ }
583
+ return measure;
584
+ }();
585
// generateReason(input, output, score, verdicts): gathers the `reason` text
// from every failing ('no') verdict and asks the judge to explain the final
// score in terms of those failures. Resolves to the judge's reason string.
// (Babel/regenerator expansion of an async method.)
_proto.generateReason = /*#__PURE__*/function () {
  var _generateReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, output, score, verdicts) {
    var reasonsForVerdicts, _iterator, _step, _step$value, verdict, _reason, reason;
    return _regeneratorRuntime().wrap(function _callee2$(_context2) {
      while (1) switch (_context2.prev = _context2.next) {
        case 0:
          reasonsForVerdicts = [];
          // Only 'no' verdicts contribute; `verdicts || []` guards a
          // missing/undefined verdict list.
          for (_iterator = _createForOfIteratorHelperLoose(verdicts || []); !(_step = _iterator()).done;) {
            _step$value = _step.value, verdict = _step$value.verdict, _reason = _step$value.reason;
            if (verdict.trim().toLowerCase() === 'no') {
              reasonsForVerdicts.push(_reason);
            }
          }
          _context2.next = 4;
          // await this.judge.getReason(...)
          return this.judge.getReason(input, output, score, reasonsForVerdicts);
        case 4:
          reason = _context2.sent;
          return _context2.abrupt("return", reason);
        case 6:
        case "end":
          return _context2.stop();
      }
    }, _callee2, this);
  }));
  function generateReason(_x2, _x3, _x4, _x5) {
    return _generateReason.apply(this, arguments);
  }
  return generateReason;
}();
614
// calculateScore(evaluation): folds the judge's verdicts into a rounded score.
// Each 'yes' verdict counts 1, each 'unsure' counts `this.uncertaintyWeight`,
// everything else 0; the average credit is multiplied by `this.scale` and
// rounded. An empty or missing verdict list scores a full 1 (nothing to fail).
_proto.calculateScore = function calculateScore(evaluation) {
  var total = (evaluation && evaluation.length) || 0;
  if (total === 0) {
    return 1;
  }
  var uncertain = this.uncertaintyWeight;
  var credit = 0;
  for (var i = 0; i < total; i++) {
    var label = evaluation[i].verdict.trim().toLowerCase();
    if (label === 'yes') {
      credit += 1;
    } else if (label === 'unsure') {
      credit += uncertain;
    }
  }
  return Math.round((credit / total) * this.scale);
};
631
+ return AnswerRelevancyMetric;
632
+ }(core.Metric);
633
+
634
// Heuristic (non-LLM) completeness metric. Uses the `nlp` (compromise) parser
// to extract key elements (nouns, verbs, topics, terms) from both input and
// output, then scores what fraction of the input's elements the output covers.
var CompletenessMetric = /*#__PURE__*/function (_Metric) {
  function CompletenessMetric() {
    return _Metric.apply(this, arguments) || this;
  }
  _inheritsLoose(CompletenessMetric, _Metric);
  var _proto = CompletenessMetric.prototype;
  // measure({ input, output }) -> { score, details, confidence, metrics }.
  // Throws when either side is null/undefined. (Regenerator-expanded async.)
  _proto.measure = /*#__PURE__*/function () {
    var _measure = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(_ref) {
      var input, output, inputDoc, outputDoc, inputElements, outputElements, coverage;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            input = _ref.input, output = _ref.output;
            if (!(input === null || input === undefined || output === null || output === undefined)) {
              _context.next = 3;
              break;
            }
            throw new Error('Inputs cannot be null or undefined');
          case 3:
            // Trim both inputs before parsing.
            input = input.trim();
            output = output.trim();
            inputDoc = nlp(input);
            outputDoc = nlp(output);
            // Extract comparable term lists from each document.
            inputElements = this.extractElements(inputDoc);
            outputElements = this.extractElements(outputDoc);
            coverage = this.calculateCoverage(inputElements, outputElements);
            return _context.abrupt("return", {
              score: coverage,
              details: "Completeness score: " + (coverage * 100).toFixed(1) + "%",
              confidence: 0.8,
              metrics: {
                inputElements: inputElements,
                outputElements: outputElements,
                // Input elements with no (even fuzzy) match in the output.
                missingElements: inputElements.filter(function (e) {
                  return !outputElements.includes(e);
                }),
                elementCounts: {
                  input: inputElements.length,
                  output: outputElements.length
                }
              }
            });
          case 11:
          case "end":
            return _context.stop();
        }
      }, _callee, this);
    }));
    function measure(_x) {
      return _measure.apply(this, arguments);
    }
    return measure;
  }();
  // extractElements(doc): returns a de-duplicated array of normalized words
  // pulled from a compromise document — nouns, infinitive verbs, topics, and
  // raw terms, each lower-cased, diacritic-stripped, and split on
  // non-alphanumeric runs.
  _proto.extractElements = function extractElements(doc) {
    var _this = this;
    // Each selector may return undefined; default to an empty array.
    var nouns = doc.nouns().out('array') || [];
    var verbs = doc.verbs().toInfinitive().out('array') || [];
    var topics = doc.topics().out('array') || [];
    var terms = doc.terms().out('array') || [];
    // Helper: normalize a raw term and split it into clean word tokens.
    var cleanAndSplitTerm = function cleanAndSplitTerm(term) {
      var normalized = _this.normalizeString(term);
      // NOTE: the camelCase split is a no-op here because normalizeString
      // has already lower-cased the text; kept for parity with the source.
      return normalized.replace(/([a-z])([A-Z])/g, '$1 $2') // Split camelCase
      .replace(/[^a-z0-9]+/g, ' ') // Replace non-alphanumeric with spaces
      .trim().split(/\s+/).filter(function (word) {
        return word.length > 0;
      });
    };
    var processedTerms = [].concat(nouns.flatMap(cleanAndSplitTerm), verbs.flatMap(cleanAndSplitTerm), topics.flatMap(cleanAndSplitTerm), terms.flatMap(cleanAndSplitTerm));
    // BUG FIX: `[].concat(new Set(...))` does NOT spread a Set (Sets are not
    // concat-spreadable), so it returned a one-element array containing the
    // Set object itself rather than the de-duplicated strings. Array.from
    // materializes the unique terms as intended.
    return Array.from(new Set(processedTerms));
  };
  // normalizeString(str): strip diacritics (NFD decomposition + combining
  // mark removal) and lower-case.
  _proto.normalizeString = function normalizeString(str) {
    return str.normalize('NFD').replace(/[\u0300-\u036f]/g, '').toLowerCase();
  };
  // calculateCoverage(original, simplified): fraction of `original` elements
  // that have a match in `simplified`. Short words (<= 3 chars) need an exact
  // match; longer words match when one contains the other and the shorter is
  // more than 60% of the longer's length.
  _proto.calculateCoverage = function calculateCoverage(original, simplified) {
    var _this2 = this;
    if (original.length === 0) {
      return simplified.length === 0 ? 1 : 0;
    }
    var covered = original.filter(function (element) {
      return simplified.some(function (s) {
        var elem = _this2.normalizeString(element);
        var simp = _this2.normalizeString(s);
        // For short words (3 chars or less), require exact match.
        if (elem.length <= 3) {
          return elem === simp;
        }
        // For longer words, require substantial overlap (> 60% of the longer).
        var longer = elem.length > simp.length ? elem : simp;
        var shorter = elem.length > simp.length ? simp : elem;
        if (longer.includes(shorter)) {
          return shorter.length / longer.length > 0.6;
        }
        return false;
      });
    });
    return covered.length / original.length;
  };
  return CompletenessMetric;
}(core.Metric);
742
+
743
// String-similarity metric: compares input and output with the
// `string-similarity` package's compareTwoStrings. Options (both default to
// true): `ignoreCase` lower-cases both strings, `ignoreWhitespace` collapses
// whitespace runs to single spaces before comparing.
var ContentSimilarityMetric = /*#__PURE__*/function (_Metric) {
  function ContentSimilarityMetric(options) {
    var _this;
    if (options === void 0) {
      options = {};
    }
    _this = _Metric.call(this) || this;
    _this.options = void 0;
    // Caller-supplied options override the defaults.
    _this.options = _extends({
      ignoreCase: true,
      ignoreWhitespace: true
    }, options);
    return _this;
  }
  _inheritsLoose(ContentSimilarityMetric, _Metric);
  var _proto = ContentSimilarityMetric.prototype;
  // measure({ input, output }) -> { score, details, confidence, metrics };
  // `score` is the similarity value returned by compareTwoStrings.
  // (Babel/regenerator expansion of an async method.)
  _proto.measure = /*#__PURE__*/function () {
    var _measure = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(_ref) {
      var input, output, processedInput, processedOutput, similarity;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            input = _ref.input, output = _ref.output;
            processedInput = input;
            processedOutput = output;
            if (this.options.ignoreCase) {
              processedInput = processedInput.toLowerCase();
              processedOutput = processedOutput.toLowerCase();
            }
            if (this.options.ignoreWhitespace) {
              processedInput = processedInput.replace(/\s+/g, ' ').trim();
              processedOutput = processedOutput.replace(/\s+/g, ' ').trim();
            }
            similarity = stringSimilarity.compareTwoStrings(processedInput, processedOutput);
            return _context.abrupt("return", {
              score: similarity,
              details: "Content similarity: " + (similarity * 100).toFixed(1) + "%",
              confidence: 0.9,
              metrics: {
                similarity: similarity
              }
            });
          case 7:
          case "end":
            return _context.stop();
        }
      }, _callee, this);
    }));
    function measure(_x) {
      return _measure.apply(this, arguments);
    }
    return measure;
  }();
  return ContentSimilarityMetric;
}(core.Metric);
798
+
799
// System instructions for the context-position judge agent.
var CONTEXT_POSITION_AGENT_INSTRUCTIONS = "You are a balanced and nuanced context position evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output, with special attention to their ordering.\n\nKey Principles:\n1. Evaluate whether each context node contributes to understanding the expected output - both directly AND indirectly\n2. Consider all forms of relevance:\n - Direct definitions or explanations\n - Supporting evidence or examples\n - Related characteristics or behaviors\n - Real-world applications or effects\n3. Pay attention to the position of relevant information\n4. Recognize that earlier positions should contain more relevant information\n5. Be inclusive rather than exclusive in determining relevance - if the information supports or reinforces the output in any way, consider it relevant\n6. Empty or error nodes should be marked as not relevant";
// Builds the per-call evaluation prompt for the context-position judge.
// Takes { input, output, context } and returns the full prompt text; the
// segments are joined (string-coercing each piece) rather than concatenated
// with `+`, which yields the identical string.
function generateEvaluatePrompt$2(args) {
  var segments = ["Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nConsider context relevant if it:\n- Directly addresses the input\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n- Demonstrates or validates characteristics mentioned in the output\n- Shows real-world applications or effects of the concept\n- Reinforces or provides evidence for any part of the output\n- Helps establish credibility or understanding of the subject\n- Describes the actions the subject can perform\n\nA context piece should be considered relevant if it contributes ANY supporting information or evidence, even if indirect.\n\nExample:\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is.\"\n }\n ] \n}\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n", args.input, "\n\nOutput:\n", args.output, "\n\nContext:\n", args.context, "\n\nJSON:\n"];
  return segments.join('');
}
806
// Builds the prompt asking the context-position judge to justify a score.
// Takes { input, output, verdicts, score } and returns the prompt text; the
// segments are joined (string-coercing each piece) rather than concatenated
// with `+`, which yields the identical string.
function generateReasonPrompt$2(args) {
  var segments = ["Given the input, output, verdicts, and position score, provide a BRIEF explanation for the score. Focus on both relevance and positioning of the context.\n The retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (10.0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPosition Score:\n", args.score, "\n\nInput:\n", args.input, "\n\nOutput:\n", args.output, "\n\nContext:\n", args.verdicts, "\n\nJSON:\n"];
  return segments.join('');
}
813
+
814
// LLM judge for the context-position metric. Wraps a Mastra agent configured
// with CONTEXT_POSITION_AGENT_INSTRUCTIONS; `evaluate` produces one relevance
// verdict per context piece, `getReason` explains a final score. Both methods
// are Babel/regenerator expansions of async code.
var ContextPositionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  function ContextPositionJudge(model) {
    return _MastraAgentJudge.call(this, 'Context Position', CONTEXT_POSITION_AGENT_INSTRUCTIONS, model) || this;
  }
  _inheritsLoose(ContextPositionJudge, _MastraAgentJudge);
  var _proto = ContextPositionJudge.prototype;
  // evaluate(input, actualOutput, retrievalContext): asks the agent to judge
  // each context piece; the zod schema constrains the structured output to
  // { verdicts: [{ verdict, reason }] }. Resolves to the verdicts array.
  _proto.evaluate = /*#__PURE__*/function () {
    var _evaluate = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(input, actualOutput, retrievalContext) {
      var prompt, result;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            prompt = generateEvaluatePrompt$2({
              input: input,
              output: actualOutput,
              context: retrievalContext
            });
            _context.next = 3;
            // await this.agent.generate(...) with a structured-output schema.
            return this.agent.generate(prompt, {
              output: zod.z.object({
                verdicts: zod.z.array(zod.z.object({
                  verdict: zod.z.string(),
                  reason: zod.z.string()
                }))
              })
            });
          case 3:
            result = _context.sent;
            return _context.abrupt("return", result.object.verdicts);
          case 5:
          case "end":
            return _context.stop();
        }
      }, _callee, this);
    }));
    function evaluate(_x, _x2, _x3) {
      return _evaluate.apply(this, arguments);
    }
    return evaluate;
  }();
  // getReason(input, actualOutput, score, verdicts): asks the agent to explain
  // the score; resolves to the reason string.
  _proto.getReason = /*#__PURE__*/function () {
    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, verdicts) {
      var prompt, result;
      return _regeneratorRuntime().wrap(function _callee2$(_context2) {
        while (1) switch (_context2.prev = _context2.next) {
          case 0:
            prompt = generateReasonPrompt$2({
              input: input,
              output: actualOutput,
              verdicts: verdicts,
              score: score
            });
            _context2.next = 3;
            return this.agent.generate(prompt, {
              output: zod.z.object({
                reason: zod.z.string()
              })
            });
          case 3:
            result = _context2.sent;
            return _context2.abrupt("return", result.object.reason);
          case 5:
          case "end":
            return _context2.stop();
        }
      }, _callee2, this);
    }));
    function getReason(_x4, _x5, _x6, _x7) {
      return _getReason.apply(this, arguments);
    }
    return getReason;
  }();
  return ContextPositionJudge;
}(MastraAgentJudge);
888
+
889
// Metric scoring how well relevant context pieces are ordered: earlier
// positions carry more weight. Options: { scale } (default 10) multiplies the
// normalized score.
var ContextPositionMetric = /*#__PURE__*/function (_Metric) {
  function ContextPositionMetric(model, _temp) {
    var _this;
    var _ref = _temp === void 0 ? {} : _temp,
      _ref$scale = _ref.scale,
      scale = _ref$scale === void 0 ? 10 : _ref$scale;
    _this = _Metric.call(this) || this;
    _this.judge = void 0;
    _this.scale = void 0;
    _this.judge = new ContextPositionJudge(model);
    _this.scale = scale;
    return _this;
  }
  _inheritsLoose(ContextPositionMetric, _Metric);
  var _proto = ContextPositionMetric.prototype;
  // measure({ input, output, context }): asks the judge for per-context
  // verdicts, scores them by position, and fetches an explanation.
  // Resolves to { score, reason }. (Regenerator-expanded async method.)
  _proto.measure = /*#__PURE__*/function () {
    var _measure = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(_ref2) {
      var input, output, context, verdicts, score, reason;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            input = _ref2.input, output = _ref2.output, context = _ref2.context;
            _context.next = 3;
            return this.judge.evaluate(input, output, context);
          case 3:
            verdicts = _context.sent;
            score = this.calculateScore(verdicts);
            _context.next = 7;
            return this.judge.getReason(input, output, score, verdicts);
          case 7:
            reason = _context.sent;
            return _context.abrupt("return", {
              score: score,
              reason: reason
            });
          case 9:
          case "end":
            return _context.stop();
        }
      }, _callee, this);
    }));
    function measure(_x) {
      return _measure.apply(this, arguments);
    }
    return measure;
  }();
  // calculateScore(verdicts): position-weighted relevance. A relevant verdict
  // at index i contributes 1/(i+1); the sum is normalized by the maximum
  // possible weighted sum (all relevant) and multiplied by `scale`.
  // Empty verdict lists and all-'no' lists score 0.
  _proto.calculateScore = function calculateScore(verdicts) {
    var totalVerdicts = (verdicts == null ? void 0 : verdicts.length) || 0;
    if (totalVerdicts === 0) {
      return 0;
    }
    // Convert to binary scores (1 for yes, 0 for no)
    var binaryScores = verdicts.map(function (v) {
      return v.verdict.trim().toLowerCase() === 'yes' ? 1 : 0;
    });
    var weightedSum = 0;
    var maxPossibleSum = 0; // Track the maximum possible sum for normalization
    // Calculate position-weighted scores
    binaryScores.forEach(function (isRelevant, index) {
      var positionWeight = 1 / (index + 1);
      if (isRelevant) {
        weightedSum += positionWeight;
      }
      maxPossibleSum += positionWeight; // Add to max possible sum regardless of relevance
    });
    if (weightedSum === 0) {
      return 0;
    }
    // Normalize against the maximum possible score
    var finalScore = weightedSum / maxPossibleSum * this.scale;
    return finalScore;
  };
  return ContextPositionMetric;
}(core.Metric);
963
+
964
// System instructions for the context-precision judge agent.
var CONTEXT_PRECISION_AGENT_INSTRUCTIONS = "You are a balanced and nuanced context precision evaluator. Your job is to determine if retrieved context nodes are relevant to generating the expected output.\n\nKey Principles:\n1. Evaluate whether each context node was useful in generating the expected output\n2. Consider both direct and indirect relevance\n3. Prioritize usefulness over completeness\n4. Recognize that some nodes may be partially relevant\n5. Empty or error nodes should be marked as not relevant";
// Builds the per-call evaluation prompt for the context-precision judge.
// Takes { input, output, context } and returns the full prompt text.
function generateEvaluatePrompt$1(_ref) {
  var input = _ref.input,
    output = _ref.output,
    context = _ref.context;
  return "Given the input, output, and context, evaluate each context piece's relevance by generating a list of JSON objects.\n\n**\nIMPORTANT: Your response must be in JSON format with a 'verdicts' key containing a list. Each verdict must have only two fields: `verdict` with either 'yes' or 'no', and `reason` explaining the verdict. Your reason should include relevant quotes from the context.\n\nExample Context: [\"The Sun is a star\", \"Stars produce their own light\", \"The Moon reflects sunlight\"]\nExample Query: \"What is the Sun?\"\nExample Expected Response: \"The Sun is a star that produces light.\"\n\nExample:\n{\n \"verdicts\": [\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'The Sun is a star' directly defines what the Sun is.\"\n },\n {\n \"verdict\": \"yes\",\n \"reason\": \"The context 'Stars produce their own light' is relevant as it describes a key characteristic of stars, which includes the Sun.\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"The context 'The Moon reflects sunlight' is not relevant to defining what the Sun is.\"\n }\n ] \n}\n\nConsider context relevant if it:\n- Directly addresses the query\n- Provides examples or instances that help explain the concept\n- Offers related information that helps build understanding\n- Contains partial information that contributes to the response\n\nThe number of verdicts MUST MATCH the number of context pieces exactly.\n**\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + context + "\n\nJSON:\n";
}
// Builds the prompt asking the context-precision judge to justify a score.
// Takes { input, output, verdicts, score } and returns the prompt text.
function generateReasonPrompt$1(_ref2) {
  var input = _ref2.input,
    output = _ref2.output,
    verdicts = _ref2.verdicts,
    score = _ref2.score;
  return "Given the input, output, verdicts, and precision score, provide a BRIEF explanation for the score. Explain both its strengths and limitations.\nThe retrieved contexts is a list containing `verdict` ('yes' or 'no' for relevance), `reason` (explaining the verdict) and `node` (the context text). Contexts are listed in their ranking order.\n\n**\nIMPORTANT: Return only JSON format with a single 'reason' key explaining the score.\nExample JSON:\n{\n \"reason\": \"The score is <score> because <explanation>.\"\n}\n\nGuidelines:\n- Don't mention 'verdict' - refer to relevant/irrelevant nodes instead\n- Use information from the `reason` field, not the field itself\n- Reference node positions (first, second, etc.) when explaining relevance\n- For perfect scores (10.0), emphasize both relevance and optimal ordering\n- Always reference the ranking order when discussing relevance\n**\n\nPrecision Score:\n" + score + "\n\nInput:\n" + input + "\n\nOutput:\n" + output + "\n\nContext:\n" + verdicts + "\n\nJSON:\n";
}
978
+
979
// LLM judge for the context-precision metric. Wraps a Mastra agent configured
// with CONTEXT_PRECISION_AGENT_INSTRUCTIONS; `evaluate` produces one relevance
// verdict per context piece, `getReason` explains a final score. Both methods
// are Babel/regenerator expansions of async code.
var ContextPrecisionJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  function ContextPrecisionJudge(model) {
    return _MastraAgentJudge.call(this, 'Context Precision', CONTEXT_PRECISION_AGENT_INSTRUCTIONS, model) || this;
  }
  _inheritsLoose(ContextPrecisionJudge, _MastraAgentJudge);
  var _proto = ContextPrecisionJudge.prototype;
  // evaluate(input, actualOutput, retrievalContext): asks the agent to judge
  // each context piece; the zod schema constrains the structured output to
  // { verdicts: [{ verdict, reason }] }. Resolves to the verdicts array.
  _proto.evaluate = /*#__PURE__*/function () {
    var _evaluate = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(input, actualOutput, retrievalContext) {
      var prompt, result;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            prompt = generateEvaluatePrompt$1({
              input: input,
              output: actualOutput,
              context: retrievalContext
            });
            _context.next = 3;
            // await this.agent.generate(...) with a structured-output schema.
            return this.agent.generate(prompt, {
              output: zod.z.object({
                verdicts: zod.z.array(zod.z.object({
                  verdict: zod.z.string(),
                  reason: zod.z.string()
                }))
              })
            });
          case 3:
            result = _context.sent;
            return _context.abrupt("return", result.object.verdicts);
          case 5:
          case "end":
            return _context.stop();
        }
      }, _callee, this);
    }));
    function evaluate(_x, _x2, _x3) {
      return _evaluate.apply(this, arguments);
    }
    return evaluate;
  }();
  // getReason(input, actualOutput, score, verdicts): asks the agent to explain
  // the score; resolves to the reason string.
  _proto.getReason = /*#__PURE__*/function () {
    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, verdicts) {
      var prompt, result;
      return _regeneratorRuntime().wrap(function _callee2$(_context2) {
        while (1) switch (_context2.prev = _context2.next) {
          case 0:
            prompt = generateReasonPrompt$1({
              input: input,
              output: actualOutput,
              verdicts: verdicts,
              score: score
            });
            _context2.next = 3;
            return this.agent.generate(prompt, {
              output: zod.z.object({
                reason: zod.z.string()
              })
            });
          case 3:
            result = _context2.sent;
            return _context2.abrupt("return", result.object.reason);
          case 5:
          case "end":
            return _context2.stop();
        }
      }, _callee2, this);
    }));
    function getReason(_x4, _x5, _x6, _x7) {
      return _getReason.apply(this, arguments);
    }
    return getReason;
  }();
  return ContextPrecisionJudge;
}(MastraAgentJudge);
1053
+
1054
// Metric computing an average-precision style score over the ranked context
// list: precision is evaluated at each relevant position and averaged.
// Options: { scale } (default 10) multiplies the final score.
var ContextPrecisionMetric = /*#__PURE__*/function (_Metric) {
  function ContextPrecisionMetric(model, _temp) {
    var _this;
    var _ref = _temp === void 0 ? {} : _temp,
      _ref$scale = _ref.scale,
      scale = _ref$scale === void 0 ? 10 : _ref$scale;
    _this = _Metric.call(this) || this;
    _this.judge = void 0;
    _this.scale = void 0;
    _this.judge = new ContextPrecisionJudge(model);
    _this.scale = scale;
    return _this;
  }
  _inheritsLoose(ContextPrecisionMetric, _Metric);
  var _proto = ContextPrecisionMetric.prototype;
  // measure({ input, output, context }): asks the judge for per-context
  // verdicts, computes the precision score, and fetches an explanation.
  // Resolves to { score, reason }. (Regenerator-expanded async method.)
  _proto.measure = /*#__PURE__*/function () {
    var _measure = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(_ref2) {
      var input, output, context, verdicts, score, reason;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            input = _ref2.input, output = _ref2.output, context = _ref2.context;
            _context.next = 3;
            return this.judge.evaluate(input, output, context);
          case 3:
            verdicts = _context.sent;
            score = this.calculateScore(verdicts);
            _context.next = 7;
            return this.judge.getReason(input, output, score, verdicts);
          case 7:
            reason = _context.sent;
            return _context.abrupt("return", {
              score: score,
              reason: reason
            });
          case 9:
          case "end":
            return _context.stop();
        }
      }, _callee, this);
    }));
    function measure(_x) {
      return _measure.apply(this, arguments);
    }
    return measure;
  }();
  // calculateScore(verdicts): mean of precision@(i+1) taken at every relevant
  // position i, multiplied by `scale`. Empty lists and lists with no relevant
  // verdicts score 0.
  _proto.calculateScore = function calculateScore(verdicts) {
    var totalVerdicts = (verdicts == null ? void 0 : verdicts.length) || 0;
    if (totalVerdicts === 0) {
      return 0;
    }
    // Convert to binary scores (1 for yes, 0 for no)
    var binaryScores = verdicts.map(function (v) {
      return v.verdict.trim().toLowerCase() === 'yes' ? 1 : 0;
    });
    var weightedPrecisionSum = 0;
    var relevantCount = 0;
    // Calculate weighted precision at each position
    binaryScores.forEach(function (isRelevant, index) {
      if (isRelevant) {
        relevantCount++;
        // precision@k = relevant-so-far / position (1-based).
        var currentPrecision = relevantCount / (index + 1);
        weightedPrecisionSum += currentPrecision * isRelevant;
      }
    });
    if (relevantCount === 0) {
      return 0;
    }
    var finalScore = weightedPrecisionSum / relevantCount;
    return finalScore * this.scale;
  };
  return ContextPrecisionMetric;
}(core.Metric);
1127
+
1128
// Textual difference metric built on difflib's SequenceMatcher.
// NOTE(review): despite the name, `score` is SequenceMatcher.ratio(), i.e. a
// SIMILARITY ratio (1.0 = identical) — the `details` label "Difference score"
// is therefore misleading; confirm intended semantics with callers.
var DifferenceMetric = /*#__PURE__*/function (_Metric) {
  function DifferenceMetric() {
    return _Metric.apply(this, arguments) || this;
  }
  _inheritsLoose(DifferenceMetric, _Metric);
  var _proto = DifferenceMetric.prototype;
  // measure({ input, output }) -> { score, details, confidence, metrics }.
  // `changes` counts non-'equal' opcodes; `confidence` is 1 minus the relative
  // length difference between the two strings. (Regenerator-expanded async.)
  _proto.measure = /*#__PURE__*/function () {
    var _measure = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(_ref) {
      var input, output, matcher, ratio, ops, changes, maxLength, lengthDiff, confidence;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            input = _ref.input, output = _ref.output;
            matcher = new difflib.SequenceMatcher(null, input, output);
            ratio = matcher.ratio(); // Get detailed operations
            ops = matcher.getOpcodes();
            changes = ops.filter(function (_ref2) {
              var op = _ref2[0];
              return op !== 'equal';
            }).length; // Calculate confidence based on text length difference
            maxLength = Math.max(input.length, output.length);
            lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
            confidence = 1 - lengthDiff;
            return _context.abrupt("return", {
              score: ratio,
              details: "Difference score: " + (ratio * 100).toFixed(1) + "% with " + changes + " changes",
              confidence: confidence,
              metrics: {
                ratio: ratio,
                changes: changes,
                lengthDiff: lengthDiff
              }
            });
          case 9:
          case "end":
            return _context.stop();
        }
      }, _callee);
    }));
    function measure(_x) {
      return _measure.apply(this, arguments);
    }
    return measure;
  }();
  return DifferenceMetric;
}(core.Metric);
1174
+
1175
// Keyword-overlap metric: extracts keywords from the input (reference) and
// output (response) via `keyword_extractor` and scores the fraction of
// reference keywords present in the response.
var KeywordCoverageMetric = /*#__PURE__*/function (_Metric) {
  function KeywordCoverageMetric() {
    return _Metric.apply(this, arguments) || this;
  }
  _inheritsLoose(KeywordCoverageMetric, _Metric);
  var _proto = KeywordCoverageMetric.prototype;
  // measure({ input, output }) -> { score, details, confidence, metrics }.
  // Both strings empty => perfect score (nothing to cover).
  // (Regenerator-expanded async method.)
  _proto.measure = /*#__PURE__*/function () {
    var _measure = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(_ref) {
      var input, output, extractKeywords, referenceKeywords, responseKeywords, matchedKeywords, totalKeywords, coverage;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            input = _ref.input, output = _ref.output;
            if (!(!input && !output)) {
              _context.next = 3;
              break;
            }
            // Short-circuit: no text on either side counts as full coverage.
            return _context.abrupt("return", {
              score: 1,
              details: 'Keyword coverage: 100.0% (0/0 keywords)',
              confidence: 0.85,
              metrics: {
                totalKeywords: 0,
                matchedKeywords: 0
              }
            });
          case 3:
            extractKeywords = function extractKeywords(text) {
              return keyword_extractor.extract(text, {
                language: 'english',
                remove_digits: true,
                return_changed_case: true,
                remove_duplicates: true
              });
            };
            referenceKeywords = new Set(extractKeywords(input));
            responseKeywords = new Set(extractKeywords(output));
            // BUG FIX: `[].concat(referenceKeywords)` does NOT spread a Set
            // (Sets are not concat-spreadable) — it produced a one-element
            // array containing the Set itself, so no keyword ever matched and
            // coverage was always 0. Array.from iterates the Set's elements.
            matchedKeywords = Array.from(referenceKeywords).filter(function (k) {
              return responseKeywords.has(k);
            });
            totalKeywords = referenceKeywords.size;
            coverage = totalKeywords > 0 ? matchedKeywords.length / totalKeywords : 0;
            return _context.abrupt("return", {
              score: coverage,
              details: "Keyword coverage: " + (coverage * 100).toFixed(1) + "% (" + matchedKeywords.length + "/" + referenceKeywords.size + " keywords)",
              confidence: 0.85,
              metrics: {
                totalKeywords: referenceKeywords.size,
                matchedKeywords: matchedKeywords.length
              }
            });
          case 10:
          case "end":
            return _context.stop();
        }
      }, _callee);
    }));
    function measure(_x) {
      return _measure.apply(this, arguments);
    }
    return measure;
  }();
  return KeywordCoverageMetric;
}(core.Metric);
1239
+
1240
+ var PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS = "You are a strict and thorough prompt alignment evaluator. Your job is to determine if LLM outputs follow their given prompt instructions exactly.\n\nKey Principles:\n1. Be EXTRA STRICT in your evaluation in regards to whether the instructions are followed exactly.\n2. Only give a \"yes\" verdict if an instruction is COMPLETELY followed\n3. Any partial compliance should be marked as \"no\"\n4. Provide clear, specific reasons for any \"no\" verdicts\n5. Focus solely on instruction compliance, not output quality\n\nRemember:\n- Each instruction must be evaluated independently\n- Verdicts must be either \"yes\" or \"no\" - no in-between\n- Reasons are required only for \"no\" verdicts\n- The number of verdicts must match the number of instructions exactly";
1241
// Builds the judge prompt that asks the model to emit one strict yes/no
// verdict (with reason) per instruction, as a JSON `verdicts` array.
// `args` carries { instructions: string[], input: string, output: string }.
function generateEvaluatePrompt(args) {
  var instructionText = args.instructions.join('\n');
  var prompt = "For the provided list of prompt instructions, determine whether each instruction has been followed in the LLM output.\nMake sure to judge the output on each instruction independently. Do not let instructions be influenced by other instructions.\nGenerate a list of verdicts in JSON format, where each verdict must have:\n- \"verdict\": Strictly \"yes\" or \"no\"\n- \"reason\": Give a reason for the verdict\n\nBe EXTRA STRICT in your evaluation. Only give \"yes\" if the instruction is followed COMPLETELY.\nEvaluate the output EXACTLY as written - consider every character, space, and case\n\nExample:\nInput: \"describe the sky\"\nOutput: \"the sky is Blue today\"\nInstructions: [\"Start sentences with capital letters\", \"Use proper English\"]\n\n{\n \"verdicts\": [\n {\n \"verdict\": \"no\",\n \"reason\": \"The sentence 'the sky is Blue' starts with lowercase 't'\"\n },\n {\n \"verdict\": \"no\",\n \"reason\": \"Improper capitalization: 'Blue' is capitalized mid-sentence\"\n }\n ]\n}\n\nPrompt Instructions:\n" + instructionText + "\n\nInput:\n" + args.input + "\n\nLLM Actual Output:\n" + args.output + "\n\nJSON:";
  return prompt;
}
// Builds the follow-up prompt asking the judge to explain an already-computed
// score without changing it. `args` carries { input, output, score,
// reasons: string[] } where `reasons` are the failure reasons collected from
// "no" verdicts.
function generateReasonPrompt(args) {
  var failureText = args.reasons.join('\n');
  var prompt = "Explain the instruction following score (0-10) for the LLM's response using this context:\n Context:\n Input: " + args.input + "\n Output: " + args.output + "\n Score: " + args.score + "\n Failure Reasons: " + failureText + "\n\n Rules (follow these rules exactly. do not deviate):\n - Keep your response concise and to the point.\n - Do not change score from what is given.\n - Do not make judgements on inputs or outputs (factual correctness, quality, etc).\n - If there are failure reasons given, explain why the score is not higher.\n \n\n Output format:\n {\n \"reason\": \"The score is {score} because {explanation of instruction following}\"\n }\n \n Example Responses:\n {\n \"reason\": \"The score is 10 because the output follows the instructions exactly\"\n }\n {\n \"reason\": \"The score is 0 because the output does not follow the instructions\"\n }\n ";
  return prompt;
}
+
1255
// LLM-as-judge for the Prompt Alignment metric. Extends MastraAgentJudge,
// registering an agent named 'Prompt Alignment' configured with the strict
// evaluator system instructions. (Transpiled output: the async methods are
// expressed as _asyncToGenerator + _regeneratorRuntime state machines; the
// numbered `_context` cases mark the await points of the original source.)
var PromptAlignmentJudge = /*#__PURE__*/function (_MastraAgentJudge) {
  // model: the judge LLM handed straight through to MastraAgentJudge.
  function PromptAlignmentJudge(model) {
    return _MastraAgentJudge.call(this, 'Prompt Alignment', PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, model) || this;
  }
  _inheritsLoose(PromptAlignmentJudge, _MastraAgentJudge);
  var _proto = PromptAlignmentJudge.prototype;
  // evaluate(input, actualOutput, instructions): asks the agent to judge each
  // instruction independently against the output and resolves with the
  // structured verdict list ({ verdict, reason }[]), shaped by a zod schema
  // passed to agent.generate.
  _proto.evaluate = /*#__PURE__*/function () {
    var _evaluate = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(input, actualOutput, instructions) {
      var prompt, result;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            // Build the evaluation prompt from the caller's input/output pair.
            prompt = generateEvaluatePrompt({
              input: input,
              output: actualOutput,
              instructions: instructions
            });
            _context.next = 3;
            // Structured generation: the zod schema forces a {verdicts: [...]}
            // object so result.object is typed below.
            return this.agent.generate(prompt, {
              output: zod.z.object({
                verdicts: zod.z.array(zod.z.object({
                  verdict: zod.z.string(),
                  reason: zod.z.string()
                }))
              })
            });
          case 3:
            result = _context.sent;
            return _context.abrupt("return", result.object.verdicts);
          case 5:
          case "end":
            return _context.stop();
        }
      }, _callee, this);
    }));
    function evaluate(_x, _x2, _x3) {
      return _evaluate.apply(this, arguments);
    }
    return evaluate;
  }();
  // getReason(input, actualOutput, score, reasons): asks the agent for a
  // human-readable explanation of an already-computed score (it must not
  // change the score) and resolves with the `reason` string.
  _proto.getReason = /*#__PURE__*/function () {
    var _getReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, actualOutput, score, reasons) {
      var prompt, result;
      return _regeneratorRuntime().wrap(function _callee2$(_context2) {
        while (1) switch (_context2.prev = _context2.next) {
          case 0:
            prompt = generateReasonPrompt({
              input: input,
              output: actualOutput,
              reasons: reasons,
              score: score
            });
            _context2.next = 3;
            // Structured generation: schema pins the response to {reason: string}.
            return this.agent.generate(prompt, {
              output: zod.z.object({
                reason: zod.z.string()
              })
            });
          case 3:
            result = _context2.sent;
            return _context2.abrupt("return", result.object.reason);
          case 5:
          case "end":
            return _context2.stop();
        }
      }, _callee2, this);
    }));
    function getReason(_x4, _x5, _x6, _x7) {
      return _getReason.apply(this, arguments);
    }
    return getReason;
  }();
  return PromptAlignmentJudge;
}(MastraAgentJudge);
+
1330
// Metric scoring how completely an LLM output follows a fixed instruction
// list. score = (count of verdicts not equal to "no" / total verdicts) * scale.
// Extends core.Metric; delegates judging to PromptAlignmentJudge.
var PromptAlignmentMetric = /*#__PURE__*/function (_Metric) {
  // model: judge LLM. Second arg: { instructions: string[], scale?: number }
  // with scale defaulting to 10.
  function PromptAlignmentMetric(model, _ref) {
    var _this;
    var instructions = _ref.instructions,
      _ref$scale = _ref.scale,
      scale = _ref$scale === void 0 ? 10 : _ref$scale;
    _this = _Metric.call(this) || this;
    // Field initializers (transpiled class-property declarations).
    _this.instructions = void 0;
    _this.judge = void 0;
    _this.scale = void 0;
    _this.instructions = instructions;
    _this.judge = new PromptAlignmentJudge(model);
    _this.scale = scale;
    return _this;
  }
  _inheritsLoose(PromptAlignmentMetric, _Metric);
  var _proto = PromptAlignmentMetric.prototype;
  // measure({ input, output }): judge -> verdicts -> numeric score ->
  // narrative reason. Resolves with { score, reason }.
  _proto.measure = /*#__PURE__*/function () {
    var _measure = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(_ref2) {
      var input, output, verdicts, score, reason;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            input = _ref2.input, output = _ref2.output;
            _context.next = 3;
            return this.judge.evaluate(input, output, this.instructions);
          case 3:
            verdicts = _context.sent;
            score = this.calculateScore(verdicts);
            _context.next = 7;
            return this.generateReason(input, output, score, verdicts);
          case 7:
            reason = _context.sent;
            return _context.abrupt("return", {
              score: score,
              reason: reason
            });
          case 9:
          case "end":
            return _context.stop();
        }
      }, _callee, this);
    }));
    function measure(_x) {
      return _measure.apply(this, arguments);
    }
    return measure;
  }();
  // generateReason: collects the reasons from "no" verdicts (trimmed,
  // case-insensitive match) and asks the judge to narrate the score.
  _proto.generateReason = /*#__PURE__*/function () {
    var _generateReason = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee2(input, output, score, verdicts) {
      var reasonsForVerdicts, _iterator, _step, _step$value, verdict, _reason, reason;
      return _regeneratorRuntime().wrap(function _callee2$(_context2) {
        while (1) switch (_context2.prev = _context2.next) {
          case 0:
            reasonsForVerdicts = [];
            // Only failing ("no") verdicts contribute failure reasons.
            for (_iterator = _createForOfIteratorHelperLoose(verdicts || []); !(_step = _iterator()).done;) {
              _step$value = _step.value, verdict = _step$value.verdict, _reason = _step$value.reason;
              if (verdict.trim().toLowerCase() === 'no') {
                reasonsForVerdicts.push(_reason);
              }
            }
            _context2.next = 4;
            return this.judge.getReason(input, output, score, reasonsForVerdicts);
          case 4:
            reason = _context2.sent;
            return _context2.abrupt("return", reason);
          case 6:
          case "end":
            return _context2.stop();
        }
      }, _callee2, this);
    }));
    function generateReason(_x2, _x3, _x4, _x5) {
      return _generateReason.apply(this, arguments);
    }
    return generateReason;
  }();
  // calculateScore: fraction of verdicts that are not "no", scaled by
  // this.scale. Anything other than an explicit "no" counts as aligned.
  _proto.calculateScore = function calculateScore(evaluation) {
    var numberOfVerdicts = (evaluation == null ? void 0 : evaluation.length) || 0;
    if (numberOfVerdicts === 0) {
      // NOTE(review): the empty case returns a raw 1, while a fully-aligned
      // non-empty list returns this.scale (default 10) — confirm whether 1 or
      // this.scale is intended here.
      return 1;
    }
    var alignmentCount = 0;
    for (var _iterator2 = _createForOfIteratorHelperLoose(evaluation), _step2; !(_step2 = _iterator2()).done;) {
      var verdict = _step2.value.verdict;
      if (verdict.trim().toLowerCase() !== 'no') {
        alignmentCount++;
      }
    }
    var score = alignmentCount / numberOfVerdicts;
    return score * this.scale;
  };
  return PromptAlignmentMetric;
}(core.Metric);
+
1425
// Tone consistency metric. Unlike the judge-based metrics this one is purely
// local, using the `Sentiment` analyzer. Two modes, chosen by whether
// `output` is truthy:
//  - input + output: score = max(0, 1 - |comparative(input) - comparative(output)|),
//    i.e. how closely the two tones match.
//  - input only: score = max(0, 1 - variance of per-sentence comparative
//    sentiment within input), i.e. tone stability across the text.
var ToneConsistencyMetric = /*#__PURE__*/function (_Metric) {
  function ToneConsistencyMetric() {
    var _this;
    // Forward any constructor arguments to core.Metric unchanged.
    for (var _len = arguments.length, args = new Array(_len), _key = 0; _key < _len; _key++) {
      args[_key] = arguments[_key];
    }
    _this = _Metric.call.apply(_Metric, [this].concat(args)) || this;
    // One analyzer instance reused for every measure() call.
    _this.sentiment = new Sentiment();
    return _this;
  }
  _inheritsLoose(ToneConsistencyMetric, _Metric);
  var _proto = ToneConsistencyMetric.prototype;
  // measure({ input, output }): resolves with
  // { score, details, confidence, metrics } (shape differs per mode).
  _proto.measure = /*#__PURE__*/function () {
    var _measure = /*#__PURE__*/_asyncToGenerator(/*#__PURE__*/_regeneratorRuntime().mark(function _callee(_ref) {
      var _this2 = this;
      var input, output, responseSentiment, referenceSentiment, sentimentDiff, normalizedScore, sentences, sentiments, avgSentiment, variance, stability;
      return _regeneratorRuntime().wrap(function _callee$(_context) {
        while (1) switch (_context.prev = _context.next) {
          case 0:
            input = _ref.input, output = _ref.output;
            // NOTE(review): `responseSentiment` is computed from `input` and
            // the comparison value from `output` is called "reference" — the
            // names look swapped relative to the usual input/response roles,
            // though Math.abs makes the score itself symmetric.
            responseSentiment = this.sentiment.analyze(input);
            if (!output) {
              _context.next = 7;
              break;
            }
            // Compare sentiment with reference
            referenceSentiment = this.sentiment.analyze(output);
            sentimentDiff = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
            normalizedScore = Math.max(0, 1 - sentimentDiff);
            return _context.abrupt("return", {
              score: normalizedScore,
              details: "Tone consistency: " + (normalizedScore * 100).toFixed(1) + "%",
              confidence: 0.75,
              metrics: {
                responseSentiment: responseSentiment.comparative,
                referenceSentiment: referenceSentiment.comparative,
                difference: sentimentDiff
              }
            });
          case 7:
            // Evaluate sentiment stability across response
            // Split into sentences on ./!/? ; fall back to the whole input
            // when no terminator is found.
            sentences = input.match(/[^.!?]+[.!?]+/g) || [input];
            sentiments = sentences.map(function (s) {
              return _this2.sentiment.analyze(s).comparative;
            });
            avgSentiment = sentiments.reduce(function (a, b) {
              return a + b;
            }, 0) / sentiments.length;
            // Population variance of per-sentence sentiment.
            variance = sentiments.reduce(function (sum, s) {
              return sum + Math.pow(s - avgSentiment, 2);
            }, 0) / sentiments.length;
            stability = Math.max(0, 1 - variance);
            return _context.abrupt("return", {
              score: stability,
              details: "Tone stability: " + (stability * 100).toFixed(1) + "%",
              confidence: 0.7,
              metrics: {
                avgSentiment: avgSentiment,
                sentimentVariance: variance
              }
            });
          case 13:
          case "end":
            return _context.stop();
        }
      }, _callee, this);
    }));
    function measure(_x) {
      return _measure.apply(this, arguments);
    }
    return measure;
  }();
  return ToneConsistencyMetric;
}(core.Metric);
+
1500
// Public CommonJS API of the @mastra/evals development bundle: the metric
// classes defined above plus the top-level evaluate() helper.
exports.AnswerRelevancyMetric = AnswerRelevancyMetric;
exports.CompletenessMetric = CompletenessMetric;
exports.ContentSimilarityMetric = ContentSimilarityMetric;
exports.ContextPositionMetric = ContextPositionMetric;
exports.ContextPrecisionMetric = ContextPrecisionMetric;
exports.DifferenceMetric = DifferenceMetric;
exports.KeywordCoverageMetric = KeywordCoverageMetric;
exports.PromptAlignmentMetric = PromptAlignmentMetric;
exports.ToneConsistencyMetric = ToneConsistencyMetric;
exports.evaluate = evaluate;
//# sourceMappingURL=evals.cjs.development.js.map