@langwatch/mcp-server 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/dist/{archive-scenario-GAE4XVFM.js → archive-scenario-YFD5THOR.js} +3 -3
- package/dist/archive-scenario-YFD5THOR.js.map +1 -0
- package/dist/chunk-5UOPNRXW.js +37 -0
- package/dist/chunk-5UOPNRXW.js.map +1 -0
- package/dist/{chunk-K2YFPOSD.js → chunk-6U4TCGFC.js} +2 -2
- package/dist/chunk-IX6QJKAD.js +22 -0
- package/dist/chunk-IX6QJKAD.js.map +1 -0
- package/dist/{chunk-JVWDWL3J.js → chunk-LLRQIF52.js} +3 -11
- package/dist/chunk-LLRQIF52.js.map +1 -0
- package/dist/create-evaluator-E5X5ZP3B.js +27 -0
- package/dist/create-evaluator-E5X5ZP3B.js.map +1 -0
- package/dist/create-prompt-7Z35MIL6.js +36 -0
- package/dist/create-prompt-7Z35MIL6.js.map +1 -0
- package/dist/{create-scenario-3YRZVDYF.js → create-scenario-DIMPJRPY.js} +3 -3
- package/dist/create-scenario-DIMPJRPY.js.map +1 -0
- package/dist/discover-evaluator-schema-H23XCLNE.js +1402 -0
- package/dist/discover-evaluator-schema-H23XCLNE.js.map +1 -0
- package/dist/{get-analytics-BAVXTAPB.js → get-analytics-4YJW4S5L.js} +2 -2
- package/dist/get-evaluator-WDEH2F7M.js +47 -0
- package/dist/get-evaluator-WDEH2F7M.js.map +1 -0
- package/dist/{get-prompt-LKCPT26O.js → get-prompt-F6PDVC76.js} +2 -5
- package/dist/get-prompt-F6PDVC76.js.map +1 -0
- package/dist/{get-scenario-3SCDW4Z6.js → get-scenario-H24ZYNT5.js} +3 -3
- package/dist/{get-trace-QFDWJ5D4.js → get-trace-27USKGO7.js} +2 -2
- package/dist/index.js +13310 -2410
- package/dist/index.js.map +1 -1
- package/dist/list-evaluators-KRGI72EH.js +34 -0
- package/dist/list-evaluators-KRGI72EH.js.map +1 -0
- package/dist/list-model-providers-A5YCFTPI.js +35 -0
- package/dist/list-model-providers-A5YCFTPI.js.map +1 -0
- package/dist/{list-prompts-UQPBCUYA.js → list-prompts-LKJSE7XN.js} +6 -7
- package/dist/list-prompts-LKJSE7XN.js.map +1 -0
- package/dist/{list-scenarios-573YOUKC.js → list-scenarios-ZK5CMGC4.js} +5 -5
- package/dist/list-scenarios-ZK5CMGC4.js.map +1 -0
- package/dist/{search-traces-RSMYCAN7.js → search-traces-SOKAAMAR.js} +2 -2
- package/dist/set-model-provider-7MGULZDH.js +33 -0
- package/dist/set-model-provider-7MGULZDH.js.map +1 -0
- package/dist/update-evaluator-A3XINFLJ.js +24 -0
- package/dist/update-evaluator-A3XINFLJ.js.map +1 -0
- package/dist/update-prompt-IW7X2UQM.js +22 -0
- package/dist/update-prompt-IW7X2UQM.js.map +1 -0
- package/dist/{update-scenario-SSGVOBJO.js → update-scenario-ZT7TOBFR.js} +3 -3
- package/dist/update-scenario-ZT7TOBFR.js.map +1 -0
- package/package.json +10 -10
- package/src/__tests__/all-tools.integration.test.ts +1337 -0
- package/src/__tests__/discover-evaluator-schema.unit.test.ts +89 -0
- package/src/__tests__/evaluator-tools.unit.test.ts +262 -0
- package/src/__tests__/integration.integration.test.ts +9 -34
- package/src/__tests__/langwatch-api.unit.test.ts +4 -32
- package/src/__tests__/model-provider-tools.unit.test.ts +190 -0
- package/src/__tests__/scenario-tools.integration.test.ts +5 -5
- package/src/__tests__/scenario-tools.unit.test.ts +2 -2
- package/src/__tests__/tools.unit.test.ts +59 -65
- package/src/index.ts +249 -88
- package/src/langwatch-api-evaluators.ts +70 -0
- package/src/langwatch-api-model-providers.ts +41 -0
- package/src/langwatch-api.ts +3 -28
- package/src/tools/archive-scenario.ts +1 -1
- package/src/tools/create-evaluator.ts +33 -0
- package/src/tools/create-prompt.ts +30 -5
- package/src/tools/create-scenario.ts +1 -1
- package/src/tools/discover-evaluator-schema.ts +143 -0
- package/src/tools/get-evaluator.ts +53 -0
- package/src/tools/get-prompt.ts +1 -4
- package/src/tools/list-evaluators.ts +37 -0
- package/src/tools/list-model-providers.ts +40 -0
- package/src/tools/list-prompts.ts +5 -6
- package/src/tools/list-scenarios.ts +3 -3
- package/src/tools/set-model-provider.ts +46 -0
- package/src/tools/update-evaluator.ts +30 -0
- package/src/tools/update-prompt.ts +9 -25
- package/src/tools/update-scenario.ts +1 -1
- package/dist/archive-scenario-GAE4XVFM.js.map +0 -1
- package/dist/chunk-JVWDWL3J.js.map +0 -1
- package/dist/create-prompt-P35POKBW.js +0 -22
- package/dist/create-prompt-P35POKBW.js.map +0 -1
- package/dist/create-scenario-3YRZVDYF.js.map +0 -1
- package/dist/get-prompt-LKCPT26O.js.map +0 -1
- package/dist/list-prompts-UQPBCUYA.js.map +0 -1
- package/dist/list-scenarios-573YOUKC.js.map +0 -1
- package/dist/update-prompt-G2Y5EBQY.js +0 -31
- package/dist/update-prompt-G2Y5EBQY.js.map +0 -1
- package/dist/update-scenario-SSGVOBJO.js.map +0 -1
- /package/dist/{chunk-K2YFPOSD.js.map → chunk-6U4TCGFC.js.map} +0 -0
- /package/dist/{get-analytics-BAVXTAPB.js.map → get-analytics-4YJW4S5L.js.map} +0 -0
- /package/dist/{get-scenario-3SCDW4Z6.js.map → get-scenario-H24ZYNT5.js.map} +0 -0
- /package/dist/{get-trace-QFDWJ5D4.js.map → get-trace-27USKGO7.js.map} +0 -0
- /package/dist/{search-traces-RSMYCAN7.js.map → search-traces-SOKAAMAR.js.map} +0 -0
|
@@ -0,0 +1,1402 @@
|
|
|
1
|
+
// ../langevals/ts-integration/evaluators.generated.ts
|
|
2
|
+
var AVAILABLE_EVALUATORS = {
|
|
3
|
+
"legacy/ragas_answer_correctness": {
|
|
4
|
+
name: `Ragas Answer Correctness`,
|
|
5
|
+
description: `
|
|
6
|
+
Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.
|
|
7
|
+
`,
|
|
8
|
+
category: "rag",
|
|
9
|
+
docsUrl: "https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html",
|
|
10
|
+
isGuardrail: false,
|
|
11
|
+
requiredFields: ["output", "expected_output"],
|
|
12
|
+
optionalFields: ["input"],
|
|
13
|
+
settings: {
|
|
14
|
+
"model": {
|
|
15
|
+
"description": "The model to use for evaluation.",
|
|
16
|
+
"default": "openai/gpt-5"
|
|
17
|
+
},
|
|
18
|
+
"embeddings_model": {
|
|
19
|
+
"description": "The model to use for embeddings.",
|
|
20
|
+
"default": "openai/text-embedding-ada-002"
|
|
21
|
+
},
|
|
22
|
+
"max_tokens": {
|
|
23
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
24
|
+
"default": 2048
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
envVars: [],
|
|
28
|
+
result: {
|
|
29
|
+
"score": {
|
|
30
|
+
"description": "A score between 0.0 and 1.0 indicating the correctness of the answer."
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"legacy/ragas_answer_relevancy": {
|
|
35
|
+
name: `Ragas Answer Relevancy`,
|
|
36
|
+
description: `
|
|
37
|
+
Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
|
|
38
|
+
`,
|
|
39
|
+
category: "rag",
|
|
40
|
+
docsUrl: "https://docs.ragas.io/en/latest/concepts/metrics/answer_relevance.html",
|
|
41
|
+
isGuardrail: false,
|
|
42
|
+
requiredFields: ["input", "output"],
|
|
43
|
+
optionalFields: [],
|
|
44
|
+
settings: {
|
|
45
|
+
"model": {
|
|
46
|
+
"description": "The model to use for evaluation.",
|
|
47
|
+
"default": "openai/gpt-5"
|
|
48
|
+
},
|
|
49
|
+
"embeddings_model": {
|
|
50
|
+
"description": "The model to use for embeddings.",
|
|
51
|
+
"default": "openai/text-embedding-ada-002"
|
|
52
|
+
},
|
|
53
|
+
"max_tokens": {
|
|
54
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
55
|
+
"default": 2048
|
|
56
|
+
}
|
|
57
|
+
},
|
|
58
|
+
envVars: [],
|
|
59
|
+
result: {
|
|
60
|
+
"score": {
|
|
61
|
+
"description": "A score between 0.0 and 1.0 indicating the relevance of the answer."
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
"legacy/ragas_context_precision": {
|
|
66
|
+
name: `Ragas Context Precision`,
|
|
67
|
+
description: `
|
|
68
|
+
This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision.
|
|
69
|
+
`,
|
|
70
|
+
category: "rag",
|
|
71
|
+
docsUrl: "https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html",
|
|
72
|
+
isGuardrail: false,
|
|
73
|
+
requiredFields: ["input", "contexts", "expected_output"],
|
|
74
|
+
optionalFields: [],
|
|
75
|
+
settings: {
|
|
76
|
+
"model": {
|
|
77
|
+
"description": "The model to use for evaluation.",
|
|
78
|
+
"default": "openai/gpt-5"
|
|
79
|
+
},
|
|
80
|
+
"embeddings_model": {
|
|
81
|
+
"description": "The model to use for embeddings.",
|
|
82
|
+
"default": "openai/text-embedding-ada-002"
|
|
83
|
+
},
|
|
84
|
+
"max_tokens": {
|
|
85
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
86
|
+
"default": 2048
|
|
87
|
+
}
|
|
88
|
+
},
|
|
89
|
+
envVars: [],
|
|
90
|
+
result: {
|
|
91
|
+
"score": {
|
|
92
|
+
"description": "A score between 0.0 and 1.0 indicating the precision of the context."
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
},
|
|
96
|
+
"legacy/ragas_context_recall": {
|
|
97
|
+
name: `Ragas Context Recall`,
|
|
98
|
+
description: `
|
|
99
|
+
This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance.
|
|
100
|
+
`,
|
|
101
|
+
category: "rag",
|
|
102
|
+
docsUrl: "https://docs.ragas.io/en/latest/concepts/metrics/context_recall.html",
|
|
103
|
+
isGuardrail: false,
|
|
104
|
+
requiredFields: ["input", "contexts", "expected_output"],
|
|
105
|
+
optionalFields: [],
|
|
106
|
+
settings: {
|
|
107
|
+
"model": {
|
|
108
|
+
"description": "The model to use for evaluation.",
|
|
109
|
+
"default": "openai/gpt-5"
|
|
110
|
+
},
|
|
111
|
+
"embeddings_model": {
|
|
112
|
+
"description": "The model to use for embeddings.",
|
|
113
|
+
"default": "openai/text-embedding-ada-002"
|
|
114
|
+
},
|
|
115
|
+
"max_tokens": {
|
|
116
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
117
|
+
"default": 2048
|
|
118
|
+
}
|
|
119
|
+
},
|
|
120
|
+
envVars: [],
|
|
121
|
+
result: {
|
|
122
|
+
"score": {
|
|
123
|
+
"description": "A score between 0.0 and 1.0 indicating the recall of the context."
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
},
|
|
127
|
+
"legacy/ragas_context_relevancy": {
|
|
128
|
+
name: `Ragas Context Relevancy`,
|
|
129
|
+
description: `
|
|
130
|
+
This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.
|
|
131
|
+
`,
|
|
132
|
+
category: "rag",
|
|
133
|
+
docsUrl: "https://docs.ragas.io/en/latest/concepts/metrics/context_relevancy.html",
|
|
134
|
+
isGuardrail: false,
|
|
135
|
+
requiredFields: ["output", "contexts"],
|
|
136
|
+
optionalFields: [],
|
|
137
|
+
settings: {
|
|
138
|
+
"model": {
|
|
139
|
+
"description": "The model to use for evaluation.",
|
|
140
|
+
"default": "openai/gpt-5"
|
|
141
|
+
},
|
|
142
|
+
"embeddings_model": {
|
|
143
|
+
"description": "The model to use for embeddings.",
|
|
144
|
+
"default": "openai/text-embedding-ada-002"
|
|
145
|
+
},
|
|
146
|
+
"max_tokens": {
|
|
147
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
148
|
+
"default": 2048
|
|
149
|
+
}
|
|
150
|
+
},
|
|
151
|
+
envVars: [],
|
|
152
|
+
result: {
|
|
153
|
+
"score": {
|
|
154
|
+
"description": "A score between 0.0 and 1.0 indicating the relevancy of the context."
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
},
|
|
158
|
+
"legacy/ragas_context_utilization": {
|
|
159
|
+
name: `Ragas Context Utilization`,
|
|
160
|
+
description: `
|
|
161
|
+
This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization.
|
|
162
|
+
`,
|
|
163
|
+
category: "rag",
|
|
164
|
+
docsUrl: "https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html",
|
|
165
|
+
isGuardrail: false,
|
|
166
|
+
requiredFields: ["input", "output", "contexts"],
|
|
167
|
+
optionalFields: [],
|
|
168
|
+
settings: {
|
|
169
|
+
"model": {
|
|
170
|
+
"description": "The model to use for evaluation.",
|
|
171
|
+
"default": "openai/gpt-5"
|
|
172
|
+
},
|
|
173
|
+
"embeddings_model": {
|
|
174
|
+
"description": "The model to use for embeddings.",
|
|
175
|
+
"default": "openai/text-embedding-ada-002"
|
|
176
|
+
},
|
|
177
|
+
"max_tokens": {
|
|
178
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
179
|
+
"default": 2048
|
|
180
|
+
}
|
|
181
|
+
},
|
|
182
|
+
envVars: [],
|
|
183
|
+
result: {
|
|
184
|
+
"score": {
|
|
185
|
+
"description": "A score between 0.0 and 1.0 indicating the utilization of the context."
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
},
|
|
189
|
+
"legacy/ragas_faithfulness": {
|
|
190
|
+
name: `Ragas Faithfulness`,
|
|
191
|
+
description: `
|
|
192
|
+
This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.
|
|
193
|
+
`,
|
|
194
|
+
category: "rag",
|
|
195
|
+
docsUrl: "https://docs.ragas.io/en/latest/concepts/metrics/faithfulness.html",
|
|
196
|
+
isGuardrail: false,
|
|
197
|
+
requiredFields: ["output", "contexts"],
|
|
198
|
+
optionalFields: [],
|
|
199
|
+
settings: {
|
|
200
|
+
"model": {
|
|
201
|
+
"description": "The model to use for evaluation.",
|
|
202
|
+
"default": "openai/gpt-5"
|
|
203
|
+
},
|
|
204
|
+
"embeddings_model": {
|
|
205
|
+
"description": "The model to use for embeddings.",
|
|
206
|
+
"default": "openai/text-embedding-ada-002"
|
|
207
|
+
},
|
|
208
|
+
"max_tokens": {
|
|
209
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
210
|
+
"default": 2048
|
|
211
|
+
}
|
|
212
|
+
},
|
|
213
|
+
envVars: [],
|
|
214
|
+
result: {
|
|
215
|
+
"score": {
|
|
216
|
+
"description": "A score between 0.0 and 1.0 indicating the faithfulness of the answer."
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
},
|
|
220
|
+
"presidio/pii_detection": {
|
|
221
|
+
name: `Presidio PII Detection`,
|
|
222
|
+
description: `
|
|
223
|
+
Detects personally identifiable information in text, including phone numbers, email addresses, and
|
|
224
|
+
social security numbers. It allows customization of the detection threshold and the specific types of PII to check.
|
|
225
|
+
`,
|
|
226
|
+
category: "safety",
|
|
227
|
+
docsUrl: "https://microsoft.github.io/presidio",
|
|
228
|
+
isGuardrail: true,
|
|
229
|
+
requiredFields: [],
|
|
230
|
+
optionalFields: ["input", "output"],
|
|
231
|
+
settings: {
|
|
232
|
+
"entities": {
|
|
233
|
+
"description": "The types of PII to check for in the input.",
|
|
234
|
+
"default": {
|
|
235
|
+
"credit_card": true,
|
|
236
|
+
"crypto": true,
|
|
237
|
+
"email_address": true,
|
|
238
|
+
"iban_code": true,
|
|
239
|
+
"ip_address": true,
|
|
240
|
+
"location": false,
|
|
241
|
+
"person": false,
|
|
242
|
+
"phone_number": true,
|
|
243
|
+
"medical_license": true,
|
|
244
|
+
"us_bank_number": false,
|
|
245
|
+
"us_driver_license": false,
|
|
246
|
+
"us_itin": false,
|
|
247
|
+
"us_passport": false,
|
|
248
|
+
"us_ssn": false,
|
|
249
|
+
"uk_nhs": false,
|
|
250
|
+
"sg_nric_fin": false,
|
|
251
|
+
"au_abn": false,
|
|
252
|
+
"au_acn": false,
|
|
253
|
+
"au_tfn": false,
|
|
254
|
+
"au_medicare": false,
|
|
255
|
+
"in_pan": false,
|
|
256
|
+
"in_aadhaar": false,
|
|
257
|
+
"in_vehicle_registration": false,
|
|
258
|
+
"in_voter": false,
|
|
259
|
+
"in_passport": false
|
|
260
|
+
}
|
|
261
|
+
},
|
|
262
|
+
"min_threshold": {
|
|
263
|
+
"description": "The minimum confidence required for failing the evaluation on a PII match.",
|
|
264
|
+
"default": 0.5
|
|
265
|
+
}
|
|
266
|
+
},
|
|
267
|
+
envVars: [],
|
|
268
|
+
result: {
|
|
269
|
+
"score": {
|
|
270
|
+
"description": "Amount of PII detected, 0 means no PII detected"
|
|
271
|
+
},
|
|
272
|
+
"passed": {
|
|
273
|
+
"description": "If true then no PII was detected, if false then at least one PII was detected"
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
},
|
|
277
|
+
"lingua/language_detection": {
|
|
278
|
+
name: `Lingua Language Detection`,
|
|
279
|
+
description: `
|
|
280
|
+
This evaluator detects the language of the input and output text to check for example if the generated answer is in the same language as the prompt,
|
|
281
|
+
or if it's in a specific expected language.
|
|
282
|
+
`,
|
|
283
|
+
category: "quality",
|
|
284
|
+
docsUrl: "https://github.com/pemistahl/lingua-py",
|
|
285
|
+
isGuardrail: true,
|
|
286
|
+
requiredFields: ["output"],
|
|
287
|
+
optionalFields: ["input"],
|
|
288
|
+
settings: {
|
|
289
|
+
"check_for": {
|
|
290
|
+
"description": "What should be checked",
|
|
291
|
+
"default": "input_matches_output"
|
|
292
|
+
},
|
|
293
|
+
"expected_language": {
|
|
294
|
+
"description": "The specific language that the output is expected to be",
|
|
295
|
+
"default": void 0
|
|
296
|
+
},
|
|
297
|
+
"min_words": {
|
|
298
|
+
"description": "Minimum number of words to check, as the language detection can be unreliable for very short texts. Inputs shorter than the minimum will be skipped.",
|
|
299
|
+
"default": 7
|
|
300
|
+
},
|
|
301
|
+
"threshold": {
|
|
302
|
+
"description": "Minimum confidence threshold for the language detection. If the confidence is lower than this, the evaluation will be skipped.",
|
|
303
|
+
"default": 0.25
|
|
304
|
+
}
|
|
305
|
+
},
|
|
306
|
+
envVars: [],
|
|
307
|
+
result: {
|
|
308
|
+
"passed": {
|
|
309
|
+
"description": "Passes if the detected language on the output matches the detected language on the input, or if the output matches the expected language"
|
|
310
|
+
},
|
|
311
|
+
"label": {
|
|
312
|
+
"description": "Language detected on the input for input_matches_output, or language detected on the output for output_matches_language"
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
},
|
|
316
|
+
"openai/moderation": {
|
|
317
|
+
name: `OpenAI Moderation`,
|
|
318
|
+
description: `
|
|
319
|
+
This evaluator uses OpenAI's moderation API to detect potentially harmful content in text,
|
|
320
|
+
including harassment, hate speech, self-harm, sexual content, and violence.
|
|
321
|
+
`,
|
|
322
|
+
category: "safety",
|
|
323
|
+
docsUrl: "https://platform.openai.com/docs/guides/moderation/overview",
|
|
324
|
+
isGuardrail: true,
|
|
325
|
+
requiredFields: [],
|
|
326
|
+
optionalFields: ["input", "output"],
|
|
327
|
+
settings: {
|
|
328
|
+
"model": {
|
|
329
|
+
"description": "The model version to use, `text-moderation-latest` will be automatically upgraded over time, while `text-moderation-stable` will only be updated with advanced notice by OpenAI.",
|
|
330
|
+
"default": "text-moderation-stable"
|
|
331
|
+
},
|
|
332
|
+
"categories": {
|
|
333
|
+
"description": "The categories of content to check for moderation.",
|
|
334
|
+
"default": {
|
|
335
|
+
"harassment": true,
|
|
336
|
+
"harassment_threatening": true,
|
|
337
|
+
"hate": true,
|
|
338
|
+
"hate_threatening": true,
|
|
339
|
+
"self_harm": true,
|
|
340
|
+
"self_harm_instructions": true,
|
|
341
|
+
"self_harm_intent": true,
|
|
342
|
+
"sexual": true,
|
|
343
|
+
"sexual_minors": true,
|
|
344
|
+
"violence": true,
|
|
345
|
+
"violence_graphic": true
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
},
|
|
349
|
+
envVars: ["OPENAI_API_KEY"],
|
|
350
|
+
result: {
|
|
351
|
+
"score": {
|
|
352
|
+
"description": "The model's confidence on primary category where the input violates the OpenAI's policy. The value is between 0 and 1, where higher values denote higher confidence."
|
|
353
|
+
},
|
|
354
|
+
"passed": {
|
|
355
|
+
"description": "Fails if any moderation category is flagged"
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
},
|
|
359
|
+
"ragas/bleu_score": {
|
|
360
|
+
name: `BLEU Score`,
|
|
361
|
+
description: `
|
|
362
|
+
Traditional NLP metric. BLEU score for evaluating the similarity between two strings.
|
|
363
|
+
`,
|
|
364
|
+
category: "quality",
|
|
365
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/traditional/#bleu-score",
|
|
366
|
+
isGuardrail: false,
|
|
367
|
+
requiredFields: ["output", "expected_output"],
|
|
368
|
+
optionalFields: [],
|
|
369
|
+
settings: {},
|
|
370
|
+
envVars: [],
|
|
371
|
+
result: {
|
|
372
|
+
"score": {
|
|
373
|
+
"description": "BLEU similarity score"
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
},
|
|
377
|
+
"ragas/context_f1": {
|
|
378
|
+
name: `Context F1`,
|
|
379
|
+
description: `
|
|
380
|
+
Balances between precision and recall for context retrieval, increasing it means a better signal-to-noise ratio. Uses traditional string distance metrics.
|
|
381
|
+
`,
|
|
382
|
+
category: "rag",
|
|
383
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_F1/#non-llm-based-context-F1",
|
|
384
|
+
isGuardrail: false,
|
|
385
|
+
requiredFields: ["contexts", "expected_contexts"],
|
|
386
|
+
optionalFields: [],
|
|
387
|
+
settings: {
|
|
388
|
+
"distance_measure": {
|
|
389
|
+
"description": void 0,
|
|
390
|
+
"default": "levenshtein"
|
|
391
|
+
}
|
|
392
|
+
},
|
|
393
|
+
envVars: [],
|
|
394
|
+
result: {
|
|
395
|
+
"score": {
|
|
396
|
+
"description": "A score between 0.0 and 1.0 indicating the F1 score."
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
},
|
|
400
|
+
"ragas/context_precision": {
|
|
401
|
+
name: `Context Precision`,
|
|
402
|
+
description: `
|
|
403
|
+
Measures how accurate is the retrieval compared to expected contexts, increasing it means less noise in the retrieval. Uses traditional string distance metrics.
|
|
404
|
+
`,
|
|
405
|
+
category: "rag",
|
|
406
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/#non-llm-based-context-precision",
|
|
407
|
+
isGuardrail: false,
|
|
408
|
+
requiredFields: ["contexts", "expected_contexts"],
|
|
409
|
+
optionalFields: [],
|
|
410
|
+
settings: {
|
|
411
|
+
"distance_measure": {
|
|
412
|
+
"description": void 0,
|
|
413
|
+
"default": "levenshtein"
|
|
414
|
+
}
|
|
415
|
+
},
|
|
416
|
+
envVars: [],
|
|
417
|
+
result: {
|
|
418
|
+
"score": {
|
|
419
|
+
"description": "A score between 0.0 and 1.0 indicating the precision score."
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
},
|
|
423
|
+
"ragas/context_recall": {
|
|
424
|
+
name: `Context Recall`,
|
|
425
|
+
description: `
|
|
426
|
+
Measures how many relevant contexts were retrieved compared to expected contexts, increasing it means more signal in the retrieval. Uses traditional string distance metrics.
|
|
427
|
+
`,
|
|
428
|
+
category: "rag",
|
|
429
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_recall/#non-llm-based-context-recall",
|
|
430
|
+
isGuardrail: false,
|
|
431
|
+
requiredFields: ["contexts", "expected_contexts"],
|
|
432
|
+
optionalFields: [],
|
|
433
|
+
settings: {
|
|
434
|
+
"distance_measure": {
|
|
435
|
+
"description": void 0,
|
|
436
|
+
"default": "levenshtein"
|
|
437
|
+
}
|
|
438
|
+
},
|
|
439
|
+
envVars: [],
|
|
440
|
+
result: {
|
|
441
|
+
"score": {
|
|
442
|
+
"description": "A score between 0.0 and 1.0 indicating the Recall score."
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
},
|
|
446
|
+
"ragas/factual_correctness": {
|
|
447
|
+
name: `LLM Factual Match`,
|
|
448
|
+
description: `
|
|
449
|
+
Computes with an LLM how factually similar the generated answer is to the expected output.
|
|
450
|
+
`,
|
|
451
|
+
category: "quality",
|
|
452
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness/",
|
|
453
|
+
isGuardrail: false,
|
|
454
|
+
requiredFields: ["output", "expected_output"],
|
|
455
|
+
optionalFields: [],
|
|
456
|
+
settings: {
|
|
457
|
+
"model": {
|
|
458
|
+
"description": "The model to use for evaluation.",
|
|
459
|
+
"default": "openai/gpt-5"
|
|
460
|
+
},
|
|
461
|
+
"max_tokens": {
|
|
462
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
463
|
+
"default": 2048
|
|
464
|
+
},
|
|
465
|
+
"mode": {
|
|
466
|
+
"description": "The mode to use for the factual correctness metric.",
|
|
467
|
+
"default": "f1"
|
|
468
|
+
},
|
|
469
|
+
"atomicity": {
|
|
470
|
+
"description": "The level of atomicity for claim decomposition.",
|
|
471
|
+
"default": "low"
|
|
472
|
+
},
|
|
473
|
+
"coverage": {
|
|
474
|
+
"description": "The level of coverage for claim decomposition.",
|
|
475
|
+
"default": "low"
|
|
476
|
+
}
|
|
477
|
+
},
|
|
478
|
+
envVars: [],
|
|
479
|
+
result: {
|
|
480
|
+
"score": {
|
|
481
|
+
"description": "A score between 0.0 and 1.0 indicating how factually similar the generated answer is to the expected output."
|
|
482
|
+
}
|
|
483
|
+
}
|
|
484
|
+
},
|
|
485
|
+
"ragas/faithfulness": {
|
|
486
|
+
name: `Ragas Faithfulness`,
|
|
487
|
+
description: `
|
|
488
|
+
This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.
|
|
489
|
+
`,
|
|
490
|
+
category: "rag",
|
|
491
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/",
|
|
492
|
+
isGuardrail: false,
|
|
493
|
+
requiredFields: ["output", "contexts"],
|
|
494
|
+
optionalFields: ["input"],
|
|
495
|
+
settings: {
|
|
496
|
+
"model": {
|
|
497
|
+
"description": "The model to use for evaluation.",
|
|
498
|
+
"default": "openai/gpt-5"
|
|
499
|
+
},
|
|
500
|
+
"max_tokens": {
|
|
501
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
502
|
+
"default": 2048
|
|
503
|
+
},
|
|
504
|
+
"autodetect_dont_know": {
|
|
505
|
+
"description": "Whether to autodetect 'I don't know' in the output to avoid failing the evaluation.",
|
|
506
|
+
"default": true
|
|
507
|
+
}
|
|
508
|
+
},
|
|
509
|
+
envVars: [],
|
|
510
|
+
result: {
|
|
511
|
+
"score": {
|
|
512
|
+
"description": "A score between 0.0 and 1.0 indicating the faithfulness of the answer."
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
},
|
|
516
|
+
"ragas/response_context_precision": {
|
|
517
|
+
name: `Ragas Response Context Precision`,
|
|
518
|
+
description: `
|
|
519
|
+
Uses an LLM to measure the proportion of chunks in the retrieved context that were relevant to generate the output or the expected output.
|
|
520
|
+
`,
|
|
521
|
+
category: "rag",
|
|
522
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/#context-precision-without-reference",
|
|
523
|
+
isGuardrail: false,
|
|
524
|
+
requiredFields: ["input", "contexts"],
|
|
525
|
+
optionalFields: ["output", "expected_output"],
|
|
526
|
+
settings: {
|
|
527
|
+
"model": {
|
|
528
|
+
"description": "The model to use for evaluation.",
|
|
529
|
+
"default": "openai/gpt-5"
|
|
530
|
+
},
|
|
531
|
+
"max_tokens": {
|
|
532
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
533
|
+
"default": 2048
|
|
534
|
+
}
|
|
535
|
+
},
|
|
536
|
+
envVars: [],
|
|
537
|
+
result: {
|
|
538
|
+
"score": {
|
|
539
|
+
"description": "A score between 0.0 and 1.0 indicating the precision of the retrieved context."
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
},
|
|
543
|
+
"ragas/response_context_recall": {
|
|
544
|
+
name: `Ragas Response Context Recall`,
|
|
545
|
+
description: `
|
|
546
|
+
Uses an LLM to measure how many of relevant documents attributable the claims in the output were successfully retrieved in order to generate an expected output.
|
|
547
|
+
`,
|
|
548
|
+
category: "rag",
|
|
549
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_recall/#llm-based-context-recall",
|
|
550
|
+
isGuardrail: false,
|
|
551
|
+
requiredFields: ["input", "output", "contexts", "expected_output"],
|
|
552
|
+
optionalFields: [],
|
|
553
|
+
settings: {
|
|
554
|
+
"model": {
|
|
555
|
+
"description": "The model to use for evaluation.",
|
|
556
|
+
"default": "openai/gpt-5"
|
|
557
|
+
},
|
|
558
|
+
"max_tokens": {
|
|
559
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
560
|
+
"default": 2048
|
|
561
|
+
}
|
|
562
|
+
},
|
|
563
|
+
envVars: [],
|
|
564
|
+
result: {
|
|
565
|
+
"score": {
|
|
566
|
+
"description": "A score between 0.0 and 1.0 indicating the recall of the retrieved context."
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
},
|
|
570
|
+
"ragas/response_relevancy": {
|
|
571
|
+
name: `Ragas Response Relevancy`,
|
|
572
|
+
description: `
|
|
573
|
+
Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
|
|
574
|
+
`,
|
|
575
|
+
category: "quality",
|
|
576
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/",
|
|
577
|
+
isGuardrail: false,
|
|
578
|
+
requiredFields: ["input", "output"],
|
|
579
|
+
optionalFields: [],
|
|
580
|
+
settings: {
|
|
581
|
+
"model": {
|
|
582
|
+
"description": "The model to use for evaluation.",
|
|
583
|
+
"default": "openai/gpt-5"
|
|
584
|
+
},
|
|
585
|
+
"max_tokens": {
|
|
586
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
587
|
+
"default": 2048
|
|
588
|
+
},
|
|
589
|
+
"embeddings_model": {
|
|
590
|
+
"description": "The model to use for embeddings.",
|
|
591
|
+
"default": "openai/text-embedding-ada-002"
|
|
592
|
+
}
|
|
593
|
+
},
|
|
594
|
+
envVars: [],
|
|
595
|
+
result: {
|
|
596
|
+
"score": {
|
|
597
|
+
"description": "A score between 0.0 and 1.0 indicating the relevance of the answer."
|
|
598
|
+
}
|
|
599
|
+
}
|
|
600
|
+
},
|
|
601
|
+
"ragas/rouge_score": {
|
|
602
|
+
name: `ROUGE Score`,
|
|
603
|
+
description: `
|
|
604
|
+
Traditional NLP metric. ROUGE score for evaluating the similarity between two strings.
|
|
605
|
+
`,
|
|
606
|
+
category: "quality",
|
|
607
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/traditional/#rouge-score",
|
|
608
|
+
isGuardrail: false,
|
|
609
|
+
requiredFields: ["output", "expected_output"],
|
|
610
|
+
optionalFields: [],
|
|
611
|
+
settings: {
|
|
612
|
+
"rouge_type": {
|
|
613
|
+
"description": "ROUGE type",
|
|
614
|
+
"default": "rouge1"
|
|
615
|
+
},
|
|
616
|
+
"measure_type": {
|
|
617
|
+
"description": "ROUGE measure type",
|
|
618
|
+
"default": "fmeasure"
|
|
619
|
+
}
|
|
620
|
+
},
|
|
621
|
+
envVars: [],
|
|
622
|
+
result: {
|
|
623
|
+
"score": {
|
|
624
|
+
"description": "ROUGE similarity score"
|
|
625
|
+
}
|
|
626
|
+
}
|
|
627
|
+
},
|
|
628
|
+
"ragas/rubrics_based_scoring": {
|
|
629
|
+
name: `Rubrics Based Scoring`,
|
|
630
|
+
description: `
|
|
631
|
+
Rubric-based evaluation metric that is used to evaluate responses. The rubric consists of descriptions for each score, typically ranging from 1 to 5
|
|
632
|
+
`,
|
|
633
|
+
category: "quality",
|
|
634
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/general_purpose/#rubrics-based-criteria-scoring",
|
|
635
|
+
isGuardrail: false,
|
|
636
|
+
requiredFields: ["input", "output"],
|
|
637
|
+
optionalFields: ["expected_output"],
|
|
638
|
+
settings: {
|
|
639
|
+
"model": {
|
|
640
|
+
"description": "The model to use for evaluation.",
|
|
641
|
+
"default": "openai/gpt-5"
|
|
642
|
+
},
|
|
643
|
+
"max_tokens": {
|
|
644
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
645
|
+
"default": 2048
|
|
646
|
+
},
|
|
647
|
+
"rubrics": {
|
|
648
|
+
"description": void 0,
|
|
649
|
+
"default": [
|
|
650
|
+
{
|
|
651
|
+
"description": "The response is incorrect, irrelevant."
|
|
652
|
+
},
|
|
653
|
+
{
|
|
654
|
+
"description": "The response partially answers the question but includes significant errors, omissions, or irrelevant information."
|
|
655
|
+
},
|
|
656
|
+
{
|
|
657
|
+
"description": "The response partially answers the question but includes minor errors, omissions, or irrelevant information."
|
|
658
|
+
},
|
|
659
|
+
{
|
|
660
|
+
"description": "The response fully answers the question and includes minor errors, omissions, or irrelevant information."
|
|
661
|
+
},
|
|
662
|
+
{
|
|
663
|
+
"description": "The response fully answers the question and includes no errors, omissions, or irrelevant information."
|
|
664
|
+
}
|
|
665
|
+
]
|
|
666
|
+
}
|
|
667
|
+
},
|
|
668
|
+
envVars: [],
|
|
669
|
+
result: {
|
|
670
|
+
"score": {
|
|
671
|
+
"description": "A score according to the rubrics, typically between 1 and 5."
|
|
672
|
+
}
|
|
673
|
+
}
|
|
674
|
+
},
|
|
675
|
+
"ragas/sql_query_equivalence": {
|
|
676
|
+
name: `SQL Query Equivalence`,
|
|
677
|
+
description: `
|
|
678
|
+
Checks if the SQL query is equivalent to a reference one by using an LLM to infer if it would generate the same results given the table schemas.
|
|
679
|
+
`,
|
|
680
|
+
category: "quality",
|
|
681
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/sql/#sql-query-semantic-equivalence",
|
|
682
|
+
isGuardrail: false,
|
|
683
|
+
requiredFields: ["output", "expected_output", "expected_contexts"],
|
|
684
|
+
optionalFields: [],
|
|
685
|
+
settings: {
|
|
686
|
+
"model": {
|
|
687
|
+
"description": "The model to use for evaluation.",
|
|
688
|
+
"default": "openai/gpt-5"
|
|
689
|
+
},
|
|
690
|
+
"max_tokens": {
|
|
691
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
692
|
+
"default": 2048
|
|
693
|
+
}
|
|
694
|
+
},
|
|
695
|
+
envVars: [],
|
|
696
|
+
result: {
|
|
697
|
+
"passed": {
|
|
698
|
+
"description": "Whether the SQL query is equivalent to the expected one."
|
|
699
|
+
}
|
|
700
|
+
}
|
|
701
|
+
},
|
|
702
|
+
"ragas/summarization_score": {
|
|
703
|
+
name: `Summarization Score`,
|
|
704
|
+
description: `
|
|
705
|
+
Measures how well the summary captures important information from the retrieved contexts.
|
|
706
|
+
`,
|
|
707
|
+
category: "quality",
|
|
708
|
+
docsUrl: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/summarization_score/",
|
|
709
|
+
isGuardrail: false,
|
|
710
|
+
requiredFields: ["output", "contexts"],
|
|
711
|
+
optionalFields: [],
|
|
712
|
+
settings: {
|
|
713
|
+
"model": {
|
|
714
|
+
"description": "The model to use for evaluation.",
|
|
715
|
+
"default": "openai/gpt-5"
|
|
716
|
+
},
|
|
717
|
+
"max_tokens": {
|
|
718
|
+
"description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.",
|
|
719
|
+
"default": 2048
|
|
720
|
+
}
|
|
721
|
+
},
|
|
722
|
+
envVars: [],
|
|
723
|
+
result: {
|
|
724
|
+
"score": {
|
|
725
|
+
"description": "A score between 0.0 and 1.0 indicating the summarization quality."
|
|
726
|
+
}
|
|
727
|
+
}
|
|
728
|
+
},
|
|
729
|
+
"azure/content_safety": {
|
|
730
|
+
name: `Azure Content Safety`,
|
|
731
|
+
description: `
|
|
732
|
+
This evaluator detects potentially unsafe content in text, including hate speech,
|
|
733
|
+
self-harm, sexual content, and violence. It allows customization of the severity
|
|
734
|
+
threshold and the specific categories to check.
|
|
735
|
+
`,
|
|
736
|
+
category: "safety",
|
|
737
|
+
docsUrl: "https://learn.microsoft.com/en-us/azure/ai-services/content-safety/quickstart-text",
|
|
738
|
+
isGuardrail: true,
|
|
739
|
+
requiredFields: [],
|
|
740
|
+
optionalFields: ["input", "output"],
|
|
741
|
+
settings: {
|
|
742
|
+
"severity_threshold": {
|
|
743
|
+
"description": "The minimum severity level to consider content as unsafe, from 1 to 7.",
|
|
744
|
+
"default": 1
|
|
745
|
+
},
|
|
746
|
+
"categories": {
|
|
747
|
+
"description": "The categories of moderation to check for.",
|
|
748
|
+
"default": {
|
|
749
|
+
"Hate": true,
|
|
750
|
+
"SelfHarm": true,
|
|
751
|
+
"Sexual": true,
|
|
752
|
+
"Violence": true
|
|
753
|
+
}
|
|
754
|
+
},
|
|
755
|
+
"output_type": {
|
|
756
|
+
"description": "The type of severity levels to return on the full 0-7 severity scale, it can be either the trimmed version with four values (0, 2, 4, 6 scores) or the whole range.",
|
|
757
|
+
"default": "FourSeverityLevels"
|
|
758
|
+
}
|
|
759
|
+
},
|
|
760
|
+
envVars: ["AZURE_CONTENT_SAFETY_ENDPOINT", "AZURE_CONTENT_SAFETY_KEY"],
|
|
761
|
+
result: {
|
|
762
|
+
"score": {
|
|
763
|
+
"description": "The severity level of the detected content from 0 to 7. A higher score indicates higher severity."
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
},
|
|
767
|
+
"azure/jailbreak": {
|
|
768
|
+
name: `Azure Jailbreak Detection`,
|
|
769
|
+
description: `
|
|
770
|
+
This evaluator checks for jailbreak-attempt in the input using Azure's Content Safety API.
|
|
771
|
+
`,
|
|
772
|
+
category: "safety",
|
|
773
|
+
docsUrl: "",
|
|
774
|
+
isGuardrail: true,
|
|
775
|
+
requiredFields: ["input"],
|
|
776
|
+
optionalFields: [],
|
|
777
|
+
settings: {},
|
|
778
|
+
envVars: ["AZURE_CONTENT_SAFETY_ENDPOINT", "AZURE_CONTENT_SAFETY_KEY"],
|
|
779
|
+
result: {
|
|
780
|
+
"passed": {
|
|
781
|
+
"description": "If true then no jailbreak was detected, if false then a jailbreak was detected"
|
|
782
|
+
}
|
|
783
|
+
}
|
|
784
|
+
},
|
|
785
|
+
"azure/prompt_injection": {
|
|
786
|
+
name: `Azure Prompt Shield`,
|
|
787
|
+
description: `
|
|
788
|
+
This evaluator checks for prompt injection attempt in the input and the contexts using Azure's Content Safety API.
|
|
789
|
+
`,
|
|
790
|
+
category: "safety",
|
|
791
|
+
docsUrl: "https://learn.microsoft.com/en-us/azure/ai-services/content-safety/concepts/jailbreak-detection",
|
|
792
|
+
isGuardrail: true,
|
|
793
|
+
requiredFields: ["input"],
|
|
794
|
+
optionalFields: ["contexts"],
|
|
795
|
+
settings: {},
|
|
796
|
+
envVars: ["AZURE_CONTENT_SAFETY_ENDPOINT", "AZURE_CONTENT_SAFETY_KEY"],
|
|
797
|
+
result: {
|
|
798
|
+
"passed": {
|
|
799
|
+
"description": "If true then no prompt injection was detected, if false then a prompt injection was detected"
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
},
|
|
803
|
+
"langevals/basic": {
|
|
804
|
+
name: `Custom Basic Evaluator`,
|
|
805
|
+
description: `
|
|
806
|
+
Allows you to check for simple text matches or regex evaluation.
|
|
807
|
+
`,
|
|
808
|
+
category: "custom",
|
|
809
|
+
docsUrl: "",
|
|
810
|
+
isGuardrail: true,
|
|
811
|
+
requiredFields: [],
|
|
812
|
+
optionalFields: ["input", "output"],
|
|
813
|
+
settings: {
|
|
814
|
+
"rules": {
|
|
815
|
+
"description": "List of rules to check, the message must pass all of them",
|
|
816
|
+
"default": [
|
|
817
|
+
{
|
|
818
|
+
"field": "output",
|
|
819
|
+
"rule": "not_contains",
|
|
820
|
+
"value": "artificial intelligence"
|
|
821
|
+
}
|
|
822
|
+
]
|
|
823
|
+
}
|
|
824
|
+
},
|
|
825
|
+
envVars: [],
|
|
826
|
+
result: {
|
|
827
|
+
"passed": {
|
|
828
|
+
"description": "True if all rules pass, False if any rule fails"
|
|
829
|
+
}
|
|
830
|
+
}
|
|
831
|
+
},
|
|
832
|
+
"langevals/competitor_blocklist": {
|
|
833
|
+
name: `Competitor Blocklist`,
|
|
834
|
+
description: `
|
|
835
|
+
This evaluator checks if any of the specified competitors was mentioned
|
|
836
|
+
`,
|
|
837
|
+
category: "policy",
|
|
838
|
+
docsUrl: "https://path/to/official/docs",
|
|
839
|
+
isGuardrail: true,
|
|
840
|
+
requiredFields: [],
|
|
841
|
+
optionalFields: ["output", "input"],
|
|
842
|
+
settings: {
|
|
843
|
+
"competitors": {
|
|
844
|
+
"description": "The competitors that must not be mentioned.",
|
|
845
|
+
"default": [
|
|
846
|
+
"OpenAI",
|
|
847
|
+
"Google",
|
|
848
|
+
"Microsoft"
|
|
849
|
+
]
|
|
850
|
+
}
|
|
851
|
+
},
|
|
852
|
+
envVars: [],
|
|
853
|
+
result: {
|
|
854
|
+
"score": {
|
|
855
|
+
"description": "Number of competitors mentioned in the input and output"
|
|
856
|
+
},
|
|
857
|
+
"passed": {
|
|
858
|
+
"description": "Is the message containing explicit mention of competitor"
|
|
859
|
+
}
|
|
860
|
+
}
|
|
861
|
+
},
|
|
862
|
+
"langevals/competitor_llm": {
|
|
863
|
+
name: `Competitor Allowlist Check`,
|
|
864
|
+
description: `
|
|
865
|
+
This evaluator use an LLM-as-judge to check if the conversation is related to competitors, without having to name them explicitly
|
|
866
|
+
`,
|
|
867
|
+
category: "policy",
|
|
868
|
+
docsUrl: "",
|
|
869
|
+
isGuardrail: true,
|
|
870
|
+
requiredFields: [],
|
|
871
|
+
optionalFields: ["output", "input"],
|
|
872
|
+
settings: {
|
|
873
|
+
"model": {
|
|
874
|
+
"description": "The model to use for evaluation",
|
|
875
|
+
"default": "openai/gpt-5"
|
|
876
|
+
},
|
|
877
|
+
"max_tokens": {
|
|
878
|
+
"description": "Max tokens allowed for evaluation",
|
|
879
|
+
"default": 128e3
|
|
880
|
+
},
|
|
881
|
+
"name": {
|
|
882
|
+
"description": "The name of your company",
|
|
883
|
+
"default": "LangWatch"
|
|
884
|
+
},
|
|
885
|
+
"description": {
|
|
886
|
+
"description": "Description of what your company is specializing at",
|
|
887
|
+
"default": "We are providing an LLM observability and evaluation platform"
|
|
888
|
+
}
|
|
889
|
+
},
|
|
890
|
+
envVars: [],
|
|
891
|
+
result: {
|
|
892
|
+
"score": {
|
|
893
|
+
"description": "Confidence that the message is competitor free"
|
|
894
|
+
},
|
|
895
|
+
"passed": {
|
|
896
|
+
"description": "Is the message related to the competitors"
|
|
897
|
+
}
|
|
898
|
+
}
|
|
899
|
+
},
|
|
900
|
+
"langevals/competitor_llm_function_call": {
|
|
901
|
+
name: `Competitor LLM Check`,
|
|
902
|
+
description: `
|
|
903
|
+
This evaluator implements LLM-as-a-judge with a function call approach to check if the message contains a mention of a competitor.
|
|
904
|
+
`,
|
|
905
|
+
category: "policy",
|
|
906
|
+
docsUrl: "",
|
|
907
|
+
isGuardrail: true,
|
|
908
|
+
requiredFields: [],
|
|
909
|
+
optionalFields: ["output", "input"],
|
|
910
|
+
settings: {
|
|
911
|
+
"model": {
|
|
912
|
+
"description": "The model to use for evaluation",
|
|
913
|
+
"default": "openai/gpt-5"
|
|
914
|
+
},
|
|
915
|
+
"max_tokens": {
|
|
916
|
+
"description": "Max tokens allowed for evaluation",
|
|
917
|
+
"default": 128e3
|
|
918
|
+
},
|
|
919
|
+
"name": {
|
|
920
|
+
"description": "The name of your company",
|
|
921
|
+
"default": "LangWatch"
|
|
922
|
+
},
|
|
923
|
+
"description": {
|
|
924
|
+
"description": "Description of what your company is specializing at",
|
|
925
|
+
"default": "We are providing an LLM observability and evaluation platform"
|
|
926
|
+
},
|
|
927
|
+
"competitors": {
|
|
928
|
+
"description": "The competitors that must not be mentioned.",
|
|
929
|
+
"default": [
|
|
930
|
+
"OpenAI",
|
|
931
|
+
"Google",
|
|
932
|
+
"Microsoft"
|
|
933
|
+
]
|
|
934
|
+
}
|
|
935
|
+
},
|
|
936
|
+
envVars: [],
|
|
937
|
+
result: {
|
|
938
|
+
"score": {
|
|
939
|
+
"description": "Number of unique competitors mentioned"
|
|
940
|
+
},
|
|
941
|
+
"passed": {
|
|
942
|
+
"description": "Is the message related to the competitors"
|
|
943
|
+
}
|
|
944
|
+
}
|
|
945
|
+
},
|
|
946
|
+
"langevals/exact_match": {
|
|
947
|
+
name: `Exact Match Evaluator`,
|
|
948
|
+
description: `
|
|
949
|
+
A simple evaluator that checks if the output matches the expected_output exactly.
|
|
950
|
+
`,
|
|
951
|
+
category: "quality",
|
|
952
|
+
docsUrl: "",
|
|
953
|
+
isGuardrail: false,
|
|
954
|
+
requiredFields: ["output", "expected_output"],
|
|
955
|
+
optionalFields: [],
|
|
956
|
+
settings: {
|
|
957
|
+
"case_sensitive": {
|
|
958
|
+
"description": "True if the comparison should be case-sensitive, False otherwise",
|
|
959
|
+
"default": false
|
|
960
|
+
},
|
|
961
|
+
"trim_whitespace": {
|
|
962
|
+
"description": "True if the comparison should trim whitespace, False otherwise",
|
|
963
|
+
"default": true
|
|
964
|
+
},
|
|
965
|
+
"remove_punctuation": {
|
|
966
|
+
"description": "True if the comparison should remove punctuation, False otherwise",
|
|
967
|
+
"default": true
|
|
968
|
+
}
|
|
969
|
+
},
|
|
970
|
+
envVars: [],
|
|
971
|
+
result: {
|
|
972
|
+
"passed": {
|
|
973
|
+
"description": "True if the output matched the expected_output exactly, False otherwise"
|
|
974
|
+
}
|
|
975
|
+
}
|
|
976
|
+
},
|
|
977
|
+
"langevals/llm_answer_match": {
|
|
978
|
+
name: `LLM Answer Match`,
|
|
979
|
+
description: `
|
|
980
|
+
Uses an LLM to check if the generated output answers a question correctly the same way as the expected output, even if their style is different.
|
|
981
|
+
`,
|
|
982
|
+
category: "quality",
|
|
983
|
+
docsUrl: "",
|
|
984
|
+
isGuardrail: false,
|
|
985
|
+
requiredFields: ["output", "expected_output"],
|
|
986
|
+
optionalFields: ["input"],
|
|
987
|
+
settings: {
|
|
988
|
+
"model": {
|
|
989
|
+
"description": "The model to use for evaluation",
|
|
990
|
+
"default": "openai/gpt-5"
|
|
991
|
+
},
|
|
992
|
+
"max_tokens": {
|
|
993
|
+
"description": "Max tokens allowed for evaluation",
|
|
994
|
+
"default": 128e3
|
|
995
|
+
},
|
|
996
|
+
"prompt": {
|
|
997
|
+
"description": "Prompt for the comparison",
|
|
998
|
+
"default": "Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct."
|
|
999
|
+
}
|
|
1000
|
+
},
|
|
1001
|
+
envVars: [],
|
|
1002
|
+
result: {
|
|
1003
|
+
"passed": {
|
|
1004
|
+
"description": "Whether the predicted answer matches the gold answer"
|
|
1005
|
+
}
|
|
1006
|
+
}
|
|
1007
|
+
},
|
|
1008
|
+
"langevals/llm_boolean": {
|
|
1009
|
+
name: `LLM-as-a-Judge Boolean Evaluator`,
|
|
1010
|
+
description: `
|
|
1011
|
+
Use an LLM as a judge with a custom prompt to do a true/false boolean evaluation of the message.
|
|
1012
|
+
`,
|
|
1013
|
+
category: "custom",
|
|
1014
|
+
docsUrl: "",
|
|
1015
|
+
isGuardrail: true,
|
|
1016
|
+
requiredFields: [],
|
|
1017
|
+
optionalFields: ["input", "output", "contexts"],
|
|
1018
|
+
settings: {
|
|
1019
|
+
"model": {
|
|
1020
|
+
"description": "The model to use for evaluation",
|
|
1021
|
+
"default": "openai/gpt-5"
|
|
1022
|
+
},
|
|
1023
|
+
"max_tokens": {
|
|
1024
|
+
"description": "Max tokens allowed for evaluation",
|
|
1025
|
+
"default": 128e3
|
|
1026
|
+
},
|
|
1027
|
+
"prompt": {
|
|
1028
|
+
"description": "The system prompt to use for the LLM to run the evaluation",
|
|
1029
|
+
"default": "You are an LLM evaluator. We need the guarantee that the output answers what is being asked on the input, please evaluate as False if it doesn't"
|
|
1030
|
+
}
|
|
1031
|
+
},
|
|
1032
|
+
envVars: [],
|
|
1033
|
+
result: {
|
|
1034
|
+
"passed": {
|
|
1035
|
+
"description": "The veredict given by the LLM"
|
|
1036
|
+
}
|
|
1037
|
+
}
|
|
1038
|
+
},
|
|
1039
|
+
"langevals/llm_category": {
|
|
1040
|
+
name: `LLM-as-a-Judge Category Evaluator`,
|
|
1041
|
+
description: `
|
|
1042
|
+
Use an LLM as a judge with a custom prompt to classify the message into custom defined categories.
|
|
1043
|
+
`,
|
|
1044
|
+
category: "custom",
|
|
1045
|
+
docsUrl: "",
|
|
1046
|
+
isGuardrail: false,
|
|
1047
|
+
requiredFields: [],
|
|
1048
|
+
optionalFields: ["input", "output", "contexts"],
|
|
1049
|
+
settings: {
|
|
1050
|
+
"model": {
|
|
1051
|
+
"description": "The model to use for evaluation",
|
|
1052
|
+
"default": "openai/gpt-5"
|
|
1053
|
+
},
|
|
1054
|
+
"max_tokens": {
|
|
1055
|
+
"description": "Max tokens allowed for evaluation",
|
|
1056
|
+
"default": 128e3
|
|
1057
|
+
},
|
|
1058
|
+
"prompt": {
|
|
1059
|
+
"description": "The system prompt to use for the LLM to run the evaluation",
|
|
1060
|
+
"default": "You are an LLM category evaluator. Please categorize the message in one of the following categories"
|
|
1061
|
+
},
|
|
1062
|
+
"categories": {
|
|
1063
|
+
"description": "The categories to use for the evaluation",
|
|
1064
|
+
"default": [
|
|
1065
|
+
{
|
|
1066
|
+
"name": "smalltalk",
|
|
1067
|
+
"description": "Smalltalk with the user"
|
|
1068
|
+
},
|
|
1069
|
+
{
|
|
1070
|
+
"name": "company",
|
|
1071
|
+
"description": "Questions about the company, what we do, etc"
|
|
1072
|
+
}
|
|
1073
|
+
]
|
|
1074
|
+
}
|
|
1075
|
+
},
|
|
1076
|
+
envVars: [],
|
|
1077
|
+
result: {
|
|
1078
|
+
"label": {
|
|
1079
|
+
"description": "The detected category of the message"
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
},
|
|
1083
|
+
"langevals/llm_score": {
|
|
1084
|
+
name: `LLM-as-a-Judge Score Evaluator`,
|
|
1085
|
+
description: `
|
|
1086
|
+
Use an LLM as a judge with custom prompt to do a numeric score evaluation of the message.
|
|
1087
|
+
`,
|
|
1088
|
+
category: "custom",
|
|
1089
|
+
docsUrl: "",
|
|
1090
|
+
isGuardrail: false,
|
|
1091
|
+
requiredFields: [],
|
|
1092
|
+
optionalFields: ["input", "output", "contexts"],
|
|
1093
|
+
settings: {
|
|
1094
|
+
"model": {
|
|
1095
|
+
"description": "The model to use for evaluation",
|
|
1096
|
+
"default": "openai/gpt-5"
|
|
1097
|
+
},
|
|
1098
|
+
"max_tokens": {
|
|
1099
|
+
"description": "Max tokens allowed for evaluation",
|
|
1100
|
+
"default": 128e3
|
|
1101
|
+
},
|
|
1102
|
+
"prompt": {
|
|
1103
|
+
"description": "The system prompt to use for the LLM to run the evaluation",
|
|
1104
|
+
"default": "You are an LLM evaluator. Please score from 0.0 to 1.0 how likely the user is to be satisfied with this answer, from 0.0 being not satisfied at all to 1.0 being completely satisfied"
|
|
1105
|
+
}
|
|
1106
|
+
},
|
|
1107
|
+
envVars: [],
|
|
1108
|
+
result: {
|
|
1109
|
+
"score": {
|
|
1110
|
+
"description": "The score given by the LLM, according to the prompt"
|
|
1111
|
+
}
|
|
1112
|
+
}
|
|
1113
|
+
},
|
|
1114
|
+
"langevals/off_topic": {
|
|
1115
|
+
name: `Off Topic Evaluator`,
|
|
1116
|
+
description: `
|
|
1117
|
+
This evaluator checks if the user message is concerning one of the allowed topics of the chatbot
|
|
1118
|
+
`,
|
|
1119
|
+
category: "policy",
|
|
1120
|
+
docsUrl: "",
|
|
1121
|
+
isGuardrail: true,
|
|
1122
|
+
requiredFields: ["input"],
|
|
1123
|
+
optionalFields: [],
|
|
1124
|
+
settings: {
|
|
1125
|
+
"model": {
|
|
1126
|
+
"description": "The model to use for evaluation",
|
|
1127
|
+
"default": "openai/gpt-5"
|
|
1128
|
+
},
|
|
1129
|
+
"max_tokens": {
|
|
1130
|
+
"description": "Max tokens allowed for evaluation",
|
|
1131
|
+
"default": 128e3
|
|
1132
|
+
},
|
|
1133
|
+
"allowed_topics": {
|
|
1134
|
+
"description": "The list of topics and their short descriptions that the chatbot is allowed to talk about",
|
|
1135
|
+
"default": [
|
|
1136
|
+
{
|
|
1137
|
+
"topic": "simple_chat",
|
|
1138
|
+
"description": "Smalltalk with the user"
|
|
1139
|
+
},
|
|
1140
|
+
{
|
|
1141
|
+
"topic": "company",
|
|
1142
|
+
"description": "Questions about the company, what we do, etc"
|
|
1143
|
+
}
|
|
1144
|
+
]
|
|
1145
|
+
}
|
|
1146
|
+
},
|
|
1147
|
+
envVars: [],
|
|
1148
|
+
result: {
|
|
1149
|
+
"score": {
|
|
1150
|
+
"description": "Confidence level of the intent prediction"
|
|
1151
|
+
},
|
|
1152
|
+
"passed": {
|
|
1153
|
+
"description": "Is the message concerning allowed topic"
|
|
1154
|
+
},
|
|
1155
|
+
"label": {
|
|
1156
|
+
"description": "The detected intent or 'other' if the intent is not in the allowed topics"
|
|
1157
|
+
}
|
|
1158
|
+
}
|
|
1159
|
+
},
|
|
1160
|
+
"langevals/query_resolution": {
|
|
1161
|
+
name: `Query Resolution`,
|
|
1162
|
+
description: `
|
|
1163
|
+
This evaluator checks if all the user queries in the conversation were resolved. Useful to detect when the bot doesn't know how to answer or can't help the user.
|
|
1164
|
+
`,
|
|
1165
|
+
category: "quality",
|
|
1166
|
+
docsUrl: "",
|
|
1167
|
+
isGuardrail: false,
|
|
1168
|
+
requiredFields: ["conversation"],
|
|
1169
|
+
optionalFields: [],
|
|
1170
|
+
settings: {
|
|
1171
|
+
"model": {
|
|
1172
|
+
"description": "The model to use for evaluation",
|
|
1173
|
+
"default": "openai/gpt-5"
|
|
1174
|
+
},
|
|
1175
|
+
"max_tokens": {
|
|
1176
|
+
"description": "Max tokens allowed for evaluation",
|
|
1177
|
+
"default": 128e3
|
|
1178
|
+
}
|
|
1179
|
+
},
|
|
1180
|
+
envVars: [],
|
|
1181
|
+
result: {}
|
|
1182
|
+
},
|
|
1183
|
+
"langevals/sentiment": {
|
|
1184
|
+
name: `Sentiment Evaluator`,
|
|
1185
|
+
description: `
|
|
1186
|
+
Analyzes the sentiment of the input text by comparing its embedding similarity
|
|
1187
|
+
to positive and negative reference phrases. Returns a score from -1.0 (very negative)
|
|
1188
|
+
to 1.0 (very positive) and a corresponding label.
|
|
1189
|
+
`,
|
|
1190
|
+
category: "quality",
|
|
1191
|
+
docsUrl: "",
|
|
1192
|
+
isGuardrail: false,
|
|
1193
|
+
requiredFields: ["input"],
|
|
1194
|
+
optionalFields: [],
|
|
1195
|
+
settings: {
|
|
1196
|
+
"embeddings_model": {
|
|
1197
|
+
"description": "The embeddings model to use for sentiment analysis",
|
|
1198
|
+
"default": "openai/text-embedding-3-small"
|
|
1199
|
+
},
|
|
1200
|
+
"positive_reference": {
|
|
1201
|
+
"description": "Reference phrase representing the positive end of the sentiment scale",
|
|
1202
|
+
"default": "Comment of a very happy and satisfied user"
|
|
1203
|
+
},
|
|
1204
|
+
"negative_reference": {
|
|
1205
|
+
"description": "Reference phrase representing the negative end of the sentiment scale",
|
|
1206
|
+
"default": "Comment of a user who is extremely dissatisfied"
|
|
1207
|
+
},
|
|
1208
|
+
"normalization_factor": {
|
|
1209
|
+
"description": "Controls sentiment sensitivity. Decrease to make scores more extreme (fewer neutrals), increase to make scores more moderate (more neutrals)",
|
|
1210
|
+
"default": 0.1
|
|
1211
|
+
}
|
|
1212
|
+
},
|
|
1213
|
+
envVars: [],
|
|
1214
|
+
result: {
|
|
1215
|
+
"score": {
|
|
1216
|
+
"description": "Sentiment score from -1.0 (very negative) to 1.0 (very positive)"
|
|
1217
|
+
},
|
|
1218
|
+
"label": {
|
|
1219
|
+
"description": "Sentiment label: 'positive' or 'negative'"
|
|
1220
|
+
}
|
|
1221
|
+
}
|
|
1222
|
+
},
|
|
1223
|
+
"langevals/similarity": {
|
|
1224
|
+
name: `Semantic Similarity Evaluator`,
|
|
1225
|
+
description: `
|
|
1226
|
+
Allows you to check for semantic similarity or dissimilarity between input and output and a
|
|
1227
|
+
target value, so you can avoid sentences that you don't want to be present without having to
|
|
1228
|
+
match on the exact text.
|
|
1229
|
+
`,
|
|
1230
|
+
category: "custom",
|
|
1231
|
+
docsUrl: "",
|
|
1232
|
+
isGuardrail: true,
|
|
1233
|
+
requiredFields: [],
|
|
1234
|
+
optionalFields: ["input", "output"],
|
|
1235
|
+
settings: {
|
|
1236
|
+
"field": {
|
|
1237
|
+
"description": void 0,
|
|
1238
|
+
"default": "output"
|
|
1239
|
+
},
|
|
1240
|
+
"rule": {
|
|
1241
|
+
"description": void 0,
|
|
1242
|
+
"default": "is_not_similar_to"
|
|
1243
|
+
},
|
|
1244
|
+
"value": {
|
|
1245
|
+
"description": void 0,
|
|
1246
|
+
"default": "example"
|
|
1247
|
+
},
|
|
1248
|
+
"threshold": {
|
|
1249
|
+
"description": void 0,
|
|
1250
|
+
"default": 0.3
|
|
1251
|
+
},
|
|
1252
|
+
"embeddings_model": {
|
|
1253
|
+
"description": void 0,
|
|
1254
|
+
"default": "openai/text-embedding-3-small"
|
|
1255
|
+
}
|
|
1256
|
+
},
|
|
1257
|
+
envVars: [],
|
|
1258
|
+
result: {
|
|
1259
|
+
"score": {
|
|
1260
|
+
"description": "How similar the input and output semantically, from 0.0 to 1.0, with 1.0 meaning the sentences are identical"
|
|
1261
|
+
},
|
|
1262
|
+
"passed": {
|
|
1263
|
+
"description": "Passes if the cosine similarity crosses the threshold for the defined rule"
|
|
1264
|
+
}
|
|
1265
|
+
}
|
|
1266
|
+
},
|
|
1267
|
+
"langevals/valid_format": {
|
|
1268
|
+
name: `Valid Format Evaluator`,
|
|
1269
|
+
description: `
|
|
1270
|
+
Allows you to check if the output is a valid json, markdown, python, sql, etc.
|
|
1271
|
+
For JSON, can optionally validate against a provided schema.
|
|
1272
|
+
`,
|
|
1273
|
+
category: "quality",
|
|
1274
|
+
docsUrl: "",
|
|
1275
|
+
isGuardrail: true,
|
|
1276
|
+
requiredFields: [],
|
|
1277
|
+
optionalFields: ["output"],
|
|
1278
|
+
settings: {
|
|
1279
|
+
"format": {
|
|
1280
|
+
"description": void 0,
|
|
1281
|
+
"default": "json"
|
|
1282
|
+
},
|
|
1283
|
+
"json_schema": {
|
|
1284
|
+
"description": "JSON schema to validate against when format is 'json'",
|
|
1285
|
+
"default": void 0
|
|
1286
|
+
}
|
|
1287
|
+
},
|
|
1288
|
+
envVars: [],
|
|
1289
|
+
result: {
|
|
1290
|
+
"passed": {
|
|
1291
|
+
"description": "True if the output is formatted correctly, False otherwise"
|
|
1292
|
+
}
|
|
1293
|
+
}
|
|
1294
|
+
}
|
|
1295
|
+
};
|
|
1296
|
+
|
|
1297
|
+
// src/tools/discover-evaluator-schema.ts
|
|
1298
|
+
/**
 * Entry point for evaluator schema discovery.
 *
 * @param {string} [evaluatorType] - Evaluator key to describe. When it is
 *   falsy (omitted, empty string, ...), the overview listing is produced.
 * @returns {string} Whatever `formatEvaluatorDetail` returns for a truthy
 *   `evaluatorType`, otherwise whatever `formatEvaluatorOverview` returns.
 */
function formatEvaluatorSchema(evaluatorType) {
  return evaluatorType
    ? formatEvaluatorDetail(evaluatorType)
    : formatEvaluatorOverview();
}
|
|
1304
|
+
/**
 * Builds a markdown overview of every entry in `AVAILABLE_EVALUATORS`,
 * grouped under one `## <category>` heading per category.
 *
 * Categories appear in the order they are first encountered while walking
 * the catalog, and evaluators keep catalog order inside each category. Each
 * bullet carries the evaluator key, its display name, and the one-line
 * summary produced by `extractFirstLine`.
 *
 * @returns {string} The overview, with lines joined by "\n".
 */
function formatEvaluatorOverview() {
  // Map keeps insertion order, which gives the "first seen" category order.
  const grouped = new Map();
  Object.keys(AVAILABLE_EVALUATORS).forEach((type) => {
    const { name, category, description } = AVAILABLE_EVALUATORS[type];
    if (!grouped.has(category)) {
      grouped.set(category, []);
    }
    grouped.get(category).push({
      type,
      name,
      description: extractFirstLine(description),
    });
  });

  // Each section is: heading, one bullet per evaluator, then a blank line.
  const sections = [...grouped].flatMap(([category, items]) => [
    `## ${category}\n`,
    ...items.map((item) => `- **${item.type}** (${item.name}): ${item.description}`),
    "",
  ]);

  return [
    "# Available Evaluator Types\n",
    ...sections,
    "> Use `discover_schema({ category: 'evaluators', evaluatorType: '<type>' })` for full details on a specific evaluator type.",
  ].join("\n");
}
|
|
1329
|
+
/**
 * Renders the full markdown reference for one evaluator type: header,
 * description, required/optional fields, settings with their defaults,
 * required environment variables, result fields, and a JSON usage example
 * pre-filled with the default settings.
 *
 * @param {string} evaluatorType - Key into `AVAILABLE_EVALUATORS`
 *   (for example "langevals/exact_match").
 * @returns {string} The markdown document, or an "Unknown evaluator type"
 *   message when `evaluatorType` is not a key of the catalog itself.
 */
function formatEvaluatorDetail(evaluatorType) {
  // Only accept keys defined on the catalog object itself. A bare
  // `AVAILABLE_EVALUATORS[evaluatorType]` also resolves inherited members
  // such as "constructor" or "__proto__", which are truthy and would crash
  // further down at `def.description.trim()` instead of reporting an
  // unknown type.
  const isKnownType = Object.prototype.hasOwnProperty.call(
    AVAILABLE_EVALUATORS,
    evaluatorType
  );
  const def = isKnownType ? AVAILABLE_EVALUATORS[evaluatorType] : void 0;
  if (!def) {
    return `Unknown evaluator type: "${evaluatorType}". Use \`discover_schema({ category: 'evaluators' })\` to see all available types.`;
  }

  const lines = [];

  // Header block.
  lines.push(`# ${def.name} (\`${evaluatorType}\`)\n`);
  lines.push(`**Category**: ${def.category}`);
  lines.push(`**Is Guardrail**: ${def.isGuardrail ? "Yes" : "No"}`);
  if (def.docsUrl) {
    // Several catalog entries carry an empty docsUrl; skip the line for them.
    lines.push(`**Docs**: ${def.docsUrl}`);
  }
  lines.push("");

  lines.push("## Description\n");
  lines.push(def.description.trim());

  // Input fields. "Required" is always printed, "Optional" only when present.
  lines.push("\n## Fields\n");
  if (def.requiredFields.length > 0) {
    lines.push(`**Required**: ${def.requiredFields.join(", ")}`);
  } else {
    lines.push("**Required**: none");
  }
  if (def.optionalFields.length > 0) {
    lines.push(`**Optional**: ${def.optionalFields.join(", ")}`);
  }

  const settingsEntries = Object.entries(def.settings);
  if (settingsEntries.length > 0) {
    lines.push("\n## Settings\n");
    for (const [key, setting] of settingsEntries) {
      // JSON.stringify(undefined) is undefined, so a setting without a
      // default is rendered as `undefined`.
      const defaultStr = JSON.stringify(setting.default);
      const desc = setting.description ? ` - ${setting.description}` : "";
      lines.push(`- **${key}**${desc}`);
      lines.push(` Default: \`${defaultStr}\``);
    }
  }

  if (def.envVars.length > 0) {
    lines.push("\n## Required Environment Variables\n");
    for (const envVar of def.envVars) {
      lines.push(`- \`${envVar}\``);
    }
  }

  const resultEntries = Object.entries(def.result);
  if (resultEntries.length > 0) {
    lines.push("\n## Result Fields\n");
    for (const [key, value] of resultEntries) {
      lines.push(`- **${key}**: ${value.description}`);
    }
  }

  // Usage example: settings whose default is undefined are dropped by
  // JSON.stringify and therefore do not appear in the example.
  lines.push("\n## Usage Example\n");
  lines.push("```json");
  lines.push(JSON.stringify({
    evaluatorType,
    settings: Object.fromEntries(
      settingsEntries.map(([key, setting]) => [key, setting.default])
    )
  }, null, 2));
  lines.push("```");

  return lines.join("\n");
}
|
|
1391
|
+
/**
 * Produces a one-line summary from a (possibly multi-line) description.
 *
 * The description is trimmed, its first line is taken and trimmed again,
 * and anything longer than 120 UTF-16 code units is shortened to at most
 * 117 code units followed by "...".
 *
 * @param {string | null | undefined} description - Raw description text.
 * @returns {string} The summary line; "" when `description` is null or
 *   undefined.
 */
function extractFirstLine(description) {
  const MAX_LENGTH = 120;
  const ELLIPSIS = "...";

  if (description == null) {
    return "";
  }

  // split() always yields at least one element, so index 0 is safe.
  const firstLine = String(description).trim().split("\n")[0].trim();
  if (firstLine.length <= MAX_LENGTH) {
    return firstLine;
  }

  let cut = MAX_LENGTH - ELLIPSIS.length;
  // Do not end the kept part on a high surrogate: cutting there would split
  // an astral character (e.g. an emoji) and leave an unpaired code unit.
  const lastKeptUnit = firstLine.charCodeAt(cut - 1);
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
    cut -= 1;
  }
  return `${firstLine.slice(0, cut)}${ELLIPSIS}`;
}
|
|
1399
|
+
export {
|
|
1400
|
+
formatEvaluatorSchema
|
|
1401
|
+
};
|
|
1402
|
+
//# sourceMappingURL=discover-evaluator-schema-H23XCLNE.js.map
|