@exulu/backend 1.48.2 → 1.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +351 -42
- package/dist/index.d.cts +96 -1
- package/dist/index.d.ts +96 -1
- package/dist/index.js +340 -38
- package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
- package/ee/python/README.md +295 -0
- package/ee/python/documents/processing/README.md +155 -0
- package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
- package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
- package/ee/python/setup.sh +180 -0
- package/package.json +14 -3
- package/scripts/postinstall.cjs +149 -0
- package/.agents/skills/mintlify/SKILL.md +0 -347
- package/.editorconfig +0 -15
- package/.eslintrc.json +0 -52
- package/.github/workflows/release-backend.yml +0 -38
- package/.husky/commit-msg +0 -1
- package/.jscpd.json +0 -18
- package/.mcp.json +0 -25
- package/.nvmrc +0 -1
- package/.prettierignore +0 -5
- package/.prettierrc.json +0 -12
- package/CHANGELOG.md +0 -8
- package/SECURITY.md +0 -5
- package/commitlint.config.js +0 -4
- package/devops/documentation/patch-older-releases.md +0 -42
- package/ee/documents/processing/build_pdf_processor.sh +0 -35
- package/ee/documents/processing/chunk_markdown.py +0 -263
- package/ee/documents/processing/pdf_processor.spec +0 -115
- package/eslint.config.js +0 -88
- package/jest.config.ts +0 -25
- package/mintlify-docs/.mintignore +0 -7
- package/mintlify-docs/AGENTS.md +0 -33
- package/mintlify-docs/CLAUDE.MD +0 -50
- package/mintlify-docs/CONTRIBUTING.md +0 -32
- package/mintlify-docs/LICENSE +0 -21
- package/mintlify-docs/README.md +0 -55
- package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
- package/mintlify-docs/ai-tools/cursor.mdx +0 -39
- package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
- package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
- package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
- package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
- package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
- package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
- package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
- package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
- package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
- package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
- package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
- package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
- package/mintlify-docs/api-reference/core-types.mdx +0 -585
- package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
- package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
- package/mintlify-docs/api-reference/introduction.mdx +0 -661
- package/mintlify-docs/api-reference/mutations.mdx +0 -1012
- package/mintlify-docs/api-reference/openapi.json +0 -217
- package/mintlify-docs/api-reference/queries.mdx +0 -1154
- package/mintlify-docs/backend/introduction.mdx +0 -218
- package/mintlify-docs/changelog.mdx +0 -387
- package/mintlify-docs/community-edition.mdx +0 -304
- package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
- package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
- package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
- package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
- package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
- package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
- package/mintlify-docs/core/exulu-authentication.mdx +0 -810
- package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
- package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
- package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
- package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
- package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
- package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
- package/mintlify-docs/core/exulu-database.mdx +0 -811
- package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
- package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
- package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
- package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
- package/mintlify-docs/core/exulu-logging.mdx +0 -464
- package/mintlify-docs/core/exulu-otel.mdx +0 -670
- package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
- package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
- package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
- package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
- package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
- package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
- package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
- package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
- package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
- package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
- package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
- package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
- package/mintlify-docs/development.mdx +0 -94
- package/mintlify-docs/docs.json +0 -248
- package/mintlify-docs/enterprise-edition.mdx +0 -538
- package/mintlify-docs/essentials/code.mdx +0 -35
- package/mintlify-docs/essentials/images.mdx +0 -59
- package/mintlify-docs/essentials/markdown.mdx +0 -88
- package/mintlify-docs/essentials/navigation.mdx +0 -87
- package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
- package/mintlify-docs/essentials/settings.mdx +0 -318
- package/mintlify-docs/favicon.svg +0 -3
- package/mintlify-docs/frontend/introduction.mdx +0 -39
- package/mintlify-docs/getting-started.mdx +0 -267
- package/mintlify-docs/guides/custom-agent.mdx +0 -608
- package/mintlify-docs/guides/first-agent.mdx +0 -315
- package/mintlify-docs/images/admin_ui.png +0 -0
- package/mintlify-docs/images/contexts.png +0 -0
- package/mintlify-docs/images/create_agents.png +0 -0
- package/mintlify-docs/images/evals.png +0 -0
- package/mintlify-docs/images/graphql.png +0 -0
- package/mintlify-docs/images/graphql_api.png +0 -0
- package/mintlify-docs/images/hero-dark.png +0 -0
- package/mintlify-docs/images/hero-light.png +0 -0
- package/mintlify-docs/images/hero.png +0 -0
- package/mintlify-docs/images/knowledge_sources.png +0 -0
- package/mintlify-docs/images/mcp.png +0 -0
- package/mintlify-docs/images/scaling.png +0 -0
- package/mintlify-docs/index.mdx +0 -411
- package/mintlify-docs/logo/dark.svg +0 -9
- package/mintlify-docs/logo/light.svg +0 -9
- package/mintlify-docs/partners.mdx +0 -558
- package/mintlify-docs/products.mdx +0 -77
- package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
- package/mintlify-docs/styles.css +0 -207
- package/ngrok.bash +0 -1
- package/ngrok.md +0 -6
- package/ngrok.yml +0 -10
- package/release.config.cjs +0 -15
- package/skills-lock.json +0 -10
- package/types/context-processor.ts +0 -45
- package/types/enums/eval-types.ts +0 -5
- package/types/enums/field-types.ts +0 -1
- package/types/enums/jobs.ts +0 -11
- package/types/enums/statistics.ts +0 -13
- package/types/exulu-table-definition.ts +0 -79
- package/types/file-types.ts +0 -18
- package/types/models/agent-session.ts +0 -27
- package/types/models/agent.ts +0 -68
- package/types/models/context.ts +0 -53
- package/types/models/embedding.ts +0 -17
- package/types/models/eval-run.ts +0 -40
- package/types/models/exulu-agent-tool-config.ts +0 -11
- package/types/models/item.ts +0 -21
- package/types/models/job.ts +0 -8
- package/types/models/project.ts +0 -16
- package/types/models/rate-limiter-rules.ts +0 -7
- package/types/models/test-case.ts +0 -25
- package/types/models/tool.ts +0 -9
- package/types/models/user-role.ts +0 -12
- package/types/models/user.ts +0 -20
- package/types/models/variable.ts +0 -8
- package/types/models/vector-methods.ts +0 -7
- package/types/provider-config.ts +0 -21
- package/types/queue-config.ts +0 -16
- package/types/rbac-rights-modes.ts +0 -1
- package/types/statistics.ts +0 -20
- package/types/workflow.ts +0 -31
- /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
- /package/ee/{documents/processing → python}/requirements.txt +0 -0
|
@@ -1,680 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: "Configuration"
|
|
3
|
-
description: "Complete guide to configuring ExuluEval evaluation functions"
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
## Constructor parameters
|
|
7
|
-
|
|
8
|
-
ExuluEval requires specific configuration to define evaluation behavior:
|
|
9
|
-
|
|
10
|
-
```typescript
|
|
11
|
-
new ExuluEval({
|
|
12
|
-
id: string;
|
|
13
|
-
name: string;
|
|
14
|
-
description: string;
|
|
15
|
-
llm: boolean;
|
|
16
|
-
execute: (params) => Promise<number>;
|
|
17
|
-
config?: { name: string; description: string }[];
|
|
18
|
-
queue?: Promise<ExuluQueueConfig>;
|
|
19
|
-
})
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
<ParamField path="id" type="string" required>
|
|
23
|
-
Unique identifier for the evaluation function
|
|
24
|
-
</ParamField>
|
|
25
|
-
|
|
26
|
-
<ParamField path="name" type="string" required>
|
|
27
|
-
Human-readable name for the evaluation
|
|
28
|
-
</ParamField>
|
|
29
|
-
|
|
30
|
-
<ParamField path="description" type="string" required>
|
|
31
|
-
Description of what this evaluation measures
|
|
32
|
-
</ParamField>
|
|
33
|
-
|
|
34
|
-
<ParamField path="llm" type="boolean" required>
|
|
35
|
-
Whether this evaluation uses an LLM for scoring (LLM-as-judge)
|
|
36
|
-
</ParamField>
|
|
37
|
-
|
|
38
|
-
<ParamField path="execute" type="function" required>
|
|
39
|
-
Function that performs the evaluation and returns a score from 0 to 100
|
|
40
|
-
</ParamField>
|
|
41
|
-
|
|
42
|
-
<ParamField path="config" type="array" default={undefined}>
|
|
43
|
-
Optional configuration parameters for the evaluation function
|
|
44
|
-
</ParamField>
|
|
45
|
-
|
|
46
|
-
<ParamField path="queue" type="Promise<ExuluQueueConfig>" default={undefined}>
|
|
47
|
-
Optional queue configuration for running evaluations as background jobs
|
|
48
|
-
</ParamField>
|
|
49
|
-
|
|
50
|
-
## Execute function
|
|
51
|
-
|
|
52
|
-
The `execute` function receives evaluation parameters and must return a score between 0 and 100:
|
|
53
|
-
|
|
54
|
-
```typescript
|
|
55
|
-
execute: async ({
|
|
56
|
-
agent, // Agent database record
|
|
57
|
-
backend, // ExuluAgent instance
|
|
58
|
-
messages, // Conversation messages
|
|
59
|
-
testCase, // Test case with expected output
|
|
60
|
-
config // Optional runtime configuration
|
|
61
|
-
}) => {
|
|
62
|
-
// Your evaluation logic
|
|
63
|
-
return score; // Must be 0-100
|
|
64
|
-
}
|
|
65
|
-
```
|
|
66
|
-
|
|
67
|
-
### Parameters
|
|
68
|
-
|
|
69
|
-
<ParamField path="agent" type="Agent">
|
|
70
|
-
The agent database record being evaluated
|
|
71
|
-
```typescript
|
|
72
|
-
interface Agent {
|
|
73
|
-
id: string;
|
|
74
|
-
name: string;
|
|
75
|
-
description: string;
|
|
76
|
-
// ... other agent properties
|
|
77
|
-
}
|
|
78
|
-
```
|
|
79
|
-
</ParamField>
|
|
80
|
-
|
|
81
|
-
<ParamField path="backend" type="ExuluAgent">
|
|
82
|
-
ExuluAgent instance for generating responses or using LLM-as-judge
|
|
83
|
-
</ParamField>
|
|
84
|
-
|
|
85
|
-
<ParamField path="messages" type="UIMessage[]">
|
|
86
|
-
Array of conversation messages including inputs and generated response
|
|
87
|
-
```typescript
|
|
88
|
-
interface UIMessage {
|
|
89
|
-
role: "user" | "assistant" | "system";
|
|
90
|
-
content: string;
|
|
91
|
-
toolInvocations?: ToolInvocation[];
|
|
92
|
-
}
|
|
93
|
-
```
|
|
94
|
-
</ParamField>
|
|
95
|
-
|
|
96
|
-
<ParamField path="testCase" type="TestCase">
|
|
97
|
-
Test case containing inputs and expected outputs
|
|
98
|
-
```typescript
|
|
99
|
-
interface TestCase {
|
|
100
|
-
id: string;
|
|
101
|
-
name: string;
|
|
102
|
-
description?: string;
|
|
103
|
-
inputs: UIMessage[];
|
|
104
|
-
expected_output: string;
|
|
105
|
-
expected_tools?: string[];
|
|
106
|
-
expected_knowledge_sources?: string[];
|
|
107
|
-
expected_agent_tools?: string[];
|
|
108
|
-
}
|
|
109
|
-
```
|
|
110
|
-
</ParamField>
|
|
111
|
-
|
|
112
|
-
<ParamField path="config" type="Record<string, any>">
|
|
113
|
-
Runtime configuration values (optional)
|
|
114
|
-
</ParamField>
|
|
115
|
-
|
|
116
|
-
## Configuration patterns
|
|
117
|
-
|
|
118
|
-
### Basic exact match evaluation
|
|
119
|
-
|
|
120
|
-
```typescript
|
|
121
|
-
import { ExuluEval } from "@exulu/backend";
|
|
122
|
-
|
|
123
|
-
const exactMatchEval = new ExuluEval({
|
|
124
|
-
id: "exact_match",
|
|
125
|
-
name: "Exact Match",
|
|
126
|
-
description: "Returns 100 if response exactly matches expected output, 0 otherwise",
|
|
127
|
-
llm: false,
|
|
128
|
-
execute: async ({ messages, testCase }) => {
|
|
129
|
-
const lastMessage = messages[messages.length - 1];
|
|
130
|
-
const response = lastMessage?.content || "";
|
|
131
|
-
|
|
132
|
-
return response === testCase.expected_output ? 100 : 0;
|
|
133
|
-
}
|
|
134
|
-
});
|
|
135
|
-
```
|
|
136
|
-
|
|
137
|
-
### Partial match with scoring
|
|
138
|
-
|
|
139
|
-
```typescript
|
|
140
|
-
const partialMatchEval = new ExuluEval({
|
|
141
|
-
id: "partial_match",
|
|
142
|
-
name: "Partial Match",
|
|
143
|
-
description: "Scores based on how much of expected output appears in response",
|
|
144
|
-
llm: false,
|
|
145
|
-
execute: async ({ messages, testCase }) => {
|
|
146
|
-
const lastMessage = messages[messages.length - 1];
|
|
147
|
-
const response = lastMessage?.content?.toLowerCase() || "";
|
|
148
|
-
const expected = testCase.expected_output.toLowerCase();
|
|
149
|
-
|
|
150
|
-
// Split into words
|
|
151
|
-
const expectedWords = expected.split(/\s+/);
|
|
152
|
-
const matchedWords = expectedWords.filter(word =>
|
|
153
|
-
response.includes(word)
|
|
154
|
-
);
|
|
155
|
-
|
|
156
|
-
return (matchedWords.length / expectedWords.length) * 100;
|
|
157
|
-
}
|
|
158
|
-
});
|
|
159
|
-
```
|
|
160
|
-
|
|
161
|
-
### Keyword presence evaluation
|
|
162
|
-
|
|
163
|
-
```typescript
|
|
164
|
-
const keywordEval = new ExuluEval({
|
|
165
|
-
id: "keyword_presence",
|
|
166
|
-
name: "Keyword Presence",
|
|
167
|
-
description: "Checks if response contains required keywords",
|
|
168
|
-
llm: false,
|
|
169
|
-
execute: async ({ messages, testCase, config }) => {
|
|
170
|
-
const lastMessage = messages[messages.length - 1];
|
|
171
|
-
const response = lastMessage?.content?.toLowerCase() || "";
|
|
172
|
-
|
|
173
|
-
const keywords = config?.keywords || [];
|
|
174
|
-
if (keywords.length === 0) return 100;
|
|
175
|
-
|
|
176
|
-
const foundKeywords = keywords.filter(kw =>
|
|
177
|
-
response.includes(kw.toLowerCase())
|
|
178
|
-
);
|
|
179
|
-
|
|
180
|
-
return (foundKeywords.length / keywords.length) * 100;
|
|
181
|
-
},
|
|
182
|
-
config: [
|
|
183
|
-
{
|
|
184
|
-
name: "keywords",
|
|
185
|
-
description: "Array of keywords that should appear in response"
|
|
186
|
-
}
|
|
187
|
-
]
|
|
188
|
-
});
|
|
189
|
-
|
|
190
|
-
// Run with config
|
|
191
|
-
const score = await keywordEval.run(
|
|
192
|
-
agent,
|
|
193
|
-
backend,
|
|
194
|
-
testCase,
|
|
195
|
-
messages,
|
|
196
|
-
{ keywords: ["weather", "temperature", "San Francisco"] }
|
|
197
|
-
);
|
|
198
|
-
```
|
|
199
|
-
|
|
200
|
-
### LLM-as-judge evaluation
|
|
201
|
-
|
|
202
|
-
```typescript
|
|
203
|
-
const llmJudgeEval = new ExuluEval({
|
|
204
|
-
id: "llm_judge",
|
|
205
|
-
name: "LLM Judge",
|
|
206
|
-
description: "Uses an LLM to evaluate response quality",
|
|
207
|
-
llm: true,
|
|
208
|
-
execute: async ({ backend, messages, testCase, config }) => {
|
|
209
|
-
const lastMessage = messages[messages.length - 1];
|
|
210
|
-
const response = lastMessage?.content || "";
|
|
211
|
-
|
|
212
|
-
const judgePrompt = `
|
|
213
|
-
You are an expert evaluator. Rate the following response on a scale of 0-100.
|
|
214
|
-
|
|
215
|
-
Test Case: ${testCase.name}
|
|
216
|
-
Description: ${testCase.description || "N/A"}
|
|
217
|
-
|
|
218
|
-
Expected Output:
|
|
219
|
-
${testCase.expected_output}
|
|
220
|
-
|
|
221
|
-
Actual Response:
|
|
222
|
-
${response}
|
|
223
|
-
|
|
224
|
-
Criteria:
|
|
225
|
-
1. Accuracy: Does it match the expected output?
|
|
226
|
-
2. Completeness: Does it address all required aspects?
|
|
227
|
-
3. Clarity: Is it well-structured and understandable?
|
|
228
|
-
4. Relevance: Does it stay on topic?
|
|
229
|
-
|
|
230
|
-
Respond with ONLY a number from 0 to 100. No explanation.
|
|
231
|
-
`.trim();
|
|
232
|
-
|
|
233
|
-
const result = await backend.generateSync({
|
|
234
|
-
prompt: judgePrompt,
|
|
235
|
-
agentInstance: await loadAgent(config?.judgeAgentId || "default_judge"),
|
|
236
|
-
statistics: { label: "eval", trigger: "llm_judge" }
|
|
237
|
-
});
|
|
238
|
-
|
|
239
|
-
const score = parseInt(result.text.trim());
|
|
240
|
-
|
|
241
|
-
if (isNaN(score)) {
|
|
242
|
-
console.warn(`LLM judge returned non-numeric: ${result.text}`);
|
|
243
|
-
return 0;
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
return Math.max(0, Math.min(100, score));
|
|
247
|
-
},
|
|
248
|
-
config: [
|
|
249
|
-
{
|
|
250
|
-
name: "judgeAgentId",
|
|
251
|
-
description: "Agent ID to use for evaluation (must support text generation)"
|
|
252
|
-
}
|
|
253
|
-
]
|
|
254
|
-
});
|
|
255
|
-
```
|
|
256
|
-
|
|
257
|
-
### Tool usage evaluation
|
|
258
|
-
|
|
259
|
-
```typescript
|
|
260
|
-
const toolUsageEval = new ExuluEval({
|
|
261
|
-
id: "tool_usage",
|
|
262
|
-
name: "Tool Usage",
|
|
263
|
-
description: "Checks if agent used expected tools",
|
|
264
|
-
llm: false,
|
|
265
|
-
execute: async ({ messages, testCase }) => {
|
|
266
|
-
// Extract tool calls from conversation
|
|
267
|
-
const toolCalls = messages
|
|
268
|
-
.flatMap(msg => msg.toolInvocations || [])
|
|
269
|
-
.map(inv => inv.toolName);
|
|
270
|
-
|
|
271
|
-
const expectedTools = testCase.expected_tools || [];
|
|
272
|
-
|
|
273
|
-
// If no tools expected, check that no tools were used
|
|
274
|
-
if (expectedTools.length === 0) {
|
|
275
|
-
return toolCalls.length === 0 ? 100 : 0;
|
|
276
|
-
}
|
|
277
|
-
|
|
278
|
-
// Score by the fraction of expected tools that were actually used
|
|
279
|
-
const usedExpected = expectedTools.filter(tool =>
|
|
280
|
-
toolCalls.includes(tool)
|
|
281
|
-
);
|
|
282
|
-
|
|
283
|
-
return (usedExpected.length / expectedTools.length) * 100;
|
|
284
|
-
}
|
|
285
|
-
});
|
|
286
|
-
```
|
|
287
|
-
|
|
288
|
-
### Regex pattern matching
|
|
289
|
-
|
|
290
|
-
```typescript
|
|
291
|
-
const regexMatchEval = new ExuluEval({
|
|
292
|
-
id: "regex_match",
|
|
293
|
-
name: "Regex Pattern Match",
|
|
294
|
-
description: "Checks if response matches regex pattern",
|
|
295
|
-
llm: false,
|
|
296
|
-
execute: async ({ messages, testCase, config }) => {
|
|
297
|
-
const lastMessage = messages[messages.length - 1];
|
|
298
|
-
const response = lastMessage?.content || "";
|
|
299
|
-
|
|
300
|
-
const pattern = config?.pattern;
|
|
301
|
-
if (!pattern) {
|
|
302
|
-
throw new Error("Regex pattern required in config");
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
const regex = new RegExp(pattern, config?.flags || "");
|
|
306
|
-
return regex.test(response) ? 100 : 0;
|
|
307
|
-
},
|
|
308
|
-
config: [
|
|
309
|
-
{
|
|
310
|
-
name: "pattern",
|
|
311
|
-
description: "Regex pattern to match"
|
|
312
|
-
},
|
|
313
|
-
{
|
|
314
|
-
name: "flags",
|
|
315
|
-
description: "Regex flags (e.g., 'i' for case-insensitive)"
|
|
316
|
-
}
|
|
317
|
-
]
|
|
318
|
-
});
|
|
319
|
-
|
|
320
|
-
// Run with regex config
|
|
321
|
-
const score = await regexMatchEval.run(
|
|
322
|
-
agent,
|
|
323
|
-
backend,
|
|
324
|
-
testCase,
|
|
325
|
-
messages,
|
|
326
|
-
{
|
|
327
|
-
pattern: "\\d{2}°[FC]", // Matches temperature like "68°F"
|
|
328
|
-
flags: "i"
|
|
329
|
-
}
|
|
330
|
-
);
|
|
331
|
-
```
|
|
332
|
-
|
|
333
|
-
### Length-based evaluation
|
|
334
|
-
|
|
335
|
-
```typescript
|
|
336
|
-
const lengthEval = new ExuluEval({
|
|
337
|
-
id: "response_length",
|
|
338
|
-
name: "Response Length",
|
|
339
|
-
description: "Scores based on response length within acceptable range",
|
|
340
|
-
llm: false,
|
|
341
|
-
execute: async ({ messages, config }) => {
|
|
342
|
-
const lastMessage = messages[messages.length - 1];
|
|
343
|
-
const response = lastMessage?.content || "";
|
|
344
|
-
const length = response.length;
|
|
345
|
-
|
|
346
|
-
const minLength = config?.minLength || 0;
|
|
347
|
-
const maxLength = config?.maxLength || Infinity;
|
|
348
|
-
const targetLength = config?.targetLength;
|
|
349
|
-
|
|
350
|
-
// Below the minimum: score in proportion to how close the length is to minLength
|
|
351
|
-
if (length < minLength) {
|
|
352
|
-
return Math.max(0, (length / minLength) * 100);
|
|
353
|
-
}
|
|
354
|
-
|
|
355
|
-
if (length > maxLength) {
|
|
356
|
-
return Math.max(0, 100 - ((length - maxLength) / maxLength) * 100);
|
|
357
|
-
}
|
|
358
|
-
|
|
359
|
-
// Within range: if a target length is given, score by proximity to it
|
|
360
|
-
if (targetLength) {
|
|
361
|
-
const deviation = Math.abs(length - targetLength);
|
|
362
|
-
const maxDeviation = Math.max(
|
|
363
|
-
targetLength - minLength,
|
|
364
|
-
maxLength - targetLength
|
|
365
|
-
);
|
|
366
|
-
return Math.max(0, 100 - (deviation / maxDeviation) * 50);
|
|
367
|
-
}
|
|
368
|
-
|
|
369
|
-
return 100;
|
|
370
|
-
},
|
|
371
|
-
config: [
|
|
372
|
-
{
|
|
373
|
-
name: "minLength",
|
|
374
|
-
description: "Minimum acceptable character count"
|
|
375
|
-
},
|
|
376
|
-
{
|
|
377
|
-
name: "maxLength",
|
|
378
|
-
description: "Maximum acceptable character count"
|
|
379
|
-
},
|
|
380
|
-
{
|
|
381
|
-
name: "targetLength",
|
|
382
|
-
description: "Ideal character count (optional)"
|
|
383
|
-
}
|
|
384
|
-
]
|
|
385
|
-
});
|
|
386
|
-
```
|
|
387
|
-
|
|
388
|
-
### Composite evaluation
|
|
389
|
-
|
|
390
|
-
Combine multiple evaluation criteria:
|
|
391
|
-
|
|
392
|
-
```typescript
|
|
393
|
-
const compositeEval = new ExuluEval({
|
|
394
|
-
id: "composite",
|
|
395
|
-
name: "Composite Evaluation",
|
|
396
|
-
description: "Combines multiple evaluation criteria with weights",
|
|
397
|
-
llm: false,
|
|
398
|
-
execute: async ({ messages, testCase, config }) => {
|
|
399
|
-
const lastMessage = messages[messages.length - 1];
|
|
400
|
-
const response = lastMessage?.content || "";
|
|
401
|
-
|
|
402
|
-
let totalScore = 0;
|
|
403
|
-
let totalWeight = 0;
|
|
404
|
-
|
|
405
|
-
// Criterion 1: Contains expected output (weight: 50%)
|
|
406
|
-
const containsExpected = response.includes(testCase.expected_output);
|
|
407
|
-
totalScore += containsExpected ? 50 : 0;
|
|
408
|
-
totalWeight += 50;
|
|
409
|
-
|
|
410
|
-
// Criterion 2: Reasonable length (weight: 20%)
|
|
411
|
-
const isReasonableLength = response.length >= 50 && response.length <= 500;
|
|
412
|
-
totalScore += isReasonableLength ? 20 : 0;
|
|
413
|
-
totalWeight += 20;
|
|
414
|
-
|
|
415
|
-
// Criterion 3: Uses tools if expected (weight: 30%)
|
|
416
|
-
const toolCalls = messages.flatMap(msg => msg.toolInvocations || []);
|
|
417
|
-
const expectedTools = testCase.expected_tools || [];
|
|
418
|
-
if (expectedTools.length > 0) {
|
|
419
|
-
const toolsUsed = expectedTools.every(tool =>
|
|
420
|
-
toolCalls.some(call => call.toolName === tool)
|
|
421
|
-
);
|
|
422
|
-
totalScore += toolsUsed ? 30 : 0;
|
|
423
|
-
totalWeight += 30;
|
|
424
|
-
} else {
|
|
425
|
-
totalScore += 30; // No tools expected, full points
|
|
426
|
-
totalWeight += 30;
|
|
427
|
-
}
|
|
428
|
-
|
|
429
|
-
return (totalScore / totalWeight) * 100;
|
|
430
|
-
}
|
|
431
|
-
});
|
|
432
|
-
```
|
|
433
|
-
|
|
434
|
-
## Queue configuration
|
|
435
|
-
|
|
436
|
-
Run evaluations as background jobs using ExuluQueues:
|
|
437
|
-
|
|
438
|
-
```typescript
|
|
439
|
-
import { ExuluEval, ExuluQueues } from "@exulu/backend";
|
|
440
|
-
|
|
441
|
-
const backgroundEval = new ExuluEval({
|
|
442
|
-
id: "background_eval",
|
|
443
|
-
name: "Background Evaluation",
|
|
444
|
-
description: "Runs as queued job",
|
|
445
|
-
llm: true,
|
|
446
|
-
execute: async ({ backend, messages, testCase }) => {
|
|
447
|
-
// Long-running evaluation logic
|
|
448
|
-
return 85;
|
|
449
|
-
},
|
|
450
|
-
queue: Promise.resolve({
|
|
451
|
-
connection: await ExuluQueues.getConnection(),
|
|
452
|
-
name: "evaluations",
|
|
453
|
-
prefix: "{exulu}",
|
|
454
|
-
defaultJobOptions: {
|
|
455
|
-
attempts: 3,
|
|
456
|
-
backoff: {
|
|
457
|
-
type: "exponential",
|
|
458
|
-
delay: 2000
|
|
459
|
-
},
|
|
460
|
-
removeOnComplete: true,
|
|
461
|
-
removeOnFail: false
|
|
462
|
-
}
|
|
463
|
-
})
|
|
464
|
-
});
|
|
465
|
-
```
|
|
466
|
-
|
|
467
|
-
## Advanced patterns
|
|
468
|
-
|
|
469
|
-
### Semantic similarity evaluation
|
|
470
|
-
|
|
471
|
-
Use embeddings to measure semantic similarity:
|
|
472
|
-
|
|
473
|
-
```typescript
|
|
474
|
-
import { ExuluEval, ExuluEmbedder, ExuluVariables } from "@exulu/backend";
|
|
475
|
-
|
|
476
|
-
const semanticSimilarityEval = new ExuluEval({
|
|
477
|
-
id: "semantic_similarity",
|
|
478
|
-
name: "Semantic Similarity",
|
|
479
|
-
description: "Measures semantic similarity using embeddings",
|
|
480
|
-
llm: false,
|
|
481
|
-
execute: async ({ messages, testCase, config }) => {
|
|
482
|
-
const lastMessage = messages[messages.length - 1];
|
|
483
|
-
const response = lastMessage?.content || "";
|
|
484
|
-
|
|
485
|
-
const embedder = new ExuluEmbedder({
|
|
486
|
-
id: "eval_embedder",
|
|
487
|
-
name: "Evaluation Embedder",
|
|
488
|
-
provider: "openai",
|
|
489
|
-
model: "text-embedding-3-small",
|
|
490
|
-
vectorDimensions: 1536,
|
|
491
|
-
authenticationInformation: await ExuluVariables.get("openai_api_key")
|
|
492
|
-
});
|
|
493
|
-
|
|
494
|
-
const [responseEmb, expectedEmb] = await embedder.generate([
|
|
495
|
-
response,
|
|
496
|
-
testCase.expected_output
|
|
497
|
-
]);
|
|
498
|
-
|
|
499
|
-
// Cosine similarity
|
|
500
|
-
const similarity = cosineSimilarity(responseEmb, expectedEmb);
|
|
501
|
-
|
|
502
|
-
// Scale to 0-100 (cosine similarity can be negative; clamp the result if a strict 0-100 score is required)
|
|
503
|
-
return similarity * 100;
|
|
504
|
-
}
|
|
505
|
-
});
|
|
506
|
-
|
|
507
|
-
function cosineSimilarity(a: number[], b: number[]): number {
|
|
508
|
-
const dotProduct = a.reduce((sum, val, i) => sum + val * b[i], 0);
|
|
509
|
-
const magnitudeA = Math.sqrt(a.reduce((sum, val) => sum + val * val, 0));
|
|
510
|
-
const magnitudeB = Math.sqrt(b.reduce((sum, val) => sum + val * val, 0));
|
|
511
|
-
return dotProduct / (magnitudeA * magnitudeB);
|
|
512
|
-
}
|
|
513
|
-
```
|
|
514
|
-
|
|
515
|
-
### Multi-aspect LLM judge
|
|
516
|
-
|
|
517
|
-
Evaluate multiple aspects separately:
|
|
518
|
-
|
|
519
|
-
```typescript
|
|
520
|
-
const multiAspectJudgeEval = new ExuluEval({
|
|
521
|
-
id: "multi_aspect_judge",
|
|
522
|
-
name: "Multi-Aspect LLM Judge",
|
|
523
|
-
description: "Evaluates multiple aspects with separate LLM calls",
|
|
524
|
-
llm: true,
|
|
525
|
-
execute: async ({ backend, messages, testCase, config }) => {
|
|
526
|
-
const lastMessage = messages[messages.length - 1];
|
|
527
|
-
const response = lastMessage?.content || "";
|
|
528
|
-
|
|
529
|
-
const aspects = [
|
|
530
|
-
{
|
|
531
|
-
name: "accuracy",
|
|
532
|
-
weight: 40,
|
|
533
|
-
prompt: "Rate the accuracy of this response (0-100):"
|
|
534
|
-
},
|
|
535
|
-
{
|
|
536
|
-
name: "clarity",
|
|
537
|
-
weight: 30,
|
|
538
|
-
prompt: "Rate the clarity and readability (0-100):"
|
|
539
|
-
},
|
|
540
|
-
{
|
|
541
|
-
name: "completeness",
|
|
542
|
-
weight: 30,
|
|
543
|
-
prompt: "Rate how complete the response is (0-100):"
|
|
544
|
-
}
|
|
545
|
-
];
|
|
546
|
-
|
|
547
|
-
let totalScore = 0;
|
|
548
|
-
let totalWeight = 0;
|
|
549
|
-
|
|
550
|
-
for (const aspect of aspects) {
|
|
551
|
-
const judgePrompt = `
|
|
552
|
-
${aspect.prompt}
|
|
553
|
-
|
|
554
|
-
Expected: ${testCase.expected_output}
|
|
555
|
-
Actual: ${response}
|
|
556
|
-
|
|
557
|
-
Respond with ONLY a number 0-100.
|
|
558
|
-
`.trim();
|
|
559
|
-
|
|
560
|
-
const result = await backend.generateSync({
|
|
561
|
-
prompt: judgePrompt,
|
|
562
|
-
agentInstance: await loadAgent(config?.judgeAgentId || "default_judge"),
|
|
563
|
-
statistics: { label: "eval", trigger: "multi_aspect_judge" }
|
|
564
|
-
});
|
|
565
|
-
|
|
566
|
-
const score = parseInt(result.text.trim());
|
|
567
|
-
if (!isNaN(score)) {
|
|
568
|
-
totalScore += Math.max(0, Math.min(100, score)) * aspect.weight;
|
|
569
|
-
totalWeight += aspect.weight;
|
|
570
|
-
}
|
|
571
|
-
}
|
|
572
|
-
|
|
573
|
-
return totalWeight > 0 ? totalScore / totalWeight : 0;
|
|
574
|
-
},
|
|
575
|
-
config: [
|
|
576
|
-
{
|
|
577
|
-
name: "judgeAgentId",
|
|
578
|
-
description: "Agent ID for LLM judge"
|
|
579
|
-
}
|
|
580
|
-
]
|
|
581
|
-
});
|
|
582
|
-
```
|
|
583
|
-
|
|
584
|
-
### A/B testing evaluation
|
|
585
|
-
|
|
586
|
-
Compare two agent configurations:
|
|
587
|
-
|
|
588
|
-
```typescript
|
|
589
|
-
async function compareAgents(
|
|
590
|
-
agentA: Agent,
|
|
591
|
-
agentB: Agent,
|
|
592
|
-
backendA: ExuluAgent,
|
|
593
|
-
backendB: ExuluAgent,
|
|
594
|
-
testCases: TestCase[],
|
|
595
|
-
evals: ExuluEval[]
|
|
596
|
-
) {
|
|
597
|
-
const resultsA = [];
|
|
598
|
-
const resultsB = [];
|
|
599
|
-
|
|
600
|
-
for (const testCase of testCases) {
|
|
601
|
-
// Generate response from Agent A
|
|
602
|
-
const responseA = await backendA.generateSync({
|
|
603
|
-
prompt: testCase.inputs[testCase.inputs.length - 1].content,
|
|
604
|
-
agentInstance: await loadAgent(agentA.id),
|
|
605
|
-
statistics: { label: "ab_test", trigger: "test" }
|
|
606
|
-
});
|
|
607
|
-
|
|
608
|
-
const messagesA = [
|
|
609
|
-
...testCase.inputs,
|
|
610
|
-
{ role: "assistant", content: responseA.text }
|
|
611
|
-
];
|
|
612
|
-
|
|
613
|
-
// Generate response from Agent B
|
|
614
|
-
const responseB = await backendB.generateSync({
|
|
615
|
-
prompt: testCase.inputs[testCase.inputs.length - 1].content,
|
|
616
|
-
agentInstance: await loadAgent(agentB.id),
|
|
617
|
-
statistics: { label: "ab_test", trigger: "test" }
|
|
618
|
-
});
|
|
619
|
-
|
|
620
|
-
const messagesB = [
|
|
621
|
-
...testCase.inputs,
|
|
622
|
-
{ role: "assistant", content: responseB.text }
|
|
623
|
-
];
|
|
624
|
-
|
|
625
|
-
// Run evaluations on both
|
|
626
|
-
for (const eval of evals) {
|
|
627
|
-
const scoreA = await eval.run(agentA, backendA, testCase, messagesA);
|
|
628
|
-
const scoreB = await eval.run(agentB, backendB, testCase, messagesB);
|
|
629
|
-
|
|
630
|
-
resultsA.push({ testCase: testCase.name, eval: eval.name, score: scoreA });
|
|
631
|
-
resultsB.push({ testCase: testCase.name, eval: eval.name, score: scoreB });
|
|
632
|
-
}
|
|
633
|
-
}
|
|
634
|
-
|
|
635
|
-
// Calculate averages
|
|
636
|
-
const avgA = resultsA.reduce((sum, r) => sum + r.score, 0) / resultsA.length;
|
|
637
|
-
const avgB = resultsB.reduce((sum, r) => sum + r.score, 0) / resultsB.length;
|
|
638
|
-
|
|
639
|
-
return {
|
|
640
|
-
agentA: { results: resultsA, average: avgA },
|
|
641
|
-
agentB: { results: resultsB, average: avgB },
|
|
642
|
-
winner: avgA > avgB ? "Agent A" : "Agent B"
|
|
643
|
-
};
|
|
644
|
-
}
|
|
645
|
-
```
|
|
646
|
-
|
|
647
|
-
## Best practices
|
|
648
|
-
|
|
649
|
-
<Tip>
|
|
650
|
-
**Weighted scoring**: For composite evaluations, use weighted scoring to prioritize important criteria.
|
|
651
|
-
</Tip>
|
|
652
|
-
|
|
653
|
-
<Note>
|
|
654
|
-
**Error handling**: Always handle errors in execute functions: either return 0 or throw a descriptive error.
|
|
655
|
-
</Note>
|
|
656
|
-
|
|
657
|
-
<Warning>
|
|
658
|
-
**LLM judge reliability**: LLM judges can be inconsistent. Run the evaluation multiple times and average the scores, or use temperature=0 for more consistent results.
|
|
659
|
-
</Warning>
|
|
660
|
-
|
|
661
|
-
<Info>
|
|
662
|
-
**Config validation**: Validate config parameters at the start of execute functions to provide clear error messages.
|
|
663
|
-
</Info>
|
|
664
|
-
|
|
665
|
-
## Next steps
|
|
666
|
-
|
|
667
|
-
<CardGroup cols={2}>
|
|
668
|
-
<Card title="API reference" icon="code" href="/core/exulu-eval/api-reference">
|
|
669
|
-
Explore all methods and properties
|
|
670
|
-
</Card>
|
|
671
|
-
<Card title="Overview" icon="book" href="/core/exulu-eval/introduction">
|
|
672
|
-
Learn about evaluation concepts
|
|
673
|
-
</Card>
|
|
674
|
-
<Card title="ExuluQueues" icon="layer-group" href="/core/exulu-queues/introduction">
|
|
675
|
-
Run evaluations as background jobs
|
|
676
|
-
</Card>
|
|
677
|
-
<Card title="ExuluAgent" icon="robot" href="/core/exulu-agent/introduction">
|
|
678
|
-
Create agents to evaluate
|
|
679
|
-
</Card>
|
|
680
|
-
</CardGroup>
|