@exulu/backend 1.48.2 → 1.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +351 -42
- package/dist/index.d.cts +96 -1
- package/dist/index.d.ts +96 -1
- package/dist/index.js +340 -38
- package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
- package/ee/python/README.md +295 -0
- package/ee/python/documents/processing/README.md +155 -0
- package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
- package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
- package/ee/python/setup.sh +180 -0
- package/package.json +14 -3
- package/scripts/postinstall.cjs +149 -0
- package/.agents/skills/mintlify/SKILL.md +0 -347
- package/.editorconfig +0 -15
- package/.eslintrc.json +0 -52
- package/.github/workflows/release-backend.yml +0 -38
- package/.husky/commit-msg +0 -1
- package/.jscpd.json +0 -18
- package/.mcp.json +0 -25
- package/.nvmrc +0 -1
- package/.prettierignore +0 -5
- package/.prettierrc.json +0 -12
- package/CHANGELOG.md +0 -8
- package/SECURITY.md +0 -5
- package/commitlint.config.js +0 -4
- package/devops/documentation/patch-older-releases.md +0 -42
- package/ee/documents/processing/build_pdf_processor.sh +0 -35
- package/ee/documents/processing/chunk_markdown.py +0 -263
- package/ee/documents/processing/pdf_processor.spec +0 -115
- package/eslint.config.js +0 -88
- package/jest.config.ts +0 -25
- package/mintlify-docs/.mintignore +0 -7
- package/mintlify-docs/AGENTS.md +0 -33
- package/mintlify-docs/CLAUDE.MD +0 -50
- package/mintlify-docs/CONTRIBUTING.md +0 -32
- package/mintlify-docs/LICENSE +0 -21
- package/mintlify-docs/README.md +0 -55
- package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
- package/mintlify-docs/ai-tools/cursor.mdx +0 -39
- package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
- package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
- package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
- package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
- package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
- package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
- package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
- package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
- package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
- package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
- package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
- package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
- package/mintlify-docs/api-reference/core-types.mdx +0 -585
- package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
- package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
- package/mintlify-docs/api-reference/introduction.mdx +0 -661
- package/mintlify-docs/api-reference/mutations.mdx +0 -1012
- package/mintlify-docs/api-reference/openapi.json +0 -217
- package/mintlify-docs/api-reference/queries.mdx +0 -1154
- package/mintlify-docs/backend/introduction.mdx +0 -218
- package/mintlify-docs/changelog.mdx +0 -387
- package/mintlify-docs/community-edition.mdx +0 -304
- package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
- package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
- package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
- package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
- package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
- package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
- package/mintlify-docs/core/exulu-authentication.mdx +0 -810
- package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
- package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
- package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
- package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
- package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
- package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
- package/mintlify-docs/core/exulu-database.mdx +0 -811
- package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
- package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
- package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
- package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
- package/mintlify-docs/core/exulu-logging.mdx +0 -464
- package/mintlify-docs/core/exulu-otel.mdx +0 -670
- package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
- package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
- package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
- package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
- package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
- package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
- package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
- package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
- package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
- package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
- package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
- package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
- package/mintlify-docs/development.mdx +0 -94
- package/mintlify-docs/docs.json +0 -248
- package/mintlify-docs/enterprise-edition.mdx +0 -538
- package/mintlify-docs/essentials/code.mdx +0 -35
- package/mintlify-docs/essentials/images.mdx +0 -59
- package/mintlify-docs/essentials/markdown.mdx +0 -88
- package/mintlify-docs/essentials/navigation.mdx +0 -87
- package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
- package/mintlify-docs/essentials/settings.mdx +0 -318
- package/mintlify-docs/favicon.svg +0 -3
- package/mintlify-docs/frontend/introduction.mdx +0 -39
- package/mintlify-docs/getting-started.mdx +0 -267
- package/mintlify-docs/guides/custom-agent.mdx +0 -608
- package/mintlify-docs/guides/first-agent.mdx +0 -315
- package/mintlify-docs/images/admin_ui.png +0 -0
- package/mintlify-docs/images/contexts.png +0 -0
- package/mintlify-docs/images/create_agents.png +0 -0
- package/mintlify-docs/images/evals.png +0 -0
- package/mintlify-docs/images/graphql.png +0 -0
- package/mintlify-docs/images/graphql_api.png +0 -0
- package/mintlify-docs/images/hero-dark.png +0 -0
- package/mintlify-docs/images/hero-light.png +0 -0
- package/mintlify-docs/images/hero.png +0 -0
- package/mintlify-docs/images/knowledge_sources.png +0 -0
- package/mintlify-docs/images/mcp.png +0 -0
- package/mintlify-docs/images/scaling.png +0 -0
- package/mintlify-docs/index.mdx +0 -411
- package/mintlify-docs/logo/dark.svg +0 -9
- package/mintlify-docs/logo/light.svg +0 -9
- package/mintlify-docs/partners.mdx +0 -558
- package/mintlify-docs/products.mdx +0 -77
- package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
- package/mintlify-docs/styles.css +0 -207
- package/ngrok.bash +0 -1
- package/ngrok.md +0 -6
- package/ngrok.yml +0 -10
- package/release.config.cjs +0 -15
- package/skills-lock.json +0 -10
- package/types/context-processor.ts +0 -45
- package/types/enums/eval-types.ts +0 -5
- package/types/enums/field-types.ts +0 -1
- package/types/enums/jobs.ts +0 -11
- package/types/enums/statistics.ts +0 -13
- package/types/exulu-table-definition.ts +0 -79
- package/types/file-types.ts +0 -18
- package/types/models/agent-session.ts +0 -27
- package/types/models/agent.ts +0 -68
- package/types/models/context.ts +0 -53
- package/types/models/embedding.ts +0 -17
- package/types/models/eval-run.ts +0 -40
- package/types/models/exulu-agent-tool-config.ts +0 -11
- package/types/models/item.ts +0 -21
- package/types/models/job.ts +0 -8
- package/types/models/project.ts +0 -16
- package/types/models/rate-limiter-rules.ts +0 -7
- package/types/models/test-case.ts +0 -25
- package/types/models/tool.ts +0 -9
- package/types/models/user-role.ts +0 -12
- package/types/models/user.ts +0 -20
- package/types/models/variable.ts +0 -8
- package/types/models/vector-methods.ts +0 -7
- package/types/provider-config.ts +0 -21
- package/types/queue-config.ts +0 -16
- package/types/rbac-rights-modes.ts +0 -1
- package/types/statistics.ts +0 -20
- package/types/workflow.ts +0 -31
- /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
- /package/ee/{documents/processing → python}/requirements.txt +0 -0
|
@@ -1,459 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: "Overview"
|
|
3
|
-
description: "Custom evaluation functions to measure and score agent performance"
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
## Overview
|
|
7
|
-
|
|
8
|
-
`ExuluEval` is a class for creating custom evaluation functions that measure and score agent performance against test cases. Evaluations allow you to systematically test your agents, track quality over time, and identify areas for improvement.
|
|
9
|
-
|
|
10
|
-
## What is ExuluEval?
|
|
11
|
-
|
|
12
|
-
ExuluEval provides a framework for defining evaluation logic that:
|
|
13
|
-
|
|
14
|
-
- **Scores agent responses**: Returns a score from 0 to 100 based on custom criteria
|
|
15
|
-
- **Runs against test cases**: Evaluates agent behavior using structured test inputs
|
|
16
|
-
- **Supports any evaluation method**: Custom logic, LLM-as-judge, regex matching, or any scoring approach
|
|
17
|
-
- **Integrates with queues**: Can be run as background jobs using ExuluQueues
|
|
18
|
-
- **Enables A/B testing**: Compare different agent configurations, prompts, or models
|
|
19
|
-
|
|
20
|
-
<CardGroup cols={2}>
|
|
21
|
-
<Card title="Custom scoring logic" icon="code">
|
|
22
|
-
Write any evaluation function in TypeScript
|
|
23
|
-
</Card>
|
|
24
|
-
<Card title="Test cases" icon="clipboard-check">
|
|
25
|
-
Structured inputs with expected outputs
|
|
26
|
-
</Card>
|
|
27
|
-
<Card title="LLM-as-judge" icon="scale-balanced">
|
|
28
|
-
Use LLMs to evaluate response quality
|
|
29
|
-
</Card>
|
|
30
|
-
<Card title="Queue integration" icon="layer-group">
|
|
31
|
-
Run evaluations as background jobs
|
|
32
|
-
</Card>
|
|
33
|
-
</CardGroup>
|
|
34
|
-
|
|
35
|
-
## Why use evaluations?
|
|
36
|
-
|
|
37
|
-
Evaluations help you:
|
|
38
|
-
|
|
39
|
-
<AccordionGroup>
|
|
40
|
-
<Accordion title="Measure quality">
|
|
41
|
-
Quantify agent performance with consistent scoring criteria across all responses
|
|
42
|
-
</Accordion>
|
|
43
|
-
|
|
44
|
-
<Accordion title="Prevent regressions">
|
|
45
|
-
Catch performance degradation when updating prompts, models, or tools
|
|
46
|
-
</Accordion>
|
|
47
|
-
|
|
48
|
-
<Accordion title="Compare configurations">
|
|
49
|
-
A/B test different agent setups to find the best performing configuration
|
|
50
|
-
</Accordion>
|
|
51
|
-
|
|
52
|
-
<Accordion title="Track improvements">
|
|
53
|
-
Monitor evaluation scores over time to verify that changes improve quality
|
|
54
|
-
</Accordion>
|
|
55
|
-
|
|
56
|
-
<Accordion title="Automate testing">
|
|
57
|
-
Build CI/CD pipelines that fail if evaluation scores drop below thresholds
|
|
58
|
-
</Accordion>
|
|
59
|
-
</AccordionGroup>
|
|
60
|
-
|
|
61
|
-
## Quick start
|
|
62
|
-
|
|
63
|
-
```typescript
|
|
64
|
-
import { ExuluEval } from "@exulu/backend";
|
|
65
|
-
|
|
66
|
-
// Create an evaluation function
|
|
67
|
-
const exactMatchEval = new ExuluEval({
|
|
68
|
-
id: "exact_match",
|
|
69
|
-
name: "Exact Match",
|
|
70
|
-
description: "Checks if response exactly matches expected output",
|
|
71
|
-
llm: false, // Not using LLM-as-judge
|
|
72
|
-
execute: async ({ messages, testCase }) => {
|
|
73
|
-
const lastMessage = messages[messages.length - 1];
|
|
74
|
-
const response = lastMessage?.content || "";
|
|
75
|
-
|
|
76
|
-
return response === testCase.expected_output ? 100 : 0;
|
|
77
|
-
}
|
|
78
|
-
});
|
|
79
|
-
|
|
80
|
-
// Run against a test case
|
|
81
|
-
const score = await exactMatchEval.run(
|
|
82
|
-
agent, // Agent database record
|
|
83
|
-
backend, // ExuluAgent instance
|
|
84
|
-
testCase, // Test case with inputs and expected output
|
|
85
|
-
messages // Conversation messages
|
|
86
|
-
);
|
|
87
|
-
|
|
88
|
-
console.log(`Score: ${score}/100`);
|
|
89
|
-
```
|
|
90
|
-
|
|
91
|
-
## Evaluation types
|
|
92
|
-
|
|
93
|
-
### Custom logic evaluations
|
|
94
|
-
|
|
95
|
-
Write any scoring logic in TypeScript:
|
|
96
|
-
|
|
97
|
-
```typescript
|
|
98
|
-
const containsKeywordEval = new ExuluEval({
|
|
99
|
-
id: "contains_keyword",
|
|
100
|
-
name: "Contains Keyword",
|
|
101
|
-
description: "Checks if response contains required keywords",
|
|
102
|
-
llm: false,
|
|
103
|
-
execute: async ({ messages, testCase, config }) => {
|
|
104
|
-
const lastMessage = messages[messages.length - 1];
|
|
105
|
-
const response = lastMessage?.content?.toLowerCase() || "";
|
|
106
|
-
|
|
107
|
-
const keywords = config?.keywords || [];
|
|
108
|
-
const foundKeywords = keywords.filter(kw => response.includes(kw.toLowerCase()));
|
|
109
|
-
|
|
110
|
-
return keywords.length === 0 ? 100 : (foundKeywords.length / keywords.length) * 100; // no required keywords → nothing to miss (avoids 0/0 = NaN)
|
|
111
|
-
},
|
|
112
|
-
config: [
|
|
113
|
-
{
|
|
114
|
-
name: "keywords",
|
|
115
|
-
description: "List of keywords that should appear in the response"
|
|
116
|
-
}
|
|
117
|
-
]
|
|
118
|
-
});
|
|
119
|
-
```
|
|
120
|
-
|
|
121
|
-
### LLM-as-judge evaluations
|
|
122
|
-
|
|
123
|
-
Use an LLM to evaluate response quality:
|
|
124
|
-
|
|
125
|
-
```typescript
|
|
126
|
-
const llmJudgeEval = new ExuluEval({
|
|
127
|
-
id: "llm_judge_quality",
|
|
128
|
-
name: "LLM Judge - Quality",
|
|
129
|
-
description: "Uses an LLM to evaluate response quality",
|
|
130
|
-
llm: true, // Using LLM
|
|
131
|
-
execute: async ({ backend, messages, testCase, config }) => {
|
|
132
|
-
const lastMessage = messages[messages.length - 1];
|
|
133
|
-
const response = lastMessage?.content || "";
|
|
134
|
-
|
|
135
|
-
const prompt = `
|
|
136
|
-
You are an expert evaluator. Rate the quality of this response on a scale of 0-100.
|
|
137
|
-
|
|
138
|
-
Test Case: ${testCase.name}
|
|
139
|
-
Expected: ${testCase.expected_output}
|
|
140
|
-
Actual Response: ${response}
|
|
141
|
-
|
|
142
|
-
Consider:
|
|
143
|
-
- Accuracy: Does it match the expected output?
|
|
144
|
-
- Completeness: Does it address all aspects?
|
|
145
|
-
- Clarity: Is it well-structured and clear?
|
|
146
|
-
|
|
147
|
-
Respond with ONLY a number from 0-100.
|
|
148
|
-
`;
|
|
149
|
-
|
|
150
|
-
const result = await backend.generateSync({
|
|
151
|
-
prompt,
|
|
152
|
-
agentInstance: await loadAgent(config?.judgeAgentId),
|
|
153
|
-
statistics: { label: "eval", trigger: "system" }
|
|
154
|
-
});
|
|
155
|
-
|
|
156
|
-
const score = parseInt(result.text);
|
|
157
|
-
return isNaN(score) ? 0 : Math.max(0, Math.min(100, score));
|
|
158
|
-
},
|
|
159
|
-
config: [
|
|
160
|
-
{
|
|
161
|
-
name: "judgeAgentId",
|
|
162
|
-
description: "Agent ID to use as judge"
|
|
163
|
-
}
|
|
164
|
-
]
|
|
165
|
-
});
|
|
166
|
-
```
|
|
167
|
-
|
|
168
|
-
### Tool usage evaluations
|
|
169
|
-
|
|
170
|
-
Check if the agent used the correct tools:
|
|
171
|
-
|
|
172
|
-
```typescript
|
|
173
|
-
const toolUsageEval = new ExuluEval({
|
|
174
|
-
id: "tool_usage",
|
|
175
|
-
name: "Tool Usage",
|
|
176
|
-
description: "Checks if agent used expected tools",
|
|
177
|
-
llm: false,
|
|
178
|
-
execute: async ({ messages, testCase }) => {
|
|
179
|
-
// Extract tool calls from messages
|
|
180
|
-
const toolCalls = messages
|
|
181
|
-
.flatMap(msg => msg.toolInvocations || [])
|
|
182
|
-
.map(inv => inv.toolName);
|
|
183
|
-
|
|
184
|
-
const expectedTools = testCase.expected_tools || [];
|
|
185
|
-
|
|
186
|
-
if (expectedTools.length === 0) return 100;
|
|
187
|
-
|
|
188
|
-
const usedExpected = expectedTools.filter(tool => toolCalls.includes(tool));
|
|
189
|
-
|
|
190
|
-
return (usedExpected.length / expectedTools.length) * 100;
|
|
191
|
-
}
|
|
192
|
-
});
|
|
193
|
-
```
|
|
194
|
-
|
|
195
|
-
### Similarity evaluations
|
|
196
|
-
|
|
197
|
-
Use embeddings to measure semantic similarity:
|
|
198
|
-
|
|
199
|
-
```typescript
|
|
200
|
-
import { ExuluEval, ExuluEmbedder, ExuluVariables } from "@exulu/backend";
|
|
201
|
-
|
|
202
|
-
const similarityEval = new ExuluEval({
|
|
203
|
-
id: "semantic_similarity",
|
|
204
|
-
name: "Semantic Similarity",
|
|
205
|
-
description: "Measures semantic similarity between response and expected output",
|
|
206
|
-
llm: false,
|
|
207
|
-
execute: async ({ messages, testCase }) => {
|
|
208
|
-
const lastMessage = messages[messages.length - 1];
|
|
209
|
-
const response = lastMessage?.content || "";
|
|
210
|
-
|
|
211
|
-
const embedder = new ExuluEmbedder({
|
|
212
|
-
id: "eval_embedder",
|
|
213
|
-
name: "Eval Embedder",
|
|
214
|
-
provider: "openai",
|
|
215
|
-
model: "text-embedding-3-small",
|
|
216
|
-
vectorDimensions: 1536,
|
|
217
|
-
authenticationInformation: await ExuluVariables.get("openai_api_key")
|
|
218
|
-
});
|
|
219
|
-
|
|
220
|
-
const [responseEmb, expectedEmb] = await embedder.generate([
|
|
221
|
-
response,
|
|
222
|
-
testCase.expected_output
|
|
223
|
-
]);
|
|
224
|
-
|
|
225
|
-
// Cosine similarity
|
|
226
|
-
const similarity = cosineSimilarity(responseEmb, expectedEmb);
|
|
227
|
-
|
|
228
|
-
return Math.max(0, Math.min(100, similarity * 100)); // cosine similarity can be negative; keep the score within 0-100
|
|
229
|
-
}
|
|
230
|
-
});
|
|
231
|
-
|
|
232
|
-
function cosineSimilarity(a: number[], b: number[]): number {
|
|
233
|
-
const dotProduct = a.reduce((sum, val, i) => sum + val * b[i], 0);
|
|
234
|
-
const magnitudeA = Math.sqrt(a.reduce((sum, val) => sum + val * val, 0));
|
|
235
|
-
const magnitudeB = Math.sqrt(b.reduce((sum, val) => sum + val * val, 0));
|
|
236
|
-
return dotProduct / (magnitudeA * magnitudeB);
|
|
237
|
-
}
|
|
238
|
-
```
|
|
239
|
-
|
|
240
|
-
## Test cases
|
|
241
|
-
|
|
242
|
-
Test cases define the inputs and expected outputs for evaluations:
|
|
243
|
-
|
|
244
|
-
```typescript
|
|
245
|
-
interface TestCase {
|
|
246
|
-
id: string;
|
|
247
|
-
name: string;
|
|
248
|
-
description?: string;
|
|
249
|
-
inputs: UIMessage[]; // Input conversation
|
|
250
|
-
expected_output: string; // Expected response
|
|
251
|
-
expected_tools?: string[]; // Expected tool calls
|
|
252
|
-
expected_knowledge_sources?: string[]; // Expected contexts used
|
|
253
|
-
expected_agent_tools?: string[]; // Expected agent tools
|
|
254
|
-
createdAt: string;
|
|
255
|
-
updatedAt: string;
|
|
256
|
-
}
|
|
257
|
-
```
|
|
258
|
-
|
|
259
|
-
**Example test case:**
|
|
260
|
-
|
|
261
|
-
```typescript
|
|
262
|
-
const testCase: TestCase = {
|
|
263
|
-
id: "tc_001",
|
|
264
|
-
name: "Weather query",
|
|
265
|
-
description: "User asks about weather",
|
|
266
|
-
inputs: [
|
|
267
|
-
{
|
|
268
|
-
role: "user",
|
|
269
|
-
content: "What's the weather like in San Francisco?"
|
|
270
|
-
}
|
|
271
|
-
],
|
|
272
|
-
expected_output: "Based on current data, it's 68°F and sunny in San Francisco.",
|
|
273
|
-
expected_tools: ["get_weather"],
|
|
274
|
-
expected_knowledge_sources: [],
|
|
275
|
-
expected_agent_tools: [],
|
|
276
|
-
createdAt: "2025-01-15T10:00:00Z",
|
|
277
|
-
updatedAt: "2025-01-15T10:00:00Z"
|
|
278
|
-
};
|
|
279
|
-
```
|
|
280
|
-
|
|
281
|
-
## Running evaluations
|
|
282
|
-
|
|
283
|
-
### Basic evaluation run
|
|
284
|
-
|
|
285
|
-
```typescript
|
|
286
|
-
import { ExuluEval, ExuluAgent } from "@exulu/backend";
|
|
287
|
-
|
|
288
|
-
const eval = new ExuluEval({
|
|
289
|
-
id: "my_eval",
|
|
290
|
-
name: "My Evaluation",
|
|
291
|
-
description: "Custom evaluation",
|
|
292
|
-
llm: false,
|
|
293
|
-
execute: async ({ messages, testCase }) => {
|
|
294
|
-
// Your scoring logic
|
|
295
|
-
return 85; // Score from 0-100
|
|
296
|
-
}
|
|
297
|
-
});
|
|
298
|
-
|
|
299
|
-
// Run evaluation
|
|
300
|
-
const score = await eval.run(
|
|
301
|
-
agent, // Agent DB record
|
|
302
|
-
backend, // ExuluAgent instance
|
|
303
|
-
testCase, // TestCase
|
|
304
|
-
messages, // UIMessage[]
|
|
305
|
-
config // Optional config
|
|
306
|
-
);
|
|
307
|
-
|
|
308
|
-
console.log(`Score: ${score}/100`);
|
|
309
|
-
```
|
|
310
|
-
|
|
311
|
-
### Batch evaluation
|
|
312
|
-
|
|
313
|
-
Run multiple evaluations on a test suite:
|
|
314
|
-
|
|
315
|
-
```typescript
|
|
316
|
-
async function runEvaluations(
|
|
317
|
-
agent: Agent,
|
|
318
|
-
backend: ExuluAgent,
|
|
319
|
-
testCases: TestCase[],
|
|
320
|
-
evals: ExuluEval[]
|
|
321
|
-
) {
|
|
322
|
-
const results = [];
|
|
323
|
-
|
|
324
|
-
for (const testCase of testCases) {
|
|
325
|
-
// Generate response
|
|
326
|
-
const response = await backend.generateSync({
|
|
327
|
-
prompt: testCase.inputs[testCase.inputs.length - 1].content,
|
|
328
|
-
agentInstance: await loadAgent(agent.id),
|
|
329
|
-
statistics: { label: "eval", trigger: "test" }
|
|
330
|
-
});
|
|
331
|
-
|
|
332
|
-
const messages = [
|
|
333
|
-
...testCase.inputs,
|
|
334
|
-
{ role: "assistant", content: response.text }
|
|
335
|
-
];
|
|
336
|
-
|
|
337
|
-
// Run all evals on this test case
|
|
338
|
-
for (const evaluation of evals) {
|
|
339
|
-
const score = await evaluation.run(agent, backend, testCase, messages);
|
|
340
|
-
|
|
341
|
-
results.push({
|
|
342
|
-
testCaseId: testCase.id,
|
|
343
|
-
testCaseName: testCase.name,
|
|
344
|
-
evalId: evaluation.id,
|
|
345
|
-
evalName: evaluation.name,
|
|
346
|
-
score
|
|
347
|
-
});
|
|
348
|
-
}
|
|
349
|
-
}
|
|
350
|
-
|
|
351
|
-
return results;
|
|
352
|
-
}
|
|
353
|
-
```
|
|
354
|
-
|
|
355
|
-
## Integration with ExuluQueues
|
|
356
|
-
|
|
357
|
-
Run evaluations as background jobs:
|
|
358
|
-
|
|
359
|
-
```typescript
|
|
360
|
-
import { ExuluEval, ExuluQueues } from "@exulu/backend";
|
|
361
|
-
|
|
362
|
-
// Create eval with queue config
|
|
363
|
-
const backgroundEval = new ExuluEval({
|
|
364
|
-
id: "background_eval",
|
|
365
|
-
name: "Background Evaluation",
|
|
366
|
-
description: "Runs as background job",
|
|
367
|
-
llm: true,
|
|
368
|
-
execute: async ({ backend, messages, testCase }) => {
|
|
369
|
-
// Evaluation logic
|
|
370
|
-
return 90;
|
|
371
|
-
},
|
|
372
|
-
queue: Promise.resolve({
|
|
373
|
-
connection: await ExuluQueues.getConnection(),
|
|
374
|
-
name: "evaluations",
|
|
375
|
-
prefix: "{exulu}",
|
|
376
|
-
defaultJobOptions: {
|
|
377
|
-
attempts: 3,
|
|
378
|
-
backoff: { type: "exponential", delay: 2000 }
|
|
379
|
-
}
|
|
380
|
-
})
|
|
381
|
-
});
|
|
382
|
-
|
|
383
|
-
// Queue the evaluation job
|
|
384
|
-
// (Implementation depends on your worker setup)
|
|
385
|
-
```
|
|
386
|
-
|
|
387
|
-
## Best practices
|
|
388
|
-
|
|
389
|
-
<Tip>
|
|
390
|
-
**Start simple**: Begin with basic evaluations (exact match, keyword presence) before building complex LLM-as-judge evaluations.
|
|
391
|
-
</Tip>
|
|
392
|
-
|
|
393
|
-
<Note>
|
|
394
|
-
**Multiple evaluations**: Use multiple evaluation functions to assess different aspects (accuracy, tone, tool usage, etc.).
|
|
395
|
-
</Note>
|
|
396
|
-
|
|
397
|
-
<Warning>
|
|
398
|
-
**Score range**: Evaluation functions must return a score between 0 and 100. Returning a score outside this range throws an error.
|
|
399
|
-
</Warning>
|
|
400
|
-
|
|
401
|
-
<Info>
|
|
402
|
-
**Test case quality**: Good test cases are specific, representative of real usage, and have clear expected outputs.
|
|
403
|
-
</Info>
|
|
404
|
-
|
|
405
|
-
## When to use ExuluEval
|
|
406
|
-
|
|
407
|
-
<AccordionGroup>
|
|
408
|
-
<Accordion title="Agent development">
|
|
409
|
-
Test agent behavior during development to catch issues early
|
|
410
|
-
</Accordion>
|
|
411
|
-
|
|
412
|
-
<Accordion title="Prompt engineering">
|
|
413
|
-
Compare prompt variations to find the best performing instructions
|
|
414
|
-
</Accordion>
|
|
415
|
-
|
|
416
|
-
<Accordion title="Model comparison">
|
|
417
|
-
Evaluate the same agent with different LLMs (GPT-4 vs. Claude vs. Gemini)
|
|
418
|
-
</Accordion>
|
|
419
|
-
|
|
420
|
-
<Accordion title="CI/CD pipelines">
|
|
421
|
-
Automated testing in deployment pipelines to prevent regressions
|
|
422
|
-
</Accordion>
|
|
423
|
-
|
|
424
|
-
<Accordion title="Quality monitoring">
|
|
425
|
-
Continuous evaluation in production to track performance over time
|
|
426
|
-
</Accordion>
|
|
427
|
-
</AccordionGroup>
|
|
428
|
-
|
|
429
|
-
## Evaluation workflow
|
|
430
|
-
|
|
431
|
-
```mermaid
|
|
432
|
-
graph TD
|
|
433
|
-
A[Create Test Cases] --> B[Define Evaluation Functions]
|
|
434
|
-
B --> C[Generate Agent Response]
|
|
435
|
-
C --> D[Run Evaluation]
|
|
436
|
-
D --> E{Score >= Threshold?}
|
|
437
|
-
E -->|Yes| F[Pass]
|
|
438
|
-
E -->|No| G[Fail]
|
|
439
|
-
F --> H[Deploy / Continue]
|
|
440
|
-
G --> I[Fix Agent]
|
|
441
|
-
I --> C
|
|
442
|
-
```
|
|
443
|
-
|
|
444
|
-
## Next steps
|
|
445
|
-
|
|
446
|
-
<CardGroup cols={2}>
|
|
447
|
-
<Card title="Configuration" icon="gear" href="/core/exulu-eval/configuration">
|
|
448
|
-
Learn about evaluation configuration
|
|
449
|
-
</Card>
|
|
450
|
-
<Card title="API reference" icon="code" href="/core/exulu-eval/api-reference">
|
|
451
|
-
Explore methods and properties
|
|
452
|
-
</Card>
|
|
453
|
-
<Card title="ExuluAgent" icon="robot" href="/core/exulu-agent/introduction">
|
|
454
|
-
Create agents to evaluate
|
|
455
|
-
</Card>
|
|
456
|
-
<Card title="ExuluQueues" icon="layer-group" href="/core/exulu-queues/introduction">
|
|
457
|
-
Run evaluations as background jobs
|
|
458
|
-
</Card>
|
|
459
|
-
</CardGroup>
|