@exulu/backend 1.48.2 → 1.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +351 -42
- package/dist/index.d.cts +96 -1
- package/dist/index.d.ts +96 -1
- package/dist/index.js +340 -38
- package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
- package/ee/python/README.md +295 -0
- package/ee/python/documents/processing/README.md +155 -0
- package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
- package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
- package/ee/python/setup.sh +180 -0
- package/package.json +14 -3
- package/scripts/postinstall.cjs +149 -0
- package/.agents/skills/mintlify/SKILL.md +0 -347
- package/.editorconfig +0 -15
- package/.eslintrc.json +0 -52
- package/.github/workflows/release-backend.yml +0 -38
- package/.husky/commit-msg +0 -1
- package/.jscpd.json +0 -18
- package/.mcp.json +0 -25
- package/.nvmrc +0 -1
- package/.prettierignore +0 -5
- package/.prettierrc.json +0 -12
- package/CHANGELOG.md +0 -8
- package/SECURITY.md +0 -5
- package/commitlint.config.js +0 -4
- package/devops/documentation/patch-older-releases.md +0 -42
- package/ee/documents/processing/build_pdf_processor.sh +0 -35
- package/ee/documents/processing/chunk_markdown.py +0 -263
- package/ee/documents/processing/pdf_processor.spec +0 -115
- package/eslint.config.js +0 -88
- package/jest.config.ts +0 -25
- package/mintlify-docs/.mintignore +0 -7
- package/mintlify-docs/AGENTS.md +0 -33
- package/mintlify-docs/CLAUDE.MD +0 -50
- package/mintlify-docs/CONTRIBUTING.md +0 -32
- package/mintlify-docs/LICENSE +0 -21
- package/mintlify-docs/README.md +0 -55
- package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
- package/mintlify-docs/ai-tools/cursor.mdx +0 -39
- package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
- package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
- package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
- package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
- package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
- package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
- package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
- package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
- package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
- package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
- package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
- package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
- package/mintlify-docs/api-reference/core-types.mdx +0 -585
- package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
- package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
- package/mintlify-docs/api-reference/introduction.mdx +0 -661
- package/mintlify-docs/api-reference/mutations.mdx +0 -1012
- package/mintlify-docs/api-reference/openapi.json +0 -217
- package/mintlify-docs/api-reference/queries.mdx +0 -1154
- package/mintlify-docs/backend/introduction.mdx +0 -218
- package/mintlify-docs/changelog.mdx +0 -387
- package/mintlify-docs/community-edition.mdx +0 -304
- package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
- package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
- package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
- package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
- package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
- package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
- package/mintlify-docs/core/exulu-authentication.mdx +0 -810
- package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
- package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
- package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
- package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
- package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
- package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
- package/mintlify-docs/core/exulu-database.mdx +0 -811
- package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
- package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
- package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
- package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
- package/mintlify-docs/core/exulu-logging.mdx +0 -464
- package/mintlify-docs/core/exulu-otel.mdx +0 -670
- package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
- package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
- package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
- package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
- package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
- package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
- package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
- package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
- package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
- package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
- package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
- package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
- package/mintlify-docs/development.mdx +0 -94
- package/mintlify-docs/docs.json +0 -248
- package/mintlify-docs/enterprise-edition.mdx +0 -538
- package/mintlify-docs/essentials/code.mdx +0 -35
- package/mintlify-docs/essentials/images.mdx +0 -59
- package/mintlify-docs/essentials/markdown.mdx +0 -88
- package/mintlify-docs/essentials/navigation.mdx +0 -87
- package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
- package/mintlify-docs/essentials/settings.mdx +0 -318
- package/mintlify-docs/favicon.svg +0 -3
- package/mintlify-docs/frontend/introduction.mdx +0 -39
- package/mintlify-docs/getting-started.mdx +0 -267
- package/mintlify-docs/guides/custom-agent.mdx +0 -608
- package/mintlify-docs/guides/first-agent.mdx +0 -315
- package/mintlify-docs/images/admin_ui.png +0 -0
- package/mintlify-docs/images/contexts.png +0 -0
- package/mintlify-docs/images/create_agents.png +0 -0
- package/mintlify-docs/images/evals.png +0 -0
- package/mintlify-docs/images/graphql.png +0 -0
- package/mintlify-docs/images/graphql_api.png +0 -0
- package/mintlify-docs/images/hero-dark.png +0 -0
- package/mintlify-docs/images/hero-light.png +0 -0
- package/mintlify-docs/images/hero.png +0 -0
- package/mintlify-docs/images/knowledge_sources.png +0 -0
- package/mintlify-docs/images/mcp.png +0 -0
- package/mintlify-docs/images/scaling.png +0 -0
- package/mintlify-docs/index.mdx +0 -411
- package/mintlify-docs/logo/dark.svg +0 -9
- package/mintlify-docs/logo/light.svg +0 -9
- package/mintlify-docs/partners.mdx +0 -558
- package/mintlify-docs/products.mdx +0 -77
- package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
- package/mintlify-docs/styles.css +0 -207
- package/ngrok.bash +0 -1
- package/ngrok.md +0 -6
- package/ngrok.yml +0 -10
- package/release.config.cjs +0 -15
- package/skills-lock.json +0 -10
- package/types/context-processor.ts +0 -45
- package/types/enums/eval-types.ts +0 -5
- package/types/enums/field-types.ts +0 -1
- package/types/enums/jobs.ts +0 -11
- package/types/enums/statistics.ts +0 -13
- package/types/exulu-table-definition.ts +0 -79
- package/types/file-types.ts +0 -18
- package/types/models/agent-session.ts +0 -27
- package/types/models/agent.ts +0 -68
- package/types/models/context.ts +0 -53
- package/types/models/embedding.ts +0 -17
- package/types/models/eval-run.ts +0 -40
- package/types/models/exulu-agent-tool-config.ts +0 -11
- package/types/models/item.ts +0 -21
- package/types/models/job.ts +0 -8
- package/types/models/project.ts +0 -16
- package/types/models/rate-limiter-rules.ts +0 -7
- package/types/models/test-case.ts +0 -25
- package/types/models/tool.ts +0 -9
- package/types/models/user-role.ts +0 -12
- package/types/models/user.ts +0 -20
- package/types/models/variable.ts +0 -8
- package/types/models/vector-methods.ts +0 -7
- package/types/provider-config.ts +0 -21
- package/types/queue-config.ts +0 -16
- package/types/rbac-rights-modes.ts +0 -1
- package/types/statistics.ts +0 -20
- package/types/workflow.ts +0 -31
- /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
- /package/ee/{documents/processing → python}/requirements.txt +0 -0
|
@@ -1,772 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: "API reference"
|
|
3
|
-
description: "Complete method and property reference for ExuluEval"
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
## ExuluEval class
|
|
7
|
-
|
|
8
|
-
```typescript
|
|
9
|
-
class ExuluEval {
|
|
10
|
-
public id: string;
|
|
11
|
-
public name: string;
|
|
12
|
-
public description: string;
|
|
13
|
-
public llm: boolean;
|
|
14
|
-
public config?: { name: string; description: string }[];
|
|
15
|
-
public queue?: Promise<ExuluQueueConfig>;
|
|
16
|
-
|
|
17
|
-
constructor(params: ExuluEvalParams);
|
|
18
|
-
async run(
|
|
19
|
-
agent: Agent,
|
|
20
|
-
backend: ExuluAgent,
|
|
21
|
-
testCase: TestCase,
|
|
22
|
-
messages: UIMessage[],
|
|
23
|
-
config?: Record<string, any>
|
|
24
|
-
): Promise<number>;
|
|
25
|
-
}
|
|
26
|
-
```
|
|
27
|
-
|
|
28
|
-
## Constructor
|
|
29
|
-
|
|
30
|
-
Creates a new evaluation function instance.
|
|
31
|
-
|
|
32
|
-
```typescript
|
|
33
|
-
new ExuluEval(params: ExuluEvalParams)
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
### Parameters
|
|
37
|
-
|
|
38
|
-
<ParamField path="params" type="ExuluEvalParams" required>
|
|
39
|
-
Configuration object for the evaluation function
|
|
40
|
-
|
|
41
|
-
```typescript
|
|
42
|
-
interface ExuluEvalParams {
|
|
43
|
-
id: string;
|
|
44
|
-
name: string;
|
|
45
|
-
description: string;
|
|
46
|
-
llm: boolean;
|
|
47
|
-
execute: (params: ExecuteParams) => Promise<number>;
|
|
48
|
-
config?: { name: string; description: string }[];
|
|
49
|
-
queue?: Promise<ExuluQueueConfig>;
|
|
50
|
-
}
|
|
51
|
-
```
|
|
52
|
-
</ParamField>
|
|
53
|
-
|
|
54
|
-
<ParamField path="params.id" type="string" required>
|
|
55
|
-
Unique identifier for this evaluation function
|
|
56
|
-
</ParamField>
|
|
57
|
-
|
|
58
|
-
<ParamField path="params.name" type="string" required>
|
|
59
|
-
Human-readable name
|
|
60
|
-
</ParamField>
|
|
61
|
-
|
|
62
|
-
<ParamField path="params.description" type="string" required>
|
|
63
|
-
Description of what this evaluation measures
|
|
64
|
-
</ParamField>
|
|
65
|
-
|
|
66
|
-
<ParamField path="params.llm" type="boolean" required>
|
|
67
|
-
Whether this evaluation uses an LLM (LLM-as-judge)
|
|
68
|
-
</ParamField>
|
|
69
|
-
|
|
70
|
-
<ParamField path="params.execute" type="function" required>
|
|
71
|
-
Function that performs the evaluation
|
|
72
|
-
```typescript
|
|
73
|
-
async (params: {
|
|
74
|
-
agent: Agent;
|
|
75
|
-
backend: ExuluAgent;
|
|
76
|
-
messages: UIMessage[];
|
|
77
|
-
testCase: TestCase;
|
|
78
|
-
config?: Record<string, any>;
|
|
79
|
-
}) => Promise<number>
|
|
80
|
-
```
|
|
81
|
-
Must return a score between 0 and 100, inclusive
|
|
82
|
-
</ParamField>
|
|
83
|
-
|
|
84
|
-
<ParamField path="params.config" type="array">
|
|
85
|
-
Optional configuration schema
|
|
86
|
-
```typescript
|
|
87
|
-
{
|
|
88
|
-
name: string; // Config parameter name
|
|
89
|
-
description: string; // What this parameter does
|
|
90
|
-
}[]
|
|
91
|
-
```
|
|
92
|
-
</ParamField>
|
|
93
|
-
|
|
94
|
-
<ParamField path="params.queue" type="Promise<ExuluQueueConfig>">
|
|
95
|
-
Optional queue configuration for background execution
|
|
96
|
-
</ParamField>
|
|
97
|
-
|
|
98
|
-
### Example
|
|
99
|
-
|
|
100
|
-
```typescript
|
|
101
|
-
import { ExuluEval } from "@exulu/backend";
|
|
102
|
-
|
|
103
|
-
const eval = new ExuluEval({
|
|
104
|
-
id: "exact_match",
|
|
105
|
-
name: "Exact Match",
|
|
106
|
-
description: "Checks if response exactly matches expected output",
|
|
107
|
-
llm: false,
|
|
108
|
-
execute: async ({ messages, testCase }) => {
|
|
109
|
-
const response = messages[messages.length - 1]?.content || "";
|
|
110
|
-
return response === testCase.expected_output ? 100 : 0;
|
|
111
|
-
}
|
|
112
|
-
});
|
|
113
|
-
```
|
|
114
|
-
|
|
115
|
-
## Properties
|
|
116
|
-
|
|
117
|
-
### id
|
|
118
|
-
|
|
119
|
-
<ResponseField name="id" type="string">
|
|
120
|
-
Unique identifier for this evaluation function
|
|
121
|
-
</ResponseField>
|
|
122
|
-
|
|
123
|
-
```typescript
|
|
124
|
-
const evalId = eval.id; // "exact_match"
|
|
125
|
-
```
|
|
126
|
-
|
|
127
|
-
### name
|
|
128
|
-
|
|
129
|
-
<ResponseField name="name" type="string">
|
|
130
|
-
Human-readable name for the evaluation
|
|
131
|
-
</ResponseField>
|
|
132
|
-
|
|
133
|
-
```typescript
|
|
134
|
-
const evalName = eval.name; // "Exact Match"
|
|
135
|
-
```
|
|
136
|
-
|
|
137
|
-
### description
|
|
138
|
-
|
|
139
|
-
<ResponseField name="description" type="string">
|
|
140
|
-
Description of what this evaluation measures
|
|
141
|
-
</ResponseField>
|
|
142
|
-
|
|
143
|
-
```typescript
|
|
144
|
-
const evalDesc = eval.description; // "Checks if response exactly matches expected output"
|
|
145
|
-
```
|
|
146
|
-
|
|
147
|
-
### llm
|
|
148
|
-
|
|
149
|
-
<ResponseField name="llm" type="boolean">
|
|
150
|
-
Whether this evaluation uses an LLM for scoring
|
|
151
|
-
</ResponseField>
|
|
152
|
-
|
|
153
|
-
```typescript
|
|
154
|
-
const usesLLM = eval.llm; // false
|
|
155
|
-
```
|
|
156
|
-
|
|
157
|
-
### config
|
|
158
|
-
|
|
159
|
-
<ResponseField name="config" type="array | undefined">
|
|
160
|
-
Configuration schema defining runtime parameters
|
|
161
|
-
|
|
162
|
-
```typescript
|
|
163
|
-
{
|
|
164
|
-
name: string;
|
|
165
|
-
description: string;
|
|
166
|
-
}[]
|
|
167
|
-
```
|
|
168
|
-
</ResponseField>
|
|
169
|
-
|
|
170
|
-
```typescript
|
|
171
|
-
const configSchema = eval.config;
|
|
172
|
-
// [{ name: "threshold", description: "Minimum score threshold" }]
|
|
173
|
-
```
|
|
174
|
-
|
|
175
|
-
### queue
|
|
176
|
-
|
|
177
|
-
<ResponseField name="queue" type="Promise<ExuluQueueConfig> | undefined">
|
|
178
|
-
Queue configuration for background execution
|
|
179
|
-
</ResponseField>
|
|
180
|
-
|
|
181
|
-
```typescript
|
|
182
|
-
const queueConfig = await eval.queue;
|
|
183
|
-
```
|
|
184
|
-
|
|
185
|
-
## Methods
|
|
186
|
-
|
|
187
|
-
### run()
|
|
188
|
-
|
|
189
|
-
Executes the evaluation function and returns a score.
|
|
190
|
-
|
|
191
|
-
```typescript
|
|
192
|
-
async run(
|
|
193
|
-
agent: Agent,
|
|
194
|
-
backend: ExuluAgent,
|
|
195
|
-
testCase: TestCase,
|
|
196
|
-
messages: UIMessage[],
|
|
197
|
-
config?: Record<string, any>
|
|
198
|
-
): Promise<number>
|
|
199
|
-
```
|
|
200
|
-
|
|
201
|
-
<ParamField path="agent" type="Agent" required>
|
|
202
|
-
Agent database record being evaluated
|
|
203
|
-
```typescript
|
|
204
|
-
interface Agent {
|
|
205
|
-
id: string;
|
|
206
|
-
name: string;
|
|
207
|
-
description: string;
|
|
208
|
-
// ... other properties
|
|
209
|
-
}
|
|
210
|
-
```
|
|
211
|
-
</ParamField>
|
|
212
|
-
|
|
213
|
-
<ParamField path="backend" type="ExuluAgent" required>
|
|
214
|
-
ExuluAgent instance for generating responses or using LLM-as-judge
|
|
215
|
-
</ParamField>
|
|
216
|
-
|
|
217
|
-
<ParamField path="testCase" type="TestCase" required>
|
|
218
|
-
Test case with inputs and expected outputs
|
|
219
|
-
```typescript
|
|
220
|
-
interface TestCase {
|
|
221
|
-
id: string;
|
|
222
|
-
name: string;
|
|
223
|
-
description?: string;
|
|
224
|
-
inputs: UIMessage[];
|
|
225
|
-
expected_output: string;
|
|
226
|
-
expected_tools?: string[];
|
|
227
|
-
expected_knowledge_sources?: string[];
|
|
228
|
-
expected_agent_tools?: string[];
|
|
229
|
-
createdAt: string;
|
|
230
|
-
updatedAt: string;
|
|
231
|
-
}
|
|
232
|
-
```
|
|
233
|
-
</ParamField>
|
|
234
|
-
|
|
235
|
-
<ParamField path="messages" type="UIMessage[]" required>
|
|
236
|
-
Conversation messages including inputs and agent responses
|
|
237
|
-
```typescript
|
|
238
|
-
interface UIMessage {
|
|
239
|
-
role: "user" | "assistant" | "system";
|
|
240
|
-
content: string;
|
|
241
|
-
toolInvocations?: ToolInvocation[];
|
|
242
|
-
}
|
|
243
|
-
```
|
|
244
|
-
</ParamField>
|
|
245
|
-
|
|
246
|
-
<ParamField path="config" type="Record<string, any>">
|
|
247
|
-
Optional runtime configuration values
|
|
248
|
-
</ParamField>
|
|
249
|
-
|
|
250
|
-
<ResponseField name="return" type="Promise<number>">
|
|
251
|
-
Score from 0 to 100, inclusive
|
|
252
|
-
</ResponseField>
|
|
253
|
-
|
|
254
|
-
**Example:**
|
|
255
|
-
|
|
256
|
-
```typescript
|
|
257
|
-
const score = await eval.run(
|
|
258
|
-
agent,
|
|
259
|
-
backend,
|
|
260
|
-
testCase,
|
|
261
|
-
messages,
|
|
262
|
-
{ threshold: 80 }
|
|
263
|
-
);
|
|
264
|
-
|
|
265
|
-
console.log(`Score: ${score}/100`);
|
|
266
|
-
```
|
|
267
|
-
|
|
268
|
-
**Error handling:**
|
|
269
|
-
|
|
270
|
-
```typescript
|
|
271
|
-
try {
|
|
272
|
-
const score = await eval.run(agent, backend, testCase, messages);
|
|
273
|
-
console.log(`Score: ${score}`);
|
|
274
|
-
} catch (error) {
|
|
275
|
-
console.error("Evaluation failed:", error.message);
|
|
276
|
-
// Error: Eval function must return a score between 0 and 100, got 150
|
|
277
|
-
}
|
|
278
|
-
```
|
|
279
|
-
|
|
280
|
-
**Throws:**
|
|
281
|
-
- Error if the execute function returns a score below 0 or above 100
|
|
282
|
-
- Error if the execute function throws an error
|
|
283
|
-
|
|
284
|
-
## Type definitions
|
|
285
|
-
|
|
286
|
-
### ExuluEvalParams
|
|
287
|
-
|
|
288
|
-
```typescript
|
|
289
|
-
interface ExuluEvalParams {
|
|
290
|
-
id: string;
|
|
291
|
-
name: string;
|
|
292
|
-
description: string;
|
|
293
|
-
llm: boolean;
|
|
294
|
-
execute: (params: {
|
|
295
|
-
agent: Agent;
|
|
296
|
-
backend: ExuluAgent;
|
|
297
|
-
messages: UIMessage[];
|
|
298
|
-
testCase: TestCase;
|
|
299
|
-
config?: Record<string, any>;
|
|
300
|
-
}) => Promise<number>;
|
|
301
|
-
config?: {
|
|
302
|
-
name: string;
|
|
303
|
-
description: string;
|
|
304
|
-
}[];
|
|
305
|
-
queue?: Promise<ExuluQueueConfig>;
|
|
306
|
-
}
|
|
307
|
-
```
|
|
308
|
-
|
|
309
|
-
### TestCase
|
|
310
|
-
|
|
311
|
-
```typescript
|
|
312
|
-
interface TestCase {
|
|
313
|
-
id: string;
|
|
314
|
-
name: string;
|
|
315
|
-
description?: string;
|
|
316
|
-
inputs: UIMessage[]; // Input messages
|
|
317
|
-
expected_output: string; // Expected response
|
|
318
|
-
expected_tools?: string[]; // Expected tool names
|
|
319
|
-
expected_knowledge_sources?: string[]; // Expected context IDs
|
|
320
|
-
expected_agent_tools?: string[]; // Expected agent tool IDs
|
|
321
|
-
createdAt: string;
|
|
322
|
-
updatedAt: string;
|
|
323
|
-
}
|
|
324
|
-
```
|
|
325
|
-
|
|
326
|
-
### UIMessage
|
|
327
|
-
|
|
328
|
-
```typescript
|
|
329
|
-
interface UIMessage {
|
|
330
|
-
role: "user" | "assistant" | "system";
|
|
331
|
-
content: string;
|
|
332
|
-
toolInvocations?: ToolInvocation[];
|
|
333
|
-
}
|
|
334
|
-
|
|
335
|
-
interface ToolInvocation {
|
|
336
|
-
toolName: string;
|
|
337
|
-
toolCallId: string;
|
|
338
|
-
args: Record<string, any>;
|
|
339
|
-
result?: any;
|
|
340
|
-
}
|
|
341
|
-
```
|
|
342
|
-
|
|
343
|
-
## Usage examples
|
|
344
|
-
|
|
345
|
-
### Basic exact match
|
|
346
|
-
|
|
347
|
-
```typescript
|
|
348
|
-
import { ExuluEval } from "@exulu/backend";
|
|
349
|
-
|
|
350
|
-
const exactMatch = new ExuluEval({
|
|
351
|
-
id: "exact_match",
|
|
352
|
-
name: "Exact Match",
|
|
353
|
-
description: "100 if exact match, 0 otherwise",
|
|
354
|
-
llm: false,
|
|
355
|
-
execute: async ({ messages, testCase }) => {
|
|
356
|
-
const response = messages[messages.length - 1]?.content || "";
|
|
357
|
-
return response === testCase.expected_output ? 100 : 0;
|
|
358
|
-
}
|
|
359
|
-
});
|
|
360
|
-
|
|
361
|
-
const score = await exactMatch.run(agent, backend, testCase, messages);
|
|
362
|
-
console.log(`Score: ${score}/100`);
|
|
363
|
-
```
|
|
364
|
-
|
|
365
|
-
### Keyword evaluation with config
|
|
366
|
-
|
|
367
|
-
```typescript
|
|
368
|
-
const keywordEval = new ExuluEval({
|
|
369
|
-
id: "keyword_check",
|
|
370
|
-
name: "Keyword Check",
|
|
371
|
-
description: "Checks for presence of keywords",
|
|
372
|
-
llm: false,
|
|
373
|
-
execute: async ({ messages, config }) => {
|
|
374
|
-
const response = messages[messages.length - 1]?.content?.toLowerCase() || "";
|
|
375
|
-
const keywords = config?.keywords || [];
|
|
376
|
-
|
|
377
|
-
if (keywords.length === 0) return 100;
|
|
378
|
-
|
|
379
|
-
const found = keywords.filter(kw => response.includes(kw.toLowerCase()));
|
|
380
|
-
return (found.length / keywords.length) * 100;
|
|
381
|
-
},
|
|
382
|
-
config: [
|
|
383
|
-
{
|
|
384
|
-
name: "keywords",
|
|
385
|
-
description: "Array of required keywords"
|
|
386
|
-
}
|
|
387
|
-
]
|
|
388
|
-
});
|
|
389
|
-
|
|
390
|
-
const score = await keywordEval.run(
|
|
391
|
-
agent,
|
|
392
|
-
backend,
|
|
393
|
-
testCase,
|
|
394
|
-
messages,
|
|
395
|
-
{ keywords: ["weather", "temperature"] }
|
|
396
|
-
);
|
|
397
|
-
```
|
|
398
|
-
|
|
399
|
-
### LLM-as-judge
|
|
400
|
-
|
|
401
|
-
```typescript
|
|
402
|
-
const llmJudge = new ExuluEval({
|
|
403
|
-
id: "llm_judge",
|
|
404
|
-
name: "LLM Quality Judge",
|
|
405
|
-
description: "Uses LLM to evaluate response quality",
|
|
406
|
-
llm: true,
|
|
407
|
-
execute: async ({ backend, messages, testCase, config }) => {
|
|
408
|
-
const response = messages[messages.length - 1]?.content || "";
|
|
409
|
-
|
|
410
|
-
const judgePrompt = `
|
|
411
|
-
Rate this response on a scale of 0-100.
|
|
412
|
-
|
|
413
|
-
Expected: ${testCase.expected_output}
|
|
414
|
-
Actual: ${response}
|
|
415
|
-
|
|
416
|
-
Respond with ONLY a number 0-100.
|
|
417
|
-
`.trim();
|
|
418
|
-
|
|
419
|
-
const result = await backend.generateSync({
|
|
420
|
-
prompt: judgePrompt,
|
|
421
|
-
agentInstance: await loadAgent(config?.judgeAgentId),
|
|
422
|
-
statistics: { label: "eval", trigger: "llm_judge" }
|
|
423
|
-
});
|
|
424
|
-
|
|
425
|
-
const score = parseInt(result.text.trim());
|
|
426
|
-
return isNaN(score) ? 0 : Math.max(0, Math.min(100, score));
|
|
427
|
-
},
|
|
428
|
-
config: [
|
|
429
|
-
{
|
|
430
|
-
name: "judgeAgentId",
|
|
431
|
-
description: "Agent to use for judging"
|
|
432
|
-
}
|
|
433
|
-
]
|
|
434
|
-
});
|
|
435
|
-
|
|
436
|
-
const score = await llmJudge.run(
|
|
437
|
-
agent,
|
|
438
|
-
backend,
|
|
439
|
-
testCase,
|
|
440
|
-
messages,
|
|
441
|
-
{ judgeAgentId: "claude_opus_judge" }
|
|
442
|
-
);
|
|
443
|
-
```
|
|
444
|
-
|
|
445
|
-
### Tool usage evaluation
|
|
446
|
-
|
|
447
|
-
```typescript
|
|
448
|
-
const toolUsageEval = new ExuluEval({
|
|
449
|
-
id: "tool_usage",
|
|
450
|
-
name: "Tool Usage Check",
|
|
451
|
-
description: "Verifies correct tools were used",
|
|
452
|
-
llm: false,
|
|
453
|
-
execute: async ({ messages, testCase }) => {
|
|
454
|
-
const toolCalls = messages
|
|
455
|
-
.flatMap(msg => msg.toolInvocations || [])
|
|
456
|
-
.map(inv => inv.toolName);
|
|
457
|
-
|
|
458
|
-
const expectedTools = testCase.expected_tools || [];
|
|
459
|
-
|
|
460
|
-
if (expectedTools.length === 0) {
|
|
461
|
-
return toolCalls.length === 0 ? 100 : 0;
|
|
462
|
-
}
|
|
463
|
-
|
|
464
|
-
const usedExpected = expectedTools.filter(tool =>
|
|
465
|
-
toolCalls.includes(tool)
|
|
466
|
-
);
|
|
467
|
-
|
|
468
|
-
return (usedExpected.length / expectedTools.length) * 100;
|
|
469
|
-
}
|
|
470
|
-
});
|
|
471
|
-
|
|
472
|
-
const score = await toolUsageEval.run(agent, backend, testCase, messages);
|
|
473
|
-
```
|
|
474
|
-
|
|
475
|
-
### Batch evaluation
|
|
476
|
-
|
|
477
|
-
```typescript
|
|
478
|
-
async function runAllEvaluations(
|
|
479
|
-
agent: Agent,
|
|
480
|
-
backend: ExuluAgent,
|
|
481
|
-
testCases: TestCase[],
|
|
482
|
-
evaluations: ExuluEval[]
|
|
483
|
-
) {
|
|
484
|
-
const results = [];
|
|
485
|
-
|
|
486
|
-
for (const testCase of testCases) {
|
|
487
|
-
// Generate response
|
|
488
|
-
const response = await backend.generateSync({
|
|
489
|
-
prompt: testCase.inputs[testCase.inputs.length - 1].content,
|
|
490
|
-
agentInstance: await loadAgent(agent.id),
|
|
491
|
-
statistics: { label: "eval", trigger: "test" }
|
|
492
|
-
});
|
|
493
|
-
|
|
494
|
-
const messages = [
|
|
495
|
-
...testCase.inputs,
|
|
496
|
-
{ role: "assistant", content: response.text }
|
|
497
|
-
];
|
|
498
|
-
|
|
499
|
-
// Run all evaluations
|
|
500
|
-
for (const evaluation of evaluations) {
|
|
501
|
-
const score = await evaluation.run(agent, backend, testCase, messages);
|
|
502
|
-
|
|
503
|
-
results.push({
|
|
504
|
-
testCaseId: testCase.id,
|
|
505
|
-
testCaseName: testCase.name,
|
|
506
|
-
evaluationId: evaluation.id,
|
|
507
|
-
evaluationName: evaluation.name,
|
|
508
|
-
score
|
|
509
|
-
});
|
|
510
|
-
}
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
return results;
|
|
514
|
-
}
|
|
515
|
-
|
|
516
|
-
// Use
|
|
517
|
-
const results = await runAllEvaluations(
|
|
518
|
-
agent,
|
|
519
|
-
backend,
|
|
520
|
-
testCases,
|
|
521
|
-
[exactMatch, keywordEval, toolUsageEval]
|
|
522
|
-
);
|
|
523
|
-
|
|
524
|
-
console.log("Results:", results);
|
|
525
|
-
```
|
|
526
|
-
|
|
527
|
-
### Evaluation suite
|
|
528
|
-
|
|
529
|
-
```typescript
|
|
530
|
-
import { ExuluEval } from "@exulu/backend";
|
|
531
|
-
|
|
532
|
-
class EvaluationSuite {
|
|
533
|
-
private evaluations: ExuluEval[] = [];
|
|
534
|
-
|
|
535
|
-
add(evaluation: ExuluEval) {
|
|
536
|
-
this.evaluations.push(evaluation);
|
|
537
|
-
}
|
|
538
|
-
|
|
539
|
-
async runAll(
|
|
540
|
-
agent: Agent,
|
|
541
|
-
backend: ExuluAgent,
|
|
542
|
-
testCase: TestCase,
|
|
543
|
-
messages: UIMessage[],
|
|
544
|
-
config?: Record<string, any>
|
|
545
|
-
) {
|
|
546
|
-
const results = await Promise.all(
|
|
547
|
-
this.evaluations.map(async (eval) => ({
|
|
548
|
-
id: eval.id,
|
|
549
|
-
name: eval.name,
|
|
550
|
-
score: await eval.run(agent, backend, testCase, messages, config)
|
|
551
|
-
}))
|
|
552
|
-
);
|
|
553
|
-
|
|
554
|
-
return {
|
|
555
|
-
testCase: testCase.name,
|
|
556
|
-
evaluations: results,
|
|
557
|
-
average: results.reduce((sum, r) => sum + r.score, 0) / results.length,
|
|
558
|
-
passed: results.every(r => r.score >= (config?.threshold || 80))
|
|
559
|
-
};
|
|
560
|
-
}
|
|
561
|
-
}
|
|
562
|
-
|
|
563
|
-
// Use
|
|
564
|
-
const suite = new EvaluationSuite();
|
|
565
|
-
suite.add(exactMatch);
|
|
566
|
-
suite.add(keywordEval);
|
|
567
|
-
suite.add(toolUsageEval);
|
|
568
|
-
|
|
569
|
-
const result = await suite.runAll(agent, backend, testCase, messages);
|
|
570
|
-
console.log("Suite result:", result);
|
|
571
|
-
```
|
|
572
|
-
|
|
573
|
-
### Composite evaluation
|
|
574
|
-
|
|
575
|
-
```typescript
|
|
576
|
-
const compositeEval = new ExuluEval({
|
|
577
|
-
id: "composite",
|
|
578
|
-
name: "Composite Evaluation",
|
|
579
|
-
description: "Combines multiple criteria with weights",
|
|
580
|
-
llm: false,
|
|
581
|
-
execute: async ({ messages, testCase }) => {
|
|
582
|
-
const response = messages[messages.length - 1]?.content || "";
|
|
583
|
-
let totalScore = 0;
|
|
584
|
-
|
|
585
|
-
// Accuracy (50%)
|
|
586
|
-
const containsExpected = response.includes(testCase.expected_output);
|
|
587
|
-
totalScore += containsExpected ? 50 : 0;
|
|
588
|
-
|
|
589
|
-
// Length (20%)
|
|
590
|
-
const isReasonableLength = response.length >= 50 && response.length <= 500;
|
|
591
|
-
totalScore += isReasonableLength ? 20 : 0;
|
|
592
|
-
|
|
593
|
-
// Tool usage (30%)
|
|
594
|
-
const toolCalls = messages.flatMap(msg => msg.toolInvocations || []);
|
|
595
|
-
const expectedTools = testCase.expected_tools || [];
|
|
596
|
-
if (expectedTools.length > 0) {
|
|
597
|
-
const toolsUsed = expectedTools.every(tool =>
|
|
598
|
-
toolCalls.some(call => call.toolName === tool)
|
|
599
|
-
);
|
|
600
|
-
totalScore += toolsUsed ? 30 : 0;
|
|
601
|
-
} else {
|
|
602
|
-
totalScore += 30;
|
|
603
|
-
}
|
|
604
|
-
|
|
605
|
-
return totalScore;
|
|
606
|
-
}
|
|
607
|
-
});
|
|
608
|
-
```
|
|
609
|
-
|
|
610
|
-
### Error handling
|
|
611
|
-
|
|
612
|
-
```typescript
|
|
613
|
-
const safeEval = new ExuluEval({
|
|
614
|
-
id: "safe_eval",
|
|
615
|
-
name: "Safe Evaluation",
|
|
616
|
-
description: "Evaluation with comprehensive error handling",
|
|
617
|
-
llm: false,
|
|
618
|
-
execute: async ({ messages, testCase, config }) => {
|
|
619
|
-
try {
|
|
620
|
-
const response = messages[messages.length - 1]?.content;
|
|
621
|
-
|
|
622
|
-
if (!response) {
|
|
623
|
-
console.warn("No response content found");
|
|
624
|
-
return 0;
|
|
625
|
-
}
|
|
626
|
-
|
|
627
|
-
// Your evaluation logic
|
|
628
|
-
const score = computeScore(response, testCase.expected_output);
|
|
629
|
-
|
|
630
|
-
// Validate score range
|
|
631
|
-
if (score < 0 || score > 100) {
|
|
632
|
-
throw new Error(`Score out of range: ${score}`);
|
|
633
|
-
}
|
|
634
|
-
|
|
635
|
-
return score;
|
|
636
|
-
} catch (error) {
|
|
637
|
-
console.error(`Evaluation error: ${error.message}`);
|
|
638
|
-
throw error; // Re-throw for ExuluEval to handle
|
|
639
|
-
}
|
|
640
|
-
}
|
|
641
|
-
});
|
|
642
|
-
|
|
643
|
-
// Run with error handling
|
|
644
|
-
try {
|
|
645
|
-
const score = await safeEval.run(agent, backend, testCase, messages);
|
|
646
|
-
console.log(`Score: ${score}`);
|
|
647
|
-
} catch (error) {
|
|
648
|
-
console.error("Evaluation failed:", error.message);
|
|
649
|
-
// Handle failure (log, alert, retry, etc.)
|
|
650
|
-
}
|
|
651
|
-
```
|
|
652
|
-
|
|
653
|
-
## Integration patterns
|
|
654
|
-
|
|
655
|
-
### With test management system
|
|
656
|
-
|
|
657
|
-
```typescript
|
|
658
|
-
interface EvaluationResult {
|
|
659
|
-
evaluationId: string;
|
|
660
|
-
testCaseId: string;
|
|
661
|
-
score: number;
|
|
662
|
-
timestamp: string;
|
|
663
|
-
agentId: string;
|
|
664
|
-
passed: boolean;
|
|
665
|
-
}
|
|
666
|
-
|
|
667
|
-
async function runAndStoreEvaluation(
|
|
668
|
-
evaluation: ExuluEval,
|
|
669
|
-
agent: Agent,
|
|
670
|
-
backend: ExuluAgent,
|
|
671
|
-
testCase: TestCase,
|
|
672
|
-
messages: UIMessage[],
|
|
673
|
-
threshold: number = 80
|
|
674
|
-
): Promise<EvaluationResult> {
|
|
675
|
-
const score = await evaluation.run(agent, backend, testCase, messages);
|
|
676
|
-
|
|
677
|
-
const result: EvaluationResult = {
|
|
678
|
-
evaluationId: evaluation.id,
|
|
679
|
-
testCaseId: testCase.id,
|
|
680
|
-
score,
|
|
681
|
-
timestamp: new Date().toISOString(),
|
|
682
|
-
agentId: agent.id,
|
|
683
|
-
passed: score >= threshold
|
|
684
|
-
};
|
|
685
|
-
|
|
686
|
-
// Store in database
|
|
687
|
-
const { db } = await postgresClient();
|
|
688
|
-
await db.into("evaluation_results").insert(result);
|
|
689
|
-
|
|
690
|
-
return result;
|
|
691
|
-
}
|
|
692
|
-
```
|
|
693
|
-
|
|
694
|
-
### CI/CD integration
|
|
695
|
-
|
|
696
|
-
```typescript
|
|
697
|
-
async function runCIPipeline(
|
|
698
|
-
agent: Agent,
|
|
699
|
-
backend: ExuluAgent,
|
|
700
|
-
testCases: TestCase[],
|
|
701
|
-
evaluations: ExuluEval[],
|
|
702
|
-
minPassRate: number = 0.8
|
|
703
|
-
) {
|
|
704
|
-
const results = [];
|
|
705
|
-
|
|
706
|
-
for (const testCase of testCases) {
|
|
707
|
-
const response = await backend.generateSync({
|
|
708
|
-
prompt: testCase.inputs[testCase.inputs.length - 1].content,
|
|
709
|
-
agentInstance: await loadAgent(agent.id),
|
|
710
|
-
statistics: { label: "ci", trigger: "test" }
|
|
711
|
-
});
|
|
712
|
-
|
|
713
|
-
const messages = [
|
|
714
|
-
...testCase.inputs,
|
|
715
|
-
{ role: "assistant", content: response.text }
|
|
716
|
-
];
|
|
717
|
-
|
|
718
|
-
for (const evaluation of evaluations) {
|
|
719
|
-
const score = await evaluation.run(agent, backend, testCase, messages);
|
|
720
|
-
results.push({ testCase: testCase.name, eval: evaluation.name, score });
|
|
721
|
-
}
|
|
722
|
-
}
|
|
723
|
-
|
|
724
|
-
const averageScore = results.reduce((sum, r) => sum + r.score, 0) / results.length;
|
|
725
|
-
const passRate = results.filter(r => r.score >= 80).length / results.length;
|
|
726
|
-
|
|
727
|
-
if (passRate < minPassRate) {
|
|
728
|
-
throw new Error(
|
|
729
|
-
`CI failed: Pass rate ${passRate.toFixed(2)} below minimum ${minPassRate}. ` +
|
|
730
|
-
`Average score: ${averageScore.toFixed(2)}/100`
|
|
731
|
-
);
|
|
732
|
-
}
|
|
733
|
-
|
|
734
|
-
console.log(`✓ CI passed: ${passRate.toFixed(2)} pass rate, ${averageScore.toFixed(2)} avg score`);
|
|
735
|
-
return { averageScore, passRate, results };
|
|
736
|
-
}
|
|
737
|
-
```
|
|
738
|
-
|
|
739
|
-
## Best practices
|
|
740
|
-
|
|
741
|
-
<Tip>
|
|
742
|
-
**Validate inputs**: Check that messages and testCase have expected structure before running evaluation logic.
|
|
743
|
-
</Tip>
|
|
744
|
-
|
|
745
|
-
<Note>
|
|
746
|
-
**Score range**: Always ensure your execute function returns a value between 0 and 100, inclusive.
|
|
747
|
-
</Note>
|
|
748
|
-
|
|
749
|
-
<Warning>
|
|
750
|
-
**LLM consistency**: LLM judges can be inconsistent. Use temperature=0 for more deterministic scoring.
|
|
751
|
-
</Warning>
|
|
752
|
-
|
|
753
|
-
<Info>
|
|
754
|
-
**Multiple evaluations**: Use multiple evaluation functions to assess different aspects (accuracy, style, tool usage).
|
|
755
|
-
</Info>
|
|
756
|
-
|
|
757
|
-
## Next steps
|
|
758
|
-
|
|
759
|
-
<CardGroup cols={2}>
|
|
760
|
-
<Card title="Configuration guide" icon="gear" href="/core/exulu-eval/configuration">
|
|
761
|
-
Learn about evaluation configuration
|
|
762
|
-
</Card>
|
|
763
|
-
<Card title="Overview" icon="book" href="/core/exulu-eval/introduction">
|
|
764
|
-
Understand evaluation concepts
|
|
765
|
-
</Card>
|
|
766
|
-
<Card title="ExuluAgent" icon="robot" href="/core/exulu-agent/introduction">
|
|
767
|
-
Create agents to evaluate
|
|
768
|
-
</Card>
|
|
769
|
-
<Card title="ExuluQueues" icon="layer-group" href="/core/exulu-queues/introduction">
|
|
770
|
-
Run evaluations as background jobs
|
|
771
|
-
</Card>
|
|
772
|
-
</CardGroup>
|