@exulu/backend 1.48.2 → 1.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. package/dist/index.cjs +351 -42
  2. package/dist/index.d.cts +96 -1
  3. package/dist/index.d.ts +96 -1
  4. package/dist/index.js +340 -38
  5. package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
  6. package/ee/python/README.md +295 -0
  7. package/ee/python/documents/processing/README.md +155 -0
  8. package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
  9. package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
  10. package/ee/python/setup.sh +180 -0
  11. package/package.json +14 -3
  12. package/scripts/postinstall.cjs +149 -0
  13. package/.agents/skills/mintlify/SKILL.md +0 -347
  14. package/.editorconfig +0 -15
  15. package/.eslintrc.json +0 -52
  16. package/.github/workflows/release-backend.yml +0 -38
  17. package/.husky/commit-msg +0 -1
  18. package/.jscpd.json +0 -18
  19. package/.mcp.json +0 -25
  20. package/.nvmrc +0 -1
  21. package/.prettierignore +0 -5
  22. package/.prettierrc.json +0 -12
  23. package/CHANGELOG.md +0 -8
  24. package/SECURITY.md +0 -5
  25. package/commitlint.config.js +0 -4
  26. package/devops/documentation/patch-older-releases.md +0 -42
  27. package/ee/documents/processing/build_pdf_processor.sh +0 -35
  28. package/ee/documents/processing/chunk_markdown.py +0 -263
  29. package/ee/documents/processing/pdf_processor.spec +0 -115
  30. package/eslint.config.js +0 -88
  31. package/jest.config.ts +0 -25
  32. package/mintlify-docs/.mintignore +0 -7
  33. package/mintlify-docs/AGENTS.md +0 -33
  34. package/mintlify-docs/CLAUDE.MD +0 -50
  35. package/mintlify-docs/CONTRIBUTING.md +0 -32
  36. package/mintlify-docs/LICENSE +0 -21
  37. package/mintlify-docs/README.md +0 -55
  38. package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
  39. package/mintlify-docs/ai-tools/cursor.mdx +0 -39
  40. package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
  41. package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
  42. package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
  43. package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
  44. package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
  45. package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
  46. package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
  47. package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
  48. package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
  49. package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
  50. package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
  51. package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
  52. package/mintlify-docs/api-reference/core-types.mdx +0 -585
  53. package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
  54. package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
  55. package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
  56. package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
  57. package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
  58. package/mintlify-docs/api-reference/introduction.mdx +0 -661
  59. package/mintlify-docs/api-reference/mutations.mdx +0 -1012
  60. package/mintlify-docs/api-reference/openapi.json +0 -217
  61. package/mintlify-docs/api-reference/queries.mdx +0 -1154
  62. package/mintlify-docs/backend/introduction.mdx +0 -218
  63. package/mintlify-docs/changelog.mdx +0 -387
  64. package/mintlify-docs/community-edition.mdx +0 -304
  65. package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
  66. package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
  67. package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
  68. package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
  69. package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
  70. package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
  71. package/mintlify-docs/core/exulu-authentication.mdx +0 -810
  72. package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
  73. package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
  74. package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
  75. package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
  76. package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
  77. package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
  78. package/mintlify-docs/core/exulu-database.mdx +0 -811
  79. package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
  80. package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
  81. package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
  82. package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
  83. package/mintlify-docs/core/exulu-logging.mdx +0 -464
  84. package/mintlify-docs/core/exulu-otel.mdx +0 -670
  85. package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
  86. package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
  87. package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
  88. package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
  89. package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
  90. package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
  91. package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
  92. package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
  93. package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
  94. package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
  95. package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
  96. package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
  97. package/mintlify-docs/development.mdx +0 -94
  98. package/mintlify-docs/docs.json +0 -248
  99. package/mintlify-docs/enterprise-edition.mdx +0 -538
  100. package/mintlify-docs/essentials/code.mdx +0 -35
  101. package/mintlify-docs/essentials/images.mdx +0 -59
  102. package/mintlify-docs/essentials/markdown.mdx +0 -88
  103. package/mintlify-docs/essentials/navigation.mdx +0 -87
  104. package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
  105. package/mintlify-docs/essentials/settings.mdx +0 -318
  106. package/mintlify-docs/favicon.svg +0 -3
  107. package/mintlify-docs/frontend/introduction.mdx +0 -39
  108. package/mintlify-docs/getting-started.mdx +0 -267
  109. package/mintlify-docs/guides/custom-agent.mdx +0 -608
  110. package/mintlify-docs/guides/first-agent.mdx +0 -315
  111. package/mintlify-docs/images/admin_ui.png +0 -0
  112. package/mintlify-docs/images/contexts.png +0 -0
  113. package/mintlify-docs/images/create_agents.png +0 -0
  114. package/mintlify-docs/images/evals.png +0 -0
  115. package/mintlify-docs/images/graphql.png +0 -0
  116. package/mintlify-docs/images/graphql_api.png +0 -0
  117. package/mintlify-docs/images/hero-dark.png +0 -0
  118. package/mintlify-docs/images/hero-light.png +0 -0
  119. package/mintlify-docs/images/hero.png +0 -0
  120. package/mintlify-docs/images/knowledge_sources.png +0 -0
  121. package/mintlify-docs/images/mcp.png +0 -0
  122. package/mintlify-docs/images/scaling.png +0 -0
  123. package/mintlify-docs/index.mdx +0 -411
  124. package/mintlify-docs/logo/dark.svg +0 -9
  125. package/mintlify-docs/logo/light.svg +0 -9
  126. package/mintlify-docs/partners.mdx +0 -558
  127. package/mintlify-docs/products.mdx +0 -77
  128. package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
  129. package/mintlify-docs/styles.css +0 -207
  130. package/ngrok.bash +0 -1
  131. package/ngrok.md +0 -6
  132. package/ngrok.yml +0 -10
  133. package/release.config.cjs +0 -15
  134. package/skills-lock.json +0 -10
  135. package/types/context-processor.ts +0 -45
  136. package/types/enums/eval-types.ts +0 -5
  137. package/types/enums/field-types.ts +0 -1
  138. package/types/enums/jobs.ts +0 -11
  139. package/types/enums/statistics.ts +0 -13
  140. package/types/exulu-table-definition.ts +0 -79
  141. package/types/file-types.ts +0 -18
  142. package/types/models/agent-session.ts +0 -27
  143. package/types/models/agent.ts +0 -68
  144. package/types/models/context.ts +0 -53
  145. package/types/models/embedding.ts +0 -17
  146. package/types/models/eval-run.ts +0 -40
  147. package/types/models/exulu-agent-tool-config.ts +0 -11
  148. package/types/models/item.ts +0 -21
  149. package/types/models/job.ts +0 -8
  150. package/types/models/project.ts +0 -16
  151. package/types/models/rate-limiter-rules.ts +0 -7
  152. package/types/models/test-case.ts +0 -25
  153. package/types/models/tool.ts +0 -9
  154. package/types/models/user-role.ts +0 -12
  155. package/types/models/user.ts +0 -20
  156. package/types/models/variable.ts +0 -8
  157. package/types/models/vector-methods.ts +0 -7
  158. package/types/provider-config.ts +0 -21
  159. package/types/queue-config.ts +0 -16
  160. package/types/rbac-rights-modes.ts +0 -1
  161. package/types/statistics.ts +0 -20
  162. package/types/workflow.ts +0 -31
  163. /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
  164. /package/ee/{documents/processing → python}/requirements.txt +0 -0
@@ -1,772 +0,0 @@
1
- ---
2
- title: "API reference"
3
- description: "Complete method and property reference for ExuluEval"
4
- ---
5
-
6
- ## ExuluEval class
7
-
8
- ```typescript
9
- class ExuluEval {
10
- public id: string;
11
- public name: string;
12
- public description: string;
13
- public llm: boolean;
14
- public config?: { name: string; description: string }[];
15
- public queue?: Promise<ExuluQueueConfig>;
16
-
17
- constructor(params: ExuluEvalParams);
18
- async run(
19
- agent: Agent,
20
- backend: ExuluAgent,
21
- testCase: TestCase,
22
- messages: UIMessage[],
23
- config?: Record<string, any>
24
- ): Promise<number>;
25
- }
26
- ```
27
-
28
- ## Constructor
29
-
30
- Creates a new evaluation function instance.
31
-
32
- ```typescript
33
- new ExuluEval(params: ExuluEvalParams)
34
- ```
35
-
36
- ### Parameters
37
-
38
- <ParamField path="params" type="ExuluEvalParams" required>
39
- Configuration object for the evaluation function
40
-
41
- ```typescript
42
- interface ExuluEvalParams {
43
- id: string;
44
- name: string;
45
- description: string;
46
- llm: boolean;
47
- execute: (params: ExecuteParams) => Promise<number>;
48
- config?: { name: string; description: string }[];
49
- queue?: Promise<ExuluQueueConfig>;
50
- }
51
- ```
52
- </ParamField>
53
-
54
- <ParamField path="params.id" type="string" required>
55
- Unique identifier for this evaluation function
56
- </ParamField>
57
-
58
- <ParamField path="params.name" type="string" required>
59
- Human-readable name
60
- </ParamField>
61
-
62
- <ParamField path="params.description" type="string" required>
63
- Description of what this evaluation measures
64
- </ParamField>
65
-
66
- <ParamField path="params.llm" type="boolean" required>
67
- Whether this evaluation uses an LLM (LLM-as-judge)
68
- </ParamField>
69
-
70
- <ParamField path="params.execute" type="function" required>
71
- Function that performs the evaluation
72
- ```typescript
73
- async (params: {
74
- agent: Agent;
75
- backend: ExuluAgent;
76
- messages: UIMessage[];
77
- testCase: TestCase;
78
- config?: Record<string, any>;
79
- }) => Promise<number>
80
- ```
81
- Must return a score between 0 and 100
82
- </ParamField>
83
-
84
- <ParamField path="params.config" type="array">
85
- Optional configuration schema
86
- ```typescript
87
- {
88
- name: string; // Config parameter name
89
- description: string; // What this parameter does
90
- }[]
91
- ```
92
- </ParamField>
93
-
94
- <ParamField path="params.queue" type="Promise<ExuluQueueConfig>">
95
- Optional queue configuration for background execution
96
- </ParamField>
97
-
98
- ### Example
99
-
100
- ```typescript
101
- import { ExuluEval } from "@exulu/backend";
102
-
103
- const eval = new ExuluEval({
104
- id: "exact_match",
105
- name: "Exact Match",
106
- description: "Checks if response exactly matches expected output",
107
- llm: false,
108
- execute: async ({ messages, testCase }) => {
109
- const response = messages[messages.length - 1]?.content || "";
110
- return response === testCase.expected_output ? 100 : 0;
111
- }
112
- });
113
- ```
114
-
115
- ## Properties
116
-
117
- ### id
118
-
119
- <ResponseField name="id" type="string">
120
- Unique identifier for this evaluation function
121
- </ResponseField>
122
-
123
- ```typescript
124
- const evalId = eval.id; // "exact_match"
125
- ```
126
-
127
- ### name
128
-
129
- <ResponseField name="name" type="string">
130
- Human-readable name for the evaluation
131
- </ResponseField>
132
-
133
- ```typescript
134
- const evalName = eval.name; // "Exact Match"
135
- ```
136
-
137
- ### description
138
-
139
- <ResponseField name="description" type="string">
140
- Description of what this evaluation measures
141
- </ResponseField>
142
-
143
- ```typescript
144
- const evalDesc = eval.description; // "Checks if response exactly matches expected output"
145
- ```
146
-
147
- ### llm
148
-
149
- <ResponseField name="llm" type="boolean">
150
- Whether this evaluation uses an LLM for scoring
151
- </ResponseField>
152
-
153
- ```typescript
154
- const usesLLM = eval.llm; // false
155
- ```
156
-
157
- ### config
158
-
159
- <ResponseField name="config" type="array | undefined">
160
- Configuration schema defining runtime parameters
161
-
162
- ```typescript
163
- {
164
- name: string;
165
- description: string;
166
- }[]
167
- ```
168
- </ResponseField>
169
-
170
- ```typescript
171
- const configSchema = eval.config;
172
- // [{ name: "threshold", description: "Minimum score threshold" }]
173
- ```
174
-
175
- ### queue
176
-
177
- <ResponseField name="queue" type="Promise<ExuluQueueConfig> | undefined">
178
- Queue configuration for background execution
179
- </ResponseField>
180
-
181
- ```typescript
182
- const queueConfig = await eval.queue;
183
- ```
184
-
185
- ## Methods
186
-
187
- ### run()
188
-
189
- Executes the evaluation function and returns a score.
190
-
191
- ```typescript
192
- async run(
193
- agent: Agent,
194
- backend: ExuluAgent,
195
- testCase: TestCase,
196
- messages: UIMessage[],
197
- config?: Record<string, any>
198
- ): Promise<number>
199
- ```
200
-
201
- <ParamField path="agent" type="Agent" required>
202
- Agent database record being evaluated
203
- ```typescript
204
- interface Agent {
205
- id: string;
206
- name: string;
207
- description: string;
208
- // ... other properties
209
- }
210
- ```
211
- </ParamField>
212
-
213
- <ParamField path="backend" type="ExuluAgent" required>
214
- ExuluAgent instance for generating responses or using LLM-as-judge
215
- </ParamField>
216
-
217
- <ParamField path="testCase" type="TestCase" required>
218
- Test case with inputs and expected outputs
219
- ```typescript
220
- interface TestCase {
221
- id: string;
222
- name: string;
223
- description?: string;
224
- inputs: UIMessage[];
225
- expected_output: string;
226
- expected_tools?: string[];
227
- expected_knowledge_sources?: string[];
228
- expected_agent_tools?: string[];
229
- createdAt: string;
230
- updatedAt: string;
231
- }
232
- ```
233
- </ParamField>
234
-
235
- <ParamField path="messages" type="UIMessage[]" required>
236
- Conversation messages including inputs and agent responses
237
- ```typescript
238
- interface UIMessage {
239
- role: "user" | "assistant" | "system";
240
- content: string;
241
- toolInvocations?: ToolInvocation[];
242
- }
243
- ```
244
- </ParamField>
245
-
246
- <ParamField path="config" type="Record<string, any>">
247
- Optional runtime configuration values
248
- </ParamField>
249
-
250
- <ResponseField name="return" type="Promise<number>">
251
- Score from 0 to 100
252
- </ResponseField>
253
-
254
- **Example:**
255
-
256
- ```typescript
257
- const score = await eval.run(
258
- agent,
259
- backend,
260
- testCase,
261
- messages,
262
- { threshold: 80 }
263
- );
264
-
265
- console.log(`Score: ${score}/100`);
266
- ```
267
-
268
- **Error handling:**
269
-
270
- ```typescript
271
- try {
272
- const score = await eval.run(agent, backend, testCase, messages);
273
- console.log(`Score: ${score}`);
274
- } catch (error) {
275
- console.error("Evaluation failed:", error.message);
276
- // Error: Eval function must return a score between 0 and 100, got 150
277
- }
278
- ```
279
-
280
- **Throws:**
281
- - Error if the `execute` function returns a score < 0 or > 100
282
- - Error if the `execute` function throws an error
283
-
284
- ## Type definitions
285
-
286
- ### ExuluEvalParams
287
-
288
- ```typescript
289
- interface ExuluEvalParams {
290
- id: string;
291
- name: string;
292
- description: string;
293
- llm: boolean;
294
- execute: (params: {
295
- agent: Agent;
296
- backend: ExuluAgent;
297
- messages: UIMessage[];
298
- testCase: TestCase;
299
- config?: Record<string, any>;
300
- }) => Promise<number>;
301
- config?: {
302
- name: string;
303
- description: string;
304
- }[];
305
- queue?: Promise<ExuluQueueConfig>;
306
- }
307
- ```
308
-
309
- ### TestCase
310
-
311
- ```typescript
312
- interface TestCase {
313
- id: string;
314
- name: string;
315
- description?: string;
316
- inputs: UIMessage[]; // Input messages
317
- expected_output: string; // Expected response
318
- expected_tools?: string[]; // Expected tool names
319
- expected_knowledge_sources?: string[]; // Expected context IDs
320
- expected_agent_tools?: string[]; // Expected agent tool IDs
321
- createdAt: string;
322
- updatedAt: string;
323
- }
324
- ```
325
-
326
- ### UIMessage
327
-
328
- ```typescript
329
- interface UIMessage {
330
- role: "user" | "assistant" | "system";
331
- content: string;
332
- toolInvocations?: ToolInvocation[];
333
- }
334
-
335
- interface ToolInvocation {
336
- toolName: string;
337
- toolCallId: string;
338
- args: Record<string, any>;
339
- result?: any;
340
- }
341
- ```
342
-
343
- ## Usage examples
344
-
345
- ### Basic exact match
346
-
347
- ```typescript
348
- import { ExuluEval } from "@exulu/backend";
349
-
350
- const exactMatch = new ExuluEval({
351
- id: "exact_match",
352
- name: "Exact Match",
353
- description: "100 if exact match, 0 otherwise",
354
- llm: false,
355
- execute: async ({ messages, testCase }) => {
356
- const response = messages[messages.length - 1]?.content || "";
357
- return response === testCase.expected_output ? 100 : 0;
358
- }
359
- });
360
-
361
- const score = await exactMatch.run(agent, backend, testCase, messages);
362
- console.log(`Score: ${score}/100`);
363
- ```
364
-
365
- ### Keyword evaluation with config
366
-
367
- ```typescript
368
- const keywordEval = new ExuluEval({
369
- id: "keyword_check",
370
- name: "Keyword Check",
371
- description: "Checks for presence of keywords",
372
- llm: false,
373
- execute: async ({ messages, config }) => {
374
- const response = messages[messages.length - 1]?.content?.toLowerCase() || "";
375
- const keywords = config?.keywords || [];
376
-
377
- if (keywords.length === 0) return 100;
378
-
379
- const found = keywords.filter(kw => response.includes(kw.toLowerCase()));
380
- return (found.length / keywords.length) * 100;
381
- },
382
- config: [
383
- {
384
- name: "keywords",
385
- description: "Array of required keywords"
386
- }
387
- ]
388
- });
389
-
390
- const score = await keywordEval.run(
391
- agent,
392
- backend,
393
- testCase,
394
- messages,
395
- { keywords: ["weather", "temperature"] }
396
- );
397
- ```
398
-
399
- ### LLM-as-judge
400
-
401
- ```typescript
402
- const llmJudge = new ExuluEval({
403
- id: "llm_judge",
404
- name: "LLM Quality Judge",
405
- description: "Uses LLM to evaluate response quality",
406
- llm: true,
407
- execute: async ({ backend, messages, testCase, config }) => {
408
- const response = messages[messages.length - 1]?.content || "";
409
-
410
- const judgePrompt = `
411
- Rate this response on a scale of 0-100.
412
-
413
- Expected: ${testCase.expected_output}
414
- Actual: ${response}
415
-
416
- Respond with ONLY a number 0-100.
417
- `.trim();
418
-
419
- const result = await backend.generateSync({
420
- prompt: judgePrompt,
421
- agentInstance: await loadAgent(config?.judgeAgentId),
422
- statistics: { label: "eval", trigger: "llm_judge" }
423
- });
424
-
425
- const score = parseInt(result.text.trim());
426
- return isNaN(score) ? 0 : Math.max(0, Math.min(100, score));
427
- },
428
- config: [
429
- {
430
- name: "judgeAgentId",
431
- description: "Agent to use for judging"
432
- }
433
- ]
434
- });
435
-
436
- const score = await llmJudge.run(
437
- agent,
438
- backend,
439
- testCase,
440
- messages,
441
- { judgeAgentId: "claude_opus_judge" }
442
- );
443
- ```
444
-
445
- ### Tool usage evaluation
446
-
447
- ```typescript
448
- const toolUsageEval = new ExuluEval({
449
- id: "tool_usage",
450
- name: "Tool Usage Check",
451
- description: "Verifies correct tools were used",
452
- llm: false,
453
- execute: async ({ messages, testCase }) => {
454
- const toolCalls = messages
455
- .flatMap(msg => msg.toolInvocations || [])
456
- .map(inv => inv.toolName);
457
-
458
- const expectedTools = testCase.expected_tools || [];
459
-
460
- if (expectedTools.length === 0) {
461
- return toolCalls.length === 0 ? 100 : 0;
462
- }
463
-
464
- const usedExpected = expectedTools.filter(tool =>
465
- toolCalls.includes(tool)
466
- );
467
-
468
- return (usedExpected.length / expectedTools.length) * 100;
469
- }
470
- });
471
-
472
- const score = await toolUsageEval.run(agent, backend, testCase, messages);
473
- ```
474
-
475
- ### Batch evaluation
476
-
477
- ```typescript
478
- async function runAllEvaluations(
479
- agent: Agent,
480
- backend: ExuluAgent,
481
- testCases: TestCase[],
482
- evaluations: ExuluEval[]
483
- ) {
484
- const results = [];
485
-
486
- for (const testCase of testCases) {
487
- // Generate response
488
- const response = await backend.generateSync({
489
- prompt: testCase.inputs[testCase.inputs.length - 1].content,
490
- agentInstance: await loadAgent(agent.id),
491
- statistics: { label: "eval", trigger: "test" }
492
- });
493
-
494
- const messages = [
495
- ...testCase.inputs,
496
- { role: "assistant", content: response.text }
497
- ];
498
-
499
- // Run all evaluations
500
- for (const evaluation of evaluations) {
501
- const score = await evaluation.run(agent, backend, testCase, messages);
502
-
503
- results.push({
504
- testCaseId: testCase.id,
505
- testCaseName: testCase.name,
506
- evaluationId: evaluation.id,
507
- evaluationName: evaluation.name,
508
- score
509
- });
510
- }
511
- }
512
-
513
- return results;
514
- }
515
-
516
- // Use
517
- const results = await runAllEvaluations(
518
- agent,
519
- backend,
520
- testCases,
521
- [exactMatch, keywordEval, toolUsageEval]
522
- );
523
-
524
- console.log("Results:", results);
525
- ```
526
-
527
- ### Evaluation suite
528
-
529
- ```typescript
530
- import { ExuluEval } from "@exulu/backend";
531
-
532
- class EvaluationSuite {
533
- private evaluations: ExuluEval[] = [];
534
-
535
- add(evaluation: ExuluEval) {
536
- this.evaluations.push(evaluation);
537
- }
538
-
539
- async runAll(
540
- agent: Agent,
541
- backend: ExuluAgent,
542
- testCase: TestCase,
543
- messages: UIMessage[],
544
- config?: Record<string, any>
545
- ) {
546
- const results = await Promise.all(
547
- this.evaluations.map(async (eval) => ({
548
- id: eval.id,
549
- name: eval.name,
550
- score: await eval.run(agent, backend, testCase, messages, config)
551
- }))
552
- );
553
-
554
- return {
555
- testCase: testCase.name,
556
- evaluations: results,
557
- average: results.reduce((sum, r) => sum + r.score, 0) / results.length,
558
- passed: results.every(r => r.score >= (config?.threshold || 80))
559
- };
560
- }
561
- }
562
-
563
- // Use
564
- const suite = new EvaluationSuite();
565
- suite.add(exactMatch);
566
- suite.add(keywordEval);
567
- suite.add(toolUsageEval);
568
-
569
- const result = await suite.runAll(agent, backend, testCase, messages);
570
- console.log("Suite result:", result);
571
- ```
572
-
573
- ### Composite evaluation
574
-
575
- ```typescript
576
- const compositeEval = new ExuluEval({
577
- id: "composite",
578
- name: "Composite Evaluation",
579
- description: "Combines multiple criteria with weights",
580
- llm: false,
581
- execute: async ({ messages, testCase }) => {
582
- const response = messages[messages.length - 1]?.content || "";
583
- let totalScore = 0;
584
-
585
- // Accuracy (50%)
586
- const containsExpected = response.includes(testCase.expected_output);
587
- totalScore += containsExpected ? 50 : 0;
588
-
589
- // Length (20%)
590
- const isReasonableLength = response.length >= 50 && response.length <= 500;
591
- totalScore += isReasonableLength ? 20 : 0;
592
-
593
- // Tool usage (30%)
594
- const toolCalls = messages.flatMap(msg => msg.toolInvocations || []);
595
- const expectedTools = testCase.expected_tools || [];
596
- if (expectedTools.length > 0) {
597
- const toolsUsed = expectedTools.every(tool =>
598
- toolCalls.some(call => call.toolName === tool)
599
- );
600
- totalScore += toolsUsed ? 30 : 0;
601
- } else {
602
- totalScore += 30;
603
- }
604
-
605
- return totalScore;
606
- }
607
- });
608
- ```
609
-
610
- ### Error handling
611
-
612
- ```typescript
613
- const safeEval = new ExuluEval({
614
- id: "safe_eval",
615
- name: "Safe Evaluation",
616
- description: "Evaluation with comprehensive error handling",
617
- llm: false,
618
- execute: async ({ messages, testCase, config }) => {
619
- try {
620
- const response = messages[messages.length - 1]?.content;
621
-
622
- if (!response) {
623
- console.warn("No response content found");
624
- return 0;
625
- }
626
-
627
- // Your evaluation logic
628
- const score = computeScore(response, testCase.expected_output);
629
-
630
- // Validate score range
631
- if (score < 0 || score > 100) {
632
- throw new Error(`Score out of range: ${score}`);
633
- }
634
-
635
- return score;
636
- } catch (error) {
637
- console.error(`Evaluation error: ${error.message}`);
638
- throw error; // Re-throw for ExuluEval to handle
639
- }
640
- }
641
- });
642
-
643
- // Run with error handling
644
- try {
645
- const score = await safeEval.run(agent, backend, testCase, messages);
646
- console.log(`Score: ${score}`);
647
- } catch (error) {
648
- console.error("Evaluation failed:", error.message);
649
- // Handle failure (log, alert, retry, etc.)
650
- }
651
- ```
652
-
653
- ## Integration patterns
654
-
655
- ### With test management system
656
-
657
- ```typescript
658
- interface EvaluationResult {
659
- evaluationId: string;
660
- testCaseId: string;
661
- score: number;
662
- timestamp: string;
663
- agentId: string;
664
- passed: boolean;
665
- }
666
-
667
- async function runAndStoreEvaluation(
668
- evaluation: ExuluEval,
669
- agent: Agent,
670
- backend: ExuluAgent,
671
- testCase: TestCase,
672
- messages: UIMessage[],
673
- threshold: number = 80
674
- ): Promise<EvaluationResult> {
675
- const score = await evaluation.run(agent, backend, testCase, messages);
676
-
677
- const result: EvaluationResult = {
678
- evaluationId: evaluation.id,
679
- testCaseId: testCase.id,
680
- score,
681
- timestamp: new Date().toISOString(),
682
- agentId: agent.id,
683
- passed: score >= threshold
684
- };
685
-
686
- // Store in database
687
- const { db } = await postgresClient();
688
- await db.into("evaluation_results").insert(result);
689
-
690
- return result;
691
- }
692
- ```
693
-
694
- ### CI/CD integration
695
-
696
- ```typescript
697
- async function runCIPipeline(
698
- agent: Agent,
699
- backend: ExuluAgent,
700
- testCases: TestCase[],
701
- evaluations: ExuluEval[],
702
- minPassRate: number = 0.8
703
- ) {
704
- const results = [];
705
-
706
- for (const testCase of testCases) {
707
- const response = await backend.generateSync({
708
- prompt: testCase.inputs[testCase.inputs.length - 1].content,
709
- agentInstance: await loadAgent(agent.id),
710
- statistics: { label: "ci", trigger: "test" }
711
- });
712
-
713
- const messages = [
714
- ...testCase.inputs,
715
- { role: "assistant", content: response.text }
716
- ];
717
-
718
- for (const evaluation of evaluations) {
719
- const score = await evaluation.run(agent, backend, testCase, messages);
720
- results.push({ testCase: testCase.name, eval: evaluation.name, score });
721
- }
722
- }
723
-
724
- const averageScore = results.reduce((sum, r) => sum + r.score, 0) / results.length;
725
- const passRate = results.filter(r => r.score >= 80).length / results.length;
726
-
727
- if (passRate < minPassRate) {
728
- throw new Error(
729
- `CI failed: Pass rate ${passRate.toFixed(2)} below minimum ${minPassRate}. ` +
730
- `Average score: ${averageScore.toFixed(2)}/100`
731
- );
732
- }
733
-
734
- console.log(`✓ CI passed: ${passRate.toFixed(2)} pass rate, ${averageScore.toFixed(2)} avg score`);
735
- return { averageScore, passRate, results };
736
- }
737
- ```
738
-
739
- ## Best practices
740
-
741
- <Tip>
742
- **Validate inputs**: Check that `messages` and `testCase` have the expected structure before running evaluation logic.
743
- </Tip>
744
-
745
- <Note>
746
- **Score range**: Always ensure your execute function returns a value between 0 and 100, inclusive.
747
- </Note>
748
-
749
- <Warning>
750
- **LLM consistency**: LLM judges can be inconsistent. Use temperature=0 for more deterministic scoring.
751
- </Warning>
752
-
753
- <Info>
754
- **Multiple evaluations**: Use multiple evaluation functions to assess different aspects (accuracy, style, tool usage).
755
- </Info>
756
-
757
- ## Next steps
758
-
759
- <CardGroup cols={2}>
760
- <Card title="Configuration guide" icon="gear" href="/core/exulu-eval/configuration">
761
- Learn about evaluation configuration
762
- </Card>
763
- <Card title="Overview" icon="book" href="/core/exulu-eval/introduction">
764
- Understand evaluation concepts
765
- </Card>
766
- <Card title="ExuluAgent" icon="robot" href="/core/exulu-agent/introduction">
767
- Create agents to evaluate
768
- </Card>
769
- <Card title="ExuluQueues" icon="layer-group" href="/core/exulu-queues/introduction">
770
- Run evaluations as background jobs
771
- </Card>
772
- </CardGroup>