@exulu/backend 1.48.2 → 1.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. package/dist/index.cjs +351 -42
  2. package/dist/index.d.cts +96 -1
  3. package/dist/index.d.ts +96 -1
  4. package/dist/index.js +340 -38
  5. package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
  6. package/ee/python/README.md +295 -0
  7. package/ee/python/documents/processing/README.md +155 -0
  8. package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
  9. package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
  10. package/ee/python/setup.sh +180 -0
  11. package/package.json +14 -3
  12. package/scripts/postinstall.cjs +149 -0
  13. package/.agents/skills/mintlify/SKILL.md +0 -347
  14. package/.editorconfig +0 -15
  15. package/.eslintrc.json +0 -52
  16. package/.github/workflows/release-backend.yml +0 -38
  17. package/.husky/commit-msg +0 -1
  18. package/.jscpd.json +0 -18
  19. package/.mcp.json +0 -25
  20. package/.nvmrc +0 -1
  21. package/.prettierignore +0 -5
  22. package/.prettierrc.json +0 -12
  23. package/CHANGELOG.md +0 -8
  24. package/SECURITY.md +0 -5
  25. package/commitlint.config.js +0 -4
  26. package/devops/documentation/patch-older-releases.md +0 -42
  27. package/ee/documents/processing/build_pdf_processor.sh +0 -35
  28. package/ee/documents/processing/chunk_markdown.py +0 -263
  29. package/ee/documents/processing/pdf_processor.spec +0 -115
  30. package/eslint.config.js +0 -88
  31. package/jest.config.ts +0 -25
  32. package/mintlify-docs/.mintignore +0 -7
  33. package/mintlify-docs/AGENTS.md +0 -33
  34. package/mintlify-docs/CLAUDE.MD +0 -50
  35. package/mintlify-docs/CONTRIBUTING.md +0 -32
  36. package/mintlify-docs/LICENSE +0 -21
  37. package/mintlify-docs/README.md +0 -55
  38. package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
  39. package/mintlify-docs/ai-tools/cursor.mdx +0 -39
  40. package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
  41. package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
  42. package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
  43. package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
  44. package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
  45. package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
  46. package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
  47. package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
  48. package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
  49. package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
  50. package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
  51. package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
  52. package/mintlify-docs/api-reference/core-types.mdx +0 -585
  53. package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
  54. package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
  55. package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
  56. package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
  57. package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
  58. package/mintlify-docs/api-reference/introduction.mdx +0 -661
  59. package/mintlify-docs/api-reference/mutations.mdx +0 -1012
  60. package/mintlify-docs/api-reference/openapi.json +0 -217
  61. package/mintlify-docs/api-reference/queries.mdx +0 -1154
  62. package/mintlify-docs/backend/introduction.mdx +0 -218
  63. package/mintlify-docs/changelog.mdx +0 -387
  64. package/mintlify-docs/community-edition.mdx +0 -304
  65. package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
  66. package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
  67. package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
  68. package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
  69. package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
  70. package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
  71. package/mintlify-docs/core/exulu-authentication.mdx +0 -810
  72. package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
  73. package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
  74. package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
  75. package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
  76. package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
  77. package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
  78. package/mintlify-docs/core/exulu-database.mdx +0 -811
  79. package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
  80. package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
  81. package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
  82. package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
  83. package/mintlify-docs/core/exulu-logging.mdx +0 -464
  84. package/mintlify-docs/core/exulu-otel.mdx +0 -670
  85. package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
  86. package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
  87. package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
  88. package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
  89. package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
  90. package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
  91. package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
  92. package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
  93. package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
  94. package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
  95. package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
  96. package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
  97. package/mintlify-docs/development.mdx +0 -94
  98. package/mintlify-docs/docs.json +0 -248
  99. package/mintlify-docs/enterprise-edition.mdx +0 -538
  100. package/mintlify-docs/essentials/code.mdx +0 -35
  101. package/mintlify-docs/essentials/images.mdx +0 -59
  102. package/mintlify-docs/essentials/markdown.mdx +0 -88
  103. package/mintlify-docs/essentials/navigation.mdx +0 -87
  104. package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
  105. package/mintlify-docs/essentials/settings.mdx +0 -318
  106. package/mintlify-docs/favicon.svg +0 -3
  107. package/mintlify-docs/frontend/introduction.mdx +0 -39
  108. package/mintlify-docs/getting-started.mdx +0 -267
  109. package/mintlify-docs/guides/custom-agent.mdx +0 -608
  110. package/mintlify-docs/guides/first-agent.mdx +0 -315
  111. package/mintlify-docs/images/admin_ui.png +0 -0
  112. package/mintlify-docs/images/contexts.png +0 -0
  113. package/mintlify-docs/images/create_agents.png +0 -0
  114. package/mintlify-docs/images/evals.png +0 -0
  115. package/mintlify-docs/images/graphql.png +0 -0
  116. package/mintlify-docs/images/graphql_api.png +0 -0
  117. package/mintlify-docs/images/hero-dark.png +0 -0
  118. package/mintlify-docs/images/hero-light.png +0 -0
  119. package/mintlify-docs/images/hero.png +0 -0
  120. package/mintlify-docs/images/knowledge_sources.png +0 -0
  121. package/mintlify-docs/images/mcp.png +0 -0
  122. package/mintlify-docs/images/scaling.png +0 -0
  123. package/mintlify-docs/index.mdx +0 -411
  124. package/mintlify-docs/logo/dark.svg +0 -9
  125. package/mintlify-docs/logo/light.svg +0 -9
  126. package/mintlify-docs/partners.mdx +0 -558
  127. package/mintlify-docs/products.mdx +0 -77
  128. package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
  129. package/mintlify-docs/styles.css +0 -207
  130. package/ngrok.bash +0 -1
  131. package/ngrok.md +0 -6
  132. package/ngrok.yml +0 -10
  133. package/release.config.cjs +0 -15
  134. package/skills-lock.json +0 -10
  135. package/types/context-processor.ts +0 -45
  136. package/types/enums/eval-types.ts +0 -5
  137. package/types/enums/field-types.ts +0 -1
  138. package/types/enums/jobs.ts +0 -11
  139. package/types/enums/statistics.ts +0 -13
  140. package/types/exulu-table-definition.ts +0 -79
  141. package/types/file-types.ts +0 -18
  142. package/types/models/agent-session.ts +0 -27
  143. package/types/models/agent.ts +0 -68
  144. package/types/models/context.ts +0 -53
  145. package/types/models/embedding.ts +0 -17
  146. package/types/models/eval-run.ts +0 -40
  147. package/types/models/exulu-agent-tool-config.ts +0 -11
  148. package/types/models/item.ts +0 -21
  149. package/types/models/job.ts +0 -8
  150. package/types/models/project.ts +0 -16
  151. package/types/models/rate-limiter-rules.ts +0 -7
  152. package/types/models/test-case.ts +0 -25
  153. package/types/models/tool.ts +0 -9
  154. package/types/models/user-role.ts +0 -12
  155. package/types/models/user.ts +0 -20
  156. package/types/models/variable.ts +0 -8
  157. package/types/models/vector-methods.ts +0 -7
  158. package/types/provider-config.ts +0 -21
  159. package/types/queue-config.ts +0 -16
  160. package/types/rbac-rights-modes.ts +0 -1
  161. package/types/statistics.ts +0 -20
  162. package/types/workflow.ts +0 -31
  163. /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
  164. /package/ee/{documents/processing → python}/requirements.txt +0 -0
@@ -1,680 +0,0 @@
1
- ---
2
- title: "Configuration"
3
- description: "Complete guide to configuring ExuluEval evaluation functions"
4
- ---
5
-
6
- ## Constructor parameters
7
-
8
- ExuluEval requires specific configuration to define evaluation behavior:
9
-
10
- ```typescript
11
- new ExuluEval({
12
- id: string;
13
- name: string;
14
- description: string;
15
- llm: boolean;
16
- execute: (params) => Promise<number>;
17
- config?: { name: string; description: string }[];
18
- queue?: Promise<ExuluQueueConfig>;
19
- })
20
- ```
21
-
22
- <ParamField path="id" type="string" required>
23
- Unique identifier for the evaluation function
24
- </ParamField>
25
-
26
- <ParamField path="name" type="string" required>
27
- Human-readable name for the evaluation
28
- </ParamField>
29
-
30
- <ParamField path="description" type="string" required>
31
- Description of what this evaluation measures
32
- </ParamField>
33
-
34
- <ParamField path="llm" type="boolean" required>
35
- Whether this evaluation uses an LLM for scoring (LLM-as-judge)
36
- </ParamField>
37
-
38
- <ParamField path="execute" type="function" required>
39
- Function that performs the evaluation and returns a score from 0 to 100
40
- </ParamField>
41
-
42
- <ParamField path="config" type="array" default={undefined}>
43
- Optional configuration parameters for the evaluation function
44
- </ParamField>
45
-
46
- <ParamField path="queue" type="Promise<ExuluQueueConfig>" default={undefined}>
47
- Optional queue configuration for running evaluations as background jobs
48
- </ParamField>
49
-
50
- ## Execute function
51
-
52
- The `execute` function receives evaluation parameters and must return a score between 0 and 100:
53
-
54
- ```typescript
55
- execute: async ({
56
- agent, // Agent database record
57
- backend, // ExuluAgent instance
58
- messages, // Conversation messages
59
- testCase, // Test case with expected output
60
- config // Optional runtime configuration
61
- }) => {
62
- // Your evaluation logic
63
- return score; // Must be 0-100
64
- }
65
- ```
66
-
67
- ### Parameters
68
-
69
- <ParamField path="agent" type="Agent">
70
- The agent database record being evaluated
71
- ```typescript
72
- interface Agent {
73
- id: string;
74
- name: string;
75
- description: string;
76
- // ... other agent properties
77
- }
78
- ```
79
- </ParamField>
80
-
81
- <ParamField path="backend" type="ExuluAgent">
82
- ExuluAgent instance for generating responses or using LLM-as-judge
83
- </ParamField>
84
-
85
- <ParamField path="messages" type="UIMessage[]">
86
- Array of conversation messages including inputs and generated response
87
- ```typescript
88
- interface UIMessage {
89
- role: "user" | "assistant" | "system";
90
- content: string;
91
- toolInvocations?: ToolInvocation[];
92
- }
93
- ```
94
- </ParamField>
95
-
96
- <ParamField path="testCase" type="TestCase">
97
- Test case containing inputs and expected outputs
98
- ```typescript
99
- interface TestCase {
100
- id: string;
101
- name: string;
102
- description?: string;
103
- inputs: UIMessage[];
104
- expected_output: string;
105
- expected_tools?: string[];
106
- expected_knowledge_sources?: string[];
107
- expected_agent_tools?: string[];
108
- }
109
- ```
110
- </ParamField>
111
-
112
- <ParamField path="config" type="Record<string, any>">
113
- Runtime configuration values (optional)
114
- </ParamField>
115
-
116
- ## Configuration patterns
117
-
118
- ### Basic exact match evaluation
119
-
120
- ```typescript
121
- import { ExuluEval } from "@exulu/backend";
122
-
123
- const exactMatchEval = new ExuluEval({
124
- id: "exact_match",
125
- name: "Exact Match",
126
- description: "Returns 100 if response exactly matches expected output, 0 otherwise",
127
- llm: false,
128
- execute: async ({ messages, testCase }) => {
129
- const lastMessage = messages[messages.length - 1];
130
- const response = lastMessage?.content || "";
131
-
132
- return response === testCase.expected_output ? 100 : 0;
133
- }
134
- });
135
- ```
136
-
137
- ### Partial match with scoring
138
-
139
- ```typescript
140
- const partialMatchEval = new ExuluEval({
141
- id: "partial_match",
142
- name: "Partial Match",
143
- description: "Scores based on how much of expected output appears in response",
144
- llm: false,
145
- execute: async ({ messages, testCase }) => {
146
- const lastMessage = messages[messages.length - 1];
147
- const response = lastMessage?.content?.toLowerCase() || "";
148
- const expected = testCase.expected_output.toLowerCase();
149
-
150
- // Split into words
151
- const expectedWords = expected.split(/\s+/);
152
- const matchedWords = expectedWords.filter(word =>
153
- response.includes(word)
154
- );
155
-
156
- return (matchedWords.length / expectedWords.length) * 100;
157
- }
158
- });
159
- ```
160
-
161
- ### Keyword presence evaluation
162
-
163
- ```typescript
164
- const keywordEval = new ExuluEval({
165
- id: "keyword_presence",
166
- name: "Keyword Presence",
167
- description: "Checks if response contains required keywords",
168
- llm: false,
169
- execute: async ({ messages, testCase, config }) => {
170
- const lastMessage = messages[messages.length - 1];
171
- const response = lastMessage?.content?.toLowerCase() || "";
172
-
173
- const keywords = config?.keywords || [];
174
- if (keywords.length === 0) return 100;
175
-
176
- const foundKeywords = keywords.filter(kw =>
177
- response.includes(kw.toLowerCase())
178
- );
179
-
180
- return (foundKeywords.length / keywords.length) * 100;
181
- },
182
- config: [
183
- {
184
- name: "keywords",
185
- description: "Array of keywords that should appear in response"
186
- }
187
- ]
188
- });
189
-
190
- // Run with config
191
- const score = await keywordEval.run(
192
- agent,
193
- backend,
194
- testCase,
195
- messages,
196
- { keywords: ["weather", "temperature", "San Francisco"] }
197
- );
198
- ```
199
-
200
- ### LLM-as-judge evaluation
201
-
202
- ```typescript
203
- const llmJudgeEval = new ExuluEval({
204
- id: "llm_judge",
205
- name: "LLM Judge",
206
- description: "Uses an LLM to evaluate response quality",
207
- llm: true,
208
- execute: async ({ backend, messages, testCase, config }) => {
209
- const lastMessage = messages[messages.length - 1];
210
- const response = lastMessage?.content || "";
211
-
212
- const judgePrompt = `
213
- You are an expert evaluator. Rate the following response on a scale of 0-100.
214
-
215
- Test Case: ${testCase.name}
216
- Description: ${testCase.description || "N/A"}
217
-
218
- Expected Output:
219
- ${testCase.expected_output}
220
-
221
- Actual Response:
222
- ${response}
223
-
224
- Criteria:
225
- 1. Accuracy: Does it match the expected output?
226
- 2. Completeness: Does it address all required aspects?
227
- 3. Clarity: Is it well-structured and understandable?
228
- 4. Relevance: Does it stay on topic?
229
-
230
- Respond with ONLY a number from 0 to 100. No explanation.
231
- `.trim();
232
-
233
- const result = await backend.generateSync({
234
- prompt: judgePrompt,
235
- agentInstance: await loadAgent(config?.judgeAgentId || "default_judge"),
236
- statistics: { label: "eval", trigger: "llm_judge" }
237
- });
238
-
239
- const score = parseInt(result.text.trim());
240
-
241
- if (isNaN(score)) {
242
- console.warn(`LLM judge returned non-numeric: ${result.text}`);
243
- return 0;
244
- }
245
-
246
- return Math.max(0, Math.min(100, score));
247
- },
248
- config: [
249
- {
250
- name: "judgeAgentId",
251
- description: "Agent ID to use for evaluation (must support text generation)"
252
- }
253
- ]
254
- });
255
- ```
256
-
257
- ### Tool usage evaluation
258
-
259
- ```typescript
260
- const toolUsageEval = new ExuluEval({
261
- id: "tool_usage",
262
- name: "Tool Usage",
263
- description: "Checks if agent used expected tools",
264
- llm: false,
265
- execute: async ({ messages, testCase }) => {
266
- // Extract tool calls from conversation
267
- const toolCalls = messages
268
- .flatMap(msg => msg.toolInvocations || [])
269
- .map(inv => inv.toolName);
270
-
271
- const expectedTools = testCase.expected_tools || [];
272
-
273
- // If no tools expected, check that no tools were used
274
- if (expectedTools.length === 0) {
275
- return toolCalls.length === 0 ? 100 : 0;
276
- }
277
-
278
- // Check if all expected tools were used
279
- const usedExpected = expectedTools.filter(tool =>
280
- toolCalls.includes(tool)
281
- );
282
-
283
- return (usedExpected.length / expectedTools.length) * 100;
284
- }
285
- });
286
- ```
287
-
288
- ### Regex pattern matching
289
-
290
- ```typescript
291
- const regexMatchEval = new ExuluEval({
292
- id: "regex_match",
293
- name: "Regex Pattern Match",
294
- description: "Checks if response matches regex pattern",
295
- llm: false,
296
- execute: async ({ messages, testCase, config }) => {
297
- const lastMessage = messages[messages.length - 1];
298
- const response = lastMessage?.content || "";
299
-
300
- const pattern = config?.pattern;
301
- if (!pattern) {
302
- throw new Error("Regex pattern required in config");
303
- }
304
-
305
- const regex = new RegExp(pattern, config?.flags || "");
306
- return regex.test(response) ? 100 : 0;
307
- },
308
- config: [
309
- {
310
- name: "pattern",
311
- description: "Regex pattern to match"
312
- },
313
- {
314
- name: "flags",
315
- description: "Regex flags (e.g., 'i' for case-insensitive)"
316
- }
317
- ]
318
- });
319
-
320
- // Run with regex config
321
- const score = await regexMatchEval.run(
322
- agent,
323
- backend,
324
- testCase,
325
- messages,
326
- {
327
- pattern: "\\d{2}°[FC]", // Matches temperature like "68°F"
328
- flags: "i"
329
- }
330
- );
331
- ```
332
-
333
- ### Length-based evaluation
334
-
335
- ```typescript
336
- const lengthEval = new ExuluEval({
337
- id: "response_length",
338
- name: "Response Length",
339
- description: "Scores based on response length within acceptable range",
340
- llm: false,
341
- execute: async ({ messages, config }) => {
342
- const lastMessage = messages[messages.length - 1];
343
- const response = lastMessage?.content || "";
344
- const length = response.length;
345
-
346
- const minLength = config?.minLength || 0;
347
- const maxLength = config?.maxLength || Infinity;
348
- const targetLength = config?.targetLength;
349
-
350
- // Outside the acceptable range: scale the score down by how far the length misses the bound
351
- if (length < minLength) {
352
- return Math.max(0, (length / minLength) * 100);
353
- }
354
-
355
- if (length > maxLength) {
356
- return Math.max(0, 100 - ((length - maxLength) / maxLength) * 100);
357
- }
358
-
359
- // Within range
360
- if (targetLength) {
361
- const deviation = Math.abs(length - targetLength);
362
- const maxDeviation = Math.max(
363
- targetLength - minLength,
364
- maxLength - targetLength
365
- );
366
- return Math.max(0, 100 - (deviation / maxDeviation) * 50);
367
- }
368
-
369
- return 100;
370
- },
371
- config: [
372
- {
373
- name: "minLength",
374
- description: "Minimum acceptable character count"
375
- },
376
- {
377
- name: "maxLength",
378
- description: "Maximum acceptable character count"
379
- },
380
- {
381
- name: "targetLength",
382
- description: "Ideal character count (optional)"
383
- }
384
- ]
385
- });
386
- ```
387
-
388
- ### Composite evaluation
389
-
390
- Combine multiple evaluation criteria:
391
-
392
- ```typescript
393
- const compositeEval = new ExuluEval({
394
- id: "composite",
395
- name: "Composite Evaluation",
396
- description: "Combines multiple evaluation criteria with weights",
397
- llm: false,
398
- execute: async ({ messages, testCase, config }) => {
399
- const lastMessage = messages[messages.length - 1];
400
- const response = lastMessage?.content || "";
401
-
402
- let totalScore = 0;
403
- let totalWeight = 0;
404
-
405
- // Criteria 1: Contains expected output (weight: 50%)
406
- const containsExpected = response.includes(testCase.expected_output);
407
- totalScore += containsExpected ? 50 : 0;
408
- totalWeight += 50;
409
-
410
- // Criteria 2: Reasonable length (weight: 20%)
411
- const isReasonableLength = response.length >= 50 && response.length <= 500;
412
- totalScore += isReasonableLength ? 20 : 0;
413
- totalWeight += 20;
414
-
415
- // Criteria 3: Uses tools if expected (weight: 30%)
416
- const toolCalls = messages.flatMap(msg => msg.toolInvocations || []);
417
- const expectedTools = testCase.expected_tools || [];
418
- if (expectedTools.length > 0) {
419
- const toolsUsed = expectedTools.every(tool =>
420
- toolCalls.some(call => call.toolName === tool)
421
- );
422
- totalScore += toolsUsed ? 30 : 0;
423
- totalWeight += 30;
424
- } else {
425
- totalScore += 30; // No tools expected, full points
426
- totalWeight += 30;
427
- }
428
-
429
- return (totalScore / totalWeight) * 100;
430
- }
431
- });
432
- ```
433
-
434
- ## Queue configuration
435
-
436
- Run evaluations as background jobs using ExuluQueues:
437
-
438
- ```typescript
439
- import { ExuluEval, ExuluQueues } from "@exulu/backend";
440
-
441
- const backgroundEval = new ExuluEval({
442
- id: "background_eval",
443
- name: "Background Evaluation",
444
- description: "Runs as queued job",
445
- llm: true,
446
- execute: async ({ backend, messages, testCase }) => {
447
- // Long-running evaluation logic
448
- return 85;
449
- },
450
- queue: Promise.resolve({
451
- connection: await ExuluQueues.getConnection(),
452
- name: "evaluations",
453
- prefix: "{exulu}",
454
- defaultJobOptions: {
455
- attempts: 3,
456
- backoff: {
457
- type: "exponential",
458
- delay: 2000
459
- },
460
- removeOnComplete: true,
461
- removeOnFail: false
462
- }
463
- })
464
- });
465
- ```
466
-
467
- ## Advanced patterns
468
-
469
- ### Semantic similarity evaluation
470
-
471
- Use embeddings to measure semantic similarity:
472
-
473
- ```typescript
474
- import { ExuluEval, ExuluEmbedder, ExuluVariables } from "@exulu/backend";
475
-
476
- const semanticSimilarityEval = new ExuluEval({
477
- id: "semantic_similarity",
478
- name: "Semantic Similarity",
479
- description: "Measures semantic similarity using embeddings",
480
- llm: false,
481
- execute: async ({ messages, testCase, config }) => {
482
- const lastMessage = messages[messages.length - 1];
483
- const response = lastMessage?.content || "";
484
-
485
- const embedder = new ExuluEmbedder({
486
- id: "eval_embedder",
487
- name: "Evaluation Embedder",
488
- provider: "openai",
489
- model: "text-embedding-3-small",
490
- vectorDimensions: 1536,
491
- authenticationInformation: await ExuluVariables.get("openai_api_key")
492
- });
493
-
494
- const [responseEmb, expectedEmb] = await embedder.generate([
495
- response,
496
- testCase.expected_output
497
- ]);
498
-
499
- // Cosine similarity
500
- const similarity = cosineSimilarity(responseEmb, expectedEmb);
501
-
502
- // Scale to 0-100
503
- return similarity * 100;
504
- }
505
- });
506
-
507
- function cosineSimilarity(a: number[], b: number[]): number {
508
- const dotProduct = a.reduce((sum, val, i) => sum + val * b[i], 0);
509
- const magnitudeA = Math.sqrt(a.reduce((sum, val) => sum + val * val, 0));
510
- const magnitudeB = Math.sqrt(b.reduce((sum, val) => sum + val * val, 0));
511
- return dotProduct / (magnitudeA * magnitudeB);
512
- }
513
- ```
514
-
515
- ### Multi-aspect LLM judge
516
-
517
- Evaluate multiple aspects separately:
518
-
519
- ```typescript
520
- const multiAspectJudgeEval = new ExuluEval({
521
- id: "multi_aspect_judge",
522
- name: "Multi-Aspect LLM Judge",
523
- description: "Evaluates multiple aspects with separate LLM calls",
524
- llm: true,
525
- execute: async ({ backend, messages, testCase, config }) => {
526
- const lastMessage = messages[messages.length - 1];
527
- const response = lastMessage?.content || "";
528
-
529
- const aspects = [
530
- {
531
- name: "accuracy",
532
- weight: 40,
533
- prompt: "Rate the accuracy of this response (0-100):"
534
- },
535
- {
536
- name: "clarity",
537
- weight: 30,
538
- prompt: "Rate the clarity and readability (0-100):"
539
- },
540
- {
541
- name: "completeness",
542
- weight: 30,
543
- prompt: "Rate how complete the response is (0-100):"
544
- }
545
- ];
546
-
547
- let totalScore = 0;
548
- let totalWeight = 0;
549
-
550
- for (const aspect of aspects) {
551
- const judgePrompt = `
552
- ${aspect.prompt}
553
-
554
- Expected: ${testCase.expected_output}
555
- Actual: ${response}
556
-
557
- Respond with ONLY a number 0-100.
558
- `.trim();
559
-
560
- const result = await backend.generateSync({
561
- prompt: judgePrompt,
562
- agentInstance: await loadAgent(config?.judgeAgentId || "default_judge"),
563
- statistics: { label: "eval", trigger: "multi_aspect_judge" }
564
- });
565
-
566
- const score = parseInt(result.text.trim());
567
- if (!isNaN(score)) {
568
- totalScore += Math.max(0, Math.min(100, score)) * aspect.weight;
569
- totalWeight += aspect.weight;
570
- }
571
- }
572
-
573
- return totalWeight > 0 ? totalScore / totalWeight : 0;
574
- },
575
- config: [
576
- {
577
- name: "judgeAgentId",
578
- description: "Agent ID for LLM judge"
579
- }
580
- ]
581
- });
582
- ```
583
-
584
- ### A/B testing evaluation
585
-
586
- Compare two agent configurations:
587
-
588
- ```typescript
589
- async function compareAgents(
590
- agentA: Agent,
591
- agentB: Agent,
592
- backendA: ExuluAgent,
593
- backendB: ExuluAgent,
594
- testCases: TestCase[],
595
- evals: ExuluEval[]
596
- ) {
597
- const resultsA = [];
598
- const resultsB = [];
599
-
600
- for (const testCase of testCases) {
601
- // Generate response from Agent A
602
- const responseA = await backendA.generateSync({
603
- prompt: testCase.inputs[testCase.inputs.length - 1].content,
604
- agentInstance: await loadAgent(agentA.id),
605
- statistics: { label: "ab_test", trigger: "test" }
606
- });
607
-
608
- const messagesA = [
609
- ...testCase.inputs,
610
- { role: "assistant", content: responseA.text }
611
- ];
612
-
613
- // Generate response from Agent B
614
- const responseB = await backendB.generateSync({
615
- prompt: testCase.inputs[testCase.inputs.length - 1].content,
616
- agentInstance: await loadAgent(agentB.id),
617
- statistics: { label: "ab_test", trigger: "test" }
618
- });
619
-
620
- const messagesB = [
621
- ...testCase.inputs,
622
- { role: "assistant", content: responseB.text }
623
- ];
624
-
625
- // Run evaluations on both
626
- for (const eval of evals) {
627
- const scoreA = await eval.run(agentA, backendA, testCase, messagesA);
628
- const scoreB = await eval.run(agentB, backendB, testCase, messagesB);
629
-
630
- resultsA.push({ testCase: testCase.name, eval: eval.name, score: scoreA });
631
- resultsB.push({ testCase: testCase.name, eval: eval.name, score: scoreB });
632
- }
633
- }
634
-
635
- // Calculate averages
636
- const avgA = resultsA.reduce((sum, r) => sum + r.score, 0) / resultsA.length;
637
- const avgB = resultsB.reduce((sum, r) => sum + r.score, 0) / resultsB.length;
638
-
639
- return {
640
- agentA: { results: resultsA, average: avgA },
641
- agentB: { results: resultsB, average: avgB },
642
- winner: avgA > avgB ? "Agent A" : "Agent B"
643
- };
644
- }
645
- ```
646
-
647
- ## Best practices
648
-
649
- <Tip>
650
- **Weighted scoring**: For composite evaluations, use weighted scoring to prioritize important criteria.
651
- </Tip>
652
-
653
- <Note>
654
- **Error handling**: Always handle errors inside execute functions: either return 0 or throw a descriptive error.
655
- </Note>
656
-
657
- <Warning>
658
- **LLM judge reliability**: LLM judges can be inconsistent. Run each evaluation multiple times and average the scores, or set temperature to 0 for more consistent (though not strictly deterministic) results.
659
- </Warning>
660
-
661
- <Info>
662
- **Config validation**: Validate config parameters at the start of execute functions to provide clear error messages.
663
- </Info>
664
-
665
- ## Next steps
666
-
667
- <CardGroup cols={2}>
668
- <Card title="API reference" icon="code" href="/core/exulu-eval/api-reference">
669
- Explore all methods and properties
670
- </Card>
671
- <Card title="Overview" icon="book" href="/core/exulu-eval/introduction">
672
- Learn about evaluation concepts
673
- </Card>
674
- <Card title="ExuluQueues" icon="layer-group" href="/core/exulu-queues/introduction">
675
- Run evaluations as background jobs
676
- </Card>
677
- <Card title="ExuluAgent" icon="robot" href="/core/exulu-agent/introduction">
678
- Create agents to evaluate
679
- </Card>
680
- </CardGroup>