@exulu/backend 1.53.1 → 1.55.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,550 @@
1
+ import { z } from "zod";
2
+ import { tool } from "ai";
3
+ import type { ExuluContext } from "@SRC/exulu/context";
4
+ import { getTableName, getChunksTableName } from "@SRC/exulu/context";
5
+ import { postgresClient } from "@SRC/postgres/client";
6
+ import { applyFilters } from "@SRC/graphql/resolvers/apply-filters";
7
+ import { applyAccessControl } from "@SRC/graphql/utilities/access-control";
8
+ import { convertContextToTableDefinition } from "@SRC/graphql/utilities/convert-context-to-table-definition";
9
+ import type { SearchFilters } from "@SRC/graphql/types";
10
+ import type { VectorSearchChunkResult } from "@SRC/graphql/resolvers/vector-search";
11
+ import type { User } from "@EXULU_TYPES/models/user";
12
+ import type { ChunkResult } from "./types";
13
+
14
/**
 * Builds the zod schema used for a `knowledge_base_ids` tool argument.
 *
 * The schema is an array whose elements must each be one of the given context
 * IDs. Its description lists every context as a `<knowledge_base>` tag carrying
 * the context's id, name and description, one tag per line.
 *
 * @param contexts - Contexts whose IDs become the allowed enum values.
 * @returns A described zod array-of-enum schema.
 */
function buildContextEnum(contexts: ExuluContext[]) {
  // The cast asserts a non-empty tuple for zod's typing; nothing here checks
  // at runtime that `contexts` actually has at least one entry.
  const allowedIds = contexts.map((context) => context.id) as [string, ...string[]];

  const catalogue: string[] = [];
  for (const context of contexts) {
    catalogue.push(
      `<knowledge_base id="${context.id}" name="${context.name}">${context.description}</knowledge_base>`,
    );
  }

  return z.array(z.enum(allowedIds)).describe(catalogue.join("\n"));
}
26
+
27
/**
 * Resolves requested knowledge base IDs to their context objects.
 *
 * Matching is done per requested ID in two passes:
 *   1. exact ID match;
 *   2. otherwise, the first context whose ID contains the requested ID
 *      (case-insensitive).
 *
 * An exact match always wins. Previously both rules lived in one `find`
 * predicate, so an earlier context whose ID merely contained the requested ID
 * (e.g. "products_archive" for "products") shadowed the exact match. An empty
 * ID is never matched by substring, since "" is contained in every ID and
 * would silently select the first context.
 *
 * @param ids - Requested context IDs; when empty or missing, `all` is returned as-is.
 * @param all - Every context available to the caller.
 * @returns The resolved contexts, in the same order as `ids`.
 * @throws Error when an ID matches no context.
 */
function resolveContexts(
  ids: string[],
  all: ExuluContext[],
): ExuluContext[] {
  if (!ids?.length) return all;
  return ids.map((id) => {
    const exact = all.find((c) => c.id === id);
    if (exact) return exact;

    // Lenient fallback: case-insensitive substring match, skipped for blank IDs.
    const needle = id.toLowerCase();
    const fuzzy = needle
      ? all.find((c) => c.id.toLowerCase().includes(needle))
      : undefined;
    if (!fuzzy) throw new Error(`Knowledge base not found: ${id}`);
    return fuzzy;
  });
}
40
+
41
/**
 * Translates the tool-facing search method name into the identifier that is
 * passed as `method` to `ctx.search`.
 *
 * @param method - "hybrid", "keyword" or "semantic".
 * @returns "hybridSearch" for hybrid, "tsvector" for keyword, and
 *          "cosineDistance" for anything else.
 */
function mapSearchMethod(method: "hybrid" | "keyword" | "semantic"): "hybridSearch" | "tsvector" | "cosineDistance" {
  switch (method) {
    case "hybrid":
      return "hybridSearch";
    case "keyword":
      return "tsvector";
    default:
      // "semantic" — and any value outside the declared union — lands here.
      return "cosineDistance";
  }
}
46
+
47
/**
 * Groups session item entries by context ID.
 *
 * Each entry is one of:
 *   "<context_id>/<item_id>"  → selects a single item; the context maps to a string[]
 *   "<context_id>"            → selects the whole context; the context maps to null
 *
 * Only the first "/" separates context from item, so an item ID may itself
 * contain slashes. Entries with an empty context or item part are ignored.
 * A whole-context selection takes precedence over item selections for the
 * same context regardless of the order in which they appear.
 *
 * @param globalIds - Raw entries in either of the two formats above.
 * @returns Map from context ID to its selected item IDs, or null for "everything".
 */
export function parseGlobalItemIds(globalIds: string[]): Map<string, string[] | null> {
  const scope = new Map<string, string[] | null>();

  globalIds.forEach((entry) => {
    const separatorAt = entry.indexOf("/");

    if (separatorAt < 0) {
      // Bare context ID: select the whole context, replacing any item list.
      if (entry !== "") scope.set(entry, null);
      return;
    }

    const contextId = entry.substring(0, separatorAt);
    const itemId = entry.substring(separatorAt + 1);
    if (contextId === "" || itemId === "") return;

    const current = scope.get(contextId);
    // null means the whole context is already selected; keep it that way.
    if (current === null) return;

    if (current === undefined) {
      scope.set(contextId, [itemId]);
    } else {
      current.push(itemId);
    }
  });

  return scope;
}
77
+
78
/** Inputs accepted by `createRetrievalTools`. */
export type RetrievalToolParams = {
  /** Knowledge bases the tools may address; their IDs form the tool input enums. */
  contexts: ExuluContext[];
  /** User on whose behalf queries run; forwarded to access control and `ctx.search`. */
  user?: User;
  /** Role forwarded to `ctx.search`. */
  role?: string;
  /** Callback that receives files (path + content) to write to the virtual filesystem. */
  updateVirtualFiles: (files: Array<{ path: string; content: string }>) => Promise<void>;
  /**
   * Optional preselected scope, keyed by context ID. When provided, it applies to every tool:
   *   value null      → the whole context is in scope (no item filter)
   *   value string[]  → only the listed item IDs are in scope
   *   key absent      → the context was not selected; tools return empty results for it
   */
  preselectedItemsByContext?: Map<string, string[] | null>;
};
91
+
92
+ /**
93
+ * Creates all pre-built retrieval tools. These are passed to the agent loop
94
+ * and filtered per strategy.
95
+ */
96
+ export function createRetrievalTools(params: RetrievalToolParams) {
97
+ const { contexts, user, role, updateVirtualFiles, preselectedItemsByContext } = params;
98
+ const ctxEnum = buildContextEnum(contexts);
99
+
100
+ // ──────────────────────────────────────────────────────────
101
+ // count_items_or_chunks
102
+ // ──────────────────────────────────────────────────────────
103
+ const count_items_or_chunks = tool({
104
+ description:
105
+ "Count items or chunks WITHOUT loading them into context. Use for 'how many', 'count', or 'total number of' queries.",
106
+ inputSchema: z.object({
107
+ knowledge_base_ids: ctxEnum,
108
+ count_what: z
109
+ .enum(["items", "chunks"])
110
+ .describe("Whether to count items (documents) or chunks (pages/sections)"),
111
+ name_contains: z
112
+ .string()
113
+ .optional()
114
+ .describe("Only count items whose name contains this text (case-insensitive)"),
115
+ content_query: z
116
+ .string()
117
+ .optional()
118
+ .describe(
119
+ "Only count chunks matching this search query (uses hybrid search). Only used when count_what is 'chunks'.",
120
+ ),
121
+ }),
122
+ execute: async ({ knowledge_base_ids, count_what, name_contains, content_query }) => {
123
+ const { db } = await postgresClient();
124
+ const ctxList = resolveContexts(knowledge_base_ids, contexts);
125
+
126
+ const counts = await Promise.all(
127
+ ctxList.map(async (ctx) => {
128
+ const contextItemIds = preselectedItemsByContext?.get(ctx.id);
129
+ // undefined = context not in preselection map → skip
130
+ if (preselectedItemsByContext && contextItemIds === undefined) {
131
+ return { context: ctx.id, context_name: ctx.name, count: 0 };
132
+ }
133
+ // null = full context; string[] = specific items
134
+
135
+ let count = 0;
136
+
137
+ if (count_what === "items") {
138
+ const tableName = getTableName(ctx.id);
139
+ let q = db(tableName).count("id as count").whereNull("archived");
140
+ if (name_contains) {
141
+ q = q.whereRaw("LOWER(name) LIKE ?", [`%${name_contains.toLowerCase()}%`]);
142
+ }
143
+ if (Array.isArray(contextItemIds)) {
144
+ q = q.whereIn("id", contextItemIds);
145
+ }
146
+ const tableDefinition = convertContextToTableDefinition(ctx);
147
+ q = applyAccessControl(tableDefinition, q, user, tableName);
148
+ const result = await q.first();
149
+ count = Number(result?.count ?? 0);
150
+ } else {
151
+ const chunksTable = getChunksTableName(ctx.id);
152
+ const baseItemFilters: SearchFilters = Array.isArray(contextItemIds)
153
+ ? [{ id: { in: contextItemIds } }]
154
+ : [];
155
+ if (content_query) {
156
+ const searchResults = await ctx.search({
157
+ query: content_query,
158
+ method: "hybridSearch",
159
+ limit: 10000,
160
+ page: 1,
161
+ itemFilters: baseItemFilters,
162
+ chunkFilters: [],
163
+ sort: { field: "updatedAt", direction: "desc" },
164
+ user,
165
+ role,
166
+ trigger: "tool",
167
+ });
168
+ count = searchResults.chunks.length;
169
+ } else if (Array.isArray(contextItemIds)) {
170
+ const result = await db(chunksTable).count("id as count").whereIn("source", contextItemIds).first();
171
+ count = Number(result?.count ?? 0);
172
+ } else {
173
+ const result = await db(chunksTable).count("id as count").first();
174
+ count = Number(result?.count ?? 0);
175
+ }
176
+ }
177
+
178
+ return { context: ctx.id, context_name: ctx.name, count };
179
+ }),
180
+ );
181
+
182
+ return JSON.stringify({
183
+ total_count: counts.reduce((s, c) => s + c.count, 0),
184
+ breakdown_by_context: counts,
185
+ });
186
+ },
187
+ });
188
+
189
+ // ──────────────────────────────────────────────────────────
190
+ // search_items_by_name
191
+ // ──────────────────────────────────────────────────────────
192
+ const search_items_by_name = tool({
193
+ description:
194
+ "Search for items by their name or external ID. Use when:\n" +
195
+ "• The user asks for a document BY TITLE or NAME\n" +
196
+ "• The user asks whether a specific named document EXISTS (e.g. 'do you have the X manual?', 'is there a document for Y?')\n" +
197
+ "• Any query that references a specific document, manual, or resource by its name rather than by topic\n" +
198
+ "Do NOT use for topic-based content queries (e.g. 'what are the parameters for X?', 'how do I configure Y?').",
199
+ inputSchema: z.object({
200
+ knowledge_base_ids: ctxEnum,
201
+ item_name: z.string().describe(
202
+ "The name or partial name to search for. Uses substring matching, so shorter and more specific terms work better than full phrases. " +
203
+ "Extract only the core identifying part — typically the product model, document title, or unique identifier. " +
204
+ "Do NOT include surrounding descriptors like type words ('manual', 'guide', 'document') or manufacturer names unless they are likely part of the actual document title."
205
+ ),
206
+ limit: z
207
+ .number()
208
+ .default(100)
209
+ .describe(
210
+ "Max items per knowledge base (max 400). Applies independently to each knowledge base.",
211
+ ),
212
+ }),
213
+ execute: async ({ item_name, limit, knowledge_base_ids }) => {
214
+ const { db } = await postgresClient();
215
+ const ctxList = resolveContexts(knowledge_base_ids, contexts);
216
+ const safeLimit = Math.min(limit ?? 100, 400);
217
+
218
+ const results = await Promise.all(
219
+ ctxList.map(async (ctx) => {
220
+ const contextItemIds = preselectedItemsByContext?.get(ctx.id);
221
+ // undefined = context not in preselection map → skip
222
+ if (preselectedItemsByContext && contextItemIds === undefined) return [];
223
+
224
+ const itemFilters: SearchFilters = item_name ? [{ name: { contains: item_name } }] : [];
225
+ if (Array.isArray(contextItemIds)) itemFilters.push({ id: { in: contextItemIds } });
226
+
227
+ const tableName = getTableName(ctx.id);
228
+ const tableDefinition = convertContextToTableDefinition(ctx);
229
+
230
+ let q = db(`${tableName} as items`).select([
231
+ "items.id as item_id",
232
+ "items.name as item_name",
233
+ "items.external_id as item_external_id",
234
+ db.raw('items."updatedAt" as item_updated_at'),
235
+ db.raw('items."createdAt" as item_created_at'),
236
+ ...ctx.fields.map((f) => `items.${f.name} as ${f.name}`),
237
+ ]);
238
+ q = q.limit(safeLimit);
239
+ q = applyFilters(q, itemFilters, tableDefinition, "items");
240
+ q = applyAccessControl(tableDefinition, q, user, "items");
241
+ const items = await q;
242
+
243
+ return Promise.all(
244
+ items.map(async (item) => {
245
+ const chunksTable = getChunksTableName(ctx.id);
246
+ const chunks = await db(chunksTable)
247
+ .select(["id", "source", "metadata"])
248
+ .where("source", item.item_id)
249
+ .limit(1);
250
+
251
+ if (!chunks[0]) return null;
252
+ return {
253
+ item_name: item.item_name,
254
+ item_id: item.item_id,
255
+ context: ctx.id,
256
+ chunk_id: chunks[0].id,
257
+ chunk_index: 1,
258
+ metadata: chunks[0].metadata,
259
+ } satisfies ChunkResult;
260
+ }),
261
+ );
262
+ }),
263
+ );
264
+
265
+ return JSON.stringify(results.flat().filter(Boolean));
266
+ },
267
+ });
268
+
269
+ // ──────────────────────────────────────────────────────────
270
+ // search_content
271
+ // ──────────────────────────────────────────────────────────
272
+ const search_content = tool({
273
+ description: `Search ONE knowledge base for document content using hybrid, keyword, or semantic search.
274
+ Always make a separate call for each knowledge base you want to search — never bundle multiple in one call.
275
+
276
+ Use includeContent: false when you only need to know WHICH documents match (listing, overview, navigation).
277
+ Use includeContent: true when you need the ACTUAL text to answer a question.
278
+
279
+ For listing queries: always start with includeContent: false, then use dynamic tools to fetch specific pages.`,
280
+ inputSchema: z.object({
281
+ query: z.string().describe("Search query about the content you're looking for"),
282
+ knowledge_base_id: z
283
+ .enum(contexts.map((c) => c.id) as [string, ...string[]])
284
+ .describe(
285
+ contexts
286
+ .map(
287
+ (c) =>
288
+ `<knowledge_base id="${c.id}" name="${c.name}">${c.description}</knowledge_base>`,
289
+ )
290
+ .join("\n"),
291
+ ),
292
+ keywords: z.array(z.string()).optional().describe("Keywords extracted from the query"),
293
+ searchMethod: z
294
+ .enum(["hybrid", "keyword", "semantic"])
295
+ .default("hybrid")
296
+ .describe(
297
+ "hybrid: best default (semantic + keyword). keyword: exact terms, product codes, IDs. semantic: conceptual/synonyms.",
298
+ ),
299
+ includeContent: z
300
+ .boolean()
301
+ .default(true)
302
+ .describe(
303
+ "false: returns metadata only (document names, scores) — use for listing/navigation. " +
304
+ "true: returns full chunk text — use when you need content to answer a question.",
305
+ ),
306
+ item_ids: z.array(z.string()).optional().describe("Filter results to specific item IDs"),
307
+ item_names: z
308
+ .array(z.string())
309
+ .optional()
310
+ .describe("Filter results to items whose name contains one of these strings"),
311
+ item_external_ids: z
312
+ .array(z.string())
313
+ .optional()
314
+ .describe("Filter results to specific external IDs"),
315
+ limit: z
316
+ .number()
317
+ .default(20)
318
+ .describe("Max chunks with content (max 20). Without content, up to 200 are returned."),
319
+ }),
320
+ execute: async ({
321
+ query,
322
+ knowledge_base_id,
323
+ keywords,
324
+ searchMethod,
325
+ includeContent,
326
+ item_ids,
327
+ item_names,
328
+ item_external_ids,
329
+ limit,
330
+ }) => {
331
+ const [ctx] = resolveContexts([knowledge_base_id], contexts) as [ExuluContext];
332
+ const effectiveLimit = includeContent ? Math.min(limit ?? 20, 20) : Math.min((limit ?? 20) * 20, 400);
333
+
334
+ const itemFilters: SearchFilters = [];
335
+
336
+ if (preselectedItemsByContext) {
337
+ const contextItemIds = preselectedItemsByContext.get(knowledge_base_id);
338
+ if (contextItemIds === undefined) {
339
+ // Context not in preselection map — nothing to search
340
+ return JSON.stringify([]);
341
+ }
342
+ if (Array.isArray(contextItemIds)) {
343
+ const intersection = item_ids?.length
344
+ ? item_ids.filter((id) => contextItemIds.includes(id))
345
+ : contextItemIds;
346
+ if (!intersection.length) {
347
+ // Agent specified item_ids entirely outside the preselected scope
348
+ return JSON.stringify([]);
349
+ }
350
+ itemFilters.push({ id: { in: intersection } });
351
+ }
352
+ // null = full context → no item filter; agent's item_ids still respected if provided
353
+ else if (item_ids?.length) {
354
+ itemFilters.push({ id: { in: item_ids } });
355
+ }
356
+ } else if (item_ids?.length) {
357
+ itemFilters.push({ id: { in: item_ids } });
358
+ }
359
+
360
+ if (item_names)
361
+ itemFilters.push({ name: { or: item_names.map((n) => ({ contains: n })) } });
362
+ if (item_external_ids) itemFilters.push({ external_id: { in: item_external_ids } });
363
+
364
+ const effectiveQuery = query || keywords?.join(" ") || "";
365
+
366
+ let method = mapSearchMethod(searchMethod ?? "hybrid");
367
+
368
+ if (method === "hybridSearch" || method === "cosineDistance") {
369
+ if (!ctx.embedder) {
370
+ console.error(`[EXULU] context "${ctx.id}" does not have an embedder, falling back to tsvector search`);
371
+ method = "tsvector";
372
+ }
373
+ }
374
+
375
+ try {
376
+ const { chunks } = await ctx.search({
377
+ query: effectiveQuery,
378
+ keywords,
379
+ method,
380
+ limit: effectiveLimit,
381
+ page: 1,
382
+ itemFilters,
383
+ chunkFilters: [],
384
+ sort: { field: "updatedAt", direction: "desc" },
385
+ user,
386
+ role,
387
+ trigger: "tool",
388
+ });
389
+
390
+ return JSON.stringify(
391
+ chunks.map(
392
+ (chunk): ChunkResult => ({
393
+ item_name: chunk.item_name,
394
+ item_id: chunk.item_id,
395
+ context: chunk.context?.id ?? ctx.id,
396
+ chunk_id: chunk.chunk_id,
397
+ chunk_index: chunk.chunk_index,
398
+ chunk_content: includeContent ? chunk.chunk_content : undefined,
399
+ metadata: {
400
+ ...chunk.chunk_metadata,
401
+ cosine_distance: chunk.chunk_cosine_distance,
402
+ fts_rank: chunk.chunk_fts_rank,
403
+ hybrid_score: chunk.chunk_hybrid_score,
404
+ },
405
+ }),
406
+ ),
407
+ );
408
+ } catch (err) {
409
+ console.error(`[EXULU] search_content failed for context "${ctx.id}":`, err);
410
+ return JSON.stringify([]);
411
+ }
412
+ },
413
+ });
414
+
415
+ // ──────────────────────────────────────────────────────────
416
+ // save_search_results
417
+ // ──────────────────────────────────────────────────────────
418
+ const save_search_results = tool({
419
+ description: `Execute a search on ONE knowledge base and save ALL results to the virtual filesystem WITHOUT loading them into context.
420
+ Always make a separate call for each knowledge base you want to search.
421
+
422
+ Use this when you expect many results (>20) and need to filter iteratively:
423
+ 1. Call save_search_results (once per knowledge base) to save up to 1000 results to /search_results_{knowledge_base_id}.txt
424
+ 2. Use bash grep/awk to identify relevant chunks by pattern
425
+ 3. Use dynamic get_content tools to load only the specific chunks you need
426
+
427
+ The saved file format:
428
+ ### RESULT N ###
429
+ ITEM_NAME: ...
430
+ ITEM_ID: ...
431
+ CHUNK_ID: ...
432
+ CHUNK_INDEX: ...
433
+ CONTEXT: ...
434
+ SCORE: ...
435
+ ---CONTENT START---
436
+ (content or placeholder)
437
+ ---CONTENT END---`,
438
+ inputSchema: z.object({
439
+ knowledge_base_id: z
440
+ .enum(contexts.map((c) => c.id) as [string, ...string[]])
441
+ .describe(
442
+ contexts
443
+ .map(
444
+ (c) =>
445
+ `<knowledge_base id="${c.id}" name="${c.name}">${c.description}</knowledge_base>`,
446
+ )
447
+ .join("\n"),
448
+ ),
449
+ query: z.string().describe("Search query"),
450
+ searchMethod: z.enum(["hybrid", "keyword", "semantic"]).default("hybrid"),
451
+ limit: z
452
+ .number()
453
+ .max(1000)
454
+ .default(100)
455
+ .describe("Max results to save (max 1000)"),
456
+ includeContent: z
457
+ .boolean()
458
+ .default(true)
459
+ .describe(
460
+ "Whether to include chunk text in the saved file. False saves tokens — use true only if you need to grep content.",
461
+ ),
462
+ }),
463
+ execute: async ({ query, knowledge_base_id, searchMethod, limit, includeContent }) => {
464
+ const [ctx] = resolveContexts([knowledge_base_id], contexts) as [ExuluContext];
465
+
466
+ const contextItemIds = preselectedItemsByContext?.get(knowledge_base_id);
467
+ // undefined = context not in preselection map → skip
468
+ if (preselectedItemsByContext && contextItemIds === undefined) {
469
+ return JSON.stringify({
470
+ success: true,
471
+ results_count: 0,
472
+ message: `Context "${knowledge_base_id}" not in preselected scope — skipped.`,
473
+ });
474
+ }
475
+
476
+ // null = full context (no filter); string[] = specific items
477
+ const itemFilters: SearchFilters = Array.isArray(contextItemIds)
478
+ ? [{ id: { in: contextItemIds } }]
479
+ : [];
480
+
481
+ let chunks: VectorSearchChunkResult[] = [];
482
+ try {
483
+ const result = await ctx.search({
484
+ query,
485
+ method: mapSearchMethod(searchMethod ?? "hybrid"),
486
+ limit: Math.min(limit ?? 100, 1000),
487
+ page: 1,
488
+ itemFilters,
489
+ chunkFilters: [],
490
+ sort: { field: "updatedAt", direction: "desc" },
491
+ user,
492
+ role,
493
+ trigger: "tool",
494
+ });
495
+ chunks = result.chunks;
496
+ } catch (err) {
497
+ console.error(`[EXULU] save_search_results failed for context "${ctx.id}":`, err);
498
+ }
499
+
500
+ const fileName = `search_results_${ctx.id}.txt`;
501
+ const fileContent = chunks
502
+ .map(
503
+ (chunk, i) =>
504
+ `### RESULT ${i + 1} ###\n` +
505
+ `ITEM_NAME: ${chunk.item_name}\n` +
506
+ `ITEM_ID: ${chunk.item_id}\n` +
507
+ `CHUNK_ID: ${chunk.chunk_id}\n` +
508
+ `CHUNK_INDEX: ${chunk.chunk_index}\n` +
509
+ `CONTEXT: ${chunk.context?.id ?? ""}\n` +
510
+ `SCORE: ${chunk.chunk_hybrid_score ?? chunk.chunk_fts_rank ?? chunk.chunk_cosine_distance ?? 0}\n` +
511
+ `---CONTENT START---\n` +
512
+ `${includeContent && chunk.chunk_content ? chunk.chunk_content : "[use includeContent: true or get_content tool to load]"}\n` +
513
+ `---CONTENT END---\n`,
514
+ )
515
+ .join("\n");
516
+
517
+ await updateVirtualFiles([
518
+ { path: fileName, content: fileContent },
519
+ {
520
+ path: `search_metadata_${ctx.id}.json`,
521
+ content: JSON.stringify({
522
+ query,
523
+ timestamp: new Date().toISOString(),
524
+ results_count: chunks.length,
525
+ context: ctx.id,
526
+ method: searchMethod,
527
+ }),
528
+ },
529
+ ]);
530
+
531
+ return JSON.stringify({
532
+ success: true,
533
+ results_count: chunks.length,
534
+ message: `Saved ${chunks.length} results to /${fileName}`,
535
+ grep_examples: [
536
+ `grep -i 'keyword' ${fileName} | head -20`,
537
+ `grep 'ITEM_NAME:' ${fileName}`,
538
+ `grep -B 5 'pattern' ${fileName} | grep 'CHUNK_ID:'`,
539
+ ],
540
+ });
541
+ },
542
+ });
543
+
544
+ return {
545
+ count_items_or_chunks,
546
+ search_items_by_name,
547
+ search_content,
548
+ save_search_results,
549
+ };
550
+ }