npm - @exulu/backend - Versions diffs - 1.54.0 → 1.56.0 - Mend

@exulu/backend 1.54.0 → 1.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/dist/index.cjs +2275 -1330
package/dist/index.d.cts +8 -30
package/dist/index.d.ts +8 -30
package/dist/index.js +2256 -1306
package/ee/agentic-retrieval/v3/agent-loop.ts +49 -3
package/ee/agentic-retrieval/v3/classifier.ts +61 -42
package/ee/agentic-retrieval/v3/context-sampler.ts +10 -1
package/ee/agentic-retrieval/v3/index.ts +211 -35
package/ee/agentic-retrieval/v3/session-tools-registry.ts +20 -0
package/ee/agentic-retrieval/v3/strategies.ts +28 -24
package/ee/agentic-retrieval/v3/tools.ts +236 -113
package/ee/agentic-retrieval/v3/trajectory.ts +227 -14
package/ee/agentic-retrieval/v4/agent-loop.ts +142 -55
package/ee/agentic-retrieval/v4/context-sampler.ts +79 -0
package/ee/agentic-retrieval/v4/index.ts +673 -164
package/ee/agentic-retrieval/v4/types.ts +33 -4
package/ee/invoke-skills/create-sandbox.ts +119 -0
package/ee/python/documents/processing/doc_processor.ts +106 -14
package/package.json +4 -2
package/ee/agentic-retrieval/ANALYSIS.md +0 -658
package/ee/agentic-retrieval/index.ts +0 -1109
package/ee/agentic-retrieval/logs/README.md +0 -198
package/ee/agentic-retrieval/v2.ts +0 -1628
package/ee/agentic-retrieval/v4/embed-preprocessor.ts +0 -76
package/ee/agentic-retrieval/v4/system-prompt.ts +0 -248
package/ee/agentic-retrieval/v4/tools.ts +0 -241

package/ee/agentic-retrieval/v3/tools.ts CHANGED

@@ -44,11 +44,50 @@ function mapSearchMethod(method: "hybrid" | "keyword" | "semantic"): "hybridSear
   return "cosineDistance";
 }
+/**
+ * Parses session item entries into a per-context scope map.
+ *
+ * Two supported formats:
+ *   "<context_id>/<item_id>" → specific item; value is a non-empty string[]
+ *   "<context_id>"           → full context (no item filter); value is null
+ *
+ * If both a full-context entry and specific-item entries exist for the same
+ * context, full-context (null) wins regardless of the order in which the
+ * entries appear.
+ *
+ * Parsing details:
+ *   - Only the FIRST "/" separates context from item, so everything after it
+ *     (including further slashes) is kept as the item ID.
+ *   - Empty strings and entries with an empty context or item part
+ *     (e.g. "/item", "ctx/") are silently skipped.
+ *   - Item IDs are appended in input order; duplicates are not removed.
+ *
+ * @param globalIds Raw session item entries in either supported format.
+ * @returns Map keyed by context ID; null means the whole context is selected,
+ *          a string[] lists the selected item IDs. Contexts never mentioned
+ *          in the input have no key.
+ */
+export function parseGlobalItemIds(globalIds: string[]): Map<string, string[] | null> {
+  const map = new Map<string, string[] | null>();
+  for (const gid of globalIds) {
+    const slashIdx = gid.indexOf("/"); // position of the first slash, -1 if none
+    if (slashIdx === -1) {
+      // No slash → entire context selected; this overwrites any item list collected earlier. Empty strings are ignored.
+      if (gid) map.set(gid, null);
+      continue;
+    }
+    const contextId = gid.slice(0, slashIdx);
+    const itemId = gid.slice(slashIdx + 1);
+    if (!contextId || !itemId) continue; // malformed entry (missing context or item part) → skip
+    // Full-context entry already wins — don't downgrade to specific items
+    if (map.get(contextId) === null) continue;
+    const existing = map.get(contextId) ?? []; // start a new list the first time this context is seen
+    existing.push(itemId);
+    map.set(contextId, existing);
+  }
+  return map;
+}
 export type RetrievalToolParams = {
   contexts: ExuluContext[];
+  toolVariablesConfig?: Record<string, any>; // optional per-context settings; keys read in this file: "<contextId>_|_max_results" and "<contextId>_|_expand_chunks"
   user?: User;
   role?: string;
   updateVirtualFiles: (files: Array<{ path: string; content: string }>) => Promise<void>; // used by save_search_results to write the results and metadata files
+  /**
+   * Preselected scope keyed by context ID. When set, every tool is scoped accordingly:
+   *   null        → full context access (no item filter)
+   *   string[]    → only these specific item IDs
+   *   missing key → context was not selected; return empty results
+   *
+   * When this map is omitted entirely, the tools apply no preselection
+   * filtering and behave as if every context were fully selected.
+   */
+  preselectedItemsByContext?: Map<string, string[] | null>;
 };
 /**
@@ -56,7 +95,7 @@ export type RetrievalToolParams = {
  * and filtered per strategy.
  */
 export function createRetrievalTools(params: RetrievalToolParams) {
-  const { contexts, user, role, updateVirtualFiles } = params;
+  const { contexts, toolVariablesConfig, user, role, updateVirtualFiles, preselectedItemsByContext } = params;
   const ctxEnum = buildContextEnum(contexts);
   // ──────────────────────────────────────────────────────────
@@ -87,6 +126,13 @@ export function createRetrievalTools(params: RetrievalToolParams) {
       const counts = await Promise.all(
         ctxList.map(async (ctx) => {
+          const contextItemIds = preselectedItemsByContext?.get(ctx.id);
+          // undefined = context not in preselection map → skip
+          if (preselectedItemsByContext && contextItemIds === undefined) {
+            return { context: ctx.id, context_name: ctx.name, count: 0 };
+          }
+          // null = full context; string[] = specific items
           let count = 0;
           if (count_what === "items") {
@@ -95,19 +141,25 @@ export function createRetrievalTools(params: RetrievalToolParams) {
             if (name_contains) {
               q = q.whereRaw("LOWER(name) LIKE ?", [`%${name_contains.toLowerCase()}%`]);
             }
+            if (Array.isArray(contextItemIds)) {
+              q = q.whereIn("id", contextItemIds);
+            }
             const tableDefinition = convertContextToTableDefinition(ctx);
             q = applyAccessControl(tableDefinition, q, user, tableName);
             const result = await q.first();
             count = Number(result?.count ?? 0);
           } else {
             const chunksTable = getChunksTableName(ctx.id);
+            const baseItemFilters: SearchFilters = Array.isArray(contextItemIds)
+              ? [{ id: { in: contextItemIds } }]
+              : [];
             if (content_query) {
               const searchResults = await ctx.search({
                 query: content_query,
                 method: "hybridSearch",
                 limit: 10000,
                 page: 1,
-                itemFilters: [],
+                itemFilters: baseItemFilters,
                 chunkFilters: [],
                 sort: { field: "updatedAt", direction: "desc" },
                 user,
@@ -115,6 +167,9 @@ export function createRetrievalTools(params: RetrievalToolParams) {
                 trigger: "tool",
               });
               count = searchResults.chunks.length;
+            } else if (Array.isArray(contextItemIds)) {
+              const result = await db(chunksTable).count("id as count").whereIn("source", contextItemIds).first();
+              count = Number(result?.count ?? 0);
             } else {
               const result = await db(chunksTable).count("id as count").first();
               count = Number(result?.count ?? 0);
@@ -137,10 +192,18 @@ export function createRetrievalTools(params: RetrievalToolParams) {
   // ──────────────────────────────────────────────────────────
   const search_items_by_name = tool({
     description:
-      "Search for items by their name or external ID. Use only when the user is asking for documents BY TITLE, not by content topic.",
+      "Search for items by their name or external ID. Use when:\n" +
+      "• The user asks for a document BY TITLE or NAME\n" +
+      "• The user asks whether a specific named document EXISTS (e.g. 'do you have the X manual?', 'is there a document for Y?')\n" +
+      "• Any query that references a specific document, manual, or resource by its name rather than by topic\n" +
+      "Do NOT use for topic-based content queries (e.g. 'what are the parameters for X?', 'how do I configure Y?').",
     inputSchema: z.object({
       knowledge_base_ids: ctxEnum,
-      item_name: z.string().describe("The name or partial name to search for"),
+      item_name: z.string().describe(
+        "The name or partial name to search for. Uses substring matching, so shorter and more specific terms work better than full phrases. " +
+        "Extract only the core identifying part — typically the product model, document title, or unique identifier. " +
+        "Do NOT include surrounding descriptors like type words ('manual', 'guide', 'document') or manufacturer names unless they are likely part of the actual document title."
+      ),
       limit: z
         .number()
         .default(100)
@@ -152,10 +215,16 @@ export function createRetrievalTools(params: RetrievalToolParams) {
       const { db } = await postgresClient();
       const ctxList = resolveContexts(knowledge_base_ids, contexts);
       const safeLimit = Math.min(limit ?? 100, 400);
-      const itemFilters: SearchFilters = item_name ? [{ name: { contains: item_name } }] : [];
       const results = await Promise.all(
         ctxList.map(async (ctx) => {
+          const contextItemIds = preselectedItemsByContext?.get(ctx.id);
+          // undefined = context not in preselection map → skip
+          if (preselectedItemsByContext && contextItemIds === undefined) return [];
+          const itemFilters: SearchFilters = item_name ? [{ name: { contains: item_name } }] : [];
+          if (Array.isArray(contextItemIds)) itemFilters.push({ id: { in: contextItemIds } });
           const tableName = getTableName(ctx.id);
           const tableDefinition = convertContextToTableDefinition(ctx);
@@ -202,15 +271,25 @@ export function createRetrievalTools(params: RetrievalToolParams) {
   // search_content
   // ──────────────────────────────────────────────────────────
   const search_content = tool({
-    description: `Search across document content using hybrid, keyword, or semantic search.
+    description: `Search ONE knowledge base for document content using hybrid, keyword, or semantic search.
+Always make a separate call for each knowledge base you want to search — never bundle multiple in one call.
 Use includeContent: false when you only need to know WHICH documents match (listing, overview, navigation).
 Use includeContent: true when you need the ACTUAL text to answer a question.
 For listing queries: always start with includeContent: false, then use dynamic tools to fetch specific pages.`,
     inputSchema: z.object({
-      query: z.string().describe("Search query about the content you're looking for"),
-      knowledge_base_ids: ctxEnum,
+      userQuery: z.string().describe("The original unaltered question from the user"),
+      knowledge_base_id: z
+        .enum(contexts.map((c) => c.id) as [string, ...string[]])
+        .describe(
+          contexts
+            .map(
+              (c) =>
+                `<knowledge_base id="${c.id}" name="${c.name}">${c.description}</knowledge_base>`,
+            )
+            .join("\n"),
+        ),
       keywords: z.array(z.string()).optional().describe("Keywords extracted from the query"),
       searchMethod: z
         .enum(["hybrid", "keyword", "semantic"])
@@ -236,12 +315,12 @@ For listing queries: always start with includeContent: false, then use dynamic t
         .describe("Filter results to specific external IDs"),
       limit: z
         .number()
-        .default(10)
-        .describe("Max chunks with content (max 10). Without content, up to 200 are returned."),
+        .default(20)
+        .describe("Max chunks with content (max 20). Without content, up to 200 are returned."),
     }),
     execute: async ({
-      query,
-      knowledge_base_ids,
+      userQuery,
+      knowledge_base_id,
       keywords,
       searchMethod,
       includeContent,
@@ -250,70 +329,94 @@ For listing queries: always start with includeContent: false, then use dynamic t
       item_external_ids,
       limit,
     }) => {
-      const ctxList = resolveContexts(knowledge_base_ids, contexts);
-      const effectiveLimit = includeContent ? Math.min(limit ?? 10, 10) : Math.min((limit ?? 10) * 20, 400);
-      const results = await Promise.all(
-        ctxList.map(async (ctx) => {
-          const itemFilters: SearchFilters = [];
-          if (item_ids) itemFilters.push({ id: { in: item_ids } });
-          if (item_names)
-            itemFilters.push({ name: { or: item_names.map((n) => ({ contains: n })) } });
-          if (item_external_ids) itemFilters.push({ external_id: { in: item_external_ids } });
-          const effectiveQuery = query || keywords?.join(" ") || "";
-          let method = mapSearchMethod(searchMethod ?? "hybrid")
-          if (
-            method === "hybridSearch" ||
-            method === "cosineDistance"
-          ) {
-            if (!ctx.embedder) {
-              console.error(`[EXULU] context "${ctx.id}" does not have an embedder, falling back to tsvector search`);
-              method = "tsvector"
-            }
+      const [ctx] = resolveContexts([knowledge_base_id], contexts) as [ExuluContext];
+      const maxResults = toolVariablesConfig?.[`${ctx.id}_|_max_results`] || 20;
+      const effectiveLimit = includeContent ? Math.min(limit ?? maxResults, maxResults) : Math.min((limit ?? maxResults) * maxResults, 400);
+      const itemFilters: SearchFilters = [];
+      if (preselectedItemsByContext) {
+        const contextItemIds = preselectedItemsByContext.get(knowledge_base_id);
+        if (contextItemIds === undefined) {
+          // Context not in preselection map — nothing to search
+          return JSON.stringify([]);
+        }
+        if (Array.isArray(contextItemIds)) {
+          const intersection = item_ids?.length
+            ? item_ids.filter((id) => contextItemIds.includes(id))
+            : contextItemIds;
+          if (!intersection.length) {
+            // Agent specified item_ids entirely outside the preselected scope
+            return JSON.stringify([]);
           }
-          try {
-            const { chunks } = await ctx.search({
-              query: effectiveQuery,
-              keywords,
-              method: method,
-              limit: effectiveLimit,
-              page: 1,
-              itemFilters,
-              chunkFilters: [],
-              sort: { field: "updatedAt", direction: "desc" },
-              user,
-              role,
-              trigger: "tool",
-            });
-            return chunks.map(
-              (chunk): ChunkResult => ({
-                item_name: chunk.item_name,
-                item_id: chunk.item_id,
-                context: chunk.context?.id ?? ctx.id,
-                chunk_id: chunk.chunk_id,
-                chunk_index: chunk.chunk_index,
-                chunk_content: includeContent ? chunk.chunk_content : undefined,
-                metadata: {
-                  ...chunk.chunk_metadata,
-                  cosine_distance: chunk.chunk_cosine_distance,
-                  fts_rank: chunk.chunk_fts_rank,
-                  hybrid_score: chunk.chunk_hybrid_score,
-                },
-              }),
-            );
-          } catch (err) {
-            console.error(`[EXULU] search_content failed for context "${ctx.id}":`, err);
-            return [];
-          }
-        }),
-      );
-      return JSON.stringify(results.flat());
+          itemFilters.push({ id: { in: intersection } });
+        }
+        // null = full context → no item filter; agent's item_ids still respected if provided
+        else if (item_ids?.length) {
+          itemFilters.push({ id: { in: item_ids } });
+        }
+      } else if (item_ids?.length) {
+        itemFilters.push({ id: { in: item_ids } });
+      }
+      if (item_names)
+        itemFilters.push({ name: { or: item_names.map((n) => ({ contains: n })) } });
+      if (item_external_ids) itemFilters.push({ external_id: { in: item_external_ids } });
+      const effectiveQuery = userQuery || keywords?.join(" ") || "";
+      let method = mapSearchMethod(searchMethod ?? "hybrid");
+      if (method === "hybridSearch" || method === "cosineDistance") {
+        if (!ctx.embedder) {
+          console.error(`[EXULU] context "${ctx.id}" does not have an embedder, falling back to tsvector search`);
+          method = "tsvector";
+        }
+      }
+      const expandChunks = toolVariablesConfig?.[`${ctx.id}_|_expand_chunks`] || 0;
+      try {
+        const { chunks } = await ctx.search({
+          query: effectiveQuery,
+          keywords,
+          method,
+          limit: effectiveLimit,
+          page: 1,
+          itemFilters,
+          chunkFilters: [],
+          sort: { field: "updatedAt", direction: "desc" },
+          user,
+          role,
+          trigger: "tool",
+          expand: expandChunks > 0 ? {
+            before: expandChunks,
+            after: expandChunks,
+          } : undefined,
+        });
+        return JSON.stringify(
+          chunks.map(
+            (chunk): ChunkResult => ({
+              item_name: chunk.item_name,
+              item_id: chunk.item_id,
+              context: chunk.context?.id ?? ctx.id,
+              chunk_id: chunk.chunk_id,
+              chunk_index: chunk.chunk_index,
+              chunk_content: includeContent ? chunk.chunk_content : undefined,
+              metadata: {
+                ...chunk.chunk_metadata,
+                cosine_distance: chunk.chunk_cosine_distance,
+                fts_rank: chunk.chunk_fts_rank,
+                hybrid_score: chunk.chunk_hybrid_score,
+              },
+            }),
+          ),
+        );
+      } catch (err) {
+        console.error(`[EXULU] search_content failed for context "${ctx.id}":`, err);
+        return JSON.stringify([]);
+      }
     },
   });
@@ -321,10 +424,11 @@ For listing queries: always start with includeContent: false, then use dynamic t
   // save_search_results
   // ──────────────────────────────────────────────────────────
   const save_search_results = tool({
-    description: `Execute a search and save ALL results to the virtual filesystem WITHOUT loading them into context.
+    description: `Execute a search on ONE knowledge base and save ALL results to the virtual filesystem WITHOUT loading them into context.
+Always make a separate call for each knowledge base you want to search.
 Use this when you expect many results (>20) and need to filter iteratively:
-1. Call save_search_results to save up to 1000 results to /search_results.txt
+1. Call save_search_results (once per knowledge base) to save up to 1000 results to /search_results_{knowledge_base_id}.txt
 2. Use bash grep/awk to identify relevant chunks by pattern
 3. Use dynamic get_content tools to load only the specific chunks you need
@@ -340,7 +444,16 @@ SCORE: ...
 (content or placeholder)
 ---CONTENT END---`,
     inputSchema: z.object({
-      knowledge_base_ids: ctxEnum,
+      knowledge_base_id: z
+        .enum(contexts.map((c) => c.id) as [string, ...string[]])
+        .describe(
+          contexts
+            .map(
+              (c) =>
+                `<knowledge_base id="${c.id}" name="${c.name}">${c.description}</knowledge_base>`,
+            )
+            .join("\n"),
+        ),
       query: z.string().describe("Search query"),
       searchMethod: z.enum(["hybrid", "keyword", "semantic"]).default("hybrid"),
       limit: z
@@ -355,34 +468,44 @@ SCORE: ...
           "Whether to include chunk text in the saved file. False saves tokens — use true only if you need to grep content.",
         ),
     }),
-    execute: async ({ query, knowledge_base_ids, searchMethod, limit, includeContent }) => {
-      const ctxList = resolveContexts(knowledge_base_ids, contexts);
-      const results = await Promise.all(
-        ctxList.map(async (ctx) => {
-          try {
-            const { chunks } = await ctx.search({
-              query,
-              method: mapSearchMethod(searchMethod ?? "hybrid"),
-              limit: Math.min(limit ?? 100, 1000),
-              page: 1,
-              itemFilters: [],
-              chunkFilters: [],
-              sort: { field: "updatedAt", direction: "desc" },
-              user,
-              role,
-              trigger: "tool",
-            });
-            return chunks;
-          } catch (err) {
-            console.error(`[EXULU] save_search_results failed for context "${ctx.id}":`, err);
-            return [];
-          }
-        }),
-      );
-      const chunks: VectorSearchChunkResult[] = results.flat();
+    execute: async ({ query, knowledge_base_id, searchMethod, limit, includeContent }) => {
+      const [ctx] = resolveContexts([knowledge_base_id], contexts) as [ExuluContext];
+      const contextItemIds = preselectedItemsByContext?.get(knowledge_base_id);
+      // undefined = context not in preselection map → skip
+      if (preselectedItemsByContext && contextItemIds === undefined) {
+        return JSON.stringify({
+          success: true,
+          results_count: 0,
+          message: `Context "${knowledge_base_id}" not in preselected scope — skipped.`,
+        });
+      }
+      // null = full context (no filter); string[] = specific items
+      const itemFilters: SearchFilters = Array.isArray(contextItemIds)
+        ? [{ id: { in: contextItemIds } }]
+        : [];
+      let chunks: VectorSearchChunkResult[] = [];
+      try {
+        const result = await ctx.search({
+          query,
+          method: mapSearchMethod(searchMethod ?? "hybrid"),
+          limit: Math.min(limit ?? 100, 1000),
+          page: 1,
+          itemFilters,
+          chunkFilters: [],
+          sort: { field: "updatedAt", direction: "desc" },
+          user,
+          role,
+          trigger: "tool",
+        });
+        chunks = result.chunks;
+      } catch (err) {
+        console.error(`[EXULU] save_search_results failed for context "${ctx.id}":`, err);
+      }
+      const fileName = `search_results_${ctx.id}.txt`;
       const fileContent = chunks
         .map(
           (chunk, i) =>
@@ -400,14 +523,14 @@ SCORE: ...
         .join("\n");
       await updateVirtualFiles([
-        { path: "search_results.txt", content: fileContent },
+        { path: fileName, content: fileContent },
         {
-          path: "search_metadata.json",
+          path: `search_metadata_${ctx.id}.json`,
           content: JSON.stringify({
             query,
             timestamp: new Date().toISOString(),
             results_count: chunks.length,
-            contexts: ctxList.map((c) => c.id),
+            context: ctx.id,
             method: searchMethod,
           }),
         },
@@ -416,11 +539,11 @@ SCORE: ...
       return JSON.stringify({
         success: true,
         results_count: chunks.length,
-        message: `Saved ${chunks.length} results to /search_results.txt`,
+        message: `Saved ${chunks.length} results to /${fileName}`,
         grep_examples: [
-          "grep -i 'keyword' search_results.txt | head -20",
-          "grep 'ITEM_NAME:' search_results.txt",
-          "grep -B 5 'pattern' search_results.txt | grep 'CHUNK_ID:'",
+          `grep -i 'keyword' ${fileName} | head -20`,
+          `grep 'ITEM_NAME:' ${fileName}`,
+          `grep -B 5 'pattern' ${fileName} | grep 'CHUNK_ID:'`,
         ],
       });
     },