@exulu/backend 1.53.1 → 1.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3404 -2389
- package/dist/index.d.cts +66 -4
- package/dist/index.d.ts +66 -4
- package/dist/index.js +4926 -3918
- package/ee/agentic-retrieval/ANALYSIS.md +658 -0
- package/ee/agentic-retrieval/logs/README.md +198 -0
- package/ee/agentic-retrieval/v2.ts +1628 -0
- package/ee/agentic-retrieval/v3/agent-loop.ts +242 -0
- package/ee/agentic-retrieval/v3/classifier.ts +73 -0
- package/ee/agentic-retrieval/v3/context-sampler.ts +70 -0
- package/ee/agentic-retrieval/v3/dynamic-tools.ts +115 -0
- package/ee/agentic-retrieval/v3/index.ts +281 -0
- package/ee/agentic-retrieval/v3/strategies.ts +167 -0
- package/ee/agentic-retrieval/v3/tools.ts +435 -0
- package/ee/agentic-retrieval/v3/trajectory.ts +96 -0
- package/ee/agentic-retrieval/v3/types.ts +59 -0
- package/ee/agentic-retrieval/v4/agent-loop.ts +121 -0
- package/ee/agentic-retrieval/v4/embed-preprocessor.ts +76 -0
- package/ee/agentic-retrieval/v4/index.ts +181 -0
- package/ee/agentic-retrieval/v4/system-prompt.ts +248 -0
- package/ee/agentic-retrieval/v4/tools.ts +241 -0
- package/ee/agentic-retrieval/v4/types.ts +29 -0
- package/ee/chunking/markdown.ts +4 -2
- package/ee/workers.ts +1 -1
- package/package.json +6 -3
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import * as fs from "fs/promises";
|
|
2
|
+
import * as path from "path";
|
|
3
|
+
import { exec } from "child_process";
|
|
4
|
+
import { promisify } from "util";
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
import { tool } from "ai";
|
|
7
|
+
import { postgresClient } from "@SRC/postgres/client";
|
|
8
|
+
import type { ExuluContext } from "@SRC/exulu/context";
|
|
9
|
+
import type { User } from "@EXULU_TYPES/models/user";
|
|
10
|
+
import { preprocessEmbedCalls } from "./embed-preprocessor";
|
|
11
|
+
import type { ChunkResult } from "./types";
|
|
12
|
+
|
|
13
|
+
const execAsync = promisify(exec);
|
|
14
|
+
|
|
15
|
+
// Query results whose JSON serialization exceeds this many characters are
// written to the session directory instead of being returned inline.
const MAX_INLINE_CHARS = 20_000;
// Hard cap on grep output returned to the agent; longer output is truncated.
const MAX_GREP_OUTPUT_CHARS = 5_000;
|
|
17
|
+
|
|
18
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
19
|
+
// SQL safety: only allow read-only statements
|
|
20
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
21
|
+
|
|
22
|
+
const WRITE_PATTERN =
|
|
23
|
+
/^\s*(INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|GRANT|REVOKE|VACUUM|ANALYZE|EXPLAIN\s+ANALYZE)\b/i;
|
|
24
|
+
|
|
25
|
+
function assertReadOnly(sql: string): void {
|
|
26
|
+
if (WRITE_PATTERN.test(sql)) {
|
|
27
|
+
throw new Error(
|
|
28
|
+
"Only SELECT queries are allowed. Write operations (INSERT, UPDATE, DELETE, DROP, etc.) are not permitted.",
|
|
29
|
+
);
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
34
|
+
// Chunk harvesting: extract ChunkResult objects from raw SQL result rows
|
|
35
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Tries to interpret a raw DB row as a ChunkResult.
|
|
39
|
+
* The system prompt instructs the agent to use standard aliases, so we look for
|
|
40
|
+
* those first and fall back to common alternative column names.
|
|
41
|
+
*/
|
|
42
|
+
export function rowToChunkResult(row: Record<string, any>): ChunkResult | null {
|
|
43
|
+
const chunkId = row.chunk_id ?? row.id;
|
|
44
|
+
const chunkContent = row.chunk_content ?? row.content;
|
|
45
|
+
const itemId = row.item_id ?? row.source;
|
|
46
|
+
const context = row.context ?? row.context_id;
|
|
47
|
+
const itemName = row.item_name ?? row.name;
|
|
48
|
+
|
|
49
|
+
// Require at minimum a chunk identifier and either content or an item reference
|
|
50
|
+
if (!chunkId || (!chunkContent && !itemId)) return null;
|
|
51
|
+
|
|
52
|
+
return {
|
|
53
|
+
item_name: itemName ?? "",
|
|
54
|
+
item_id: itemId ?? "",
|
|
55
|
+
context: context ?? "",
|
|
56
|
+
chunk_id: chunkId,
|
|
57
|
+
chunk_index: row.chunk_index ?? undefined,
|
|
58
|
+
chunk_content: chunkContent ?? undefined,
|
|
59
|
+
metadata: row.metadata ?? row.chunk_metadata ?? undefined,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
64
|
+
// Tool factory
|
|
65
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
66
|
+
|
|
67
|
+
/** Dependencies shared by every tool built in createTools. */
export type ToolFactoryParams = {
  /** Knowledge-base contexts forwarded to embed() preprocessing. */
  contexts: ExuluContext[];
  /** Optional user forwarded to embed() preprocessing. */
  user?: User;
  /** Optional role forwarded to embed() preprocessing. */
  role?: string;
  /** Directory where oversized query results are stored for the grep tool. */
  sessionDir: string;
};
|
|
73
|
+
|
|
74
|
+
// eslint-disable-next-line @typescript-eslint/explicit-function-return-type
|
|
75
|
+
export function createTools(params: ToolFactoryParams) {
|
|
76
|
+
const { contexts, user, role, sessionDir } = params;
|
|
77
|
+
let queryCount = 0;
|
|
78
|
+
|
|
79
|
+
// ── execute_query ────────────────────────────────────────────────────────────
|
|
80
|
+
|
|
81
|
+
const execute_query = tool({
|
|
82
|
+
description: `Execute a read-only PostgreSQL SELECT query against the knowledge base.
|
|
83
|
+
|
|
84
|
+
Use this to search, filter, aggregate, and explore content. The database contains items
|
|
85
|
+
and chunks tables for each knowledge base (see schema in the system prompt).
|
|
86
|
+
|
|
87
|
+
Use embed('your text') anywhere in the query to generate a semantic search vector:
|
|
88
|
+
embedding <=> embed('machine learning') AS distance
|
|
89
|
+
|
|
90
|
+
If the result exceeds ${(MAX_INLINE_CHARS / 1000).toFixed(0)}k characters it is saved to a file.
|
|
91
|
+
Use the grep tool to iteratively search the file for relevant information.`,
|
|
92
|
+
inputSchema: z.object({
|
|
93
|
+
sql: z.string().describe("A read-only SELECT (or WITH ... SELECT) PostgreSQL query"),
|
|
94
|
+
}),
|
|
95
|
+
execute: async ({ sql }) => {
|
|
96
|
+
assertReadOnly(sql);
|
|
97
|
+
|
|
98
|
+
let processedSql: string;
|
|
99
|
+
try {
|
|
100
|
+
processedSql = await preprocessEmbedCalls(sql, contexts, user, role);
|
|
101
|
+
} catch (err: any) {
|
|
102
|
+
return JSON.stringify({ error: `embed() preprocessing failed: ${err.message}` });
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
let rows: any[];
|
|
106
|
+
try {
|
|
107
|
+
const { db } = await postgresClient();
|
|
108
|
+
const result = await db.raw(processedSql);
|
|
109
|
+
rows = result.rows ?? [];
|
|
110
|
+
} catch (err: any) {
|
|
111
|
+
return JSON.stringify({ error: `Query failed: ${err.message}` });
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
const json = JSON.stringify(rows, null, 2);
|
|
115
|
+
|
|
116
|
+
if (json.length <= MAX_INLINE_CHARS) {
|
|
117
|
+
return json;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// Results are large — store to session dir and tell the agent to grep
|
|
121
|
+
await fs.mkdir(sessionDir, { recursive: true });
|
|
122
|
+
const filename = `query_${++queryCount}.json`;
|
|
123
|
+
const filePath = path.join(sessionDir, filename);
|
|
124
|
+
await fs.writeFile(filePath, json, "utf-8");
|
|
125
|
+
|
|
126
|
+
return JSON.stringify({
|
|
127
|
+
stored: true,
|
|
128
|
+
file: filePath,
|
|
129
|
+
row_count: rows.length,
|
|
130
|
+
message: `Results too large to display (${rows.length} rows, ${(json.length / 1000).toFixed(1)}k chars). Stored at ${filePath}. Use the grep tool to search for relevant information.`,
|
|
131
|
+
grep_hint: `grep -i "keyword" ${filePath}`,
|
|
132
|
+
});
|
|
133
|
+
},
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
// ── grep ─────────────────────────────────────────────────────────────────────
|
|
137
|
+
|
|
138
|
+
const grep = tool({
|
|
139
|
+
description: `Search a stored query result file using grep.
|
|
140
|
+
|
|
141
|
+
Use this after execute_query returns a file path because results were too large.
|
|
142
|
+
Iteratively narrow down the results with multiple grep calls.`,
|
|
143
|
+
inputSchema: z.object({
|
|
144
|
+
pattern: z.string().describe("Regular expression or literal string to search for"),
|
|
145
|
+
file: z.string().describe("Absolute path to the file returned by execute_query"),
|
|
146
|
+
context_lines: z
|
|
147
|
+
.number()
|
|
148
|
+
.int()
|
|
149
|
+
.min(0)
|
|
150
|
+
.max(10)
|
|
151
|
+
.default(2)
|
|
152
|
+
.describe("Number of lines of context to show around each match (default 2)"),
|
|
153
|
+
case_insensitive: z
|
|
154
|
+
.boolean()
|
|
155
|
+
.default(true)
|
|
156
|
+
.describe("Case-insensitive matching (default true)"),
|
|
157
|
+
}),
|
|
158
|
+
execute: async ({ pattern, file, context_lines, case_insensitive }) => {
|
|
159
|
+
// Security: only allow reading from our session directory
|
|
160
|
+
const resolvedFile = path.resolve(file);
|
|
161
|
+
const resolvedSession = path.resolve(sessionDir);
|
|
162
|
+
if (!resolvedFile.startsWith(resolvedSession)) {
|
|
163
|
+
return JSON.stringify({
|
|
164
|
+
error: `Access denied. Only files within the session directory (${sessionDir}) can be searched.`,
|
|
165
|
+
});
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// Verify file exists
|
|
169
|
+
try {
|
|
170
|
+
await fs.access(resolvedFile);
|
|
171
|
+
} catch {
|
|
172
|
+
return JSON.stringify({ error: `File not found: ${file}` });
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
const flags = [
|
|
176
|
+
"-n",
|
|
177
|
+
context_lines > 0 ? `-C${context_lines}` : "",
|
|
178
|
+
case_insensitive ? "-i" : "",
|
|
179
|
+
]
|
|
180
|
+
.filter(Boolean)
|
|
181
|
+
.join(" ");
|
|
182
|
+
|
|
183
|
+
// Escape pattern for shell to prevent injection
|
|
184
|
+
const escapedPattern = pattern.replace(/'/g, `'\\''`);
|
|
185
|
+
const cmd = `grep ${flags} '${escapedPattern}' '${resolvedFile}'`;
|
|
186
|
+
|
|
187
|
+
let output: string;
|
|
188
|
+
try {
|
|
189
|
+
const { stdout } = await execAsync(cmd, { maxBuffer: 10 * 1024 * 1024 });
|
|
190
|
+
output = stdout;
|
|
191
|
+
} catch (err: any) {
|
|
192
|
+
// grep exits with code 1 when no matches — that's not an error
|
|
193
|
+
if (err.code === 1) {
|
|
194
|
+
return JSON.stringify({ matches: 0, output: "No matches found." });
|
|
195
|
+
}
|
|
196
|
+
return JSON.stringify({ error: `grep failed: ${err.message}` });
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
if (output.length > MAX_GREP_OUTPUT_CHARS) {
|
|
200
|
+
output =
|
|
201
|
+
output.slice(0, MAX_GREP_OUTPUT_CHARS) +
|
|
202
|
+
`\n... (output truncated at ${MAX_GREP_OUTPUT_CHARS} chars — refine your pattern to narrow results)`;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
const lineCount = output.split("\n").filter(Boolean).length;
|
|
206
|
+
return JSON.stringify({ matches: lineCount, output });
|
|
207
|
+
},
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
return { execute_query, grep };
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
/**
|
|
214
|
+
* Harvests ChunkResult objects from all tool results in a step.
|
|
215
|
+
* Called after each agent step to collect any chunk-shaped rows the agent retrieved.
|
|
216
|
+
*/
|
|
217
|
+
export function harvestChunks(toolResults: any[]): ChunkResult[] {
|
|
218
|
+
const chunks: ChunkResult[] = [];
|
|
219
|
+
|
|
220
|
+
for (const result of toolResults ?? []) {
|
|
221
|
+
const rawOutput = result.output ?? result.result;
|
|
222
|
+
let parsed: any;
|
|
223
|
+
try {
|
|
224
|
+
parsed = typeof rawOutput === "string" ? JSON.parse(rawOutput) : rawOutput;
|
|
225
|
+
} catch {
|
|
226
|
+
continue;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// Array of rows (direct SELECT result)
|
|
230
|
+
if (Array.isArray(parsed)) {
|
|
231
|
+
for (const row of parsed) {
|
|
232
|
+
if (row && typeof row === "object") {
|
|
233
|
+
const chunk = rowToChunkResult(row);
|
|
234
|
+
if (chunk) chunks.push(chunk);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
return chunks;
|
|
241
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/** One retrieved chunk, normalized from a raw SQL result row. */
export interface ChunkResult {
  /** Display name of the source item; "" when the row did not provide one. */
  item_name: string;
  /** Identifier of the source item; "" when the row did not provide one. */
  item_id: string;
  /** Knowledge-base context the chunk came from; "" when unknown. */
  context: string;
  /** Chunk identifier, when the row carried one. */
  chunk_id?: string;
  /** Position of the chunk within its item, when available. */
  chunk_index?: number;
  /** The chunk's text content, when selected by the query. */
  chunk_content?: string;
  /** Arbitrary metadata attached to the chunk, when available. */
  metadata?: Record<string, any>;
}

/** Record of a single step taken by the retrieval agent. */
export interface RetrievalStep {
  /** Ordinal of this step within the agent loop. */
  stepNumber: number;
  /** Text the model produced during this step. */
  text: string;
  /** Tool invocations the model made during this step. */
  toolCalls: Array<{ name: string; id: string; input: any }>;
  /** Chunks harvested from this step's tool results. */
  chunks: ChunkResult[];
  /** Token count attributed to this step. */
  tokens: number;
}

/** Final output of an agentic retrieval run. */
export interface AgenticRetrievalOutput {
  /** Per-step records, in execution order. */
  steps: RetrievalStep[];
  /** Reasoning trace: model text paired with the tool calls it made. */
  reasoning: Array<{
    text: string;
    tools: { name: string; id: string; input: any; output: any }[];
  }>;
  // NOTE(review): presumably the union of all steps' chunks — confirm whether
  // deduplication happens before this is populated.
  chunks: ChunkResult[];
  /** Raw usage records as reported by the model provider. */
  usage: any[];
  /** Total tokens consumed across the run. */
  totalTokens: number;
  /** Path to the stored trajectory log, when one was written. */
  trajectoryFile?: string;
}
|
package/ee/chunking/markdown.ts
CHANGED
|
@@ -516,7 +516,9 @@ export class MarkdownChunker {
|
|
|
516
516
|
return newHeaders;
|
|
517
517
|
}
|
|
518
518
|
|
|
519
|
-
public async chunk(text: string, chunkSize: number, prefix?: string
|
|
519
|
+
public async chunk(text: string, chunkSize: number, prefix?: string, config?: {
|
|
520
|
+
pageBreakTags?: boolean;
|
|
521
|
+
}): Promise<{
|
|
520
522
|
text: string;
|
|
521
523
|
page: number;
|
|
522
524
|
}[]> {
|
|
@@ -684,7 +686,7 @@ export class MarkdownChunker {
|
|
|
684
686
|
finalText = headerPrefixText + '\n\n' + currentSlice;
|
|
685
687
|
}
|
|
686
688
|
|
|
687
|
-
if (currentPage) {
|
|
689
|
+
if (currentPage && config?.pageBreakTags) {
|
|
688
690
|
finalText = `<!-- Current page: ${currentPage} -->\n\n` + finalText;
|
|
689
691
|
}
|
|
690
692
|
|
package/ee/workers.ts
CHANGED
|
@@ -1311,7 +1311,7 @@ const pollJobResult = async ({
|
|
|
1311
1311
|
break;
|
|
1312
1312
|
}
|
|
1313
1313
|
// Wait for 2 seconds before polling again
|
|
1314
|
-
await new Promise((resolve) => setTimeout((
|
|
1314
|
+
await new Promise((resolve) => setTimeout(() => resolve(true), 2000));
|
|
1315
1315
|
}
|
|
1316
1316
|
return result;
|
|
1317
1317
|
};
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@exulu/backend",
|
|
3
3
|
"author": "Qventu Bv.",
|
|
4
|
-
"version": "1.
|
|
4
|
+
"version": "1.54.0",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"private": false,
|
|
7
7
|
"publishConfig": {
|
|
@@ -87,6 +87,7 @@
|
|
|
87
87
|
},
|
|
88
88
|
"dependencies": {
|
|
89
89
|
"@ai-sdk/anthropic": "^3.0.23",
|
|
90
|
+
"@ai-sdk/azure": "^3.0.53",
|
|
90
91
|
"@ai-sdk/cerebras": "^2.0.29",
|
|
91
92
|
"@ai-sdk/google-vertex": "^4.0.28",
|
|
92
93
|
"@ai-sdk/openai": "^3.0.18",
|
|
@@ -111,6 +112,7 @@
|
|
|
111
112
|
"@opentelemetry/winston-transport": "^0.14.1",
|
|
112
113
|
"@perplexity-ai/perplexity_ai": "^0.25.0",
|
|
113
114
|
"ai": "^6.0.49",
|
|
115
|
+
"bash-tool": "^1.3.16",
|
|
114
116
|
"bcryptjs": "^3.0.2",
|
|
115
117
|
"body-parser": "^2.2.0",
|
|
116
118
|
"bullmq": "^5.48.1",
|
|
@@ -132,12 +134,13 @@
|
|
|
132
134
|
"jose": "^6.0.10",
|
|
133
135
|
"json-schema-to-zod": "^2.6.1",
|
|
134
136
|
"jsonwebtoken": "^9.0.2",
|
|
137
|
+
"just-bash": "^2.14.0",
|
|
135
138
|
"knex": "^3.1.0",
|
|
136
139
|
"link": "^2.1.1",
|
|
137
140
|
"mammoth": "^1.11.0",
|
|
138
141
|
"natural": "^8.1.0",
|
|
139
142
|
"officeparser": "^5.2.2",
|
|
140
|
-
"openai": "^
|
|
143
|
+
"openai": "^6.34.0",
|
|
141
144
|
"p-limit": "^7.3.0",
|
|
142
145
|
"papaparse": "^5.5.2",
|
|
143
146
|
"pg": "^8.16.3",
|
|
@@ -150,7 +153,7 @@
|
|
|
150
153
|
"wink-nlp": "^2.4.0",
|
|
151
154
|
"winston": "^3.17.0",
|
|
152
155
|
"word-extractor": "^1.0.4",
|
|
153
|
-
"zod": "^3.
|
|
156
|
+
"zod": "^4.3.6",
|
|
154
157
|
"zod-from-json-schema": "^0.5.2",
|
|
155
158
|
"zod-to-json-schema": "^3.25.1",
|
|
156
159
|
"zodex": "^0.18.2"
|