npm - @exulu/backend - Versions diffs - 1.54.0 → 1.56.0 - Mend

@exulu/backend 1.54.0 → 1.56.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/dist/index.cjs +2275 -1330
package/dist/index.d.cts +8 -30
package/dist/index.d.ts +8 -30
package/dist/index.js +2256 -1306
package/ee/agentic-retrieval/v3/agent-loop.ts +49 -3
package/ee/agentic-retrieval/v3/classifier.ts +61 -42
package/ee/agentic-retrieval/v3/context-sampler.ts +10 -1
package/ee/agentic-retrieval/v3/index.ts +211 -35
package/ee/agentic-retrieval/v3/session-tools-registry.ts +20 -0
package/ee/agentic-retrieval/v3/strategies.ts +28 -24
package/ee/agentic-retrieval/v3/tools.ts +236 -113
package/ee/agentic-retrieval/v3/trajectory.ts +227 -14
package/ee/agentic-retrieval/v4/agent-loop.ts +142 -55
package/ee/agentic-retrieval/v4/context-sampler.ts +79 -0
package/ee/agentic-retrieval/v4/index.ts +673 -164
package/ee/agentic-retrieval/v4/types.ts +33 -4
package/ee/invoke-skills/create-sandbox.ts +119 -0
package/ee/python/documents/processing/doc_processor.ts +106 -14
package/package.json +4 -2
package/ee/agentic-retrieval/ANALYSIS.md +0 -658
package/ee/agentic-retrieval/index.ts +0 -1109
package/ee/agentic-retrieval/logs/README.md +0 -198
package/ee/agentic-retrieval/v2.ts +0 -1628
package/ee/agentic-retrieval/v4/embed-preprocessor.ts +0 -76
package/ee/agentic-retrieval/v4/system-prompt.ts +0 -248
package/ee/agentic-retrieval/v4/tools.ts +0 -241

package/ee/agentic-retrieval/v4/embed-preprocessor.ts DELETED Viewed

@@ -1,76 +0,0 @@
-import type { ExuluContext } from "@SRC/exulu/context";
-import type { User } from "@EXULU_TYPES/models/user";
/**
 * Expands `embed('text')` / `embed('text', 'contextId')` calls inside a SQL
 * string into `ARRAY[...]::vector` literals so the statement can be executed
 * as raw SQL.
 *
 * Resolution rules:
 *   embed('machine learning')         → first context that has an embedder
 *   embed('machine learning', 'ctx1') → the embedder of context `ctx1`
 *
 * The text argument may contain SQL-style doubled quotes (`'it''s'`) or
 * backslash-escaped quotes (`'it\'s'`); both reach the embedder as a plain `'`.
 * Identical (context, text) pairs are embedded only once per call.
 *
 * @param sql      SQL text that may contain embed() calls.
 * @param contexts Contexts whose embedders may be used.
 * @param user     Optional user; only its `id` is forwarded to the embedder.
 * @param role     Optional role forwarded to the embedder.
 * @returns The SQL with every embed() call replaced by a vector literal, or
 *          the input unchanged when it contains no embed() call.
 * @throws Error when no embedder can be resolved for a call, or when the
 *         embedder returns an empty or non-numeric vector.
 */
export async function preprocessEmbedCalls(
  sql: string,
  contexts: ExuluContext[],
  user?: User,
  role?: string,
): Promise<string> {
  // `\b` keeps identifiers that merely end in "embed" (e.g. my_embed) intact.
  // The text group accepts backslash escapes and SQL doubled quotes.
  const EMBED_RE =
    /\bembed\(\s*'((?:[^'\\]|\\.|'')*)'\s*(?:,\s*'((?:[^'\\]|\\.)*)')?\s*\)/gi;

  const calls = [...sql.matchAll(EMBED_RE)];
  if (calls.length === 0) return sql;

  // Single left-to-right pass so an escaped backslash (`\\`) is consumed as a
  // pair before a following quote is examined.
  const unescapeQuotes = (raw: string): string =>
    raw.replace(/\\([\s\S])|''/g, (token: string, escaped?: string) =>
      token === "''" || escaped === "'" ? "'" : token,
    );

  const buildVectorLiteral = async (text: string, contextId?: string): Promise<string> => {
    const context = contextId
      ? contexts.find((c) => c.id === contextId)
      : contexts.find((c) => c.embedder != null);
    if (!context?.embedder) {
      throw new Error(
        `No embedder available${contextId ? ` for context "${contextId}"` : ""}. ` +
          `Available contexts with embedders: [${contexts.filter((c) => c.embedder).map((c) => c.id).join(", ")}]`,
      );
    }
    const result = await context.embedder.generateFromQuery(
      context.id,
      text,
      undefined,
      (user as any)?.id,
      role,
    );
    const vector = result?.chunks?.[0]?.vector;
    if (!vector?.length) {
      throw new Error(`Embedder returned no vector for text: "${text}"`);
    }
    // The components are spliced into raw SQL, so refuse anything non-numeric.
    if (!vector.every((component: unknown) => Number.isFinite(Number(component)))) {
      throw new Error(`Embedder returned a non-numeric vector for text: "${text}"`);
    }
    return `ARRAY[${vector.join(",")}]::vector`;
  };

  // Cache in-flight requests so repeated embed() calls share one embedding.
  const inFlight = new Map<string, Promise<string>>();
  const literalFor = (text: string, contextId?: string): Promise<string> => {
    const key = JSON.stringify([contextId ?? null, text]);
    let literal = inFlight.get(key);
    if (!literal) {
      literal = buildVectorLiteral(text, contextId);
      inFlight.set(key, literal);
    }
    return literal;
  };

  // Generate all embeddings in parallel.
  const literals = await Promise.all(
    calls.map((call) => literalFor(unescapeQuotes(call[1] ?? ""), call[2] || undefined)),
  );

  // Stitch the output together front to back using the original match offsets.
  let output = "";
  let cursor = 0;
  calls.forEach((call, i) => {
    const start = call.index ?? 0;
    output += sql.slice(cursor, start) + literals[i];
    cursor = start + call[0].length;
  });
  return output + sql.slice(cursor);
}

package/ee/agentic-retrieval/v4/system-prompt.ts DELETED Viewed

@@ -1,248 +0,0 @@
-import { getTableName, getChunksTableName, type ExuluContext } from "@SRC/exulu/context";
/**
 * Builds the system prompt for the V4 observe-infer-act retrieval agent.
 *
 * The prompt contains, in order:
 *  1. The observe-infer-act loop philosophy
 *  2. The database schema of every supplied context (via buildSchemaBlock)
 *  3. SQL query patterns: keyword, semantic and hybrid (only when at least one
 *     context has an embedder), document browsing, aggregation, metadata filters
 *  4. The column alias convention the agent should follow
 *  5. Instructions on using grep for large result sets, and a search strategy
 *  6. Optional caller-supplied instructions
 *
 * The SQL examples are guidance text for the model and are never executed here.
 * Compared with the earlier wording they (a) order the hybrid-search CTEs
 * before applying LIMIT, because LIMIT without ORDER BY keeps an arbitrary
 * subset of rows, (b) count items with the same archived filter used by every
 * other pattern, and (c) pass the context id to embed() so the query vector
 * comes from the embedder of the context being searched.
 *
 * @param contexts           Contexts the agent may query.
 * @param customInstructions Optional text appended under "Additional Instructions".
 * @returns The complete system prompt.
 */
export function buildSystemPrompt(
  contexts: ExuluContext[],
  customInstructions?: string,
): string {
  const schemaBlock = buildSchemaBlock(contexts);
  const hasEmbedder = contexts.some((c) => c.embedder != null);
  return `\
You are a knowledge base retrieval agent. Your job is to find all information relevant to the user's query.
## Approach: Observe → Infer → Act
Work iteratively:
1. **Observe** — examine what data you have and what the query asks for
2. **Infer** — decide what SQL query will best surface relevant information
3. **Act** — execute the query and study the results
4. Repeat until you have found sufficient information, then write your final answer.
Do NOT guess or hallucinate. If results are empty, try alternative queries (different keywords,
broader filters, semantic search). Exhaust the available search strategies before concluding
that no relevant data exists.
---
## Database Schema
${schemaBlock}
---
## Query Patterns
### Keyword / Full-Text Search
\`\`\`sql
SELECT
  c.id          AS chunk_id,
  c.chunk_index,
  c.content     AS chunk_content,
  c.metadata,
  c.source      AS item_id,
  i.name        AS item_name,
  '<context_id>' AS context
FROM <context_id>_chunks c
JOIN <context_id>_items i ON c.source = i.id
WHERE c.fts @@ plainto_tsquery('english', 'your search terms')
  AND (i.archived IS FALSE OR i.archived IS NULL)
ORDER BY ts_rank(c.fts, plainto_tsquery('english', 'your search terms')) DESC
LIMIT 20;
\`\`\`
For German text use \`'german'\` instead of \`'english'\`.
For multi-language, use \`websearch_to_tsquery\` or UNION both languages.
${
  hasEmbedder
    ? `
### Semantic Search (use embed() helper)
\`\`\`sql
SELECT
  c.id          AS chunk_id,
  c.chunk_index,
  c.content     AS chunk_content,
  c.metadata,
  c.source      AS item_id,
  i.name        AS item_name,
  '<context_id>' AS context,
  c.embedding <=> embed('your concept here', '<context_id>') AS distance
FROM <context_id>_chunks c
JOIN <context_id>_items i ON c.source = i.id
WHERE (i.archived IS FALSE OR i.archived IS NULL)
ORDER BY distance ASC
LIMIT 20;
\`\`\`
Pass the context id as the second argument so the vector is generated by that context's own embedder.
\`embed('text')\` without a context id uses the first context that has an embedder.
### Hybrid Search (keyword + semantic combined via RRF)
\`\`\`sql
WITH fts AS (
  SELECT id, ROW_NUMBER() OVER (ORDER BY ts_rank(fts, q) DESC) AS rank
  FROM <context_id>_chunks, plainto_tsquery('english', 'your query') q
  WHERE fts @@ q
  ORDER BY rank
  LIMIT 500
),
sem AS (
  SELECT id, ROW_NUMBER() OVER (ORDER BY embedding <=> embed('your query', '<context_id>') ASC) AS rank
  FROM <context_id>_chunks
  ORDER BY rank
  LIMIT 500
),
rrf AS (
  SELECT
    COALESCE(fts.id, sem.id) AS id,
    (COALESCE(1.0 / (50 + fts.rank), 0) * 2 + COALESCE(1.0 / (50 + sem.rank), 0)) AS score
  FROM fts FULL OUTER JOIN sem ON fts.id = sem.id
)
SELECT
  c.id          AS chunk_id,
  c.chunk_index,
  c.content     AS chunk_content,
  c.metadata,
  c.source      AS item_id,
  i.name        AS item_name,
  '<context_id>' AS context,
  rrf.score
FROM rrf
JOIN <context_id>_chunks c ON c.id = rrf.id
JOIN <context_id>_items i ON c.source = i.id
WHERE (i.archived IS FALSE OR i.archived IS NULL)
ORDER BY rrf.score DESC
LIMIT 20;
\`\`\`
`
    : `
Note: No embedder is configured for these contexts. Use keyword/full-text search only.
`
}
### Browse all chunks of a specific document (in order)
\`\`\`sql
SELECT
  c.id          AS chunk_id,
  c.chunk_index,
  c.content     AS chunk_content,
  c.metadata,
  c.source      AS item_id,
  i.name        AS item_name,
  '<context_id>' AS context
FROM <context_id>_chunks c
JOIN <context_id>_items i ON c.source = i.id
WHERE c.source = '<item_id>'
ORDER BY c.chunk_index;
\`\`\`
### Count / aggregate
\`\`\`sql
SELECT COUNT(*) FROM <context_id>_items WHERE (archived IS FALSE OR archived IS NULL);
SELECT COUNT(*) FROM <context_id>_chunks;
\`\`\`
### Explore item names (when query is about a specific document)
\`\`\`sql
SELECT id, name, external_id, "createdAt"
FROM <context_id>_items
WHERE (archived IS FALSE OR archived IS NULL)
  AND LOWER(name) LIKE '%keyword%'
LIMIT 50;
\`\`\`
### Filter by custom metadata on chunks
\`\`\`sql
SELECT chunk_id, chunk_content, item_name, context
FROM ...
WHERE c.metadata->>'page' = '5'
   OR c.metadata @> '{"category": "finance"}'
\`\`\`
---
## Column Alias Convention
**Always use these aliases** in queries that return chunks so results are collected correctly:
| Alias          | Source column           |
|----------------|-------------------------|
| \`chunk_id\`     | \`c.id\`                  |
| \`chunk_index\`  | \`c.chunk_index\`         |
| \`chunk_content\`| \`c.content\`             |
| \`item_id\`      | \`c.source\`              |
| \`item_name\`    | \`i.name\`                |
| \`context\`      | literal context id string |
| \`metadata\`     | \`c.metadata\`            |
---
## Handling Large Results
When execute_query returns a file path (results > 20k chars):
1. Use \`grep\` with a specific pattern to find relevant sections
2. Multiple grep calls are fine — narrow down iteratively
3. Once you know specific \`item_id\` or \`chunk_id\` values, run a targeted SELECT to get full content
---
## Search Strategy
- **Start broad**: use keyword or hybrid search with your main terms, LIMIT 30–50
- **Go deeper**: if results are sparse, try alternative phrasings, synonyms, or semantic search
- **Drill into documents**: once you find a relevant item, fetch its chunks in order to get full context
- **Cross-context**: search multiple contexts when the query could span knowledge bases
- **Aggregate last**: use COUNT queries only for "how many" questions
---
${customInstructions ? `## Additional Instructions\n\n${customInstructions}\n\n---\n` : ""}
When you have gathered sufficient information, write a clear answer. Do not call any more tools once you have what you need.`;
}
/**
 * Renders one markdown schema section per context and joins the sections with
 * a blank line.
 *
 * Each section lists the context's name, id, description and embedder status,
 * followed by the column layout of its items table (including the context's
 * custom fields) and of its chunks table.
 *
 * @param contexts Contexts to describe.
 * @returns The combined schema text; an empty string for an empty list.
 */
function buildSchemaBlock(contexts: ExuluContext[]): string {
  const sections: string[] = [];

  for (const ctx of contexts) {
    const items = getTableName(ctx.id);
    const chunks = getChunksTableName(ctx.id);

    let fieldLines = "  (no custom fields)";
    if (ctx.fields.length > 0) {
      fieldLines = ctx.fields.map((field) => `  ${field.name} (${field.type})`).join("\n");
    }

    let embedderLine = "No embedder — use keyword search only";
    if (ctx.embedder) {
      embedderLine = `Embedder: ${ctx.embedder.name} — semantic search and embed() are available`;
    }

    const lines = [
      `### Context: "${ctx.name}" (id: \`${ctx.id}\`)`,
      `${ctx.description || ""}`,
      embedderLine,
      `**${items}** — documents / items`,
      `  id           (uuid, primary key)`,
      `  name         (text)`,
      `  external_id  (text, nullable)`,
      `  archived     (boolean, nullable)`,
      `  created_by   (integer, nullable)`,
      `  rights_mode  (text, nullable)`,
      `  "createdAt"  (timestamp)`,
      `  "updatedAt"  (timestamp)`,
      `  -- Custom fields:`,
      fieldLines,
      `**${chunks}** — text chunks (source FK → ${items}.id)`,
      `  id           (uuid, primary key)`,
      `  source       (uuid, FK → ${items}.id)`,
      `  content      (text)`,
      `  chunk_index  (integer)`,
      `  fts          (tsvector — full-text search index)`,
      `  embedding    (vector — pgvector, nullable)`,
      `  metadata     (jsonb, nullable)`,
      `  "createdAt"  (timestamp)`,
      `  "updatedAt"  (timestamp)`,
    ];
    sections.push(lines.join("\n"));
  }

  return sections.join("\n\n");
}

package/ee/agentic-retrieval/v4/tools.ts DELETED Viewed

@@ -1,241 +0,0 @@
-import * as fs from "fs/promises";
-import * as path from "path";
-import { exec } from "child_process";
-import { promisify } from "util";
-import { z } from "zod";
-import { tool } from "ai";
-import { postgresClient } from "@SRC/postgres/client";
-import type { ExuluContext } from "@SRC/exulu/context";
-import type { User } from "@EXULU_TYPES/models/user";
-import { preprocessEmbedCalls } from "./embed-preprocessor";
-import type { ChunkResult } from "./types";
/** Promise-returning form of `child_process.exec`, used to run grep. */
const execAsync = promisify(exec);

/** Largest JSON result (in characters) returned inline; larger results are written to a file. */
const MAX_INLINE_CHARS = 20 * 1_000;

/** Largest grep output (in characters) returned before it is truncated. */
const MAX_GREP_OUTPUT_CHARS = 5 * 1_000;
-// ──────────────────────────────────────────────────────────────────────────────
-// SQL safety: only allow read-only statements
-// ──────────────────────────────────────────────────────────────────────────────
/**
 * Matches everything that must be ignored when scanning SQL for keywords:
 * line comments, block comments, dollar-quoted strings, E'' escape strings,
 * ordinary string literals and double-quoted identifiers.
 *
 * Ordinary literals deliberately treat a backslash as a plain character (the
 * standard-conforming behaviour); only E'' strings honour backslash escapes.
 * Mis-tokenised input therefore errs on the side of exposing more text to the
 * keyword scan, never less.
 */
const SQL_NOISE_PATTERN =
  /--[^\n]*|\/\*[\s\S]*?\*\/|\$([A-Za-z_]\w*)?\$[\s\S]*?\$\1\$|\b[eE]'(?:[^'\\]|\\[\s\S]|'')*'|'(?:[^']|'')*'|"(?:[^"]|"")*"/g;

/** Statement openers accepted as read-only (optionally wrapped in parentheses). */
const READ_START_PATTERN = /^[\s(]*(SELECT|WITH|VALUES|TABLE|EXPLAIN|SHOW)\b/i;

/**
 * Keywords that indicate a write when they appear anywhere in the statement,
 * e.g. inside a data-modifying CTE, as `SELECT ... INTO`, or as
 * `EXPLAIN (ANALYZE)`.
 */
const WRITE_PATTERN =
  /\b(INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|GRANT|REVOKE|VACUUM|ANALYZE|ANALYSE|MERGE|INTO)\b/i;

/**
 * Rejects SQL that is not a single read-only statement.
 *
 * Comments, string literals and quoted identifiers are blanked out first so
 * that search terms such as '%delete%' or columns such as "updatedAt" do not
 * trigger a rejection. The remaining text must start with a read-only
 * statement keyword, must not contain a second statement, and must not contain
 * a write keyword anywhere.
 *
 * This is a textual guard, not a SQL parser: it cannot detect functions with
 * side effects. NOTE(review): pairing it with a read-only database role or a
 * READ ONLY transaction would be the robust defence.
 *
 * @param sql The SQL text proposed for execution.
 * @throws Error when the text is not recognised as a single read-only statement.
 */
function assertReadOnly(sql: string): void {
  const code = sql
    .replace(SQL_NOISE_PATTERN, " ")
    // A trailing terminator is harmless; any other `;` means stacked statements.
    .replace(/[\s;]+$/, "");

  const isReadOnly =
    READ_START_PATTERN.test(code) && !code.includes(";") && !WRITE_PATTERN.test(code);

  if (!isReadOnly) {
    throw new Error(
      "Only SELECT queries are allowed. Write operations (INSERT, UPDATE, DELETE, DROP, etc.) are not permitted.",
    );
  }
}
-// ──────────────────────────────────────────────────────────────────────────────
-// Chunk harvesting: extract ChunkResult objects from raw SQL result rows
-// ──────────────────────────────────────────────────────────────────────────────
/**
 * Converts a raw database row into a ChunkResult when the row looks like a chunk.
 *
 * The standard aliases (`chunk_id`, `chunk_content`, `item_id`, `context`,
 * `item_name`) are preferred; the plain column names (`id`, `content`,
 * `source`, `context_id`, `name`) are used as fallbacks.
 *
 * @param row A row object as returned by a query.
 * @returns The mapped chunk, or `null` when the row has no chunk identifier or
 *          has neither content nor an item reference.
 */
export function rowToChunkResult(row: Record<string, any>): ChunkResult | null {
  // First non-nullish value among the given columns, otherwise undefined.
  const firstPresent = (...columns: string[]): any => {
    for (const column of columns) {
      const value = row[column];
      if (value !== null && value !== undefined) return value;
    }
    return undefined;
  };

  const id = firstPresent("chunk_id", "id");
  const content = firstPresent("chunk_content", "content");
  const item = firstPresent("item_id", "source");
  const contextId = firstPresent("context", "context_id");
  const name = firstPresent("item_name", "name");

  // A usable row needs a chunk identifier plus content or an item reference.
  const usable = id && (content || item);
  if (!usable) {
    return null;
  }

  return {
    item_name: name ?? "",
    item_id: item ?? "",
    context: contextId ?? "",
    chunk_id: id,
    chunk_index: firstPresent("chunk_index"),
    chunk_content: content,
    metadata: firstPresent("metadata", "chunk_metadata"),
  };
}
-// ──────────────────────────────────────────────────────────────────────────────
-// Tool factory
-// ──────────────────────────────────────────────────────────────────────────────
/** Inputs accepted by `createTools`. */
export type ToolFactoryParams = {
  /** Contexts handed to the embed() preprocessor when a query is executed. */
  contexts: Array<ExuluContext>;
  /** Optional user forwarded to the embed() preprocessor. */
  user?: User;
  /** Optional role forwarded to the embed() preprocessor. */
  role?: string;
  /** Directory where oversized query results are stored and where grep may read. */
  sessionDir: string;
};
/**
 * Creates the two tools available to the retrieval agent:
 *
 * - `execute_query` runs a read-only SQL statement (after expanding embed()
 *   calls). Results up to MAX_INLINE_CHARS characters are returned inline as
 *   JSON; larger results are written to `<sessionDir>/query_<n>.json` and the
 *   path is returned instead.
 * - `grep` searches one of those stored files.
 *
 * Recoverable failures are returned as a JSON string with an `error` field so
 * the agent can react to them; a statement rejected by `assertReadOnly` still
 * throws.
 *
 * @param params Contexts, optional user/role, and the session directory.
 * @returns An object with the `execute_query` and `grep` tools.
 */
// eslint-disable-next-line @typescript-eslint/explicit-function-return-type
export function createTools(params: ToolFactoryParams) {
  const { contexts, user, role, sessionDir } = params;
  let queryCount = 0;

  /** Best-effort message extraction for values caught as `unknown`. */
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  /** Wraps a value in single quotes for a POSIX shell, escaping embedded quotes. */
  const shellQuote = (value: string): string => `'${value.replace(/'/g, `'\\''`)}'`;

  /**
   * True when `candidate` lies strictly inside the session directory. A plain
   * string-prefix test would also accept siblings such as `<sessionDir>-other`.
   */
  const isInsideSession = (candidate: string): boolean => {
    const relative = path.relative(path.resolve(sessionDir), candidate);
    return (
      relative !== "" &&
      relative !== ".." &&
      !relative.startsWith(`..${path.sep}`) &&
      !path.isAbsolute(relative)
    );
  };

  // ── execute_query ────────────────────────────────────────────────────────────
  const execute_query = tool({
    description: `Execute a read-only PostgreSQL SELECT query against the knowledge base.
Use this to search, filter, aggregate, and explore content. The database contains items
and chunks tables for each knowledge base (see schema in the system prompt).
Use embed('your text') anywhere in the query to generate a semantic search vector:
  embedding <=> embed('machine learning') AS distance
If the result exceeds ${(MAX_INLINE_CHARS / 1000).toFixed(0)}k characters it is saved to a file.
Use the grep tool to iteratively search the file for relevant information.`,
    inputSchema: z.object({
      sql: z.string().describe("A read-only SELECT (or WITH ... SELECT) PostgreSQL query"),
    }),
    execute: async ({ sql }) => {
      assertReadOnly(sql);

      let processedSql: string;
      try {
        processedSql = await preprocessEmbedCalls(sql, contexts, user, role);
      } catch (err: unknown) {
        return JSON.stringify({ error: `embed() preprocessing failed: ${describeError(err)}` });
      }

      // NOTE(review): the statement is model-authored; running it under a
      // read-only role or transaction would back up the textual check above.
      let rows: any[];
      try {
        const { db } = await postgresClient();
        const result = await db.raw(processedSql);
        rows = result.rows ?? [];
      } catch (err: unknown) {
        return JSON.stringify({ error: `Query failed: ${describeError(err)}` });
      }

      const json = JSON.stringify(rows, null, 2);
      if (json.length <= MAX_INLINE_CHARS) {
        return json;
      }

      // Results are large — store to session dir and tell the agent to grep.
      const filePath = path.join(sessionDir, `query_${++queryCount}.json`);
      try {
        await fs.mkdir(sessionDir, { recursive: true });
        await fs.writeFile(filePath, json, "utf-8");
      } catch (err: unknown) {
        return JSON.stringify({
          error: `Could not store large result: ${describeError(err)}`,
        });
      }
      return JSON.stringify({
        stored: true,
        file: filePath,
        row_count: rows.length,
        message: `Results too large to display (${rows.length} rows, ${(json.length / 1000).toFixed(1)}k chars). Stored at ${filePath}. Use the grep tool to search for relevant information.`,
        grep_hint: `grep -i "keyword" ${filePath}`,
      });
    },
  });

  // ── grep ─────────────────────────────────────────────────────────────────────
  const grep = tool({
    description: `Search a stored query result file using grep.
Use this after execute_query returns a file path because results were too large.
Iteratively narrow down the results with multiple grep calls.`,
    inputSchema: z.object({
      pattern: z.string().describe("Regular expression or literal string to search for"),
      file: z.string().describe("Absolute path to the file returned by execute_query"),
      context_lines: z
        .number()
        .int()
        .min(0)
        .max(10)
        .default(2)
        .describe("Number of lines of context to show around each match (default 2)"),
      case_insensitive: z
        .boolean()
        .default(true)
        .describe("Case-insensitive matching (default true)"),
    }),
    execute: async ({ pattern, file, context_lines, case_insensitive }) => {
      // Security: only allow reading from our session directory.
      const resolvedFile = path.resolve(file);
      if (!isInsideSession(resolvedFile)) {
        return JSON.stringify({
          error: `Access denied. Only files within the session directory (${sessionDir}) can be searched.`,
        });
      }

      // Verify file exists.
      try {
        await fs.access(resolvedFile);
      } catch {
        return JSON.stringify({ error: `File not found: ${file}` });
      }

      const flags = [
        "-n",
        context_lines > 0 ? `-C${context_lines}` : "",
        case_insensitive ? "-i" : "",
      ]
        .filter(Boolean)
        .join(" ");

      // Both operands are shell-quoted. `-e` stops a pattern that starts with
      // "-" from being parsed as an option, and `--` does the same for the path.
      const cmd = `grep ${flags} -e ${shellQuote(pattern)} -- ${shellQuote(resolvedFile)}`;

      let output: string;
      try {
        const { stdout } = await execAsync(cmd, { maxBuffer: 10 * 1024 * 1024 });
        output = stdout;
      } catch (err: any) {
        // grep exits with code 1 when no matches — that's not an error.
        if (err?.code === 1) {
          return JSON.stringify({ matches: 0, output: "No matches found." });
        }
        return JSON.stringify({ error: `grep failed: ${describeError(err)}` });
      }

      // With -n, matching lines are prefixed "N:" while context lines use "N-"
      // and groups are separated by "--". Count only the matching lines, and do
      // it before truncation so the number reflects the whole file.
      const matchCount = output.split("\n").filter((line) => /^\d+:/.test(line)).length;

      if (output.length > MAX_GREP_OUTPUT_CHARS) {
        output =
          output.slice(0, MAX_GREP_OUTPUT_CHARS) +
          `\n... (output truncated at ${MAX_GREP_OUTPUT_CHARS} chars — refine your pattern to narrow results)`;
      }
      return JSON.stringify({ matches: matchCount, output });
    },
  });

  return { execute_query, grep };
}
/**
 * Collects ChunkResult objects from the tool results of one agent step.
 *
 * Each result's `output` (or, failing that, `result`) is JSON-parsed when it is
 * a string. Only outputs that are arrays of row objects are harvested; anything
 * else (error objects, "stored to file" notices, unparsable text, missing
 * entries) is skipped silently.
 *
 * @param toolResults Tool results of a step; may be null or undefined.
 * @returns Every row that `rowToChunkResult` recognises as a chunk, in order.
 */
export function harvestChunks(toolResults: any[]): ChunkResult[] {
  const chunks: ChunkResult[] = [];
  for (const result of toolResults ?? []) {
    // Optional chaining: a null/undefined entry must not abort the harvest.
    const rawOutput: unknown = result?.output ?? result?.result;
    let parsed: unknown;
    try {
      parsed = typeof rawOutput === "string" ? JSON.parse(rawOutput) : rawOutput;
    } catch {
      continue;
    }
    // Only a direct SELECT result (an array of rows) carries chunks.
    if (!Array.isArray(parsed)) continue;
    for (const row of parsed) {
      if (row && typeof row === "object") {
        const chunk = rowToChunkResult(row);
        if (chunk) chunks.push(chunk);
      }
    }
  }
  return chunks;
}
-}