npm - @exulu/backend - Versions diffs - 1.49.2 → 1.51.0 - Mend

@exulu/backend 1.49.2 → 1.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/bin/setup-python.cjs +140 -0
package/dist/index.cjs +561 -119
package/dist/index.d.cts +16 -3
package/dist/index.d.ts +16 -3
package/dist/index.js +564 -122
package/ee/chunking/markdown.ts +83 -5
package/ee/python/documents/processing/doc_processor.ts +380 -84
package/ee/workers.ts +214 -18
package/package.json +8 -1

package/dist/index.js CHANGED Viewed

@@ -147,17 +147,19 @@ async function postgresClient() {
           // 30 minutes
         },
         pool: {
-          min: 5,
-          // Increased from 2 to ensure enough connections available
-          max: 50,
-          // Increased from 20 to handle more concurrent operations with processor jobs
-          acquireTimeoutMillis: 6e4,
-          // Increased from 30s to 60s to handle pool contention
+          min: 10,
+          // Minimum connections always ready
+          max: 300,
+          // Increased to support high worker concurrency (250+ concurrent jobs)
+          acquireTimeoutMillis: 12e4,
+          // 2 minutes - increased to handle high contention during bursts
           createTimeoutMillis: 3e4,
           idleTimeoutMillis: 6e4,
-          // Increased to keep connections alive longer
+          // Keep connections alive for reuse
           reapIntervalMillis: 1e3,
           createRetryIntervalMillis: 200,
+          // Keep propagateCreateError disabled so a failed connection attempt is retried until acquireTimeoutMillis elapses
+          propagateCreateError: false,
           // Log pool events to help debug connection issues
           afterCreate: (conn, done) => {
             console.log("[EXULU] New database connection created");
@@ -3577,7 +3579,7 @@ var convertExuluToolsToAiSdkTools = async (currentTools, approvedTools, allExulu
           description,
           // The approvedTools array uses the tool.name lookup as the frontend
           // Vercel AI SDK uses the sanitized tool name as the key, so this matches.
-          needsApproval: approvedTools?.includes("tool-" + cur.name) ? false : true,
+          needsApproval: approvedTools?.includes("tool-" + cur.name) || !cur.needsApproval ? false : true,
           // todo make configurable
           async *execute(inputs, options) {
             console.log(
@@ -3727,6 +3729,7 @@ var ExuluTool = class {
   inputSchema;
   type;
   tool;
+  needsApproval;
   config;
   constructor({
     id,
@@ -3736,10 +3739,12 @@ var ExuluTool = class {
     inputSchema,
     type,
     execute: execute2,
-    config
+    config,
+    needsApproval
   }) {
     this.id = id;
     this.config = config;
+    this.needsApproval = needsApproval ?? true;
     this.category = category || "default";
     this.name = name;
     this.description = description;
@@ -4146,11 +4151,30 @@ var ExuluContext2 = class {
     );
     await db2.from(getChunksTableName(this.id)).where({ source }).delete();
     if (chunks?.length) {
+      const sanitizeString = (str) => {
+        if (!str) return "";
+        return str.replace(/\0/g, "");
+      };
+      const sanitizeMetadata2 = (metadata) => {
+        if (!metadata) return {};
+        const sanitized = {};
+        for (const [key, value] of Object.entries(metadata)) {
+          if (typeof value === "string") {
+            sanitized[key] = sanitizeString(value);
+          } else {
+            sanitized[key] = value;
+          }
+        }
+        return sanitized;
+      };
       await db2.from(getChunksTableName(this.id)).insert(
         chunks.map((chunk) => ({
-          source,
-          metadata: chunk.metadata,
-          content: chunk.content,
+          // Sanitize source to remove null bytes
+          source: sanitizeString(source),
+          // Sanitize metadata to remove null bytes from string values
+          metadata: sanitizeMetadata2(chunk.metadata),
+          // Remove null bytes (0x00) which are invalid in PostgreSQL UTF8 encoding
+          content: sanitizeString(chunk.content),
           chunk_index: chunk.index,
           embedding: pgvector2.toSql(chunk.vector)
         }))
@@ -4539,6 +4563,8 @@ var ExuluContext2 = class {
       name: `${this.name}_context_search`,
       type: "context",
       category: "contexts",
+      needsApproval: true,
+      // todo make configurable
       inputSchema: z4.object({
         query: z4.string().describe("The original question that the user asked"),
         keywords: z4.array(z4.string()).describe(
@@ -5732,7 +5758,7 @@ var finalizeRequestedFields = async ({
           return result;
         }
         const { db: db2 } = await postgresClient();
-        const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt");
+        const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt", "metadata");
         const chunks = await query;
         result.chunks = chunks.map((chunk) => ({
           chunk_content: chunk.content,
@@ -5745,7 +5771,8 @@ var finalizeRequestedFields = async ({
           item_created_at: chunk.item_created_at,
           item_id: chunk.item_id,
           item_external_id: chunk.item_external_id,
-          item_name: chunk.item_name
+          item_name: chunk.item_name,
+          chunk_metadata: chunk.metadata
         }));
       }
     }
@@ -7119,6 +7146,36 @@ import "ai";
 import CryptoJS4 from "crypto-js";
 var redisConnection;
 var unhandledRejectionHandlerInstalled = false;
+var poolMonitoringInterval;
+var startPoolMonitoring = () => {
+  if (poolMonitoringInterval) return;
+  poolMonitoringInterval = setInterval(async () => {
+    try {
+      const { db: db2 } = await postgresClient();
+      const poolStats = db2.client.pool;
+      if (poolStats) {
+        const used = poolStats.numUsed?.() || 0;
+        const free = poolStats.numFree?.() || 0;
+        const pending = poolStats.numPendingAcquires?.() || 0;
+        const total = used + free;
+        console.log("[EXULU] Connection pool health check:", {
+          used,
+          free,
+          pending,
+          total,
+          utilization: total > 0 ? `${Math.round(used / total * 100)}%` : "0%"
+        });
+        if (pending > 10) {
+          console.warn(
+            `[EXULU] WARNING: ${pending} jobs waiting for database connections. Consider increasing pool size or reducing worker concurrency.`
+          );
+        }
+      }
+    } catch (error) {
+      console.error("[EXULU] Error checking pool health:", error);
+    }
+  }, 3e4);
+};
 var installGlobalErrorHandlers = () => {
   if (unhandledRejectionHandlerInstalled) return;
   process.on("unhandledRejection", (reason) => {
@@ -7143,6 +7200,7 @@ var installGlobalErrorHandlers = () => {
   unhandledRejectionHandlerInstalled = true;
   console.log("[EXULU] Global error handlers installed to prevent worker crashes");
 };
+var isShuttingDown = false;
 var createWorkers = async (providers, queues2, config, contexts, rerankers, evals, tools, tracer) => {
   console.log("[EXULU] creating workers for " + queues2?.length + " queues.");
   console.log(
@@ -7150,7 +7208,8 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
     queues2.map((q) => q.queue.name)
   );
   installGlobalErrorHandlers();
-  process.setMaxListeners(Math.max(queues2.length * 2 + 5, 15));
+  startPoolMonitoring();
+  process.setMaxListeners(Math.max(15, process.getMaxListeners()));
   if (!redisServer.host || !redisServer.port) {
     console.error(
       "[EXULU] you are trying to start worker, but no redis server is configured in the environment."
@@ -7183,7 +7242,53 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
           status: await bullmqJob.getState(),
           type: bullmqJob.data.type
         });
-        const { db: db2 } = await postgresClient();
+        let progressInterval;
+        if (bullmqJob.data.type === "processor") {
+          progressInterval = setInterval(async () => {
+            try {
+              await bullmqJob.updateProgress({
+                status: "processing",
+                timestamp: (/* @__PURE__ */ new Date()).toISOString()
+              });
+              console.log(`[EXULU] Job ${bullmqJob.id} heartbeat sent to prevent stalling`);
+            } catch (error) {
+              console.error(`[EXULU] Error updating job progress:`, error);
+            }
+          }, 25e3);
+        }
+        let db2;
+        let retries = 3;
+        let lastError;
+        for (let attempt = 1; attempt <= retries; attempt++) {
+          try {
+            const client2 = await postgresClient();
+            db2 = client2.db;
+            const poolStats = db2.client.pool;
+            if (poolStats) {
+              console.log(`[EXULU] Connection pool stats for job ${bullmqJob.id}:`, {
+                size: poolStats.numUsed?.() || 0,
+                available: poolStats.numFree?.() || 0,
+                pending: poolStats.numPendingAcquires?.() || 0
+              });
+            }
+            break;
+          } catch (error) {
+            lastError = error instanceof Error ? error : new Error(String(error));
+            console.error(
+              `[EXULU] Failed to acquire database connection (attempt ${attempt}/${retries}) for job ${bullmqJob.id}:`,
+              lastError.message
+            );
+            if (attempt < retries) {
+              const backoffMs = 500 * Math.pow(2, attempt - 1);
+              await new Promise((resolve3) => setTimeout(resolve3, backoffMs));
+            }
+          }
+        }
+        if (!db2) {
+          throw new Error(
+            `Failed to acquire database connection after ${retries} attempts: ${lastError?.message}`
+          );
+        }
         const data = bullmqJob.data;
         const timeoutInSeconds = data.timeoutInSeconds || queue.timeoutInSeconds || 600;
         const timeoutMs = timeoutInSeconds * 1e3;
@@ -7275,7 +7380,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
               }
               const exuluStorage = new ExuluStorage({ config });
               console.log("[EXULU] POS 2 -- EXULU CONTEXT PROCESS FIELD");
-              const processorResult = await context.processor.execute({
+              let processorResult = await context.processor.execute({
                 item: data.inputs,
                 user: data.user,
                 role: data.role,
@@ -7290,12 +7395,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
                 );
               }
               delete processorResult.field;
+              const updateData = { ...processorResult };
               await db2.from(getTableName(context.id)).where({
                 id: processorResult.id
               }).update({
-                ...processorResult,
+                ...updateData,
                 last_processed_at: (/* @__PURE__ */ new Date()).toISOString()
               });
+              Object.keys(updateData).forEach((key) => {
+                delete updateData[key];
+              });
               let jobs = [];
               if (context.processor?.config?.generateEmbeddings) {
                 const fullItem = await db2.from(getTableName(context.id)).where({
@@ -7317,12 +7426,18 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
                   jobs.push(embeddingsJob);
                 }
               }
-              return {
-                result: processorResult,
+              const result = {
+                result: { id: processorResult.id },
                 metadata: {
                   jobs: jobs.length > 0 ? jobs.join(",") : void 0
                 }
               };
+              processorResult = null;
+              const memUsage = process.memoryUsage();
+              console.log(
+                `[EXULU] Memory after processor job ${bullmqJob.id}: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`
+              );
+              return result;
             }
             if (data.type === "workflow") {
               console.log("[EXULU] running a workflow job.", bullmqJob.name);
@@ -7341,10 +7456,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
                 user,
                 messages: inputMessages
               } = await validateWorkflowPayload(data, providers);
-              const retries = 3;
+              const retries2 = 3;
               let attempts = 0;
               const promise = new Promise(async (resolve3, reject) => {
-                while (attempts < retries) {
+                while (attempts < retries2) {
                   try {
                     const messages2 = await processUiMessagesFlow({
                       providers,
@@ -7366,7 +7481,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
                       error instanceof Error ? error.message : String(error)
                     );
                     attempts++;
-                    if (attempts >= retries) {
+                    if (attempts >= retries2) {
                       reject(new Error(error instanceof Error ? error.message : String(error)));
                     }
                     await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
@@ -7417,10 +7532,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
                 testCase,
                 messages: inputMessages
               } = await validateEvalPayload(data, providers);
-              const retries = 3;
+              const retries2 = 3;
               let attempts = 0;
               const promise = new Promise(async (resolve3, reject) => {
-                while (attempts < retries) {
+                while (attempts < retries2) {
                   try {
                     const messages2 = await processUiMessagesFlow({
                       providers,
@@ -7441,7 +7556,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
                       error instanceof Error ? error.message : String(error)
                     );
                     attempts++;
-                    if (attempts >= retries) {
+                    if (attempts >= retries2) {
                       reject(new Error(error instanceof Error ? error.message : String(error)));
                     }
                     await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
@@ -7690,9 +7805,15 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
         try {
           const result = await Promise.race([workPromise, timeoutPromise]);
           clearTimeout(timeoutHandle);
+          if (progressInterval) {
+            clearInterval(progressInterval);
+          }
           return result;
         } catch (error) {
           clearTimeout(timeoutHandle);
+          if (progressInterval) {
+            clearInterval(progressInterval);
+          }
           console.error(
             `[EXULU] job ${bullmqJob.id} failed (error caught in race handler).`,
             error instanceof Error ? error.message : String(error)
@@ -7706,6 +7827,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
         concurrency: queue.concurrency?.worker || 1,
         removeOnComplete: { count: 1e3 },
         removeOnFail: { count: 5e3 },
+        // Configure settings for long-running jobs (especially processor jobs)
+        // lockDuration: How long a worker can hold a job before it's considered stalled
+        // Set to 5 minutes to accommodate CPU-intensive operations
+        lockDuration: 3e5,
+        // 5 minutes in milliseconds
+        // stalledInterval: How often to check for stalled jobs
+        // Set to 2 minutes to reduce false positives for long-running operations
+        stalledInterval: 12e4,
+        // 2 minutes in milliseconds
+        maxStalledCount: 1,
         ...queue.ratelimit && {
           limiter: {
             max: queue.ratelimit,
@@ -7742,24 +7873,68 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
           error: error instanceof Error ? error.message : String(error)
         } : error
       );
+      throw error;
     });
     worker.on("error", (error) => {
       console.error(`[EXULU] worker error.`, error);
+      throw error;
     });
     worker.on("progress", (job, progress) => {
       console.log(`[EXULU] job progress ${job.id}.`, job.name, {
         progress
       });
     });
-    const gracefulShutdown = async (signal) => {
-      console.log(`Received ${signal}, closing server...`);
-      await worker.close();
-      process.exit(0);
-    };
-    process.on("SIGINT", () => gracefulShutdown("SIGINT"));
-    process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
     return worker;
   });
+  const gracefulShutdown = async (signal) => {
+    if (isShuttingDown) {
+      console.log(`[EXULU] Shutdown already in progress, ignoring additional ${signal}`);
+      return;
+    }
+    isShuttingDown = true;
+    console.log(`[EXULU] Received ${signal}, shutting down gracefully...`);
+    try {
+      if (poolMonitoringInterval) {
+        clearInterval(poolMonitoringInterval);
+        poolMonitoringInterval = void 0;
+      }
+      console.log(`[EXULU] Closing ${workers.length} worker(s)...`);
+      const closePromises = workers.map(async (worker, index) => {
+        try {
+          await Promise.race([
+            worker.close(),
+            new Promise(
+              (_, reject) => setTimeout(() => reject(new Error("Worker close timeout")), 3e4)
+            )
+          ]);
+          console.log(`[EXULU] Worker ${index + 1} closed successfully`);
+        } catch (error) {
+          console.error(`[EXULU] Error closing worker ${index + 1}:`, error);
+        }
+      });
+      await Promise.allSettled(closePromises);
+      if (redisConnection) {
+        console.log(`[EXULU] Closing Redis connection...`);
+        await redisConnection.quit();
+      }
+      try {
+        const { db: db2 } = await postgresClient();
+        if (db2?.client) {
+          console.log(`[EXULU] Closing database connection pool...`);
+          await db2.client.destroy();
+        }
+      } catch (error) {
+        console.error(`[EXULU] Error closing database:`, error);
+      }
+      console.log(`[EXULU] Graceful shutdown complete`);
+      process.exit(0);
+    } catch (error) {
+      console.error(`[EXULU] Error during graceful shutdown:`, error);
+      process.exit(1);
+    }
+  };
+  process.once("SIGINT", () => gracefulShutdown("SIGINT"));
+  process.once("SIGTERM", () => gracefulShutdown("SIGTERM"));
   return workers;
 };
 var validateWorkflowPayload = async (data, providers) => {
@@ -9473,6 +9648,7 @@ type AgentEvalFunctionConfig {
 type ItemChunks {
     chunk_id: String!
+    chunk_metadata: JSON!
     chunk_index: Int!
     chunk_content: String!
     chunk_source: String!
@@ -9691,7 +9867,7 @@ import cookieParser from "cookie-parser";
 import { z as z7 } from "zod";
 import {
   convertToModelMessages,
-  generateObject,
+  Output as Output2,
   generateText as generateText2,
   streamText,
   validateUIMessages,
@@ -9886,7 +10062,7 @@ var ExuluProvider = class {
         prompt: z7.string().describe("The prompt (usually a question for the agent) to send to the agent."),
         information: z7.string().describe("A summary of relevant context / information from the current session")
       }),
-      description: `This tool calls an AI agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
+      description: `This tool calls an agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
       config: [],
       execute: async ({ prompt, information, user, allExuluTools }) => {
         const hasAccessToAgent = await checkRecordAccess(agent, "read", user);
@@ -9999,9 +10175,6 @@ var ExuluProvider = class {
     if (!prompt && !inputMessages?.length) {
       throw new Error("Prompt or message is required for generating.");
     }
-    if (outputSchema && !prompt) {
-      throw new Error("Prompt is required for generating with an output schema.");
-    }
     const model = this.model.create({
       ...providerapikey ? { apiKey: providerapikey } : {}
     });
@@ -10138,14 +10311,18 @@ var ExuluProvider = class {
       let inputTokens = 0;
       let outputTokens = 0;
       if (outputSchema) {
-        const { object, usage } = await generateObject({
+        const { output, usage } = await generateText2({
           model,
           system,
-          prompt,
           maxRetries: 3,
-          schema: outputSchema
+          output: Output2.object({
+            schema: outputSchema
+          }),
+          prompt,
+          stopWhen: [stepCountIs2(5)]
+          // make configurable
         });
-        result.object = object;
+        result.object = output;
         inputTokens = usage.inputTokens || 0;
         outputTokens = usage.outputTokens || 0;
       } else {
@@ -10176,6 +10353,7 @@ var ExuluProvider = class {
             agent
           ),
           stopWhen: [stepCountIs2(5)]
+          // make configurable
         });
         result.text = text;
         inputTokens = totalUsage?.inputTokens || 0;
@@ -10650,6 +10828,7 @@ var providerRateLimiter = async (key, windowSeconds, limit, points) => {
 };
 // src/exulu/routes.ts
+import { convertJsonSchemaToZod } from "zod-from-json-schema";
 var REQUEST_SIZE_LIMIT = "50mb";
 var getExuluVersionNumber = async () => {
   try {
@@ -11070,6 +11249,16 @@ Mood: friendly and intelligent.
         providers,
         user
       );
+      if (req.body.outputSchema && !!headers.stream) {
+        throw new Error("Providing a outputSchema in the POST body is not allowed when using the streaming API, set 'stream' to false in the headers when defining a response schema.");
+      }
+      let outputSchema;
+      if (req.body.outputSchema) {
+        if (typeof req.body.outputSchema === "string") {
+          req.body.outputSchema = JSON.parse(req.body.outputSchema);
+        }
+        outputSchema = convertJsonSchemaToZod(req.body.outputSchema);
+      }
       let providerapikey;
       const variableName = agent.providerapikey;
       if (variableName) {
@@ -11216,6 +11405,7 @@ Mood: friendly and intelligent.
         const response = await provider.generateSync({
           contexts,
           rerankers: rerankers || [],
+          outputSchema,
           agent,
           user,
           req,
@@ -15451,6 +15641,22 @@ var MarkdownChunker = class {
     });
     return result;
   }
+  /**
+   * Checks whether a position in the text falls strictly inside a <diagram>...</diagram> block.
+   * Returns the start index of that block if so (so a split lands before the diagram); otherwise returns the original position.
+   */
+  adjustForDiagramTags(text, position) {
+    const diagramRegex = /<diagram>[\s\S]*?<\/diagram>/gi;
+    let match;
+    while ((match = diagramRegex.exec(text)) !== null) {
+      const diagramStart = match.index;
+      const diagramEnd = match.index + match[0].length;
+      if (position > diagramStart && position < diagramEnd) {
+        return diagramStart;
+      }
+    }
+    return position;
+  }
   /**
    * Find the nearest logical breakpoint working backwards from the end of the text.
    * Logical breakpoints are prioritized as follows:
@@ -15462,6 +15668,7 @@ var MarkdownChunker = class {
    *
    * Only considers breakpoints in the last 50% of the text to avoid creating very small chunks.
    * Returns the position of the breakpoint, or null if none found
+   * IMPORTANT: Never splits content within <diagram> tags
    */
   findLogicalBreakpoint(text) {
     if (text.length === 0) return null;
@@ -15481,7 +15688,7 @@ var MarkdownChunker = class {
       }
     }
     if (lastHeaderPosition > 0) {
-      return lastHeaderPosition;
+      return this.adjustForDiagramTags(text, lastHeaderPosition);
     }
     let lastParagraphBreak = -1;
     let searchPos = text.length;
@@ -15494,11 +15701,12 @@ var MarkdownChunker = class {
       searchPos = pos;
     }
     if (lastParagraphBreak > 0) {
-      return lastParagraphBreak + 2;
+      const adjusted = this.adjustForDiagramTags(text, lastParagraphBreak + 2);
+      return adjusted;
     }
     const newlineIndex = text.lastIndexOf("\n");
     if (newlineIndex >= minPosition) {
-      return newlineIndex + 1;
+      return this.adjustForDiagramTags(text, newlineIndex + 1);
     }
     const sentenceEndRegex = /[.!?](?:\s|$)/g;
     let lastSentenceEnd = -1;
@@ -15508,13 +15716,13 @@ var MarkdownChunker = class {
       }
     }
     if (lastSentenceEnd > 0) {
-      return lastSentenceEnd;
+      return this.adjustForDiagramTags(text, lastSentenceEnd);
     }
     let lastSpace = text.length;
     while (lastSpace > minPosition) {
       const pos = text.lastIndexOf(" ", lastSpace - 1);
       if (pos >= minPosition) {
-        return pos + 1;
+        return this.adjustForDiagramTags(text, pos + 1);
       }
       lastSpace = pos;
     }
@@ -15646,6 +15854,38 @@ var MarkdownChunker = class {
           targetPosition = currentPosition + decoded.length;
         }
       }
+      const diagramCheck = /<diagram>/gi;
+      const diagramCloseCheck = /<\/diagram>/gi;
+      let openDiagramsInSlice = 0;
+      while (diagramCheck.exec(currentSlice) !== null) {
+        openDiagramsInSlice++;
+      }
+      let closeDiagramsInSlice = 0;
+      while (diagramCloseCheck.exec(currentSlice) !== null) {
+        closeDiagramsInSlice++;
+      }
+      if (openDiagramsInSlice > closeDiagramsInSlice) {
+        const lastDiagramOpenIndex = currentSlice.lastIndexOf("<diagram>");
+        if (lastDiagramOpenIndex !== -1) {
+          const remainingText = text.slice(currentPosition + lastDiagramOpenIndex);
+          const closingTagMatch = /<\/diagram>/i.exec(remainingText);
+          if (closingTagMatch) {
+            const closingTagPosition = lastDiagramOpenIndex + closingTagMatch.index + closingTagMatch[0].length;
+            const extendedSlice = text.slice(currentPosition, currentPosition + closingTagPosition);
+            const extendedTokens = tokenizer.encode(extendedSlice);
+            if (extendedTokens.length <= adjustedChunkSize * 1.5) {
+              currentSlice = extendedSlice;
+              targetPosition = currentPosition + closingTagPosition;
+            } else {
+              currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
+              targetPosition = currentPosition + lastDiagramOpenIndex;
+            }
+          } else {
+            currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
+            targetPosition = currentPosition + lastDiagramOpenIndex;
+          }
+        }
+      }
       const breakpointPosition = this.findLogicalBreakpoint(currentSlice);
       if (breakpointPosition !== null) {
         currentSlice = currentSlice.slice(0, breakpointPosition);
@@ -15917,7 +16157,7 @@ Or manually run the setup script:
 // ee/python/documents/processing/doc_processor.ts
 import * as fs2 from "fs";
 import * as path from "path";
-import { generateText as generateText3, Output as Output2 } from "ai";
+import { generateText as generateText3, Output as Output3 } from "ai";
 import { z as z12 } from "zod";
 import pLimit from "p-limit";
 import { randomUUID as randomUUID6 } from "crypto";
@@ -16067,6 +16307,8 @@ ${command}`;
 }
 // ee/python/documents/processing/doc_processor.ts
+import { LiteParse } from "@llamaindex/liteparse";
+import { Mistral } from "@mistralai/mistralai";
 async function processDocx(file) {
   const html = await mammoth.convertToHtml({ buffer: file });
   const turndownService = new TurndownService();
@@ -16141,50 +16383,91 @@ async function validatePageWithVLM(page, imagePath, model) {
   const imageBuffer = await fs2.promises.readFile(imagePath);
   const imageBase64 = imageBuffer.toString("base64");
   const mimeType = "image/png";
-  const prompt = `You are validating OCR/document parsing output for a page that might contain tables and images.
-Here is the current OCR/parsed content for this page:
+  const prompt = `You are a document validation assistant. Your task is to analyze a page image and correct the output of an OCR/parsing pipeline. The content may include tables, technical diagrams, schematics, and structured text.
 ---
+## CURRENT OCR OUTPUT
 ${page.content}
 ---
-Please analyze the page image and validate it:
-1. Check if the extracted markdown text accurately represents the content from the page, including:
-   - Table data (rows, columns, headers, values)
-   - Technical diagrams, schematics, control boards
-   - Icons, checkmarks, symbols
-   - Image captions and labels
-2. If the page has significant errors or omissions, provide a corrected version for the page.
+## YOUR TASK
-3. Return a validation result for the page.
+Compare the page image to the OCR output above. Identify errors, omissions, and formatting issues, then return a structured validation result (see OUTPUT FORMAT below).
-IMPORTANT OUTPUT FORMAT REQUIREMENTS:
-- You MUST output all tables in proper Markdown table format using pipes (|) and dashes (---)
-- Use simple separator rows: | --- | --- | (NOT long dashes like ----------------------)
-- Every table must have: header row, separator row, and data rows
-- Example format:
+---
+## VALIDATION CHECKLIST
+Work through these checks in order:
+### 1. Text Accuracy
+- Verify all text is correctly transcribed.
+- For minor character-level OCR errors (e.g. "\xF6" vs "\xFC", "rn" vs "m"), **prefer the original OCR output** unless you are certain of an error. Do not silently "fix" characters based on guesswork.
+### 2. Heading Levels
+- Verify that headings use correct Markdown levels (#, ##, ###, ####, #####, ######).
+- Determine heading level using the following priority:
+  1. **Hierarchical numbering** (strongest signal): e.g. "1" \u2192 #, "2.1" \u2192 ##, "2.1.1" \u2192 ###, "2.1.2.5" \u2192 ####
+  2. Font size (larger = higher level)
+  3. Indentation
+  4. Bold/emphasis styling
+### 3. Tables
+**First, decide whether the table should be Markdown or plain text:**
+- Use **Markdown table format** if the table has a consistent, clear header structure and uniform column layout throughout.
+- Use **plain text structured description** if the table:
+  - Lacks a clear header row
+  - Uses mixed or irregular column structures across rows
+  - Functions more like a certificate, form, or label layout
+**If using Markdown format**, follow these rules strictly:
+- Every table must have: header row \u2192 separator row \u2192 data rows
+- Use simple separators only: \`| --- | --- |\` (NOT \`|---|---|\` or long dashes)
+- Example:
+  \`\`\`
   | Column 1 | Column 2 |
   | --- | --- |
-  | Data 1 | Data 2 |
-- If the extracted content already has tables, preserve their structure but fix any errors you find in the actual data
-- Do NOT output tables as plain text or in any other format
-- Preserve all markdown formatting (headings with ##, lists, etc.)
-Specific notes and guidelines:
-- Some pages might contain a table with a column that show black and white dots (for Example Rufe-LEDs). You should translate this into + for black (meaning active) and - for white (meaning inactive).
-- Some tables might use green or black checkmarks and red or black crosses. You should translate this into + for checkmarks (meaning active) and - for a cross (meaning inactive).
-- IMPORTANT: Only provide corrections if you find actual errors in the content. If the extracted text is accurate, set needs_correction to false.
+  | Data 1   | Data 2   |
+  \`\`\`
+- Important: do not use the | character as part of the data inside a cell, this would break the table, if a cell contains a | character, use a capital I.
+**Symbol translation rules for table cells:**
+- Black/filled dot \u2192 \`+\` (active); White/empty dot \u2192 \`-\` (inactive)
+  *(e.g. Rufe-LED columns)*
+- Green or black checkmark \u2192 \`+\` (active); Red or black cross \u2192 \`-\` (inactive)
+### 4. Multi-Page Table Continuity
+- If this page contains a table with a header row that runs to the bottom of the page (suggesting it may continue on the next page), extract the header row and include it in the \`current_page_table.headers\` field.
+- If this page contains a table WITHOUT a header row (suggesting it's a continuation from a previous page), set \`current_page_table.is_continuation\` to true and try to identify what the headers might be based on the data structure. Include your best guess for headers in \`current_page_table.headers\`.
+### 5. Technical Diagrams & Schematics
+If the page contains a flow-chart, schematic, technical drawing or control board layout that is **absent or poorly described** in the OCR output do the following:
+- Open a <diagram> tag with the following content:
+  <diagram>
+    <description>
+      Add a detailed description of the diagram here.
+    </description>
+    <mermaid>
+      Add a mermaid diagram schema here that in detail describes the diagram.
+    </mermaid>
+  </diagram>
+### 6. Captions, Icons & Symbols
+- Verify that image captions, labels, icons, and checkmarks are present and correctly transcribed.
+### 7. Only populate \`corrected_text\` when \`needs_correction\` is true. If the OCR output is accurate, return \`needs_correction: false\` and \`corrected_content: null\`.
 `;
   const result = await generateText3({
     model,
-    output: Output2.object({
+    output: Output3.object({
       schema: z12.object({
         needs_correction: z12.boolean(),
         corrected_text: z12.string().nullable(),
+        current_page_table: z12.object({
+          headers: z12.array(z12.string()),
+          is_continuation: z12.boolean()
+        }).nullable(),
         confidence: z12.enum(["high", "medium", "low"]),
         reasoning: z12.string()
       })
@@ -16207,23 +16490,80 @@ Specific notes and guidelines:
     needs_correction: parsedOutput.needs_correction,
     corrected_text: parsedOutput.corrected_text || void 0,
     confidence: parsedOutput.confidence,
+    current_page_table: parsedOutput.current_page_table || void 0,
     reasoning: parsedOutput.reasoning
   };
   return validation;
 }
+function reconstructTableHeaders(document, validationResults, verbose = false) {
+  let lastTableHeaders = void 0;
+  for (const page of document) {
+    const validation = validationResults.get(page.page);
+    if (!validation) continue;
+    const tableInfo = validation.current_page_table;
+    if (tableInfo && tableInfo.headers.length > 0) {
+      if (tableInfo.is_continuation && lastTableHeaders) {
+        if (verbose) {
+          console.log(`[EXULU] Page ${page.page}: Reconstructing table headers from previous page`);
+          console.log(`[EXULU] Previous headers: ${lastTableHeaders.join(" | ")}`);
+        }
+        const contentToModify = page.vlm_corrected_text || page.content;
+        const lines = contentToModify.split("\n");
+        const firstTableLineIndex = lines.findIndex((line) => line.trim().startsWith("|"));
+        if (firstTableLineIndex !== -1) {
+          const headerRow = `| ${lastTableHeaders.join(" | ")} |`;
+          const separatorRow = `| ${lastTableHeaders.map(() => "---").join(" | ")} |`;
+          lines.splice(firstTableLineIndex, 0, headerRow, separatorRow);
+          const reconstructedContent = lines.join("\n");
+          if (page.vlm_corrected_text) {
+            page.vlm_corrected_text = reconstructedContent;
+          } else {
+            page.content = reconstructedContent;
+          }
+          if (verbose) {
+            console.log(`[EXULU] Page ${page.page}: Added table headers successfully`);
+          }
+        }
+        if (!tableInfo.is_continuation) {
+          lastTableHeaders = tableInfo.headers;
+        }
+      } else {
+        lastTableHeaders = tableInfo.headers;
+        if (verbose) {
+          console.log(`[EXULU] Page ${page.page}: Storing table headers for potential continuation`);
+          console.log(`[EXULU] Headers: ${lastTableHeaders.join(" | ")}`);
+        }
+      }
+    } else {
+      lastTableHeaders = void 0;
+    }
+  }
+}
 async function validateWithVLM(document, model, verbose = false, concurrency = 10) {
   console.log(`[EXULU] Starting VLM validation for docling output, ${document.length} pages...`);
-  console.log(
-    `[EXULU] Concurrency limit: ${concurrency}`
-  );
+  console.log(`[EXULU] Concurrency limit: ${concurrency}`);
+  const limit = pLimit(concurrency);
+  const validationResults = /* @__PURE__ */ new Map();
   let validatedCount = 0;
   let correctedCount = 0;
-  const limit = pLimit(concurrency);
   const validationTasks = document.map(
     (page) => limit(async () => {
+      await new Promise((resolve3) => setImmediate(resolve3));
       const imagePath = page.image;
+      if (!page.content) {
+        console.warn(`[EXULU] Page ${page.page}: No content found, skipping validation`);
+        return;
+      }
       if (!imagePath) {
-        console.log(`[EXULU] Page ${page.page}: No image found, skipping validation`);
+        console.warn(`[EXULU] Page ${page.page}: No image found, skipping validation`);
+        return;
+      }
+      const hasImage = page.content.match(/\.(jpeg|jpg|png|gif|webp)/i);
+      const hasTable = (page.content.match(/\|/g)?.length || 0) > 1;
+      if (!hasImage && !hasTable) {
+        if (verbose) {
+          console.log(`[EXULU] Page ${page.page}: No image or table found, SKIPPING VLM validation`);
+        }
         return;
       }
       let validation;
@@ -16231,6 +16571,13 @@ async function validateWithVLM(document, model, verbose = false, concurrency = 1
         validation = await withRetry(async () => {
           return await validatePageWithVLM(page, imagePath, model);
         }, 3);
+        validationResults.set(page.page, validation);
+        if (verbose && validation.current_page_table) {
+          console.log(`[EXULU] Page ${page.page} table info:`, {
+            headers: validation.current_page_table.headers,
+            is_continuation: validation.current_page_table.is_continuation
+          });
+        }
       } catch (error) {
         console.error(`[EXULU] Error validating page ${page.page} with VLM more than 3 times, skipping:`, error);
         throw error;
@@ -16261,9 +16608,12 @@ async function validateWithVLM(document, model, verbose = false, concurrency = 1
     })
   );
   await Promise.all(validationTasks);
-  console.log(`[EXULU] VLM validation complete:`);
-  console.log(`[EXULU] Validated: ${validatedCount} chunks`);
-  console.log(`[EXULU] Corrected: ${correctedCount} chunks`);
+  console.log(`[EXULU] VLM validation complete (parallel processing):`);
+  console.log(`[EXULU] Validated: ${validatedCount} pages`);
+  console.log(`[EXULU] Corrected: ${correctedCount} pages`);
+  console.log(`[EXULU] Starting sequential table header reconstruction...`);
+  reconstructTableHeaders(document, validationResults, verbose);
+  console.log(`[EXULU] Table header reconstruction complete`);
   return document;
 }
 async function processDocument(filePath, fileType, buffer, tempDir, config, verbose = false) {
@@ -16278,15 +16628,6 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
   const stripped = filePath.split(".").pop()?.trim();
   let result;
   switch (stripped) {
-    case "pdf":
-      result = await processPdf(buffer, paths, config, verbose);
-      break;
-    case "docx":
-      result = await processDocx(buffer);
-      break;
-    case "doc":
-      result = await processWord(buffer);
-      break;
     case "txt":
     case "md":
       let content = buffer.toString();
@@ -16300,6 +16641,15 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
         }]
       };
       break;
+    case "pdf":
+      result = await processPdf(buffer, paths, config, verbose);
+      break;
+    case "docx":
+      result = await processDocx(buffer);
+      break;
+    case "doc":
+      result = await processWord(buffer);
+      break;
     // Todo other file types with docx and officeparser
     default:
       throw new Error(`[EXULU] Unsupported file type: ${fileType}`);
@@ -16314,8 +16664,8 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
 }
 async function processPdf(buffer, paths, config, verbose = false) {
   try {
-    let json;
-    if (config?.docling) {
+    let json = [];
+    if (config?.processor.name === "docling") {
       console.log(`[EXULU] Validating Python environment...`);
       const validation = await validatePythonEnvironment(void 0, true);
       if (!validation.valid) {
@@ -16356,7 +16706,7 @@ ${setupResult.output || ""}`);
       }
       const jsonContent = await fs2.promises.readFile(paths.json, "utf-8");
       json = JSON.parse(jsonContent);
-    } else {
+    } else if (config?.processor.name === "officeparser") {
       const text = await parseOfficeAsync2(buffer, {
         outputErrorToConsole: false,
         newlineDelimiter: "\n"
@@ -16366,15 +16716,69 @@ ${setupResult.output || ""}`);
         content: text,
         headings: []
       }];
+    } else if (config?.processor.name === "mistral") {
+      if (!process.env.MISTRAL_API_KEY) {
+        throw new Error("[EXULU] MISTRAL_API_KEY is not set, please set it in the environment variables.");
+      }
+      await new Promise((resolve3) => setTimeout(resolve3, Math.floor(Math.random() * 4e3) + 1e3));
+      const base64Pdf = buffer.toString("base64");
+      const client2 = new Mistral({ apiKey: process.env.MISTRAL_API_KEY });
+      const ocrResponse = await withRetry(async () => {
+        const ocrResponse2 = await client2.ocr.process({
+          document: {
+            type: "document_url",
+            documentUrl: "data:application/pdf;base64," + base64Pdf
+          },
+          model: "mistral-ocr-latest",
+          includeImageBase64: false
+        });
+        return ocrResponse2;
+      }, 10);
+      const parser = new LiteParse();
+      const screenshots = await parser.screenshot(paths.source, void 0);
+      await fs2.promises.mkdir(paths.images, { recursive: true });
+      for (const screenshot of screenshots) {
+        await fs2.promises.writeFile(
+          path.join(
+            paths.images,
+            `${screenshot.pageNum}.png`
+          ),
+          screenshot.imageBuffer
+        );
+        screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
+      }
+      json = ocrResponse.pages.map((page) => ({
+        page: page.index + 1,
+        content: page.markdown,
+        image: screenshots.find((s) => s.pageNum === page.index + 1)?.imagePath,
+        headings: []
+      }));
+      fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
+    } else if (config?.processor.name === "liteparse") {
+      const parser = new LiteParse();
+      const result = await parser.parse(paths.source);
+      const screenshots = await parser.screenshot(paths.source, void 0);
+      console.log(`[EXULU] Liteparse screenshots: ${JSON.stringify(screenshots)}`);
+      await fs2.promises.mkdir(paths.images, { recursive: true });
+      for (const screenshot of screenshots) {
+        await fs2.promises.writeFile(path.join(paths.images, `${screenshot.pageNum}.png`), screenshot.imageBuffer);
+        screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
+      }
+      json = result.pages.map((page) => ({
+        page: page.pageNum,
+        content: page.text,
+        image: screenshots.find((s) => s.pageNum === page.pageNum)?.imagePath
+      }));
+      fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
     }
     console.log(`[EXULU]
 \u2713 Document processing completed successfully`);
     console.log(`[EXULU] Total pages: ${json.length}`);
     console.log(`[EXULU] Output file: ${paths.json}`);
-    if (!config?.docling && config?.vlm?.model) {
+    if (config?.vlm?.model) {
       console.error("[EXULU] VLM validation is only supported when docling is enabled, skipping validation.");
     }
-    if (config?.docling && config?.vlm?.model) {
+    if (config?.vlm?.model && json.length > 0) {
       json = await validateWithVLM(
         json,
         config.vlm.model,
@@ -16402,29 +16806,37 @@ ${setupResult.output || ""}`);
         "utf-8"
       );
     }
-    const markdown = json.map((p) => {
-      if (p.vlm_corrected_text) {
-        return p.vlm_corrected_text;
-      } else {
-        return p.content;
+    const markdownStream = fs2.createWriteStream(paths.markdown, { encoding: "utf-8" });
+    for (let i = 0; i < json.length; i++) {
+      const p = json[i];
+      if (!p) continue;
+      const content = p.vlm_corrected_text ?? p.content;
+      markdownStream.write(content);
+      if (i < json.length - 1) {
+        markdownStream.write("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
       }
-    }).join("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
-    await fs2.promises.writeFile(
-      paths.markdown,
-      markdown,
-      "utf-8"
-    );
+    }
+    await new Promise((resolve3, reject) => {
+      markdownStream.end(() => resolve3());
+      markdownStream.on("error", reject);
+    });
     console.log(`[EXULU] Validated output saved to: ${paths.json}`);
     console.log(`[EXULU] Validated markdown saved to: ${paths.markdown}`);
+    const markdown = await fs2.promises.readFile(paths.markdown, "utf-8");
+    const processedJson = json.map((e) => {
+      const finalContent = e.vlm_corrected_text ?? e.content;
+      return {
+        page: e.page,
+        content: finalContent
+      };
+    });
+    json.length = 0;
+    json = [];
+    const memUsage = process.memoryUsage();
+    console.log(`[EXULU] Memory after document processing: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`);
     return {
       markdown,
-      json: json.map((e) => {
-        const finalContent = e.vlm_corrected_text || e.content;
-        return {
-          page: e.page,
-          content: finalContent
-        };
-      })
+      json: processedJson
     };
   } catch (error) {
     console.error("[EXULU] Error processing document:", error);
@@ -16437,9 +16849,9 @@ var loadFile = async (file, name, tempDir) => {
   if (!fileType) {
     throw new Error("[EXULU] File name does not include extension, extension is required for document processing.");
   }
+  const UUID = randomUUID6();
   let buffer;
   if (Buffer.isBuffer(file)) {
-    const UUID = randomUUID6();
     filePath = path.join(tempDir, `${UUID}.${fileType}`);
     await fs2.promises.writeFile(filePath, file);
     buffer = file;
@@ -16448,7 +16860,10 @@ var loadFile = async (file, name, tempDir) => {
     if (filePath.startsWith("http")) {
       const response = await fetch(filePath);
       const array = await response.arrayBuffer();
+      const tempFilePath = path.join(tempDir, `${UUID}.${fileType}`);
+      await fs2.promises.writeFile(tempFilePath, Buffer.from(array));
       buffer = Buffer.from(array);
+      filePath = tempFilePath;
     } else {
       buffer = await fs2.promises.readFile(file);
     }
@@ -16466,17 +16881,34 @@ async function documentProcessor({
   }
   const uuid = randomUUID6();
   const tempDir = path.join(process.cwd(), "temp", uuid);
+  const localFilesAndFoldersToDelete = [tempDir];
   console.log(`[EXULU] Temporary directory for processing document ${name}: ${tempDir}`);
   await fs2.promises.mkdir(tempDir, { recursive: true });
+  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
+  await fs2.promises.writeFile(path.join(tempDir, "created_at.txt"), timestamp);
   try {
     const {
       filePath,
       fileType,
       buffer
     } = await loadFile(file, name, tempDir);
-    const supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
+    let supportedTypes = [];
+    switch (config?.processor.name) {
+      case "docling":
+        supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
+        break;
+      case "officeparser":
+        supportedTypes = [];
+        break;
+      case "liteparse":
+        supportedTypes = ["pdf", "doc", "docx", "docm", "odt", "rtf", "ppt", "pptx", "pptm", "odp", "xls", "xlsx", "xlsm", "ods", "csv", "tsv"];
+        break;
+      case "mistral":
+        supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
+        break;
+    }
     if (!supportedTypes.includes(fileType)) {
-      throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor.`);
+      throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor, the ${config?.processor.name} processor only supports the following file types: ${supportedTypes.join(", ")}.`);
     }
     const { content } = await processDocument(
       filePath,
@@ -16489,9 +16921,19 @@ async function documentProcessor({
     return content.json;
   } catch (error) {
     console.error("Error during chunking:", error);
-    return void 0;
+    throw error;
   } finally {
-    await fs2.promises.rm(tempDir, { recursive: true });
+    if (config?.debugging?.deleteTempFiles !== false) {
+      for (const file2 of localFilesAndFoldersToDelete) {
+        try {
+          await fs2.promises.rm(file2, { recursive: true });
+          console.log(`[EXULU] Deleted file or folder: ${file2}`);
+        } catch (error) {
+          console.error(`[EXULU] Error deleting file or folder: ${file2}`, error);
+          console.log(`[EXULU] File or folder still exists: ${file2}`);
+        }
+      }
+    }
   }
 }