@exulu/backend 1.49.2 → 1.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -147,17 +147,19 @@ async function postgresClient() {
147
147
  // 30 minutes
148
148
  },
149
149
  pool: {
150
- min: 5,
151
- // Increased from 2 to ensure enough connections available
152
- max: 50,
153
- // Increased from 20 to handle more concurrent operations with processor jobs
154
- acquireTimeoutMillis: 6e4,
155
- // Increased from 30s to 60s to handle pool contention
150
+ min: 10,
151
+ // Minimum connections always ready
152
+ max: 300,
153
+ // Increased to support high worker concurrency (250+ concurrent jobs)
154
+ acquireTimeoutMillis: 12e4,
155
+ // 2 minutes - increased to handle high contention during bursts
156
156
  createTimeoutMillis: 3e4,
157
157
  idleTimeoutMillis: 6e4,
158
- // Increased to keep connections alive longer
158
+ // Keep connections alive for reuse
159
159
  reapIntervalMillis: 1e3,
160
160
  createRetryIntervalMillis: 200,
161
+ // Enable propagateCreateError to properly handle connection creation failures
162
+ propagateCreateError: false,
161
163
  // Log pool events to help debug connection issues
162
164
  afterCreate: (conn, done) => {
163
165
  console.log("[EXULU] New database connection created");
@@ -3577,7 +3579,7 @@ var convertExuluToolsToAiSdkTools = async (currentTools, approvedTools, allExulu
3577
3579
  description,
3578
3580
  // The approvedTools array uses the tool.name lookup as the frontend
3579
3581
  // Vercel AI SDK uses the sanitized tool name as the key, so this matches.
3580
- needsApproval: approvedTools?.includes("tool-" + cur.name) ? false : true,
3582
+ needsApproval: approvedTools?.includes("tool-" + cur.name) || !cur.needsApproval ? false : true,
3581
3583
  // todo make configurable
3582
3584
  async *execute(inputs, options) {
3583
3585
  console.log(
@@ -3727,6 +3729,7 @@ var ExuluTool = class {
3727
3729
  inputSchema;
3728
3730
  type;
3729
3731
  tool;
3732
+ needsApproval;
3730
3733
  config;
3731
3734
  constructor({
3732
3735
  id,
@@ -3736,10 +3739,12 @@ var ExuluTool = class {
3736
3739
  inputSchema,
3737
3740
  type,
3738
3741
  execute: execute2,
3739
- config
3742
+ config,
3743
+ needsApproval
3740
3744
  }) {
3741
3745
  this.id = id;
3742
3746
  this.config = config;
3747
+ this.needsApproval = needsApproval ?? true;
3743
3748
  this.category = category || "default";
3744
3749
  this.name = name;
3745
3750
  this.description = description;
@@ -4146,11 +4151,30 @@ var ExuluContext2 = class {
4146
4151
  );
4147
4152
  await db2.from(getChunksTableName(this.id)).where({ source }).delete();
4148
4153
  if (chunks?.length) {
4154
+ const sanitizeString = (str) => {
4155
+ if (!str) return "";
4156
+ return str.replace(/\0/g, "");
4157
+ };
4158
+ const sanitizeMetadata2 = (metadata) => {
4159
+ if (!metadata) return {};
4160
+ const sanitized = {};
4161
+ for (const [key, value] of Object.entries(metadata)) {
4162
+ if (typeof value === "string") {
4163
+ sanitized[key] = sanitizeString(value);
4164
+ } else {
4165
+ sanitized[key] = value;
4166
+ }
4167
+ }
4168
+ return sanitized;
4169
+ };
4149
4170
  await db2.from(getChunksTableName(this.id)).insert(
4150
4171
  chunks.map((chunk) => ({
4151
- source,
4152
- metadata: chunk.metadata,
4153
- content: chunk.content,
4172
+ // Sanitize source to remove null bytes
4173
+ source: sanitizeString(source),
4174
+ // Sanitize metadata to remove null bytes from string values
4175
+ metadata: sanitizeMetadata2(chunk.metadata),
4176
+ // Remove null bytes (0x00) which are invalid in PostgreSQL UTF8 encoding
4177
+ content: sanitizeString(chunk.content),
4154
4178
  chunk_index: chunk.index,
4155
4179
  embedding: pgvector2.toSql(chunk.vector)
4156
4180
  }))
@@ -4539,6 +4563,8 @@ var ExuluContext2 = class {
4539
4563
  name: `${this.name}_context_search`,
4540
4564
  type: "context",
4541
4565
  category: "contexts",
4566
+ needsApproval: true,
4567
+ // todo make configurable
4542
4568
  inputSchema: z4.object({
4543
4569
  query: z4.string().describe("The original question that the user asked"),
4544
4570
  keywords: z4.array(z4.string()).describe(
@@ -5732,7 +5758,7 @@ var finalizeRequestedFields = async ({
5732
5758
  return result;
5733
5759
  }
5734
5760
  const { db: db2 } = await postgresClient();
5735
- const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt");
5761
+ const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt", "metadata");
5736
5762
  const chunks = await query;
5737
5763
  result.chunks = chunks.map((chunk) => ({
5738
5764
  chunk_content: chunk.content,
@@ -5745,7 +5771,8 @@ var finalizeRequestedFields = async ({
5745
5771
  item_created_at: chunk.item_created_at,
5746
5772
  item_id: chunk.item_id,
5747
5773
  item_external_id: chunk.item_external_id,
5748
- item_name: chunk.item_name
5774
+ item_name: chunk.item_name,
5775
+ chunk_metadata: chunk.metadata
5749
5776
  }));
5750
5777
  }
5751
5778
  }
@@ -7119,6 +7146,36 @@ import "ai";
7119
7146
  import CryptoJS4 from "crypto-js";
7120
7147
  var redisConnection;
7121
7148
  var unhandledRejectionHandlerInstalled = false;
7149
+ var poolMonitoringInterval;
7150
+ var startPoolMonitoring = () => {
7151
+ if (poolMonitoringInterval) return;
7152
+ poolMonitoringInterval = setInterval(async () => {
7153
+ try {
7154
+ const { db: db2 } = await postgresClient();
7155
+ const poolStats = db2.client.pool;
7156
+ if (poolStats) {
7157
+ const used = poolStats.numUsed?.() || 0;
7158
+ const free = poolStats.numFree?.() || 0;
7159
+ const pending = poolStats.numPendingAcquires?.() || 0;
7160
+ const total = used + free;
7161
+ console.log("[EXULU] Connection pool health check:", {
7162
+ used,
7163
+ free,
7164
+ pending,
7165
+ total,
7166
+ utilization: total > 0 ? `${Math.round(used / total * 100)}%` : "0%"
7167
+ });
7168
+ if (pending > 10) {
7169
+ console.warn(
7170
+ `[EXULU] WARNING: ${pending} jobs waiting for database connections. Consider increasing pool size or reducing worker concurrency.`
7171
+ );
7172
+ }
7173
+ }
7174
+ } catch (error) {
7175
+ console.error("[EXULU] Error checking pool health:", error);
7176
+ }
7177
+ }, 3e4);
7178
+ };
7122
7179
  var installGlobalErrorHandlers = () => {
7123
7180
  if (unhandledRejectionHandlerInstalled) return;
7124
7181
  process.on("unhandledRejection", (reason) => {
@@ -7143,6 +7200,7 @@ var installGlobalErrorHandlers = () => {
7143
7200
  unhandledRejectionHandlerInstalled = true;
7144
7201
  console.log("[EXULU] Global error handlers installed to prevent worker crashes");
7145
7202
  };
7203
+ var isShuttingDown = false;
7146
7204
  var createWorkers = async (providers, queues2, config, contexts, rerankers, evals, tools, tracer) => {
7147
7205
  console.log("[EXULU] creating workers for " + queues2?.length + " queues.");
7148
7206
  console.log(
@@ -7150,7 +7208,8 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7150
7208
  queues2.map((q) => q.queue.name)
7151
7209
  );
7152
7210
  installGlobalErrorHandlers();
7153
- process.setMaxListeners(Math.max(queues2.length * 2 + 5, 15));
7211
+ startPoolMonitoring();
7212
+ process.setMaxListeners(Math.max(15, process.getMaxListeners()));
7154
7213
  if (!redisServer.host || !redisServer.port) {
7155
7214
  console.error(
7156
7215
  "[EXULU] you are trying to start worker, but no redis server is configured in the environment."
@@ -7183,7 +7242,53 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7183
7242
  status: await bullmqJob.getState(),
7184
7243
  type: bullmqJob.data.type
7185
7244
  });
7186
- const { db: db2 } = await postgresClient();
7245
+ let progressInterval;
7246
+ if (bullmqJob.data.type === "processor") {
7247
+ progressInterval = setInterval(async () => {
7248
+ try {
7249
+ await bullmqJob.updateProgress({
7250
+ status: "processing",
7251
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
7252
+ });
7253
+ console.log(`[EXULU] Job ${bullmqJob.id} heartbeat sent to prevent stalling`);
7254
+ } catch (error) {
7255
+ console.error(`[EXULU] Error updating job progress:`, error);
7256
+ }
7257
+ }, 25e3);
7258
+ }
7259
+ let db2;
7260
+ let retries = 3;
7261
+ let lastError;
7262
+ for (let attempt = 1; attempt <= retries; attempt++) {
7263
+ try {
7264
+ const client2 = await postgresClient();
7265
+ db2 = client2.db;
7266
+ const poolStats = db2.client.pool;
7267
+ if (poolStats) {
7268
+ console.log(`[EXULU] Connection pool stats for job ${bullmqJob.id}:`, {
7269
+ size: poolStats.numUsed?.() || 0,
7270
+ available: poolStats.numFree?.() || 0,
7271
+ pending: poolStats.numPendingAcquires?.() || 0
7272
+ });
7273
+ }
7274
+ break;
7275
+ } catch (error) {
7276
+ lastError = error instanceof Error ? error : new Error(String(error));
7277
+ console.error(
7278
+ `[EXULU] Failed to acquire database connection (attempt ${attempt}/${retries}) for job ${bullmqJob.id}:`,
7279
+ lastError.message
7280
+ );
7281
+ if (attempt < retries) {
7282
+ const backoffMs = 500 * Math.pow(2, attempt - 1);
7283
+ await new Promise((resolve3) => setTimeout(resolve3, backoffMs));
7284
+ }
7285
+ }
7286
+ }
7287
+ if (!db2) {
7288
+ throw new Error(
7289
+ `Failed to acquire database connection after ${retries} attempts: ${lastError?.message}`
7290
+ );
7291
+ }
7187
7292
  const data = bullmqJob.data;
7188
7293
  const timeoutInSeconds = data.timeoutInSeconds || queue.timeoutInSeconds || 600;
7189
7294
  const timeoutMs = timeoutInSeconds * 1e3;
@@ -7275,7 +7380,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7275
7380
  }
7276
7381
  const exuluStorage = new ExuluStorage({ config });
7277
7382
  console.log("[EXULU] POS 2 -- EXULU CONTEXT PROCESS FIELD");
7278
- const processorResult = await context.processor.execute({
7383
+ let processorResult = await context.processor.execute({
7279
7384
  item: data.inputs,
7280
7385
  user: data.user,
7281
7386
  role: data.role,
@@ -7290,12 +7395,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7290
7395
  );
7291
7396
  }
7292
7397
  delete processorResult.field;
7398
+ const updateData = { ...processorResult };
7293
7399
  await db2.from(getTableName(context.id)).where({
7294
7400
  id: processorResult.id
7295
7401
  }).update({
7296
- ...processorResult,
7402
+ ...updateData,
7297
7403
  last_processed_at: (/* @__PURE__ */ new Date()).toISOString()
7298
7404
  });
7405
+ Object.keys(updateData).forEach((key) => {
7406
+ delete updateData[key];
7407
+ });
7299
7408
  let jobs = [];
7300
7409
  if (context.processor?.config?.generateEmbeddings) {
7301
7410
  const fullItem = await db2.from(getTableName(context.id)).where({
@@ -7317,12 +7426,18 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7317
7426
  jobs.push(embeddingsJob);
7318
7427
  }
7319
7428
  }
7320
- return {
7321
- result: processorResult,
7429
+ const result = {
7430
+ result: { id: processorResult.id },
7322
7431
  metadata: {
7323
7432
  jobs: jobs.length > 0 ? jobs.join(",") : void 0
7324
7433
  }
7325
7434
  };
7435
+ processorResult = null;
7436
+ const memUsage = process.memoryUsage();
7437
+ console.log(
7438
+ `[EXULU] Memory after processor job ${bullmqJob.id}: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`
7439
+ );
7440
+ return result;
7326
7441
  }
7327
7442
  if (data.type === "workflow") {
7328
7443
  console.log("[EXULU] running a workflow job.", bullmqJob.name);
@@ -7341,10 +7456,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7341
7456
  user,
7342
7457
  messages: inputMessages
7343
7458
  } = await validateWorkflowPayload(data, providers);
7344
- const retries = 3;
7459
+ const retries2 = 3;
7345
7460
  let attempts = 0;
7346
7461
  const promise = new Promise(async (resolve3, reject) => {
7347
- while (attempts < retries) {
7462
+ while (attempts < retries2) {
7348
7463
  try {
7349
7464
  const messages2 = await processUiMessagesFlow({
7350
7465
  providers,
@@ -7366,7 +7481,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7366
7481
  error instanceof Error ? error.message : String(error)
7367
7482
  );
7368
7483
  attempts++;
7369
- if (attempts >= retries) {
7484
+ if (attempts >= retries2) {
7370
7485
  reject(new Error(error instanceof Error ? error.message : String(error)));
7371
7486
  }
7372
7487
  await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
@@ -7417,10 +7532,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7417
7532
  testCase,
7418
7533
  messages: inputMessages
7419
7534
  } = await validateEvalPayload(data, providers);
7420
- const retries = 3;
7535
+ const retries2 = 3;
7421
7536
  let attempts = 0;
7422
7537
  const promise = new Promise(async (resolve3, reject) => {
7423
- while (attempts < retries) {
7538
+ while (attempts < retries2) {
7424
7539
  try {
7425
7540
  const messages2 = await processUiMessagesFlow({
7426
7541
  providers,
@@ -7441,7 +7556,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7441
7556
  error instanceof Error ? error.message : String(error)
7442
7557
  );
7443
7558
  attempts++;
7444
- if (attempts >= retries) {
7559
+ if (attempts >= retries2) {
7445
7560
  reject(new Error(error instanceof Error ? error.message : String(error)));
7446
7561
  }
7447
7562
  await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
@@ -7690,9 +7805,15 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7690
7805
  try {
7691
7806
  const result = await Promise.race([workPromise, timeoutPromise]);
7692
7807
  clearTimeout(timeoutHandle);
7808
+ if (progressInterval) {
7809
+ clearInterval(progressInterval);
7810
+ }
7693
7811
  return result;
7694
7812
  } catch (error) {
7695
7813
  clearTimeout(timeoutHandle);
7814
+ if (progressInterval) {
7815
+ clearInterval(progressInterval);
7816
+ }
7696
7817
  console.error(
7697
7818
  `[EXULU] job ${bullmqJob.id} failed (error caught in race handler).`,
7698
7819
  error instanceof Error ? error.message : String(error)
@@ -7706,6 +7827,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7706
7827
  concurrency: queue.concurrency?.worker || 1,
7707
7828
  removeOnComplete: { count: 1e3 },
7708
7829
  removeOnFail: { count: 5e3 },
7830
+ // Configure settings for long-running jobs (especially processor jobs)
7831
+ // lockDuration: How long a worker can hold a job before it's considered stalled
7832
+ // Set to 5 minutes to accommodate CPU-intensive operations
7833
+ lockDuration: 3e5,
7834
+ // 5 minutes in milliseconds
7835
+ // stalledInterval: How often to check for stalled jobs
7836
+ // Set to 2 minutes to reduce false positives for long-running operations
7837
+ stalledInterval: 12e4,
7838
+ // 2 minutes in milliseconds
7839
+ maxStalledCount: 1,
7709
7840
  ...queue.ratelimit && {
7710
7841
  limiter: {
7711
7842
  max: queue.ratelimit,
@@ -7742,24 +7873,68 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7742
7873
  error: error instanceof Error ? error.message : String(error)
7743
7874
  } : error
7744
7875
  );
7876
+ throw error;
7745
7877
  });
7746
7878
  worker.on("error", (error) => {
7747
7879
  console.error(`[EXULU] worker error.`, error);
7880
+ throw error;
7748
7881
  });
7749
7882
  worker.on("progress", (job, progress) => {
7750
7883
  console.log(`[EXULU] job progress ${job.id}.`, job.name, {
7751
7884
  progress
7752
7885
  });
7753
7886
  });
7754
- const gracefulShutdown = async (signal) => {
7755
- console.log(`Received ${signal}, closing server...`);
7756
- await worker.close();
7757
- process.exit(0);
7758
- };
7759
- process.on("SIGINT", () => gracefulShutdown("SIGINT"));
7760
- process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
7761
7887
  return worker;
7762
7888
  });
7889
+ const gracefulShutdown = async (signal) => {
7890
+ if (isShuttingDown) {
7891
+ console.log(`[EXULU] Shutdown already in progress, ignoring additional ${signal}`);
7892
+ return;
7893
+ }
7894
+ isShuttingDown = true;
7895
+ console.log(`[EXULU] Received ${signal}, shutting down gracefully...`);
7896
+ try {
7897
+ if (poolMonitoringInterval) {
7898
+ clearInterval(poolMonitoringInterval);
7899
+ poolMonitoringInterval = void 0;
7900
+ }
7901
+ console.log(`[EXULU] Closing ${workers.length} worker(s)...`);
7902
+ const closePromises = workers.map(async (worker, index) => {
7903
+ try {
7904
+ await Promise.race([
7905
+ worker.close(),
7906
+ new Promise(
7907
+ (_, reject) => setTimeout(() => reject(new Error("Worker close timeout")), 3e4)
7908
+ )
7909
+ ]);
7910
+ console.log(`[EXULU] Worker ${index + 1} closed successfully`);
7911
+ } catch (error) {
7912
+ console.error(`[EXULU] Error closing worker ${index + 1}:`, error);
7913
+ }
7914
+ });
7915
+ await Promise.allSettled(closePromises);
7916
+ if (redisConnection) {
7917
+ console.log(`[EXULU] Closing Redis connection...`);
7918
+ await redisConnection.quit();
7919
+ }
7920
+ try {
7921
+ const { db: db2 } = await postgresClient();
7922
+ if (db2?.client) {
7923
+ console.log(`[EXULU] Closing database connection pool...`);
7924
+ await db2.client.destroy();
7925
+ }
7926
+ } catch (error) {
7927
+ console.error(`[EXULU] Error closing database:`, error);
7928
+ }
7929
+ console.log(`[EXULU] Graceful shutdown complete`);
7930
+ process.exit(0);
7931
+ } catch (error) {
7932
+ console.error(`[EXULU] Error during graceful shutdown:`, error);
7933
+ process.exit(1);
7934
+ }
7935
+ };
7936
+ process.once("SIGINT", () => gracefulShutdown("SIGINT"));
7937
+ process.once("SIGTERM", () => gracefulShutdown("SIGTERM"));
7763
7938
  return workers;
7764
7939
  };
7765
7940
  var validateWorkflowPayload = async (data, providers) => {
@@ -9473,6 +9648,7 @@ type AgentEvalFunctionConfig {
9473
9648
 
9474
9649
  type ItemChunks {
9475
9650
  chunk_id: String!
9651
+ chunk_metadata: JSON!
9476
9652
  chunk_index: Int!
9477
9653
  chunk_content: String!
9478
9654
  chunk_source: String!
@@ -9691,7 +9867,7 @@ import cookieParser from "cookie-parser";
9691
9867
  import { z as z7 } from "zod";
9692
9868
  import {
9693
9869
  convertToModelMessages,
9694
- generateObject,
9870
+ Output as Output2,
9695
9871
  generateText as generateText2,
9696
9872
  streamText,
9697
9873
  validateUIMessages,
@@ -9886,7 +10062,7 @@ var ExuluProvider = class {
9886
10062
  prompt: z7.string().describe("The prompt (usually a question for the agent) to send to the agent."),
9887
10063
  information: z7.string().describe("A summary of relevant context / information from the current session")
9888
10064
  }),
9889
- description: `This tool calls an AI agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
10065
+ description: `This tool calls an agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
9890
10066
  config: [],
9891
10067
  execute: async ({ prompt, information, user, allExuluTools }) => {
9892
10068
  const hasAccessToAgent = await checkRecordAccess(agent, "read", user);
@@ -9999,9 +10175,6 @@ var ExuluProvider = class {
9999
10175
  if (!prompt && !inputMessages?.length) {
10000
10176
  throw new Error("Prompt or message is required for generating.");
10001
10177
  }
10002
- if (outputSchema && !prompt) {
10003
- throw new Error("Prompt is required for generating with an output schema.");
10004
- }
10005
10178
  const model = this.model.create({
10006
10179
  ...providerapikey ? { apiKey: providerapikey } : {}
10007
10180
  });
@@ -10138,14 +10311,18 @@ var ExuluProvider = class {
10138
10311
  let inputTokens = 0;
10139
10312
  let outputTokens = 0;
10140
10313
  if (outputSchema) {
10141
- const { object, usage } = await generateObject({
10314
+ const { output, usage } = await generateText2({
10142
10315
  model,
10143
10316
  system,
10144
- prompt,
10145
10317
  maxRetries: 3,
10146
- schema: outputSchema
10318
+ output: Output2.object({
10319
+ schema: outputSchema
10320
+ }),
10321
+ prompt,
10322
+ stopWhen: [stepCountIs2(5)]
10323
+ // make configurable
10147
10324
  });
10148
- result.object = object;
10325
+ result.object = output;
10149
10326
  inputTokens = usage.inputTokens || 0;
10150
10327
  outputTokens = usage.outputTokens || 0;
10151
10328
  } else {
@@ -10176,6 +10353,7 @@ var ExuluProvider = class {
10176
10353
  agent
10177
10354
  ),
10178
10355
  stopWhen: [stepCountIs2(5)]
10356
+ // make configurable
10179
10357
  });
10180
10358
  result.text = text;
10181
10359
  inputTokens = totalUsage?.inputTokens || 0;
@@ -10650,6 +10828,7 @@ var providerRateLimiter = async (key, windowSeconds, limit, points) => {
10650
10828
  };
10651
10829
 
10652
10830
  // src/exulu/routes.ts
10831
+ import { convertJsonSchemaToZod } from "zod-from-json-schema";
10653
10832
  var REQUEST_SIZE_LIMIT = "50mb";
10654
10833
  var getExuluVersionNumber = async () => {
10655
10834
  try {
@@ -11070,6 +11249,16 @@ Mood: friendly and intelligent.
11070
11249
  providers,
11071
11250
  user
11072
11251
  );
11252
+ if (req.body.outputSchema && !!headers.stream) {
11253
+ throw new Error("Providing a outputSchema in the POST body is not allowed when using the streaming API, set 'stream' to false in the headers when defining a response schema.");
11254
+ }
11255
+ let outputSchema;
11256
+ if (req.body.outputSchema) {
11257
+ if (typeof req.body.outputSchema === "string") {
11258
+ req.body.outputSchema = JSON.parse(req.body.outputSchema);
11259
+ }
11260
+ outputSchema = convertJsonSchemaToZod(req.body.outputSchema);
11261
+ }
11073
11262
  let providerapikey;
11074
11263
  const variableName = agent.providerapikey;
11075
11264
  if (variableName) {
@@ -11216,6 +11405,7 @@ Mood: friendly and intelligent.
11216
11405
  const response = await provider.generateSync({
11217
11406
  contexts,
11218
11407
  rerankers: rerankers || [],
11408
+ outputSchema,
11219
11409
  agent,
11220
11410
  user,
11221
11411
  req,
@@ -15451,6 +15641,22 @@ var MarkdownChunker = class {
15451
15641
  });
15452
15642
  return result;
15453
15643
  }
15644
+ /**
15645
+ * Checks if a position in the text falls within a <diagram> tag.
15646
+ * Returns the adjusted position (before the diagram) if inside a diagram, otherwise returns the original position.
15647
+ */
15648
+ adjustForDiagramTags(text, position) {
15649
+ const diagramRegex = /<diagram>[\s\S]*?<\/diagram>/gi;
15650
+ let match;
15651
+ while ((match = diagramRegex.exec(text)) !== null) {
15652
+ const diagramStart = match.index;
15653
+ const diagramEnd = match.index + match[0].length;
15654
+ if (position > diagramStart && position < diagramEnd) {
15655
+ return diagramStart;
15656
+ }
15657
+ }
15658
+ return position;
15659
+ }
15454
15660
  /**
15455
15661
  * Find the nearest logical breakpoint working backwards from the end of the text.
15456
15662
  * Logical breakpoints are prioritized as follows:
@@ -15462,6 +15668,7 @@ var MarkdownChunker = class {
15462
15668
  *
15463
15669
  * Only considers breakpoints in the last 50% of the text to avoid creating very small chunks.
15464
15670
  * Returns the position of the breakpoint, or null if none found
15671
+ * IMPORTANT: Never splits content within <diagram> tags
15465
15672
  */
15466
15673
  findLogicalBreakpoint(text) {
15467
15674
  if (text.length === 0) return null;
@@ -15481,7 +15688,7 @@ var MarkdownChunker = class {
15481
15688
  }
15482
15689
  }
15483
15690
  if (lastHeaderPosition > 0) {
15484
- return lastHeaderPosition;
15691
+ return this.adjustForDiagramTags(text, lastHeaderPosition);
15485
15692
  }
15486
15693
  let lastParagraphBreak = -1;
15487
15694
  let searchPos = text.length;
@@ -15494,11 +15701,12 @@ var MarkdownChunker = class {
15494
15701
  searchPos = pos;
15495
15702
  }
15496
15703
  if (lastParagraphBreak > 0) {
15497
- return lastParagraphBreak + 2;
15704
+ const adjusted = this.adjustForDiagramTags(text, lastParagraphBreak + 2);
15705
+ return adjusted;
15498
15706
  }
15499
15707
  const newlineIndex = text.lastIndexOf("\n");
15500
15708
  if (newlineIndex >= minPosition) {
15501
- return newlineIndex + 1;
15709
+ return this.adjustForDiagramTags(text, newlineIndex + 1);
15502
15710
  }
15503
15711
  const sentenceEndRegex = /[.!?](?:\s|$)/g;
15504
15712
  let lastSentenceEnd = -1;
@@ -15508,13 +15716,13 @@ var MarkdownChunker = class {
15508
15716
  }
15509
15717
  }
15510
15718
  if (lastSentenceEnd > 0) {
15511
- return lastSentenceEnd;
15719
+ return this.adjustForDiagramTags(text, lastSentenceEnd);
15512
15720
  }
15513
15721
  let lastSpace = text.length;
15514
15722
  while (lastSpace > minPosition) {
15515
15723
  const pos = text.lastIndexOf(" ", lastSpace - 1);
15516
15724
  if (pos >= minPosition) {
15517
- return pos + 1;
15725
+ return this.adjustForDiagramTags(text, pos + 1);
15518
15726
  }
15519
15727
  lastSpace = pos;
15520
15728
  }
@@ -15646,6 +15854,38 @@ var MarkdownChunker = class {
15646
15854
  targetPosition = currentPosition + decoded.length;
15647
15855
  }
15648
15856
  }
15857
+ const diagramCheck = /<diagram>/gi;
15858
+ const diagramCloseCheck = /<\/diagram>/gi;
15859
+ let openDiagramsInSlice = 0;
15860
+ while (diagramCheck.exec(currentSlice) !== null) {
15861
+ openDiagramsInSlice++;
15862
+ }
15863
+ let closeDiagramsInSlice = 0;
15864
+ while (diagramCloseCheck.exec(currentSlice) !== null) {
15865
+ closeDiagramsInSlice++;
15866
+ }
15867
+ if (openDiagramsInSlice > closeDiagramsInSlice) {
15868
+ const lastDiagramOpenIndex = currentSlice.lastIndexOf("<diagram>");
15869
+ if (lastDiagramOpenIndex !== -1) {
15870
+ const remainingText = text.slice(currentPosition + lastDiagramOpenIndex);
15871
+ const closingTagMatch = /<\/diagram>/i.exec(remainingText);
15872
+ if (closingTagMatch) {
15873
+ const closingTagPosition = lastDiagramOpenIndex + closingTagMatch.index + closingTagMatch[0].length;
15874
+ const extendedSlice = text.slice(currentPosition, currentPosition + closingTagPosition);
15875
+ const extendedTokens = tokenizer.encode(extendedSlice);
15876
+ if (extendedTokens.length <= adjustedChunkSize * 1.5) {
15877
+ currentSlice = extendedSlice;
15878
+ targetPosition = currentPosition + closingTagPosition;
15879
+ } else {
15880
+ currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
15881
+ targetPosition = currentPosition + lastDiagramOpenIndex;
15882
+ }
15883
+ } else {
15884
+ currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
15885
+ targetPosition = currentPosition + lastDiagramOpenIndex;
15886
+ }
15887
+ }
15888
+ }
15649
15889
  const breakpointPosition = this.findLogicalBreakpoint(currentSlice);
15650
15890
  if (breakpointPosition !== null) {
15651
15891
  currentSlice = currentSlice.slice(0, breakpointPosition);
@@ -15917,7 +16157,7 @@ Or manually run the setup script:
15917
16157
  // ee/python/documents/processing/doc_processor.ts
15918
16158
  import * as fs2 from "fs";
15919
16159
  import * as path from "path";
15920
- import { generateText as generateText3, Output as Output2 } from "ai";
16160
+ import { generateText as generateText3, Output as Output3 } from "ai";
15921
16161
  import { z as z12 } from "zod";
15922
16162
  import pLimit from "p-limit";
15923
16163
  import { randomUUID as randomUUID6 } from "crypto";
@@ -16067,6 +16307,8 @@ ${command}`;
16067
16307
  }
16068
16308
 
16069
16309
  // ee/python/documents/processing/doc_processor.ts
16310
+ import { LiteParse } from "@llamaindex/liteparse";
16311
+ import { Mistral } from "@mistralai/mistralai";
16070
16312
  async function processDocx(file) {
16071
16313
  const html = await mammoth.convertToHtml({ buffer: file });
16072
16314
  const turndownService = new TurndownService();
@@ -16141,50 +16383,91 @@ async function validatePageWithVLM(page, imagePath, model) {
16141
16383
  const imageBuffer = await fs2.promises.readFile(imagePath);
16142
16384
  const imageBase64 = imageBuffer.toString("base64");
16143
16385
  const mimeType = "image/png";
16144
- const prompt = `You are validating OCR/document parsing output for a page that might contain tables and images.
16145
-
16146
- Here is the current OCR/parsed content for this page:
16386
+ const prompt = `You are a document validation assistant. Your task is to analyze a page image and correct the output of an OCR/parsing pipeline. The content may include tables, technical diagrams, schematics, and structured text.
16147
16387
 
16148
16388
  ---
16389
+ ## CURRENT OCR OUTPUT
16390
+
16149
16391
  ${page.content}
16150
16392
  ---
16151
16393
 
16152
- Please analyze the page image and validate it:
16153
-
16154
- 1. Check if the extracted markdown text accurately represents the content from the page, including:
16155
- - Table data (rows, columns, headers, values)
16156
- - Technical diagrams, schematics, control boards
16157
- - Icons, checkmarks, symbols
16158
- - Image captions and labels
16159
-
16160
- 2. If the page has significant errors or omissions, provide a corrected version for the page.
16394
+ ## YOUR TASK
16161
16395
 
16162
- 3. Return a validation result for the page.
16396
+ Compare the page image to the OCR output above. Identify errors, omissions, and formatting issues, then return a structured validation result (see OUTPUT FORMAT below).
16163
16397
 
16164
- IMPORTANT OUTPUT FORMAT REQUIREMENTS:
16165
- - You MUST output all tables in proper Markdown table format using pipes (|) and dashes (---)
16166
- - Use simple separator rows: | --- | --- | (NOT long dashes like ----------------------)
16167
- - Every table must have: header row, separator row, and data rows
16168
- - Example format:
16398
+ ---
16399
+ ## VALIDATION CHECKLIST
16400
+
16401
+ Work through these checks in order:
16402
+
16403
+ ### 1. Text Accuracy
16404
+ - Verify all text is correctly transcribed.
16405
+ - For minor character-level OCR errors (e.g. "\xF6" vs "\xFC", "rn" vs "m"), **prefer the original OCR output** unless you are certain of an error. Do not silently "fix" characters based on guesswork.
16406
+
16407
+ ### 2. Heading Levels
16408
+ - Verify that headings use correct Markdown levels (#, ##, ###, ####, #####, ######).
16409
+ - Determine heading level using the following priority:
16410
+ 1. **Hierarchical numbering** (strongest signal): e.g. "1" \u2192 #, "2.1" \u2192 ##, "2.1.1" \u2192 ###, "2.1.2.5" \u2192 ####
16411
+ 2. Font size (larger = higher level)
16412
+ 3. Indentation
16413
+ 4. Bold/emphasis styling
16414
+
16415
+ ### 3. Tables
16416
+
16417
+ **First, decide whether the table should be Markdown or plain text:**
16418
+ - Use **Markdown table format** if the table has a consistent, clear header structure and uniform column layout throughout.
16419
+ - Use **plain text structured description** if the table:
16420
+ - Lacks a clear header row
16421
+ - Uses mixed or irregular column structures across rows
16422
+ - Functions more like a certificate, form, or label layout
16423
+
16424
+ **If using Markdown format**, follow these rules strictly:
16425
+ - Every table must have: header row \u2192 separator row \u2192 data rows
16426
+ - Use simple separators only: \`| --- | --- |\` (NOT \`|---|---|\` or long dashes)
16427
+ - Example:
16428
+ \`\`\`
16169
16429
  | Column 1 | Column 2 |
16170
16430
  | --- | --- |
16171
- | Data 1 | Data 2 |
16172
- - If the extracted content already has tables, preserve their structure but fix any errors you find in the actual data
16173
- - Do NOT output tables as plain text or in any other format
16174
- - Preserve all markdown formatting (headings with ##, lists, etc.)
16175
-
16176
- Specific notes and guidelines:
16177
- - Some pages might contain a table with a column that show black and white dots (for Example Rufe-LEDs). You should translate this into + for black (meaning active) and - for white (meaning inactive).
16178
- - Some tables might use green or black checkmarks and red or black crosses. You should translate this into + for checkmarks (meaning active) and - for a cross (meaning inactive).
16179
- - IMPORTANT: Only provide corrections if you find actual errors in the content. If the extracted text is accurate, set needs_correction to false.
16180
-
16431
+ | Data 1 | Data 2 |
16432
+ \`\`\`
16433
+ - Important: do not use the | character as part of the data inside a cell, this would break the table, if a cell contains a | character, use a capital I.
16434
+
16435
+ **Symbol translation rules for table cells:**
16436
+ - Black/filled dot \u2192 \`+\` (active); White/empty dot \u2192 \`-\` (inactive)
16437
+ *(e.g. Rufe-LED columns)*
16438
+ - Green or black checkmark \u2192 \`+\` (active); Red or black cross \u2192 \`-\` (inactive)
16439
+
16440
+ ### 4. Multi-Page Table Continuity
16441
+ - If this page contains a table with a header row that runs to the bottom of the page (suggesting it may continue on the next page), extract the header row and include it in the \`current_page_table.headers\` field.
16442
+ - If this page contains a table WITHOUT a header row (suggesting it's a continuation from a previous page), set \`current_page_table.is_continuation\` to true and try to identify what the headers might be based on the data structure. Include your best guess for headers in \`current_page_table.headers\`.
16443
+
16444
+ ### 5. Technical Diagrams & Schematics
16445
+ If the page contains a flow-chart, schematic, technical drawing or control board layout that is **absent or poorly described** in the OCR output do the following:
16446
+ - Open a <diagram> tag with the following content:
16447
+ <diagram>
16448
+ <description>
16449
+ Add a detailed description of the diagram here.
16450
+ </description>
16451
+ <mermaid>
16452
+ Add a mermaid diagram schema here that in detail describes the diagram.
16453
+ </mermaid>
16454
+ </diagram>
16455
+
16456
+ ### 6. Captions, Icons & Symbols
16457
+ - Verify that image captions, labels, icons, and checkmarks are present and correctly transcribed.
16458
+
16459
+ ### 7. Only populate \`corrected_text\` when \`needs_correction\` is true. If the OCR output is accurate, return \`needs_correction: false\` and \`corrected_content: null\`.
16181
16460
  `;
16182
16461
  const result = await generateText3({
16183
16462
  model,
16184
- output: Output2.object({
16463
+ output: Output3.object({
16185
16464
  schema: z12.object({
16186
16465
  needs_correction: z12.boolean(),
16187
16466
  corrected_text: z12.string().nullable(),
16467
+ current_page_table: z12.object({
16468
+ headers: z12.array(z12.string()),
16469
+ is_continuation: z12.boolean()
16470
+ }).nullable(),
16188
16471
  confidence: z12.enum(["high", "medium", "low"]),
16189
16472
  reasoning: z12.string()
16190
16473
  })
@@ -16207,23 +16490,80 @@ Specific notes and guidelines:
16207
16490
  needs_correction: parsedOutput.needs_correction,
16208
16491
  corrected_text: parsedOutput.corrected_text || void 0,
16209
16492
  confidence: parsedOutput.confidence,
16493
+ current_page_table: parsedOutput.current_page_table || void 0,
16210
16494
  reasoning: parsedOutput.reasoning
16211
16495
  };
16212
16496
  return validation;
16213
16497
  }
16498
/**
 * Stitches multi-page markdown tables back together, in place.
 *
 * Pages are walked in document order. When the VLM reported a table on a page
 * (validation.current_page_table with non-empty headers), those headers are
 * remembered. When a later page is flagged as a continuation (table body with
 * no header row), the remembered header row plus a `---` separator row are
 * spliced in front of that page's first table line.
 *
 * @param {Array<{page: number, content: string, vlm_corrected_text?: string}>} document
 *   Parsed pages; `content` / `vlm_corrected_text` are mutated in place.
 * @param {Map<number, {current_page_table?: {headers: string[], is_continuation: boolean}|null}>} validationResults
 *   Per-page VLM validation results keyed by page number.
 * @param {boolean} [verbose=false] - When true, log each reconstruction step.
 * @returns {void}
 */
function reconstructTableHeaders(document, validationResults, verbose = false) {
  // Headers of the most recent table seen; undefined once a validated page
  // without a table breaks the chain.
  let lastTableHeaders = void 0;
  for (const page of document) {
    const validation = validationResults.get(page.page);
    // Pages skipped during VLM validation keep the running headers: only a
    // page known to have no table ends a table chain.
    if (!validation) continue;
    const tableInfo = validation.current_page_table;
    if (!tableInfo || tableInfo.headers.length === 0) {
      // A validated page without table info ends any running table.
      lastTableHeaders = void 0;
      continue;
    }
    if (tableInfo.is_continuation && lastTableHeaders) {
      if (verbose) {
        console.log(`[EXULU] Page ${page.page}: Reconstructing table headers from previous page`);
        console.log(`[EXULU] Previous headers: ${lastTableHeaders.join(" | ")}`);
      }
      // Prefer the VLM-corrected text when present; fall back to raw content.
      const contentToModify = page.vlm_corrected_text || page.content;
      const lines = contentToModify.split("\n");
      const firstTableLineIndex = lines.findIndex((line) => line.trim().startsWith("|"));
      if (firstTableLineIndex !== -1) {
        const headerRow = `| ${lastTableHeaders.join(" | ")} |`;
        const separatorRow = `| ${lastTableHeaders.map(() => "---").join(" | ")} |`;
        lines.splice(firstTableLineIndex, 0, headerRow, separatorRow);
        const reconstructedContent = lines.join("\n");
        // Write back to whichever field the content was read from.
        if (page.vlm_corrected_text) {
          page.vlm_corrected_text = reconstructedContent;
        } else {
          page.content = reconstructedContent;
        }
        if (verbose) {
          console.log(`[EXULU] Page ${page.page}: Added table headers successfully`);
        }
      }
      // lastTableHeaders is intentionally left untouched here so a table that
      // spans three or more pages keeps propagating the same headers.
      // (Removed a dead `if (!tableInfo.is_continuation)` guard that sat in
      // this branch — is_continuation is always true here, so it never ran.)
    } else {
      // New table — or a continuation with no known predecessor — remember its
      // headers (possibly the VLM's best guess) for the pages that follow.
      lastTableHeaders = tableInfo.headers;
      if (verbose) {
        console.log(`[EXULU] Page ${page.page}: Storing table headers for potential continuation`);
        console.log(`[EXULU] Headers: ${lastTableHeaders.join(" | ")}`);
      }
    }
  }
}
16214
16542
  async function validateWithVLM(document, model, verbose = false, concurrency = 10) {
16215
16543
  console.log(`[EXULU] Starting VLM validation for docling output, ${document.length} pages...`);
16216
- console.log(
16217
- `[EXULU] Concurrency limit: ${concurrency}`
16218
- );
16544
+ console.log(`[EXULU] Concurrency limit: ${concurrency}`);
16545
+ const limit = pLimit(concurrency);
16546
+ const validationResults = /* @__PURE__ */ new Map();
16219
16547
  let validatedCount = 0;
16220
16548
  let correctedCount = 0;
16221
- const limit = pLimit(concurrency);
16222
16549
  const validationTasks = document.map(
16223
16550
  (page) => limit(async () => {
16551
+ await new Promise((resolve3) => setImmediate(resolve3));
16224
16552
  const imagePath = page.image;
16553
+ if (!page.content) {
16554
+ console.warn(`[EXULU] Page ${page.page}: No content found, skipping validation`);
16555
+ return;
16556
+ }
16225
16557
  if (!imagePath) {
16226
- console.log(`[EXULU] Page ${page.page}: No image found, skipping validation`);
16558
+ console.warn(`[EXULU] Page ${page.page}: No image found, skipping validation`);
16559
+ return;
16560
+ }
16561
+ const hasImage = page.content.match(/\.(jpeg|jpg|png|gif|webp)/i);
16562
+ const hasTable = (page.content.match(/\|/g)?.length || 0) > 1;
16563
+ if (!hasImage && !hasTable) {
16564
+ if (verbose) {
16565
+ console.log(`[EXULU] Page ${page.page}: No image or table found, SKIPPING VLM validation`);
16566
+ }
16227
16567
  return;
16228
16568
  }
16229
16569
  let validation;
@@ -16231,6 +16571,13 @@ async function validateWithVLM(document, model, verbose = false, concurrency = 1
16231
16571
  validation = await withRetry(async () => {
16232
16572
  return await validatePageWithVLM(page, imagePath, model);
16233
16573
  }, 3);
16574
+ validationResults.set(page.page, validation);
16575
+ if (verbose && validation.current_page_table) {
16576
+ console.log(`[EXULU] Page ${page.page} table info:`, {
16577
+ headers: validation.current_page_table.headers,
16578
+ is_continuation: validation.current_page_table.is_continuation
16579
+ });
16580
+ }
16234
16581
  } catch (error) {
16235
16582
  console.error(`[EXULU] Error validating page ${page.page} with VLM more than 3 times, skipping:`, error);
16236
16583
  throw error;
@@ -16261,9 +16608,12 @@ async function validateWithVLM(document, model, verbose = false, concurrency = 1
16261
16608
  })
16262
16609
  );
16263
16610
  await Promise.all(validationTasks);
16264
- console.log(`[EXULU] VLM validation complete:`);
16265
- console.log(`[EXULU] Validated: ${validatedCount} chunks`);
16266
- console.log(`[EXULU] Corrected: ${correctedCount} chunks`);
16611
+ console.log(`[EXULU] VLM validation complete (parallel processing):`);
16612
+ console.log(`[EXULU] Validated: ${validatedCount} pages`);
16613
+ console.log(`[EXULU] Corrected: ${correctedCount} pages`);
16614
+ console.log(`[EXULU] Starting sequential table header reconstruction...`);
16615
+ reconstructTableHeaders(document, validationResults, verbose);
16616
+ console.log(`[EXULU] Table header reconstruction complete`);
16267
16617
  return document;
16268
16618
  }
16269
16619
  async function processDocument(filePath, fileType, buffer, tempDir, config, verbose = false) {
@@ -16278,15 +16628,6 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
16278
16628
  const stripped = filePath.split(".").pop()?.trim();
16279
16629
  let result;
16280
16630
  switch (stripped) {
16281
- case "pdf":
16282
- result = await processPdf(buffer, paths, config, verbose);
16283
- break;
16284
- case "docx":
16285
- result = await processDocx(buffer);
16286
- break;
16287
- case "doc":
16288
- result = await processWord(buffer);
16289
- break;
16290
16631
  case "txt":
16291
16632
  case "md":
16292
16633
  let content = buffer.toString();
@@ -16300,6 +16641,15 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
16300
16641
  }]
16301
16642
  };
16302
16643
  break;
16644
+ case "pdf":
16645
+ result = await processPdf(buffer, paths, config, verbose);
16646
+ break;
16647
+ case "docx":
16648
+ result = await processDocx(buffer);
16649
+ break;
16650
+ case "doc":
16651
+ result = await processWord(buffer);
16652
+ break;
16303
16653
  // Todo other file types with docx and officeparser
16304
16654
  default:
16305
16655
  throw new Error(`[EXULU] Unsupported file type: ${fileType}`);
@@ -16314,8 +16664,8 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
16314
16664
  }
16315
16665
  async function processPdf(buffer, paths, config, verbose = false) {
16316
16666
  try {
16317
- let json;
16318
- if (config?.docling) {
16667
+ let json = [];
16668
+ if (config?.processor.name === "docling") {
16319
16669
  console.log(`[EXULU] Validating Python environment...`);
16320
16670
  const validation = await validatePythonEnvironment(void 0, true);
16321
16671
  if (!validation.valid) {
@@ -16356,7 +16706,7 @@ ${setupResult.output || ""}`);
16356
16706
  }
16357
16707
  const jsonContent = await fs2.promises.readFile(paths.json, "utf-8");
16358
16708
  json = JSON.parse(jsonContent);
16359
- } else {
16709
+ } else if (config?.processor.name === "officeparser") {
16360
16710
  const text = await parseOfficeAsync2(buffer, {
16361
16711
  outputErrorToConsole: false,
16362
16712
  newlineDelimiter: "\n"
@@ -16366,15 +16716,69 @@ ${setupResult.output || ""}`);
16366
16716
  content: text,
16367
16717
  headings: []
16368
16718
  }];
16719
+ } else if (config?.processor.name === "mistral") {
16720
+ if (!process.env.MISTRAL_API_KEY) {
16721
+ throw new Error("[EXULU] MISTRAL_API_KEY is not set, please set it in the environment variables.");
16722
+ }
16723
+ await new Promise((resolve3) => setTimeout(resolve3, Math.floor(Math.random() * 4e3) + 1e3));
16724
+ const base64Pdf = buffer.toString("base64");
16725
+ const client2 = new Mistral({ apiKey: process.env.MISTRAL_API_KEY });
16726
+ const ocrResponse = await withRetry(async () => {
16727
+ const ocrResponse2 = await client2.ocr.process({
16728
+ document: {
16729
+ type: "document_url",
16730
+ documentUrl: "data:application/pdf;base64," + base64Pdf
16731
+ },
16732
+ model: "mistral-ocr-latest",
16733
+ includeImageBase64: false
16734
+ });
16735
+ return ocrResponse2;
16736
+ }, 10);
16737
+ const parser = new LiteParse();
16738
+ const screenshots = await parser.screenshot(paths.source, void 0);
16739
+ await fs2.promises.mkdir(paths.images, { recursive: true });
16740
+ for (const screenshot of screenshots) {
16741
+ await fs2.promises.writeFile(
16742
+ path.join(
16743
+ paths.images,
16744
+ `${screenshot.pageNum}.png`
16745
+ ),
16746
+ screenshot.imageBuffer
16747
+ );
16748
+ screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
16749
+ }
16750
+ json = ocrResponse.pages.map((page) => ({
16751
+ page: page.index + 1,
16752
+ content: page.markdown,
16753
+ image: screenshots.find((s) => s.pageNum === page.index + 1)?.imagePath,
16754
+ headings: []
16755
+ }));
16756
+ fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
16757
+ } else if (config?.processor.name === "liteparse") {
16758
+ const parser = new LiteParse();
16759
+ const result = await parser.parse(paths.source);
16760
+ const screenshots = await parser.screenshot(paths.source, void 0);
16761
+ console.log(`[EXULU] Liteparse screenshots: ${JSON.stringify(screenshots)}`);
16762
+ await fs2.promises.mkdir(paths.images, { recursive: true });
16763
+ for (const screenshot of screenshots) {
16764
+ await fs2.promises.writeFile(path.join(paths.images, `${screenshot.pageNum}.png`), screenshot.imageBuffer);
16765
+ screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
16766
+ }
16767
+ json = result.pages.map((page) => ({
16768
+ page: page.pageNum,
16769
+ content: page.text,
16770
+ image: screenshots.find((s) => s.pageNum === page.pageNum)?.imagePath
16771
+ }));
16772
+ fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
16369
16773
  }
16370
16774
  console.log(`[EXULU]
16371
16775
  \u2713 Document processing completed successfully`);
16372
16776
  console.log(`[EXULU] Total pages: ${json.length}`);
16373
16777
  console.log(`[EXULU] Output file: ${paths.json}`);
16374
- if (!config?.docling && config?.vlm?.model) {
16778
+ if (config?.vlm?.model) {
16375
16779
  console.error("[EXULU] VLM validation is only supported when docling is enabled, skipping validation.");
16376
16780
  }
16377
- if (config?.docling && config?.vlm?.model) {
16781
+ if (config?.vlm?.model && json.length > 0) {
16378
16782
  json = await validateWithVLM(
16379
16783
  json,
16380
16784
  config.vlm.model,
@@ -16402,29 +16806,37 @@ ${setupResult.output || ""}`);
16402
16806
  "utf-8"
16403
16807
  );
16404
16808
  }
16405
- const markdown = json.map((p) => {
16406
- if (p.vlm_corrected_text) {
16407
- return p.vlm_corrected_text;
16408
- } else {
16409
- return p.content;
16809
+ const markdownStream = fs2.createWriteStream(paths.markdown, { encoding: "utf-8" });
16810
+ for (let i = 0; i < json.length; i++) {
16811
+ const p = json[i];
16812
+ if (!p) continue;
16813
+ const content = p.vlm_corrected_text ?? p.content;
16814
+ markdownStream.write(content);
16815
+ if (i < json.length - 1) {
16816
+ markdownStream.write("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
16410
16817
  }
16411
- }).join("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
16412
- await fs2.promises.writeFile(
16413
- paths.markdown,
16414
- markdown,
16415
- "utf-8"
16416
- );
16818
+ }
16819
+ await new Promise((resolve3, reject) => {
16820
+ markdownStream.end(() => resolve3());
16821
+ markdownStream.on("error", reject);
16822
+ });
16417
16823
  console.log(`[EXULU] Validated output saved to: ${paths.json}`);
16418
16824
  console.log(`[EXULU] Validated markdown saved to: ${paths.markdown}`);
16825
+ const markdown = await fs2.promises.readFile(paths.markdown, "utf-8");
16826
+ const processedJson = json.map((e) => {
16827
+ const finalContent = e.vlm_corrected_text ?? e.content;
16828
+ return {
16829
+ page: e.page,
16830
+ content: finalContent
16831
+ };
16832
+ });
16833
+ json.length = 0;
16834
+ json = [];
16835
+ const memUsage = process.memoryUsage();
16836
+ console.log(`[EXULU] Memory after document processing: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`);
16419
16837
  return {
16420
16838
  markdown,
16421
- json: json.map((e) => {
16422
- const finalContent = e.vlm_corrected_text || e.content;
16423
- return {
16424
- page: e.page,
16425
- content: finalContent
16426
- };
16427
- })
16839
+ json: processedJson
16428
16840
  };
16429
16841
  } catch (error) {
16430
16842
  console.error("[EXULU] Error processing document:", error);
@@ -16437,9 +16849,9 @@ var loadFile = async (file, name, tempDir) => {
16437
16849
  if (!fileType) {
16438
16850
  throw new Error("[EXULU] File name does not include extension, extension is required for document processing.");
16439
16851
  }
16852
+ const UUID = randomUUID6();
16440
16853
  let buffer;
16441
16854
  if (Buffer.isBuffer(file)) {
16442
- const UUID = randomUUID6();
16443
16855
  filePath = path.join(tempDir, `${UUID}.${fileType}`);
16444
16856
  await fs2.promises.writeFile(filePath, file);
16445
16857
  buffer = file;
@@ -16448,7 +16860,10 @@ var loadFile = async (file, name, tempDir) => {
16448
16860
  if (filePath.startsWith("http")) {
16449
16861
  const response = await fetch(filePath);
16450
16862
  const array = await response.arrayBuffer();
16863
+ const tempFilePath = path.join(tempDir, `${UUID}.${fileType}`);
16864
+ await fs2.promises.writeFile(tempFilePath, Buffer.from(array));
16451
16865
  buffer = Buffer.from(array);
16866
+ filePath = tempFilePath;
16452
16867
  } else {
16453
16868
  buffer = await fs2.promises.readFile(file);
16454
16869
  }
@@ -16466,17 +16881,34 @@ async function documentProcessor({
16466
16881
  }
16467
16882
  const uuid = randomUUID6();
16468
16883
  const tempDir = path.join(process.cwd(), "temp", uuid);
16884
+ const localFilesAndFoldersToDelete = [tempDir];
16469
16885
  console.log(`[EXULU] Temporary directory for processing document ${name}: ${tempDir}`);
16470
16886
  await fs2.promises.mkdir(tempDir, { recursive: true });
16887
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString();
16888
+ await fs2.promises.writeFile(path.join(tempDir, "created_at.txt"), timestamp);
16471
16889
  try {
16472
16890
  const {
16473
16891
  filePath,
16474
16892
  fileType,
16475
16893
  buffer
16476
16894
  } = await loadFile(file, name, tempDir);
16477
- const supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
16895
+ let supportedTypes = [];
16896
+ switch (config?.processor.name) {
16897
+ case "docling":
16898
+ supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
16899
+ break;
16900
+ case "officeparser":
16901
+ supportedTypes = [];
16902
+ break;
16903
+ case "liteparse":
16904
+ supportedTypes = ["pdf", "doc", "docx", "docm", "odt", "rtf", "ppt", "pptx", "pptm", "odp", "xls", "xlsx", "xlsm", "ods", "csv", "tsv"];
16905
+ break;
16906
+ case "mistral":
16907
+ supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
16908
+ break;
16909
+ }
16478
16910
  if (!supportedTypes.includes(fileType)) {
16479
- throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor.`);
16911
+ throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor, the ${config?.processor.name} processor only supports the following file types: ${supportedTypes.join(", ")}.`);
16480
16912
  }
16481
16913
  const { content } = await processDocument(
16482
16914
  filePath,
@@ -16489,9 +16921,19 @@ async function documentProcessor({
16489
16921
  return content.json;
16490
16922
  } catch (error) {
16491
16923
  console.error("Error during chunking:", error);
16492
- return void 0;
16924
+ throw error;
16493
16925
  } finally {
16494
- await fs2.promises.rm(tempDir, { recursive: true });
16926
+ if (config?.debugging?.deleteTempFiles !== false) {
16927
+ for (const file2 of localFilesAndFoldersToDelete) {
16928
+ try {
16929
+ await fs2.promises.rm(file2, { recursive: true });
16930
+ console.log(`[EXULU] Deleted file or folder: ${file2}`);
16931
+ } catch (error) {
16932
+ console.error(`[EXULU] Error deleting file or folder: ${file2}`, error);
16933
+ console.log(`[EXULU] File or folder still exists: ${file2}`);
16934
+ }
16935
+ }
16936
+ }
16495
16937
  }
16496
16938
  }
16497
16939