@exulu/backend 1.50.0 → 1.51.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -205,17 +205,19 @@ async function postgresClient() {
205
205
  // 30 minutes
206
206
  },
207
207
  pool: {
208
- min: 5,
209
- // Increased from 2 to ensure enough connections available
210
- max: 50,
211
- // Increased from 20 to handle more concurrent operations with processor jobs
212
- acquireTimeoutMillis: 6e4,
213
- // Increased from 30s to 60s to handle pool contention
208
+ min: 10,
209
+ // Minimum connections always ready
210
+ max: 300,
211
+ // Increased to support high worker concurrency (250+ concurrent jobs)
212
+ acquireTimeoutMillis: 12e4,
213
+ // 2 minutes - increased to handle high contention during bursts
214
214
  createTimeoutMillis: 3e4,
215
215
  idleTimeoutMillis: 6e4,
216
- // Increased to keep connections alive longer
216
+ // Keep connections alive for reuse
217
217
  reapIntervalMillis: 1e3,
218
218
  createRetryIntervalMillis: 200,
219
+ // Enable propagateCreateError to properly handle connection creation failures
220
+ propagateCreateError: false,
219
221
  // Log pool events to help debug connection issues
220
222
  afterCreate: (conn, done) => {
221
223
  console.log("[EXULU] New database connection created");
@@ -3618,7 +3620,7 @@ var convertExuluToolsToAiSdkTools = async (currentTools, approvedTools, allExulu
3618
3620
  description,
3619
3621
  // The approvedTools array uses the tool.name lookup as the frontend
3620
3622
  // Vercel AI SDK uses the sanitized tool name as the key, so this matches.
3621
- needsApproval: approvedTools?.includes("tool-" + cur.name) ? false : true,
3623
+ needsApproval: approvedTools?.includes("tool-" + cur.name) || !cur.needsApproval ? false : true,
3622
3624
  // todo make configurable
3623
3625
  async *execute(inputs, options) {
3624
3626
  console.log(
@@ -3768,6 +3770,7 @@ var ExuluTool = class {
3768
3770
  inputSchema;
3769
3771
  type;
3770
3772
  tool;
3773
+ needsApproval;
3771
3774
  config;
3772
3775
  constructor({
3773
3776
  id,
@@ -3777,10 +3780,12 @@ var ExuluTool = class {
3777
3780
  inputSchema,
3778
3781
  type,
3779
3782
  execute: execute2,
3780
- config
3783
+ config,
3784
+ needsApproval
3781
3785
  }) {
3782
3786
  this.id = id;
3783
3787
  this.config = config;
3788
+ this.needsApproval = needsApproval ?? true;
3784
3789
  this.category = category || "default";
3785
3790
  this.name = name;
3786
3791
  this.description = description;
@@ -4187,11 +4192,30 @@ var ExuluContext2 = class {
4187
4192
  );
4188
4193
  await db2.from(getChunksTableName(this.id)).where({ source }).delete();
4189
4194
  if (chunks?.length) {
4195
+ const sanitizeString = (str) => {
4196
+ if (!str) return "";
4197
+ return str.replace(/\0/g, "");
4198
+ };
4199
+ const sanitizeMetadata2 = (metadata) => {
4200
+ if (!metadata) return {};
4201
+ const sanitized = {};
4202
+ for (const [key, value] of Object.entries(metadata)) {
4203
+ if (typeof value === "string") {
4204
+ sanitized[key] = sanitizeString(value);
4205
+ } else {
4206
+ sanitized[key] = value;
4207
+ }
4208
+ }
4209
+ return sanitized;
4210
+ };
4190
4211
  await db2.from(getChunksTableName(this.id)).insert(
4191
4212
  chunks.map((chunk) => ({
4192
- source,
4193
- metadata: chunk.metadata,
4194
- content: chunk.content,
4213
+ // Sanitize source to remove null bytes
4214
+ source: sanitizeString(source),
4215
+ // Sanitize metadata to remove null bytes from string values
4216
+ metadata: sanitizeMetadata2(chunk.metadata),
4217
+ // Remove null bytes (0x00) which are invalid in PostgreSQL UTF8 encoding
4218
+ content: sanitizeString(chunk.content),
4195
4219
  chunk_index: chunk.index,
4196
4220
  embedding: import_knex5.default.toSql(chunk.vector)
4197
4221
  }))
@@ -4580,6 +4604,8 @@ var ExuluContext2 = class {
4580
4604
  name: `${this.name}_context_search`,
4581
4605
  type: "context",
4582
4606
  category: "contexts",
4607
+ needsApproval: true,
4608
+ // todo make configurable
4583
4609
  inputSchema: import_zod4.z.object({
4584
4610
  query: import_zod4.z.string().describe("The original question that the user asked"),
4585
4611
  keywords: import_zod4.z.array(import_zod4.z.string()).describe(
@@ -5773,7 +5799,7 @@ var finalizeRequestedFields = async ({
5773
5799
  return result;
5774
5800
  }
5775
5801
  const { db: db2 } = await postgresClient();
5776
- const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt");
5802
+ const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt", "metadata");
5777
5803
  const chunks = await query;
5778
5804
  result.chunks = chunks.map((chunk) => ({
5779
5805
  chunk_content: chunk.content,
@@ -5786,7 +5812,8 @@ var finalizeRequestedFields = async ({
5786
5812
  item_created_at: chunk.item_created_at,
5787
5813
  item_id: chunk.item_id,
5788
5814
  item_external_id: chunk.item_external_id,
5789
- item_name: chunk.item_name
5815
+ item_name: chunk.item_name,
5816
+ chunk_metadata: chunk.metadata
5790
5817
  }));
5791
5818
  }
5792
5819
  }
@@ -7160,6 +7187,36 @@ var import_ai3 = require("ai");
7160
7187
  var import_crypto_js4 = __toESM(require("crypto-js"), 1);
7161
7188
  var redisConnection;
7162
7189
  var unhandledRejectionHandlerInstalled = false;
7190
+ var poolMonitoringInterval;
7191
+ var startPoolMonitoring = () => {
7192
+ if (poolMonitoringInterval) return;
7193
+ poolMonitoringInterval = setInterval(async () => {
7194
+ try {
7195
+ const { db: db2 } = await postgresClient();
7196
+ const poolStats = db2.client.pool;
7197
+ if (poolStats) {
7198
+ const used = poolStats.numUsed?.() || 0;
7199
+ const free = poolStats.numFree?.() || 0;
7200
+ const pending = poolStats.numPendingAcquires?.() || 0;
7201
+ const total = used + free;
7202
+ console.log("[EXULU] Connection pool health check:", {
7203
+ used,
7204
+ free,
7205
+ pending,
7206
+ total,
7207
+ utilization: total > 0 ? `${Math.round(used / total * 100)}%` : "0%"
7208
+ });
7209
+ if (pending > 10) {
7210
+ console.warn(
7211
+ `[EXULU] WARNING: ${pending} jobs waiting for database connections. Consider increasing pool size or reducing worker concurrency.`
7212
+ );
7213
+ }
7214
+ }
7215
+ } catch (error) {
7216
+ console.error("[EXULU] Error checking pool health:", error);
7217
+ }
7218
+ }, 3e4);
7219
+ };
7163
7220
  var installGlobalErrorHandlers = () => {
7164
7221
  if (unhandledRejectionHandlerInstalled) return;
7165
7222
  process.on("unhandledRejection", (reason) => {
@@ -7184,6 +7241,7 @@ var installGlobalErrorHandlers = () => {
7184
7241
  unhandledRejectionHandlerInstalled = true;
7185
7242
  console.log("[EXULU] Global error handlers installed to prevent worker crashes");
7186
7243
  };
7244
+ var isShuttingDown = false;
7187
7245
  var createWorkers = async (providers, queues2, config, contexts, rerankers, evals, tools, tracer) => {
7188
7246
  console.log("[EXULU] creating workers for " + queues2?.length + " queues.");
7189
7247
  console.log(
@@ -7191,7 +7249,8 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7191
7249
  queues2.map((q) => q.queue.name)
7192
7250
  );
7193
7251
  installGlobalErrorHandlers();
7194
- process.setMaxListeners(Math.max(queues2.length * 2 + 5, 15));
7252
+ startPoolMonitoring();
7253
+ process.setMaxListeners(Math.max(15, process.getMaxListeners()));
7195
7254
  if (!redisServer.host || !redisServer.port) {
7196
7255
  console.error(
7197
7256
  "[EXULU] you are trying to start worker, but no redis server is configured in the environment."
@@ -7224,7 +7283,53 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7224
7283
  status: await bullmqJob.getState(),
7225
7284
  type: bullmqJob.data.type
7226
7285
  });
7227
- const { db: db2 } = await postgresClient();
7286
+ let progressInterval;
7287
+ if (bullmqJob.data.type === "processor") {
7288
+ progressInterval = setInterval(async () => {
7289
+ try {
7290
+ await bullmqJob.updateProgress({
7291
+ status: "processing",
7292
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
7293
+ });
7294
+ console.log(`[EXULU] Job ${bullmqJob.id} heartbeat sent to prevent stalling`);
7295
+ } catch (error) {
7296
+ console.error(`[EXULU] Error updating job progress:`, error);
7297
+ }
7298
+ }, 25e3);
7299
+ }
7300
+ let db2;
7301
+ let retries = 3;
7302
+ let lastError;
7303
+ for (let attempt = 1; attempt <= retries; attempt++) {
7304
+ try {
7305
+ const client2 = await postgresClient();
7306
+ db2 = client2.db;
7307
+ const poolStats = db2.client.pool;
7308
+ if (poolStats) {
7309
+ console.log(`[EXULU] Connection pool stats for job ${bullmqJob.id}:`, {
7310
+ size: poolStats.numUsed?.() || 0,
7311
+ available: poolStats.numFree?.() || 0,
7312
+ pending: poolStats.numPendingAcquires?.() || 0
7313
+ });
7314
+ }
7315
+ break;
7316
+ } catch (error) {
7317
+ lastError = error instanceof Error ? error : new Error(String(error));
7318
+ console.error(
7319
+ `[EXULU] Failed to acquire database connection (attempt ${attempt}/${retries}) for job ${bullmqJob.id}:`,
7320
+ lastError.message
7321
+ );
7322
+ if (attempt < retries) {
7323
+ const backoffMs = 500 * Math.pow(2, attempt - 1);
7324
+ await new Promise((resolve3) => setTimeout(resolve3, backoffMs));
7325
+ }
7326
+ }
7327
+ }
7328
+ if (!db2) {
7329
+ throw new Error(
7330
+ `Failed to acquire database connection after ${retries} attempts: ${lastError?.message}`
7331
+ );
7332
+ }
7228
7333
  const data = bullmqJob.data;
7229
7334
  const timeoutInSeconds = data.timeoutInSeconds || queue.timeoutInSeconds || 600;
7230
7335
  const timeoutMs = timeoutInSeconds * 1e3;
@@ -7316,7 +7421,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7316
7421
  }
7317
7422
  const exuluStorage = new ExuluStorage({ config });
7318
7423
  console.log("[EXULU] POS 2 -- EXULU CONTEXT PROCESS FIELD");
7319
- const processorResult = await context.processor.execute({
7424
+ let processorResult = await context.processor.execute({
7320
7425
  item: data.inputs,
7321
7426
  user: data.user,
7322
7427
  role: data.role,
@@ -7331,12 +7436,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7331
7436
  );
7332
7437
  }
7333
7438
  delete processorResult.field;
7439
+ const updateData = { ...processorResult };
7334
7440
  await db2.from(getTableName(context.id)).where({
7335
7441
  id: processorResult.id
7336
7442
  }).update({
7337
- ...processorResult,
7443
+ ...updateData,
7338
7444
  last_processed_at: (/* @__PURE__ */ new Date()).toISOString()
7339
7445
  });
7446
+ Object.keys(updateData).forEach((key) => {
7447
+ delete updateData[key];
7448
+ });
7340
7449
  let jobs = [];
7341
7450
  if (context.processor?.config?.generateEmbeddings) {
7342
7451
  const fullItem = await db2.from(getTableName(context.id)).where({
@@ -7358,12 +7467,18 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7358
7467
  jobs.push(embeddingsJob);
7359
7468
  }
7360
7469
  }
7361
- return {
7362
- result: processorResult,
7470
+ const result = {
7471
+ result: { id: processorResult.id },
7363
7472
  metadata: {
7364
7473
  jobs: jobs.length > 0 ? jobs.join(",") : void 0
7365
7474
  }
7366
7475
  };
7476
+ processorResult = null;
7477
+ const memUsage = process.memoryUsage();
7478
+ console.log(
7479
+ `[EXULU] Memory after processor job ${bullmqJob.id}: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`
7480
+ );
7481
+ return result;
7367
7482
  }
7368
7483
  if (data.type === "workflow") {
7369
7484
  console.log("[EXULU] running a workflow job.", bullmqJob.name);
@@ -7382,10 +7497,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7382
7497
  user,
7383
7498
  messages: inputMessages
7384
7499
  } = await validateWorkflowPayload(data, providers);
7385
- const retries = 3;
7500
+ const retries2 = 3;
7386
7501
  let attempts = 0;
7387
7502
  const promise = new Promise(async (resolve3, reject) => {
7388
- while (attempts < retries) {
7503
+ while (attempts < retries2) {
7389
7504
  try {
7390
7505
  const messages2 = await processUiMessagesFlow({
7391
7506
  providers,
@@ -7407,7 +7522,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7407
7522
  error instanceof Error ? error.message : String(error)
7408
7523
  );
7409
7524
  attempts++;
7410
- if (attempts >= retries) {
7525
+ if (attempts >= retries2) {
7411
7526
  reject(new Error(error instanceof Error ? error.message : String(error)));
7412
7527
  }
7413
7528
  await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
@@ -7458,10 +7573,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7458
7573
  testCase,
7459
7574
  messages: inputMessages
7460
7575
  } = await validateEvalPayload(data, providers);
7461
- const retries = 3;
7576
+ const retries2 = 3;
7462
7577
  let attempts = 0;
7463
7578
  const promise = new Promise(async (resolve3, reject) => {
7464
- while (attempts < retries) {
7579
+ while (attempts < retries2) {
7465
7580
  try {
7466
7581
  const messages2 = await processUiMessagesFlow({
7467
7582
  providers,
@@ -7482,7 +7597,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7482
7597
  error instanceof Error ? error.message : String(error)
7483
7598
  );
7484
7599
  attempts++;
7485
- if (attempts >= retries) {
7600
+ if (attempts >= retries2) {
7486
7601
  reject(new Error(error instanceof Error ? error.message : String(error)));
7487
7602
  }
7488
7603
  await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
@@ -7731,9 +7846,15 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7731
7846
  try {
7732
7847
  const result = await Promise.race([workPromise, timeoutPromise]);
7733
7848
  clearTimeout(timeoutHandle);
7849
+ if (progressInterval) {
7850
+ clearInterval(progressInterval);
7851
+ }
7734
7852
  return result;
7735
7853
  } catch (error) {
7736
7854
  clearTimeout(timeoutHandle);
7855
+ if (progressInterval) {
7856
+ clearInterval(progressInterval);
7857
+ }
7737
7858
  console.error(
7738
7859
  `[EXULU] job ${bullmqJob.id} failed (error caught in race handler).`,
7739
7860
  error instanceof Error ? error.message : String(error)
@@ -7747,6 +7868,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7747
7868
  concurrency: queue.concurrency?.worker || 1,
7748
7869
  removeOnComplete: { count: 1e3 },
7749
7870
  removeOnFail: { count: 5e3 },
7871
+ // Configure settings for long-running jobs (especially processor jobs)
7872
+ // lockDuration: How long a worker can hold a job before it's considered stalled
7873
+ // Set to 5 minutes to accommodate CPU-intensive operations
7874
+ lockDuration: 3e5,
7875
+ // 5 minutes in milliseconds
7876
+ // stalledInterval: How often to check for stalled jobs
7877
+ // Set to 2 minutes to reduce false positives for long-running operations
7878
+ stalledInterval: 12e4,
7879
+ // 2 minutes in milliseconds
7880
+ maxStalledCount: 1,
7750
7881
  ...queue.ratelimit && {
7751
7882
  limiter: {
7752
7883
  max: queue.ratelimit,
@@ -7783,24 +7914,68 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7783
7914
  error: error instanceof Error ? error.message : String(error)
7784
7915
  } : error
7785
7916
  );
7917
+ throw error;
7786
7918
  });
7787
7919
  worker.on("error", (error) => {
7788
7920
  console.error(`[EXULU] worker error.`, error);
7921
+ throw error;
7789
7922
  });
7790
7923
  worker.on("progress", (job, progress) => {
7791
7924
  console.log(`[EXULU] job progress ${job.id}.`, job.name, {
7792
7925
  progress
7793
7926
  });
7794
7927
  });
7795
- const gracefulShutdown = async (signal) => {
7796
- console.log(`Received ${signal}, closing server...`);
7797
- await worker.close();
7798
- process.exit(0);
7799
- };
7800
- process.on("SIGINT", () => gracefulShutdown("SIGINT"));
7801
- process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
7802
7928
  return worker;
7803
7929
  });
7930
+ const gracefulShutdown = async (signal) => {
7931
+ if (isShuttingDown) {
7932
+ console.log(`[EXULU] Shutdown already in progress, ignoring additional ${signal}`);
7933
+ return;
7934
+ }
7935
+ isShuttingDown = true;
7936
+ console.log(`[EXULU] Received ${signal}, shutting down gracefully...`);
7937
+ try {
7938
+ if (poolMonitoringInterval) {
7939
+ clearInterval(poolMonitoringInterval);
7940
+ poolMonitoringInterval = void 0;
7941
+ }
7942
+ console.log(`[EXULU] Closing ${workers.length} worker(s)...`);
7943
+ const closePromises = workers.map(async (worker, index) => {
7944
+ try {
7945
+ await Promise.race([
7946
+ worker.close(),
7947
+ new Promise(
7948
+ (_, reject) => setTimeout(() => reject(new Error("Worker close timeout")), 3e4)
7949
+ )
7950
+ ]);
7951
+ console.log(`[EXULU] Worker ${index + 1} closed successfully`);
7952
+ } catch (error) {
7953
+ console.error(`[EXULU] Error closing worker ${index + 1}:`, error);
7954
+ }
7955
+ });
7956
+ await Promise.allSettled(closePromises);
7957
+ if (redisConnection) {
7958
+ console.log(`[EXULU] Closing Redis connection...`);
7959
+ await redisConnection.quit();
7960
+ }
7961
+ try {
7962
+ const { db: db2 } = await postgresClient();
7963
+ if (db2?.client) {
7964
+ console.log(`[EXULU] Closing database connection pool...`);
7965
+ await db2.client.destroy();
7966
+ }
7967
+ } catch (error) {
7968
+ console.error(`[EXULU] Error closing database:`, error);
7969
+ }
7970
+ console.log(`[EXULU] Graceful shutdown complete`);
7971
+ process.exit(0);
7972
+ } catch (error) {
7973
+ console.error(`[EXULU] Error during graceful shutdown:`, error);
7974
+ process.exit(1);
7975
+ }
7976
+ };
7977
+ process.once("SIGINT", () => gracefulShutdown("SIGINT"));
7978
+ process.once("SIGTERM", () => gracefulShutdown("SIGTERM"));
7804
7979
  return workers;
7805
7980
  };
7806
7981
  var validateWorkflowPayload = async (data, providers) => {
@@ -9514,6 +9689,7 @@ type AgentEvalFunctionConfig {
9514
9689
 
9515
9690
  type ItemChunks {
9516
9691
  chunk_id: String!
9692
+ chunk_metadata: JSON!
9517
9693
  chunk_index: Int!
9518
9694
  chunk_content: String!
9519
9695
  chunk_source: String!
@@ -9920,7 +10096,7 @@ var ExuluProvider = class {
9920
10096
  prompt: import_zod7.z.string().describe("The prompt (usually a question for the agent) to send to the agent."),
9921
10097
  information: import_zod7.z.string().describe("A summary of relevant context / information from the current session")
9922
10098
  }),
9923
- description: `This tool calls an AI agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
10099
+ description: `This tool calls an agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
9924
10100
  config: [],
9925
10101
  execute: async ({ prompt, information, user, allExuluTools }) => {
9926
10102
  const hasAccessToAgent = await checkRecordAccess(agent, "read", user);
@@ -10033,9 +10209,6 @@ var ExuluProvider = class {
10033
10209
  if (!prompt && !inputMessages?.length) {
10034
10210
  throw new Error("Prompt or message is required for generating.");
10035
10211
  }
10036
- if (outputSchema && !prompt) {
10037
- throw new Error("Prompt is required for generating with an output schema.");
10038
- }
10039
10212
  const model = this.model.create({
10040
10213
  ...providerapikey ? { apiKey: providerapikey } : {}
10041
10214
  });
@@ -10172,14 +10345,18 @@ var ExuluProvider = class {
10172
10345
  let inputTokens = 0;
10173
10346
  let outputTokens = 0;
10174
10347
  if (outputSchema) {
10175
- const { object, usage } = await (0, import_ai4.generateObject)({
10348
+ const { output, usage } = await (0, import_ai4.generateText)({
10176
10349
  model,
10177
10350
  system,
10178
- prompt,
10179
10351
  maxRetries: 3,
10180
- schema: outputSchema
10352
+ output: import_ai4.Output.object({
10353
+ schema: outputSchema
10354
+ }),
10355
+ prompt,
10356
+ stopWhen: [(0, import_ai4.stepCountIs)(5)]
10357
+ // make configurable
10181
10358
  });
10182
- result.object = object;
10359
+ result.object = output;
10183
10360
  inputTokens = usage.inputTokens || 0;
10184
10361
  outputTokens = usage.outputTokens || 0;
10185
10362
  } else {
@@ -10210,6 +10387,7 @@ var ExuluProvider = class {
10210
10387
  agent
10211
10388
  ),
10212
10389
  stopWhen: [(0, import_ai4.stepCountIs)(5)]
10390
+ // make configurable
10213
10391
  });
10214
10392
  result.text = text;
10215
10393
  inputTokens = totalUsage?.inputTokens || 0;
@@ -10684,6 +10862,8 @@ var providerRateLimiter = async (key, windowSeconds, limit, points) => {
10684
10862
  };
10685
10863
 
10686
10864
  // src/exulu/routes.ts
10865
+ var import_zod_from_json_schema = require("zod-from-json-schema");
10866
+ var import_zod8 = require("zod");
10687
10867
  var REQUEST_SIZE_LIMIT = "50mb";
10688
10868
  var getExuluVersionNumber = async () => {
10689
10869
  try {
@@ -11104,6 +11284,16 @@ Mood: friendly and intelligent.
11104
11284
  providers,
11105
11285
  user
11106
11286
  );
11287
+ if (req.body.outputSchema && !!headers.stream) {
11288
+ throw new Error("Providing a outputSchema in the POST body is not allowed when using the streaming API, set 'stream' to false in the headers when defining a response schema.");
11289
+ }
11290
+ let outputSchema;
11291
+ if (req.body.outputSchema) {
11292
+ if (typeof req.body.outputSchema === "string") {
11293
+ req.body.outputSchema = JSON.parse(req.body.outputSchema);
11294
+ }
11295
+ outputSchema = (0, import_zod_from_json_schema.convertJsonSchemaToZod)(req.body.outputSchema);
11296
+ }
11107
11297
  let providerapikey;
11108
11298
  const variableName = agent.providerapikey;
11109
11299
  if (variableName) {
@@ -11250,6 +11440,7 @@ Mood: friendly and intelligent.
11250
11440
  const response = await provider.generateSync({
11251
11441
  contexts,
11252
11442
  rerankers: rerankers || [],
11443
+ outputSchema,
11253
11444
  agent,
11254
11445
  user,
11255
11446
  req,
@@ -11575,7 +11766,7 @@ var import_types2 = require("@modelcontextprotocol/sdk/types.js");
11575
11766
  var import_express4 = require("express");
11576
11767
  var import_api3 = require("@opentelemetry/api");
11577
11768
  var import_crypto_js7 = __toESM(require("crypto-js"), 1);
11578
- var import_zod8 = require("zod");
11769
+ var import_zod9 = require("zod");
11579
11770
  var SESSION_ID_HEADER = "mcp-session-id";
11580
11771
  var ExuluMCP = class {
11581
11772
  server = {};
@@ -11658,7 +11849,7 @@ var ExuluMCP = class {
11658
11849
  title: tool3.name + " agent",
11659
11850
  description: tool3.description,
11660
11851
  inputSchema: {
11661
- inputs: tool3.inputSchema || import_zod8.z.object({})
11852
+ inputs: tool3.inputSchema || import_zod9.z.object({})
11662
11853
  }
11663
11854
  },
11664
11855
  async ({ inputs }, args) => {
@@ -11710,7 +11901,7 @@ var ExuluMCP = class {
11710
11901
  title: "Get List of Prompt Templates",
11711
11902
  description: "Retrieves a list of prompt templates available for this agent. Returns the name, description, and ID of each template.",
11712
11903
  inputSchema: {
11713
- inputs: import_zod8.z.object({})
11904
+ inputs: import_zod9.z.object({})
11714
11905
  }
11715
11906
  },
11716
11907
  async ({ inputs }, args) => {
@@ -11756,8 +11947,8 @@ var ExuluMCP = class {
11756
11947
  title: "Get Prompt Template Details",
11757
11948
  description: "Retrieves the full details of a specific prompt template by ID, including the actual template content with variables.",
11758
11949
  inputSchema: {
11759
- inputs: import_zod8.z.object({
11760
- id: import_zod8.z.string().describe("The ID of the prompt template to retrieve")
11950
+ inputs: import_zod9.z.object({
11951
+ id: import_zod9.z.string().describe("The ID of the prompt template to retrieve")
11761
11952
  })
11762
11953
  }
11763
11954
  },
@@ -12665,7 +12856,7 @@ var ExuluEval = class {
12665
12856
  };
12666
12857
 
12667
12858
  // src/templates/evals/index.ts
12668
- var import_zod9 = require("zod");
12859
+ var import_zod10 = require("zod");
12669
12860
  var llmAsJudgeEval = () => {
12670
12861
  if (process.env.REDIS_HOST?.length && process.env.REDIS_PORT?.length) {
12671
12862
  return new ExuluEval({
@@ -12710,8 +12901,8 @@ var llmAsJudgeEval = () => {
12710
12901
  contexts: [],
12711
12902
  rerankers: [],
12712
12903
  prompt,
12713
- outputSchema: import_zod9.z.object({
12714
- score: import_zod9.z.number().min(0).max(100).describe("The score between 0 and 100.")
12904
+ outputSchema: import_zod10.z.object({
12905
+ score: import_zod10.z.number().min(0).max(100).describe("The score between 0 and 100.")
12715
12906
  }),
12716
12907
  providerapikey
12717
12908
  });
@@ -12939,12 +13130,12 @@ Usage:
12939
13130
  - If no todos exist yet, an empty list will be returned`;
12940
13131
 
12941
13132
  // src/templates/tools/todo/todo.ts
12942
- var import_zod10 = __toESM(require("zod"), 1);
12943
- var TodoSchema = import_zod10.default.object({
12944
- content: import_zod10.default.string().describe("Brief description of the task"),
12945
- status: import_zod10.default.string().describe("Current status of the task: pending, in_progress, completed, cancelled"),
12946
- priority: import_zod10.default.string().describe("Priority level of the task: high, medium, low"),
12947
- id: import_zod10.default.string().describe("Unique identifier for the todo item")
13133
+ var import_zod11 = __toESM(require("zod"), 1);
13134
+ var TodoSchema = import_zod11.default.object({
13135
+ content: import_zod11.default.string().describe("Brief description of the task"),
13136
+ status: import_zod11.default.string().describe("Current status of the task: pending, in_progress, completed, cancelled"),
13137
+ priority: import_zod11.default.string().describe("Priority level of the task: high, medium, low"),
13138
+ id: import_zod11.default.string().describe("Unique identifier for the todo item")
12948
13139
  });
12949
13140
  var TodoWriteTool = new ExuluTool({
12950
13141
  id: "todo_write",
@@ -12960,8 +13151,8 @@ var TodoWriteTool = new ExuluTool({
12960
13151
  default: todowrite_default
12961
13152
  }
12962
13153
  ],
12963
- inputSchema: import_zod10.default.object({
12964
- todos: import_zod10.default.array(TodoSchema).describe("The updated todo list")
13154
+ inputSchema: import_zod11.default.object({
13155
+ todos: import_zod11.default.array(TodoSchema).describe("The updated todo list")
12965
13156
  }),
12966
13157
  execute: async (inputs) => {
12967
13158
  const { sessionID, todos, user } = inputs;
@@ -12996,7 +13187,7 @@ var TodoReadTool = new ExuluTool({
12996
13187
  id: "todo_read",
12997
13188
  name: "Todo Read",
12998
13189
  description: "Use this tool to read your todo list",
12999
- inputSchema: import_zod10.default.object({}),
13190
+ inputSchema: import_zod11.default.object({}),
13000
13191
  type: "function",
13001
13192
  category: "todo",
13002
13193
  config: [
@@ -13035,15 +13226,15 @@ async function getTodos(sessionID) {
13035
13226
  var todoTools = [TodoWriteTool, TodoReadTool];
13036
13227
 
13037
13228
  // src/templates/tools/perplexity.ts
13038
- var import_zod11 = __toESM(require("zod"), 1);
13229
+ var import_zod12 = __toESM(require("zod"), 1);
13039
13230
  var import_perplexity_ai = __toESM(require("@perplexity-ai/perplexity_ai"), 1);
13040
13231
  var internetSearchTool = new ExuluTool({
13041
13232
  id: "internet_search",
13042
13233
  name: "Perplexity Live Internet Search",
13043
13234
  description: "Search the internet for information.",
13044
- inputSchema: import_zod11.default.object({
13045
- query: import_zod11.default.string().describe("The query to the tool."),
13046
- search_recency_filter: import_zod11.default.enum(["day", "week", "month", "year"]).optional().describe("The recency filter for the search, can be day, week, month or year.")
13235
+ inputSchema: import_zod12.default.object({
13236
+ query: import_zod12.default.string().describe("The query to the tool."),
13237
+ search_recency_filter: import_zod12.default.enum(["day", "week", "month", "year"]).optional().describe("The recency filter for the search, can be day, week, month or year.")
13047
13238
  }),
13048
13239
  category: "internet_search",
13049
13240
  type: "web_search",
@@ -15485,6 +15676,22 @@ var MarkdownChunker = class {
15485
15676
  });
15486
15677
  return result;
15487
15678
  }
15679
+ /**
15680
+ * Checks if a position in the text falls within a <diagram> tag.
15681
+ * Returns the adjusted position (before the diagram) if inside a diagram, otherwise returns the original position.
15682
+ */
15683
+ adjustForDiagramTags(text, position) {
15684
+ const diagramRegex = /<diagram>[\s\S]*?<\/diagram>/gi;
15685
+ let match;
15686
+ while ((match = diagramRegex.exec(text)) !== null) {
15687
+ const diagramStart = match.index;
15688
+ const diagramEnd = match.index + match[0].length;
15689
+ if (position > diagramStart && position < diagramEnd) {
15690
+ return diagramStart;
15691
+ }
15692
+ }
15693
+ return position;
15694
+ }
15488
15695
  /**
15489
15696
  * Find the nearest logical breakpoint working backwards from the end of the text.
15490
15697
  * Logical breakpoints are prioritized as follows:
@@ -15496,6 +15703,7 @@ var MarkdownChunker = class {
15496
15703
  *
15497
15704
  * Only considers breakpoints in the last 50% of the text to avoid creating very small chunks.
15498
15705
  * Returns the position of the breakpoint, or null if none found
15706
+ * IMPORTANT: Never splits content within <diagram> tags
15499
15707
  */
15500
15708
  findLogicalBreakpoint(text) {
15501
15709
  if (text.length === 0) return null;
@@ -15515,7 +15723,7 @@ var MarkdownChunker = class {
15515
15723
  }
15516
15724
  }
15517
15725
  if (lastHeaderPosition > 0) {
15518
- return lastHeaderPosition;
15726
+ return this.adjustForDiagramTags(text, lastHeaderPosition);
15519
15727
  }
15520
15728
  let lastParagraphBreak = -1;
15521
15729
  let searchPos = text.length;
@@ -15528,11 +15736,12 @@ var MarkdownChunker = class {
15528
15736
  searchPos = pos;
15529
15737
  }
15530
15738
  if (lastParagraphBreak > 0) {
15531
- return lastParagraphBreak + 2;
15739
+ const adjusted = this.adjustForDiagramTags(text, lastParagraphBreak + 2);
15740
+ return adjusted;
15532
15741
  }
15533
15742
  const newlineIndex = text.lastIndexOf("\n");
15534
15743
  if (newlineIndex >= minPosition) {
15535
- return newlineIndex + 1;
15744
+ return this.adjustForDiagramTags(text, newlineIndex + 1);
15536
15745
  }
15537
15746
  const sentenceEndRegex = /[.!?](?:\s|$)/g;
15538
15747
  let lastSentenceEnd = -1;
@@ -15542,13 +15751,13 @@ var MarkdownChunker = class {
15542
15751
  }
15543
15752
  }
15544
15753
  if (lastSentenceEnd > 0) {
15545
- return lastSentenceEnd;
15754
+ return this.adjustForDiagramTags(text, lastSentenceEnd);
15546
15755
  }
15547
15756
  let lastSpace = text.length;
15548
15757
  while (lastSpace > minPosition) {
15549
15758
  const pos = text.lastIndexOf(" ", lastSpace - 1);
15550
15759
  if (pos >= minPosition) {
15551
- return pos + 1;
15760
+ return this.adjustForDiagramTags(text, pos + 1);
15552
15761
  }
15553
15762
  lastSpace = pos;
15554
15763
  }
@@ -15680,6 +15889,38 @@ var MarkdownChunker = class {
15680
15889
  targetPosition = currentPosition + decoded.length;
15681
15890
  }
15682
15891
  }
15892
+ const diagramCheck = /<diagram>/gi;
15893
+ const diagramCloseCheck = /<\/diagram>/gi;
15894
+ let openDiagramsInSlice = 0;
15895
+ while (diagramCheck.exec(currentSlice) !== null) {
15896
+ openDiagramsInSlice++;
15897
+ }
15898
+ let closeDiagramsInSlice = 0;
15899
+ while (diagramCloseCheck.exec(currentSlice) !== null) {
15900
+ closeDiagramsInSlice++;
15901
+ }
15902
+ if (openDiagramsInSlice > closeDiagramsInSlice) {
15903
+ const lastDiagramOpenIndex = currentSlice.lastIndexOf("<diagram>");
15904
+ if (lastDiagramOpenIndex !== -1) {
15905
+ const remainingText = text.slice(currentPosition + lastDiagramOpenIndex);
15906
+ const closingTagMatch = /<\/diagram>/i.exec(remainingText);
15907
+ if (closingTagMatch) {
15908
+ const closingTagPosition = lastDiagramOpenIndex + closingTagMatch.index + closingTagMatch[0].length;
15909
+ const extendedSlice = text.slice(currentPosition, currentPosition + closingTagPosition);
15910
+ const extendedTokens = tokenizer.encode(extendedSlice);
15911
+ if (extendedTokens.length <= adjustedChunkSize * 1.5) {
15912
+ currentSlice = extendedSlice;
15913
+ targetPosition = currentPosition + closingTagPosition;
15914
+ } else {
15915
+ currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
15916
+ targetPosition = currentPosition + lastDiagramOpenIndex;
15917
+ }
15918
+ } else {
15919
+ currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
15920
+ targetPosition = currentPosition + lastDiagramOpenIndex;
15921
+ }
15922
+ }
15923
+ }
15683
15924
  const breakpointPosition = this.findLogicalBreakpoint(currentSlice);
15684
15925
  if (breakpointPosition !== null) {
15685
15926
  currentSlice = currentSlice.slice(0, breakpointPosition);
@@ -15952,7 +16193,7 @@ Or manually run the setup script:
15952
16193
  var fs2 = __toESM(require("fs"), 1);
15953
16194
  var path = __toESM(require("path"), 1);
15954
16195
  var import_ai7 = require("ai");
15955
- var import_zod12 = require("zod");
16196
+ var import_zod13 = require("zod");
15956
16197
  var import_p_limit = __toESM(require("p-limit"), 1);
15957
16198
  var import_crypto = require("crypto");
15958
16199
  var mammoth = __toESM(require("mammoth"), 1);
@@ -16101,6 +16342,8 @@ ${command}`;
16101
16342
  }
16102
16343
 
16103
16344
  // ee/python/documents/processing/doc_processor.ts
16345
+ var import_liteparse = require("@llamaindex/liteparse");
16346
+ var import_mistralai = require("@mistralai/mistralai");
16104
16347
  async function processDocx(file) {
16105
16348
  const html = await mammoth.convertToHtml({ buffer: file });
16106
16349
  const turndownService = new import_turndown.default();
@@ -16175,52 +16418,93 @@ async function validatePageWithVLM(page, imagePath, model) {
16175
16418
  const imageBuffer = await fs2.promises.readFile(imagePath);
16176
16419
  const imageBase64 = imageBuffer.toString("base64");
16177
16420
  const mimeType = "image/png";
16178
- const prompt = `You are validating OCR/document parsing output for a page that might contain tables and images.
16179
-
16180
- Here is the current OCR/parsed content for this page:
16421
+ const prompt = `You are a document validation assistant. Your task is to analyze a page image and correct the output of an OCR/parsing pipeline. The content may include tables, technical diagrams, schematics, and structured text.
16181
16422
 
16182
16423
  ---
16424
+ ## CURRENT OCR OUTPUT
16425
+
16183
16426
  ${page.content}
16184
16427
  ---
16185
16428
 
16186
- Please analyze the page image and validate it:
16429
+ ## YOUR TASK
16187
16430
 
16188
- 1. Check if the extracted markdown text accurately represents the content from the page, including:
16189
- - Table data (rows, columns, headers, values)
16190
- - Technical diagrams, schematics, control boards
16191
- - Icons, checkmarks, symbols
16192
- - Image captions and labels
16431
+ Compare the page image to the OCR output above. Identify errors, omissions, and formatting issues, then return a structured validation result (see OUTPUT FORMAT below).
16193
16432
 
16194
- 2. If the page has significant errors or omissions, provide a corrected version for the page.
16195
-
16196
- 3. Return a validation result for the page.
16197
-
16198
- IMPORTANT OUTPUT FORMAT REQUIREMENTS:
16199
- - You MUST output all tables in proper Markdown table format using pipes (|) and dashes (---)
16200
- - Use simple separator rows: | --- | --- | (NOT long dashes like ----------------------)
16201
- - Every table must have: header row, separator row, and data rows
16202
- - Example format:
16433
+ ---
16434
+ ## VALIDATION CHECKLIST
16435
+
16436
+ Work through these checks in order:
16437
+
16438
+ ### 1. Text Accuracy
16439
+ - Verify all text is correctly transcribed.
16440
+ - For minor character-level OCR errors (e.g. "\xF6" vs "\xFC", "rn" vs "m"), **prefer the original OCR output** unless you are certain of an error. Do not silently "fix" characters based on guesswork.
16441
+
16442
+ ### 2. Heading Levels
16443
+ - Verify that headings use correct Markdown levels (#, ##, ###, ####, #####, ######).
16444
+ - Determine heading level using the following priority:
16445
+ 1. **Hierarchical numbering** (strongest signal): e.g. "1" \u2192 #, "2.1" \u2192 ##, "2.1.1" \u2192 ###, "2.1.2.5" \u2192 ####
16446
+ 2. Font size (larger = higher level)
16447
+ 3. Indentation
16448
+ 4. Bold/emphasis styling
16449
+
16450
+ ### 3. Tables
16451
+
16452
+ **First, decide whether the table should be Markdown or plain text:**
16453
+ - Use **Markdown table format** if the table has a consistent, clear header structure and uniform column layout throughout.
16454
+ - Use **plain text structured description** if the table:
16455
+ - Lacks a clear header row
16456
+ - Uses mixed or irregular column structures across rows
16457
+ - Functions more like a certificate, form, or label layout
16458
+
16459
+ **If using Markdown format**, follow these rules strictly:
16460
+ - Every table must have: header row \u2192 separator row \u2192 data rows
16461
+ - Use simple separators only: \`| --- | --- |\` (NOT \`|---|---|\` or long dashes)
16462
+ - Example:
16463
+ \`\`\`
16203
16464
  | Column 1 | Column 2 |
16204
16465
  | --- | --- |
16205
- | Data 1 | Data 2 |
16206
- - If the extracted content already has tables, preserve their structure but fix any errors you find in the actual data
16207
- - Do NOT output tables as plain text or in any other format
16208
- - Preserve all markdown formatting (headings with ##, lists, etc.)
16209
-
16210
- Specific notes and guidelines:
16211
- - Some pages might contain a table with a column that show black and white dots (for Example Rufe-LEDs). You should translate this into + for black (meaning active) and - for white (meaning inactive).
16212
- - Some tables might use green or black checkmarks and red or black crosses. You should translate this into + for checkmarks (meaning active) and - for a cross (meaning inactive).
16213
- - IMPORTANT: Only provide corrections if you find actual errors in the content. If the extracted text is accurate, set needs_correction to false.
16214
-
16466
+ | Data 1 | Data 2 |
16467
+ \`\`\`
16468
+ - Important: do not use the | character as part of the data inside a cell, this would break the table, if a cell contains a | character, use a capital I.
16469
+
16470
+ **Symbol translation rules for table cells:**
16471
+ - Black/filled dot \u2192 \`+\` (active); White/empty dot \u2192 \`-\` (inactive)
16472
+ *(e.g. Rufe-LED columns)*
16473
+ - Green or black checkmark \u2192 \`+\` (active); Red or black cross \u2192 \`-\` (inactive)
16474
+
16475
+ ### 4. Multi-Page Table Continuity
16476
+ - If this page contains a table with a header row that runs to the bottom of the page (suggesting it may continue on the next page), extract the header row and include it in the \`current_page_table.headers\` field.
16477
+ - If this page contains a table WITHOUT a header row (suggesting it's a continuation from a previous page), set \`current_page_table.is_continuation\` to true and try to identify what the headers might be based on the data structure. Include your best guess for headers in \`current_page_table.headers\`.
16478
+
16479
+ ### 5. Technical Diagrams & Schematics
16480
+ If the page contains a flow-chart, schematic, technical drawing or control board layout that is **absent or poorly described** in the OCR output do the following:
16481
+ - Open a <diagram> tag with the following content:
16482
+ <diagram>
16483
+ <description>
16484
+ Add a detailed description of the diagram here.
16485
+ </description>
16486
+ <mermaid>
16487
+ Add a mermaid diagram schema here that in detail describes the diagram.
16488
+ </mermaid>
16489
+ </diagram>
16490
+
16491
+ ### 6. Captions, Icons & Symbols
16492
+ - Verify that image captions, labels, icons, and checkmarks are present and correctly transcribed.
16493
+
16494
+ ### 7. Only populate \`corrected_text\` when \`needs_correction\` is true. If the OCR output is accurate, return \`needs_correction: false\` and \`corrected_text: null\`.
16215
16495
  `;
16216
16496
  const result = await (0, import_ai7.generateText)({
16217
16497
  model,
16218
16498
  output: import_ai7.Output.object({
16219
- schema: import_zod12.z.object({
16220
- needs_correction: import_zod12.z.boolean(),
16221
- corrected_text: import_zod12.z.string().nullable(),
16222
- confidence: import_zod12.z.enum(["high", "medium", "low"]),
16223
- reasoning: import_zod12.z.string()
16499
+ schema: import_zod13.z.object({
16500
+ needs_correction: import_zod13.z.boolean(),
16501
+ corrected_text: import_zod13.z.string().nullable(),
16502
+ current_page_table: import_zod13.z.object({
16503
+ headers: import_zod13.z.array(import_zod13.z.string()),
16504
+ is_continuation: import_zod13.z.boolean()
16505
+ }).nullable(),
16506
+ confidence: import_zod13.z.enum(["high", "medium", "low"]),
16507
+ reasoning: import_zod13.z.string()
16224
16508
  })
16225
16509
  }),
16226
16510
  messages: [
@@ -16241,23 +16525,80 @@ Specific notes and guidelines:
16241
16525
  needs_correction: parsedOutput.needs_correction,
16242
16526
  corrected_text: parsedOutput.corrected_text || void 0,
16243
16527
  confidence: parsedOutput.confidence,
16528
+ current_page_table: parsedOutput.current_page_table || void 0,
16244
16529
  reasoning: parsedOutput.reasoning
16245
16530
  };
16246
16531
  return validation;
16247
16532
  }
16533
/**
 * Sequentially walks the parsed pages and re-inserts Markdown table header
 * rows on pages whose table is a continuation of a table started on an
 * earlier page (multi-page tables lose their header row at the page break).
 *
 * Mutates the pages in place: the header + separator rows are spliced into
 * `page.vlm_corrected_text` when present, otherwise into `page.content`.
 *
 * @param {Array<{page: number, content: string, vlm_corrected_text?: string}>} document2
 *   Parsed pages, in page order.
 * @param {Map<number, {current_page_table?: {headers: string[], is_continuation: boolean}|null}>} validationResults
 *   VLM validation results keyed by page number.
 * @param {boolean} [verbose=false] - When true, logs each reconstruction step.
 */
function reconstructTableHeaders(document2, validationResults, verbose = false) {
  // Headers of the most recent table that introduced its own header row;
  // used to patch continuation pages. Reset whenever a page has no table.
  let lastTableHeaders = void 0;
  for (const page of document2) {
    const validation = validationResults.get(page.page);
    if (!validation) continue;
    const tableInfo = validation.current_page_table;
    if (tableInfo && tableInfo.headers.length > 0) {
      if (tableInfo.is_continuation && lastTableHeaders) {
        if (verbose) {
          console.log(`[EXULU] Page ${page.page}: Reconstructing table headers from previous page`);
          console.log(`[EXULU] Previous headers: ${lastTableHeaders.join(" | ")}`);
        }
        // Prefer the VLM-corrected text when present, mirroring the rest of
        // the pipeline.
        const contentToModify = page.vlm_corrected_text || page.content;
        const lines = contentToModify.split("\n");
        const firstTableLineIndex = lines.findIndex((line) => line.trim().startsWith("|"));
        if (firstTableLineIndex !== -1) {
          const headerRow = `| ${lastTableHeaders.join(" | ")} |`;
          const separatorRow = `| ${lastTableHeaders.map(() => "---").join(" | ")} |`;
          lines.splice(firstTableLineIndex, 0, headerRow, separatorRow);
          const reconstructedContent = lines.join("\n");
          if (page.vlm_corrected_text) {
            page.vlm_corrected_text = reconstructedContent;
          } else {
            page.content = reconstructedContent;
          }
          if (verbose) {
            console.log(`[EXULU] Page ${page.page}: Added table headers successfully`);
          }
        }
        // Intentionally keep `lastTableHeaders` unchanged here: the headers
        // reported for a continuation page are only the VLM's best guess.
        // (A previous `if (!tableInfo.is_continuation)` guard in this branch
        // was dead code — is_continuation is always true here — and has been
        // removed; behavior is identical.)
      } else {
        // Either a table that carries its own header row, or a continuation
        // page with no known previous headers: remember these headers in case
        // the table spills onto the next page.
        lastTableHeaders = tableInfo.headers;
        if (verbose) {
          console.log(`[EXULU] Page ${page.page}: Storing table headers for potential continuation`);
          console.log(`[EXULU] Headers: ${lastTableHeaders.join(" | ")}`);
        }
      }
    } else {
      // No table info (or empty headers) on this page: any in-progress
      // multi-page table has ended.
      lastTableHeaders = void 0;
    }
  }
}
16248
16577
  async function validateWithVLM(document2, model, verbose = false, concurrency = 10) {
16249
16578
  console.log(`[EXULU] Starting VLM validation for docling output, ${document2.length} pages...`);
16250
- console.log(
16251
- `[EXULU] Concurrency limit: ${concurrency}`
16252
- );
16579
+ console.log(`[EXULU] Concurrency limit: ${concurrency}`);
16580
+ const limit = (0, import_p_limit.default)(concurrency);
16581
+ const validationResults = /* @__PURE__ */ new Map();
16253
16582
  let validatedCount = 0;
16254
16583
  let correctedCount = 0;
16255
- const limit = (0, import_p_limit.default)(concurrency);
16256
16584
  const validationTasks = document2.map(
16257
16585
  (page) => limit(async () => {
16586
+ await new Promise((resolve3) => setImmediate(resolve3));
16258
16587
  const imagePath = page.image;
16588
+ if (!page.content) {
16589
+ console.warn(`[EXULU] Page ${page.page}: No content found, skipping validation`);
16590
+ return;
16591
+ }
16259
16592
  if (!imagePath) {
16260
- console.log(`[EXULU] Page ${page.page}: No image found, skipping validation`);
16593
+ console.warn(`[EXULU] Page ${page.page}: No image found, skipping validation`);
16594
+ return;
16595
+ }
16596
+ const hasImage = page.content.match(/\.(jpeg|jpg|png|gif|webp)/i);
16597
+ const hasTable = (page.content.match(/\|/g)?.length || 0) > 1;
16598
+ if (!hasImage && !hasTable) {
16599
+ if (verbose) {
16600
+ console.log(`[EXULU] Page ${page.page}: No image or table found, SKIPPING VLM validation`);
16601
+ }
16261
16602
  return;
16262
16603
  }
16263
16604
  let validation;
@@ -16265,6 +16606,13 @@ async function validateWithVLM(document2, model, verbose = false, concurrency =
16265
16606
  validation = await withRetry(async () => {
16266
16607
  return await validatePageWithVLM(page, imagePath, model);
16267
16608
  }, 3);
16609
+ validationResults.set(page.page, validation);
16610
+ if (verbose && validation.current_page_table) {
16611
+ console.log(`[EXULU] Page ${page.page} table info:`, {
16612
+ headers: validation.current_page_table.headers,
16613
+ is_continuation: validation.current_page_table.is_continuation
16614
+ });
16615
+ }
16268
16616
  } catch (error) {
16269
16617
  console.error(`[EXULU] Error validating page ${page.page} with VLM more than 3 times, skipping:`, error);
16270
16618
  throw error;
@@ -16295,9 +16643,12 @@ async function validateWithVLM(document2, model, verbose = false, concurrency =
16295
16643
  })
16296
16644
  );
16297
16645
  await Promise.all(validationTasks);
16298
- console.log(`[EXULU] VLM validation complete:`);
16299
- console.log(`[EXULU] Validated: ${validatedCount} chunks`);
16300
- console.log(`[EXULU] Corrected: ${correctedCount} chunks`);
16646
+ console.log(`[EXULU] VLM validation complete (parallel processing):`);
16647
+ console.log(`[EXULU] Validated: ${validatedCount} pages`);
16648
+ console.log(`[EXULU] Corrected: ${correctedCount} pages`);
16649
+ console.log(`[EXULU] Starting sequential table header reconstruction...`);
16650
+ reconstructTableHeaders(document2, validationResults, verbose);
16651
+ console.log(`[EXULU] Table header reconstruction complete`);
16301
16652
  return document2;
16302
16653
  }
16303
16654
  async function processDocument(filePath, fileType, buffer, tempDir, config, verbose = false) {
@@ -16312,15 +16663,6 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
16312
16663
  const stripped = filePath.split(".").pop()?.trim();
16313
16664
  let result;
16314
16665
  switch (stripped) {
16315
- case "pdf":
16316
- result = await processPdf(buffer, paths, config, verbose);
16317
- break;
16318
- case "docx":
16319
- result = await processDocx(buffer);
16320
- break;
16321
- case "doc":
16322
- result = await processWord(buffer);
16323
- break;
16324
16666
  case "txt":
16325
16667
  case "md":
16326
16668
  let content = buffer.toString();
@@ -16334,6 +16676,15 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
16334
16676
  }]
16335
16677
  };
16336
16678
  break;
16679
+ case "pdf":
16680
+ result = await processPdf(buffer, paths, config, verbose);
16681
+ break;
16682
+ case "docx":
16683
+ result = await processDocx(buffer);
16684
+ break;
16685
+ case "doc":
16686
+ result = await processWord(buffer);
16687
+ break;
16337
16688
  // Todo other file types with docx and officeparser
16338
16689
  default:
16339
16690
  throw new Error(`[EXULU] Unsupported file type: ${fileType}`);
@@ -16346,10 +16697,17 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
16346
16697
  }
16347
16698
  };
16348
16699
  }
16700
/**
 * Resolves the Mistral API key. The MISTRAL_API_KEY environment variable
 * wins; otherwise the key is looked up from Exulu variables.
 * @returns {Promise<string|undefined>} The resolved API key, if any.
 */
var getMistralApiKey = async () => {
  const envKey = process.env.MISTRAL_API_KEY;
  // Truthy check (not ??) on purpose: an empty-string env var falls through
  // to the Exulu variable lookup, exactly as before.
  return envKey ? envKey : await ExuluVariables.get("MISTRAL_API_KEY");
};
16349
16707
  async function processPdf(buffer, paths, config, verbose = false) {
16350
16708
  try {
16351
- let json;
16352
- if (config?.docling) {
16709
+ let json = [];
16710
+ if (config?.processor.name === "docling") {
16353
16711
  console.log(`[EXULU] Validating Python environment...`);
16354
16712
  const validation = await validatePythonEnvironment(void 0, true);
16355
16713
  if (!validation.valid) {
@@ -16390,7 +16748,7 @@ ${setupResult.output || ""}`);
16390
16748
  }
16391
16749
  const jsonContent = await fs2.promises.readFile(paths.json, "utf-8");
16392
16750
  json = JSON.parse(jsonContent);
16393
- } else {
16751
+ } else if (config?.processor.name === "officeparser") {
16394
16752
  const text = await (0, import_officeparser2.parseOfficeAsync)(buffer, {
16395
16753
  outputErrorToConsole: false,
16396
16754
  newlineDelimiter: "\n"
@@ -16400,15 +16758,70 @@ ${setupResult.output || ""}`);
16400
16758
  content: text,
16401
16759
  headings: []
16402
16760
  }];
16761
+ } else if (config?.processor.name === "mistral") {
16762
+ const MISTRAL_API_KEY = await getMistralApiKey();
16763
+ if (MISTRAL_API_KEY) {
16764
+ throw new Error('[EXULU] MISTRAL_API_KEY is not set, please set it in the environment variable via process.env or via an Exulu variable named "MISTRAL_API_KEY".');
16765
+ }
16766
+ await new Promise((resolve3) => setTimeout(resolve3, Math.floor(Math.random() * 4e3) + 1e3));
16767
+ const base64Pdf = buffer.toString("base64");
16768
+ const client2 = new import_mistralai.Mistral({ apiKey: MISTRAL_API_KEY });
16769
+ const ocrResponse = await withRetry(async () => {
16770
+ const ocrResponse2 = await client2.ocr.process({
16771
+ document: {
16772
+ type: "document_url",
16773
+ documentUrl: "data:application/pdf;base64," + base64Pdf
16774
+ },
16775
+ model: "mistral-ocr-latest",
16776
+ includeImageBase64: false
16777
+ });
16778
+ return ocrResponse2;
16779
+ }, 10);
16780
+ const parser = new import_liteparse.LiteParse();
16781
+ const screenshots = await parser.screenshot(paths.source, void 0);
16782
+ await fs2.promises.mkdir(paths.images, { recursive: true });
16783
+ for (const screenshot of screenshots) {
16784
+ await fs2.promises.writeFile(
16785
+ path.join(
16786
+ paths.images,
16787
+ `${screenshot.pageNum}.png`
16788
+ ),
16789
+ screenshot.imageBuffer
16790
+ );
16791
+ screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
16792
+ }
16793
+ json = ocrResponse.pages.map((page) => ({
16794
+ page: page.index + 1,
16795
+ content: page.markdown,
16796
+ image: screenshots.find((s) => s.pageNum === page.index + 1)?.imagePath,
16797
+ headings: []
16798
+ }));
16799
+ fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
16800
+ } else if (config?.processor.name === "liteparse") {
16801
+ const parser = new import_liteparse.LiteParse();
16802
+ const result = await parser.parse(paths.source);
16803
+ const screenshots = await parser.screenshot(paths.source, void 0);
16804
+ console.log(`[EXULU] Liteparse screenshots: ${JSON.stringify(screenshots)}`);
16805
+ await fs2.promises.mkdir(paths.images, { recursive: true });
16806
+ for (const screenshot of screenshots) {
16807
+ await fs2.promises.writeFile(path.join(paths.images, `${screenshot.pageNum}.png`), screenshot.imageBuffer);
16808
+ screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
16809
+ }
16810
+ json = result.pages.map((page) => ({
16811
+ page: page.pageNum,
16812
+ content: page.text,
16813
+ image: screenshots.find((s) => s.pageNum === page.pageNum)?.imagePath
16814
+ }));
16815
+ fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
16403
16816
  }
16404
16817
  console.log(`[EXULU]
16405
16818
  \u2713 Document processing completed successfully`);
16406
16819
  console.log(`[EXULU] Total pages: ${json.length}`);
16407
16820
  console.log(`[EXULU] Output file: ${paths.json}`);
16408
- if (!config?.docling && config?.vlm?.model) {
16821
+ if (config?.vlm?.model) {
16409
16822
  console.error("[EXULU] VLM validation is only supported when docling is enabled, skipping validation.");
16410
16823
  }
16411
- if (config?.docling && config?.vlm?.model) {
16824
+ if (config?.vlm?.model && json.length > 0) {
16412
16825
  json = await validateWithVLM(
16413
16826
  json,
16414
16827
  config.vlm.model,
@@ -16436,29 +16849,37 @@ ${setupResult.output || ""}`);
16436
16849
  "utf-8"
16437
16850
  );
16438
16851
  }
16439
- const markdown = json.map((p) => {
16440
- if (p.vlm_corrected_text) {
16441
- return p.vlm_corrected_text;
16442
- } else {
16443
- return p.content;
16852
+ const markdownStream = fs2.createWriteStream(paths.markdown, { encoding: "utf-8" });
16853
+ for (let i = 0; i < json.length; i++) {
16854
+ const p = json[i];
16855
+ if (!p) continue;
16856
+ const content = p.vlm_corrected_text ?? p.content;
16857
+ markdownStream.write(content);
16858
+ if (i < json.length - 1) {
16859
+ markdownStream.write("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
16444
16860
  }
16445
- }).join("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
16446
- await fs2.promises.writeFile(
16447
- paths.markdown,
16448
- markdown,
16449
- "utf-8"
16450
- );
16861
+ }
16862
+ await new Promise((resolve3, reject) => {
16863
+ markdownStream.end(() => resolve3());
16864
+ markdownStream.on("error", reject);
16865
+ });
16451
16866
  console.log(`[EXULU] Validated output saved to: ${paths.json}`);
16452
16867
  console.log(`[EXULU] Validated markdown saved to: ${paths.markdown}`);
16868
+ const markdown = await fs2.promises.readFile(paths.markdown, "utf-8");
16869
+ const processedJson = json.map((e) => {
16870
+ const finalContent = e.vlm_corrected_text ?? e.content;
16871
+ return {
16872
+ page: e.page,
16873
+ content: finalContent
16874
+ };
16875
+ });
16876
+ json.length = 0;
16877
+ json = [];
16878
+ const memUsage = process.memoryUsage();
16879
+ console.log(`[EXULU] Memory after document processing: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`);
16453
16880
  return {
16454
16881
  markdown,
16455
- json: json.map((e) => {
16456
- const finalContent = e.vlm_corrected_text || e.content;
16457
- return {
16458
- page: e.page,
16459
- content: finalContent
16460
- };
16461
- })
16882
+ json: processedJson
16462
16883
  };
16463
16884
  } catch (error) {
16464
16885
  console.error("[EXULU] Error processing document:", error);
@@ -16471,9 +16892,9 @@ var loadFile = async (file, name, tempDir) => {
16471
16892
  if (!fileType) {
16472
16893
  throw new Error("[EXULU] File name does not include extension, extension is required for document processing.");
16473
16894
  }
16895
+ const UUID = (0, import_crypto.randomUUID)();
16474
16896
  let buffer;
16475
16897
  if (Buffer.isBuffer(file)) {
16476
- const UUID = (0, import_crypto.randomUUID)();
16477
16898
  filePath = path.join(tempDir, `${UUID}.${fileType}`);
16478
16899
  await fs2.promises.writeFile(filePath, file);
16479
16900
  buffer = file;
@@ -16482,7 +16903,10 @@ var loadFile = async (file, name, tempDir) => {
16482
16903
  if (filePath.startsWith("http")) {
16483
16904
  const response = await fetch(filePath);
16484
16905
  const array = await response.arrayBuffer();
16906
+ const tempFilePath = path.join(tempDir, `${UUID}.${fileType}`);
16907
+ await fs2.promises.writeFile(tempFilePath, Buffer.from(array));
16485
16908
  buffer = Buffer.from(array);
16909
+ filePath = tempFilePath;
16486
16910
  } else {
16487
16911
  buffer = await fs2.promises.readFile(file);
16488
16912
  }
@@ -16500,17 +16924,34 @@ async function documentProcessor({
16500
16924
  }
16501
16925
  const uuid = (0, import_crypto.randomUUID)();
16502
16926
  const tempDir = path.join(process.cwd(), "temp", uuid);
16927
+ const localFilesAndFoldersToDelete = [tempDir];
16503
16928
  console.log(`[EXULU] Temporary directory for processing document ${name}: ${tempDir}`);
16504
16929
  await fs2.promises.mkdir(tempDir, { recursive: true });
16930
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString();
16931
+ await fs2.promises.writeFile(path.join(tempDir, "created_at.txt"), timestamp);
16505
16932
  try {
16506
16933
  const {
16507
16934
  filePath,
16508
16935
  fileType,
16509
16936
  buffer
16510
16937
  } = await loadFile(file, name, tempDir);
16511
- const supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
16938
+ let supportedTypes = [];
16939
+ switch (config?.processor.name) {
16940
+ case "docling":
16941
+ supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
16942
+ break;
16943
+ case "officeparser":
16944
+ supportedTypes = [];
16945
+ break;
16946
+ case "liteparse":
16947
+ supportedTypes = ["pdf", "doc", "docx", "docm", "odt", "rtf", "ppt", "pptx", "pptm", "odp", "xls", "xlsx", "xlsm", "ods", "csv", "tsv"];
16948
+ break;
16949
+ case "mistral":
16950
+ supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
16951
+ break;
16952
+ }
16512
16953
  if (!supportedTypes.includes(fileType)) {
16513
- throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor.`);
16954
+ throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor, the ${config?.processor.name} processor only supports the following file types: ${supportedTypes.join(", ")}.`);
16514
16955
  }
16515
16956
  const { content } = await processDocument(
16516
16957
  filePath,
@@ -16523,9 +16964,19 @@ async function documentProcessor({
16523
16964
  return content.json;
16524
16965
  } catch (error) {
16525
16966
  console.error("Error during chunking:", error);
16526
- return void 0;
16967
+ throw error;
16527
16968
  } finally {
16528
- await fs2.promises.rm(tempDir, { recursive: true });
16969
+ if (config?.debugging?.deleteTempFiles !== false) {
16970
+ for (const file2 of localFilesAndFoldersToDelete) {
16971
+ try {
16972
+ await fs2.promises.rm(file2, { recursive: true });
16973
+ console.log(`[EXULU] Deleted file or folder: ${file2}`);
16974
+ } catch (error) {
16975
+ console.error(`[EXULU] Error deleting file or folder: ${file2}`, error);
16976
+ console.log(`[EXULU] File or folder still exists: ${file2}`);
16977
+ }
16978
+ }
16979
+ }
16529
16980
  }
16530
16981
  }
16531
16982