@exulu/backend 1.50.0 → 1.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -205,17 +205,19 @@ async function postgresClient() {
205
205
  // 30 minutes
206
206
  },
207
207
  pool: {
208
- min: 5,
209
- // Increased from 2 to ensure enough connections available
210
- max: 50,
211
- // Increased from 20 to handle more concurrent operations with processor jobs
212
- acquireTimeoutMillis: 6e4,
213
- // Increased from 30s to 60s to handle pool contention
208
+ min: 10,
209
+ // Minimum connections always ready
210
+ max: 300,
211
+ // Increased to support high worker concurrency (250+ concurrent jobs)
212
+ acquireTimeoutMillis: 12e4,
213
+ // 2 minutes - increased to handle high contention during bursts
214
214
  createTimeoutMillis: 3e4,
215
215
  idleTimeoutMillis: 6e4,
216
- // Increased to keep connections alive longer
216
+ // Keep connections alive for reuse
217
217
  reapIntervalMillis: 1e3,
218
218
  createRetryIntervalMillis: 200,
219
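+ // Disable propagateCreateError so a failed connection attempt is retried (every createRetryIntervalMillis) until acquireTimeoutMillis elapses, instead of rejecting the pending acquire on the first creation error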
220
+ propagateCreateError: false,
219
221
  // Log pool events to help debug connection issues
220
222
  afterCreate: (conn, done) => {
221
223
  console.log("[EXULU] New database connection created");
@@ -3618,7 +3620,7 @@ var convertExuluToolsToAiSdkTools = async (currentTools, approvedTools, allExulu
3618
3620
  description,
3619
3621
  // The approvedTools array uses the tool.name lookup as the frontend
3620
3622
  // Vercel AI SDK uses the sanitized tool name as the key, so this matches.
3621
- needsApproval: approvedTools?.includes("tool-" + cur.name) ? false : true,
3623
+ needsApproval: approvedTools?.includes("tool-" + cur.name) || !cur.needsApproval ? false : true,
3622
3624
  // todo make configurable
3623
3625
  async *execute(inputs, options) {
3624
3626
  console.log(
@@ -3768,6 +3770,7 @@ var ExuluTool = class {
3768
3770
  inputSchema;
3769
3771
  type;
3770
3772
  tool;
3773
+ needsApproval;
3771
3774
  config;
3772
3775
  constructor({
3773
3776
  id,
@@ -3777,10 +3780,12 @@ var ExuluTool = class {
3777
3780
  inputSchema,
3778
3781
  type,
3779
3782
  execute: execute2,
3780
- config
3783
+ config,
3784
+ needsApproval
3781
3785
  }) {
3782
3786
  this.id = id;
3783
3787
  this.config = config;
3788
+ this.needsApproval = needsApproval ?? true;
3784
3789
  this.category = category || "default";
3785
3790
  this.name = name;
3786
3791
  this.description = description;
@@ -4187,11 +4192,30 @@ var ExuluContext2 = class {
4187
4192
  );
4188
4193
  await db2.from(getChunksTableName(this.id)).where({ source }).delete();
4189
4194
  if (chunks?.length) {
4195
+ const sanitizeString = (str) => {
4196
+ if (!str) return "";
4197
+ return str.replace(/\0/g, "");
4198
+ };
4199
+ const sanitizeMetadata2 = (metadata) => {
4200
+ if (!metadata) return {};
4201
+ const sanitized = {};
4202
+ for (const [key, value] of Object.entries(metadata)) {
4203
+ if (typeof value === "string") {
4204
+ sanitized[key] = sanitizeString(value);
4205
+ } else {
4206
+ sanitized[key] = value;
4207
+ }
4208
+ }
4209
+ return sanitized;
4210
+ };
4190
4211
  await db2.from(getChunksTableName(this.id)).insert(
4191
4212
  chunks.map((chunk) => ({
4192
- source,
4193
- metadata: chunk.metadata,
4194
- content: chunk.content,
4213
+ // Sanitize source to remove null bytes
4214
+ source: sanitizeString(source),
4215
+ // Sanitize metadata to remove null bytes from string values
4216
+ metadata: sanitizeMetadata2(chunk.metadata),
4217
+ // Remove null bytes (0x00) which are invalid in PostgreSQL UTF8 encoding
4218
+ content: sanitizeString(chunk.content),
4195
4219
  chunk_index: chunk.index,
4196
4220
  embedding: import_knex5.default.toSql(chunk.vector)
4197
4221
  }))
@@ -4580,6 +4604,8 @@ var ExuluContext2 = class {
4580
4604
  name: `${this.name}_context_search`,
4581
4605
  type: "context",
4582
4606
  category: "contexts",
4607
+ needsApproval: true,
4608
+ // todo make configurable
4583
4609
  inputSchema: import_zod4.z.object({
4584
4610
  query: import_zod4.z.string().describe("The original question that the user asked"),
4585
4611
  keywords: import_zod4.z.array(import_zod4.z.string()).describe(
@@ -5773,7 +5799,7 @@ var finalizeRequestedFields = async ({
5773
5799
  return result;
5774
5800
  }
5775
5801
  const { db: db2 } = await postgresClient();
5776
- const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt");
5802
+ const query = db2.from(getChunksTableName(context.id)).where({ source: result.id }).select("id", "content", "source", "chunk_index", "createdAt", "updatedAt", "metadata");
5777
5803
  const chunks = await query;
5778
5804
  result.chunks = chunks.map((chunk) => ({
5779
5805
  chunk_content: chunk.content,
@@ -5786,7 +5812,8 @@ var finalizeRequestedFields = async ({
5786
5812
  item_created_at: chunk.item_created_at,
5787
5813
  item_id: chunk.item_id,
5788
5814
  item_external_id: chunk.item_external_id,
5789
- item_name: chunk.item_name
5815
+ item_name: chunk.item_name,
5816
+ chunk_metadata: chunk.metadata
5790
5817
  }));
5791
5818
  }
5792
5819
  }
@@ -7160,6 +7187,36 @@ var import_ai3 = require("ai");
7160
7187
  var import_crypto_js4 = __toESM(require("crypto-js"), 1);
7161
7188
  var redisConnection;
7162
7189
  var unhandledRejectionHandlerInstalled = false;
7190
+ var poolMonitoringInterval;
7191
+ var startPoolMonitoring = () => {
7192
+ if (poolMonitoringInterval) return;
7193
+ poolMonitoringInterval = setInterval(async () => {
7194
+ try {
7195
+ const { db: db2 } = await postgresClient();
7196
+ const poolStats = db2.client.pool;
7197
+ if (poolStats) {
7198
+ const used = poolStats.numUsed?.() || 0;
7199
+ const free = poolStats.numFree?.() || 0;
7200
+ const pending = poolStats.numPendingAcquires?.() || 0;
7201
+ const total = used + free;
7202
+ console.log("[EXULU] Connection pool health check:", {
7203
+ used,
7204
+ free,
7205
+ pending,
7206
+ total,
7207
+ utilization: total > 0 ? `${Math.round(used / total * 100)}%` : "0%"
7208
+ });
7209
+ if (pending > 10) {
7210
+ console.warn(
7211
+ `[EXULU] WARNING: ${pending} jobs waiting for database connections. Consider increasing pool size or reducing worker concurrency.`
7212
+ );
7213
+ }
7214
+ }
7215
+ } catch (error) {
7216
+ console.error("[EXULU] Error checking pool health:", error);
7217
+ }
7218
+ }, 3e4);
7219
+ };
7163
7220
  var installGlobalErrorHandlers = () => {
7164
7221
  if (unhandledRejectionHandlerInstalled) return;
7165
7222
  process.on("unhandledRejection", (reason) => {
@@ -7184,6 +7241,7 @@ var installGlobalErrorHandlers = () => {
7184
7241
  unhandledRejectionHandlerInstalled = true;
7185
7242
  console.log("[EXULU] Global error handlers installed to prevent worker crashes");
7186
7243
  };
7244
+ var isShuttingDown = false;
7187
7245
  var createWorkers = async (providers, queues2, config, contexts, rerankers, evals, tools, tracer) => {
7188
7246
  console.log("[EXULU] creating workers for " + queues2?.length + " queues.");
7189
7247
  console.log(
@@ -7191,7 +7249,8 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7191
7249
  queues2.map((q) => q.queue.name)
7192
7250
  );
7193
7251
  installGlobalErrorHandlers();
7194
- process.setMaxListeners(Math.max(queues2.length * 2 + 5, 15));
7252
+ startPoolMonitoring();
7253
+ process.setMaxListeners(Math.max(15, process.getMaxListeners()));
7195
7254
  if (!redisServer.host || !redisServer.port) {
7196
7255
  console.error(
7197
7256
  "[EXULU] you are trying to start worker, but no redis server is configured in the environment."
@@ -7224,7 +7283,53 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7224
7283
  status: await bullmqJob.getState(),
7225
7284
  type: bullmqJob.data.type
7226
7285
  });
7227
- const { db: db2 } = await postgresClient();
7286
+ let progressInterval;
7287
+ if (bullmqJob.data.type === "processor") {
7288
+ progressInterval = setInterval(async () => {
7289
+ try {
7290
+ await bullmqJob.updateProgress({
7291
+ status: "processing",
7292
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
7293
+ });
7294
+ console.log(`[EXULU] Job ${bullmqJob.id} heartbeat sent to prevent stalling`);
7295
+ } catch (error) {
7296
+ console.error(`[EXULU] Error updating job progress:`, error);
7297
+ }
7298
+ }, 25e3);
7299
+ }
7300
+ let db2;
7301
+ let retries = 3;
7302
+ let lastError;
7303
+ for (let attempt = 1; attempt <= retries; attempt++) {
7304
+ try {
7305
+ const client2 = await postgresClient();
7306
+ db2 = client2.db;
7307
+ const poolStats = db2.client.pool;
7308
+ if (poolStats) {
7309
+ console.log(`[EXULU] Connection pool stats for job ${bullmqJob.id}:`, {
7310
+ size: poolStats.numUsed?.() || 0,
7311
+ available: poolStats.numFree?.() || 0,
7312
+ pending: poolStats.numPendingAcquires?.() || 0
7313
+ });
7314
+ }
7315
+ break;
7316
+ } catch (error) {
7317
+ lastError = error instanceof Error ? error : new Error(String(error));
7318
+ console.error(
7319
+ `[EXULU] Failed to acquire database connection (attempt ${attempt}/${retries}) for job ${bullmqJob.id}:`,
7320
+ lastError.message
7321
+ );
7322
+ if (attempt < retries) {
7323
+ const backoffMs = 500 * Math.pow(2, attempt - 1);
7324
+ await new Promise((resolve3) => setTimeout(resolve3, backoffMs));
7325
+ }
7326
+ }
7327
+ }
7328
+ if (!db2) {
7329
+ throw new Error(
7330
+ `Failed to acquire database connection after ${retries} attempts: ${lastError?.message}`
7331
+ );
7332
+ }
7228
7333
  const data = bullmqJob.data;
7229
7334
  const timeoutInSeconds = data.timeoutInSeconds || queue.timeoutInSeconds || 600;
7230
7335
  const timeoutMs = timeoutInSeconds * 1e3;
@@ -7316,7 +7421,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7316
7421
  }
7317
7422
  const exuluStorage = new ExuluStorage({ config });
7318
7423
  console.log("[EXULU] POS 2 -- EXULU CONTEXT PROCESS FIELD");
7319
- const processorResult = await context.processor.execute({
7424
+ let processorResult = await context.processor.execute({
7320
7425
  item: data.inputs,
7321
7426
  user: data.user,
7322
7427
  role: data.role,
@@ -7331,12 +7436,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7331
7436
  );
7332
7437
  }
7333
7438
  delete processorResult.field;
7439
+ const updateData = { ...processorResult };
7334
7440
  await db2.from(getTableName(context.id)).where({
7335
7441
  id: processorResult.id
7336
7442
  }).update({
7337
- ...processorResult,
7443
+ ...updateData,
7338
7444
  last_processed_at: (/* @__PURE__ */ new Date()).toISOString()
7339
7445
  });
7446
+ Object.keys(updateData).forEach((key) => {
7447
+ delete updateData[key];
7448
+ });
7340
7449
  let jobs = [];
7341
7450
  if (context.processor?.config?.generateEmbeddings) {
7342
7451
  const fullItem = await db2.from(getTableName(context.id)).where({
@@ -7358,12 +7467,18 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7358
7467
  jobs.push(embeddingsJob);
7359
7468
  }
7360
7469
  }
7361
- return {
7362
- result: processorResult,
7470
+ const result = {
7471
+ result: { id: processorResult.id },
7363
7472
  metadata: {
7364
7473
  jobs: jobs.length > 0 ? jobs.join(",") : void 0
7365
7474
  }
7366
7475
  };
7476
+ processorResult = null;
7477
+ const memUsage = process.memoryUsage();
7478
+ console.log(
7479
+ `[EXULU] Memory after processor job ${bullmqJob.id}: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`
7480
+ );
7481
+ return result;
7367
7482
  }
7368
7483
  if (data.type === "workflow") {
7369
7484
  console.log("[EXULU] running a workflow job.", bullmqJob.name);
@@ -7382,10 +7497,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7382
7497
  user,
7383
7498
  messages: inputMessages
7384
7499
  } = await validateWorkflowPayload(data, providers);
7385
- const retries = 3;
7500
+ const retries2 = 3;
7386
7501
  let attempts = 0;
7387
7502
  const promise = new Promise(async (resolve3, reject) => {
7388
- while (attempts < retries) {
7503
+ while (attempts < retries2) {
7389
7504
  try {
7390
7505
  const messages2 = await processUiMessagesFlow({
7391
7506
  providers,
@@ -7407,7 +7522,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7407
7522
  error instanceof Error ? error.message : String(error)
7408
7523
  );
7409
7524
  attempts++;
7410
- if (attempts >= retries) {
7525
+ if (attempts >= retries2) {
7411
7526
  reject(new Error(error instanceof Error ? error.message : String(error)));
7412
7527
  }
7413
7528
  await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
@@ -7458,10 +7573,10 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7458
7573
  testCase,
7459
7574
  messages: inputMessages
7460
7575
  } = await validateEvalPayload(data, providers);
7461
- const retries = 3;
7576
+ const retries2 = 3;
7462
7577
  let attempts = 0;
7463
7578
  const promise = new Promise(async (resolve3, reject) => {
7464
- while (attempts < retries) {
7579
+ while (attempts < retries2) {
7465
7580
  try {
7466
7581
  const messages2 = await processUiMessagesFlow({
7467
7582
  providers,
@@ -7482,7 +7597,7 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7482
7597
  error instanceof Error ? error.message : String(error)
7483
7598
  );
7484
7599
  attempts++;
7485
- if (attempts >= retries) {
7600
+ if (attempts >= retries2) {
7486
7601
  reject(new Error(error instanceof Error ? error.message : String(error)));
7487
7602
  }
7488
7603
  await new Promise((resolve4) => setTimeout((resolve5) => resolve5(true), 2e3));
@@ -7731,9 +7846,15 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7731
7846
  try {
7732
7847
  const result = await Promise.race([workPromise, timeoutPromise]);
7733
7848
  clearTimeout(timeoutHandle);
7849
+ if (progressInterval) {
7850
+ clearInterval(progressInterval);
7851
+ }
7734
7852
  return result;
7735
7853
  } catch (error) {
7736
7854
  clearTimeout(timeoutHandle);
7855
+ if (progressInterval) {
7856
+ clearInterval(progressInterval);
7857
+ }
7737
7858
  console.error(
7738
7859
  `[EXULU] job ${bullmqJob.id} failed (error caught in race handler).`,
7739
7860
  error instanceof Error ? error.message : String(error)
@@ -7747,6 +7868,16 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7747
7868
  concurrency: queue.concurrency?.worker || 1,
7748
7869
  removeOnComplete: { count: 1e3 },
7749
7870
  removeOnFail: { count: 5e3 },
7871
+ // Configure settings for long-running jobs (especially processor jobs)
7872
+ // lockDuration: How long a worker can hold a job before it's considered stalled
7873
+ // Set to 5 minutes to accommodate CPU-intensive operations
7874
+ lockDuration: 3e5,
7875
+ // 5 minutes in milliseconds
7876
+ // stalledInterval: How often to check for stalled jobs
7877
+ // Set to 2 minutes to reduce false positives for long-running operations
7878
+ stalledInterval: 12e4,
7879
+ // 2 minutes in milliseconds
7880
+ maxStalledCount: 1,
7750
7881
  ...queue.ratelimit && {
7751
7882
  limiter: {
7752
7883
  max: queue.ratelimit,
@@ -7783,24 +7914,68 @@ var createWorkers = async (providers, queues2, config, contexts, rerankers, eval
7783
7914
  error: error instanceof Error ? error.message : String(error)
7784
7915
  } : error
7785
7916
  );
7917
+ throw error;
7786
7918
  });
7787
7919
  worker.on("error", (error) => {
7788
7920
  console.error(`[EXULU] worker error.`, error);
7921
+ throw error;
7789
7922
  });
7790
7923
  worker.on("progress", (job, progress) => {
7791
7924
  console.log(`[EXULU] job progress ${job.id}.`, job.name, {
7792
7925
  progress
7793
7926
  });
7794
7927
  });
7795
- const gracefulShutdown = async (signal) => {
7796
- console.log(`Received ${signal}, closing server...`);
7797
- await worker.close();
7798
- process.exit(0);
7799
- };
7800
- process.on("SIGINT", () => gracefulShutdown("SIGINT"));
7801
- process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
7802
7928
  return worker;
7803
7929
  });
7930
+ const gracefulShutdown = async (signal) => {
7931
+ if (isShuttingDown) {
7932
+ console.log(`[EXULU] Shutdown already in progress, ignoring additional ${signal}`);
7933
+ return;
7934
+ }
7935
+ isShuttingDown = true;
7936
+ console.log(`[EXULU] Received ${signal}, shutting down gracefully...`);
7937
+ try {
7938
+ if (poolMonitoringInterval) {
7939
+ clearInterval(poolMonitoringInterval);
7940
+ poolMonitoringInterval = void 0;
7941
+ }
7942
+ console.log(`[EXULU] Closing ${workers.length} worker(s)...`);
7943
+ const closePromises = workers.map(async (worker, index) => {
7944
+ try {
7945
+ await Promise.race([
7946
+ worker.close(),
7947
+ new Promise(
7948
+ (_, reject) => setTimeout(() => reject(new Error("Worker close timeout")), 3e4)
7949
+ )
7950
+ ]);
7951
+ console.log(`[EXULU] Worker ${index + 1} closed successfully`);
7952
+ } catch (error) {
7953
+ console.error(`[EXULU] Error closing worker ${index + 1}:`, error);
7954
+ }
7955
+ });
7956
+ await Promise.allSettled(closePromises);
7957
+ if (redisConnection) {
7958
+ console.log(`[EXULU] Closing Redis connection...`);
7959
+ await redisConnection.quit();
7960
+ }
7961
+ try {
7962
+ const { db: db2 } = await postgresClient();
7963
+ if (db2?.client) {
7964
+ console.log(`[EXULU] Closing database connection pool...`);
7965
+ await db2.client.destroy();
7966
+ }
7967
+ } catch (error) {
7968
+ console.error(`[EXULU] Error closing database:`, error);
7969
+ }
7970
+ console.log(`[EXULU] Graceful shutdown complete`);
7971
+ process.exit(0);
7972
+ } catch (error) {
7973
+ console.error(`[EXULU] Error during graceful shutdown:`, error);
7974
+ process.exit(1);
7975
+ }
7976
+ };
7977
+ process.once("SIGINT", () => gracefulShutdown("SIGINT"));
7978
+ process.once("SIGTERM", () => gracefulShutdown("SIGTERM"));
7804
7979
  return workers;
7805
7980
  };
7806
7981
  var validateWorkflowPayload = async (data, providers) => {
@@ -9514,6 +9689,7 @@ type AgentEvalFunctionConfig {
9514
9689
 
9515
9690
  type ItemChunks {
9516
9691
  chunk_id: String!
9692
+ chunk_metadata: JSON!
9517
9693
  chunk_index: Int!
9518
9694
  chunk_content: String!
9519
9695
  chunk_source: String!
@@ -9920,7 +10096,7 @@ var ExuluProvider = class {
9920
10096
  prompt: import_zod7.z.string().describe("The prompt (usually a question for the agent) to send to the agent."),
9921
10097
  information: import_zod7.z.string().describe("A summary of relevant context / information from the current session")
9922
10098
  }),
9923
- description: `This tool calls an AI agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
10099
+ description: `This tool calls an agent named: ${agent.name}. The agent does the following: ${agent.description}.`,
9924
10100
  config: [],
9925
10101
  execute: async ({ prompt, information, user, allExuluTools }) => {
9926
10102
  const hasAccessToAgent = await checkRecordAccess(agent, "read", user);
@@ -10033,9 +10209,6 @@ var ExuluProvider = class {
10033
10209
  if (!prompt && !inputMessages?.length) {
10034
10210
  throw new Error("Prompt or message is required for generating.");
10035
10211
  }
10036
- if (outputSchema && !prompt) {
10037
- throw new Error("Prompt is required for generating with an output schema.");
10038
- }
10039
10212
  const model = this.model.create({
10040
10213
  ...providerapikey ? { apiKey: providerapikey } : {}
10041
10214
  });
@@ -10172,14 +10345,18 @@ var ExuluProvider = class {
10172
10345
  let inputTokens = 0;
10173
10346
  let outputTokens = 0;
10174
10347
  if (outputSchema) {
10175
- const { object, usage } = await (0, import_ai4.generateObject)({
10348
+ const { output, usage } = await (0, import_ai4.generateText)({
10176
10349
  model,
10177
10350
  system,
10178
- prompt,
10179
10351
  maxRetries: 3,
10180
- schema: outputSchema
10352
+ output: import_ai4.Output.object({
10353
+ schema: outputSchema
10354
+ }),
10355
+ prompt,
10356
+ stopWhen: [(0, import_ai4.stepCountIs)(5)]
10357
+ // make configurable
10181
10358
  });
10182
- result.object = object;
10359
+ result.object = output;
10183
10360
  inputTokens = usage.inputTokens || 0;
10184
10361
  outputTokens = usage.outputTokens || 0;
10185
10362
  } else {
@@ -10210,6 +10387,7 @@ var ExuluProvider = class {
10210
10387
  agent
10211
10388
  ),
10212
10389
  stopWhen: [(0, import_ai4.stepCountIs)(5)]
10390
+ // make configurable
10213
10391
  });
10214
10392
  result.text = text;
10215
10393
  inputTokens = totalUsage?.inputTokens || 0;
@@ -10684,6 +10862,7 @@ var providerRateLimiter = async (key, windowSeconds, limit, points) => {
10684
10862
  };
10685
10863
 
10686
10864
  // src/exulu/routes.ts
10865
+ var import_zod_from_json_schema = require("zod-from-json-schema");
10687
10866
  var REQUEST_SIZE_LIMIT = "50mb";
10688
10867
  var getExuluVersionNumber = async () => {
10689
10868
  try {
@@ -11104,6 +11283,16 @@ Mood: friendly and intelligent.
11104
11283
  providers,
11105
11284
  user
11106
11285
  );
11286
+ if (req.body.outputSchema && !!headers.stream) {
11287
+ throw new Error("Providing a outputSchema in the POST body is not allowed when using the streaming API, set 'stream' to false in the headers when defining a response schema.");
11288
+ }
11289
+ let outputSchema;
11290
+ if (req.body.outputSchema) {
11291
+ if (typeof req.body.outputSchema === "string") {
11292
+ req.body.outputSchema = JSON.parse(req.body.outputSchema);
11293
+ }
11294
+ outputSchema = (0, import_zod_from_json_schema.convertJsonSchemaToZod)(req.body.outputSchema);
11295
+ }
11107
11296
  let providerapikey;
11108
11297
  const variableName = agent.providerapikey;
11109
11298
  if (variableName) {
@@ -11250,6 +11439,7 @@ Mood: friendly and intelligent.
11250
11439
  const response = await provider.generateSync({
11251
11440
  contexts,
11252
11441
  rerankers: rerankers || [],
11442
+ outputSchema,
11253
11443
  agent,
11254
11444
  user,
11255
11445
  req,
@@ -15485,6 +15675,22 @@ var MarkdownChunker = class {
15485
15675
  });
15486
15676
  return result;
15487
15677
  }
15678
+ /**
15679
+ * Checks if a position in the text falls within a <diagram> tag.
15680
+ * Returns the adjusted position (before the diagram) if inside a diagram, otherwise returns the original position.
15681
+ */
15682
+ adjustForDiagramTags(text, position) {
15683
+ const diagramRegex = /<diagram>[\s\S]*?<\/diagram>/gi;
15684
+ let match;
15685
+ while ((match = diagramRegex.exec(text)) !== null) {
15686
+ const diagramStart = match.index;
15687
+ const diagramEnd = match.index + match[0].length;
15688
+ if (position > diagramStart && position < diagramEnd) {
15689
+ return diagramStart;
15690
+ }
15691
+ }
15692
+ return position;
15693
+ }
15488
15694
  /**
15489
15695
  * Find the nearest logical breakpoint working backwards from the end of the text.
15490
15696
  * Logical breakpoints are prioritized as follows:
@@ -15496,6 +15702,7 @@ var MarkdownChunker = class {
15496
15702
  *
15497
15703
  * Only considers breakpoints in the last 50% of the text to avoid creating very small chunks.
15498
15704
  * Returns the position of the breakpoint, or null if none found
15705
+ * IMPORTANT: Never splits content within <diagram> tags
15499
15706
  */
15500
15707
  findLogicalBreakpoint(text) {
15501
15708
  if (text.length === 0) return null;
@@ -15515,7 +15722,7 @@ var MarkdownChunker = class {
15515
15722
  }
15516
15723
  }
15517
15724
  if (lastHeaderPosition > 0) {
15518
- return lastHeaderPosition;
15725
+ return this.adjustForDiagramTags(text, lastHeaderPosition);
15519
15726
  }
15520
15727
  let lastParagraphBreak = -1;
15521
15728
  let searchPos = text.length;
@@ -15528,11 +15735,12 @@ var MarkdownChunker = class {
15528
15735
  searchPos = pos;
15529
15736
  }
15530
15737
  if (lastParagraphBreak > 0) {
15531
- return lastParagraphBreak + 2;
15738
+ const adjusted = this.adjustForDiagramTags(text, lastParagraphBreak + 2);
15739
+ return adjusted;
15532
15740
  }
15533
15741
  const newlineIndex = text.lastIndexOf("\n");
15534
15742
  if (newlineIndex >= minPosition) {
15535
- return newlineIndex + 1;
15743
+ return this.adjustForDiagramTags(text, newlineIndex + 1);
15536
15744
  }
15537
15745
  const sentenceEndRegex = /[.!?](?:\s|$)/g;
15538
15746
  let lastSentenceEnd = -1;
@@ -15542,13 +15750,13 @@ var MarkdownChunker = class {
15542
15750
  }
15543
15751
  }
15544
15752
  if (lastSentenceEnd > 0) {
15545
- return lastSentenceEnd;
15753
+ return this.adjustForDiagramTags(text, lastSentenceEnd);
15546
15754
  }
15547
15755
  let lastSpace = text.length;
15548
15756
  while (lastSpace > minPosition) {
15549
15757
  const pos = text.lastIndexOf(" ", lastSpace - 1);
15550
15758
  if (pos >= minPosition) {
15551
- return pos + 1;
15759
+ return this.adjustForDiagramTags(text, pos + 1);
15552
15760
  }
15553
15761
  lastSpace = pos;
15554
15762
  }
@@ -15680,6 +15888,38 @@ var MarkdownChunker = class {
15680
15888
  targetPosition = currentPosition + decoded.length;
15681
15889
  }
15682
15890
  }
15891
+ const diagramCheck = /<diagram>/gi;
15892
+ const diagramCloseCheck = /<\/diagram>/gi;
15893
+ let openDiagramsInSlice = 0;
15894
+ while (diagramCheck.exec(currentSlice) !== null) {
15895
+ openDiagramsInSlice++;
15896
+ }
15897
+ let closeDiagramsInSlice = 0;
15898
+ while (diagramCloseCheck.exec(currentSlice) !== null) {
15899
+ closeDiagramsInSlice++;
15900
+ }
15901
+ if (openDiagramsInSlice > closeDiagramsInSlice) {
15902
+ const lastDiagramOpenIndex = currentSlice.lastIndexOf("<diagram>");
15903
+ if (lastDiagramOpenIndex !== -1) {
15904
+ const remainingText = text.slice(currentPosition + lastDiagramOpenIndex);
15905
+ const closingTagMatch = /<\/diagram>/i.exec(remainingText);
15906
+ if (closingTagMatch) {
15907
+ const closingTagPosition = lastDiagramOpenIndex + closingTagMatch.index + closingTagMatch[0].length;
15908
+ const extendedSlice = text.slice(currentPosition, currentPosition + closingTagPosition);
15909
+ const extendedTokens = tokenizer.encode(extendedSlice);
15910
+ if (extendedTokens.length <= adjustedChunkSize * 1.5) {
15911
+ currentSlice = extendedSlice;
15912
+ targetPosition = currentPosition + closingTagPosition;
15913
+ } else {
15914
+ currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
15915
+ targetPosition = currentPosition + lastDiagramOpenIndex;
15916
+ }
15917
+ } else {
15918
+ currentSlice = currentSlice.slice(0, lastDiagramOpenIndex);
15919
+ targetPosition = currentPosition + lastDiagramOpenIndex;
15920
+ }
15921
+ }
15922
+ }
15683
15923
  const breakpointPosition = this.findLogicalBreakpoint(currentSlice);
15684
15924
  if (breakpointPosition !== null) {
15685
15925
  currentSlice = currentSlice.slice(0, breakpointPosition);
@@ -16101,6 +16341,8 @@ ${command}`;
16101
16341
  }
16102
16342
 
16103
16343
  // ee/python/documents/processing/doc_processor.ts
16344
+ var import_liteparse = require("@llamaindex/liteparse");
16345
+ var import_mistralai = require("@mistralai/mistralai");
16104
16346
  async function processDocx(file) {
16105
16347
  const html = await mammoth.convertToHtml({ buffer: file });
16106
16348
  const turndownService = new import_turndown.default();
@@ -16175,43 +16417,80 @@ async function validatePageWithVLM(page, imagePath, model) {
16175
16417
  const imageBuffer = await fs2.promises.readFile(imagePath);
16176
16418
  const imageBase64 = imageBuffer.toString("base64");
16177
16419
  const mimeType = "image/png";
16178
- const prompt = `You are validating OCR/document parsing output for a page that might contain tables and images.
16179
-
16180
- Here is the current OCR/parsed content for this page:
16420
+ const prompt = `You are a document validation assistant. Your task is to analyze a page image and correct the output of an OCR/parsing pipeline. The content may include tables, technical diagrams, schematics, and structured text.
16181
16421
 
16182
16422
  ---
16423
+ ## CURRENT OCR OUTPUT
16424
+
16183
16425
  ${page.content}
16184
16426
  ---
16185
16427
 
16186
- Please analyze the page image and validate it:
16187
-
16188
- 1. Check if the extracted markdown text accurately represents the content from the page, including:
16189
- - Table data (rows, columns, headers, values)
16190
- - Technical diagrams, schematics, control boards
16191
- - Icons, checkmarks, symbols
16192
- - Image captions and labels
16193
-
16194
- 2. If the page has significant errors or omissions, provide a corrected version for the page.
16428
+ ## YOUR TASK
16195
16429
 
16196
- 3. Return a validation result for the page.
16430
+ Compare the page image to the OCR output above. Identify errors, omissions, and formatting issues, then return a structured validation result (see OUTPUT FORMAT below).
16197
16431
 
16198
- IMPORTANT OUTPUT FORMAT REQUIREMENTS:
16199
- - You MUST output all tables in proper Markdown table format using pipes (|) and dashes (---)
16200
- - Use simple separator rows: | --- | --- | (NOT long dashes like ----------------------)
16201
- - Every table must have: header row, separator row, and data rows
16202
- - Example format:
16432
+ ---
16433
+ ## VALIDATION CHECKLIST
16434
+
16435
+ Work through these checks in order:
16436
+
16437
+ ### 1. Text Accuracy
16438
+ - Verify all text is correctly transcribed.
16439
+ - For minor character-level OCR errors (e.g. "\xF6" vs "\xFC", "rn" vs "m"), **prefer the original OCR output** unless you are certain of an error. Do not silently "fix" characters based on guesswork.
16440
+
16441
+ ### 2. Heading Levels
16442
+ - Verify that headings use correct Markdown levels (#, ##, ###, ####, #####, ######).
16443
+ - Determine heading level using the following priority:
16444
+ 1. **Hierarchical numbering** (strongest signal): e.g. "1" \u2192 #, "2.1" \u2192 ##, "2.1.1" \u2192 ###, "2.1.2.5" \u2192 ####
16445
+ 2. Font size (larger = higher level)
16446
+ 3. Indentation
16447
+ 4. Bold/emphasis styling
16448
+
16449
+ ### 3. Tables
16450
+
16451
+ **First, decide whether the table should be Markdown or plain text:**
16452
+ - Use **Markdown table format** if the table has a consistent, clear header structure and uniform column layout throughout.
16453
+ - Use **plain text structured description** if the table:
16454
+ - Lacks a clear header row
16455
+ - Uses mixed or irregular column structures across rows
16456
+ - Functions more like a certificate, form, or label layout
16457
+
16458
+ **If using Markdown format**, follow these rules strictly:
16459
+ - Every table must have: header row \u2192 separator row \u2192 data rows
16460
+ - Use simple separators only: \`| --- | --- |\` (NOT \`|---|---|\` or long dashes)
16461
+ - Example:
16462
+ \`\`\`
16203
16463
  | Column 1 | Column 2 |
16204
16464
  | --- | --- |
16205
- | Data 1 | Data 2 |
16206
- - If the extracted content already has tables, preserve their structure but fix any errors you find in the actual data
16207
- - Do NOT output tables as plain text or in any other format
16208
- - Preserve all markdown formatting (headings with ##, lists, etc.)
16209
-
16210
- Specific notes and guidelines:
16211
- - Some pages might contain a table with a column that show black and white dots (for Example Rufe-LEDs). You should translate this into + for black (meaning active) and - for white (meaning inactive).
16212
- - Some tables might use green or black checkmarks and red or black crosses. You should translate this into + for checkmarks (meaning active) and - for a cross (meaning inactive).
16213
- - IMPORTANT: Only provide corrections if you find actual errors in the content. If the extracted text is accurate, set needs_correction to false.
16214
-
16465
+ | Data 1 | Data 2 |
16466
+ \`\`\`
16467
+ - Important: do not use the | character as part of the data inside a cell, this would break the table, if a cell contains a | character, use a capital I.
16468
+
16469
+ **Symbol translation rules for table cells:**
16470
+ - Black/filled dot \u2192 \`+\` (active); White/empty dot \u2192 \`-\` (inactive)
16471
+ *(e.g. Rufe-LED columns)*
16472
+ - Green or black checkmark \u2192 \`+\` (active); Red or black cross \u2192 \`-\` (inactive)
16473
+
16474
+ ### 4. Multi-Page Table Continuity
16475
+ - If this page contains a table with a header row that runs to the bottom of the page (suggesting it may continue on the next page), extract the header row and include it in the \`current_page_table.headers\` field.
16476
+ - If this page contains a table WITHOUT a header row (suggesting it's a continuation from a previous page), set \`current_page_table.is_continuation\` to true and try to identify what the headers might be based on the data structure. Include your best guess for headers in \`current_page_table.headers\`.
16477
+
16478
+ ### 5. Technical Diagrams & Schematics
16479
+ If the page contains a flow-chart, schematic, technical drawing or control board layout that is **absent or poorly described** in the OCR output do the following:
16480
+ - Open a <diagram> tag with the following content:
16481
+ <diagram>
16482
+ <description>
16483
+ Add a detailed description of the diagram here.
16484
+ </description>
16485
+ <mermaid>
16486
+ Add a mermaid diagram schema here that in detail describes the diagram.
16487
+ </mermaid>
16488
+ </diagram>
16489
+
16490
+ ### 6. Captions, Icons & Symbols
16491
+ - Verify that image captions, labels, icons, and checkmarks are present and correctly transcribed.
16492
+
16493
+ ### 7. Only populate \`corrected_text\` when \`needs_correction\` is true. If the OCR output is accurate, return \`needs_correction: false\` and \`corrected_content: null\`.
16215
16494
  `;
16216
16495
  const result = await (0, import_ai7.generateText)({
16217
16496
  model,
@@ -16219,6 +16498,10 @@ Specific notes and guidelines:
16219
16498
  schema: import_zod12.z.object({
16220
16499
  needs_correction: import_zod12.z.boolean(),
16221
16500
  corrected_text: import_zod12.z.string().nullable(),
16501
+ current_page_table: import_zod12.z.object({
16502
+ headers: import_zod12.z.array(import_zod12.z.string()),
16503
+ is_continuation: import_zod12.z.boolean()
16504
+ }).nullable(),
16222
16505
  confidence: import_zod12.z.enum(["high", "medium", "low"]),
16223
16506
  reasoning: import_zod12.z.string()
16224
16507
  })
@@ -16241,23 +16524,80 @@ Specific notes and guidelines:
16241
16524
  needs_correction: parsedOutput.needs_correction,
16242
16525
  corrected_text: parsedOutput.corrected_text || void 0,
16243
16526
  confidence: parsedOutput.confidence,
16527
+ current_page_table: parsedOutput.current_page_table || void 0,
16244
16528
  reasoning: parsedOutput.reasoning
16245
16529
  };
16246
16530
  return validation;
16247
16531
  }
16532
/**
 * Re-attaches table headers to pages whose table continues from an earlier page.
 *
 * Pages are walked in document order. The headers of the most recent page that
 * reported a table are remembered; when a later page is flagged as a
 * continuation, a markdown header row and separator row built from those
 * remembered headers are inserted directly above the first line of that page
 * that starts with `|`.
 *
 * The page objects are mutated in place: `vlm_corrected_text` is rewritten when
 * it is set, otherwise `content` is rewritten.
 *
 * @param {Array<object>} document2 - Pages in order; each page is looked up by `page.page`.
 * @param {Map<*, object>} validationResults - Per-page validation results keyed by `page.page`;
 *   the `current_page_table` entry (`headers`, `is_continuation`) drives the reconstruction.
 * @param {boolean} [verbose=false] - When true, progress is logged to the console.
 * @returns {void}
 */
function reconstructTableHeaders(document2, validationResults, verbose = false) {
  let lastTableHeaders;
  for (const page of document2) {
    const validation = validationResults.get(page.page);
    // Pages without a validation result leave the remembered headers untouched.
    if (!validation) continue;

    const tableInfo = validation.current_page_table;
    const hasHeaders =
      Boolean(tableInfo) && Array.isArray(tableInfo.headers) && tableInfo.headers.length > 0;
    if (!hasHeaders) {
      // A validated page without table headers ends any running table.
      lastTableHeaders = undefined;
      continue;
    }

    if (!tableInfo.is_continuation || !lastTableHeaders) {
      // Either a fresh table, or a continuation with nothing to continue from:
      // remember this page's headers for the pages that follow.
      lastTableHeaders = tableInfo.headers;
      if (verbose) {
        console.log(`[EXULU] Page ${page.page}: Storing table headers for potential continuation`);
        console.log(`[EXULU] Headers: ${lastTableHeaders.join(" | ")}`);
      }
      continue;
    }

    // Continuation page: the remembered headers stay as they are so that a table
    // spanning several pages keeps receiving the original header row.
    if (verbose) {
      console.log(`[EXULU] Page ${page.page}: Reconstructing table headers from previous page`);
      console.log(`[EXULU] Previous headers: ${lastTableHeaders.join(" | ")}`);
    }

    const contentToModify = page.vlm_corrected_text || page.content;
    // Nothing to patch when the page carries no text at all.
    if (typeof contentToModify !== "string") continue;

    const lines = contentToModify.split("\n");
    const firstTableLineIndex = lines.findIndex((line) => line.trim().startsWith("|"));
    if (firstTableLineIndex === -1) continue;

    const headerRow = `| ${lastTableHeaders.join(" | ")} |`;
    // The header row is already in place (e.g. a previous pass inserted it):
    // inserting it again would duplicate it.
    if (lines[firstTableLineIndex].trim() === headerRow) continue;

    const separatorRow = `| ${lastTableHeaders.map(() => "---").join(" | ")} |`;
    lines.splice(firstTableLineIndex, 0, headerRow, separatorRow);
    const reconstructedContent = lines.join("\n");
    // Write back to the same field the text was read from.
    if (page.vlm_corrected_text) {
      page.vlm_corrected_text = reconstructedContent;
    } else {
      page.content = reconstructedContent;
    }
    if (verbose) {
      console.log(`[EXULU] Page ${page.page}: Added table headers successfully`);
    }
  }
}
16248
16576
  async function validateWithVLM(document2, model, verbose = false, concurrency = 10) {
16249
16577
  console.log(`[EXULU] Starting VLM validation for docling output, ${document2.length} pages...`);
16250
- console.log(
16251
- `[EXULU] Concurrency limit: ${concurrency}`
16252
- );
16578
+ console.log(`[EXULU] Concurrency limit: ${concurrency}`);
16579
+ const limit = (0, import_p_limit.default)(concurrency);
16580
+ const validationResults = /* @__PURE__ */ new Map();
16253
16581
  let validatedCount = 0;
16254
16582
  let correctedCount = 0;
16255
- const limit = (0, import_p_limit.default)(concurrency);
16256
16583
  const validationTasks = document2.map(
16257
16584
  (page) => limit(async () => {
16585
+ await new Promise((resolve3) => setImmediate(resolve3));
16258
16586
  const imagePath = page.image;
16587
+ if (!page.content) {
16588
+ console.warn(`[EXULU] Page ${page.page}: No content found, skipping validation`);
16589
+ return;
16590
+ }
16259
16591
  if (!imagePath) {
16260
- console.log(`[EXULU] Page ${page.page}: No image found, skipping validation`);
16592
+ console.warn(`[EXULU] Page ${page.page}: No image found, skipping validation`);
16593
+ return;
16594
+ }
16595
+ const hasImage = page.content.match(/\.(jpeg|jpg|png|gif|webp)/i);
16596
+ const hasTable = (page.content.match(/\|/g)?.length || 0) > 1;
16597
+ if (!hasImage && !hasTable) {
16598
+ if (verbose) {
16599
+ console.log(`[EXULU] Page ${page.page}: No image or table found, SKIPPING VLM validation`);
16600
+ }
16261
16601
  return;
16262
16602
  }
16263
16603
  let validation;
@@ -16265,6 +16605,13 @@ async function validateWithVLM(document2, model, verbose = false, concurrency =
16265
16605
  validation = await withRetry(async () => {
16266
16606
  return await validatePageWithVLM(page, imagePath, model);
16267
16607
  }, 3);
16608
+ validationResults.set(page.page, validation);
16609
+ if (verbose && validation.current_page_table) {
16610
+ console.log(`[EXULU] Page ${page.page} table info:`, {
16611
+ headers: validation.current_page_table.headers,
16612
+ is_continuation: validation.current_page_table.is_continuation
16613
+ });
16614
+ }
16268
16615
  } catch (error) {
16269
16616
  console.error(`[EXULU] Error validating page ${page.page} with VLM more than 3 times, skipping:`, error);
16270
16617
  throw error;
@@ -16295,9 +16642,12 @@ async function validateWithVLM(document2, model, verbose = false, concurrency =
16295
16642
  })
16296
16643
  );
16297
16644
  await Promise.all(validationTasks);
16298
- console.log(`[EXULU] VLM validation complete:`);
16299
- console.log(`[EXULU] Validated: ${validatedCount} chunks`);
16300
- console.log(`[EXULU] Corrected: ${correctedCount} chunks`);
16645
+ console.log(`[EXULU] VLM validation complete (parallel processing):`);
16646
+ console.log(`[EXULU] Validated: ${validatedCount} pages`);
16647
+ console.log(`[EXULU] Corrected: ${correctedCount} pages`);
16648
+ console.log(`[EXULU] Starting sequential table header reconstruction...`);
16649
+ reconstructTableHeaders(document2, validationResults, verbose);
16650
+ console.log(`[EXULU] Table header reconstruction complete`);
16301
16651
  return document2;
16302
16652
  }
16303
16653
  async function processDocument(filePath, fileType, buffer, tempDir, config, verbose = false) {
@@ -16312,15 +16662,6 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
16312
16662
  const stripped = filePath.split(".").pop()?.trim();
16313
16663
  let result;
16314
16664
  switch (stripped) {
16315
- case "pdf":
16316
- result = await processPdf(buffer, paths, config, verbose);
16317
- break;
16318
- case "docx":
16319
- result = await processDocx(buffer);
16320
- break;
16321
- case "doc":
16322
- result = await processWord(buffer);
16323
- break;
16324
16665
  case "txt":
16325
16666
  case "md":
16326
16667
  let content = buffer.toString();
@@ -16334,6 +16675,15 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
16334
16675
  }]
16335
16676
  };
16336
16677
  break;
16678
+ case "pdf":
16679
+ result = await processPdf(buffer, paths, config, verbose);
16680
+ break;
16681
+ case "docx":
16682
+ result = await processDocx(buffer);
16683
+ break;
16684
+ case "doc":
16685
+ result = await processWord(buffer);
16686
+ break;
16337
16687
  // Todo other file types with docx and officeparser
16338
16688
  default:
16339
16689
  throw new Error(`[EXULU] Unsupported file type: ${fileType}`);
@@ -16348,8 +16698,8 @@ async function processDocument(filePath, fileType, buffer, tempDir, config, verb
16348
16698
  }
16349
16699
  async function processPdf(buffer, paths, config, verbose = false) {
16350
16700
  try {
16351
- let json;
16352
- if (config?.docling) {
16701
+ let json = [];
16702
+ if (config?.processor.name === "docling") {
16353
16703
  console.log(`[EXULU] Validating Python environment...`);
16354
16704
  const validation = await validatePythonEnvironment(void 0, true);
16355
16705
  if (!validation.valid) {
@@ -16390,7 +16740,7 @@ ${setupResult.output || ""}`);
16390
16740
  }
16391
16741
  const jsonContent = await fs2.promises.readFile(paths.json, "utf-8");
16392
16742
  json = JSON.parse(jsonContent);
16393
- } else {
16743
+ } else if (config?.processor.name === "officeparser") {
16394
16744
  const text = await (0, import_officeparser2.parseOfficeAsync)(buffer, {
16395
16745
  outputErrorToConsole: false,
16396
16746
  newlineDelimiter: "\n"
@@ -16400,15 +16750,69 @@ ${setupResult.output || ""}`);
16400
16750
  content: text,
16401
16751
  headings: []
16402
16752
  }];
16753
+ } else if (config?.processor.name === "mistral") {
16754
+ if (!process.env.MISTRAL_API_KEY) {
16755
+ throw new Error("[EXULU] MISTRAL_API_KEY is not set, please set it in the environment variables.");
16756
+ }
16757
+ await new Promise((resolve3) => setTimeout(resolve3, Math.floor(Math.random() * 4e3) + 1e3));
16758
+ const base64Pdf = buffer.toString("base64");
16759
+ const client2 = new import_mistralai.Mistral({ apiKey: process.env.MISTRAL_API_KEY });
16760
+ const ocrResponse = await withRetry(async () => {
16761
+ const ocrResponse2 = await client2.ocr.process({
16762
+ document: {
16763
+ type: "document_url",
16764
+ documentUrl: "data:application/pdf;base64," + base64Pdf
16765
+ },
16766
+ model: "mistral-ocr-latest",
16767
+ includeImageBase64: false
16768
+ });
16769
+ return ocrResponse2;
16770
+ }, 10);
16771
+ const parser = new import_liteparse.LiteParse();
16772
+ const screenshots = await parser.screenshot(paths.source, void 0);
16773
+ await fs2.promises.mkdir(paths.images, { recursive: true });
16774
+ for (const screenshot of screenshots) {
16775
+ await fs2.promises.writeFile(
16776
+ path.join(
16777
+ paths.images,
16778
+ `${screenshot.pageNum}.png`
16779
+ ),
16780
+ screenshot.imageBuffer
16781
+ );
16782
+ screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
16783
+ }
16784
+ json = ocrResponse.pages.map((page) => ({
16785
+ page: page.index + 1,
16786
+ content: page.markdown,
16787
+ image: screenshots.find((s) => s.pageNum === page.index + 1)?.imagePath,
16788
+ headings: []
16789
+ }));
16790
+ fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
16791
+ } else if (config?.processor.name === "liteparse") {
16792
+ const parser = new import_liteparse.LiteParse();
16793
+ const result = await parser.parse(paths.source);
16794
+ const screenshots = await parser.screenshot(paths.source, void 0);
16795
+ console.log(`[EXULU] Liteparse screenshots: ${JSON.stringify(screenshots)}`);
16796
+ await fs2.promises.mkdir(paths.images, { recursive: true });
16797
+ for (const screenshot of screenshots) {
16798
+ await fs2.promises.writeFile(path.join(paths.images, `${screenshot.pageNum}.png`), screenshot.imageBuffer);
16799
+ screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
16800
+ }
16801
+ json = result.pages.map((page) => ({
16802
+ page: page.pageNum,
16803
+ content: page.text,
16804
+ image: screenshots.find((s) => s.pageNum === page.pageNum)?.imagePath
16805
+ }));
16806
+ fs2.writeFileSync(paths.json, JSON.stringify(json, null, 2));
16403
16807
  }
16404
16808
  console.log(`[EXULU]
16405
16809
  \u2713 Document processing completed successfully`);
16406
16810
  console.log(`[EXULU] Total pages: ${json.length}`);
16407
16811
  console.log(`[EXULU] Output file: ${paths.json}`);
16408
- if (!config?.docling && config?.vlm?.model) {
16812
+ if (config?.vlm?.model) {
16409
16813
  console.error("[EXULU] VLM validation is only supported when docling is enabled, skipping validation.");
16410
16814
  }
16411
- if (config?.docling && config?.vlm?.model) {
16815
+ if (config?.vlm?.model && json.length > 0) {
16412
16816
  json = await validateWithVLM(
16413
16817
  json,
16414
16818
  config.vlm.model,
@@ -16436,29 +16840,37 @@ ${setupResult.output || ""}`);
16436
16840
  "utf-8"
16437
16841
  );
16438
16842
  }
16439
- const markdown = json.map((p) => {
16440
- if (p.vlm_corrected_text) {
16441
- return p.vlm_corrected_text;
16442
- } else {
16443
- return p.content;
16843
+ const markdownStream = fs2.createWriteStream(paths.markdown, { encoding: "utf-8" });
16844
+ for (let i = 0; i < json.length; i++) {
16845
+ const p = json[i];
16846
+ if (!p) continue;
16847
+ const content = p.vlm_corrected_text ?? p.content;
16848
+ markdownStream.write(content);
16849
+ if (i < json.length - 1) {
16850
+ markdownStream.write("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
16444
16851
  }
16445
- }).join("\n\n\n<!-- END_OF_PAGE -->\n\n\n");
16446
- await fs2.promises.writeFile(
16447
- paths.markdown,
16448
- markdown,
16449
- "utf-8"
16450
- );
16852
+ }
16853
+ await new Promise((resolve3, reject) => {
16854
+ markdownStream.end(() => resolve3());
16855
+ markdownStream.on("error", reject);
16856
+ });
16451
16857
  console.log(`[EXULU] Validated output saved to: ${paths.json}`);
16452
16858
  console.log(`[EXULU] Validated markdown saved to: ${paths.markdown}`);
16859
+ const markdown = await fs2.promises.readFile(paths.markdown, "utf-8");
16860
+ const processedJson = json.map((e) => {
16861
+ const finalContent = e.vlm_corrected_text ?? e.content;
16862
+ return {
16863
+ page: e.page,
16864
+ content: finalContent
16865
+ };
16866
+ });
16867
+ json.length = 0;
16868
+ json = [];
16869
+ const memUsage = process.memoryUsage();
16870
+ console.log(`[EXULU] Memory after document processing: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`);
16453
16871
  return {
16454
16872
  markdown,
16455
- json: json.map((e) => {
16456
- const finalContent = e.vlm_corrected_text || e.content;
16457
- return {
16458
- page: e.page,
16459
- content: finalContent
16460
- };
16461
- })
16873
+ json: processedJson
16462
16874
  };
16463
16875
  } catch (error) {
16464
16876
  console.error("[EXULU] Error processing document:", error);
@@ -16471,9 +16883,9 @@ var loadFile = async (file, name, tempDir) => {
16471
16883
  if (!fileType) {
16472
16884
  throw new Error("[EXULU] File name does not include extension, extension is required for document processing.");
16473
16885
  }
16886
+ const UUID = (0, import_crypto.randomUUID)();
16474
16887
  let buffer;
16475
16888
  if (Buffer.isBuffer(file)) {
16476
- const UUID = (0, import_crypto.randomUUID)();
16477
16889
  filePath = path.join(tempDir, `${UUID}.${fileType}`);
16478
16890
  await fs2.promises.writeFile(filePath, file);
16479
16891
  buffer = file;
@@ -16482,7 +16894,10 @@ var loadFile = async (file, name, tempDir) => {
16482
16894
  if (filePath.startsWith("http")) {
16483
16895
  const response = await fetch(filePath);
16484
16896
  const array = await response.arrayBuffer();
16897
+ const tempFilePath = path.join(tempDir, `${UUID}.${fileType}`);
16898
+ await fs2.promises.writeFile(tempFilePath, Buffer.from(array));
16485
16899
  buffer = Buffer.from(array);
16900
+ filePath = tempFilePath;
16486
16901
  } else {
16487
16902
  buffer = await fs2.promises.readFile(file);
16488
16903
  }
@@ -16500,17 +16915,34 @@ async function documentProcessor({
16500
16915
  }
16501
16916
  const uuid = (0, import_crypto.randomUUID)();
16502
16917
  const tempDir = path.join(process.cwd(), "temp", uuid);
16918
+ const localFilesAndFoldersToDelete = [tempDir];
16503
16919
  console.log(`[EXULU] Temporary directory for processing document ${name}: ${tempDir}`);
16504
16920
  await fs2.promises.mkdir(tempDir, { recursive: true });
16921
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString();
16922
+ await fs2.promises.writeFile(path.join(tempDir, "created_at.txt"), timestamp);
16505
16923
  try {
16506
16924
  const {
16507
16925
  filePath,
16508
16926
  fileType,
16509
16927
  buffer
16510
16928
  } = await loadFile(file, name, tempDir);
16511
- const supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
16929
+ let supportedTypes = [];
16930
+ switch (config?.processor.name) {
16931
+ case "docling":
16932
+ supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
16933
+ break;
16934
+ case "officeparser":
16935
+ supportedTypes = [];
16936
+ break;
16937
+ case "liteparse":
16938
+ supportedTypes = ["pdf", "doc", "docx", "docm", "odt", "rtf", "ppt", "pptx", "pptm", "odp", "xls", "xlsx", "xlsm", "ods", "csv", "tsv"];
16939
+ break;
16940
+ case "mistral":
16941
+ supportedTypes = ["pdf", "docx", "doc", "txt", "md"];
16942
+ break;
16943
+ }
16512
16944
  if (!supportedTypes.includes(fileType)) {
16513
- throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor.`);
16945
+ throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor, the ${config?.processor.name} processor only supports the following file types: ${supportedTypes.join(", ")}.`);
16514
16946
  }
16515
16947
  const { content } = await processDocument(
16516
16948
  filePath,
@@ -16523,9 +16955,19 @@ async function documentProcessor({
16523
16955
  return content.json;
16524
16956
  } catch (error) {
16525
16957
  console.error("Error during chunking:", error);
16526
- return void 0;
16958
+ throw error;
16527
16959
  } finally {
16528
- await fs2.promises.rm(tempDir, { recursive: true });
16960
+ if (config?.debugging?.deleteTempFiles !== false) {
16961
+ for (const file2 of localFilesAndFoldersToDelete) {
16962
+ try {
16963
+ await fs2.promises.rm(file2, { recursive: true });
16964
+ console.log(`[EXULU] Deleted file or folder: ${file2}`);
16965
+ } catch (error) {
16966
+ console.error(`[EXULU] Error deleting file or folder: ${file2}`, error);
16967
+ console.log(`[EXULU] File or folder still exists: ${file2}`);
16968
+ }
16969
+ }
16970
+ }
16529
16971
  }
16530
16972
  }
16531
16973