@exulu/backend 1.49.2 → 1.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,13 +13,20 @@ import { parseOfficeAsync } from "officeparser";
13
13
  import { checkLicense } from '@EE/entitlements';
14
14
  import { executePythonScript } from '@SRC/utils/python-executor';
15
15
  import { setupPythonEnvironment, validatePythonEnvironment } from '@SRC/utils/python-setup';
16
+ import { LiteParse } from '@llamaindex/liteparse';
17
+ import { Mistral } from '@mistralai/mistralai';
16
18
 
17
19
  type DocumentProcessorConfig = {
18
20
  vlm?: {
19
21
  model: LanguageModel;
20
22
  concurrency: number;
21
23
  },
22
- docling?: boolean,
24
+ processor: {
25
+ name: "docling" | "liteparse" | "mistral" | "officeparser"
26
+ }
27
+ debugging?: {
28
+ deleteTempFiles?: boolean;
29
+ }
23
30
  }
24
31
 
25
32
  type ProcessedPage = {
@@ -41,6 +48,10 @@ interface VLMValidationResult {
41
48
  needs_correction: boolean;
42
49
  corrected_text?: string;
43
50
  confidence: 'high' | 'medium' | 'low';
51
+ current_page_table?: {
52
+ headers: string[];
53
+ is_continuation: boolean; // true if this table appears to be missing headers
54
+ }
44
55
  reasoning: string;
45
56
  }
46
57
 
@@ -171,43 +182,80 @@ async function validatePageWithVLM(
171
182
  const imageBase64 = imageBuffer.toString('base64');
172
183
  const mimeType = 'image/png';
173
184
 
174
- const prompt = `You are validating OCR/document parsing output for a page that might contain tables and images.
175
-
176
- Here is the current OCR/parsed content for this page:
185
+ const prompt = `You are a document validation assistant. Your task is to analyze a page image and correct the output of an OCR/parsing pipeline. The content may include tables, technical diagrams, schematics, and structured text.
177
186
 
178
187
  ---
188
+ ## CURRENT OCR OUTPUT
189
+
179
190
  ${page.content}
180
191
  ---
181
192
 
182
- Please analyze the page image and validate it:
183
-
184
- 1. Check if the extracted markdown text accurately represents the content from the page, including:
185
- - Table data (rows, columns, headers, values)
186
- - Technical diagrams, schematics, control boards
187
- - Icons, checkmarks, symbols
188
- - Image captions and labels
193
+ ## YOUR TASK
189
194
 
190
- 2. If the page has significant errors or omissions, provide a corrected version for the page.
195
+ Compare the page image to the OCR output above. Identify errors, omissions, and formatting issues, then return a structured validation result (see OUTPUT FORMAT below).
191
196
 
192
- 3. Return a validation result for the page.
193
-
194
- IMPORTANT OUTPUT FORMAT REQUIREMENTS:
195
- - You MUST output all tables in proper Markdown table format using pipes (|) and dashes (---)
196
- - Use simple separator rows: | --- | --- | (NOT long dashes like ----------------------)
197
- - Every table must have: header row, separator row, and data rows
198
- - Example format:
197
+ ---
198
+ ## VALIDATION CHECKLIST
199
+
200
+ Work through these checks in order:
201
+
202
+ ### 1. Text Accuracy
203
+ - Verify all text is correctly transcribed.
204
+ - For minor character-level OCR errors (e.g. "ö" vs "ü", "rn" vs "m"), **prefer the original OCR output** unless you are certain of an error. Do not silently "fix" characters based on guesswork.
205
+
206
+ ### 2. Heading Levels
207
+ - Verify that headings use correct Markdown levels (#, ##, ###, ####, #####, ######).
208
+ - Determine heading level using the following priority:
209
+ 1. **Hierarchical numbering** (strongest signal): e.g. "1" → #, "2.1" → ##, "2.1.1" → ###, "2.1.2.5" → ####
210
+ 2. Font size (larger = higher level)
211
+ 3. Indentation
212
+ 4. Bold/emphasis styling
213
+
214
+ ### 3. Tables
215
+
216
+ **First, decide whether the table should be Markdown or plain text:**
217
+ - Use **Markdown table format** if the table has a consistent, clear header structure and uniform column layout throughout.
218
+ - Use **plain text structured description** if the table:
219
+ - Lacks a clear header row
220
+ - Uses mixed or irregular column structures across rows
221
+ - Functions more like a certificate, form, or label layout
222
+
223
+ **If using Markdown format**, follow these rules strictly:
224
+ - Every table must have: header row → separator row → data rows
225
+ - Use simple separators only: \`| --- | --- |\` (NOT \`|---|---|\` or long dashes)
226
+ - Example:
227
+ \`\`\`
199
228
  | Column 1 | Column 2 |
200
229
  | --- | --- |
201
- | Data 1 | Data 2 |
202
- - If the extracted content already has tables, preserve their structure but fix any errors you find in the actual data
203
- - Do NOT output tables as plain text or in any other format
204
- - Preserve all markdown formatting (headings with ##, lists, etc.)
205
-
206
- Specific notes and guidelines:
207
- - Some pages might contain a table with a column that show black and white dots (for Example Rufe-LEDs). You should translate this into + for black (meaning active) and - for white (meaning inactive).
208
- - Some tables might use green or black checkmarks and red or black crosses. You should translate this into + for checkmarks (meaning active) and - for a cross (meaning inactive).
209
- - IMPORTANT: Only provide corrections if you find actual errors in the content. If the extracted text is accurate, set needs_correction to false.
210
-
230
+ | Data 1 | Data 2 |
231
+ \`\`\`
232
+ - Important: do not use the | character inside cell data, as it would break the table. If a cell's content contains a | character, replace it with a capital I.
233
+
234
+ **Symbol translation rules for table cells:**
235
+ - Black/filled dot → \`+\` (active); White/empty dot → \`-\` (inactive)
236
+ *(e.g. Rufe-LED columns)*
237
+ - Green or black checkmark → \`+\` (active); Red or black cross → \`-\` (inactive)
238
+
239
+ ### 4. Multi-Page Table Continuity
240
+ - If this page contains a table with a header row that runs to the bottom of the page (suggesting it may continue on the next page), extract the header row and include it in the \`current_page_table.headers\` field.
241
+ - If this page contains a table WITHOUT a header row (suggesting it's a continuation from a previous page), set \`current_page_table.is_continuation\` to true and try to identify what the headers might be based on the data structure. Include your best guess for headers in \`current_page_table.headers\`.
242
+
243
+ ### 5. Technical Diagrams & Schematics
244
+ If the page contains a flow-chart, schematic, technical drawing or control board layout that is **absent or poorly described** in the OCR output, do the following:
245
+ - Open a <diagram> tag with the following content:
246
+ <diagram>
247
+ <description>
248
+ Add a detailed description of the diagram here.
249
+ </description>
250
+ <mermaid>
251
+ Add a mermaid diagram schema here that in detail describes the diagram.
252
+ </mermaid>
253
+ </diagram>
254
+
255
+ ### 6. Captions, Icons & Symbols
256
+ - Verify that image captions, labels, icons, and checkmarks are present and correctly transcribed.
257
+
258
+ ### 7. Only populate \`corrected_text\` when \`needs_correction\` is true. If the OCR output is accurate, return \`needs_correction: false\` and \`corrected_text: null\`.
211
259
  `;
212
260
 
213
261
  const result = await generateText({
@@ -216,6 +264,10 @@ Specific notes and guidelines:
216
264
  schema: z.object({
217
265
  needs_correction: z.boolean(),
218
266
  corrected_text: z.string().nullable(),
267
+ current_page_table: z.object({
268
+ headers: z.array(z.string()),
269
+ is_continuation: z.boolean(),
270
+ }).nullable(),
219
271
  confidence: z.enum(['high', 'medium', 'low']),
220
272
  reasoning: z.string(),
221
273
  }),
@@ -239,6 +291,10 @@ Specific notes and guidelines:
239
291
  needs_correction: boolean;
240
292
  corrected_text: string | null;
241
293
  confidence: 'high' | 'medium' | 'low';
294
+ current_page_table?: {
295
+ headers: string[];
296
+ is_continuation: boolean;
297
+ } | null;
242
298
  reasoning: string;
243
299
  };
244
300
 
@@ -246,6 +302,7 @@ Specific notes and guidelines:
246
302
  needs_correction: parsedOutput.needs_correction,
247
303
  corrected_text: parsedOutput.corrected_text || undefined,
248
304
  confidence: parsedOutput.confidence,
305
+ current_page_table: parsedOutput.current_page_table || undefined,
249
306
  reasoning: parsedOutput.reasoning,
250
307
  };
251
308
 
@@ -253,7 +310,79 @@ Specific notes and guidelines:
253
310
  }
254
311
 
255
312
  /**
256
- * Identifies pages that need VLM validation and validates them
313
+ * Reconstructs table headers across pages sequentially after parallel VLM processing
314
+ */
315
+ function reconstructTableHeaders(
316
+ document: ProcessedDocument,
317
+ validationResults: Map<number, VLMValidationResult>,
318
+ verbose: boolean = false
319
+ ): void {
320
+ let lastTableHeaders: string[] | undefined = undefined;
321
+
322
+ for (const page of document) {
323
+ const validation = validationResults.get(page.page);
324
+ if (!validation) continue;
325
+
326
+ const tableInfo = validation.current_page_table;
327
+
328
+ // If this page has a table
329
+ if (tableInfo && tableInfo.headers.length > 0) {
330
+ // If it's a continuation and we have previous headers, reconstruct
331
+ if (tableInfo.is_continuation && lastTableHeaders) {
332
+ if (verbose) {
333
+ console.log(`[EXULU] Page ${page.page}: Reconstructing table headers from previous page`);
334
+ console.log(`[EXULU] Previous headers: ${lastTableHeaders.join(' | ')}`);
335
+ }
336
+
337
+ // Get the content to modify (corrected or original)
338
+ const contentToModify = page.vlm_corrected_text || page.content;
339
+
340
+ // Find the first table in the content and add headers
341
+ const lines = contentToModify.split('\n');
342
+ const firstTableLineIndex = lines.findIndex(line => line.trim().startsWith('|'));
343
+
344
+ if (firstTableLineIndex !== -1) {
345
+ // Create header row and separator
346
+ const headerRow = `| ${lastTableHeaders.join(' | ')} |`;
347
+ const separatorRow = `| ${lastTableHeaders.map(() => '---').join(' | ')} |`;
348
+
349
+ // Insert headers before the first table row
350
+ lines.splice(firstTableLineIndex, 0, headerRow, separatorRow);
351
+
352
+ // Update the content
353
+ const reconstructedContent = lines.join('\n');
354
+ if (page.vlm_corrected_text) {
355
+ page.vlm_corrected_text = reconstructedContent;
356
+ } else {
357
+ page.content = reconstructedContent;
358
+ }
359
+
360
+ if (verbose) {
361
+ console.log(`[EXULU] Page ${page.page}: Added table headers successfully`);
362
+ }
363
+ }
364
+
365
+ // Update lastTableHeaders if this table also has headers (it might continue further)
366
+ if (!tableInfo.is_continuation) {
367
+ lastTableHeaders = tableInfo.headers;
368
+ }
369
+ } else {
370
+ // This is a new table with headers, store them for next page
371
+ lastTableHeaders = tableInfo.headers;
372
+ if (verbose) {
373
+ console.log(`[EXULU] Page ${page.page}: Storing table headers for potential continuation`);
374
+ console.log(`[EXULU] Headers: ${lastTableHeaders.join(' | ')}`);
375
+ }
376
+ }
377
+ } else {
378
+ // No table on this page, reset the tracking
379
+ lastTableHeaders = undefined;
380
+ }
381
+ }
382
+ }
383
+
384
+ /**
385
+ * Identifies pages that need VLM validation and validates them in parallel
257
386
  */
258
387
  async function validateWithVLM(
259
388
  document: ProcessedDocument,
@@ -262,25 +391,46 @@ async function validateWithVLM(
262
391
  concurrency: number = 10
263
392
  ): Promise<ProcessedDocument> {
264
393
  console.log(`[EXULU] Starting VLM validation for docling output, ${document.length} pages...`);
265
- console.log(
266
- `[EXULU] Concurrency limit: ${concurrency}`
267
- );
394
+ console.log(`[EXULU] Concurrency limit: ${concurrency}`);
395
+
396
+ // Create a concurrency limiter
397
+ const limit = pLimit(concurrency);
398
+
399
+ // Store validation results for post-processing
400
+ const validationResults = new Map<number, VLMValidationResult>();
268
401
 
269
- // Validate each page that needs it
402
+ // Track metrics
270
403
  let validatedCount = 0;
271
404
  let correctedCount = 0;
272
405
 
273
- // Create a limit function for concurrency control
274
- const limit = pLimit(concurrency);
275
-
276
- // Create validation tasks for all pages
277
- const validationTasks = document.map((page) =>
406
+ // Create parallel validation tasks for all pages
407
+ const validationTasks = document.map(page =>
278
408
  limit(async () => {
409
+ // Yield control to the event loop to prevent stalling
410
+ // This is critical for BullMQ to renew job locks during long-running operations
411
+ await new Promise(resolve => setImmediate(resolve));
279
412
 
280
413
  const imagePath = page.image;
281
414
 
415
+ if (!page.content) {
416
+ console.warn(`[EXULU] Page ${page.page}: No content found, skipping validation`);
417
+ return;
418
+ }
419
+
282
420
  if (!imagePath) {
283
- console.log(`[EXULU] Page ${page.page}: No image found, skipping validation`);
421
+ console.warn(`[EXULU] Page ${page.page}: No image found, skipping validation`);
422
+ return;
423
+ }
424
+
425
+ // Check if page.content has a .jpeg, .jpg, .png, .gif, .webp image
426
+ const hasImage = page.content.match(/\.(jpeg|jpg|png|gif|webp)/i);
427
+ // Check if the content has multiple occurrences of |
428
+ const hasTable = (page.content.match(/\|/g)?.length || 0) > 1;
429
+
430
+ if (!hasImage && !hasTable) {
431
+ if (verbose) {
432
+ console.log(`[EXULU] Page ${page.page}: No image or table found, SKIPPING VLM validation`);
433
+ }
284
434
  return;
285
435
  }
286
436
 
@@ -290,6 +440,16 @@ async function validateWithVLM(
290
440
  validation = await withRetry(async () => {
291
441
  return await validatePageWithVLM(page, imagePath, model);
292
442
  }, 3);
443
+
444
+ // Store validation result for post-processing
445
+ validationResults.set(page.page, validation);
446
+
447
+ if (verbose && validation.current_page_table) {
448
+ console.log(`[EXULU] Page ${page.page} table info:`, {
449
+ headers: validation.current_page_table.headers,
450
+ is_continuation: validation.current_page_table.is_continuation
451
+ });
452
+ }
293
453
  } catch (error) {
294
454
  console.error(`[EXULU] Error validating page ${page.page} with VLM more than 3 times, skipping:`, error);
295
455
  // Throw so the job fails
@@ -330,12 +490,17 @@ async function validateWithVLM(
330
490
  })
331
491
  );
332
492
 
333
- // Wait for all validation tasks to complete
493
+ // Wait for all parallel validations to complete
334
494
  await Promise.all(validationTasks);
335
495
 
336
- console.log(`[EXULU] VLM validation complete:`);
337
- console.log(`[EXULU] Validated: ${validatedCount} chunks`);
338
- console.log(`[EXULU] Corrected: ${correctedCount} chunks`);
496
+ console.log(`[EXULU] VLM validation complete (parallel processing):`);
497
+ console.log(`[EXULU] Validated: ${validatedCount} pages`);
498
+ console.log(`[EXULU] Corrected: ${correctedCount} pages`);
499
+
500
+ // Post-process: Reconstruct table headers sequentially
501
+ console.log(`[EXULU] Starting sequential table header reconstruction...`);
502
+ reconstructTableHeaders(document, validationResults, verbose);
503
+ console.log(`[EXULU] Table header reconstruction complete`);
339
504
 
340
505
  return document;
341
506
  }
@@ -382,15 +547,6 @@ async function processDocument(
382
547
  const stripped = filePath.split('.').pop()?.trim();
383
548
  let result: ProcessorOutput;
384
549
  switch (stripped) {
385
- case 'pdf':
386
- result = await processPdf(buffer, paths, config, verbose);
387
- break;
388
- case 'docx':
389
- result = await processDocx(buffer);
390
- break;
391
- case 'doc':
392
- result = await processWord(buffer);
393
- break;
394
550
  case 'txt':
395
551
  case 'md':
396
552
  let content = buffer.toString();
@@ -407,6 +563,16 @@ async function processDocument(
407
563
  }],
408
564
  };
409
565
  break;
566
+ case 'pdf':
567
+ result = await processPdf(buffer, paths, config, verbose);
568
+ break;
569
+ case 'docx':
570
+ result = await processDocx(buffer);
571
+ break;
572
+ case 'doc':
573
+ result = await processWord(buffer);
574
+ break;
575
+
410
576
  // Todo other file types with docx and officeparser
411
577
  default:
412
578
  throw new Error(`[EXULU] Unsupported file type: ${fileType}`);
@@ -427,9 +593,9 @@ async function processPdf(
427
593
  verbose: boolean = false,
428
594
  ): Promise<ProcessorOutput> {
429
595
  try {
430
- let json: ProcessedDocument;
596
+ let json: ProcessedDocument = [];
431
597
  // Call the PDF processor script
432
- if (config?.docling) {
598
+ if (config?.processor.name === "docling") {
433
599
 
434
600
  // Validate Python environment and setup if needed
435
601
  console.log(`[EXULU] Validating Python environment...`);
@@ -444,7 +610,6 @@ async function processPdf(
444
610
  force: false, // Only setup if not already done
445
611
  });
446
612
 
447
-
448
613
  if (!setupResult.success) {
449
614
  throw new Error(`Failed to setup Python environment: ${setupResult.message}\n\n${setupResult.output || ''}`);
450
615
  }
@@ -478,7 +643,8 @@ async function processPdf(
478
643
  // Read the generated JSON file
479
644
  const jsonContent = await fs.promises.readFile(paths.json, 'utf-8');
480
645
  json = JSON.parse(jsonContent);
481
- } else {
646
+
647
+ } else if (config?.processor.name === "officeparser") {
482
648
  const text = await parseOfficeAsync(buffer, {
483
649
  outputErrorToConsole: false,
484
650
  newlineDelimiter: "\n",
@@ -488,18 +654,87 @@ async function processPdf(
488
654
  content: text,
489
655
  headings: [],
490
656
  }];
657
+
658
+ } else if (config?.processor.name === "mistral") {
659
+ if (!process.env.MISTRAL_API_KEY) {
660
+ throw new Error('[EXULU] MISTRAL_API_KEY is not set, please set it in the environment variables.');
661
+ }
662
+
663
+ // Wait a random time between 1 and 5 seconds to prevent rate limiting
664
+ await new Promise(resolve => setTimeout(resolve, Math.floor(Math.random() * 4000) + 1000));
665
+
666
+ const base64Pdf = buffer.toString('base64');
667
+ const client = new Mistral({ apiKey: process.env.MISTRAL_API_KEY });
668
+
669
+ const ocrResponse = await withRetry(async () => {
670
+ type MistralOCRResponse = Awaited<ReturnType<typeof client.ocr.process>>;
671
+ const ocrResponse: MistralOCRResponse = await client.ocr.process({
672
+ document: {
673
+ type: "document_url",
674
+ documentUrl: "data:application/pdf;base64," + base64Pdf
675
+ },
676
+ model: "mistral-ocr-latest",
677
+ includeImageBase64: false
678
+ });
679
+ return ocrResponse;
680
+ }, 10);
681
+
682
+ const parser = new LiteParse();
683
+ const screenshots = await parser.screenshot(paths.source, undefined);
684
+
685
+ // Save the screenshots in the temp image directory
686
+ await fs.promises.mkdir(paths.images, { recursive: true });
687
+ for (const screenshot of screenshots) {
688
+ await fs.promises.writeFile(path.join(
689
+ paths.images, `${screenshot.pageNum}.png`),
690
+ screenshot.imageBuffer
691
+ );
692
+ screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
693
+ }
694
+
695
+ json = ocrResponse.pages.map(page => ({
696
+ page: page.index + 1,
697
+ content: page.markdown,
698
+ image: screenshots.find(s => s.pageNum === page.index + 1)?.imagePath,
699
+ headings: [],
700
+ }));
701
+
702
+ fs.writeFileSync(paths.json, JSON.stringify(json, null, 2));
703
+
704
+ } else if (config?.processor.name === "liteparse") {
705
+
706
+ const parser = new LiteParse();
707
+ const result = await parser.parse(paths.source);
708
+ const screenshots = await parser.screenshot(paths.source, undefined);
709
+
710
+ console.log(`[EXULU] Liteparse screenshots: ${JSON.stringify(screenshots)}`);
711
+
712
+ // Save the screenshots in the temp image directory
713
+ await fs.promises.mkdir(paths.images, { recursive: true });
714
+ for (const screenshot of screenshots) {
715
+ await fs.promises.writeFile(path.join(paths.images, `${screenshot.pageNum}.png`), screenshot.imageBuffer);
716
+ screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
717
+ }
718
+
719
+ json = result.pages.map(page => ({
720
+ page: page.pageNum,
721
+ content: page.text,
722
+ image: screenshots.find(s => s.pageNum === page.pageNum)?.imagePath,
723
+ }));
724
+
725
+ fs.writeFileSync(paths.json, JSON.stringify(json, null, 2));
491
726
  }
492
727
 
493
728
  console.log(`[EXULU] \n✓ Document processing completed successfully`);
494
729
  console.log(`[EXULU] Total pages: ${json.length}`);
495
730
  console.log(`[EXULU] Output file: ${paths.json}`);
496
731
 
497
- if (!config?.docling && config?.vlm?.model) {
732
+ if (config?.vlm?.model) {
498
733
  console.error('[EXULU] VLM validation is only supported when docling is enabled, skipping validation.');
499
734
  }
500
735
 
501
736
  // Apply VLM validation if enabled
502
- if (config?.docling && config?.vlm?.model) {
737
+ if (config?.vlm?.model && json.length > 0) {
503
738
 
504
739
  json = await validateWithVLM(
505
740
  json,
@@ -535,32 +770,56 @@ async function processPdf(
535
770
  );
536
771
  }
537
772
 
538
- const markdown = json.map(p => {
539
- if (p.vlm_corrected_text) {
540
- return p.vlm_corrected_text;
541
- } else {
542
- return p.content;
773
+ // Memory-efficient: Build markdown incrementally and write to file
774
+ // instead of creating a massive string in memory first
775
+ const markdownStream = fs.createWriteStream(paths.markdown, { encoding: 'utf-8' });
776
+
777
+ for (let i = 0; i < json.length; i++) {
778
+ const p = json[i];
779
+ if (!p) continue;
780
+ const content = p.vlm_corrected_text ?? p.content;
781
+ markdownStream.write(content);
782
+
783
+ // Add separator between pages (but not after the last page)
784
+ if (i < json.length - 1) {
785
+ markdownStream.write('\n\n\n<!-- END_OF_PAGE -->\n\n\n');
543
786
  }
544
- }).join('\n\n\n<!-- END_OF_PAGE -->\n\n\n');
787
+ }
545
788
 
546
- await fs.promises.writeFile(
547
- paths.markdown,
548
- markdown,
549
- 'utf-8'
550
- );
789
+ // Close the stream and wait for it to finish
790
+ await new Promise<void>((resolve, reject) => {
791
+ markdownStream.end(() => resolve());
792
+ markdownStream.on('error', reject);
793
+ });
551
794
 
552
795
  console.log(`[EXULU] Validated output saved to: ${paths.json}`);
553
796
  console.log(`[EXULU] Validated markdown saved to: ${paths.markdown}`);
554
797
 
798
+ // Read markdown back for return (still needed for compatibility)
799
+ // but at least we've written it efficiently
800
+ const markdown = await fs.promises.readFile(paths.markdown, 'utf-8');
801
+
802
+ // Memory optimization: Create minimal return objects
803
+ const processedJson = json.map(e => {
804
+ const finalContent = e.vlm_corrected_text ?? e.content;
805
+ return {
806
+ page: e.page,
807
+ content: finalContent,
808
+ };
809
+ });
810
+
811
+ // Clear references to large objects to help natural GC
812
+ // V8 will collect these on its next GC cycle
813
+ json.length = 0;
814
+ json = [];
815
+
816
+ // Log memory usage for monitoring
817
+ const memUsage = process.memoryUsage();
818
+ console.log(`[EXULU] Memory after document processing: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`);
819
+
555
820
  return {
556
821
  markdown: markdown,
557
- json: json.map(e => {
558
- const finalContent = e.vlm_corrected_text || e.content;
559
- return {
560
- page: e.page,
561
- content: finalContent,
562
- };
563
- }),
822
+ json: processedJson,
564
823
  };
565
824
 
566
825
  } catch (error) {
@@ -581,9 +840,9 @@ const loadFile = async (
581
840
  throw new Error('[EXULU] File name does not include extension, extension is required for document processing.');
582
841
  }
583
842
  // Can be any file type
843
+ const UUID = randomUUID();
584
844
  let buffer: Buffer;
585
845
  if (Buffer.isBuffer(file)) {
586
- const UUID = randomUUID();
587
846
  filePath = path.join(tempDir, `${UUID}.${fileType}`);
588
847
  await fs.promises.writeFile(filePath, file);
589
848
  buffer = file;
@@ -594,7 +853,11 @@ const loadFile = async (
594
853
  // Download the file from the url
595
854
  const response = await fetch(filePath);
596
855
  const array: ArrayBuffer = await response.arrayBuffer();
856
+ // save file to temp file
857
+ const tempFilePath = path.join(tempDir, `${UUID}.${fileType}`);
858
+ await fs.promises.writeFile(tempFilePath, Buffer.from(array));
597
859
  buffer = Buffer.from(array);
860
+ filePath = tempFilePath;
598
861
  } else {
599
862
  // Read the file from the local path
600
863
  buffer = await fs.promises.readFile(file);
@@ -624,11 +887,19 @@ export async function documentProcessor({
624
887
  // Temp dir at the root of the project
625
888
  const uuid = randomUUID()
626
889
  const tempDir = path.join(process.cwd(), 'temp', uuid);
890
+ // Track files to delete locally per job to avoid race conditions in parallel execution
891
+ const localFilesAndFoldersToDelete: string[] = [tempDir];
627
892
  console.log(`[EXULU] Temporary directory for processing document ${name}: ${tempDir}`);
628
893
 
629
894
  // Create the temporary directory
630
895
  await fs.promises.mkdir(tempDir, { recursive: true });
631
896
 
897
+ // Create a .txt file in the temp directory with the current timestamp
898
+ // this can be used to clean up lost temp files that are not deleted by
899
+ // the job after a certain amount of time.
900
+ const timestamp = new Date().toISOString();
901
+ await fs.promises.writeFile(path.join(tempDir, 'created_at.txt'), timestamp);
902
+
632
903
  try {
633
904
  const {
634
905
  filePath,
@@ -636,9 +907,24 @@ export async function documentProcessor({
636
907
  buffer
637
908
  } = await loadFile(file, name, tempDir);
638
909
 
639
- const supportedTypes = ['pdf', 'docx', 'doc', 'txt', 'md'];
910
+ let supportedTypes: string[] = [];
911
+ switch (config?.processor.name) {
912
+ case "docling":
913
+ supportedTypes = ['pdf', 'docx', 'doc', 'txt', 'md'];
914
+ break;
915
+ case "officeparser":
916
+ supportedTypes = [];
917
+ break;
918
+ case "liteparse":
919
+ supportedTypes = ['pdf', 'doc', 'docx', 'docm', 'odt', 'rtf', 'ppt', 'pptx', 'pptm', 'odp', 'xls', 'xlsx', 'xlsm', 'ods', 'csv', 'tsv'];
920
+ break;
921
+ case "mistral":
922
+ supportedTypes = ['pdf', 'docx', 'doc', 'txt', 'md'];
923
+ break;
924
+ }
925
+
640
926
  if (!supportedTypes.includes(fileType)) {
641
- throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor.`);
927
+ throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor, the ${config?.processor.name} processor only supports the following file types: ${supportedTypes.join(', ')}.`);
642
928
  }
643
929
 
644
930
  // Process document with VLM validation enabled
@@ -656,10 +942,20 @@ export async function documentProcessor({
656
942
 
657
943
  } catch (error) {
658
944
  console.error('Error during chunking:', error);
659
- return undefined;
945
+ throw error;
946
+
660
947
  } finally {
661
- // Delete the temp directory
662
- // todo disabled for debugging
663
- await fs.promises.rm(tempDir, { recursive: true });
948
+ if (config?.debugging?.deleteTempFiles !== false) {
949
+ // Delete the temp directory using the local array to avoid race conditions
950
+ for (const file of localFilesAndFoldersToDelete) {
951
+ try {
952
+ await fs.promises.rm(file, { recursive: true });
953
+ console.log(`[EXULU] Deleted file or folder: ${file}`);
954
+ } catch (error) {
955
+ console.error(`[EXULU] Error deleting file or folder: ${file}`, error);
956
+ console.log(`[EXULU] File or folder still exists: ${file}`);
957
+ }
958
+ }
959
+ }
664
960
  }
665
961
  }