@exulu/backend 1.49.2 → 1.51.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/setup-python.cjs +140 -0
- package/dist/index.cjs +561 -119
- package/dist/index.d.cts +16 -3
- package/dist/index.d.ts +16 -3
- package/dist/index.js +564 -122
- package/ee/chunking/markdown.ts +83 -5
- package/ee/python/documents/processing/doc_processor.ts +380 -84
- package/ee/workers.ts +214 -18
- package/package.json +8 -1
|
@@ -13,13 +13,20 @@ import { parseOfficeAsync } from "officeparser";
|
|
|
13
13
|
import { checkLicense } from '@EE/entitlements';
|
|
14
14
|
import { executePythonScript } from '@SRC/utils/python-executor';
|
|
15
15
|
import { setupPythonEnvironment, validatePythonEnvironment } from '@SRC/utils/python-setup';
|
|
16
|
+
import { LiteParse } from '@llamaindex/liteparse';
|
|
17
|
+
import { Mistral } from '@mistralai/mistralai';
|
|
16
18
|
|
|
17
19
|
// Configuration accepted by the document processor.
// - vlm (optional): language model used for VLM page validation, plus a
//   concurrency value (presumably the parallel page-validation limit — confirm at the call site).
// - processor: selects the parsing backend by `name`.
// - debugging (optional): temp files are removed after processing unless
//   `deleteTempFiles` is explicitly set to false.
type DocumentProcessorConfig = {
|
|
18
20
|
vlm?: {
|
|
19
21
|
model: LanguageModel;
|
|
20
22
|
concurrency: number;
|
|
21
23
|
},
|
|
22
|
-
|
|
24
|
+
// Required: which backend parses the document.
processor: {
|
|
25
|
+
name: "docling" | "liteparse" | "mistral" | "officeparser"
|
|
26
|
+
}
|
|
27
|
+
debugging?: {
|
|
28
|
+
// Cleanup is skipped only when this is exactly `false`; undefined still deletes.
deleteTempFiles?: boolean;
|
|
29
|
+
}
|
|
23
30
|
}
|
|
24
31
|
|
|
25
32
|
type ProcessedPage = {
|
|
@@ -41,6 +48,10 @@ interface VLMValidationResult {
|
|
|
41
48
|
needs_correction: boolean;
|
|
42
49
|
corrected_text?: string;
|
|
43
50
|
confidence: 'high' | 'medium' | 'low';
|
|
51
|
+
current_page_table?: {
|
|
52
|
+
headers: string[];
|
|
53
|
+
is_continuation: boolean; // true if this table appears to be missing headers
|
|
54
|
+
}
|
|
44
55
|
reasoning: string;
|
|
45
56
|
}
|
|
46
57
|
|
|
@@ -171,43 +182,80 @@ async function validatePageWithVLM(
|
|
|
171
182
|
const imageBase64 = imageBuffer.toString('base64');
|
|
172
183
|
const mimeType = 'image/png';
|
|
173
184
|
|
|
174
|
-
const prompt = `You are
|
|
175
|
-
|
|
176
|
-
Here is the current OCR/parsed content for this page:
|
|
185
|
+
const prompt = `You are a document validation assistant. Your task is to analyze a page image and correct the output of an OCR/parsing pipeline. The content may include tables, technical diagrams, schematics, and structured text.
|
|
177
186
|
|
|
178
187
|
---
|
|
188
|
+
## CURRENT OCR OUTPUT
|
|
189
|
+
|
|
179
190
|
${page.content}
|
|
180
191
|
---
|
|
181
192
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
1. Check if the extracted markdown text accurately represents the content from the page, including:
|
|
185
|
-
- Table data (rows, columns, headers, values)
|
|
186
|
-
- Technical diagrams, schematics, control boards
|
|
187
|
-
- Icons, checkmarks, symbols
|
|
188
|
-
- Image captions and labels
|
|
193
|
+
## YOUR TASK
|
|
189
194
|
|
|
190
|
-
|
|
195
|
+
Compare the page image to the OCR output above. Identify errors, omissions, and formatting issues, then return a structured validation result (see OUTPUT FORMAT below).
|
|
191
196
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
-
|
|
197
|
+
---
|
|
198
|
+
## VALIDATION CHECKLIST
|
|
199
|
+
|
|
200
|
+
Work through these checks in order:
|
|
201
|
+
|
|
202
|
+
### 1. Text Accuracy
|
|
203
|
+
- Verify all text is correctly transcribed.
|
|
204
|
+
- For minor character-level OCR errors (e.g. "ö" vs "ü", "rn" vs "m"), **prefer the original OCR output** unless you are certain of an error. Do not silently "fix" characters based on guesswork.
|
|
205
|
+
|
|
206
|
+
### 2. Heading Levels
|
|
207
|
+
- Verify that headings use correct Markdown levels (#, ##, ###, ####, #####, ######).
|
|
208
|
+
- Determine heading level using the following priority:
|
|
209
|
+
1. **Hierarchical numbering** (strongest signal): e.g. "1" → #, "2.1" → ##, "2.1.1" → ###, "2.1.2.5" → ####
|
|
210
|
+
2. Font size (larger = higher level)
|
|
211
|
+
3. Indentation
|
|
212
|
+
4. Bold/emphasis styling
|
|
213
|
+
|
|
214
|
+
### 3. Tables
|
|
215
|
+
|
|
216
|
+
**First, decide whether the table should be Markdown or plain text:**
|
|
217
|
+
- Use **Markdown table format** if the table has a consistent, clear header structure and uniform column layout throughout.
|
|
218
|
+
- Use **plain text structured description** if the table:
|
|
219
|
+
- Lacks a clear header row
|
|
220
|
+
- Uses mixed or irregular column structures across rows
|
|
221
|
+
- Functions more like a certificate, form, or label layout
|
|
222
|
+
|
|
223
|
+
**If using Markdown format**, follow these rules strictly:
|
|
224
|
+
- Every table must have: header row → separator row → data rows
|
|
225
|
+
- Use simple separators only: \`| --- | --- |\` (NOT \`|---|---|\` or long dashes)
|
|
226
|
+
- Example:
|
|
227
|
+
\`\`\`
|
|
199
228
|
| Column 1 | Column 2 |
|
|
200
229
|
| --- | --- |
|
|
201
|
-
| Data 1
|
|
202
|
-
|
|
203
|
-
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
-
|
|
209
|
-
|
|
210
|
-
|
|
230
|
+
| Data 1 | Data 2 |
|
|
231
|
+
\`\`\`
|
|
232
|
+
- Important: do not use the | character as part of the data inside a cell, this would break the table, if a cell contains a | character, use a capital I.
|
|
233
|
+
|
|
234
|
+
**Symbol translation rules for table cells:**
|
|
235
|
+
- Black/filled dot → \`+\` (active); White/empty dot → \`-\` (inactive)
|
|
236
|
+
*(e.g. Rufe-LED columns)*
|
|
237
|
+
- Green or black checkmark → \`+\` (active); Red or black cross → \`-\` (inactive)
|
|
238
|
+
|
|
239
|
+
### 4. Multi-Page Table Continuity
|
|
240
|
+
- If this page contains a table with a header row that runs to the bottom of the page (suggesting it may continue on the next page), extract the header row and include it in the \`current_page_table.headers\` field.
|
|
241
|
+
- If this page contains a table WITHOUT a header row (suggesting it's a continuation from a previous page), set \`current_page_table.is_continuation\` to true and try to identify what the headers might be based on the data structure. Include your best guess for headers in \`current_page_table.headers\`.
|
|
242
|
+
|
|
243
|
+
### 5. Technical Diagrams & Schematics
|
|
244
|
+
If the page contains a flow-chart, schematic, technical drawing or control board layout that is **absent or poorly described** in the OCR output do the following:
|
|
245
|
+
- Open a <diagram> tag with the following content:
|
|
246
|
+
<diagram>
|
|
247
|
+
<description>
|
|
248
|
+
Add a detailed description of the diagram here.
|
|
249
|
+
</description>
|
|
250
|
+
<mermaid>
|
|
251
|
+
Add a mermaid diagram schema here that in detail describes the diagram.
|
|
252
|
+
</mermaid>
|
|
253
|
+
</diagram>
|
|
254
|
+
|
|
255
|
+
### 6. Captions, Icons & Symbols
|
|
256
|
+
- Verify that image captions, labels, icons, and checkmarks are present and correctly transcribed.
|
|
257
|
+
|
|
258
|
+
### 7. Only populate \`corrected_text\` when \`needs_correction\` is true. If the OCR output is accurate, return \`needs_correction: false\` and \`corrected_content: null\`.
|
|
211
259
|
`;
|
|
212
260
|
|
|
213
261
|
const result = await generateText({
|
|
@@ -216,6 +264,10 @@ Specific notes and guidelines:
|
|
|
216
264
|
schema: z.object({
|
|
217
265
|
needs_correction: z.boolean(),
|
|
218
266
|
corrected_text: z.string().nullable(),
|
|
267
|
+
current_page_table: z.object({
|
|
268
|
+
headers: z.array(z.string()),
|
|
269
|
+
is_continuation: z.boolean(),
|
|
270
|
+
}).nullable(),
|
|
219
271
|
confidence: z.enum(['high', 'medium', 'low']),
|
|
220
272
|
reasoning: z.string(),
|
|
221
273
|
}),
|
|
@@ -239,6 +291,10 @@ Specific notes and guidelines:
|
|
|
239
291
|
needs_correction: boolean;
|
|
240
292
|
corrected_text: string | null;
|
|
241
293
|
confidence: 'high' | 'medium' | 'low';
|
|
294
|
+
current_page_table?: {
|
|
295
|
+
headers: string[];
|
|
296
|
+
is_continuation: boolean;
|
|
297
|
+
} | null;
|
|
242
298
|
reasoning: string;
|
|
243
299
|
};
|
|
244
300
|
|
|
@@ -246,6 +302,7 @@ Specific notes and guidelines:
|
|
|
246
302
|
needs_correction: parsedOutput.needs_correction,
|
|
247
303
|
corrected_text: parsedOutput.corrected_text || undefined,
|
|
248
304
|
confidence: parsedOutput.confidence,
|
|
305
|
+
current_page_table: parsedOutput.current_page_table || undefined,
|
|
249
306
|
reasoning: parsedOutput.reasoning,
|
|
250
307
|
};
|
|
251
308
|
|
|
@@ -253,7 +310,79 @@ Specific notes and guidelines:
|
|
|
253
310
|
}
|
|
254
311
|
|
|
255
312
|
/**
 * Reconstructs table headers across pages sequentially after parallel VLM processing.
 *
 * Pages are visited in document order while remembering the headers of the most
 * recent table. When a page reports a headerless continuation table and headers
 * are remembered from the directly preceding table page, a Markdown header row
 * and separator row are inserted before the first line of that page that starts
 * with `|`.
 *
 * Pages are mutated in place: `vlm_corrected_text` is rewritten when it is set,
 * otherwise `content` is rewritten.
 *
 * @param document - Processed pages, in page order.
 * @param validationResults - VLM validation results keyed by page number. Pages
 *   without an entry (skipped during validation) are treated as having no table.
 * @param verbose - When true, logs each reconstruction step.
 */
function reconstructTableHeaders(
  document: ProcessedDocument,
  validationResults: Map<number, VLMValidationResult>,
  verbose: boolean = false
): void {
  let lastTableHeaders: string[] | undefined = undefined;

  for (const page of document) {
    const tableInfo = validationResults.get(page.page)?.current_page_table;

    // No validation result or no reported headers: the table run is broken here.
    // Resetting also for skipped pages prevents headers from an earlier table
    // leaking onto an unrelated headerless table further down the document.
    if (!tableInfo || tableInfo.headers.length === 0) {
      lastTableHeaders = undefined;
      continue;
    }

    // A table with its own headers (or a continuation with nothing to inherit):
    // remember its headers for a potential continuation on the next page.
    if (!tableInfo.is_continuation || !lastTableHeaders) {
      lastTableHeaders = tableInfo.headers;
      if (verbose) {
        console.log(`[EXULU] Page ${page.page}: Storing table headers for potential continuation`);
        console.log(`[EXULU] Headers: ${lastTableHeaders.join(' | ')}`);
      }
      continue;
    }

    // Continuation of the previous table. lastTableHeaders is intentionally left
    // untouched so a table spanning 3+ pages keeps inheriting the original headers.
    const inheritedHeaders: string[] = lastTableHeaders;

    if (verbose) {
      console.log(`[EXULU] Page ${page.page}: Reconstructing table headers from previous page`);
      console.log(`[EXULU] Previous headers: ${inheritedHeaders.join(' | ')}`);
    }

    // Get the content to modify (corrected or original)
    const contentToModify = page.vlm_corrected_text || page.content;

    // Find the first table row in the content
    const lines = contentToModify.split('\n');
    const firstTableLineIndex = lines.findIndex(line => line.trim().startsWith('|'));
    if (firstTableLineIndex === -1) {
      continue;
    }

    // Build the header row and the `| --- | --- |` separator row
    const headerRow = `| ${inheritedHeaders.join(' | ')} |`;
    const separatorRow = `| ${inheritedHeaders.map(() => '---').join(' | ')} |`;

    // Insert headers before the first table row
    lines.splice(firstTableLineIndex, 0, headerRow, separatorRow);
    const reconstructedContent = lines.join('\n');

    // Write back to whichever field the content was read from
    if (page.vlm_corrected_text) {
      page.vlm_corrected_text = reconstructedContent;
    } else {
      page.content = reconstructedContent;
    }

    if (verbose) {
      console.log(`[EXULU] Page ${page.page}: Added table headers successfully`);
    }
  }
}
|
|
383
|
+
|
|
384
|
+
/**
|
|
385
|
+
* Identifies pages that need VLM validation and validates them in parallel
|
|
257
386
|
*/
|
|
258
387
|
async function validateWithVLM(
|
|
259
388
|
document: ProcessedDocument,
|
|
@@ -262,25 +391,46 @@ async function validateWithVLM(
|
|
|
262
391
|
concurrency: number = 10
|
|
263
392
|
): Promise<ProcessedDocument> {
|
|
264
393
|
console.log(`[EXULU] Starting VLM validation for docling output, ${document.length} pages...`);
|
|
265
|
-
console.log(
|
|
266
|
-
|
|
267
|
-
|
|
394
|
+
console.log(`[EXULU] Concurrency limit: ${concurrency}`);
|
|
395
|
+
|
|
396
|
+
// Create a concurrency limiter
|
|
397
|
+
const limit = pLimit(concurrency);
|
|
398
|
+
|
|
399
|
+
// Store validation results for post-processing
|
|
400
|
+
const validationResults = new Map<number, VLMValidationResult>();
|
|
268
401
|
|
|
269
|
-
//
|
|
402
|
+
// Track metrics
|
|
270
403
|
let validatedCount = 0;
|
|
271
404
|
let correctedCount = 0;
|
|
272
405
|
|
|
273
|
-
// Create
|
|
274
|
-
const
|
|
275
|
-
|
|
276
|
-
// Create validation tasks for all pages
|
|
277
|
-
const validationTasks = document.map((page) =>
|
|
406
|
+
// Create parallel validation tasks for all pages
|
|
407
|
+
const validationTasks = document.map(page =>
|
|
278
408
|
limit(async () => {
|
|
409
|
+
// Yield control to the event loop to prevent stalling
|
|
410
|
+
// This is critical for BullMQ to renew job locks during long-running operations
|
|
411
|
+
await new Promise(resolve => setImmediate(resolve));
|
|
279
412
|
|
|
280
413
|
const imagePath = page.image;
|
|
281
414
|
|
|
415
|
+
if (!page.content) {
|
|
416
|
+
console.warn(`[EXULU] Page ${page.page}: No content found, skipping validation`);
|
|
417
|
+
return;
|
|
418
|
+
}
|
|
419
|
+
|
|
282
420
|
if (!imagePath) {
|
|
283
|
-
console.
|
|
421
|
+
console.warn(`[EXULU] Page ${page.page}: No image found, skipping validation`);
|
|
422
|
+
return;
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
// Check if page.content has a .jpeg, .jpg, .png, .gif, .webp image
|
|
426
|
+
const hasImage = page.content.match(/\.(jpeg|jpg|png|gif|webp)/i);
|
|
427
|
+
// Check if the content has multiple occurrences of |
|
|
428
|
+
const hasTable = (page.content.match(/\|/g)?.length || 0) > 1;
|
|
429
|
+
|
|
430
|
+
if (!hasImage && !hasTable) {
|
|
431
|
+
if (verbose) {
|
|
432
|
+
console.log(`[EXULU] Page ${page.page}: No image or table found, SKIPPING VLM validation`);
|
|
433
|
+
}
|
|
284
434
|
return;
|
|
285
435
|
}
|
|
286
436
|
|
|
@@ -290,6 +440,16 @@ async function validateWithVLM(
|
|
|
290
440
|
validation = await withRetry(async () => {
|
|
291
441
|
return await validatePageWithVLM(page, imagePath, model);
|
|
292
442
|
}, 3);
|
|
443
|
+
|
|
444
|
+
// Store validation result for post-processing
|
|
445
|
+
validationResults.set(page.page, validation);
|
|
446
|
+
|
|
447
|
+
if (verbose && validation.current_page_table) {
|
|
448
|
+
console.log(`[EXULU] Page ${page.page} table info:`, {
|
|
449
|
+
headers: validation.current_page_table.headers,
|
|
450
|
+
is_continuation: validation.current_page_table.is_continuation
|
|
451
|
+
});
|
|
452
|
+
}
|
|
293
453
|
} catch (error) {
|
|
294
454
|
console.error(`[EXULU] Error validating page ${page.page} with VLM more than 3 times, skipping:`, error);
|
|
295
455
|
// Throw so the job fails
|
|
@@ -330,12 +490,17 @@ async function validateWithVLM(
|
|
|
330
490
|
})
|
|
331
491
|
);
|
|
332
492
|
|
|
333
|
-
// Wait for all
|
|
493
|
+
// Wait for all parallel validations to complete
|
|
334
494
|
await Promise.all(validationTasks);
|
|
335
495
|
|
|
336
|
-
console.log(`[EXULU] VLM validation complete:`);
|
|
337
|
-
console.log(`[EXULU] Validated: ${validatedCount}
|
|
338
|
-
console.log(`[EXULU] Corrected: ${correctedCount}
|
|
496
|
+
console.log(`[EXULU] VLM validation complete (parallel processing):`);
|
|
497
|
+
console.log(`[EXULU] Validated: ${validatedCount} pages`);
|
|
498
|
+
console.log(`[EXULU] Corrected: ${correctedCount} pages`);
|
|
499
|
+
|
|
500
|
+
// Post-process: Reconstruct table headers sequentially
|
|
501
|
+
console.log(`[EXULU] Starting sequential table header reconstruction...`);
|
|
502
|
+
reconstructTableHeaders(document, validationResults, verbose);
|
|
503
|
+
console.log(`[EXULU] Table header reconstruction complete`);
|
|
339
504
|
|
|
340
505
|
return document;
|
|
341
506
|
}
|
|
@@ -382,15 +547,6 @@ async function processDocument(
|
|
|
382
547
|
const stripped = filePath.split('.').pop()?.trim();
|
|
383
548
|
let result: ProcessorOutput;
|
|
384
549
|
switch (stripped) {
|
|
385
|
-
case 'pdf':
|
|
386
|
-
result = await processPdf(buffer, paths, config, verbose);
|
|
387
|
-
break;
|
|
388
|
-
case 'docx':
|
|
389
|
-
result = await processDocx(buffer);
|
|
390
|
-
break;
|
|
391
|
-
case 'doc':
|
|
392
|
-
result = await processWord(buffer);
|
|
393
|
-
break;
|
|
394
550
|
case 'txt':
|
|
395
551
|
case 'md':
|
|
396
552
|
let content = buffer.toString();
|
|
@@ -407,6 +563,16 @@ async function processDocument(
|
|
|
407
563
|
}],
|
|
408
564
|
};
|
|
409
565
|
break;
|
|
566
|
+
case 'pdf':
|
|
567
|
+
result = await processPdf(buffer, paths, config, verbose);
|
|
568
|
+
break;
|
|
569
|
+
case 'docx':
|
|
570
|
+
result = await processDocx(buffer);
|
|
571
|
+
break;
|
|
572
|
+
case 'doc':
|
|
573
|
+
result = await processWord(buffer);
|
|
574
|
+
break;
|
|
575
|
+
|
|
410
576
|
// Todo other file types with docx and officeparser
|
|
411
577
|
default:
|
|
412
578
|
throw new Error(`[EXULU] Unsupported file type: ${fileType}`);
|
|
@@ -427,9 +593,9 @@ async function processPdf(
|
|
|
427
593
|
verbose: boolean = false,
|
|
428
594
|
): Promise<ProcessorOutput> {
|
|
429
595
|
try {
|
|
430
|
-
let json: ProcessedDocument;
|
|
596
|
+
let json: ProcessedDocument = [];
|
|
431
597
|
// Call the PDF processor script
|
|
432
|
-
if (config?.docling) {
|
|
598
|
+
if (config?.processor.name === "docling") {
|
|
433
599
|
|
|
434
600
|
// Validate Python environment and setup if needed
|
|
435
601
|
console.log(`[EXULU] Validating Python environment...`);
|
|
@@ -444,7 +610,6 @@ async function processPdf(
|
|
|
444
610
|
force: false, // Only setup if not already done
|
|
445
611
|
});
|
|
446
612
|
|
|
447
|
-
|
|
448
613
|
if (!setupResult.success) {
|
|
449
614
|
throw new Error(`Failed to setup Python environment: ${setupResult.message}\n\n${setupResult.output || ''}`);
|
|
450
615
|
}
|
|
@@ -478,7 +643,8 @@ async function processPdf(
|
|
|
478
643
|
// Read the generated JSON file
|
|
479
644
|
const jsonContent = await fs.promises.readFile(paths.json, 'utf-8');
|
|
480
645
|
json = JSON.parse(jsonContent);
|
|
481
|
-
|
|
646
|
+
|
|
647
|
+
} else if (config?.processor.name === "officeparser") {
|
|
482
648
|
const text = await parseOfficeAsync(buffer, {
|
|
483
649
|
outputErrorToConsole: false,
|
|
484
650
|
newlineDelimiter: "\n",
|
|
@@ -488,18 +654,87 @@ async function processPdf(
|
|
|
488
654
|
content: text,
|
|
489
655
|
headings: [],
|
|
490
656
|
}];
|
|
657
|
+
|
|
658
|
+
} else if (config?.processor.name === "mistral") {
|
|
659
|
+
if (!process.env.MISTRAL_API_KEY) {
|
|
660
|
+
throw new Error('[EXULU] MISTRAL_API_KEY is not set, please set it in the environment variables.');
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
// Wait a random time between 1 and 5 seconds to prevent rate limiting
|
|
664
|
+
await new Promise(resolve => setTimeout(resolve, Math.floor(Math.random() * 4000) + 1000));
|
|
665
|
+
|
|
666
|
+
const base64Pdf = buffer.toString('base64');
|
|
667
|
+
const client = new Mistral({ apiKey: process.env.MISTRAL_API_KEY });
|
|
668
|
+
|
|
669
|
+
const ocrResponse = await withRetry(async () => {
|
|
670
|
+
type MistralOCRResponse = Awaited<ReturnType<typeof client.ocr.process>>;
|
|
671
|
+
const ocrResponse: MistralOCRResponse = await client.ocr.process({
|
|
672
|
+
document: {
|
|
673
|
+
type: "document_url",
|
|
674
|
+
documentUrl: "data:application/pdf;base64," + base64Pdf
|
|
675
|
+
},
|
|
676
|
+
model: "mistral-ocr-latest",
|
|
677
|
+
includeImageBase64: false
|
|
678
|
+
});
|
|
679
|
+
return ocrResponse;
|
|
680
|
+
}, 10);
|
|
681
|
+
|
|
682
|
+
const parser = new LiteParse();
|
|
683
|
+
const screenshots = await parser.screenshot(paths.source, undefined);
|
|
684
|
+
|
|
685
|
+
// Save the screenshots in the temp image directory
|
|
686
|
+
await fs.promises.mkdir(paths.images, { recursive: true });
|
|
687
|
+
for (const screenshot of screenshots) {
|
|
688
|
+
await fs.promises.writeFile(path.join(
|
|
689
|
+
paths.images, `${screenshot.pageNum}.png`),
|
|
690
|
+
screenshot.imageBuffer
|
|
691
|
+
);
|
|
692
|
+
screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
json = ocrResponse.pages.map(page => ({
|
|
696
|
+
page: page.index + 1,
|
|
697
|
+
content: page.markdown,
|
|
698
|
+
image: screenshots.find(s => s.pageNum === page.index + 1)?.imagePath,
|
|
699
|
+
headings: [],
|
|
700
|
+
}));
|
|
701
|
+
|
|
702
|
+
fs.writeFileSync(paths.json, JSON.stringify(json, null, 2));
|
|
703
|
+
|
|
704
|
+
} else if (config?.processor.name === "liteparse") {
|
|
705
|
+
|
|
706
|
+
const parser = new LiteParse();
|
|
707
|
+
const result = await parser.parse(paths.source);
|
|
708
|
+
const screenshots = await parser.screenshot(paths.source, undefined);
|
|
709
|
+
|
|
710
|
+
console.log(`[EXULU] Liteparse screenshots: ${JSON.stringify(screenshots)}`);
|
|
711
|
+
|
|
712
|
+
// Save the screenshots in the temp image directory
|
|
713
|
+
await fs.promises.mkdir(paths.images, { recursive: true });
|
|
714
|
+
for (const screenshot of screenshots) {
|
|
715
|
+
await fs.promises.writeFile(path.join(paths.images, `${screenshot.pageNum}.png`), screenshot.imageBuffer);
|
|
716
|
+
screenshot.imagePath = path.join(paths.images, `${screenshot.pageNum}.png`);
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
json = result.pages.map(page => ({
|
|
720
|
+
page: page.pageNum,
|
|
721
|
+
content: page.text,
|
|
722
|
+
image: screenshots.find(s => s.pageNum === page.pageNum)?.imagePath,
|
|
723
|
+
}));
|
|
724
|
+
|
|
725
|
+
fs.writeFileSync(paths.json, JSON.stringify(json, null, 2));
|
|
491
726
|
}
|
|
492
727
|
|
|
493
728
|
console.log(`[EXULU] \n✓ Document processing completed successfully`);
|
|
494
729
|
console.log(`[EXULU] Total pages: ${json.length}`);
|
|
495
730
|
console.log(`[EXULU] Output file: ${paths.json}`);
|
|
496
731
|
|
|
497
|
-
if (
|
|
732
|
+
if (config?.vlm?.model) {
|
|
498
733
|
console.error('[EXULU] VLM validation is only supported when docling is enabled, skipping validation.');
|
|
499
734
|
}
|
|
500
735
|
|
|
501
736
|
// Apply VLM validation if enabled
|
|
502
|
-
if (config?.
|
|
737
|
+
if (config?.vlm?.model && json.length > 0) {
|
|
503
738
|
|
|
504
739
|
json = await validateWithVLM(
|
|
505
740
|
json,
|
|
@@ -535,32 +770,56 @@ async function processPdf(
|
|
|
535
770
|
);
|
|
536
771
|
}
|
|
537
772
|
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
773
|
+
// Memory-efficient: Build markdown incrementally and write to file
|
|
774
|
+
// instead of creating a massive string in memory first
|
|
775
|
+
const markdownStream = fs.createWriteStream(paths.markdown, { encoding: 'utf-8' });
|
|
776
|
+
|
|
777
|
+
for (let i = 0; i < json.length; i++) {
|
|
778
|
+
const p = json[i];
|
|
779
|
+
if (!p) continue;
|
|
780
|
+
const content = p.vlm_corrected_text ?? p.content;
|
|
781
|
+
markdownStream.write(content);
|
|
782
|
+
|
|
783
|
+
// Add separator between pages (but not after the last page)
|
|
784
|
+
if (i < json.length - 1) {
|
|
785
|
+
markdownStream.write('\n\n\n<!-- END_OF_PAGE -->\n\n\n');
|
|
543
786
|
}
|
|
544
|
-
}
|
|
787
|
+
}
|
|
545
788
|
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
'
|
|
550
|
-
);
|
|
789
|
+
// Close the stream and wait for it to finish
|
|
790
|
+
await new Promise<void>((resolve, reject) => {
|
|
791
|
+
markdownStream.end(() => resolve());
|
|
792
|
+
markdownStream.on('error', reject);
|
|
793
|
+
});
|
|
551
794
|
|
|
552
795
|
console.log(`[EXULU] Validated output saved to: ${paths.json}`);
|
|
553
796
|
console.log(`[EXULU] Validated markdown saved to: ${paths.markdown}`);
|
|
554
797
|
|
|
798
|
+
// Read markdown back for return (still needed for compatibility)
|
|
799
|
+
// but at least we've written it efficiently
|
|
800
|
+
const markdown = await fs.promises.readFile(paths.markdown, 'utf-8');
|
|
801
|
+
|
|
802
|
+
// Memory optimization: Create minimal return objects
|
|
803
|
+
const processedJson = json.map(e => {
|
|
804
|
+
const finalContent = e.vlm_corrected_text ?? e.content;
|
|
805
|
+
return {
|
|
806
|
+
page: e.page,
|
|
807
|
+
content: finalContent,
|
|
808
|
+
};
|
|
809
|
+
});
|
|
810
|
+
|
|
811
|
+
// Clear references to large objects to help natural GC
|
|
812
|
+
// V8 will collect these on its next GC cycle
|
|
813
|
+
json.length = 0;
|
|
814
|
+
json = [];
|
|
815
|
+
|
|
816
|
+
// Log memory usage for monitoring
|
|
817
|
+
const memUsage = process.memoryUsage();
|
|
818
|
+
console.log(`[EXULU] Memory after document processing: ${Math.round(memUsage.heapUsed / 1024 / 1024)}MB / ${Math.round(memUsage.heapTotal / 1024 / 1024)}MB`);
|
|
819
|
+
|
|
555
820
|
return {
|
|
556
821
|
markdown: markdown,
|
|
557
|
-
json:
|
|
558
|
-
const finalContent = e.vlm_corrected_text || e.content;
|
|
559
|
-
return {
|
|
560
|
-
page: e.page,
|
|
561
|
-
content: finalContent,
|
|
562
|
-
};
|
|
563
|
-
}),
|
|
822
|
+
json: processedJson,
|
|
564
823
|
};
|
|
565
824
|
|
|
566
825
|
} catch (error) {
|
|
@@ -581,9 +840,9 @@ const loadFile = async (
|
|
|
581
840
|
throw new Error('[EXULU] File name does not include extension, extension is required for document processing.');
|
|
582
841
|
}
|
|
583
842
|
// Can be any file type
|
|
843
|
+
const UUID = randomUUID();
|
|
584
844
|
let buffer: Buffer;
|
|
585
845
|
if (Buffer.isBuffer(file)) {
|
|
586
|
-
const UUID = randomUUID();
|
|
587
846
|
filePath = path.join(tempDir, `${UUID}.${fileType}`);
|
|
588
847
|
await fs.promises.writeFile(filePath, file);
|
|
589
848
|
buffer = file;
|
|
@@ -594,7 +853,11 @@ const loadFile = async (
|
|
|
594
853
|
// Download the file from the url
|
|
595
854
|
const response = await fetch(filePath);
|
|
596
855
|
const array: ArrayBuffer = await response.arrayBuffer();
|
|
856
|
+
// save file to temp file
|
|
857
|
+
const tempFilePath = path.join(tempDir, `${UUID}.${fileType}`);
|
|
858
|
+
await fs.promises.writeFile(tempFilePath, Buffer.from(array));
|
|
597
859
|
buffer = Buffer.from(array);
|
|
860
|
+
filePath = tempFilePath;
|
|
598
861
|
} else {
|
|
599
862
|
// Read the file from the local path
|
|
600
863
|
buffer = await fs.promises.readFile(file);
|
|
@@ -624,11 +887,19 @@ export async function documentProcessor({
|
|
|
624
887
|
// Temp dir at the root of the project
|
|
625
888
|
const uuid = randomUUID()
|
|
626
889
|
const tempDir = path.join(process.cwd(), 'temp', uuid);
|
|
890
|
+
// Track files to delete locally per job to avoid race conditions in parallel execution
|
|
891
|
+
const localFilesAndFoldersToDelete: string[] = [tempDir];
|
|
627
892
|
console.log(`[EXULU] Temporary directory for processing document ${name}: ${tempDir}`);
|
|
628
893
|
|
|
629
894
|
// Create the temporary directory
|
|
630
895
|
await fs.promises.mkdir(tempDir, { recursive: true });
|
|
631
896
|
|
|
897
|
+
// Create a .txt file in the temp directory with the current timestamp
|
|
898
|
+
// this can be used to clean up lost temp files that are not deleted by
|
|
899
|
+
// the job after a certain amount of time.
|
|
900
|
+
const timestamp = new Date().toISOString();
|
|
901
|
+
await fs.promises.writeFile(path.join(tempDir, 'created_at.txt'), timestamp);
|
|
902
|
+
|
|
632
903
|
try {
|
|
633
904
|
const {
|
|
634
905
|
filePath,
|
|
@@ -636,9 +907,24 @@ export async function documentProcessor({
|
|
|
636
907
|
buffer
|
|
637
908
|
} = await loadFile(file, name, tempDir);
|
|
638
909
|
|
|
639
|
-
|
|
910
|
+
let supportedTypes: string[] = [];
|
|
911
|
+
switch (config?.processor.name) {
|
|
912
|
+
case "docling":
|
|
913
|
+
supportedTypes = ['pdf', 'docx', 'doc', 'txt', 'md'];
|
|
914
|
+
break;
|
|
915
|
+
case "officeparser":
|
|
916
|
+
supportedTypes = [];
|
|
917
|
+
break;
|
|
918
|
+
case "liteparse":
|
|
919
|
+
supportedTypes = ['pdf', 'doc', 'docx', 'docm', 'odt', 'rtf', 'ppt', 'pptx', 'pptm', 'odp', 'xls', 'xlsx', 'xlsm', 'ods', 'csv', 'tsv'];
|
|
920
|
+
break;
|
|
921
|
+
case "mistral":
|
|
922
|
+
supportedTypes = ['pdf', 'docx', 'doc', 'txt', 'md'];
|
|
923
|
+
break;
|
|
924
|
+
}
|
|
925
|
+
|
|
640
926
|
if (!supportedTypes.includes(fileType)) {
|
|
641
|
-
throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor.`);
|
|
927
|
+
throw new Error(`[EXULU] Unsupported file type: ${fileType} for Exulu document processor, the ${config?.processor.name} processor only supports the following file types: ${supportedTypes.join(', ')}.`);
|
|
642
928
|
}
|
|
643
929
|
|
|
644
930
|
// Process document with VLM validation enabled
|
|
@@ -656,10 +942,20 @@ export async function documentProcessor({
|
|
|
656
942
|
|
|
657
943
|
} catch (error) {
|
|
658
944
|
console.error('Error during chunking:', error);
|
|
659
|
-
|
|
945
|
+
throw error;
|
|
946
|
+
|
|
660
947
|
} finally {
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
948
|
+
if (config?.debugging?.deleteTempFiles !== false) {
|
|
949
|
+
// Delete the temp directory using the local array to avoid race conditions
|
|
950
|
+
for (const file of localFilesAndFoldersToDelete) {
|
|
951
|
+
try {
|
|
952
|
+
await fs.promises.rm(file, { recursive: true });
|
|
953
|
+
console.log(`[EXULU] Deleted file or folder: ${file}`);
|
|
954
|
+
} catch (error) {
|
|
955
|
+
console.error(`[EXULU] Error deleting file or folder: ${file}`, error);
|
|
956
|
+
console.log(`[EXULU] File or folder still exists: ${file}`);
|
|
957
|
+
}
|
|
958
|
+
}
|
|
959
|
+
}
|
|
664
960
|
}
|
|
665
961
|
}
|