@heripo/pdf-parser 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +589 -38
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +598 -38
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import { Docling } from "docling-sdk";
|
|
3
3
|
import { execSync } from "child_process";
|
|
4
4
|
import { platform } from "os";
|
|
5
|
-
import { join as
|
|
5
|
+
import { join as join8 } from "path";
|
|
6
6
|
|
|
7
7
|
// src/config/constants.ts
|
|
8
8
|
var PDF_PARSER = {
|
|
@@ -49,6 +49,12 @@ var PAGE_RENDERING = {
|
|
|
49
49
|
/** Low-resolution DPI for OCR strategy sampling */
|
|
50
50
|
SAMPLE_DPI: 150
|
|
51
51
|
};
|
|
52
|
+
/**
 * Defaults for chunked PDF conversion.
 * PDFConverter falls back to these when the caller does not supply
 * `chunkSize` / `chunkMaxRetries` in the conversion options.
 */
var CHUNKED_CONVERSION = {
  /** Pages converted per chunk when no explicit chunk size is given */
  DEFAULT_CHUNK_SIZE: 10,

  /** Extra attempts made for a chunk after its first failure */
  DEFAULT_MAX_RETRIES: 2
};
|
|
52
58
|
var IMAGE_PDF_CONVERTER = {
|
|
53
59
|
/**
|
|
54
60
|
* ImageMagick density option (DPI) for PDF to image conversion
|
|
@@ -843,10 +849,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
843
849
|
|
|
844
850
|
// src/core/pdf-converter.ts
|
|
845
851
|
import { omit } from "es-toolkit";
|
|
846
|
-
import { copyFileSync, createWriteStream as
|
|
847
|
-
import { rename as
|
|
848
|
-
import { join as
|
|
849
|
-
import { pipeline as
|
|
852
|
+
import { copyFileSync as copyFileSync2, createWriteStream as createWriteStream4, existsSync as existsSync5, rmSync as rmSync4 } from "fs";
|
|
853
|
+
import { rename as rename3, writeFile as writeFile2 } from "fs/promises";
|
|
854
|
+
import { join as join7 } from "path";
|
|
855
|
+
import { pipeline as pipeline4 } from "stream/promises";
|
|
850
856
|
|
|
851
857
|
// src/errors/image-pdf-fallback-error.ts
|
|
852
858
|
var ImagePdfFallbackError = class extends Error {
|
|
@@ -2337,10 +2343,541 @@ var LocalFileServer = class {
|
|
|
2337
2343
|
}
|
|
2338
2344
|
};
|
|
2339
2345
|
|
|
2346
|
+
// src/core/chunked-pdf-converter.ts
|
|
2347
|
+
import {
|
|
2348
|
+
copyFileSync,
|
|
2349
|
+
createWriteStream as createWriteStream3,
|
|
2350
|
+
existsSync as existsSync3,
|
|
2351
|
+
mkdirSync as mkdirSync3,
|
|
2352
|
+
readFileSync as readFileSync3,
|
|
2353
|
+
readdirSync as readdirSync3,
|
|
2354
|
+
rmSync as rmSync2,
|
|
2355
|
+
writeFileSync as writeFileSync3
|
|
2356
|
+
} from "fs";
|
|
2357
|
+
import { rename as rename2, writeFile } from "fs/promises";
|
|
2358
|
+
import { join as join5 } from "path";
|
|
2359
|
+
import { pipeline as pipeline3 } from "stream/promises";
|
|
2360
|
+
|
|
2361
|
+
// src/processors/docling-document-merger.ts
|
|
2362
|
+
/** Matches item refs of the form "#/{texts|pictures|tables|groups}/{index}". */
var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/;
/** Matches extracted picture file URIs of the form "images/pic_{index}.png". */
var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/;
/**
 * Ref-list properties rewritten per item kind. Every entry of these lists is a
 * `{ $ref }` object whose index must be shifted when the item moves into the
 * merged document.
 * NOTE(review): pictures and tables are treated alike (captions, references,
 * footnotes) on the assumption that both follow Docling's floating-item
 * shape — confirm against the DoclingDocument schema in use.
 */
var MERGER_REF_LIST_KEYS = {
  texts: ["children"],
  pictures: ["children", "captions", "references", "footnotes"],
  tables: ["children", "captions", "references", "footnotes"],
  groups: ["children"]
};
var DoclingDocumentMerger = class {
  /**
   * Merge an array of DoclingDocuments into one.
   * The first chunk's metadata (schema_name, version, name, origin) is used as the base.
   *
   * Items of later chunks are appended to the base arrays, and every ref that
   * points into texts/pictures/tables/groups is shifted by the number of items
   * already present in the merged document. Input chunks are not mutated.
   *
   * @param chunks - Array of DoclingDocument objects to merge (must have at least 1).
   *   A single chunk is returned as-is (same object, not a copy).
   * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
   *   When provided, picFileOffsets[i] is used for pic_ URI remapping instead of
   *   the pictures array length, aligning URIs with relocated file indices.
   *   A missing entry falls back to the pictures array length.
   * @returns Merged DoclingDocument
   * @throws Error when `chunks` is empty
   */
  merge(chunks, picFileOffsets) {
    if (chunks.length === 0) {
      throw new Error("Cannot merge zero chunks");
    }
    if (chunks.length === 1) {
      return chunks[0];
    }
    const base = structuredClone(chunks[0]);
    for (let i = 1; i < chunks.length; i++) {
      const chunk = chunks[i];
      // Offsets are captured before appending so that every ref of this chunk
      // is shifted by the same amount.
      const offsets = {
        texts: base.texts.length,
        pictures: base.pictures.length,
        tables: base.tables.length,
        groups: base.groups.length
      };
      // `??` (not a truthiness test) so that a legitimate offset of 0 is kept,
      // while a missing entry cannot turn into "pic_NaN.png".
      const picFileOffset = picFileOffsets?.[i] ?? offsets.pictures;
      for (const text of chunk.texts) {
        base.texts.push(
          this.remapItem(text, offsets, MERGER_REF_LIST_KEYS.texts)
        );
      }
      for (const picture of chunk.pictures) {
        const remapped = this.remapItem(
          picture,
          offsets,
          MERGER_REF_LIST_KEYS.pictures
        );
        this.remapPictureImageUri(remapped, picFileOffset);
        base.pictures.push(remapped);
      }
      for (const table of chunk.tables) {
        base.tables.push(
          this.remapItem(table, offsets, MERGER_REF_LIST_KEYS.tables)
        );
      }
      for (const group of chunk.groups) {
        base.groups.push(
          this.remapItem(group, offsets, MERGER_REF_LIST_KEYS.groups)
        );
      }
      for (const child of chunk.body.children) {
        base.body.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      for (const child of chunk.furniture.children) {
        base.furniture.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      // Cloned so the merged document shares no state with the input chunk,
      // like every other item copied above. Later chunks win on key clashes.
      Object.assign(base.pages, structuredClone(chunk.pages));
    }
    return base;
  }
  /**
   * Deep-copy one item and shift its self_ref, parent ref and the listed
   * ref-list properties by `offsets`.
   * A ref list that is absent (or not an array) on the item is left alone
   * rather than treated as an error.
   *
   * @param item - Text, picture, table or group item from a chunk
   * @param offsets - Per-kind index offsets ({ texts, pictures, tables, groups })
   * @param refListKeys - Names of properties holding arrays of `{ $ref }`
   * @returns The remapped copy; `item` itself is not modified
   */
  remapItem(item, offsets, refListKeys) {
    const remapped = structuredClone(item);
    remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
    if (remapped.parent) {
      remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
    }
    for (const key of refListKeys) {
      if (Array.isArray(remapped[key])) {
        remapped[key] = remapped[key].map((entry) => ({
          $ref: this.remapRef(entry.$ref, offsets)
        }));
      }
    }
    return remapped;
  }
  /**
   * Remap a $ref string by applying offsets.
   * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
   * Refs like "#/body" or "#/furniture" pass through unchanged.
   */
  remapRef(ref, offsets) {
    const match = REF_PATTERN.exec(ref);
    if (!match) {
      return ref;
    }
    const kind = match[1];
    const index = parseInt(match[2], 10);
    return `#/${kind}/${index + offsets[kind]}`;
  }
  /**
   * Remap image URI in a picture item by applying the pic file offset.
   * Transforms "images/pic_N.png" → "images/pic_{N+offset}.png".
   * Mutates `picture` in place; URIs of any other form are left untouched.
   */
  remapPictureImageUri(picture, picFileOffset) {
    const image = picture.image;
    if (!image?.uri) return;
    const match = IMAGE_URI_PATTERN.exec(image.uri);
    if (match) {
      const index = parseInt(match[1], 10);
      image.uri = `images/pic_${index + picFileOffset}.png`;
    }
  }
};
|
|
2489
|
+
|
|
2490
|
+
// src/core/chunked-pdf-converter.ts
|
|
2491
|
+
/**
 * Converts a local PDF by splitting it into page ranges ("chunks"), converting
 * each range through the Docling client, and merging the per-chunk results
 * into a single output directory (`<cwd>/output/<reportId>`).
 */
var ChunkedPDFConverter = class {
  /**
   * @param logger - Logger; `info`, `warn` and `error` are called
   * @param client - Docling client; `convertSourceAsync`, `getTaskResultFile`
   *   and `getConfig` are called
   * @param config - `{ chunkSize, maxRetries }`
   * @param timeout - Maximum time in ms to wait for one chunk task
   */
  constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
    this.logger = logger;
    this.client = client;
    this.config = config;
    this.timeout = timeout;
  }
  /**
   * Convert a local PDF in chunks.
   *
   * Temporary chunk directories are always removed when this method settles,
   * whether it succeeds, fails or is aborted. The output directory is removed
   * as well when `cleanupAfterCallback` is true.
   *
   * @param url - file:// URL to the source PDF
   * @param reportId - Unique report identifier for output directory naming
   * @param onComplete - Callback invoked with the final output directory
   * @param cleanupAfterCallback - Whether to clean up the output directory after callback
   * @param options - PDF conversion options (chunked-specific fields are stripped internally)
   * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
   * @param abortSignal - Optional abort signal for cancellation
   * @returns null
   * @throws Error when the page count cannot be detected, when a chunk fails
   *   after all retries, or (name "AbortError") when aborted
   */
  async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
    // Strips the 7-character "file://" prefix verbatim.
    // NOTE(review): no percent-decoding is applied — assumes callers pass an
    // unencoded path; confirm before switching to fileURLToPath.
    const pdfPath = url.slice(7);
    const cwd = process.cwd();
    const outputDir = join5(cwd, "output", reportId);
    const chunksBaseDir = join5(cwd, "output", reportId, "_chunks");
    const totalPages = await this.getPageCount(pdfPath);
    if (totalPages === 0) {
      throw new Error(
        "[ChunkedPDFConverter] Failed to detect page count from PDF"
      );
    }
    const chunks = this.calculateChunks(totalPages);
    this.logger.info(
      `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
    );
    // One try/finally around everything that touches the disk, so a failure in
    // any stage (conversion, merge, relocation, rendering, callback) cannot
    // leave the `_chunks` working directory behind.
    try {
      const chunkDocuments = await this.convertAllChunks(
        chunks,
        pdfPath,
        chunksBaseDir,
        options,
        buildConversionOptions,
        abortSignal
      );
      this.checkAbort(abortSignal);
      this.logger.info(
        `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
      );
      const merger = new DoclingDocumentMerger();
      const picFileOffsets = this.buildPicFileOffsets(
        chunksBaseDir,
        chunks.length
      );
      const merged = merger.merge(chunkDocuments, picFileOffsets);
      this.logger.info(
        `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
      );
      mkdirSync3(outputDir, { recursive: true });
      const imagesDir = join5(outputDir, "images");
      mkdirSync3(imagesDir, { recursive: true });
      this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
      const resultPath = join5(outputDir, "result.json");
      writeFileSync3(resultPath, JSON.stringify(merged));
      await this.renderPageImages(pdfPath, outputDir);
      this.cleanupOrphanedPicFiles(resultPath, imagesDir);
      this.checkAbort(abortSignal);
      this.logger.info(
        "[ChunkedPDFConverter] Executing completion callback..."
      );
      await onComplete(outputDir);
    } finally {
      if (existsSync3(chunksBaseDir)) {
        rmSync2(chunksBaseDir, { recursive: true, force: true });
      }
      if (cleanupAfterCallback) {
        this.logger.info(
          "[ChunkedPDFConverter] Cleaning up output directory:",
          outputDir
        );
        if (existsSync3(outputDir)) {
          rmSync2(outputDir, { recursive: true, force: true });
        }
      } else {
        this.logger.info(
          "[ChunkedPDFConverter] Output preserved at:",
          outputDir
        );
      }
    }
    return null;
  }
  /**
   * Serve the PDF over a local HTTP server and convert every chunk in order.
   * The server is stopped once the loop ends, also on failure or abort.
   *
   * @returns The per-chunk documents, in chunk order
   */
  async convertAllChunks(chunks, pdfPath, chunksBaseDir, options, buildConversionOptions, abortSignal) {
    const server = new LocalFileServer();
    const httpUrl = await server.start(pdfPath);
    this.logger.info(
      "[ChunkedPDFConverter] Started local file server:",
      httpUrl
    );
    const chunkDocuments = [];
    try {
      for (let i = 0; i < chunks.length; i++) {
        this.checkAbort(abortSignal);
        const [start, end] = chunks[i];
        const chunkDir = join5(chunksBaseDir, `_chunk_${i}`);
        mkdirSync3(chunkDir, { recursive: true });
        const doc = await this.convertChunk(
          i,
          chunks.length,
          start,
          end,
          httpUrl,
          chunkDir,
          options,
          buildConversionOptions,
          abortSignal
        );
        chunkDocuments.push(doc);
      }
    } finally {
      this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
      await server.stop();
    }
    return chunkDocuments;
  }
  /**
   * Convert a single chunk with retry logic.
   *
   * Every attempt starts from an empty `chunkDir`, so files left by a failed
   * attempt (or by an earlier interrupted run) cannot leak into the result.
   *
   * @param chunkIndex - Zero-based chunk index (used for logging)
   * @param totalChunks - Total number of chunks (used for logging)
   * @param startPage - First page of the range passed as `page_range`
   * @param endPage - Last page of the range passed as `page_range`
   * @param httpUrl - URL the PDF is served from
   * @param chunkDir - Working directory owned by this chunk; it is wiped
   * @param options - PDF conversion options
   * @param buildConversionOptions - Builds Docling options from `options`
   * @param abortSignal - Optional; checked before every attempt so an aborted
   *   conversion does not keep retrying
   * @returns The chunk's parsed result.json
   * @throws The last error once `config.maxRetries` retries are exhausted
   */
  async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions, abortSignal) {
    const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
    for (let attempt = 0; attempt <= this.config.maxRetries; attempt++) {
      // Outside the try block on purpose: an abort must propagate, not be
      // treated as a retryable failure.
      this.checkAbort(abortSignal);
      try {
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${this.config.maxRetries})...`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: converting...`
          );
        }
        rmSync2(chunkDir, { recursive: true, force: true });
        mkdirSync3(chunkDir, { recursive: true });
        const startTime = Date.now();
        const conversionOptions = buildConversionOptions({
          ...options,
          page_range: [startPage, endPage]
        });
        const task = await this.client.convertSourceAsync({
          sources: [{ kind: "http", url: httpUrl }],
          options: conversionOptions,
          target: { kind: "zip" }
        });
        await this.trackTaskProgress(task);
        const zipPath = join5(chunkDir, "result.zip");
        await this.downloadResult(task.taskId, zipPath);
        const extractDir = join5(chunkDir, "extracted");
        const chunkOutputDir = join5(chunkDir, "output");
        await ImageExtractor.extractAndSaveDocumentsFromZip(
          this.logger,
          zipPath,
          extractDir,
          chunkOutputDir
        );
        const resultJsonPath = join5(chunkOutputDir, "result.json");
        const doc = await runJqFileJson(".", resultJsonPath);
        // Only `output/` is needed later (for images); drop the rest early.
        if (existsSync3(zipPath)) rmSync2(zipPath, { force: true });
        if (existsSync3(extractDir)) {
          rmSync2(extractDir, { recursive: true, force: true });
        }
        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
          );
        }
        return doc;
      } catch (error) {
        if (attempt >= this.config.maxRetries) {
          this.logger.error(
            `[ChunkedPDFConverter] ${chunkLabel}: failed after ${this.config.maxRetries} retries`,
            error
          );
          throw error;
        }
        this.logger.warn(
          `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${this.config.maxRetries})...`,
          error
        );
      }
    }
    // Reached only when config.maxRetries is negative or NaN (loop never runs).
    throw new Error("Unreachable");
  }
  /**
   * Calculate page ranges for chunks.
   *
   * @param totalPages - Number of pages in the PDF
   * @returns Inclusive, 1-based `[start, end]` pairs covering 1..totalPages
   * @throws Error when `config.chunkSize` is not a positive integer
   *   (Infinity is accepted and yields a single chunk)
   */
  calculateChunks(totalPages) {
    const { chunkSize } = this.config;
    if (chunkSize <= 0) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
    }
    // A fractional or NaN size would produce fractional/NaN page ranges.
    if (!Number.isInteger(chunkSize) && chunkSize !== Infinity) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be an integer");
    }
    const ranges = [];
    for (let start = 1; start <= totalPages; start += chunkSize) {
      const end = Math.min(start + chunkSize - 1, totalPages);
      ranges.push([start, end]);
    }
    return ranges;
  }
  /**
   * Get total page count using pdfinfo.
   *
   * @returns The page count, or 0 when pdfinfo exits non-zero or prints no
   *   "Pages:" line
   */
  async getPageCount(pdfPath) {
    const result = await spawnAsync("pdfinfo", [pdfPath]);
    if (result.code !== 0) {
      return 0;
    }
    const match = result.stdout.match(/^Pages:\s+(\d+)/m);
    return match ? parseInt(match[1], 10) : 0;
  }
  /**
   * Poll task progress until completion.
   *
   * @throws Error on task failure (with the task's error messages when they
   *   can be fetched) or when `this.timeout` ms have elapsed
   */
  async trackTaskProgress(task) {
    const startTime = Date.now();
    while (true) {
      if (Date.now() - startTime > this.timeout) {
        throw new Error("[ChunkedPDFConverter] Chunk task timeout");
      }
      const status = await task.poll();
      if (status.task_status === "success") return;
      if (status.task_status === "failure") {
        let details = "unknown";
        try {
          const result = await task.getResult();
          if (result.errors?.length) {
            details = result.errors.map((e) => e.message).join("; ");
          }
        } catch {
          // Best effort: the failure is reported below even when its details
          // cannot be retrieved.
        }
        throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
      }
      await new Promise(
        (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
      );
    }
  }
  /**
   * Download ZIP result for a task.
   * Uses, in order of preference: the client's file stream, the client's
   * in-memory data, and finally a direct HTTP request to `/v1/result/{taskId}`.
   *
   * @throws Error when the direct HTTP request answers with a non-OK status
   */
  async downloadResult(taskId, zipPath) {
    const zipResult = await this.client.getTaskResultFile(taskId);
    if (zipResult.fileStream) {
      const writeStream = createWriteStream3(zipPath);
      await pipeline3(zipResult.fileStream, writeStream);
      return;
    }
    if (zipResult.data) {
      await writeFile(zipPath, zipResult.data);
      return;
    }
    const baseUrl = this.client.getConfig().baseUrl;
    const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
      headers: { Accept: "application/zip" }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
      );
    }
    const buffer = new Uint8Array(await response.arrayBuffer());
    await writeFile(zipPath, buffer);
  }
  /**
   * List files named `{prefix}{N}.png` in `dir`, sorted by N ascending.
   * Shared by relocateImages and buildPicFileOffsets so that both always agree
   * on which files count.
   *
   * @param dir - Directory to scan; a missing directory yields []
   * @param prefix - File name prefix such as "pic_" or "image_"
   * @returns Matching file names in numeric order
   */
  listNumberedPngs(dir, prefix) {
    if (!existsSync3(dir)) return [];
    const pattern = new RegExp(`^${prefix}(\\d+)\\.png$`);
    return readdirSync3(dir)
      .map((file) => ({ file, match: pattern.exec(file) }))
      .filter(({ match }) => match !== null)
      .map(({ file, match }) => ({ file, index: parseInt(match[1], 10) }))
      .sort((a, b) => a.index - b.index)
      .map(({ file }) => file);
  }
  /**
   * Relocate images from chunk output directories to the final images directory
   * with global indexing.
   * pic_ and image_ files are numbered independently: chunk order first, then
   * the file's own index within the chunk.
   */
  relocateImages(chunksBaseDir, totalChunks, imagesDir) {
    const relocate = (prefix) => {
      let globalIndex = 0;
      for (let i = 0; i < totalChunks; i++) {
        const chunkImagesDir = join5(
          chunksBaseDir,
          `_chunk_${i}`,
          "output",
          "images"
        );
        for (const file of this.listNumberedPngs(chunkImagesDir, prefix)) {
          const src = join5(chunkImagesDir, file);
          const dest = join5(imagesDir, `${prefix}${globalIndex}.png`);
          copyFileSync(src, dest);
          globalIndex++;
        }
      }
      return globalIndex;
    };
    const picGlobalIndex = relocate("pic_");
    const imageGlobalIndex = relocate("image_");
    this.logger.info(
      `[ChunkedPDFConverter] Relocated ${picGlobalIndex} pic + ${imageGlobalIndex} image files to ${imagesDir}`
    );
  }
  /** Render page images from PDF using ImageMagick and update result.json */
  async renderPageImages(pdfPath, outputDir) {
    this.logger.info(
      "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
    );
    const renderer = new PageRenderer(this.logger);
    const renderResult = await renderer.renderPages(pdfPath, outputDir);
    const resultPath = join5(outputDir, "result.json");
    const tmpPath = resultPath + ".tmp";
    // Points every page whose zero-based index is below the rendered page
    // count at its rendered file; other pages are left as they are.
    const jqProgram = `
      .pages |= with_entries(
        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
          .value.image.mimetype = "image/png" |
          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
        else . end
      )
    `;
    // Written to a temp file and renamed so result.json is replaced in one step.
    await runJqFileToFile(jqProgram, resultPath, tmpPath);
    await rename2(tmpPath, resultPath);
    this.logger.info(
      `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
    );
  }
  /**
   * Remove pic_ files from images directory that are not referenced in result.json.
   * Chunked Docling conversion embeds page images as base64 in JSON, which get
   * extracted as pic_ files. After renderPageImages replaces page URIs with
   * pages/page_N.png, these pic_ files become orphaned.
   */
  cleanupOrphanedPicFiles(resultPath, imagesDir) {
    const content = readFileSync3(resultPath, "utf-8");
    // References are found by scanning the raw JSON text for pic_ URIs.
    const referencedPics = /* @__PURE__ */ new Set();
    for (const match of content.matchAll(/images\/pic_\d+\.png/g)) {
      referencedPics.add(match[0].replace("images/", ""));
    }
    const picFiles = readdirSync3(imagesDir).filter(
      (f) => f.startsWith("pic_") && f.endsWith(".png")
    );
    let removedCount = 0;
    for (const file of picFiles) {
      if (!referencedPics.has(file)) {
        rmSync2(join5(imagesDir, file), { force: true });
        removedCount++;
      }
    }
    if (removedCount > 0) {
      this.logger.info(
        `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
      );
    }
  }
  /**
   * Build cumulative pic_ file offsets per chunk for correct URI remapping.
   * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
   * A chunk without an images directory contributes 0.
   */
  buildPicFileOffsets(chunksBaseDir, totalChunks) {
    const offsets = [];
    let cumulative = 0;
    for (let i = 0; i < totalChunks; i++) {
      offsets.push(cumulative);
      const dir = join5(chunksBaseDir, `_chunk_${i}`, "output", "images");
      cumulative += this.listNumberedPngs(dir, "pic_").length;
    }
    return offsets;
  }
  /**
   * Check if abort has been signalled and throw if so.
   *
   * @throws Error with name "AbortError" when `signal.aborted` is true
   */
  checkAbort(signal) {
    if (signal?.aborted) {
      const error = new Error("Chunked PDF conversion was aborted");
      error.name = "AbortError";
      throw error;
    }
  }
};
|
|
2876
|
+
|
|
2340
2877
|
// src/core/image-pdf-converter.ts
|
|
2341
|
-
import { existsSync as
|
|
2878
|
+
import { existsSync as existsSync4, rmSync as rmSync3 } from "fs";
|
|
2342
2879
|
import { tmpdir } from "os";
|
|
2343
|
-
import { join as
|
|
2880
|
+
import { join as join6 } from "path";
|
|
2344
2881
|
var ImagePdfConverter = class {
|
|
2345
2882
|
constructor(logger) {
|
|
2346
2883
|
this.logger = logger;
|
|
@@ -2356,8 +2893,8 @@ var ImagePdfConverter = class {
|
|
|
2356
2893
|
async convert(pdfUrl, reportId) {
|
|
2357
2894
|
const timestamp = Date.now();
|
|
2358
2895
|
const tempDir = tmpdir();
|
|
2359
|
-
const inputPath =
|
|
2360
|
-
const outputPath =
|
|
2896
|
+
const inputPath = join6(tempDir, `${reportId}-${timestamp}-input.pdf`);
|
|
2897
|
+
const outputPath = join6(tempDir, `${reportId}-${timestamp}-image.pdf`);
|
|
2361
2898
|
try {
|
|
2362
2899
|
this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
|
|
2363
2900
|
await this.downloadPdf(pdfUrl, inputPath);
|
|
@@ -2366,8 +2903,8 @@ var ImagePdfConverter = class {
|
|
|
2366
2903
|
this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
|
|
2367
2904
|
return outputPath;
|
|
2368
2905
|
} finally {
|
|
2369
|
-
if (
|
|
2370
|
-
|
|
2906
|
+
if (existsSync4(inputPath)) {
|
|
2907
|
+
rmSync3(inputPath, { force: true });
|
|
2371
2908
|
}
|
|
2372
2909
|
}
|
|
2373
2910
|
}
|
|
@@ -2414,12 +2951,12 @@ var ImagePdfConverter = class {
|
|
|
2414
2951
|
* Cleanup the temporary image PDF file
|
|
2415
2952
|
*/
|
|
2416
2953
|
cleanup(imagePdfPath) {
|
|
2417
|
-
if (
|
|
2954
|
+
if (existsSync4(imagePdfPath)) {
|
|
2418
2955
|
this.logger.info(
|
|
2419
2956
|
"[ImagePdfConverter] Cleaning up temp file:",
|
|
2420
2957
|
imagePdfPath
|
|
2421
2958
|
);
|
|
2422
|
-
|
|
2959
|
+
rmSync3(imagePdfPath, { force: true });
|
|
2423
2960
|
}
|
|
2424
2961
|
}
|
|
2425
2962
|
};
|
|
@@ -2434,6 +2971,26 @@ var PDFConverter = class {
|
|
|
2434
2971
|
}
|
|
2435
2972
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
2436
2973
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
2974
|
+
if (options.chunkedConversion && url.startsWith("file://")) {
|
|
2975
|
+
const chunked = new ChunkedPDFConverter(
|
|
2976
|
+
this.logger,
|
|
2977
|
+
this.client,
|
|
2978
|
+
{
|
|
2979
|
+
chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
|
|
2980
|
+
maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
|
|
2981
|
+
},
|
|
2982
|
+
this.timeout
|
|
2983
|
+
);
|
|
2984
|
+
return chunked.convertChunked(
|
|
2985
|
+
url,
|
|
2986
|
+
reportId,
|
|
2987
|
+
onComplete,
|
|
2988
|
+
cleanupAfterCallback,
|
|
2989
|
+
options,
|
|
2990
|
+
(opts) => this.buildConversionOptions(opts),
|
|
2991
|
+
abortSignal
|
|
2992
|
+
);
|
|
2993
|
+
}
|
|
2437
2994
|
if (options.forceImagePdf) {
|
|
2438
2995
|
return this.convertViaImagePdf(
|
|
2439
2996
|
url,
|
|
@@ -2538,7 +3095,7 @@ var PDFConverter = class {
|
|
|
2538
3095
|
const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
|
|
2539
3096
|
return { method, reason, sampledPages: 0, totalPages: 0 };
|
|
2540
3097
|
}
|
|
2541
|
-
const samplingDir =
|
|
3098
|
+
const samplingDir = join7(process.cwd(), "output", reportId, "_sampling");
|
|
2542
3099
|
const sampler = new OcrStrategySampler(
|
|
2543
3100
|
this.logger,
|
|
2544
3101
|
new PageRenderer(this.logger),
|
|
@@ -2563,8 +3120,8 @@ var PDFConverter = class {
|
|
|
2563
3120
|
}
|
|
2564
3121
|
return strategy;
|
|
2565
3122
|
} finally {
|
|
2566
|
-
if (
|
|
2567
|
-
|
|
3123
|
+
if (existsSync5(samplingDir)) {
|
|
3124
|
+
rmSync4(samplingDir, { recursive: true, force: true });
|
|
2568
3125
|
}
|
|
2569
3126
|
}
|
|
2570
3127
|
}
|
|
@@ -2585,7 +3142,7 @@ var PDFConverter = class {
|
|
|
2585
3142
|
const wrappedCallback = async (outputDir) => {
|
|
2586
3143
|
let pageTexts;
|
|
2587
3144
|
try {
|
|
2588
|
-
const resultPath2 =
|
|
3145
|
+
const resultPath2 = join7(outputDir, "result.json");
|
|
2589
3146
|
const totalPages = await runJqFileJson(
|
|
2590
3147
|
".pages | length",
|
|
2591
3148
|
resultPath2
|
|
@@ -2597,9 +3154,9 @@ var PDFConverter = class {
|
|
|
2597
3154
|
"[PDFConverter] pdftotext extraction failed, proceeding without text reference"
|
|
2598
3155
|
);
|
|
2599
3156
|
}
|
|
2600
|
-
const resultPath =
|
|
2601
|
-
const ocrOriginPath =
|
|
2602
|
-
|
|
3157
|
+
const resultPath = join7(outputDir, "result.json");
|
|
3158
|
+
const ocrOriginPath = join7(outputDir, "result_ocr_origin.json");
|
|
3159
|
+
copyFileSync2(resultPath, ocrOriginPath);
|
|
2603
3160
|
const corrector = new VlmTextCorrector(this.logger);
|
|
2604
3161
|
await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
|
|
2605
3162
|
concurrency: options.vlmConcurrency,
|
|
@@ -2741,9 +3298,9 @@ var PDFConverter = class {
|
|
|
2741
3298
|
}
|
|
2742
3299
|
}
|
|
2743
3300
|
const cwd = process.cwd();
|
|
2744
|
-
const zipPath =
|
|
2745
|
-
const extractDir =
|
|
2746
|
-
const outputDir =
|
|
3301
|
+
const zipPath = join7(cwd, "result.zip");
|
|
3302
|
+
const extractDir = join7(cwd, "result_extracted");
|
|
3303
|
+
const outputDir = join7(cwd, "output", reportId);
|
|
2747
3304
|
try {
|
|
2748
3305
|
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
2749
3306
|
await this.renderPageImages(url, outputDir);
|
|
@@ -2760,19 +3317,19 @@ var PDFConverter = class {
|
|
|
2760
3317
|
this.logger.info("[PDFConverter] Total time:", duration, "ms");
|
|
2761
3318
|
} finally {
|
|
2762
3319
|
this.logger.info("[PDFConverter] Cleaning up temporary files...");
|
|
2763
|
-
if (
|
|
2764
|
-
|
|
3320
|
+
if (existsSync5(zipPath)) {
|
|
3321
|
+
rmSync4(zipPath, { force: true });
|
|
2765
3322
|
}
|
|
2766
|
-
if (
|
|
2767
|
-
|
|
3323
|
+
if (existsSync5(extractDir)) {
|
|
3324
|
+
rmSync4(extractDir, { recursive: true, force: true });
|
|
2768
3325
|
}
|
|
2769
3326
|
if (cleanupAfterCallback) {
|
|
2770
3327
|
this.logger.info(
|
|
2771
3328
|
"[PDFConverter] Cleaning up output directory:",
|
|
2772
3329
|
outputDir
|
|
2773
3330
|
);
|
|
2774
|
-
if (
|
|
2775
|
-
|
|
3331
|
+
if (existsSync5(outputDir)) {
|
|
3332
|
+
rmSync4(outputDir, { recursive: true, force: true });
|
|
2776
3333
|
}
|
|
2777
3334
|
} else {
|
|
2778
3335
|
this.logger.info("[PDFConverter] Output preserved at:", outputDir);
|
|
@@ -2790,7 +3347,10 @@ var PDFConverter = class {
|
|
|
2790
3347
|
"skipSampling",
|
|
2791
3348
|
"forcedMethod",
|
|
2792
3349
|
"aggregator",
|
|
2793
|
-
"onTokenUsage"
|
|
3350
|
+
"onTokenUsage",
|
|
3351
|
+
"chunkedConversion",
|
|
3352
|
+
"chunkSize",
|
|
3353
|
+
"chunkMaxRetries"
|
|
2794
3354
|
]),
|
|
2795
3355
|
to_formats: ["json", "html"],
|
|
2796
3356
|
image_export_mode: "embedded",
|
|
@@ -2918,15 +3478,15 @@ var PDFConverter = class {
|
|
|
2918
3478
|
"\n[PDFConverter] Task completed, downloading ZIP file..."
|
|
2919
3479
|
);
|
|
2920
3480
|
const zipResult = await this.client.getTaskResultFile(taskId);
|
|
2921
|
-
const zipPath =
|
|
3481
|
+
const zipPath = join7(process.cwd(), "result.zip");
|
|
2922
3482
|
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
2923
3483
|
if (zipResult.fileStream) {
|
|
2924
|
-
const writeStream =
|
|
2925
|
-
await
|
|
3484
|
+
const writeStream = createWriteStream4(zipPath);
|
|
3485
|
+
await pipeline4(zipResult.fileStream, writeStream);
|
|
2926
3486
|
return;
|
|
2927
3487
|
}
|
|
2928
3488
|
if (zipResult.data) {
|
|
2929
|
-
await
|
|
3489
|
+
await writeFile2(zipPath, zipResult.data);
|
|
2930
3490
|
return;
|
|
2931
3491
|
}
|
|
2932
3492
|
this.logger.warn(
|
|
@@ -2942,7 +3502,7 @@ var PDFConverter = class {
|
|
|
2942
3502
|
);
|
|
2943
3503
|
}
|
|
2944
3504
|
const buffer = new Uint8Array(await response.arrayBuffer());
|
|
2945
|
-
await
|
|
3505
|
+
await writeFile2(zipPath, buffer);
|
|
2946
3506
|
}
|
|
2947
3507
|
async processConvertedFiles(zipPath, extractDir, outputDir) {
|
|
2948
3508
|
await ImageExtractor.extractAndSaveDocumentsFromZip(
|
|
@@ -2971,7 +3531,7 @@ var PDFConverter = class {
|
|
|
2971
3531
|
);
|
|
2972
3532
|
const renderer = new PageRenderer(this.logger);
|
|
2973
3533
|
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2974
|
-
const resultPath =
|
|
3534
|
+
const resultPath = join7(outputDir, "result.json");
|
|
2975
3535
|
const tmpPath = resultPath + ".tmp";
|
|
2976
3536
|
const jqProgram = `
|
|
2977
3537
|
.pages |= with_entries(
|
|
@@ -2983,7 +3543,7 @@ var PDFConverter = class {
|
|
|
2983
3543
|
)
|
|
2984
3544
|
`;
|
|
2985
3545
|
await runJqFileToFile(jqProgram, resultPath, tmpPath);
|
|
2986
|
-
await
|
|
3546
|
+
await rename3(tmpPath, resultPath);
|
|
2987
3547
|
this.logger.info(
|
|
2988
3548
|
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2989
3549
|
);
|
|
@@ -3018,7 +3578,7 @@ var PDFParser = class {
|
|
|
3018
3578
|
this.baseUrl = void 0;
|
|
3019
3579
|
}
|
|
3020
3580
|
this.timeout = timeout;
|
|
3021
|
-
this.venvPath = venvPath ||
|
|
3581
|
+
this.venvPath = venvPath || join8(process.cwd(), ".venv");
|
|
3022
3582
|
this.killExistingProcess = killExistingProcess;
|
|
3023
3583
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
3024
3584
|
}
|