@heripo/pdf-parser 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -40,7 +40,7 @@ module.exports = __toCommonJS(src_exports);
40
40
  var import_docling_sdk = require("docling-sdk");
41
41
  var import_node_child_process3 = require("child_process");
42
42
  var import_node_os2 = require("os");
43
- var import_node_path8 = require("path");
43
+ var import_node_path9 = require("path");
44
44
 
45
45
  // src/config/constants.ts
46
46
  var PDF_PARSER = {
@@ -87,6 +87,12 @@ var PAGE_RENDERING = {
87
87
  /** Low-resolution DPI for OCR strategy sampling */
88
88
  SAMPLE_DPI: 150
89
89
  };
90
+ var CHUNKED_CONVERSION = {
91
+ /** Number of pages per chunk; used when options.chunkSize is not supplied */
92
+ DEFAULT_CHUNK_SIZE: 10,
93
+ /** Maximum retry attempts per failed chunk (the first attempt is not counted); used when options.chunkMaxRetries is not supplied */
94
+ DEFAULT_MAX_RETRIES: 2
95
+ };
90
96
  var IMAGE_PDF_CONVERTER = {
91
97
  /**
92
98
  * ImageMagick density option (DPI) for PDF to image conversion
@@ -875,10 +881,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
875
881
 
876
882
  // src/core/pdf-converter.ts
877
883
  var import_es_toolkit = require("es-toolkit");
878
- var import_node_fs8 = require("fs");
879
- var import_promises4 = require("fs/promises");
880
- var import_node_path7 = require("path");
881
- var import_promises5 = require("stream/promises");
884
+ var import_node_fs9 = require("fs");
885
+ var import_promises6 = require("fs/promises");
886
+ var import_node_path8 = require("path");
887
+ var import_promises7 = require("stream/promises");
882
888
 
883
889
  // src/errors/image-pdf-fallback-error.ts
884
890
  var ImagePdfFallbackError = class extends Error {
@@ -2361,10 +2367,532 @@ var LocalFileServer = class {
2361
2367
  }
2362
2368
  };
2363
2369
 
2364
- // src/core/image-pdf-converter.ts
2370
+ // src/core/chunked-pdf-converter.ts
2365
2371
  var import_node_fs7 = require("fs");
2366
- var import_node_os = require("os");
2372
+ var import_promises4 = require("fs/promises");
2367
2373
  var import_node_path6 = require("path");
2374
+ var import_promises5 = require("stream/promises");
2375
+
2376
+ // src/processors/docling-document-merger.ts
2377
// Matches JSON-pointer style refs to the four item collections, e.g. "#/texts/12".
var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/;
// Matches the relative URI of an extracted picture file, e.g. "images/pic_3.png".
var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/;
// Item collections whose entries are appended (and re-indexed) during a merge.
var MERGER_ITEM_KINDS = ["texts", "pictures", "tables", "groups"];
// Item fields holding arrays of { $ref } pointers. In the docling-core schema
// floating items (pictures, tables, code blocks stored in texts) may carry
// captions, references and footnotes in addition to children.
var MERGER_REF_LIST_FIELDS = ["children", "captions", "references", "footnotes"];
var DoclingDocumentMerger = class {
  /**
   * Merge an array of DoclingDocuments into one.
   * The first chunk's metadata (schema_name, version, name, origin) is used as the base.
   *
   * Items of every later chunk are deep-cloned, their refs are shifted by the
   * number of items already collected, and they are appended to the base. The
   * input chunks are never mutated. Note that a single-chunk input is returned
   * as-is (same object, no clone).
   *
   * @param chunks - Array of DoclingDocument objects to merge (must have at least 1)
   * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
   *   When provided, picFileOffsets[i] is used for pic_ URI remapping instead of
   *   the pictures array length, aligning URIs with relocated file indices.
   *   A missing entry falls back to the pictures array length.
   * @returns Merged DoclingDocument
   * @throws Error when `chunks` is empty
   */
  merge(chunks, picFileOffsets) {
    if (chunks.length === 0) {
      throw new Error("Cannot merge zero chunks");
    }
    if (chunks.length === 1) {
      return chunks[0];
    }
    const base = structuredClone(chunks[0]);
    for (let i = 1; i < chunks.length; i++) {
      const chunk = chunks[i];
      // Snapshot the collection sizes BEFORE appending: every ref inside this
      // chunk is relative to the chunk's own index 0.
      const offsets = {
        texts: base.texts.length,
        pictures: base.pictures.length,
        tables: base.tables.length,
        groups: base.groups.length
      };
      // `??` guards against a too-short offsets array, which would otherwise
      // produce "pic_NaN.png" URIs.
      const picFileOffset = picFileOffsets?.[i] ?? offsets.pictures;
      for (const kind of MERGER_ITEM_KINDS) {
        for (const item of chunk[kind]) {
          const remapped = this.remapItem(item, offsets);
          if (kind === "pictures") {
            this.remapPictureImageUri(remapped, picFileOffset);
          }
          base[kind].push(remapped);
        }
      }
      for (const child of chunk.body.children) {
        base.body.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      for (const child of chunk.furniture.children) {
        base.furniture.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      // Pages are keyed by page number, so later chunks simply add their keys.
      Object.assign(base.pages, chunk.pages);
    }
    return base;
  }
  /**
   * Deep-clone one item and shift every ref it carries (self_ref, parent and
   * all ref-list fields that are present) by the given offsets.
   *
   * @param item - texts/pictures/tables/groups entry of a chunk
   * @param offsets - Per-kind index offsets ({ texts, pictures, tables, groups })
   * @returns The remapped clone; `item` itself is left untouched
   */
  remapItem(item, offsets) {
    const remapped = structuredClone(item);
    remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
    if (remapped.parent) {
      remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
    }
    for (const field of MERGER_REF_LIST_FIELDS) {
      // Absent lists are skipped rather than treated as an error.
      if (Array.isArray(remapped[field])) {
        remapped[field] = remapped[field].map((entry) => ({
          $ref: this.remapRef(entry.$ref, offsets)
        }));
      }
    }
    return remapped;
  }
  /**
   * Remap a $ref string by applying offsets.
   * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
   * Refs like "#/body" or "#/furniture" pass through unchanged.
   */
  remapRef(ref, offsets) {
    const match = REF_PATTERN.exec(ref);
    if (!match) {
      return ref;
    }
    const kind = match[1];
    const index = parseInt(match[2], 10);
    return `#/${kind}/${index + offsets[kind]}`;
  }
  /**
   * Remap image URI in a picture item by applying the pic file offset.
   * Transforms "images/pic_N.png" → "images/pic_{N+offset}.png".
   * Mutates `picture` in place; URIs of any other shape are left alone.
   */
  remapPictureImageUri(picture, picFileOffset) {
    const image = picture.image;
    if (!image?.uri) return;
    const match = IMAGE_URI_PATTERN.exec(image.uri);
    if (match) {
      const index = parseInt(match[1], 10);
      image.uri = `images/pic_${index + picFileOffset}.png`;
    }
  }
};
2504
+
2505
+ // src/core/chunked-pdf-converter.ts
2506
var ChunkedPDFConverter = class {
  /**
   * @param logger - Logger exposing info/warn/error
   * @param client - Docling client (convertSourceAsync, getTaskResultFile, getConfig are used)
   * @param config - Chunking settings: { chunkSize, maxRetries }
   * @param timeout - Maximum time in ms to poll a single chunk task
   */
  constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
    this.logger = logger;
    this.client = client;
    this.config = config;
    this.timeout = timeout;
  }
  /**
   * Convert a local PDF in chunks.
   *
   * Each page range is converted separately, the resulting documents are
   * merged, images are relocated into `output/<reportId>/images`, page images
   * are rendered and finally `onComplete` is invoked. The temporary `_chunks`
   * directory is removed on every exit path, including failures and aborts.
   *
   * @param url - file:// URL to the source PDF (the first 7 characters are stripped to obtain the path)
   * @param reportId - Unique report identifier for output directory naming
   * @param onComplete - Callback invoked with the final output directory
   * @param cleanupAfterCallback - Whether to clean up the output directory after callback
   * @param options - PDF conversion options (chunked-specific fields are stripped internally)
   * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
   * @param abortSignal - Optional abort signal for cancellation
   * @returns null
   * @throws Error when the page count cannot be detected, a chunk fails after
   *   all retries, or the conversion is aborted (error.name === "AbortError")
   */
  async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
    const pdfPath = url.slice(7);
    const outputDir = (0, import_node_path6.join)(process.cwd(), "output", reportId);
    const chunksBaseDir = (0, import_node_path6.join)(outputDir, "_chunks");
    const totalPages = await this.getPageCount(pdfPath);
    if (totalPages === 0) {
      throw new Error(
        "[ChunkedPDFConverter] Failed to detect page count from PDF"
      );
    }
    const chunks = this.calculateChunks(totalPages);
    this.logger.info(
      `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
    );
    try {
      const chunkDocuments = await this.convertAllChunks(
        pdfPath,
        chunks,
        chunksBaseDir,
        options,
        buildConversionOptions,
        abortSignal
      );
      this.checkAbort(abortSignal);
      this.logger.info(
        `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
      );
      const merger = new DoclingDocumentMerger();
      const picFileOffsets = this.buildPicFileOffsets(
        chunksBaseDir,
        chunks.length
      );
      const merged = merger.merge(chunkDocuments, picFileOffsets);
      this.logger.info(
        `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
      );
      const imagesDir = (0, import_node_path6.join)(outputDir, "images");
      // recursive mkdir also creates outputDir itself
      (0, import_node_fs7.mkdirSync)(imagesDir, { recursive: true });
      this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
      const resultPath = (0, import_node_path6.join)(outputDir, "result.json");
      (0, import_node_fs7.writeFileSync)(resultPath, JSON.stringify(merged));
      await this.renderPageImages(pdfPath, outputDir);
      this.cleanupOrphanedPicFiles(resultPath, imagesDir);
      this.checkAbort(abortSignal);
      this.logger.info(
        "[ChunkedPDFConverter] Executing completion callback..."
      );
      await onComplete(outputDir);
    } finally {
      // Runs for success, failure and abort alike so that per-chunk working
      // files never outlive the conversion.
      (0, import_node_fs7.rmSync)(chunksBaseDir, { recursive: true, force: true });
      if (cleanupAfterCallback) {
        this.logger.info(
          "[ChunkedPDFConverter] Cleaning up output directory:",
          outputDir
        );
        (0, import_node_fs7.rmSync)(outputDir, { recursive: true, force: true });
      } else {
        this.logger.info(
          "[ChunkedPDFConverter] Output preserved at:",
          outputDir
        );
      }
    }
    return null;
  }
  /**
   * Serve the PDF over a local HTTP server and convert every page range in
   * order. The server is always stopped, even when a chunk fails.
   *
   * @returns Array of converted chunk documents, in chunk order
   */
  async convertAllChunks(pdfPath, chunks, chunksBaseDir, options, buildConversionOptions, abortSignal) {
    const server = new LocalFileServer();
    const httpUrl = await server.start(pdfPath);
    this.logger.info(
      "[ChunkedPDFConverter] Started local file server:",
      httpUrl
    );
    const chunkDocuments = [];
    try {
      for (let i = 0; i < chunks.length; i++) {
        this.checkAbort(abortSignal);
        const [start, end] = chunks[i];
        const chunkDir = (0, import_node_path6.join)(chunksBaseDir, `_chunk_${i}`);
        (0, import_node_fs7.mkdirSync)(chunkDir, { recursive: true });
        const doc = await this.convertChunk(
          i,
          chunks.length,
          start,
          end,
          httpUrl,
          chunkDir,
          options,
          buildConversionOptions,
          abortSignal
        );
        chunkDocuments.push(doc);
      }
    } finally {
      this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
      await server.stop();
    }
    return chunkDocuments;
  }
  /**
   * Convert a single chunk with retry logic.
   *
   * Up to `config.maxRetries` retries follow the first attempt. An abort is
   * never retried. Before each retry the artifacts of the failed attempt are
   * removed so stale files cannot leak into the chunk output.
   *
   * @param abortSignal - Optional abort signal; checked before every attempt and while polling
   * @returns The chunk's parsed result.json document
   * @throws The last error once all attempts are exhausted, or an AbortError
   */
  async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions, abortSignal) {
    const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
    const configuredRetries = this.config.maxRetries;
    // Invalid retry counts (NaN, negative, non-finite) still get one attempt.
    const maxRetries = Number.isFinite(configuredRetries) && configuredRetries > 0 ? Math.floor(configuredRetries) : 0;
    const zipPath = (0, import_node_path6.join)(chunkDir, "result.zip");
    const extractDir = (0, import_node_path6.join)(chunkDir, "extracted");
    const chunkOutputDir = (0, import_node_path6.join)(chunkDir, "output");
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      // Outside the try block: an abort must propagate, not trigger a retry.
      this.checkAbort(abortSignal);
      try {
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${maxRetries})...`
          );
          for (const stale of [zipPath, extractDir, chunkOutputDir]) {
            (0, import_node_fs7.rmSync)(stale, { recursive: true, force: true });
          }
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: converting...`
          );
        }
        const startTime = Date.now();
        const conversionOptions = buildConversionOptions({
          ...options,
          page_range: [startPage, endPage]
        });
        const task = await this.client.convertSourceAsync({
          sources: [{ kind: "http", url: httpUrl }],
          options: conversionOptions,
          target: { kind: "zip" }
        });
        await this.trackTaskProgress(task, abortSignal);
        await this.downloadResult(task.taskId, zipPath);
        await ImageExtractor.extractAndSaveDocumentsFromZip(
          this.logger,
          zipPath,
          extractDir,
          chunkOutputDir
        );
        const resultJsonPath = (0, import_node_path6.join)(chunkOutputDir, "result.json");
        const doc = await runJqFileJson(".", resultJsonPath);
        // Only chunkOutputDir is needed from here on.
        (0, import_node_fs7.rmSync)(zipPath, { force: true });
        (0, import_node_fs7.rmSync)(extractDir, { recursive: true, force: true });
        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
          );
        }
        return doc;
      } catch (error) {
        if (error?.name === "AbortError") {
          throw error;
        }
        if (attempt >= maxRetries) {
          this.logger.error(
            `[ChunkedPDFConverter] ${chunkLabel}: failed after ${maxRetries} retries`
          );
          throw error;
        }
        // Pass the error along so the cause of an intermediate failure is visible.
        this.logger.warn(
          `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${maxRetries})...`,
          error
        );
      }
    }
    throw new Error("Unreachable");
  }
  /**
   * Calculate page ranges for chunks.
   *
   * @param totalPages - Total number of pages in the PDF
   * @returns Array of inclusive, 1-based [start, end] page ranges
   * @throws Error when config.chunkSize is not a positive integer
   */
  calculateChunks(totalPages) {
    const { chunkSize } = this.config;
    // NaN or fractional sizes would otherwise yield [1, NaN] or fractional pages.
    if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
    }
    const ranges = [];
    for (let start = 1; start <= totalPages; start += chunkSize) {
      ranges.push([start, Math.min(start + chunkSize - 1, totalPages)]);
    }
    return ranges;
  }
  /**
   * Get total page count using pdfinfo.
   *
   * @returns The page count, or 0 when pdfinfo fails or prints no "Pages:" line
   */
  async getPageCount(pdfPath) {
    const result = await spawnAsync("pdfinfo", [pdfPath]);
    if (result.code !== 0) {
      return 0;
    }
    const match = result.stdout.match(/^Pages:\s+(\d+)/m);
    return match ? parseInt(match[1], 10) : 0;
  }
  /**
   * Poll task progress until completion.
   *
   * @param task - Task handle exposing poll() and getResult()
   * @param abortSignal - Optional abort signal, checked before every poll
   * @throws Error on timeout or task failure; AbortError when aborted
   */
  async trackTaskProgress(task, abortSignal) {
    const startTime = Date.now();
    while (true) {
      this.checkAbort(abortSignal);
      if (Date.now() - startTime > this.timeout) {
        throw new Error("[ChunkedPDFConverter] Chunk task timeout");
      }
      const status = await task.poll();
      if (status.task_status === "success") return;
      if (status.task_status === "failure") {
        let details = "unknown";
        try {
          const result = await task.getResult();
          if (result.errors?.length) {
            details = result.errors.map((e) => e.message).join("; ");
          }
        } catch {
          // Best effort only: the failure is reported below with "unknown" details.
        }
        throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
      }
      await new Promise(
        (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
      );
    }
  }
  /**
   * Download ZIP result for a task.
   * Uses the client's stream or buffer when available, otherwise fetches
   * `/v1/result/<taskId>` from the client's base URL.
   *
   * @throws Error when the HTTP fallback responds with a non-OK status
   */
  async downloadResult(taskId, zipPath) {
    const zipResult = await this.client.getTaskResultFile(taskId);
    if (zipResult.fileStream) {
      const writeStream = (0, import_node_fs7.createWriteStream)(zipPath);
      await (0, import_promises5.pipeline)(zipResult.fileStream, writeStream);
      return;
    }
    if (zipResult.data) {
      await (0, import_promises4.writeFile)(zipPath, zipResult.data);
      return;
    }
    const baseUrl = this.client.getConfig().baseUrl;
    const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
      headers: { Accept: "application/zip" }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
      );
    }
    const buffer = new Uint8Array(await response.arrayBuffer());
    await (0, import_promises4.writeFile)(zipPath, buffer);
  }
  /** Path of the images directory produced for chunk `chunkIndex`. */
  chunkImagesDir(chunksBaseDir, chunkIndex) {
    return (0, import_node_path6.join)(
      chunksBaseDir,
      `_chunk_${chunkIndex}`,
      "output",
      "images"
    );
  }
  /**
   * List "<prefix>N.png" files of a directory sorted by their numeric index.
   *
   * @returns File names (not paths); empty when the directory does not exist
   */
  listNumberedPngs(dir, prefix) {
    if (!(0, import_node_fs7.existsSync)(dir)) {
      return [];
    }
    const indexOf = (name) => parseInt(name.slice(prefix.length, -".png".length), 10);
    return (0, import_node_fs7.readdirSync)(dir).filter((f) => f.startsWith(prefix) && f.endsWith(".png")).sort((a, b) => indexOf(a) - indexOf(b));
  }
  /**
   * Copy all "<prefix>N.png" files of every chunk into `imagesDir`, renumbering
   * them consecutively across chunks (chunk order, then numeric order).
   *
   * @returns Number of files copied
   */
  relocateByPrefix(chunksBaseDir, totalChunks, imagesDir, prefix) {
    let globalIndex = 0;
    for (let i = 0; i < totalChunks; i++) {
      const sourceDir = this.chunkImagesDir(chunksBaseDir, i);
      for (const file of this.listNumberedPngs(sourceDir, prefix)) {
        (0, import_node_fs7.copyFileSync)(
          (0, import_node_path6.join)(sourceDir, file),
          (0, import_node_path6.join)(imagesDir, `${prefix}${globalIndex}.png`)
        );
        globalIndex++;
      }
    }
    return globalIndex;
  }
  /**
   * Relocate images from chunk output directories to the final images directory
   * with global indexing. pic_ and image_ files are numbered independently.
   */
  relocateImages(chunksBaseDir, totalChunks, imagesDir) {
    const picGlobalIndex = this.relocateByPrefix(chunksBaseDir, totalChunks, imagesDir, "pic_");
    const imageGlobalIndex = this.relocateByPrefix(chunksBaseDir, totalChunks, imagesDir, "image_");
    this.logger.info(
      `[ChunkedPDFConverter] Relocated ${picGlobalIndex} pic + ${imageGlobalIndex} image files to ${imagesDir}`
    );
  }
  /** Render page images from PDF using ImageMagick and update result.json */
  async renderPageImages(pdfPath, outputDir) {
    this.logger.info(
      "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
    );
    const renderer = new PageRenderer(this.logger);
    const renderResult = await renderer.renderPages(pdfPath, outputDir);
    const resultPath = (0, import_node_path6.join)(outputDir, "result.json");
    const tmpPath = resultPath + ".tmp";
    // Point every page whose zero-based index is within the rendered range at
    // its pages/page_<index>.png file; other pages are left untouched.
    const jqProgram = `
      .pages |= with_entries(
        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
          .value.image.mimetype = "image/png" |
          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
        else . end
      )
    `;
    // Write to a temp file first, then rename over result.json.
    await runJqFileToFile(jqProgram, resultPath, tmpPath);
    await (0, import_promises4.rename)(tmpPath, resultPath);
    this.logger.info(
      `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
    );
  }
  /**
   * Remove pic_ files from images directory that are not referenced in result.json.
   * Chunked Docling conversion embeds page images as base64 in JSON, which get
   * extracted as pic_ files. After renderPageImages replaces page URIs with
   * pages/page_N.png, these pic_ files become orphaned.
   */
  cleanupOrphanedPicFiles(resultPath, imagesDir) {
    const content = (0, import_node_fs7.readFileSync)(resultPath, "utf-8");
    const referencedPics = new Set(
      Array.from(
        content.matchAll(/images\/pic_\d+\.png/g),
        (match) => match[0].replace("images/", "")
      )
    );
    let removedCount = 0;
    for (const file of this.listNumberedPngs(imagesDir, "pic_")) {
      if (!referencedPics.has(file)) {
        (0, import_node_fs7.rmSync)((0, import_node_path6.join)(imagesDir, file), { force: true });
        removedCount++;
      }
    }
    if (removedCount > 0) {
      this.logger.info(
        `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
      );
    }
  }
  /**
   * Build cumulative pic_ file offsets per chunk for correct URI remapping.
   * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
   */
  buildPicFileOffsets(chunksBaseDir, totalChunks) {
    const offsets = [];
    let cumulative = 0;
    for (let i = 0; i < totalChunks; i++) {
      offsets.push(cumulative);
      cumulative += this.listNumberedPngs(
        this.chunkImagesDir(chunksBaseDir, i),
        "pic_"
      ).length;
    }
    return offsets;
  }
  /**
   * Check if abort has been signalled and throw if so.
   *
   * @throws Error with name "AbortError" when `signal.aborted` is truthy
   */
  checkAbort(signal) {
    if (signal?.aborted) {
      const error = new Error("Chunked PDF conversion was aborted");
      error.name = "AbortError";
      throw error;
    }
  }
};
2891
+
2892
+ // src/core/image-pdf-converter.ts
2893
+ var import_node_fs8 = require("fs");
2894
+ var import_node_os = require("os");
2895
+ var import_node_path7 = require("path");
2368
2896
  var ImagePdfConverter = class {
2369
2897
  constructor(logger) {
2370
2898
  this.logger = logger;
@@ -2380,8 +2908,8 @@ var ImagePdfConverter = class {
2380
2908
  async convert(pdfUrl, reportId) {
2381
2909
  const timestamp = Date.now();
2382
2910
  const tempDir = (0, import_node_os.tmpdir)();
2383
- const inputPath = (0, import_node_path6.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
2384
- const outputPath = (0, import_node_path6.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
2911
+ const inputPath = (0, import_node_path7.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
2912
+ const outputPath = (0, import_node_path7.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
2385
2913
  try {
2386
2914
  this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
2387
2915
  await this.downloadPdf(pdfUrl, inputPath);
@@ -2390,8 +2918,8 @@ var ImagePdfConverter = class {
2390
2918
  this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
2391
2919
  return outputPath;
2392
2920
  } finally {
2393
- if ((0, import_node_fs7.existsSync)(inputPath)) {
2394
- (0, import_node_fs7.rmSync)(inputPath, { force: true });
2921
+ if ((0, import_node_fs8.existsSync)(inputPath)) {
2922
+ (0, import_node_fs8.rmSync)(inputPath, { force: true });
2395
2923
  }
2396
2924
  }
2397
2925
  }
@@ -2438,12 +2966,12 @@ var ImagePdfConverter = class {
2438
2966
  * Cleanup the temporary image PDF file
2439
2967
  */
2440
2968
  cleanup(imagePdfPath) {
2441
- if ((0, import_node_fs7.existsSync)(imagePdfPath)) {
2969
+ if ((0, import_node_fs8.existsSync)(imagePdfPath)) {
2442
2970
  this.logger.info(
2443
2971
  "[ImagePdfConverter] Cleaning up temp file:",
2444
2972
  imagePdfPath
2445
2973
  );
2446
- (0, import_node_fs7.rmSync)(imagePdfPath, { force: true });
2974
+ (0, import_node_fs8.rmSync)(imagePdfPath, { force: true });
2447
2975
  }
2448
2976
  }
2449
2977
  };
@@ -2458,6 +2986,26 @@ var PDFConverter = class {
2458
2986
  }
2459
2987
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
2460
2988
  this.logger.info("[PDFConverter] Converting:", url);
2989
+ if (options.chunkedConversion && url.startsWith("file://")) {
2990
+ const chunked = new ChunkedPDFConverter(
2991
+ this.logger,
2992
+ this.client,
2993
+ {
2994
+ chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
2995
+ maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
2996
+ },
2997
+ this.timeout
2998
+ );
2999
+ return chunked.convertChunked(
3000
+ url,
3001
+ reportId,
3002
+ onComplete,
3003
+ cleanupAfterCallback,
3004
+ options,
3005
+ (opts) => this.buildConversionOptions(opts),
3006
+ abortSignal
3007
+ );
3008
+ }
2461
3009
  if (options.forceImagePdf) {
2462
3010
  return this.convertViaImagePdf(
2463
3011
  url,
@@ -2562,7 +3110,7 @@ var PDFConverter = class {
2562
3110
  const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
2563
3111
  return { method, reason, sampledPages: 0, totalPages: 0 };
2564
3112
  }
2565
- const samplingDir = (0, import_node_path7.join)(process.cwd(), "output", reportId, "_sampling");
3113
+ const samplingDir = (0, import_node_path8.join)(process.cwd(), "output", reportId, "_sampling");
2566
3114
  const sampler = new OcrStrategySampler(
2567
3115
  this.logger,
2568
3116
  new PageRenderer(this.logger),
@@ -2587,8 +3135,8 @@ var PDFConverter = class {
2587
3135
  }
2588
3136
  return strategy;
2589
3137
  } finally {
2590
- if ((0, import_node_fs8.existsSync)(samplingDir)) {
2591
- (0, import_node_fs8.rmSync)(samplingDir, { recursive: true, force: true });
3138
+ if ((0, import_node_fs9.existsSync)(samplingDir)) {
3139
+ (0, import_node_fs9.rmSync)(samplingDir, { recursive: true, force: true });
2592
3140
  }
2593
3141
  }
2594
3142
  }
@@ -2609,7 +3157,7 @@ var PDFConverter = class {
2609
3157
  const wrappedCallback = async (outputDir) => {
2610
3158
  let pageTexts;
2611
3159
  try {
2612
- const resultPath2 = (0, import_node_path7.join)(outputDir, "result.json");
3160
+ const resultPath2 = (0, import_node_path8.join)(outputDir, "result.json");
2613
3161
  const totalPages = await runJqFileJson(
2614
3162
  ".pages | length",
2615
3163
  resultPath2
@@ -2621,9 +3169,9 @@ var PDFConverter = class {
2621
3169
  "[PDFConverter] pdftotext extraction failed, proceeding without text reference"
2622
3170
  );
2623
3171
  }
2624
- const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
2625
- const ocrOriginPath = (0, import_node_path7.join)(outputDir, "result_ocr_origin.json");
2626
- (0, import_node_fs8.copyFileSync)(resultPath, ocrOriginPath);
3172
+ const resultPath = (0, import_node_path8.join)(outputDir, "result.json");
3173
+ const ocrOriginPath = (0, import_node_path8.join)(outputDir, "result_ocr_origin.json");
3174
+ (0, import_node_fs9.copyFileSync)(resultPath, ocrOriginPath);
2627
3175
  const corrector = new VlmTextCorrector(this.logger);
2628
3176
  await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
2629
3177
  concurrency: options.vlmConcurrency,
@@ -2765,9 +3313,9 @@ var PDFConverter = class {
2765
3313
  }
2766
3314
  }
2767
3315
  const cwd = process.cwd();
2768
- const zipPath = (0, import_node_path7.join)(cwd, "result.zip");
2769
- const extractDir = (0, import_node_path7.join)(cwd, "result_extracted");
2770
- const outputDir = (0, import_node_path7.join)(cwd, "output", reportId);
3316
+ const zipPath = (0, import_node_path8.join)(cwd, "result.zip");
3317
+ const extractDir = (0, import_node_path8.join)(cwd, "result_extracted");
3318
+ const outputDir = (0, import_node_path8.join)(cwd, "output", reportId);
2771
3319
  try {
2772
3320
  await this.processConvertedFiles(zipPath, extractDir, outputDir);
2773
3321
  await this.renderPageImages(url, outputDir);
@@ -2784,19 +3332,19 @@ var PDFConverter = class {
2784
3332
  this.logger.info("[PDFConverter] Total time:", duration, "ms");
2785
3333
  } finally {
2786
3334
  this.logger.info("[PDFConverter] Cleaning up temporary files...");
2787
- if ((0, import_node_fs8.existsSync)(zipPath)) {
2788
- (0, import_node_fs8.rmSync)(zipPath, { force: true });
3335
+ if ((0, import_node_fs9.existsSync)(zipPath)) {
3336
+ (0, import_node_fs9.rmSync)(zipPath, { force: true });
2789
3337
  }
2790
- if ((0, import_node_fs8.existsSync)(extractDir)) {
2791
- (0, import_node_fs8.rmSync)(extractDir, { recursive: true, force: true });
3338
+ if ((0, import_node_fs9.existsSync)(extractDir)) {
3339
+ (0, import_node_fs9.rmSync)(extractDir, { recursive: true, force: true });
2792
3340
  }
2793
3341
  if (cleanupAfterCallback) {
2794
3342
  this.logger.info(
2795
3343
  "[PDFConverter] Cleaning up output directory:",
2796
3344
  outputDir
2797
3345
  );
2798
- if ((0, import_node_fs8.existsSync)(outputDir)) {
2799
- (0, import_node_fs8.rmSync)(outputDir, { recursive: true, force: true });
3346
+ if ((0, import_node_fs9.existsSync)(outputDir)) {
3347
+ (0, import_node_fs9.rmSync)(outputDir, { recursive: true, force: true });
2800
3348
  }
2801
3349
  } else {
2802
3350
  this.logger.info("[PDFConverter] Output preserved at:", outputDir);
@@ -2814,7 +3362,10 @@ var PDFConverter = class {
2814
3362
  "skipSampling",
2815
3363
  "forcedMethod",
2816
3364
  "aggregator",
2817
- "onTokenUsage"
3365
+ "onTokenUsage",
3366
+ "chunkedConversion",
3367
+ "chunkSize",
3368
+ "chunkMaxRetries"
2818
3369
  ]),
2819
3370
  to_formats: ["json", "html"],
2820
3371
  image_export_mode: "embedded",
@@ -2942,15 +3493,15 @@ var PDFConverter = class {
2942
3493
  "\n[PDFConverter] Task completed, downloading ZIP file..."
2943
3494
  );
2944
3495
  const zipResult = await this.client.getTaskResultFile(taskId);
2945
- const zipPath = (0, import_node_path7.join)(process.cwd(), "result.zip");
3496
+ const zipPath = (0, import_node_path8.join)(process.cwd(), "result.zip");
2946
3497
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
2947
3498
  if (zipResult.fileStream) {
2948
- const writeStream = (0, import_node_fs8.createWriteStream)(zipPath);
2949
- await (0, import_promises5.pipeline)(zipResult.fileStream, writeStream);
3499
+ const writeStream = (0, import_node_fs9.createWriteStream)(zipPath);
3500
+ await (0, import_promises7.pipeline)(zipResult.fileStream, writeStream);
2950
3501
  return;
2951
3502
  }
2952
3503
  if (zipResult.data) {
2953
- await (0, import_promises4.writeFile)(zipPath, zipResult.data);
3504
+ await (0, import_promises6.writeFile)(zipPath, zipResult.data);
2954
3505
  return;
2955
3506
  }
2956
3507
  this.logger.warn(
@@ -2966,7 +3517,7 @@ var PDFConverter = class {
2966
3517
  );
2967
3518
  }
2968
3519
  const buffer = new Uint8Array(await response.arrayBuffer());
2969
- await (0, import_promises4.writeFile)(zipPath, buffer);
3520
+ await (0, import_promises6.writeFile)(zipPath, buffer);
2970
3521
  }
2971
3522
  async processConvertedFiles(zipPath, extractDir, outputDir) {
2972
3523
  await ImageExtractor.extractAndSaveDocumentsFromZip(
@@ -2995,7 +3546,7 @@ var PDFConverter = class {
2995
3546
  );
2996
3547
  const renderer = new PageRenderer(this.logger);
2997
3548
  const renderResult = await renderer.renderPages(pdfPath, outputDir);
2998
- const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
3549
+ const resultPath = (0, import_node_path8.join)(outputDir, "result.json");
2999
3550
  const tmpPath = resultPath + ".tmp";
3000
3551
  const jqProgram = `
3001
3552
  .pages |= with_entries(
@@ -3007,7 +3558,7 @@ var PDFConverter = class {
3007
3558
  )
3008
3559
  `;
3009
3560
  await runJqFileToFile(jqProgram, resultPath, tmpPath);
3010
- await (0, import_promises4.rename)(tmpPath, resultPath);
3561
+ await (0, import_promises6.rename)(tmpPath, resultPath);
3011
3562
  this.logger.info(
3012
3563
  `[PDFConverter] Rendered ${renderResult.pageCount} page images`
3013
3564
  );
@@ -3042,7 +3593,7 @@ var PDFParser = class {
3042
3593
  this.baseUrl = void 0;
3043
3594
  }
3044
3595
  this.timeout = timeout;
3045
- this.venvPath = venvPath || (0, import_node_path8.join)(process.cwd(), ".venv");
3596
+ this.venvPath = venvPath || (0, import_node_path9.join)(process.cwd(), ".venv");
3046
3597
  this.killExistingProcess = killExistingProcess;
3047
3598
  this.enableImagePdfFallback = enableImagePdfFallback;
3048
3599
  }