@heripo/pdf-parser 0.1.11 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2,7 +2,7 @@
2
2
  import { Docling } from "docling-sdk";
3
3
  import { execSync } from "child_process";
4
4
  import { platform } from "os";
5
- import { join as join7 } from "path";
5
+ import { join as join8 } from "path";
6
6
 
7
7
  // src/config/constants.ts
8
8
  var PDF_PARSER = {
@@ -49,6 +49,12 @@ var PAGE_RENDERING = {
49
49
  /** Low-resolution DPI for OCR strategy sampling */
50
50
  SAMPLE_DPI: 150
51
51
  };
52
+ var CHUNKED_CONVERSION = {
53
+ /** Default number of pages per chunk, used when options.chunkSize is not provided */
54
+ DEFAULT_CHUNK_SIZE: 10,
55
+ /** Default maximum number of retries per failed chunk (in addition to the first attempt), used when options.chunkMaxRetries is not provided */
56
+ DEFAULT_MAX_RETRIES: 2
57
+ };
52
58
  var IMAGE_PDF_CONVERTER = {
53
59
  /**
54
60
  * ImageMagick density option (DPI) for PDF to image conversion
@@ -843,10 +849,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
843
849
 
844
850
  // src/core/pdf-converter.ts
845
851
  import { omit } from "es-toolkit";
846
- import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
847
- import { rename as rename2, writeFile } from "fs/promises";
848
- import { join as join6 } from "path";
849
- import { pipeline as pipeline3 } from "stream/promises";
852
+ import { copyFileSync as copyFileSync2, createWriteStream as createWriteStream4, existsSync as existsSync5, rmSync as rmSync4 } from "fs";
853
+ import { rename as rename3, writeFile as writeFile2 } from "fs/promises";
854
+ import { join as join7 } from "path";
855
+ import { pipeline as pipeline4 } from "stream/promises";
850
856
 
851
857
  // src/errors/image-pdf-fallback-error.ts
852
858
  var ImagePdfFallbackError = class extends Error {
@@ -1750,7 +1756,8 @@ var VlmTextCorrector = class {
1750
1756
  },
1751
1757
  {
1752
1758
  type: "image",
1753
- image: `data:image/png;base64,${imageBase64}`
1759
+ image: imageBase64,
1760
+ mediaType: "image/png"
1754
1761
  }
1755
1762
  ]
1756
1763
  }
@@ -1962,7 +1969,7 @@ var VlmTextCorrector = class {
1962
1969
  */
1963
1970
  readPageImage(outputDir, pageNo) {
1964
1971
  const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
1965
- return readFileSync(imagePath).toString("base64");
1972
+ return new Uint8Array(readFileSync(imagePath));
1966
1973
  }
1967
1974
  /**
1968
1975
  * Apply VLM corrections to the DoclingDocument.
@@ -2231,7 +2238,7 @@ var OcrStrategySampler = class {
2231
2238
  this.logger.debug(
2232
2239
  `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
2233
2240
  );
2234
- const base64Image = readFileSync2(pageFile).toString("base64");
2241
+ const imageData = new Uint8Array(readFileSync2(pageFile));
2235
2242
  const messages = [
2236
2243
  {
2237
2244
  role: "user",
@@ -2239,7 +2246,8 @@ var OcrStrategySampler = class {
2239
2246
  { type: "text", text: KOREAN_HANJA_MIX_PROMPT },
2240
2247
  {
2241
2248
  type: "image",
2242
- image: `data:image/png;base64,${base64Image}`
2249
+ image: imageData,
2250
+ mediaType: "image/png"
2243
2251
  }
2244
2252
  ]
2245
2253
  }
@@ -2337,10 +2345,541 @@ var LocalFileServer = class {
2337
2345
  }
2338
2346
  };
2339
2347
 
2348
+ // src/core/chunked-pdf-converter.ts
2349
+ import {
2350
+ copyFileSync,
2351
+ createWriteStream as createWriteStream3,
2352
+ existsSync as existsSync3,
2353
+ mkdirSync as mkdirSync3,
2354
+ readFileSync as readFileSync3,
2355
+ readdirSync as readdirSync3,
2356
+ rmSync as rmSync2,
2357
+ writeFileSync as writeFileSync3
2358
+ } from "fs";
2359
+ import { rename as rename2, writeFile } from "fs/promises";
2360
+ import { join as join5 } from "path";
2361
+ import { pipeline as pipeline3 } from "stream/promises";
2362
+
2363
+ // src/processors/docling-document-merger.ts
2364
/** Matches JSON-pointer style refs into the four collections this merger concatenates. */
var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/;
/** Matches the relative URI of an extracted picture file, capturing its numeric index. */
var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/;
/**
 * Concatenates several DoclingDocument-shaped objects (one per converted chunk)
 * into a single document, rewriting every "#/{collection}/{N}" reference so it
 * still points at the same item after the collections have been appended.
 */
var DoclingDocumentMerger = class {
  /**
   * Merge an array of DoclingDocuments into one.
   * The first chunk supplies every top-level field that is not merged
   * explicitly (texts, pictures, tables, groups, body, furniture, pages).
   *
   * The inputs are never mutated and the result never aliases them: the
   * returned document is always a deep copy, including the single-chunk case.
   *
   * @param chunks - Array of DoclingDocument objects to merge (must have at least 1)
   * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
   *   When picFileOffsets[i] is present it is used for pic_ URI remapping instead of
   *   the pictures array length, aligning URIs with relocated file indices.
   *   A missing entry falls back to the pictures array length.
   * @returns Merged DoclingDocument
   * @throws Error when `chunks` is empty
   */
  merge(chunks, picFileOffsets) {
    if (chunks.length === 0) {
      throw new Error("Cannot merge zero chunks");
    }
    // Ref-list properties to rewrite per item kind. Pictures and tables share
    // the same set; a list that is absent on an item is simply skipped.
    const nodeRefLists = ["children"];
    const floatingRefLists = ["children", "captions", "references", "footnotes"];
    const base = structuredClone(chunks[0]);
    for (let i = 1; i < chunks.length; i++) {
      const chunk = chunks[i];
      // Offsets are the collection sizes *before* this chunk is appended.
      const offsets = {
        texts: base.texts.length,
        pictures: base.pictures.length,
        tables: base.tables.length,
        groups: base.groups.length
      };
      // `??` (not a truthiness test) so a legitimate offset of 0 is honoured and
      // a short/sparse offsets array cannot produce "pic_NaN.png".
      const picFileOffset = picFileOffsets?.[i] ?? offsets.pictures;
      for (const text of chunk.texts) {
        base.texts.push(this.remapItem(text, offsets, nodeRefLists));
      }
      for (const picture of chunk.pictures) {
        const remapped = this.remapItem(picture, offsets, floatingRefLists);
        this.remapPictureImageUri(remapped, picFileOffset);
        base.pictures.push(remapped);
      }
      for (const table of chunk.tables) {
        base.tables.push(this.remapItem(table, offsets, floatingRefLists));
      }
      for (const group of chunk.groups) {
        base.groups.push(this.remapItem(group, offsets, nodeRefLists));
      }
      for (const child of chunk.body.children) {
        base.body.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      for (const child of chunk.furniture.children) {
        base.furniture.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      // Later chunks win on a page-key collision; entries are copied so the
      // merged document does not share page objects with its inputs.
      Object.assign(base.pages, structuredClone(chunk.pages));
    }
    return base;
  }
  /**
   * Deep-copy one item and rewrite its own ref, its parent ref and each of the
   * named ref lists.
   *
   * @param item - Item from a chunk's texts/pictures/tables/groups array
   * @param offsets - Per-collection index offsets (see remapRef)
   * @param refListKeys - Names of properties holding arrays of `{ $ref }` objects
   * @returns The remapped copy; `item` itself is left untouched
   */
  remapItem(item, offsets, refListKeys) {
    const remapped = structuredClone(item);
    remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
    if (remapped.parent) {
      remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
    }
    for (const key of refListKeys) {
      if (Array.isArray(remapped[key])) {
        remapped[key] = remapped[key].map((entry) => ({
          $ref: this.remapRef(entry.$ref, offsets)
        }));
      }
    }
    return remapped;
  }
  /**
   * Remap a $ref string by applying offsets.
   * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
   * Anything else (e.g. "#/body", "#/furniture", or a ref into a collection
   * this merger does not concatenate) passes through unchanged.
   *
   * @param ref - Reference string to rewrite
   * @param offsets - Object with numeric `texts`, `pictures`, `tables`, `groups` offsets
   * @returns The rewritten reference, or `ref` itself when it does not match
   */
  remapRef(ref, offsets) {
    const match = REF_PATTERN.exec(ref);
    if (!match) {
      return ref;
    }
    const kind = match[1];
    const index = parseInt(match[2], 10);
    return `#/${kind}/${index + offsets[kind]}`;
  }
  /**
   * Remap image URI in a picture item by applying the pic file offset.
   * Transforms "images/pic_N.png" → "images/pic_{N+offset}.png".
   * Pictures without an image URI, or with a URI of any other form
   * (for example an embedded data URI), are left as they are.
   *
   * @param picture - Picture item to update in place (pass a copy, not chunk data)
   * @param picFileOffset - Number added to the picture file index
   */
  remapPictureImageUri(picture, picFileOffset) {
    const image = picture.image;
    if (!image?.uri) return;
    const match = IMAGE_URI_PATTERN.exec(image.uri);
    if (match) {
      const index = parseInt(match[1], 10);
      image.uri = `images/pic_${index + picFileOffset}.png`;
    }
  }
};
2491
+
2492
+ // src/core/chunked-pdf-converter.ts
2493
/**
 * Converts a local PDF by splitting it into page ranges ("chunks"), converting
 * each range as a separate Docling task, and merging the per-chunk documents
 * and extracted images into one output directory.
 */
var ChunkedPDFConverter = class {
  /**
   * @param logger - Logger exposing info / warn / error
   * @param client - Docling client used to submit tasks and download results
   * @param config - `{ chunkSize, maxRetries }` for chunking and per-chunk retries
   * @param timeout - Per-chunk task timeout in milliseconds
   */
  constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
    this.logger = logger;
    this.client = client;
    this.config = config;
    this.timeout = timeout;
  }
  /**
   * Convert a local PDF in chunks.
   *
   * Once conversion has started, the temporary `_chunks` directory is removed
   * on every exit path (success, failure or abort), and the output directory
   * is removed as well when `cleanupAfterCallback` is set.
   *
   * @param url - file:// URL to the source PDF
   * @param reportId - Unique report identifier for output directory naming
   * @param onComplete - Callback invoked with the final output directory
   * @param cleanupAfterCallback - Whether to clean up the output directory after callback
   * @param options - PDF conversion options (passed to buildConversionOptions per chunk)
   * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
   * @param abortSignal - Optional abort signal for cancellation
   * @returns Always null
   * @throws Error when the page count cannot be detected, a chunk fails after
   *   all retries, or the signal is aborted (error name "AbortError")
   */
  async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
    // Strip the leading "file://".
    // NOTE(review): this does not percent-decode the path; left unchanged so
    // the set of URLs callers may pass stays the same.
    const pdfPath = url.slice(7);
    const outputDir = join5(process.cwd(), "output", reportId);
    const chunksBaseDir = join5(outputDir, "_chunks");
    const totalPages = await this.getPageCount(pdfPath);
    if (totalPages === 0) {
      throw new Error(
        "[ChunkedPDFConverter] Failed to detect page count from PDF"
      );
    }
    const chunks = this.calculateChunks(totalPages);
    this.logger.info(
      `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
    );
    try {
      const server = new LocalFileServer();
      const httpUrl = await server.start(pdfPath);
      this.logger.info(
        "[ChunkedPDFConverter] Started local file server:",
        httpUrl
      );
      const chunkDocuments = [];
      try {
        for (let i = 0; i < chunks.length; i++) {
          this.checkAbort(abortSignal);
          const [start, end] = chunks[i];
          const chunkDir = join5(chunksBaseDir, `_chunk_${i}`);
          mkdirSync3(chunkDir, { recursive: true });
          const doc = await this.convertChunk(
            i,
            chunks.length,
            start,
            end,
            httpUrl,
            chunkDir,
            options,
            buildConversionOptions,
            abortSignal
          );
          chunkDocuments.push(doc);
        }
      } finally {
        this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
        await server.stop();
      }
      this.checkAbort(abortSignal);
      this.logger.info(
        `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
      );
      const merger = new DoclingDocumentMerger();
      // Offsets must be computed from the same files relocateImages will copy.
      const picFileOffsets = this.buildPicFileOffsets(
        chunksBaseDir,
        chunks.length
      );
      const merged = merger.merge(chunkDocuments, picFileOffsets);
      this.logger.info(
        `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
      );
      mkdirSync3(outputDir, { recursive: true });
      const imagesDir = join5(outputDir, "images");
      mkdirSync3(imagesDir, { recursive: true });
      this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
      const resultPath = join5(outputDir, "result.json");
      writeFileSync3(resultPath, JSON.stringify(merged));
      await this.renderPageImages(pdfPath, outputDir);
      this.cleanupOrphanedPicFiles(resultPath, imagesDir);
      this.checkAbort(abortSignal);
      this.logger.info(
        "[ChunkedPDFConverter] Executing completion callback..."
      );
      await onComplete(outputDir);
    } finally {
      if (existsSync3(chunksBaseDir)) {
        rmSync2(chunksBaseDir, { recursive: true, force: true });
      }
      if (cleanupAfterCallback) {
        this.logger.info(
          "[ChunkedPDFConverter] Cleaning up output directory:",
          outputDir
        );
        if (existsSync3(outputDir)) {
          rmSync2(outputDir, { recursive: true, force: true });
        }
      } else {
        this.logger.info(
          "[ChunkedPDFConverter] Output preserved at:",
          outputDir
        );
      }
    }
    return null;
  }
  /**
   * Convert a single chunk with retry logic.
   *
   * Each attempt submits a task for pages startPage..endPage, waits for it,
   * downloads the ZIP into `chunkDir`, extracts it and loads the chunk's
   * result.json. Artifacts of a failed attempt are removed before retrying.
   * An abort is never retried.
   *
   * @param chunkIndex - Zero-based chunk index (used for logging)
   * @param totalChunks - Total number of chunks (used for logging)
   * @param startPage - First page of the range
   * @param endPage - Last page of the range
   * @param httpUrl - URL the Docling server fetches the PDF from
   * @param chunkDir - Existing working directory for this chunk
   * @param options - PDF conversion options
   * @param buildConversionOptions - Builds Docling options from `options`
   * @param abortSignal - Optional abort signal, checked before every attempt and while polling
   * @returns The parsed document for this chunk
   * @throws The last attempt's error once retries are exhausted, or an "AbortError"
   */
  async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions, abortSignal) {
    const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
    const maxRetries = this.resolveMaxRetries();
    const zipPath = join5(chunkDir, "result.zip");
    const extractDir = join5(chunkDir, "extracted");
    const chunkOutputDir = join5(chunkDir, "output");
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      // Outside the try so an abort is never swallowed by the retry handler.
      this.checkAbort(abortSignal);
      try {
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${maxRetries})...`
          );
          // Drop whatever the failed attempt left behind so stale files cannot
          // be counted or copied later.
          for (const stale of [zipPath, extractDir, chunkOutputDir]) {
            rmSync2(stale, { recursive: true, force: true });
          }
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: converting...`
          );
        }
        const startTime = Date.now();
        const conversionOptions = buildConversionOptions({
          ...options,
          page_range: [startPage, endPage]
        });
        const task = await this.client.convertSourceAsync({
          sources: [{ kind: "http", url: httpUrl }],
          options: conversionOptions,
          target: { kind: "zip" }
        });
        await this.trackTaskProgress(task, abortSignal);
        await this.downloadResult(task.taskId, zipPath);
        await ImageExtractor.extractAndSaveDocumentsFromZip(
          this.logger,
          zipPath,
          extractDir,
          chunkOutputDir
        );
        const resultJsonPath = join5(chunkOutputDir, "result.json");
        const doc = await runJqFileJson(".", resultJsonPath);
        // Only the chunk's output directory is needed from here on.
        if (existsSync3(zipPath)) rmSync2(zipPath, { force: true });
        if (existsSync3(extractDir)) {
          rmSync2(extractDir, { recursive: true, force: true });
        }
        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
          );
        }
        return doc;
      } catch (error) {
        if (error?.name === "AbortError") {
          throw error;
        }
        if (attempt >= maxRetries) {
          this.logger.error(
            `[ChunkedPDFConverter] ${chunkLabel}: failed after ${maxRetries} retries`
          );
          throw error;
        }
        // Pass the error along so intermediate failures are diagnosable.
        this.logger.warn(
          `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${maxRetries})...`,
          error
        );
      }
    }
    throw new Error("Unreachable");
  }
  /**
   * Normalize config.maxRetries to a non-negative whole number so the retry
   * loop always makes at least one attempt.
   *
   * @returns 0 for NaN / negative / non-numeric values, otherwise the value rounded down
   */
  resolveMaxRetries() {
    const configured = Number(this.config.maxRetries);
    if (Number.isNaN(configured) || configured < 0) {
      return 0;
    }
    return Math.floor(configured);
  }
  /**
   * Calculate page ranges for chunks.
   *
   * @param totalPages - Number of pages in the PDF
   * @returns Array of inclusive, 1-based `[start, end]` pairs covering 1..totalPages
   * @throws Error when config.chunkSize is not a positive integer
   */
  calculateChunks(totalPages) {
    const { chunkSize } = this.config;
    if (chunkSize <= 0) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
    }
    // Fractional, NaN or non-numeric sizes would yield ranges such as [1, NaN].
    if (!Number.isInteger(chunkSize)) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be an integer");
    }
    const ranges = [];
    for (let start = 1; start <= totalPages; start += chunkSize) {
      const end = Math.min(start + chunkSize - 1, totalPages);
      ranges.push([start, end]);
    }
    return ranges;
  }
  /**
   * Get total page count using pdfinfo.
   *
   * @param pdfPath - Local path of the PDF
   * @returns The page count, or 0 when pdfinfo exits non-zero or prints no "Pages:" line
   */
  async getPageCount(pdfPath) {
    const result = await spawnAsync("pdfinfo", [pdfPath]);
    if (result.code !== 0) {
      return 0;
    }
    const match = result.stdout.match(/^Pages:\s+(\d+)/m);
    return match ? parseInt(match[1], 10) : 0;
  }
  /**
   * Poll task progress until completion.
   *
   * @param task - Task handle exposing poll() and getResult()
   * @param abortSignal - Optional abort signal, checked before every poll
   * @throws Error on timeout or task failure; "AbortError" when aborted
   */
  async trackTaskProgress(task, abortSignal) {
    const startTime = Date.now();
    while (true) {
      this.checkAbort(abortSignal);
      if (Date.now() - startTime > this.timeout) {
        throw new Error("[ChunkedPDFConverter] Chunk task timeout");
      }
      const status = await task.poll();
      if (status.task_status === "success") return;
      if (status.task_status === "failure") {
        let details = "unknown";
        try {
          const result = await task.getResult();
          if (result.errors?.length) {
            details = result.errors.map((e) => e.message).join("; ");
          }
        } catch {
          // Best effort: the failure is reported below even without details.
        }
        throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
      }
      await new Promise(
        (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
      );
    }
  }
  /**
   * Download ZIP result for a task.
   *
   * Tries, in order: the stream returned by the client, the in-memory data
   * returned by the client, and finally a direct HTTP request to the server.
   *
   * @param taskId - Identifier of the finished task
   * @param zipPath - Destination file path
   * @throws Error when the direct HTTP download responds with a non-OK status
   */
  async downloadResult(taskId, zipPath) {
    const zipResult = await this.client.getTaskResultFile(taskId);
    if (zipResult.fileStream) {
      const writeStream = createWriteStream3(zipPath);
      await pipeline3(zipResult.fileStream, writeStream);
      return;
    }
    if (zipResult.data) {
      await writeFile(zipPath, zipResult.data);
      return;
    }
    const baseUrl = this.client.getConfig().baseUrl;
    const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
      headers: { Accept: "application/zip" }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
      );
    }
    const buffer = new Uint8Array(await response.arrayBuffer());
    await writeFile(zipPath, buffer);
  }
  /**
   * List the `{prefix}{N}.png` files of a directory in ascending numeric order.
   * Names whose index part is not purely numeric are ignored, which keeps the
   * ordering well defined and identical for counting and for copying.
   *
   * @param dir - Directory to scan; a missing directory yields an empty list
   * @param prefix - File name prefix, e.g. "pic_" or "image_"
   * @returns Sorted file names
   */
  listIndexedPngs(dir, prefix) {
    if (!existsSync3(dir)) {
      return [];
    }
    const pattern = new RegExp(`^${prefix}(\\d+)\\.png$`);
    const indexOf = (name) => parseInt(pattern.exec(name)[1], 10);
    return readdirSync3(dir).filter((name) => pattern.test(name)).sort((a, b) => indexOf(a) - indexOf(b));
  }
  /**
   * Copy every `{prefix}{N}.png` from all chunk image directories into
   * `imagesDir`, renumbering them consecutively from 0 in chunk order.
   *
   * @returns Number of files copied
   */
  relocateByPrefix(chunksBaseDir, totalChunks, imagesDir, prefix) {
    let globalIndex = 0;
    for (let i = 0; i < totalChunks; i++) {
      const chunkImagesDir = join5(
        chunksBaseDir,
        `_chunk_${i}`,
        "output",
        "images"
      );
      for (const file of this.listIndexedPngs(chunkImagesDir, prefix)) {
        copyFileSync(
          join5(chunkImagesDir, file),
          join5(imagesDir, `${prefix}${globalIndex}.png`)
        );
        globalIndex++;
      }
    }
    return globalIndex;
  }
  /**
   * Relocate images from chunk output directories to the final images directory
   * with global indexing. pic_ and image_ files are numbered independently.
   *
   * @param chunksBaseDir - Directory containing the `_chunk_{i}` directories
   * @param totalChunks - Number of chunks
   * @param imagesDir - Existing destination directory
   */
  relocateImages(chunksBaseDir, totalChunks, imagesDir) {
    const picGlobalIndex = this.relocateByPrefix(
      chunksBaseDir,
      totalChunks,
      imagesDir,
      "pic_"
    );
    const imageGlobalIndex = this.relocateByPrefix(
      chunksBaseDir,
      totalChunks,
      imagesDir,
      "image_"
    );
    this.logger.info(
      `[ChunkedPDFConverter] Relocated ${picGlobalIndex} pic + ${imageGlobalIndex} image files to ${imagesDir}`
    );
  }
  /**
   * Render page images from PDF using ImageMagick and update result.json.
   *
   * Every page entry whose zero-based index is below the rendered page count
   * gets its image uri / mimetype / dpi pointed at the rendered file. The
   * rewritten JSON goes to a temp file first and is then renamed over
   * result.json.
   *
   * @param pdfPath - Local path of the source PDF
   * @param outputDir - Output directory containing result.json
   */
  async renderPageImages(pdfPath, outputDir) {
    this.logger.info(
      "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
    );
    const renderer = new PageRenderer(this.logger);
    const renderResult = await renderer.renderPages(pdfPath, outputDir);
    const resultPath = join5(outputDir, "result.json");
    const tmpPath = resultPath + ".tmp";
    const jqProgram = `
      .pages |= with_entries(
        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
          .value.image.mimetype = "image/png" |
          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
        else . end
      )
    `;
    await runJqFileToFile(jqProgram, resultPath, tmpPath);
    await rename2(tmpPath, resultPath);
    this.logger.info(
      `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
    );
  }
  /**
   * Remove pic_ files from images directory that are not referenced in result.json.
   * Chunked Docling conversion embeds page images as base64 in JSON, which get
   * extracted as pic_ files. After renderPageImages replaces page URIs with
   * pages/page_N.png, these pic_ files become orphaned.
   *
   * @param resultPath - Path of result.json, scanned as text for "images/pic_N.png"
   * @param imagesDir - Directory whose unreferenced pic_*.png files are deleted
   */
  cleanupOrphanedPicFiles(resultPath, imagesDir) {
    const content = readFileSync3(resultPath, "utf-8");
    const referencedPics = /* @__PURE__ */ new Set();
    for (const match of content.matchAll(/images\/pic_\d+\.png/g)) {
      referencedPics.add(match[0].replace("images/", ""));
    }
    const picFiles = readdirSync3(imagesDir).filter(
      (f) => f.startsWith("pic_") && f.endsWith(".png")
    );
    let removedCount = 0;
    for (const file of picFiles) {
      if (!referencedPics.has(file)) {
        rmSync2(join5(imagesDir, file), { force: true });
        removedCount++;
      }
    }
    if (removedCount > 0) {
      this.logger.info(
        `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
      );
    }
  }
  /**
   * Build cumulative pic_ file offsets per chunk for correct URI remapping.
   * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
   *
   * @param chunksBaseDir - Directory containing the `_chunk_{i}` directories
   * @param totalChunks - Number of chunks
   * @returns Array of length `totalChunks`
   */
  buildPicFileOffsets(chunksBaseDir, totalChunks) {
    const offsets = [];
    let cumulative = 0;
    for (let i = 0; i < totalChunks; i++) {
      offsets.push(cumulative);
      const dir = join5(chunksBaseDir, `_chunk_${i}`, "output", "images");
      cumulative += this.listIndexedPngs(dir, "pic_").length;
    }
    return offsets;
  }
  /**
   * Check if abort has been signalled and throw if so.
   *
   * @param signal - Optional abort signal
   * @throws Error named "AbortError" when `signal.aborted` is truthy
   */
  checkAbort(signal) {
    if (signal?.aborted) {
      const error = new Error("Chunked PDF conversion was aborted");
      error.name = "AbortError";
      throw error;
    }
  }
};
2878
+
2340
2879
  // src/core/image-pdf-converter.ts
2341
- import { existsSync as existsSync3, rmSync as rmSync2 } from "fs";
2880
+ import { existsSync as existsSync4, rmSync as rmSync3 } from "fs";
2342
2881
  import { tmpdir } from "os";
2343
- import { join as join5 } from "path";
2882
+ import { join as join6 } from "path";
2344
2883
  var ImagePdfConverter = class {
2345
2884
  constructor(logger) {
2346
2885
  this.logger = logger;
@@ -2356,8 +2895,8 @@ var ImagePdfConverter = class {
2356
2895
  async convert(pdfUrl, reportId) {
2357
2896
  const timestamp = Date.now();
2358
2897
  const tempDir = tmpdir();
2359
- const inputPath = join5(tempDir, `${reportId}-${timestamp}-input.pdf`);
2360
- const outputPath = join5(tempDir, `${reportId}-${timestamp}-image.pdf`);
2898
+ const inputPath = join6(tempDir, `${reportId}-${timestamp}-input.pdf`);
2899
+ const outputPath = join6(tempDir, `${reportId}-${timestamp}-image.pdf`);
2361
2900
  try {
2362
2901
  this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
2363
2902
  await this.downloadPdf(pdfUrl, inputPath);
@@ -2366,8 +2905,8 @@ var ImagePdfConverter = class {
2366
2905
  this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
2367
2906
  return outputPath;
2368
2907
  } finally {
2369
- if (existsSync3(inputPath)) {
2370
- rmSync2(inputPath, { force: true });
2908
+ if (existsSync4(inputPath)) {
2909
+ rmSync3(inputPath, { force: true });
2371
2910
  }
2372
2911
  }
2373
2912
  }
@@ -2414,12 +2953,12 @@ var ImagePdfConverter = class {
2414
2953
  * Cleanup the temporary image PDF file
2415
2954
  */
2416
2955
  cleanup(imagePdfPath) {
2417
- if (existsSync3(imagePdfPath)) {
2956
+ if (existsSync4(imagePdfPath)) {
2418
2957
  this.logger.info(
2419
2958
  "[ImagePdfConverter] Cleaning up temp file:",
2420
2959
  imagePdfPath
2421
2960
  );
2422
- rmSync2(imagePdfPath, { force: true });
2961
+ rmSync3(imagePdfPath, { force: true });
2423
2962
  }
2424
2963
  }
2425
2964
  };
@@ -2434,6 +2973,26 @@ var PDFConverter = class {
2434
2973
  }
2435
2974
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
2436
2975
  this.logger.info("[PDFConverter] Converting:", url);
2976
+ if (options.chunkedConversion && url.startsWith("file://")) {
2977
+ const chunked = new ChunkedPDFConverter(
2978
+ this.logger,
2979
+ this.client,
2980
+ {
2981
+ chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
2982
+ maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
2983
+ },
2984
+ this.timeout
2985
+ );
2986
+ return chunked.convertChunked(
2987
+ url,
2988
+ reportId,
2989
+ onComplete,
2990
+ cleanupAfterCallback,
2991
+ options,
2992
+ (opts) => this.buildConversionOptions(opts),
2993
+ abortSignal
2994
+ );
2995
+ }
2437
2996
  if (options.forceImagePdf) {
2438
2997
  return this.convertViaImagePdf(
2439
2998
  url,
@@ -2538,7 +3097,7 @@ var PDFConverter = class {
2538
3097
  const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
2539
3098
  return { method, reason, sampledPages: 0, totalPages: 0 };
2540
3099
  }
2541
- const samplingDir = join6(process.cwd(), "output", reportId, "_sampling");
3100
+ const samplingDir = join7(process.cwd(), "output", reportId, "_sampling");
2542
3101
  const sampler = new OcrStrategySampler(
2543
3102
  this.logger,
2544
3103
  new PageRenderer(this.logger),
@@ -2563,8 +3122,8 @@ var PDFConverter = class {
2563
3122
  }
2564
3123
  return strategy;
2565
3124
  } finally {
2566
- if (existsSync4(samplingDir)) {
2567
- rmSync3(samplingDir, { recursive: true, force: true });
3125
+ if (existsSync5(samplingDir)) {
3126
+ rmSync4(samplingDir, { recursive: true, force: true });
2568
3127
  }
2569
3128
  }
2570
3129
  }
@@ -2585,7 +3144,7 @@ var PDFConverter = class {
2585
3144
  const wrappedCallback = async (outputDir) => {
2586
3145
  let pageTexts;
2587
3146
  try {
2588
- const resultPath2 = join6(outputDir, "result.json");
3147
+ const resultPath2 = join7(outputDir, "result.json");
2589
3148
  const totalPages = await runJqFileJson(
2590
3149
  ".pages | length",
2591
3150
  resultPath2
@@ -2597,9 +3156,9 @@ var PDFConverter = class {
2597
3156
  "[PDFConverter] pdftotext extraction failed, proceeding without text reference"
2598
3157
  );
2599
3158
  }
2600
- const resultPath = join6(outputDir, "result.json");
2601
- const ocrOriginPath = join6(outputDir, "result_ocr_origin.json");
2602
- copyFileSync(resultPath, ocrOriginPath);
3159
+ const resultPath = join7(outputDir, "result.json");
3160
+ const ocrOriginPath = join7(outputDir, "result_ocr_origin.json");
3161
+ copyFileSync2(resultPath, ocrOriginPath);
2603
3162
  const corrector = new VlmTextCorrector(this.logger);
2604
3163
  await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
2605
3164
  concurrency: options.vlmConcurrency,
@@ -2741,9 +3300,9 @@ var PDFConverter = class {
2741
3300
  }
2742
3301
  }
2743
3302
  const cwd = process.cwd();
2744
- const zipPath = join6(cwd, "result.zip");
2745
- const extractDir = join6(cwd, "result_extracted");
2746
- const outputDir = join6(cwd, "output", reportId);
3303
+ const zipPath = join7(cwd, "result.zip");
3304
+ const extractDir = join7(cwd, "result_extracted");
3305
+ const outputDir = join7(cwd, "output", reportId);
2747
3306
  try {
2748
3307
  await this.processConvertedFiles(zipPath, extractDir, outputDir);
2749
3308
  await this.renderPageImages(url, outputDir);
@@ -2760,19 +3319,19 @@ var PDFConverter = class {
2760
3319
  this.logger.info("[PDFConverter] Total time:", duration, "ms");
2761
3320
  } finally {
2762
3321
  this.logger.info("[PDFConverter] Cleaning up temporary files...");
2763
- if (existsSync4(zipPath)) {
2764
- rmSync3(zipPath, { force: true });
3322
+ if (existsSync5(zipPath)) {
3323
+ rmSync4(zipPath, { force: true });
2765
3324
  }
2766
- if (existsSync4(extractDir)) {
2767
- rmSync3(extractDir, { recursive: true, force: true });
3325
+ if (existsSync5(extractDir)) {
3326
+ rmSync4(extractDir, { recursive: true, force: true });
2768
3327
  }
2769
3328
  if (cleanupAfterCallback) {
2770
3329
  this.logger.info(
2771
3330
  "[PDFConverter] Cleaning up output directory:",
2772
3331
  outputDir
2773
3332
  );
2774
- if (existsSync4(outputDir)) {
2775
- rmSync3(outputDir, { recursive: true, force: true });
3333
+ if (existsSync5(outputDir)) {
3334
+ rmSync4(outputDir, { recursive: true, force: true });
2776
3335
  }
2777
3336
  } else {
2778
3337
  this.logger.info("[PDFConverter] Output preserved at:", outputDir);
@@ -2790,7 +3349,10 @@ var PDFConverter = class {
2790
3349
  "skipSampling",
2791
3350
  "forcedMethod",
2792
3351
  "aggregator",
2793
- "onTokenUsage"
3352
+ "onTokenUsage",
3353
+ "chunkedConversion",
3354
+ "chunkSize",
3355
+ "chunkMaxRetries"
2794
3356
  ]),
2795
3357
  to_formats: ["json", "html"],
2796
3358
  image_export_mode: "embedded",
@@ -2918,15 +3480,15 @@ var PDFConverter = class {
2918
3480
  "\n[PDFConverter] Task completed, downloading ZIP file..."
2919
3481
  );
2920
3482
  const zipResult = await this.client.getTaskResultFile(taskId);
2921
- const zipPath = join6(process.cwd(), "result.zip");
3483
+ const zipPath = join7(process.cwd(), "result.zip");
2922
3484
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
2923
3485
  if (zipResult.fileStream) {
2924
- const writeStream = createWriteStream3(zipPath);
2925
- await pipeline3(zipResult.fileStream, writeStream);
3486
+ const writeStream = createWriteStream4(zipPath);
3487
+ await pipeline4(zipResult.fileStream, writeStream);
2926
3488
  return;
2927
3489
  }
2928
3490
  if (zipResult.data) {
2929
- await writeFile(zipPath, zipResult.data);
3491
+ await writeFile2(zipPath, zipResult.data);
2930
3492
  return;
2931
3493
  }
2932
3494
  this.logger.warn(
@@ -2942,7 +3504,7 @@ var PDFConverter = class {
2942
3504
  );
2943
3505
  }
2944
3506
  const buffer = new Uint8Array(await response.arrayBuffer());
2945
- await writeFile(zipPath, buffer);
3507
+ await writeFile2(zipPath, buffer);
2946
3508
  }
2947
3509
  async processConvertedFiles(zipPath, extractDir, outputDir) {
2948
3510
  await ImageExtractor.extractAndSaveDocumentsFromZip(
@@ -2971,7 +3533,7 @@ var PDFConverter = class {
2971
3533
  );
2972
3534
  const renderer = new PageRenderer(this.logger);
2973
3535
  const renderResult = await renderer.renderPages(pdfPath, outputDir);
2974
- const resultPath = join6(outputDir, "result.json");
3536
+ const resultPath = join7(outputDir, "result.json");
2975
3537
  const tmpPath = resultPath + ".tmp";
2976
3538
  const jqProgram = `
2977
3539
  .pages |= with_entries(
@@ -2983,7 +3545,7 @@ var PDFConverter = class {
2983
3545
  )
2984
3546
  `;
2985
3547
  await runJqFileToFile(jqProgram, resultPath, tmpPath);
2986
- await rename2(tmpPath, resultPath);
3548
+ await rename3(tmpPath, resultPath);
2987
3549
  this.logger.info(
2988
3550
  `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2989
3551
  );
@@ -3018,7 +3580,7 @@ var PDFParser = class {
3018
3580
  this.baseUrl = void 0;
3019
3581
  }
3020
3582
  this.timeout = timeout;
3021
- this.venvPath = venvPath || join7(process.cwd(), ".venv");
3583
+ this.venvPath = venvPath || join8(process.cwd(), ".venv");
3022
3584
  this.killExistingProcess = killExistingProcess;
3023
3585
  this.enableImagePdfFallback = enableImagePdfFallback;
3024
3586
  }