@heripo/pdf-parser 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2,7 +2,7 @@
2
2
  import { Docling } from "docling-sdk";
3
3
  import { execSync } from "child_process";
4
4
  import { platform } from "os";
5
- import { join as join7 } from "path";
5
+ import { join as join8 } from "path";
6
6
 
7
7
  // src/config/constants.ts
8
8
  var PDF_PARSER = {
@@ -49,6 +49,12 @@ var PAGE_RENDERING = {
49
49
  /** Low-resolution DPI for OCR strategy sampling */
50
50
  SAMPLE_DPI: 150
51
51
  };
52
/** Fallback settings applied when chunked conversion options are not supplied by the caller. */
var CHUNKED_CONVERSION = {
  /** How many PDF pages are sent to the converter in a single chunk */
  DEFAULT_CHUNK_SIZE: 10,
  /** How many times a failed chunk is re-attempted before the error is propagated */
  DEFAULT_MAX_RETRIES: 2
};
52
58
  var IMAGE_PDF_CONVERTER = {
53
59
  /**
54
60
  * ImageMagick density option (DPI) for PDF to image conversion
@@ -843,10 +849,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
843
849
 
844
850
  // src/core/pdf-converter.ts
845
851
  import { omit } from "es-toolkit";
846
- import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
847
- import { rename as rename2, writeFile } from "fs/promises";
848
- import { join as join6 } from "path";
849
- import { pipeline as pipeline3 } from "stream/promises";
852
+ import { copyFileSync as copyFileSync2, createWriteStream as createWriteStream4, existsSync as existsSync5, rmSync as rmSync4 } from "fs";
853
+ import { rename as rename3, writeFile as writeFile2 } from "fs/promises";
854
+ import { join as join7 } from "path";
855
+ import { pipeline as pipeline4 } from "stream/promises";
850
856
 
851
857
  // src/errors/image-pdf-fallback-error.ts
852
858
  var ImagePdfFallbackError = class extends Error {
@@ -2337,10 +2343,541 @@ var LocalFileServer = class {
2337
2343
  }
2338
2344
  };
2339
2345
 
2346
+ // src/core/chunked-pdf-converter.ts
2347
+ import {
2348
+ copyFileSync,
2349
+ createWriteStream as createWriteStream3,
2350
+ existsSync as existsSync3,
2351
+ mkdirSync as mkdirSync3,
2352
+ readFileSync as readFileSync3,
2353
+ readdirSync as readdirSync3,
2354
+ rmSync as rmSync2,
2355
+ writeFileSync as writeFileSync3
2356
+ } from "fs";
2357
+ import { rename as rename2, writeFile } from "fs/promises";
2358
+ import { join as join5 } from "path";
2359
+ import { pipeline as pipeline3 } from "stream/promises";
2360
+
2361
+ // src/processors/docling-document-merger.ts
2362
+ var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/;
2363
+ var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/;
2364
+ var DoclingDocumentMerger = class {
2365
+ /**
2366
+ * Merge an array of DoclingDocuments into one.
2367
+ * The first chunk's metadata (schema_name, version, name, origin) is used as the base.
2368
+ *
2369
+ * @param chunks - Array of DoclingDocument objects to merge (must have at least 1)
2370
+ * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
2371
+ * When provided, picFileOffsets[i] is used for pic_ URI remapping instead of
2372
+ * the pictures array length, aligning URIs with relocated file indices.
2373
+ * @returns Merged DoclingDocument
2374
+ */
2375
+ merge(chunks, picFileOffsets) {
2376
+ if (chunks.length === 0) {
2377
+ throw new Error("Cannot merge zero chunks");
2378
+ }
2379
+ if (chunks.length === 1) {
2380
+ return chunks[0];
2381
+ }
2382
+ const base = structuredClone(chunks[0]);
2383
+ for (let i = 1; i < chunks.length; i++) {
2384
+ const chunk = chunks[i];
2385
+ const offsets = {
2386
+ texts: base.texts.length,
2387
+ pictures: base.pictures.length,
2388
+ tables: base.tables.length,
2389
+ groups: base.groups.length
2390
+ };
2391
+ const picFileOffset = picFileOffsets ? picFileOffsets[i] : offsets.pictures;
2392
+ for (const text of chunk.texts) {
2393
+ const remapped = structuredClone(text);
2394
+ remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
2395
+ if (remapped.parent) {
2396
+ remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
2397
+ }
2398
+ remapped.children = remapped.children.map((c) => ({
2399
+ $ref: this.remapRef(c.$ref, offsets)
2400
+ }));
2401
+ base.texts.push(remapped);
2402
+ }
2403
+ for (const picture of chunk.pictures) {
2404
+ const remapped = structuredClone(picture);
2405
+ remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
2406
+ if (remapped.parent) {
2407
+ remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
2408
+ }
2409
+ remapped.children = remapped.children.map((c) => ({
2410
+ $ref: this.remapRef(c.$ref, offsets)
2411
+ }));
2412
+ remapped.captions = remapped.captions.map((c) => ({
2413
+ $ref: this.remapRef(c.$ref, offsets)
2414
+ }));
2415
+ this.remapPictureImageUri(remapped, picFileOffset);
2416
+ base.pictures.push(remapped);
2417
+ }
2418
+ for (const table of chunk.tables) {
2419
+ const remapped = structuredClone(table);
2420
+ remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
2421
+ if (remapped.parent) {
2422
+ remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
2423
+ }
2424
+ remapped.children = remapped.children.map((c) => ({
2425
+ $ref: this.remapRef(c.$ref, offsets)
2426
+ }));
2427
+ remapped.captions = remapped.captions.map((c) => ({
2428
+ $ref: this.remapRef(c.$ref, offsets)
2429
+ }));
2430
+ remapped.footnotes = remapped.footnotes.map((f) => ({
2431
+ $ref: this.remapRef(f.$ref, offsets)
2432
+ }));
2433
+ base.tables.push(remapped);
2434
+ }
2435
+ for (const group of chunk.groups) {
2436
+ const remapped = structuredClone(group);
2437
+ remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
2438
+ if (remapped.parent) {
2439
+ remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
2440
+ }
2441
+ remapped.children = remapped.children.map((c) => ({
2442
+ $ref: this.remapRef(c.$ref, offsets)
2443
+ }));
2444
+ base.groups.push(remapped);
2445
+ }
2446
+ for (const child of chunk.body.children) {
2447
+ base.body.children.push({
2448
+ $ref: this.remapRef(child.$ref, offsets)
2449
+ });
2450
+ }
2451
+ for (const child of chunk.furniture.children) {
2452
+ base.furniture.children.push({
2453
+ $ref: this.remapRef(child.$ref, offsets)
2454
+ });
2455
+ }
2456
+ Object.assign(base.pages, chunk.pages);
2457
+ }
2458
+ return base;
2459
+ }
2460
+ /**
2461
+ * Remap a $ref string by applying offsets.
2462
+ * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
2463
+ * Refs like "#/body" or "#/furniture" pass through unchanged.
2464
+ */
2465
+ remapRef(ref, offsets) {
2466
+ const match = REF_PATTERN.exec(ref);
2467
+ if (!match) {
2468
+ return ref;
2469
+ }
2470
+ const kind = match[1];
2471
+ const index = parseInt(match[2], 10);
2472
+ return `#/${kind}/${index + offsets[kind]}`;
2473
+ }
2474
+ /**
2475
+ * Remap image URI in a picture item by applying the pic file offset.
2476
+ * Transforms "images/pic_N.png" → "images/pic_{N+offset}.png"
2477
+ */
2478
+ remapPictureImageUri(picture, picFileOffset) {
2479
+ const rec = picture;
2480
+ const image = rec.image;
2481
+ if (!image?.uri) return;
2482
+ const match = IMAGE_URI_PATTERN.exec(image.uri);
2483
+ if (match) {
2484
+ const index = parseInt(match[1], 10);
2485
+ image.uri = `images/pic_${index + picFileOffset}.png`;
2486
+ }
2487
+ }
2488
+ };
2489
+
2490
+ // src/core/chunked-pdf-converter.ts
2491
+ var ChunkedPDFConverter = class {
2492
+ constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
2493
+ this.logger = logger;
2494
+ this.client = client;
2495
+ this.config = config;
2496
+ this.timeout = timeout;
2497
+ }
2498
+ /**
2499
+ * Convert a local PDF in chunks.
2500
+ *
2501
+ * @param url - file:// URL to the source PDF
2502
+ * @param reportId - Unique report identifier for output directory naming
2503
+ * @param onComplete - Callback invoked with the final output directory
2504
+ * @param cleanupAfterCallback - Whether to clean up the output directory after callback
2505
+ * @param options - PDF conversion options (chunked-specific fields are stripped internally)
2506
+ * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
2507
+ * @param abortSignal - Optional abort signal for cancellation
2508
+ */
2509
+ async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
2510
+ const pdfPath = url.slice(7);
2511
+ const cwd = process.cwd();
2512
+ const outputDir = join5(cwd, "output", reportId);
2513
+ const chunksBaseDir = join5(cwd, "output", reportId, "_chunks");
2514
+ const totalPages = await this.getPageCount(pdfPath);
2515
+ if (totalPages === 0) {
2516
+ throw new Error(
2517
+ "[ChunkedPDFConverter] Failed to detect page count from PDF"
2518
+ );
2519
+ }
2520
+ const chunks = this.calculateChunks(totalPages);
2521
+ this.logger.info(
2522
+ `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
2523
+ );
2524
+ const server = new LocalFileServer();
2525
+ const httpUrl = await server.start(pdfPath);
2526
+ this.logger.info(
2527
+ "[ChunkedPDFConverter] Started local file server:",
2528
+ httpUrl
2529
+ );
2530
+ const chunkDocuments = [];
2531
+ try {
2532
+ for (let i = 0; i < chunks.length; i++) {
2533
+ this.checkAbort(abortSignal);
2534
+ const [start, end] = chunks[i];
2535
+ const chunkDir = join5(chunksBaseDir, `_chunk_${i}`);
2536
+ mkdirSync3(chunkDir, { recursive: true });
2537
+ const doc = await this.convertChunk(
2538
+ i,
2539
+ chunks.length,
2540
+ start,
2541
+ end,
2542
+ httpUrl,
2543
+ chunkDir,
2544
+ options,
2545
+ buildConversionOptions
2546
+ );
2547
+ chunkDocuments.push(doc);
2548
+ }
2549
+ } finally {
2550
+ this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
2551
+ await server.stop();
2552
+ }
2553
+ this.checkAbort(abortSignal);
2554
+ this.logger.info(
2555
+ `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
2556
+ );
2557
+ const merger = new DoclingDocumentMerger();
2558
+ const picFileOffsets = this.buildPicFileOffsets(
2559
+ chunksBaseDir,
2560
+ chunks.length
2561
+ );
2562
+ const merged = merger.merge(chunkDocuments, picFileOffsets);
2563
+ this.logger.info(
2564
+ `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
2565
+ );
2566
+ mkdirSync3(outputDir, { recursive: true });
2567
+ const imagesDir = join5(outputDir, "images");
2568
+ mkdirSync3(imagesDir, { recursive: true });
2569
+ this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
2570
+ const resultPath = join5(outputDir, "result.json");
2571
+ writeFileSync3(resultPath, JSON.stringify(merged));
2572
+ try {
2573
+ await this.renderPageImages(pdfPath, outputDir);
2574
+ this.cleanupOrphanedPicFiles(resultPath, imagesDir);
2575
+ this.checkAbort(abortSignal);
2576
+ this.logger.info(
2577
+ "[ChunkedPDFConverter] Executing completion callback..."
2578
+ );
2579
+ await onComplete(outputDir);
2580
+ } finally {
2581
+ if (existsSync3(chunksBaseDir)) {
2582
+ rmSync2(chunksBaseDir, { recursive: true, force: true });
2583
+ }
2584
+ if (cleanupAfterCallback) {
2585
+ this.logger.info(
2586
+ "[ChunkedPDFConverter] Cleaning up output directory:",
2587
+ outputDir
2588
+ );
2589
+ if (existsSync3(outputDir)) {
2590
+ rmSync2(outputDir, { recursive: true, force: true });
2591
+ }
2592
+ } else {
2593
+ this.logger.info(
2594
+ "[ChunkedPDFConverter] Output preserved at:",
2595
+ outputDir
2596
+ );
2597
+ }
2598
+ }
2599
+ return null;
2600
+ }
2601
+ /**
2602
+ * Convert a single chunk with retry logic.
2603
+ */
2604
+ async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions) {
2605
+ const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
2606
+ for (let attempt = 0; attempt <= this.config.maxRetries; attempt++) {
2607
+ try {
2608
+ if (attempt > 0) {
2609
+ this.logger.info(
2610
+ `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${this.config.maxRetries})...`
2611
+ );
2612
+ } else {
2613
+ this.logger.info(
2614
+ `[ChunkedPDFConverter] ${chunkLabel}: converting...`
2615
+ );
2616
+ }
2617
+ const startTime = Date.now();
2618
+ const conversionOptions = buildConversionOptions({
2619
+ ...options,
2620
+ page_range: [startPage, endPage]
2621
+ });
2622
+ const task = await this.client.convertSourceAsync({
2623
+ sources: [{ kind: "http", url: httpUrl }],
2624
+ options: conversionOptions,
2625
+ target: { kind: "zip" }
2626
+ });
2627
+ await this.trackTaskProgress(task);
2628
+ const zipPath = join5(chunkDir, "result.zip");
2629
+ await this.downloadResult(task.taskId, zipPath);
2630
+ const extractDir = join5(chunkDir, "extracted");
2631
+ const chunkOutputDir = join5(chunkDir, "output");
2632
+ await ImageExtractor.extractAndSaveDocumentsFromZip(
2633
+ this.logger,
2634
+ zipPath,
2635
+ extractDir,
2636
+ chunkOutputDir
2637
+ );
2638
+ const resultJsonPath = join5(chunkOutputDir, "result.json");
2639
+ const doc = await runJqFileJson(".", resultJsonPath);
2640
+ if (existsSync3(zipPath)) rmSync2(zipPath, { force: true });
2641
+ if (existsSync3(extractDir)) {
2642
+ rmSync2(extractDir, { recursive: true, force: true });
2643
+ }
2644
+ const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
2645
+ if (attempt > 0) {
2646
+ this.logger.info(
2647
+ `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
2648
+ );
2649
+ } else {
2650
+ this.logger.info(
2651
+ `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
2652
+ );
2653
+ }
2654
+ return doc;
2655
+ } catch (error) {
2656
+ if (attempt >= this.config.maxRetries) {
2657
+ this.logger.error(
2658
+ `[ChunkedPDFConverter] ${chunkLabel}: failed after ${this.config.maxRetries} retries`
2659
+ );
2660
+ throw error;
2661
+ }
2662
+ this.logger.warn(
2663
+ `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${this.config.maxRetries})...`
2664
+ );
2665
+ }
2666
+ }
2667
+ throw new Error("Unreachable");
2668
+ }
2669
+ /** Calculate page ranges for chunks */
2670
+ calculateChunks(totalPages) {
2671
+ if (this.config.chunkSize <= 0) {
2672
+ throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
2673
+ }
2674
+ const ranges = [];
2675
+ for (let start = 1; start <= totalPages; start += this.config.chunkSize) {
2676
+ const end = Math.min(start + this.config.chunkSize - 1, totalPages);
2677
+ ranges.push([start, end]);
2678
+ }
2679
+ return ranges;
2680
+ }
2681
+ /** Get total page count using pdfinfo */
2682
+ async getPageCount(pdfPath) {
2683
+ const result = await spawnAsync("pdfinfo", [pdfPath]);
2684
+ if (result.code !== 0) {
2685
+ return 0;
2686
+ }
2687
+ const match = result.stdout.match(/^Pages:\s+(\d+)/m);
2688
+ return match ? parseInt(match[1], 10) : 0;
2689
+ }
2690
+ /** Poll task progress until completion */
2691
+ async trackTaskProgress(task) {
2692
+ const startTime = Date.now();
2693
+ while (true) {
2694
+ if (Date.now() - startTime > this.timeout) {
2695
+ throw new Error("[ChunkedPDFConverter] Chunk task timeout");
2696
+ }
2697
+ const status = await task.poll();
2698
+ if (status.task_status === "success") return;
2699
+ if (status.task_status === "failure") {
2700
+ let details = "unknown";
2701
+ try {
2702
+ const result = await task.getResult();
2703
+ if (result.errors?.length) {
2704
+ details = result.errors.map((e) => e.message).join("; ");
2705
+ }
2706
+ } catch {
2707
+ }
2708
+ throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
2709
+ }
2710
+ await new Promise(
2711
+ (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
2712
+ );
2713
+ }
2714
+ }
2715
+ /** Download ZIP result for a task */
2716
+ async downloadResult(taskId, zipPath) {
2717
+ const zipResult = await this.client.getTaskResultFile(taskId);
2718
+ if (zipResult.fileStream) {
2719
+ const writeStream = createWriteStream3(zipPath);
2720
+ await pipeline3(zipResult.fileStream, writeStream);
2721
+ return;
2722
+ }
2723
+ if (zipResult.data) {
2724
+ await writeFile(zipPath, zipResult.data);
2725
+ return;
2726
+ }
2727
+ const baseUrl = this.client.getConfig().baseUrl;
2728
+ const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
2729
+ headers: { Accept: "application/zip" }
2730
+ });
2731
+ if (!response.ok) {
2732
+ throw new Error(
2733
+ `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
2734
+ );
2735
+ }
2736
+ const buffer = new Uint8Array(await response.arrayBuffer());
2737
+ await writeFile(zipPath, buffer);
2738
+ }
2739
+ /**
2740
+ * Relocate images from chunk output directories to the final images directory
2741
+ * with global indexing.
2742
+ */
2743
+ relocateImages(chunksBaseDir, totalChunks, imagesDir) {
2744
+ let picGlobalIndex = 0;
2745
+ for (let i = 0; i < totalChunks; i++) {
2746
+ const chunkImagesDir = join5(
2747
+ chunksBaseDir,
2748
+ `_chunk_${i}`,
2749
+ "output",
2750
+ "images"
2751
+ );
2752
+ if (!existsSync3(chunkImagesDir)) continue;
2753
+ const picFiles = readdirSync3(chunkImagesDir).filter((f) => f.startsWith("pic_") && f.endsWith(".png")).sort((a, b) => {
2754
+ const numA = parseInt(a.replace("pic_", "").replace(".png", ""), 10);
2755
+ const numB = parseInt(b.replace("pic_", "").replace(".png", ""), 10);
2756
+ return numA - numB;
2757
+ });
2758
+ for (const file of picFiles) {
2759
+ const src = join5(chunkImagesDir, file);
2760
+ const dest = join5(imagesDir, `pic_${picGlobalIndex}.png`);
2761
+ copyFileSync(src, dest);
2762
+ picGlobalIndex++;
2763
+ }
2764
+ }
2765
+ let imageGlobalIndex = 0;
2766
+ for (let i = 0; i < totalChunks; i++) {
2767
+ const chunkImagesDir = join5(
2768
+ chunksBaseDir,
2769
+ `_chunk_${i}`,
2770
+ "output",
2771
+ "images"
2772
+ );
2773
+ if (!existsSync3(chunkImagesDir)) continue;
2774
+ const imageFiles = readdirSync3(chunkImagesDir).filter((f) => f.startsWith("image_") && f.endsWith(".png")).sort((a, b) => {
2775
+ const numA = parseInt(
2776
+ a.replace("image_", "").replace(".png", ""),
2777
+ 10
2778
+ );
2779
+ const numB = parseInt(
2780
+ b.replace("image_", "").replace(".png", ""),
2781
+ 10
2782
+ );
2783
+ return numA - numB;
2784
+ });
2785
+ for (const file of imageFiles) {
2786
+ const src = join5(chunkImagesDir, file);
2787
+ const dest = join5(imagesDir, `image_${imageGlobalIndex}.png`);
2788
+ copyFileSync(src, dest);
2789
+ imageGlobalIndex++;
2790
+ }
2791
+ }
2792
+ this.logger.info(
2793
+ `[ChunkedPDFConverter] Relocated ${picGlobalIndex} pic + ${imageGlobalIndex} image files to ${imagesDir}`
2794
+ );
2795
+ }
2796
+ /** Render page images from PDF using ImageMagick and update result.json */
2797
+ async renderPageImages(pdfPath, outputDir) {
2798
+ this.logger.info(
2799
+ "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
2800
+ );
2801
+ const renderer = new PageRenderer(this.logger);
2802
+ const renderResult = await renderer.renderPages(pdfPath, outputDir);
2803
+ const resultPath = join5(outputDir, "result.json");
2804
+ const tmpPath = resultPath + ".tmp";
2805
+ const jqProgram = `
2806
+ .pages |= with_entries(
2807
+ if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
2808
+ .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
2809
+ .value.image.mimetype = "image/png" |
2810
+ .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
2811
+ else . end
2812
+ )
2813
+ `;
2814
+ await runJqFileToFile(jqProgram, resultPath, tmpPath);
2815
+ await rename2(tmpPath, resultPath);
2816
+ this.logger.info(
2817
+ `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
2818
+ );
2819
+ }
2820
+ /**
2821
+ * Remove pic_ files from images directory that are not referenced in result.json.
2822
+ * Chunked Docling conversion embeds page images as base64 in JSON, which get
2823
+ * extracted as pic_ files. After renderPageImages replaces page URIs with
2824
+ * pages/page_N.png, these pic_ files become orphaned.
2825
+ */
2826
+ cleanupOrphanedPicFiles(resultPath, imagesDir) {
2827
+ const content = readFileSync3(resultPath, "utf-8");
2828
+ const referencedPics = /* @__PURE__ */ new Set();
2829
+ const picPattern = /images\/pic_\d+\.png/g;
2830
+ let match;
2831
+ while ((match = picPattern.exec(content)) !== null) {
2832
+ referencedPics.add(match[0].replace("images/", ""));
2833
+ }
2834
+ const picFiles = readdirSync3(imagesDir).filter(
2835
+ (f) => f.startsWith("pic_") && f.endsWith(".png")
2836
+ );
2837
+ let removedCount = 0;
2838
+ for (const file of picFiles) {
2839
+ if (!referencedPics.has(file)) {
2840
+ rmSync2(join5(imagesDir, file), { force: true });
2841
+ removedCount++;
2842
+ }
2843
+ }
2844
+ if (removedCount > 0) {
2845
+ this.logger.info(
2846
+ `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
2847
+ );
2848
+ }
2849
+ }
2850
+ /**
2851
+ * Build cumulative pic_ file offsets per chunk for correct URI remapping.
2852
+ * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
2853
+ */
2854
+ buildPicFileOffsets(chunksBaseDir, totalChunks) {
2855
+ const offsets = [];
2856
+ let cumulative = 0;
2857
+ for (let i = 0; i < totalChunks; i++) {
2858
+ offsets.push(cumulative);
2859
+ const dir = join5(chunksBaseDir, `_chunk_${i}`, "output", "images");
2860
+ const count = existsSync3(dir) ? readdirSync3(dir).filter(
2861
+ (f) => f.startsWith("pic_") && f.endsWith(".png")
2862
+ ).length : 0;
2863
+ cumulative += count;
2864
+ }
2865
+ return offsets;
2866
+ }
2867
+ /** Check if abort has been signalled and throw if so */
2868
+ checkAbort(signal) {
2869
+ if (signal?.aborted) {
2870
+ const error = new Error("Chunked PDF conversion was aborted");
2871
+ error.name = "AbortError";
2872
+ throw error;
2873
+ }
2874
+ }
2875
+ };
2876
+
2340
2877
  // src/core/image-pdf-converter.ts
2341
- import { existsSync as existsSync3, rmSync as rmSync2 } from "fs";
2878
+ import { existsSync as existsSync4, rmSync as rmSync3 } from "fs";
2342
2879
  import { tmpdir } from "os";
2343
- import { join as join5 } from "path";
2880
+ import { join as join6 } from "path";
2344
2881
  var ImagePdfConverter = class {
2345
2882
  constructor(logger) {
2346
2883
  this.logger = logger;
@@ -2356,8 +2893,8 @@ var ImagePdfConverter = class {
2356
2893
  async convert(pdfUrl, reportId) {
2357
2894
  const timestamp = Date.now();
2358
2895
  const tempDir = tmpdir();
2359
- const inputPath = join5(tempDir, `${reportId}-${timestamp}-input.pdf`);
2360
- const outputPath = join5(tempDir, `${reportId}-${timestamp}-image.pdf`);
2896
+ const inputPath = join6(tempDir, `${reportId}-${timestamp}-input.pdf`);
2897
+ const outputPath = join6(tempDir, `${reportId}-${timestamp}-image.pdf`);
2361
2898
  try {
2362
2899
  this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
2363
2900
  await this.downloadPdf(pdfUrl, inputPath);
@@ -2366,8 +2903,8 @@ var ImagePdfConverter = class {
2366
2903
  this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
2367
2904
  return outputPath;
2368
2905
  } finally {
2369
- if (existsSync3(inputPath)) {
2370
- rmSync2(inputPath, { force: true });
2906
+ if (existsSync4(inputPath)) {
2907
+ rmSync3(inputPath, { force: true });
2371
2908
  }
2372
2909
  }
2373
2910
  }
@@ -2414,12 +2951,12 @@ var ImagePdfConverter = class {
2414
2951
  * Cleanup the temporary image PDF file
2415
2952
  */
2416
2953
  cleanup(imagePdfPath) {
2417
- if (existsSync3(imagePdfPath)) {
2954
+ if (existsSync4(imagePdfPath)) {
2418
2955
  this.logger.info(
2419
2956
  "[ImagePdfConverter] Cleaning up temp file:",
2420
2957
  imagePdfPath
2421
2958
  );
2422
- rmSync2(imagePdfPath, { force: true });
2959
+ rmSync3(imagePdfPath, { force: true });
2423
2960
  }
2424
2961
  }
2425
2962
  };
@@ -2434,6 +2971,26 @@ var PDFConverter = class {
2434
2971
  }
2435
2972
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
2436
2973
  this.logger.info("[PDFConverter] Converting:", url);
2974
+ if (options.chunkedConversion && url.startsWith("file://")) {
2975
+ const chunked = new ChunkedPDFConverter(
2976
+ this.logger,
2977
+ this.client,
2978
+ {
2979
+ chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
2980
+ maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
2981
+ },
2982
+ this.timeout
2983
+ );
2984
+ return chunked.convertChunked(
2985
+ url,
2986
+ reportId,
2987
+ onComplete,
2988
+ cleanupAfterCallback,
2989
+ options,
2990
+ (opts) => this.buildConversionOptions(opts),
2991
+ abortSignal
2992
+ );
2993
+ }
2437
2994
  if (options.forceImagePdf) {
2438
2995
  return this.convertViaImagePdf(
2439
2996
  url,
@@ -2538,7 +3095,7 @@ var PDFConverter = class {
2538
3095
  const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
2539
3096
  return { method, reason, sampledPages: 0, totalPages: 0 };
2540
3097
  }
2541
- const samplingDir = join6(process.cwd(), "output", reportId, "_sampling");
3098
+ const samplingDir = join7(process.cwd(), "output", reportId, "_sampling");
2542
3099
  const sampler = new OcrStrategySampler(
2543
3100
  this.logger,
2544
3101
  new PageRenderer(this.logger),
@@ -2563,8 +3120,8 @@ var PDFConverter = class {
2563
3120
  }
2564
3121
  return strategy;
2565
3122
  } finally {
2566
- if (existsSync4(samplingDir)) {
2567
- rmSync3(samplingDir, { recursive: true, force: true });
3123
+ if (existsSync5(samplingDir)) {
3124
+ rmSync4(samplingDir, { recursive: true, force: true });
2568
3125
  }
2569
3126
  }
2570
3127
  }
@@ -2585,7 +3142,7 @@ var PDFConverter = class {
2585
3142
  const wrappedCallback = async (outputDir) => {
2586
3143
  let pageTexts;
2587
3144
  try {
2588
- const resultPath2 = join6(outputDir, "result.json");
3145
+ const resultPath2 = join7(outputDir, "result.json");
2589
3146
  const totalPages = await runJqFileJson(
2590
3147
  ".pages | length",
2591
3148
  resultPath2
@@ -2597,9 +3154,9 @@ var PDFConverter = class {
2597
3154
  "[PDFConverter] pdftotext extraction failed, proceeding without text reference"
2598
3155
  );
2599
3156
  }
2600
- const resultPath = join6(outputDir, "result.json");
2601
- const ocrOriginPath = join6(outputDir, "result_ocr_origin.json");
2602
- copyFileSync(resultPath, ocrOriginPath);
3157
+ const resultPath = join7(outputDir, "result.json");
3158
+ const ocrOriginPath = join7(outputDir, "result_ocr_origin.json");
3159
+ copyFileSync2(resultPath, ocrOriginPath);
2603
3160
  const corrector = new VlmTextCorrector(this.logger);
2604
3161
  await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
2605
3162
  concurrency: options.vlmConcurrency,
@@ -2741,9 +3298,9 @@ var PDFConverter = class {
2741
3298
  }
2742
3299
  }
2743
3300
  const cwd = process.cwd();
2744
- const zipPath = join6(cwd, "result.zip");
2745
- const extractDir = join6(cwd, "result_extracted");
2746
- const outputDir = join6(cwd, "output", reportId);
3301
+ const zipPath = join7(cwd, "result.zip");
3302
+ const extractDir = join7(cwd, "result_extracted");
3303
+ const outputDir = join7(cwd, "output", reportId);
2747
3304
  try {
2748
3305
  await this.processConvertedFiles(zipPath, extractDir, outputDir);
2749
3306
  await this.renderPageImages(url, outputDir);
@@ -2760,19 +3317,19 @@ var PDFConverter = class {
2760
3317
  this.logger.info("[PDFConverter] Total time:", duration, "ms");
2761
3318
  } finally {
2762
3319
  this.logger.info("[PDFConverter] Cleaning up temporary files...");
2763
- if (existsSync4(zipPath)) {
2764
- rmSync3(zipPath, { force: true });
3320
+ if (existsSync5(zipPath)) {
3321
+ rmSync4(zipPath, { force: true });
2765
3322
  }
2766
- if (existsSync4(extractDir)) {
2767
- rmSync3(extractDir, { recursive: true, force: true });
3323
+ if (existsSync5(extractDir)) {
3324
+ rmSync4(extractDir, { recursive: true, force: true });
2768
3325
  }
2769
3326
  if (cleanupAfterCallback) {
2770
3327
  this.logger.info(
2771
3328
  "[PDFConverter] Cleaning up output directory:",
2772
3329
  outputDir
2773
3330
  );
2774
- if (existsSync4(outputDir)) {
2775
- rmSync3(outputDir, { recursive: true, force: true });
3331
+ if (existsSync5(outputDir)) {
3332
+ rmSync4(outputDir, { recursive: true, force: true });
2776
3333
  }
2777
3334
  } else {
2778
3335
  this.logger.info("[PDFConverter] Output preserved at:", outputDir);
@@ -2790,7 +3347,10 @@ var PDFConverter = class {
2790
3347
  "skipSampling",
2791
3348
  "forcedMethod",
2792
3349
  "aggregator",
2793
- "onTokenUsage"
3350
+ "onTokenUsage",
3351
+ "chunkedConversion",
3352
+ "chunkSize",
3353
+ "chunkMaxRetries"
2794
3354
  ]),
2795
3355
  to_formats: ["json", "html"],
2796
3356
  image_export_mode: "embedded",
@@ -2918,15 +3478,15 @@ var PDFConverter = class {
2918
3478
  "\n[PDFConverter] Task completed, downloading ZIP file..."
2919
3479
  );
2920
3480
  const zipResult = await this.client.getTaskResultFile(taskId);
2921
- const zipPath = join6(process.cwd(), "result.zip");
3481
+ const zipPath = join7(process.cwd(), "result.zip");
2922
3482
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
2923
3483
  if (zipResult.fileStream) {
2924
- const writeStream = createWriteStream3(zipPath);
2925
- await pipeline3(zipResult.fileStream, writeStream);
3484
+ const writeStream = createWriteStream4(zipPath);
3485
+ await pipeline4(zipResult.fileStream, writeStream);
2926
3486
  return;
2927
3487
  }
2928
3488
  if (zipResult.data) {
2929
- await writeFile(zipPath, zipResult.data);
3489
+ await writeFile2(zipPath, zipResult.data);
2930
3490
  return;
2931
3491
  }
2932
3492
  this.logger.warn(
@@ -2942,7 +3502,7 @@ var PDFConverter = class {
2942
3502
  );
2943
3503
  }
2944
3504
  const buffer = new Uint8Array(await response.arrayBuffer());
2945
- await writeFile(zipPath, buffer);
3505
+ await writeFile2(zipPath, buffer);
2946
3506
  }
2947
3507
  async processConvertedFiles(zipPath, extractDir, outputDir) {
2948
3508
  await ImageExtractor.extractAndSaveDocumentsFromZip(
@@ -2971,7 +3531,7 @@ var PDFConverter = class {
2971
3531
  );
2972
3532
  const renderer = new PageRenderer(this.logger);
2973
3533
  const renderResult = await renderer.renderPages(pdfPath, outputDir);
2974
- const resultPath = join6(outputDir, "result.json");
3534
+ const resultPath = join7(outputDir, "result.json");
2975
3535
  const tmpPath = resultPath + ".tmp";
2976
3536
  const jqProgram = `
2977
3537
  .pages |= with_entries(
@@ -2983,7 +3543,7 @@ var PDFConverter = class {
2983
3543
  )
2984
3544
  `;
2985
3545
  await runJqFileToFile(jqProgram, resultPath, tmpPath);
2986
- await rename2(tmpPath, resultPath);
3546
+ await rename3(tmpPath, resultPath);
2987
3547
  this.logger.info(
2988
3548
  `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2989
3549
  );
@@ -3018,7 +3578,7 @@ var PDFParser = class {
3018
3578
  this.baseUrl = void 0;
3019
3579
  }
3020
3580
  this.timeout = timeout;
3021
- this.venvPath = venvPath || join7(process.cwd(), ".venv");
3581
+ this.venvPath = venvPath || join8(process.cwd(), ".venv");
3022
3582
  this.killExistingProcess = killExistingProcess;
3023
3583
  this.enableImagePdfFallback = enableImagePdfFallback;
3024
3584
  }