npm - @heripo/pdf-parser - Versions diffs - 0.1.10 → 0.1.12 - Mend

@heripo/pdf-parser 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.js CHANGED Viewed

@@ -2,7 +2,7 @@
 import { Docling } from "docling-sdk";
 import { execSync } from "child_process";
 import { platform } from "os";
-import { join as join7 } from "path";
+import { join as join8 } from "path";
 // src/config/constants.ts
 var PDF_PARSER = {
@@ -49,6 +49,12 @@ var PAGE_RENDERING = {
   /** Low-resolution DPI for OCR strategy sampling */
   SAMPLE_DPI: 150
 };
+var CHUNKED_CONVERSION = { // defaults for chunked PDF conversion
+  /** Default number of pages per chunk; applied when options.chunkSize is null/undefined */
+  DEFAULT_CHUNK_SIZE: 10,
+  /**
+   * Default number of retries per failed chunk; applied when
+   * options.chunkMaxRetries is null/undefined. Retries come on top of the
+   * first attempt, so a chunk is tried up to this value + 1 times.
+   */
+  DEFAULT_MAX_RETRIES: 2
+};
 var IMAGE_PDF_CONVERTER = {
   /**
    * ImageMagick density option (DPI) for PDF to image conversion
@@ -843,10 +849,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
 // src/core/pdf-converter.ts
 import { omit } from "es-toolkit";
-import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
-import { rename as rename2, writeFile } from "fs/promises";
-import { join as join6 } from "path";
-import { pipeline as pipeline3 } from "stream/promises";
+import { copyFileSync as copyFileSync2, createWriteStream as createWriteStream4, existsSync as existsSync5, rmSync as rmSync4 } from "fs";
+import { rename as rename3, writeFile as writeFile2 } from "fs/promises";
+import { join as join7 } from "path";
+import { pipeline as pipeline4 } from "stream/promises";
 // src/errors/image-pdf-fallback-error.ts
 var ImagePdfFallbackError = class extends Error {
@@ -1301,14 +1307,18 @@ var ImageExtractor = class _ImageExtractor {
 // src/processors/page-renderer.ts
 import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
 import { join as join3 } from "path";
-var PROGRESS_POLL_INTERVAL_MS = 2e3;
+var PROGRESS_LOG_PERCENT_STEP = 10;
 var PageRenderer = class {
   constructor(logger) {
     this.logger = logger;
   }
+  lastLoggedPercent = 0;
   /**
    * Render all pages of a PDF to individual PNG files.
    *
+   * Uses per-page rendering (`magick 'input.pdf[N]'`) when page count is known,
+   * limiting peak memory to ~15MB/page instead of loading all pages at once.
+   *
    * @param pdfPath - Absolute path to the source PDF file
    * @param outputDir - Directory where pages/ subdirectory will be created
    * @param options - Rendering options
@@ -1325,50 +1335,54 @@ var PageRenderer = class {
       this.logger.info(
         `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
       );
+      this.lastLoggedPercent = 0;
+      for (let i = 0; i < totalPages; i++) {
+        const result = await spawnAsync(
+          "magick",
+          [
+            "-density",
+            dpi.toString(),
+            `${pdfPath}[${i}]`,
+            "-background",
+            "white",
+            "-alpha",
+            "remove",
+            "-alpha",
+            "off",
+            join3(pagesDir, `page_${i}.png`)
+          ],
+          { captureStdout: false }
+        );
+        if (result.code !== 0) {
+          throw new Error(
+            `[PageRenderer] Failed to render page ${i + 1}/${totalPages}: ${result.stderr || "Unknown error"}`
+          );
+        }
+        this.logProgress(i + 1, totalPages);
+      }
     } else {
       this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
-    }
-    const outputPattern = join3(pagesDir, "page_%d.png");
-    let progressInterval = null;
-    if (totalPages > 0) {
-      let lastLoggedCount = 0;
-      progressInterval = setInterval(() => {
-        try {
-          const rendered = readdirSync2(pagesDir).filter(
-            (f) => f.startsWith("page_") && f.endsWith(".png")
-          ).length;
-          if (rendered > 0 && rendered !== lastLoggedCount) {
-            lastLoggedCount = rendered;
-            this.logger.info(
-              `[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
-            );
-          }
-        } catch {
-        }
-      }, PROGRESS_POLL_INTERVAL_MS);
-    }
-    try {
-      const result = await spawnAsync("magick", [
-        "-density",
-        dpi.toString(),
-        pdfPath,
-        "-background",
-        "white",
-        "-alpha",
-        "remove",
-        "-alpha",
-        "off",
-        outputPattern
-      ]);
+      const result = await spawnAsync(
+        "magick",
+        [
+          "-density",
+          dpi.toString(),
+          pdfPath,
+          "-background",
+          "white",
+          "-alpha",
+          "remove",
+          "-alpha",
+          "off",
+          join3(pagesDir, "page_%d.png")
+        ],
+        { captureStdout: false }
+      );
       if (result.code !== 0) {
         throw new Error(
           `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
         );
       }
-    } finally {
-      if (progressInterval) {
-        clearInterval(progressInterval);
-      }
     }
     const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
       const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
@@ -1384,6 +1398,18 @@ var PageRenderer = class {
       pageFiles
     };
   }
+  /**
+   * Log rendering progress, throttled by percentage.
+   *
+   * A line is logged when the completed percentage has advanced at least
+   * PROGRESS_LOG_PERCENT_STEP points past the last logged value, and always
+   * when `current === total`. `lastLoggedPercent` is updated every time a
+   * line is logged.
+   *
+   * @param current - Number of pages rendered so far (the visible caller passes `i + 1`)
+   * @param total - Total number of pages. NOTE(review): `total === 0` would
+   *   make the percentage non-finite; the visible caller only calls this from
+   *   inside a loop bounded by `totalPages`, so that case should not occur.
+   */
+  logProgress(current, total) {
+    const percent = Math.floor(current / total * 100); // rounded down to a whole percent
+    if (percent >= this.lastLoggedPercent + PROGRESS_LOG_PERCENT_STEP || current === total) {
+      this.lastLoggedPercent = percent;
+      this.logger.info(
+        `[PageRenderer] Rendering pages: ${current}/${total} (${percent}%)`
+      );
+    }
+  }
   /**
    * Get total page count using pdfinfo.
    * Returns 0 on failure (progress logging will be skipped).
@@ -2317,10 +2343,541 @@ var LocalFileServer = class {
   }
 };
+// src/core/chunked-pdf-converter.ts
+import {
+  copyFileSync,
+  createWriteStream as createWriteStream3,
+  existsSync as existsSync3,
+  mkdirSync as mkdirSync3,
+  readFileSync as readFileSync3,
+  readdirSync as readdirSync3,
+  rmSync as rmSync2,
+  writeFileSync as writeFileSync3
+} from "fs";
+import { rename as rename2, writeFile } from "fs/promises";
+import { join as join5 } from "path";
+import { pipeline as pipeline3 } from "stream/promises";
+// src/processors/docling-document-merger.ts
+var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/; // whole-string match; group 1 = collection name, group 2 = numeric index
+var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/; // whole-string match; group 1 = numeric pic file index
+var DoclingDocumentMerger = class {
+  /**
+   * Merge an array of DoclingDocuments into one.
+   *
+   * A deep copy of the first chunk is used as the base, so every top-level
+   * field that is not merged below (for example schema_name, version, name,
+   * origin) comes from the first chunk. For each later chunk, its texts,
+   * pictures, tables and groups are deep-copied, their `self_ref`, `parent`,
+   * `children` (plus `captions` / `footnotes` where present in the code
+   * below) are shifted by the number of items already in the base, and the
+   * copies are appended. `body.children` and `furniture.children` are
+   * appended with the same shift, and `pages` entries are copied over with
+   * Object.assign (an entry with an already-present key is overwritten).
+   *
+   * NOTE(review): when exactly one chunk is given, that same object is
+   * returned without cloning, so the caller's object and the result alias.
+   *
+   * @param chunks - Array of DoclingDocument objects to merge (must have at least 1)
+   * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
+   *   When provided, picFileOffsets[i] is used for pic_ URI remapping instead of
+   *   the pictures array length, aligning URIs with relocated file indices.
+   *   NOTE(review): no length check is done; a missing entry for chunk i
+   *   would feed `undefined` into the URI arithmetic — confirm callers always
+   *   pass one entry per chunk.
+   * @returns Merged DoclingDocument
+   * @throws Error if `chunks` is empty
+   */
+  merge(chunks, picFileOffsets) {
+    if (chunks.length === 0) {
+      throw new Error("Cannot merge zero chunks");
+    }
+    if (chunks.length === 1) {
+      return chunks[0]; // returned as-is, not cloned
+    }
+    const base = structuredClone(chunks[0]); // deep copy so the first input chunk is not mutated
+    for (let i = 1; i < chunks.length; i++) {
+      const chunk = chunks[i];
+      const offsets = { // item counts accumulated so far = index shift for this chunk's refs
+        texts: base.texts.length,
+        pictures: base.pictures.length,
+        tables: base.tables.length,
+        groups: base.groups.length
+      };
+      const picFileOffset = picFileOffsets ? picFileOffsets[i] : offsets.pictures; // falls back to the picture count when no file offsets are given
+      for (const text of chunk.texts) {
+        const remapped = structuredClone(text);
+        remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
+        if (remapped.parent) {
+          remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
+        }
+        remapped.children = remapped.children.map((c) => ({
+          $ref: this.remapRef(c.$ref, offsets)
+        }));
+        base.texts.push(remapped);
+      }
+      for (const picture of chunk.pictures) {
+        const remapped = structuredClone(picture);
+        remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
+        if (remapped.parent) {
+          remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
+        }
+        remapped.children = remapped.children.map((c) => ({
+          $ref: this.remapRef(c.$ref, offsets)
+        }));
+        remapped.captions = remapped.captions.map((c) => ({
+          $ref: this.remapRef(c.$ref, offsets)
+        }));
+        this.remapPictureImageUri(remapped, picFileOffset); // mutates the copy's image.uri in place
+        base.pictures.push(remapped);
+      }
+      for (const table of chunk.tables) {
+        const remapped = structuredClone(table);
+        remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
+        if (remapped.parent) {
+          remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
+        }
+        remapped.children = remapped.children.map((c) => ({
+          $ref: this.remapRef(c.$ref, offsets)
+        }));
+        remapped.captions = remapped.captions.map((c) => ({
+          $ref: this.remapRef(c.$ref, offsets)
+        }));
+        remapped.footnotes = remapped.footnotes.map((f) => ({
+          $ref: this.remapRef(f.$ref, offsets)
+        }));
+        base.tables.push(remapped);
+      }
+      for (const group of chunk.groups) {
+        const remapped = structuredClone(group);
+        remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
+        if (remapped.parent) {
+          remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
+        }
+        remapped.children = remapped.children.map((c) => ({
+          $ref: this.remapRef(c.$ref, offsets)
+        }));
+        base.groups.push(remapped);
+      }
+      for (const child of chunk.body.children) {
+        base.body.children.push({
+          $ref: this.remapRef(child.$ref, offsets)
+        });
+      }
+      for (const child of chunk.furniture.children) {
+        base.furniture.children.push({
+          $ref: this.remapRef(child.$ref, offsets)
+        });
+      }
+      Object.assign(base.pages, chunk.pages); // same-key entries from this chunk overwrite earlier ones
+    }
+    return base;
+  }
+  /**
+   * Remap a $ref string by applying offsets.
+   * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
+   * Refs like "#/body" or "#/furniture" pass through unchanged.
+   *
+   * @param ref - The reference string to remap
+   * @param offsets - Object with numeric `texts`, `pictures`, `tables` and
+   *   `groups` entries; the entry named in the ref is added to its index
+   * @returns The ref with its index shifted, or `ref` itself when it does not
+   *   match the pattern
+   */
+  remapRef(ref, offsets) {
+    const match = REF_PATTERN.exec(ref);
+    if (!match) {
+      return ref;
+    }
+    const kind = match[1]; // collection name captured from the ref
+    const index = parseInt(match[2], 10);
+    return `#/${kind}/${index + offsets[kind]}`;
+  }
+  /**
+   * Remap image URI in a picture item by applying the pic file offset.
+   * Transforms "images/pic_N.png" → "images/pic_{N+offset}.png"
+   *
+   * The picture is modified in place. Nothing happens when the picture has
+   * no `image`, when `image.uri` is falsy, or when the URI does not match
+   * the "images/pic_N.png" form exactly.
+   *
+   * @param picture - Picture item whose `image.uri` may be rewritten
+   * @param picFileOffset - Number added to the index found in the URI
+   */
+  remapPictureImageUri(picture, picFileOffset) {
+    const rec = picture;
+    const image = rec.image;
+    if (!image?.uri) return;
+    const match = IMAGE_URI_PATTERN.exec(image.uri);
+    if (match) {
+      const index = parseInt(match[1], 10);
+      image.uri = `images/pic_${index + picFileOffset}.png`;
+    }
+  }
+};
+// src/core/chunked-pdf-converter.ts
+var ChunkedPDFConverter = class {
+  constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
+    this.logger = logger; // used via info / warn / error in this class
+    this.client = client; // used via convertSourceAsync, getTaskResultFile and getConfig in this class
+    this.config = config; // read as config.chunkSize and config.maxRetries
+    this.timeout = timeout; // compared against elapsed milliseconds while polling a chunk task
+  }
+  /**
+   * Convert a local PDF in chunks.
+   *
+   * Steps, in order: detect the page count, split it into page ranges, serve
+   * the PDF through a LocalFileServer, convert each range sequentially into
+   * `<cwd>/output/<reportId>/_chunks/_chunk_<i>`, stop the server, merge the
+   * chunk documents, copy chunk images into `<cwd>/output/<reportId>/images`,
+   * write `result.json`, render page images, delete unreferenced pic_ files,
+   * then invoke `onComplete`. The `_chunks` directory is removed afterwards,
+   * and the output directory too when `cleanupAfterCallback` is truthy.
+   *
+   * NOTE(review): the `_chunks` cleanup lives in the second try/finally
+   * only. If a chunk conversion throws or an abort is detected before that
+   * point, the chunk directories created so far are left on disk.
+   *
+   * @param url - file:// URL to the source PDF
+   * @param reportId - Unique report identifier for output directory naming
+   * @param onComplete - Callback invoked with the final output directory
+   * @param cleanupAfterCallback - Whether to clean up the output directory after callback
+   * @param options - PDF conversion options; passed through to
+   *   `buildConversionOptions` with a per-chunk `page_range` added
+   *   (chunked-specific fields are presumably stripped there — confirm)
+   * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
+   * @param abortSignal - Optional abort signal for cancellation; checked before
+   *   each chunk, after the chunk loop, and before the completion callback
+   * @returns Always `null`
+   * @throws Error when the page count cannot be detected, when a chunk fails
+   *   after all retries, or (with name "AbortError") when aborted
+   */
+  async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
+    const pdfPath = url.slice(7); // drops the first 7 characters (the length of "file://"); NOTE(review): no percent-decoding is applied
+    const cwd = process.cwd();
+    const outputDir = join5(cwd, "output", reportId);
+    const chunksBaseDir = join5(cwd, "output", reportId, "_chunks");
+    const totalPages = await this.getPageCount(pdfPath);
+    if (totalPages === 0) {
+      throw new Error(
+        "[ChunkedPDFConverter] Failed to detect page count from PDF"
+      );
+    }
+    const chunks = this.calculateChunks(totalPages);
+    this.logger.info(
+      `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
+    );
+    const server = new LocalFileServer();
+    const httpUrl = await server.start(pdfPath);
+    this.logger.info(
+      "[ChunkedPDFConverter] Started local file server:",
+      httpUrl
+    );
+    const chunkDocuments = [];
+    try {
+      for (let i = 0; i < chunks.length; i++) { // chunks are converted one at a time, in order
+        this.checkAbort(abortSignal);
+        const [start, end] = chunks[i];
+        const chunkDir = join5(chunksBaseDir, `_chunk_${i}`);
+        mkdirSync3(chunkDir, { recursive: true });
+        const doc = await this.convertChunk(
+          i,
+          chunks.length,
+          start,
+          end,
+          httpUrl,
+          chunkDir,
+          options,
+          buildConversionOptions
+        );
+        chunkDocuments.push(doc);
+      }
+    } finally {
+      this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
+      await server.stop(); // runs on success and on failure
+    }
+    this.checkAbort(abortSignal);
+    this.logger.info(
+      `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
+    );
+    const merger = new DoclingDocumentMerger();
+    const picFileOffsets = this.buildPicFileOffsets(
+      chunksBaseDir,
+      chunks.length
+    );
+    const merged = merger.merge(chunkDocuments, picFileOffsets);
+    this.logger.info(
+      `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
+    );
+    mkdirSync3(outputDir, { recursive: true });
+    const imagesDir = join5(outputDir, "images");
+    mkdirSync3(imagesDir, { recursive: true });
+    this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
+    const resultPath = join5(outputDir, "result.json");
+    writeFileSync3(resultPath, JSON.stringify(merged));
+    try {
+      await this.renderPageImages(pdfPath, outputDir); // rewrites result.json on disk
+      this.cleanupOrphanedPicFiles(resultPath, imagesDir);
+      this.checkAbort(abortSignal);
+      this.logger.info(
+        "[ChunkedPDFConverter] Executing completion callback..."
+      );
+      await onComplete(outputDir);
+    } finally {
+      if (existsSync3(chunksBaseDir)) {
+        rmSync2(chunksBaseDir, { recursive: true, force: true });
+      }
+      if (cleanupAfterCallback) {
+        this.logger.info(
+          "[ChunkedPDFConverter] Cleaning up output directory:",
+          outputDir
+        );
+        if (existsSync3(outputDir)) {
+          rmSync2(outputDir, { recursive: true, force: true });
+        }
+      } else {
+        this.logger.info(
+          "[ChunkedPDFConverter] Output preserved at:",
+          outputDir
+        );
+      }
+    }
+    return null;
+  }
+  /**
+   * Convert a single chunk with retry logic.
+   *
+   * Each attempt builds conversion options with `page_range` set to
+   * `[startPage, endPage]`, submits the HTTP source to the client with a ZIP
+   * target, waits for the task, downloads the ZIP into `chunkDir`, extracts
+   * it via ImageExtractor into `chunkDir/output`, and reads
+   * `chunkDir/output/result.json`. The ZIP and the extraction directory are
+   * removed after a successful attempt.
+   *
+   * Attempts run from 0 to `config.maxRetries` inclusive. An error on the
+   * last attempt is logged and rethrown; earlier errors are logged as a
+   * warning and the loop continues.
+   *
+   * NOTE(review): the caught error itself is not included in the warning,
+   * and files left in `chunkDir` by a failed attempt are not removed before
+   * the next one.
+   *
+   * @param chunkIndex - Zero-based chunk index (used for log labels)
+   * @param totalChunks - Total number of chunks (used for log labels)
+   * @param startPage - First page of the range passed as `page_range[0]`
+   * @param endPage - Last page of the range passed as `page_range[1]`
+   * @param httpUrl - URL of the PDF as served by the local file server
+   * @param chunkDir - Working directory for this chunk
+   * @param options - PDF conversion options forwarded to `buildConversionOptions`
+   * @param buildConversionOptions - Function that builds the client conversion options
+   * @returns The document read from the chunk's result.json
+   */
+  async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions) {
+    const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
+    for (let attempt = 0; attempt <= this.config.maxRetries; attempt++) { // attempt 0 is the initial try
+      try {
+        if (attempt > 0) {
+          this.logger.info(
+            `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${this.config.maxRetries})...`
+          );
+        } else {
+          this.logger.info(
+            `[ChunkedPDFConverter] ${chunkLabel}: converting...`
+          );
+        }
+        const startTime = Date.now();
+        const conversionOptions = buildConversionOptions({
+          ...options,
+          page_range: [startPage, endPage]
+        });
+        const task = await this.client.convertSourceAsync({
+          sources: [{ kind: "http", url: httpUrl }],
+          options: conversionOptions,
+          target: { kind: "zip" }
+        });
+        await this.trackTaskProgress(task);
+        const zipPath = join5(chunkDir, "result.zip");
+        await this.downloadResult(task.taskId, zipPath);
+        const extractDir = join5(chunkDir, "extracted");
+        const chunkOutputDir = join5(chunkDir, "output");
+        await ImageExtractor.extractAndSaveDocumentsFromZip(
+          this.logger,
+          zipPath,
+          extractDir,
+          chunkOutputDir
+        );
+        const resultJsonPath = join5(chunkOutputDir, "result.json");
+        const doc = await runJqFileJson(".", resultJsonPath); // "." is the jq identity filter
+        if (existsSync3(zipPath)) rmSync2(zipPath, { force: true });
+        if (existsSync3(extractDir)) {
+          rmSync2(extractDir, { recursive: true, force: true });
+        }
+        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1); // seconds, one decimal
+        if (attempt > 0) {
+          this.logger.info(
+            `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
+          );
+        } else {
+          this.logger.info(
+            `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
+          );
+        }
+        return doc;
+      } catch (error) {
+        if (attempt >= this.config.maxRetries) { // no attempts left: surface the last error
+          this.logger.error(
+            `[ChunkedPDFConverter] ${chunkLabel}: failed after ${this.config.maxRetries} retries`
+          );
+          throw error;
+        }
+        this.logger.warn(
+          `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${this.config.maxRetries})...`
+        );
+      }
+    }
+    throw new Error("Unreachable"); // NOTE(review): reached if the loop never runs, e.g. a negative maxRetries
+  }
+  /**
+   * Calculate page ranges for chunks.
+   *
+   * Ranges start at page 1, are inclusive on both ends, and are
+   * `config.chunkSize` pages long except possibly the last one, which ends
+   * at `totalPages`.
+   *
+   * @param totalPages - Total number of pages to cover
+   * @returns Array of `[start, end]` pairs; empty when `totalPages` is less than 1
+   * @throws Error if `config.chunkSize` is zero or negative
+   */
+  calculateChunks(totalPages) {
+    if (this.config.chunkSize <= 0) {
+      throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
+    }
+    const ranges = [];
+    for (let start = 1; start <= totalPages; start += this.config.chunkSize) {
+      const end = Math.min(start + this.config.chunkSize - 1, totalPages); // clamp the final range to the last page
+      ranges.push([start, end]);
+    }
+    return ranges;
+  }
+  /**
+   * Get total page count using pdfinfo.
+   *
+   * Calls `spawnAsync("pdfinfo", [pdfPath])` and parses the first line of
+   * its stdout that starts with "Pages:" followed by whitespace and digits.
+   *
+   * @param pdfPath - Path to the PDF file
+   * @returns The parsed page count, or 0 when the exit code is non-zero or
+   *   no "Pages:" line is found
+   */
+  async getPageCount(pdfPath) {
+    const result = await spawnAsync("pdfinfo", [pdfPath]);
+    if (result.code !== 0) {
+      return 0;
+    }
+    const match = result.stdout.match(/^Pages:\s+(\d+)/m);
+    return match ? parseInt(match[1], 10) : 0;
+  }
+  /**
+   * Poll task progress until completion.
+   *
+   * Repeatedly calls `task.poll()`, waiting PDF_CONVERTER.POLL_INTERVAL_MS
+   * between polls, until the status is "success". The timeout is measured
+   * from the moment this method starts and is checked before each poll.
+   *
+   * @param task - Task object exposing `poll()` and `getResult()`
+   * @returns Resolves with no value once the task reports "success"
+   * @throws Error when `this.timeout` milliseconds have elapsed, or when the
+   *   task reports "failure" (the message carries the joined error messages
+   *   from `task.getResult()` when available, otherwise "unknown")
+   */
+  async trackTaskProgress(task) {
+    const startTime = Date.now();
+    while (true) {
+      if (Date.now() - startTime > this.timeout) {
+        throw new Error("[ChunkedPDFConverter] Chunk task timeout");
+      }
+      const status = await task.poll();
+      if (status.task_status === "success") return;
+      if (status.task_status === "failure") {
+        let details = "unknown";
+        try {
+          const result = await task.getResult();
+          if (result.errors?.length) {
+            details = result.errors.map((e) => e.message).join("; ");
+          }
+        } catch { // best effort: a failing getResult() leaves details as "unknown"
+        }
+        throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
+      }
+      await new Promise(
+        (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
+      );
+    }
+  }
+  /**
+   * Download ZIP result for a task.
+   *
+   * Tries three sources in order: the `fileStream` returned by
+   * `client.getTaskResultFile`, then its in-memory `data`, and finally a
+   * direct `fetch` of `<baseUrl>/v1/result/<taskId>`.
+   *
+   * @param taskId - Identifier of the finished task
+   * @param zipPath - Destination path for the ZIP file
+   * @throws Error when the fallback HTTP request returns a non-OK status
+   */
+  async downloadResult(taskId, zipPath) {
+    const zipResult = await this.client.getTaskResultFile(taskId);
+    if (zipResult.fileStream) {
+      const writeStream = createWriteStream3(zipPath);
+      await pipeline3(zipResult.fileStream, writeStream); // stream to disk
+      return;
+    }
+    if (zipResult.data) {
+      await writeFile(zipPath, zipResult.data);
+      return;
+    }
+    const baseUrl = this.client.getConfig().baseUrl; // fallback: neither a stream nor data was provided
+    const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
+      headers: { Accept: "application/zip" }
+    });
+    if (!response.ok) {
+      throw new Error(
+        `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
+      );
+    }
+    const buffer = new Uint8Array(await response.arrayBuffer());
+    await writeFile(zipPath, buffer);
+  }
+  /**
+   * Relocate images from chunk output directories to the final images directory
+   * with global indexing.
+   *
+   * Two passes are made over the chunks, first for `pic_*.png` and then for
+   * `image_*.png`. In each pass, files are taken chunk by chunk, sorted by
+   * the number in their name, and copied (the sources are left in place) to
+   * `imagesDir` under a counter that starts at 0 and increases by one per
+   * file. Chunks without an `output/images` directory are skipped.
+   *
+   * @param chunksBaseDir - Directory containing the `_chunk_<i>` directories
+   * @param totalChunks - Number of chunk directories to visit
+   * @param imagesDir - Destination directory for the renumbered files
+   */
+  relocateImages(chunksBaseDir, totalChunks, imagesDir) {
+    let picGlobalIndex = 0;
+    for (let i = 0; i < totalChunks; i++) {
+      const chunkImagesDir = join5(
+        chunksBaseDir,
+        `_chunk_${i}`,
+        "output",
+        "images"
+      );
+      if (!existsSync3(chunkImagesDir)) continue;
+      const picFiles = readdirSync3(chunkImagesDir).filter((f) => f.startsWith("pic_") && f.endsWith(".png")).sort((a, b) => { // numeric order, so pic_10 sorts after pic_2
+        const numA = parseInt(a.replace("pic_", "").replace(".png", ""), 10);
+        const numB = parseInt(b.replace("pic_", "").replace(".png", ""), 10);
+        return numA - numB;
+      });
+      for (const file of picFiles) {
+        const src = join5(chunkImagesDir, file);
+        const dest = join5(imagesDir, `pic_${picGlobalIndex}.png`);
+        copyFileSync(src, dest);
+        picGlobalIndex++;
+      }
+    }
+    let imageGlobalIndex = 0;
+    for (let i = 0; i < totalChunks; i++) {
+      const chunkImagesDir = join5(
+        chunksBaseDir,
+        `_chunk_${i}`,
+        "output",
+        "images"
+      );
+      if (!existsSync3(chunkImagesDir)) continue;
+      const imageFiles = readdirSync3(chunkImagesDir).filter((f) => f.startsWith("image_") && f.endsWith(".png")).sort((a, b) => { // numeric order, as above
+        const numA = parseInt(
+          a.replace("image_", "").replace(".png", ""),
+          10
+        );
+        const numB = parseInt(
+          b.replace("image_", "").replace(".png", ""),
+          10
+        );
+        return numA - numB;
+      });
+      for (const file of imageFiles) {
+        const src = join5(chunkImagesDir, file);
+        const dest = join5(imagesDir, `image_${imageGlobalIndex}.png`);
+        copyFileSync(src, dest);
+        imageGlobalIndex++;
+      }
+    }
+    this.logger.info(
+      `[ChunkedPDFConverter] Relocated ${picGlobalIndex} pic + ${imageGlobalIndex} image files to ${imagesDir}`
+    );
+  }
+  /**
+   * Render page images from PDF using ImageMagick and update result.json.
+   *
+   * Renders through PageRenderer, then runs a jq program over
+   * `<outputDir>/result.json`: for every entry in `.pages` whose
+   * `page_no - 1` lies in `[0, renderResult.pageCount)`, the entry's
+   * `image.uri` is set to `pages/page_<page_no - 1>.png`, `image.mimetype`
+   * to "image/png" and `image.dpi` to PAGE_RENDERING.DEFAULT_DPI. Other
+   * entries are left unchanged. The output goes to a `.tmp` file that is
+   * then renamed over result.json.
+   *
+   * @param pdfPath - Path to the source PDF
+   * @param outputDir - Directory that holds result.json and receives the rendered pages
+   */
+  async renderPageImages(pdfPath, outputDir) {
+    this.logger.info(
+      "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
+    );
+    const renderer = new PageRenderer(this.logger);
+    const renderResult = await renderer.renderPages(pdfPath, outputDir);
+    const resultPath = join5(outputDir, "result.json");
+    const tmpPath = resultPath + ".tmp";
+    const jqProgram = `
+      .pages |= with_entries(
+        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
+          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
+          .value.image.mimetype = "image/png" |
+          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
+        else . end
+      )
+    `;
+    await runJqFileToFile(jqProgram, resultPath, tmpPath);
+    await rename2(tmpPath, resultPath); // replace result.json with the rewritten file
+    this.logger.info(
+      `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
+    );
+  }
+  /**
+   * Remove pic_ files from images directory that are not referenced in result.json.
+   * Chunked Docling conversion embeds page images as base64 in JSON, which get
+   * extracted as pic_ files. After renderPageImages replaces page URIs with
+   * pages/page_N.png, these pic_ files become orphaned.
+   *
+   * References are found by scanning the raw text of result.json for
+   * `images/pic_<digits>.png`; the file is not parsed as JSON here. A log
+   * line is written only when at least one file was removed.
+   *
+   * @param resultPath - Path to result.json
+   * @param imagesDir - Directory whose `pic_*.png` files are checked
+   */
+  cleanupOrphanedPicFiles(resultPath, imagesDir) {
+    const content = readFileSync3(resultPath, "utf-8");
+    const referencedPics = /* @__PURE__ */ new Set();
+    const picPattern = /images\/pic_\d+\.png/g;
+    let match;
+    while ((match = picPattern.exec(content)) !== null) {
+      referencedPics.add(match[0].replace("images/", "")); // keep only the file name
+    }
+    const picFiles = readdirSync3(imagesDir).filter(
+      (f) => f.startsWith("pic_") && f.endsWith(".png")
+    );
+    let removedCount = 0;
+    for (const file of picFiles) {
+      if (!referencedPics.has(file)) {
+        rmSync2(join5(imagesDir, file), { force: true });
+        removedCount++;
+      }
+    }
+    if (removedCount > 0) {
+      this.logger.info(
+        `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
+      );
+    }
+  }
+  /**
+   * Build cumulative pic_ file offsets per chunk for correct URI remapping.
+   * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
+   *
+   * Files are counted in `<chunksBaseDir>/_chunk_<i>/output/images`; a chunk
+   * without that directory counts as 0 files.
+   *
+   * @param chunksBaseDir - Directory containing the `_chunk_<i>` directories
+   * @param totalChunks - Number of chunks
+   * @returns Array of length `totalChunks`; the first entry is always 0
+   */
+  buildPicFileOffsets(chunksBaseDir, totalChunks) {
+    const offsets = [];
+    let cumulative = 0;
+    for (let i = 0; i < totalChunks; i++) {
+      offsets.push(cumulative); // pushed before adding this chunk's own count
+      const dir = join5(chunksBaseDir, `_chunk_${i}`, "output", "images");
+      const count = existsSync3(dir) ? readdirSync3(dir).filter(
+        (f) => f.startsWith("pic_") && f.endsWith(".png")
+      ).length : 0;
+      cumulative += count;
+    }
+    return offsets;
+  }
+  /**
+   * Check if abort has been signalled and throw if so.
+   *
+   * @param signal - Optional abort signal; a missing signal is treated as not aborted
+   * @throws Error with `name` set to "AbortError" when `signal.aborted` is truthy
+   */
+  checkAbort(signal) {
+    if (signal?.aborted) {
+      const error = new Error("Chunked PDF conversion was aborted");
+      error.name = "AbortError";
+      throw error;
+    }
+  }
+};
 // src/core/image-pdf-converter.ts
-import { existsSync as existsSync3, rmSync as rmSync2 } from "fs";
+import { existsSync as existsSync4, rmSync as rmSync3 } from "fs";
 import { tmpdir } from "os";
-import { join as join5 } from "path";
+import { join as join6 } from "path";
 var ImagePdfConverter = class {
   constructor(logger) {
     this.logger = logger;
@@ -2336,8 +2893,8 @@ var ImagePdfConverter = class {
   async convert(pdfUrl, reportId) {
     const timestamp = Date.now();
     const tempDir = tmpdir();
-    const inputPath = join5(tempDir, `${reportId}-${timestamp}-input.pdf`);
-    const outputPath = join5(tempDir, `${reportId}-${timestamp}-image.pdf`);
+    const inputPath = join6(tempDir, `${reportId}-${timestamp}-input.pdf`);
+    const outputPath = join6(tempDir, `${reportId}-${timestamp}-image.pdf`);
     try {
       this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
       await this.downloadPdf(pdfUrl, inputPath);
@@ -2346,8 +2903,8 @@ var ImagePdfConverter = class {
       this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
       return outputPath;
     } finally {
-      if (existsSync3(inputPath)) {
-        rmSync2(inputPath, { force: true });
+      if (existsSync4(inputPath)) {
+        rmSync3(inputPath, { force: true });
       }
     }
   }
@@ -2394,12 +2951,12 @@ var ImagePdfConverter = class {
    * Cleanup the temporary image PDF file
    */
   cleanup(imagePdfPath) {
-    if (existsSync3(imagePdfPath)) {
+    if (existsSync4(imagePdfPath)) {
       this.logger.info(
         "[ImagePdfConverter] Cleaning up temp file:",
         imagePdfPath
       );
-      rmSync2(imagePdfPath, { force: true });
+      rmSync3(imagePdfPath, { force: true });
     }
   }
 };
@@ -2414,6 +2971,26 @@ var PDFConverter = class {
   }
   async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
     this.logger.info("[PDFConverter] Converting:", url);
+    if (options.chunkedConversion && url.startsWith("file://")) {
+      const chunked = new ChunkedPDFConverter(
+        this.logger,
+        this.client,
+        {
+          chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
+          maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
+        },
+        this.timeout
+      );
+      return chunked.convertChunked(
+        url,
+        reportId,
+        onComplete,
+        cleanupAfterCallback,
+        options,
+        (opts) => this.buildConversionOptions(opts),
+        abortSignal
+      );
+    }
     if (options.forceImagePdf) {
       return this.convertViaImagePdf(
         url,
@@ -2518,7 +3095,7 @@ var PDFConverter = class {
       const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
       return { method, reason, sampledPages: 0, totalPages: 0 };
     }
-    const samplingDir = join6(process.cwd(), "output", reportId, "_sampling");
+    const samplingDir = join7(process.cwd(), "output", reportId, "_sampling");
     const sampler = new OcrStrategySampler(
       this.logger,
       new PageRenderer(this.logger),
@@ -2543,8 +3120,8 @@ var PDFConverter = class {
       }
       return strategy;
     } finally {
-      if (existsSync4(samplingDir)) {
-        rmSync3(samplingDir, { recursive: true, force: true });
+      if (existsSync5(samplingDir)) {
+        rmSync4(samplingDir, { recursive: true, force: true });
       }
     }
   }
@@ -2565,7 +3142,7 @@ var PDFConverter = class {
     const wrappedCallback = async (outputDir) => {
       let pageTexts;
       try {
-        const resultPath2 = join6(outputDir, "result.json");
+        const resultPath2 = join7(outputDir, "result.json");
         const totalPages = await runJqFileJson(
           ".pages | length",
           resultPath2
@@ -2577,9 +3154,9 @@ var PDFConverter = class {
           "[PDFConverter] pdftotext extraction failed, proceeding without text reference"
         );
       }
-      const resultPath = join6(outputDir, "result.json");
-      const ocrOriginPath = join6(outputDir, "result_ocr_origin.json");
-      copyFileSync(resultPath, ocrOriginPath);
+      const resultPath = join7(outputDir, "result.json");
+      const ocrOriginPath = join7(outputDir, "result_ocr_origin.json");
+      copyFileSync2(resultPath, ocrOriginPath);
       const corrector = new VlmTextCorrector(this.logger);
       await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
         concurrency: options.vlmConcurrency,
@@ -2721,9 +3298,9 @@ var PDFConverter = class {
       }
     }
     const cwd = process.cwd();
-    const zipPath = join6(cwd, "result.zip");
-    const extractDir = join6(cwd, "result_extracted");
-    const outputDir = join6(cwd, "output", reportId);
+    const zipPath = join7(cwd, "result.zip");
+    const extractDir = join7(cwd, "result_extracted");
+    const outputDir = join7(cwd, "output", reportId);
     try {
       await this.processConvertedFiles(zipPath, extractDir, outputDir);
       await this.renderPageImages(url, outputDir);
@@ -2740,19 +3317,19 @@ var PDFConverter = class {
       this.logger.info("[PDFConverter] Total time:", duration, "ms");
     } finally {
       this.logger.info("[PDFConverter] Cleaning up temporary files...");
-      if (existsSync4(zipPath)) {
-        rmSync3(zipPath, { force: true });
+      if (existsSync5(zipPath)) {
+        rmSync4(zipPath, { force: true });
       }
-      if (existsSync4(extractDir)) {
-        rmSync3(extractDir, { recursive: true, force: true });
+      if (existsSync5(extractDir)) {
+        rmSync4(extractDir, { recursive: true, force: true });
       }
       if (cleanupAfterCallback) {
         this.logger.info(
           "[PDFConverter] Cleaning up output directory:",
           outputDir
         );
-        if (existsSync4(outputDir)) {
-          rmSync3(outputDir, { recursive: true, force: true });
+        if (existsSync5(outputDir)) {
+          rmSync4(outputDir, { recursive: true, force: true });
         }
       } else {
         this.logger.info("[PDFConverter] Output preserved at:", outputDir);
@@ -2770,7 +3347,10 @@ var PDFConverter = class {
         "skipSampling",
         "forcedMethod",
         "aggregator",
-        "onTokenUsage"
+        "onTokenUsage",
+        "chunkedConversion",
+        "chunkSize",
+        "chunkMaxRetries"
       ]),
       to_formats: ["json", "html"],
       image_export_mode: "embedded",
@@ -2898,15 +3478,15 @@ var PDFConverter = class {
       "\n[PDFConverter] Task completed, downloading ZIP file..."
     );
     const zipResult = await this.client.getTaskResultFile(taskId);
-    const zipPath = join6(process.cwd(), "result.zip");
+    const zipPath = join7(process.cwd(), "result.zip");
     this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
     if (zipResult.fileStream) {
-      const writeStream = createWriteStream3(zipPath);
-      await pipeline3(zipResult.fileStream, writeStream);
+      const writeStream = createWriteStream4(zipPath);
+      await pipeline4(zipResult.fileStream, writeStream);
       return;
     }
     if (zipResult.data) {
-      await writeFile(zipPath, zipResult.data);
+      await writeFile2(zipPath, zipResult.data);
       return;
     }
     this.logger.warn(
@@ -2922,7 +3502,7 @@ var PDFConverter = class {
       );
     }
     const buffer = new Uint8Array(await response.arrayBuffer());
-    await writeFile(zipPath, buffer);
+    await writeFile2(zipPath, buffer);
   }
   async processConvertedFiles(zipPath, extractDir, outputDir) {
     await ImageExtractor.extractAndSaveDocumentsFromZip(
@@ -2951,7 +3531,7 @@ var PDFConverter = class {
     );
     const renderer = new PageRenderer(this.logger);
     const renderResult = await renderer.renderPages(pdfPath, outputDir);
-    const resultPath = join6(outputDir, "result.json");
+    const resultPath = join7(outputDir, "result.json");
     const tmpPath = resultPath + ".tmp";
     const jqProgram = `
       .pages |= with_entries(
@@ -2963,7 +3543,7 @@ var PDFConverter = class {
       )
     `;
     await runJqFileToFile(jqProgram, resultPath, tmpPath);
-    await rename2(tmpPath, resultPath);
+    await rename3(tmpPath, resultPath);
     this.logger.info(
       `[PDFConverter] Rendered ${renderResult.pageCount} page images`
     );
@@ -2998,7 +3578,7 @@ var PDFParser = class {
       this.baseUrl = void 0;
     }
     this.timeout = timeout;
-    this.venvPath = venvPath || join7(process.cwd(), ".venv");
+    this.venvPath = venvPath || join8(process.cwd(), ".venv");
     this.killExistingProcess = killExistingProcess;
     this.enableImagePdfFallback = enableImagePdfFallback;
   }