npm - @heripo/pdf-parser - Versions diffs - 0.1.8 → 0.1.10 - Mend

@heripo/pdf-parser 0.1.8 → 0.1.10

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (9)

package/dist/index.js CHANGED

@@ -43,11 +43,17 @@ var DOCLING_ENVIRONMENT = {
    */
   STARTUP_DELAY_MS: 2e3
 };
+var PAGE_RENDERING = {
+  /** Default rendering DPI for VLM text recognition quality */
+  DEFAULT_DPI: 200,
+  /** Low-resolution DPI for OCR strategy sampling */
+  SAMPLE_DPI: 150
+};
 var IMAGE_PDF_CONVERTER = {
   /**
    * ImageMagick density option (DPI) for PDF to image conversion
    */
-  DENSITY: 300,
+  DENSITY: PAGE_RENDERING.DEFAULT_DPI,
   /**
    * ImageMagick quality option (1-100)
    */
@@ -837,16 +843,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
 // src/core/pdf-converter.ts
 import { omit } from "es-toolkit";
-import {
-  copyFileSync,
-  createWriteStream as createWriteStream2,
-  existsSync as existsSync4,
-  readFileSync as readFileSync4,
-  rmSync as rmSync3
-} from "fs";
-import { writeFile } from "fs/promises";
+import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
+import { rename as rename2, writeFile } from "fs/promises";
 import { join as join6 } from "path";
-import { pipeline } from "stream/promises";
+import { pipeline as pipeline3 } from "stream/promises";
 // src/errors/image-pdf-fallback-error.ts
 var ImagePdfFallbackError = class extends Error {
@@ -862,19 +862,24 @@ var ImagePdfFallbackError = class extends Error {
 // src/processors/image-extractor.ts
 import {
-  createWriteStream,
+  createReadStream,
+  createWriteStream as createWriteStream2,
   existsSync,
   mkdirSync,
-  readFileSync,
   readdirSync,
   rmSync,
   writeFileSync
 } from "fs";
 import { extname, join as join2 } from "path";
+import { Transform } from "stream";
+import { pipeline as pipeline2 } from "stream/promises";
 import * as yauzl from "yauzl";
 // src/utils/jq.ts
 import { spawn as spawn3 } from "child_process";
+import { createWriteStream } from "fs";
+import { rename } from "fs/promises";
+import { pipeline } from "stream/promises";
 function getJqPath() {
   const p = process.env.JQ_PATH?.trim();
   return p && p.length > 0 ? p : "jq";
@@ -926,25 +931,139 @@ function runJqFileJson(program, filePath) {
     });
   });
 }
-function jqExtractBase64PngStrings(filePath) {
-  const program = `
-      [
-        .. |
-        select(type == "string" and startswith("data:image/png;base64"))
-      ]
-    `;
-  return runJqFileJson(program, filePath);
+function runJqFileToFile(program, inputPath, outputPath) {
+  return new Promise((resolve, reject) => {
+    const jqPath = getJqPath();
+    const args = [program, inputPath];
+    const child = spawn3(jqPath, args, {
+      stdio: ["ignore", "pipe", "pipe"],
+      env: process.env
+    });
+    let stderr = "";
+    let exitCode = null;
+    let pipelineDone = false;
+    let settled = false;
+    child.stderr.setEncoding("utf-8");
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk;
+    });
+    const ws = createWriteStream(outputPath);
+    function trySettle() {
+      if (settled) return;
+      if (!pipelineDone || exitCode === null) return;
+      settled = true;
+      if (exitCode !== 0) {
+        reject(
+          new Error(
+            `jq exited with code ${exitCode}. ${stderr ? "Stderr: " + stderr : ""}`
+          )
+        );
+      } else {
+        resolve();
+      }
+    }
+    child.on("error", (err) => {
+      if (settled) return;
+      settled = true;
+      ws.destroy();
+      reject(err);
+    });
+    pipeline(child.stdout, ws).then(() => {
+      pipelineDone = true;
+      trySettle();
+    }).catch((err) => {
+      if (settled) return;
+      settled = true;
+      reject(err);
+    });
+    child.on("close", (code) => {
+      exitCode = code ?? 1;
+      trySettle();
+    });
+  });
+}
+function runJqFileLines(program, filePath, onLine) {
+  return new Promise((resolve, reject) => {
+    const jqPath = getJqPath();
+    const args = ["-r", program, filePath];
+    const child = spawn3(jqPath, args, {
+      stdio: ["ignore", "pipe", "pipe"],
+      env: process.env
+    });
+    let stderr = "";
+    let buffer = "";
+    let callbackError = false;
+    child.stdout.setEncoding("utf-8");
+    child.stderr.setEncoding("utf-8");
+    function safeOnLine(line) {
+      if (callbackError) return;
+      try {
+        onLine(line);
+      } catch (err) {
+        callbackError = true;
+        child.kill();
+        reject(err);
+      }
+    }
+    child.stdout.on("data", (chunk) => {
+      buffer += chunk;
+      let newlineIdx;
+      while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
+        const line = buffer.slice(0, newlineIdx);
+        buffer = buffer.slice(newlineIdx + 1);
+        if (line.length > 0) {
+          safeOnLine(line);
+        }
+      }
+    });
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk;
+    });
+    child.on("error", (err) => {
+      if (!callbackError) reject(err);
+    });
+    child.on("close", (code) => {
+      if (callbackError) return;
+      if (buffer.length > 0) {
+        safeOnLine(buffer);
+      }
+      if (callbackError) return;
+      if (code !== 0) {
+        reject(
+          new Error(
+            `jq exited with code ${code}. ${stderr ? "Stderr: " + stderr : ""}`
+          )
+        );
+      } else {
+        resolve();
+      }
+    });
+  });
 }
-function jqReplaceBase64WithPaths(filePath, dirName, prefix) {
+async function jqExtractBase64PngStringsStreaming(filePath, onImage) {
+  let index = 0;
+  await runJqFileLines(
+    '.. | select(type == "string" and startswith("data:image/png;base64"))',
+    filePath,
+    (line) => {
+      onImage(line, index);
+      index++;
+    }
+  );
+  return index;
+}
+async function jqReplaceBase64WithPathsToFile(inputPath, outputPath, dirName, prefix) {
   const program = `
       reduce paths(type == "string" and startswith("data:image/png;base64")) as $p (
         {data: ., counter: 0};
         .counter as $idx |
         .data |= setpath($p; "${dirName}/${prefix}_\\($idx).png") |
         .counter += 1
-      ) | {data: .data, count: .counter}
+      ) | .data
     `;
-  return runJqFileJson(program, filePath);
+  const tmpPath = outputPath + ".tmp";
+  await runJqFileToFile(program, inputPath, tmpPath);
+  await rename(tmpPath, outputPath);
 }
 // src/processors/image-extractor.ts
@@ -972,7 +1091,7 @@ var ImageExtractor = class _ImageExtractor {
                 return;
               }
               mkdirSync(join2(entryPath, ".."), { recursive: true });
-              const writeStream = createWriteStream(entryPath);
+              const writeStream = createWriteStream2(entryPath);
               readStream.pipe(writeStream);
               writeStream.on("finish", () => {
                 zipfile.readEntry();
@@ -988,26 +1107,6 @@ var ImageExtractor = class _ImageExtractor {
       });
     });
   }
-  /**
-   * Extract base64 images from JSON file using jq (for large files)
-   * Returns array of base64 data strings
-   */
-  static async extractBase64ImagesFromJsonWithJq(jsonPath) {
-    return jqExtractBase64PngStrings(jsonPath);
-  }
-  /**
-   * Replace base64 images with file paths in JSON using jq (for large files)
-   * Uses reduce to maintain counter state while walking the JSON
-   */
-  static async replaceBase64ImagesInJsonWithJq(jsonPath, outputPath, dirName, prefix) {
-    const { data, count } = await jqReplaceBase64WithPaths(
-      jsonPath,
-      dirName,
-      prefix
-    );
-    writeFileSync(outputPath, JSON.stringify(data, null, 2), "utf-8");
-    return count;
-  }
   /**
    * Extract a base64-encoded image to a file and return the relative path
    */
@@ -1021,8 +1120,66 @@ var ImageExtractor = class _ImageExtractor {
     return `${dirName}/${filename}`;
   }
   /**
-   * Save JSON and HTML documents with base64 images extracted to separate files
-   * Uses jq for JSON processing to handle large files
+   * Extract base64 images from HTML using streaming.
+   * Reads HTML file as a stream, extracts base64 images from src attributes,
+   * saves them as PNG files, and replaces with file paths in the output HTML.
+   * Returns the number of images extracted.
+   */
+  static async extractImagesFromHtmlStream(htmlInputPath, htmlOutputPath, imagesDir) {
+    let imageIndex = 0;
+    let pending = "";
+    const MARKER = 'src="data:image/png;base64,';
+    const transform = new Transform({
+      decodeStrings: false,
+      encoding: "utf-8",
+      transform(chunk, _encoding, callback) {
+        pending += chunk;
+        let result = "";
+        while (true) {
+          const markerIdx = pending.indexOf(MARKER);
+          if (markerIdx === -1) {
+            const safeEnd = Math.max(0, pending.length - MARKER.length);
+            result += pending.slice(0, safeEnd);
+            pending = pending.slice(safeEnd);
+            break;
+          }
+          result += pending.slice(0, markerIdx);
+          const dataStart = markerIdx + MARKER.length;
+          const quoteIdx = pending.indexOf('"', dataStart);
+          if (quoteIdx === -1) {
+            pending = pending.slice(markerIdx);
+            break;
+          }
+          const base64Content = pending.slice(dataStart, quoteIdx);
+          const filename = `image_${imageIndex}.png`;
+          const filepath = join2(imagesDir, filename);
+          const buf = Buffer.from(base64Content, "base64");
+          writeFileSync(filepath, buf);
+          const relativePath = `images/${filename}`;
+          result += `src="${relativePath}"`;
+          imageIndex++;
+          pending = pending.slice(quoteIdx + 1);
+        }
+        if (result.length > 0) {
+          this.push(result);
+        }
+        callback();
+      },
+      flush(callback) {
+        if (pending.length > 0) {
+          this.push(pending);
+        }
+        callback();
+      }
+    });
+    const rs = createReadStream(htmlInputPath, { encoding: "utf-8" });
+    const ws = createWriteStream2(htmlOutputPath, { encoding: "utf-8" });
+    await pipeline2(rs, transform, ws);
+    return imageIndex;
+  }
+  /**
+   * Save JSON and HTML documents with base64 images extracted to separate files.
+   * Uses jq for JSON processing and streaming for HTML to handle large files.
    *
    * This method:
    * 1. Extracts base64-encoded images from JSON and HTML content
@@ -1030,7 +1187,7 @@ var ImageExtractor = class _ImageExtractor {
    * 3. Replaces base64 data with relative file paths
    * 4. Saves the transformed documents to the output directory
    */
-  static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlContent) {
+  static async saveDocumentsWithExtractedImages(logger, outputDir, filename, jsonSourcePath, htmlSourcePath) {
     try {
       if (existsSync(outputDir)) {
         rmSync(outputDir, { recursive: true, force: true });
@@ -1042,31 +1199,33 @@ var ImageExtractor = class _ImageExtractor {
     const baseName = filename.replace(extname(filename), "");
     const jsonPath = join2(outputDir, `${baseName}.json`);
     try {
-      const pagesDir = join2(outputDir, "pages");
-      if (!existsSync(pagesDir)) {
-        mkdirSync(pagesDir, { recursive: true });
-      }
-      const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
-      base64Images.forEach((base64Data, index) => {
-        _ImageExtractor.extractBase64ImageToFile(
-          base64Data,
-          pagesDir,
-          index,
-          "page",
-          "pages"
-        );
-      });
+      const imagesDir = join2(outputDir, "images");
+      if (!existsSync(imagesDir)) {
+        mkdirSync(imagesDir, { recursive: true });
+      }
+      const imageCount = await jqExtractBase64PngStringsStreaming(
+        jsonSourcePath,
+        (base64Data, index) => {
+          _ImageExtractor.extractBase64ImageToFile(
+            base64Data,
+            imagesDir,
+            index,
+            "pic",
+            "images"
+          );
+        }
+      );
       logger.info(
-        `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
+        `[PDFConverter] Extracted ${imageCount} picture images from JSON to ${imagesDir}`
       );
-      const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
+      await jqReplaceBase64WithPathsToFile(
         jsonSourcePath,
         jsonPath,
-        "pages",
-        "page"
+        "images",
+        "pic"
       );
       logger.info(
-        `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
+        `[PDFConverter] Replaced ${imageCount} base64 images with file paths`
       );
     } catch (e) {
       logger.warn(
@@ -1082,42 +1241,36 @@ var ImageExtractor = class _ImageExtractor {
       if (!existsSync(imagesDir)) {
         mkdirSync(imagesDir, { recursive: true });
       }
-      let imageIndex = 0;
-      const transformedHtml = htmlContent.replace(
-        /src="data:image\/png;base64,([^"]+)"/g,
-        (_, base64Content) => {
-          const filename2 = `image_${imageIndex}.png`;
-          const filepath = join2(imagesDir, filename2);
-          const buffer = Buffer.from(base64Content, "base64");
-          writeFileSync(filepath, buffer);
-          const relativePath = `images/${filename2}`;
-          imageIndex += 1;
-          return `src="${relativePath}"`;
-        }
+      const htmlImageCount = await _ImageExtractor.extractImagesFromHtmlStream(
+        htmlSourcePath,
+        htmlPath,
+        imagesDir
       );
       logger.info(
-        `[PDFConverter] Extracted ${imageIndex} images from HTML to ${imagesDir}`
+        `[PDFConverter] Extracted ${htmlImageCount} images from HTML to ${imagesDir}`
       );
-      writeFileSync(htmlPath, transformedHtml, "utf-8");
     } catch (e) {
       logger.warn(
-        "[PDFConverter] Failed to extract images from HTML, writing original. Error:",
+        "[PDFConverter] Failed to extract images from HTML, copying original. Error:",
         e
       );
-      writeFileSync(htmlPath, htmlContent, "utf-8");
+      const rs = createReadStream(htmlSourcePath);
+      const ws = createWriteStream2(htmlPath);
+      await pipeline2(rs, ws);
     }
     logger.info("[PDFConverter] Saved HTML:", htmlPath);
   }
   /**
    * Extract documents from ZIP and save with extracted images
-   * Uses jq for JSON processing to handle large files without loading into Node.js memory
+   * Uses jq for JSON processing and streaming for HTML to handle large files
+   * without loading into Node.js memory
    *
    * Complete workflow:
    * 1. Extract ZIP file to temporary directory
    * 2. Find JSON and HTML files from extracted files
-   * 3. Use jq to extract base64 images from JSON and save as separate files
-   * 4. Use jq to replace base64 with file paths in JSON
-   * 5. Process HTML with regex to extract and replace images
+   * 3. Use jq to stream-extract base64 images from JSON and save as separate files
+   * 4. Use jq to replace base64 with file paths in JSON (piped to file)
+   * 5. Process HTML with streaming Transform to extract and replace images
    * 6. Save transformed documents to output directory (as result.json and result.html)
    */
   static async extractAndSaveDocumentsFromZip(logger, zipPath, extractDir, outputDir) {
@@ -1133,14 +1286,13 @@ var ImageExtractor = class _ImageExtractor {
     }
     const jsonPath = join2(extractDir, jsonFile);
     const htmlPath = join2(extractDir, htmlFile);
-    const htmlContent = readFileSync(htmlPath, "utf-8");
     logger.info("[PDFConverter] Saving converted files to output...");
     await _ImageExtractor.saveDocumentsWithExtractedImages(
       logger,
       outputDir,
       "result",
       jsonPath,
-      htmlContent
+      htmlPath
     );
     logger.info("[PDFConverter] Files saved to:", outputDir);
   }
@@ -1149,7 +1301,7 @@ var ImageExtractor = class _ImageExtractor {
 // src/processors/page-renderer.ts
 import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
 import { join as join3 } from "path";
-var DEFAULT_DPI = 300;
+var PROGRESS_POLL_INTERVAL_MS = 2e3;
 var PageRenderer = class {
   constructor(logger) {
     this.logger = logger;
@@ -1163,29 +1315,60 @@ var PageRenderer = class {
    * @returns Render result with page count and file paths
    */
   async renderPages(pdfPath, outputDir, options) {
-    const dpi = options?.dpi ?? DEFAULT_DPI;
+    const dpi = options?.dpi ?? PAGE_RENDERING.DEFAULT_DPI;
     const pagesDir = join3(outputDir, "pages");
     if (!existsSync2(pagesDir)) {
       mkdirSync2(pagesDir, { recursive: true });
     }
-    this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
-    const outputPattern = join3(pagesDir, "page_%d.png");
-    const result = await spawnAsync("magick", [
-      "-density",
-      dpi.toString(),
-      pdfPath,
-      "-background",
-      "white",
-      "-alpha",
-      "remove",
-      "-alpha",
-      "off",
-      outputPattern
-    ]);
-    if (result.code !== 0) {
-      throw new Error(
-        `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
+    const totalPages = await this.getPageCount(pdfPath);
+    if (totalPages > 0) {
+      this.logger.info(
+        `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
       );
+    } else {
+      this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
+    }
+    const outputPattern = join3(pagesDir, "page_%d.png");
+    let progressInterval = null;
+    if (totalPages > 0) {
+      let lastLoggedCount = 0;
+      progressInterval = setInterval(() => {
+        try {
+          const rendered = readdirSync2(pagesDir).filter(
+            (f) => f.startsWith("page_") && f.endsWith(".png")
+          ).length;
+          if (rendered > 0 && rendered !== lastLoggedCount) {
+            lastLoggedCount = rendered;
+            this.logger.info(
+              `[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
+            );
+          }
+        } catch {
+        }
+      }, PROGRESS_POLL_INTERVAL_MS);
+    }
+    try {
+      const result = await spawnAsync("magick", [
+        "-density",
+        dpi.toString(),
+        pdfPath,
+        "-background",
+        "white",
+        "-alpha",
+        "remove",
+        "-alpha",
+        "off",
+        outputPattern
+      ]);
+      if (result.code !== 0) {
+        throw new Error(
+          `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
+        );
+      }
+    } finally {
+      if (progressInterval) {
+        clearInterval(progressInterval);
+      }
     }
     const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
       const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
@@ -1201,6 +1384,20 @@ var PageRenderer = class {
       pageFiles
     };
   }
+  /**
+   * Get total page count using pdfinfo.
+   * Returns 0 on failure (progress logging will be skipped).
+   */
+  async getPageCount(pdfPath) {
+    try {
+      const result = await spawnAsync("pdfinfo", [pdfPath]);
+      if (result.code !== 0) return 0;
+      const match = result.stdout.match(/^Pages:\s+(\d+)/m);
+      return match ? parseInt(match[1], 10) : 0;
+    } catch {
+      return 0;
+    }
+  }
 };
 // src/processors/pdf-text-extractor.ts
@@ -1286,7 +1483,7 @@ var PdfTextExtractor = class {
 };
 // src/processors/vlm-text-corrector.ts
-import { readFileSync as readFileSync2, writeFileSync as writeFileSync2 } from "fs";
+import { readFileSync, writeFileSync as writeFileSync2 } from "fs";
 import { join as join4 } from "path";
 // src/types/vlm-text-correction-schema.ts
@@ -1418,7 +1615,7 @@ var VlmTextCorrector = class {
   async correctAndSave(outputDir, model, options) {
     this.logger.info("[VlmTextCorrector] Starting text correction...");
     const resultPath = join4(outputDir, "result.json");
-    const doc = JSON.parse(readFileSync2(resultPath, "utf-8"));
+    const doc = JSON.parse(readFileSync(resultPath, "utf-8"));
     let pageNumbers = this.getPageNumbers(doc);
     if (pageNumbers.length === 0) {
       this.logger.info("[VlmTextCorrector] No pages to process");
@@ -1745,7 +1942,7 @@ var VlmTextCorrector = class {
    */
   readPageImage(outputDir, pageNo) {
     const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
-    return readFileSync2(imagePath).toString("base64");
+    return readFileSync(imagePath).toString("base64");
   }
   /**
    * Apply VLM corrections to the DoclingDocument.
@@ -1799,9 +1996,9 @@ var VlmTextCorrector = class {
 };
 // src/samplers/ocr-strategy-sampler.ts
-import { readFileSync as readFileSync3 } from "fs";
+import { normalizeToBcp47 } from "@heripo/model";
+import { readFileSync as readFileSync2 } from "fs";
 import { z as z2 } from "zod/v4";
-var SAMPLE_DPI = 150;
 var EDGE_TRIM_RATIO = 0.1;
 var DEFAULT_MAX_SAMPLE_PAGES = 15;
 var DEFAULT_MAX_RETRIES2 = 3;
@@ -1852,7 +2049,7 @@ var OcrStrategySampler = class {
     const renderResult = await this.pageRenderer.renderPages(
       pdfPath,
       outputDir,
-      { dpi: SAMPLE_DPI }
+      { dpi: PAGE_RENDERING.SAMPLE_DPI }
     );
     if (renderResult.pageCount === 0) {
       this.logger.info("[OcrStrategySampler] No pages found in PDF");
@@ -1871,7 +2068,7 @@ var OcrStrategySampler = class {
       `[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
     );
     let sampledCount = 0;
-    let detectedLanguages;
+    const languageFrequency = /* @__PURE__ */ new Map();
     for (const idx of sampleIndices) {
       sampledCount++;
       const pageFile = renderResult.pageFiles[idx];
@@ -1881,14 +2078,17 @@ var OcrStrategySampler = class {
         model,
         options
       );
-      detectedLanguages = pageAnalysis.detectedLanguages;
+      for (const lang of pageAnalysis.detectedLanguages) {
+        languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
+      }
       if (pageAnalysis.hasKoreanHanjaMix) {
         this.logger.info(
           `[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
         );
+        const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
         return {
           method: "vlm",
-          detectedLanguages,
+          detectedLanguages: detectedLanguages2,
           reason: `Korean-Hanja mix detected on page ${idx + 1}`,
           sampledPages: sampledCount,
           totalPages: renderResult.pageCount
@@ -1898,6 +2098,7 @@ var OcrStrategySampler = class {
     this.logger.info(
       "[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
     );
+    const detectedLanguages = this.aggregateLanguages(languageFrequency);
     return {
       method: "ocrmac",
       detectedLanguages,
@@ -2002,14 +2203,15 @@ var OcrStrategySampler = class {
   }
   /**
    * Analyze a single sample page for Korean-Hanja mixed script and primary language.
+   * Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
    *
-   * @returns Object with Korean-Hanja detection result and detected languages
+   * @returns Object with Korean-Hanja detection result and normalized detected languages
    */
   async analyzeSamplePage(pageFile, pageNo, model, options) {
     this.logger.debug(
       `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
     );
-    const base64Image = readFileSync3(pageFile).toString("base64");
+    const base64Image = readFileSync2(pageFile).toString("base64");
     const messages = [
       {
         role: "user",
@@ -2037,18 +2239,27 @@ var OcrStrategySampler = class {
       options.aggregator.track(result.usage);
     }
     const output = result.output;
+    const normalizedLanguages = output.detectedLanguages.map(normalizeToBcp47).filter((tag) => tag !== null);
     this.logger.debug(
-      `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${output.detectedLanguages.join(",")}`
+      `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
     );
     return {
       hasKoreanHanjaMix: output.hasKoreanHanjaMix,
-      detectedLanguages: output.detectedLanguages
+      detectedLanguages: normalizedLanguages
     };
   }
+  /**
+   * Aggregate language frequency map into a sorted array.
+   * Returns languages sorted by frequency (descending), or undefined if empty.
+   */
+  aggregateLanguages(frequencyMap) {
+    if (frequencyMap.size === 0) return void 0;
+    return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
+  }
 };
 // src/utils/local-file-server.ts
-import { createReadStream, statSync } from "fs";
+import { createReadStream as createReadStream2, statSync } from "fs";
 import { createServer } from "http";
 import { basename } from "path";
 var LocalFileServer = class {
@@ -2070,7 +2281,7 @@ var LocalFileServer = class {
             "Content-Type": "application/pdf",
             "Content-Length": stat.size
           });
-          createReadStream(filePath).pipe(res);
+          createReadStream2(filePath).pipe(res);
         } else {
           res.writeHead(404);
           res.end("Not Found");
@@ -2355,8 +2566,10 @@ var PDFConverter = class {
       let pageTexts;
       try {
         const resultPath2 = join6(outputDir, "result.json");
-        const doc = JSON.parse(readFileSync4(resultPath2, "utf-8"));
-        const totalPages = Object.keys(doc.pages).length;
+        const totalPages = await runJqFileJson(
+          ".pages | length",
+          resultPath2
+        );
         const textExtractor = new PdfTextExtractor(this.logger);
         pageTexts = await textExtractor.extractText(pdfPath, totalPages);
       } catch {
@@ -2513,6 +2726,7 @@ var PDFConverter = class {
     const outputDir = join6(cwd, "output", reportId);
     try {
       await this.processConvertedFiles(zipPath, extractDir, outputDir);
+      await this.renderPageImages(url, outputDir);
       if (abortSignal?.aborted) {
         this.logger.info("[PDFConverter] Conversion aborted before callback");
         const error = new Error("PDF conversion was aborted");
@@ -2568,6 +2782,8 @@ var PDFConverter = class {
         framework: "livetext"
       },
       generate_picture_images: true,
+      generate_page_images: false,
+      // Page images are rendered by PageRenderer (ImageMagick) after conversion
       images_scale: 2,
       /**
        * While disabling this option yields the most accurate text extraction for readable PDFs,
@@ -2685,8 +2901,8 @@ var PDFConverter = class {
     const zipPath = join6(process.cwd(), "result.zip");
     this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
     if (zipResult.fileStream) {
-      const writeStream = createWriteStream2(zipPath);
-      await pipeline(zipResult.fileStream, writeStream);
+      const writeStream = createWriteStream3(zipPath);
+      await pipeline3(zipResult.fileStream, writeStream);
       return;
     }
     if (zipResult.data) {
@@ -2716,6 +2932,42 @@ var PDFConverter = class {
       outputDir
     );
   }
+  /**
+   * Render page images from the source PDF using ImageMagick and update result.json.
+   * Uses jq to update the JSON file without loading it into Node.js memory.
+   * Replaces Docling's generate_page_images which fails on large PDFs
+   * due to memory limits when embedding all page images as base64.
+   */
+  async renderPageImages(url, outputDir) {
+    if (!url.startsWith("file://")) {
+      this.logger.warn(
+        "[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
+      );
+      return;
+    }
+    const pdfPath = url.slice(7);
+    this.logger.info(
+      "[PDFConverter] Rendering page images with ImageMagick..."
+    );
+    const renderer = new PageRenderer(this.logger);
+    const renderResult = await renderer.renderPages(pdfPath, outputDir);
+    const resultPath = join6(outputDir, "result.json");
+    const tmpPath = resultPath + ".tmp";
+    const jqProgram = `
+      .pages |= with_entries(
+        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
+          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
+          .value.image.mimetype = "image/png" |
+          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
+        else . end
+      )
+    `;
+    await runJqFileToFile(jqProgram, resultPath, tmpPath);
+    await rename2(tmpPath, resultPath);
+    this.logger.info(
+      `[PDFConverter] Rendered ${renderResult.pageCount} page images`
+    );
+  }
 };
 // src/core/pdf-parser.ts
@@ -2754,6 +3006,7 @@ var PDFParser = class {
     this.logger.info("[PDFParser] Initializing...");
     this.checkOperatingSystem();
     this.checkJqInstalled();
+    this.checkPopplerInstalled();
     this.checkMacOSVersion();
     if (this.enableImagePdfFallback && !this.baseUrl) {
       this.checkImageMagickInstalled();
@@ -2810,6 +3063,15 @@ var PDFParser = class {
       );
     }
   }
+  checkPopplerInstalled() {
+    try {
+      execSync("which pdftotext", { stdio: "ignore" });
+    } catch {
+      throw new Error(
+        "poppler is not installed. Please install poppler using: brew install poppler"
+      );
+    }
+  }
   checkMacOSVersion() {
     try {
       const versionOutput = execSync("sw_vers -productVersion", {