npm - @heripo/pdf-parser - Versions diffs - 0.1.8 → 0.1.9 - Mend

@heripo/pdf-parser 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/index.d.cts CHANGED

@@ -12,7 +12,7 @@ type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
 /**
  * Extended options for PDF conversion.
  */
-type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
+type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'generate_page_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
     num_threads?: number;
     /**
      * Force pre-conversion to image-based PDF before processing.
@@ -74,6 +74,8 @@ type Options = {
  *   - Install specific version: `pyenv install 3.12.0 && pyenv global 3.12.0`
  * - `jq` - JSON processor
  *   - Install: `brew install jq`
+ * - `poppler` - PDF text extraction tools (pdftotext, pdfinfo)
+ *   - Install: `brew install poppler`
  * - `lsof` - List open files (usually pre-installed on macOS)
  *
  * ## Initialization Process
@@ -122,6 +124,7 @@ declare class PDFParser {
     init(): Promise<void>;
     private checkOperatingSystem;
     private checkJqInstalled;
+    private checkPopplerInstalled;
     private checkMacOSVersion;
     private checkImageMagickInstalled;
     private checkGhostscriptInstalled;

package/dist/index.d.ts CHANGED

@@ -12,7 +12,7 @@ type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
 /**
  * Extended options for PDF conversion.
  */
-type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
+type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'generate_page_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
     num_threads?: number;
     /**
      * Force pre-conversion to image-based PDF before processing.
@@ -74,6 +74,8 @@ type Options = {
  *   - Install specific version: `pyenv install 3.12.0 && pyenv global 3.12.0`
  * - `jq` - JSON processor
  *   - Install: `brew install jq`
+ * - `poppler` - PDF text extraction tools (pdftotext, pdfinfo)
+ *   - Install: `brew install poppler`
  * - `lsof` - List open files (usually pre-installed on macOS)
  *
  * ## Initialization Process
@@ -122,6 +124,7 @@ declare class PDFParser {
     init(): Promise<void>;
     private checkOperatingSystem;
     private checkJqInstalled;
+    private checkPopplerInstalled;
     private checkMacOSVersion;
     private checkImageMagickInstalled;
     private checkGhostscriptInstalled;

package/dist/index.js CHANGED

@@ -1042,28 +1042,28 @@ var ImageExtractor = class _ImageExtractor {
     const baseName = filename.replace(extname(filename), "");
     const jsonPath = join2(outputDir, `${baseName}.json`);
     try {
-      const pagesDir = join2(outputDir, "pages");
-      if (!existsSync(pagesDir)) {
-        mkdirSync(pagesDir, { recursive: true });
+      const imagesDir = join2(outputDir, "images");
+      if (!existsSync(imagesDir)) {
+        mkdirSync(imagesDir, { recursive: true });
       }
       const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
       base64Images.forEach((base64Data, index) => {
         _ImageExtractor.extractBase64ImageToFile(
           base64Data,
-          pagesDir,
+          imagesDir,
           index,
-          "page",
-          "pages"
+          "pic",
+          "images"
         );
       });
       logger.info(
-        `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
+        `[PDFConverter] Extracted ${base64Images.length} picture images from JSON to ${imagesDir}`
       );
       const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
         jsonSourcePath,
         jsonPath,
-        "pages",
-        "page"
+        "images",
+        "pic"
       );
       logger.info(
         `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
@@ -1799,6 +1799,7 @@ var VlmTextCorrector = class {
 };
 // src/samplers/ocr-strategy-sampler.ts
+import { normalizeToBcp47 } from "@heripo/model";
 import { readFileSync as readFileSync3 } from "fs";
 import { z as z2 } from "zod/v4";
 var SAMPLE_DPI = 150;
@@ -1871,7 +1872,7 @@ var OcrStrategySampler = class {
       `[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
     );
     let sampledCount = 0;
-    let detectedLanguages;
+    const languageFrequency = /* @__PURE__ */ new Map();
     for (const idx of sampleIndices) {
       sampledCount++;
       const pageFile = renderResult.pageFiles[idx];
@@ -1881,14 +1882,17 @@ var OcrStrategySampler = class {
         model,
         options
       );
-      detectedLanguages = pageAnalysis.detectedLanguages;
+      for (const lang of pageAnalysis.detectedLanguages) {
+        languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
+      }
       if (pageAnalysis.hasKoreanHanjaMix) {
         this.logger.info(
           `[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
         );
+        const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
         return {
           method: "vlm",
-          detectedLanguages,
+          detectedLanguages: detectedLanguages2,
           reason: `Korean-Hanja mix detected on page ${idx + 1}`,
           sampledPages: sampledCount,
           totalPages: renderResult.pageCount
@@ -1898,6 +1902,7 @@ var OcrStrategySampler = class {
     this.logger.info(
       "[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
     );
+    const detectedLanguages = this.aggregateLanguages(languageFrequency);
     return {
       method: "ocrmac",
       detectedLanguages,
@@ -2002,8 +2007,9 @@ var OcrStrategySampler = class {
   }
   /**
    * Analyze a single sample page for Korean-Hanja mixed script and primary language.
+   * Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
    *
-   * @returns Object with Korean-Hanja detection result and detected languages
+   * @returns Object with Korean-Hanja detection result and normalized detected languages
    */
   async analyzeSamplePage(pageFile, pageNo, model, options) {
     this.logger.debug(
@@ -2037,14 +2043,23 @@ var OcrStrategySampler = class {
       options.aggregator.track(result.usage);
     }
     const output = result.output;
+    const normalizedLanguages = output.detectedLanguages.map(normalizeToBcp47).filter((tag) => tag !== null);
     this.logger.debug(
-      `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${output.detectedLanguages.join(",")}`
+      `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
     );
     return {
       hasKoreanHanjaMix: output.hasKoreanHanjaMix,
-      detectedLanguages: output.detectedLanguages
+      detectedLanguages: normalizedLanguages
     };
   }
+  /**
+   * Aggregate language frequency map into a sorted array.
+   * Returns languages sorted by frequency (descending), or undefined if empty.
+   */
+  aggregateLanguages(frequencyMap) {
+    if (frequencyMap.size === 0) return void 0;
+    return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
+  }
 };
 // src/utils/local-file-server.ts
@@ -2513,6 +2528,7 @@ var PDFConverter = class {
     const outputDir = join6(cwd, "output", reportId);
     try {
       await this.processConvertedFiles(zipPath, extractDir, outputDir);
+      await this.renderPageImages(url, outputDir);
       if (abortSignal?.aborted) {
         this.logger.info("[PDFConverter] Conversion aborted before callback");
         const error = new Error("PDF conversion was aborted");
@@ -2568,6 +2584,8 @@ var PDFConverter = class {
         framework: "livetext"
       },
       generate_picture_images: true,
+      generate_page_images: false,
+      // Page images are rendered by PageRenderer (ImageMagick) after conversion
       images_scale: 2,
       /**
        * While disabling this option yields the most accurate text extraction for readable PDFs,
@@ -2716,6 +2734,40 @@ var PDFConverter = class {
       outputDir
     );
   }
+  /**
+   * Render page images from the source PDF using ImageMagick and update result.json.
+   * Replaces Docling's generate_page_images which fails on large PDFs
+   * due to memory limits when embedding all page images as base64.
+   */
+  async renderPageImages(url, outputDir) {
+    if (!url.startsWith("file://")) {
+      this.logger.warn(
+        "[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
+      );
+      return;
+    }
+    const pdfPath = url.slice(7);
+    this.logger.info(
+      "[PDFConverter] Rendering page images with ImageMagick..."
+    );
+    const renderer = new PageRenderer(this.logger);
+    const renderResult = await renderer.renderPages(pdfPath, outputDir);
+    const resultPath = join6(outputDir, "result.json");
+    const doc = JSON.parse(readFileSync4(resultPath, "utf-8"));
+    for (const page of Object.values(doc.pages)) {
+      const pageNo = page.page_no;
+      const fileIndex = pageNo - 1;
+      if (fileIndex >= 0 && fileIndex < renderResult.pageCount) {
+        page.image.uri = `pages/page_${fileIndex}.png`;
+        page.image.mimetype = "image/png";
+        page.image.dpi = 300;
+      }
+    }
+    await writeFile(resultPath, JSON.stringify(doc, null, 2));
+    this.logger.info(
+      `[PDFConverter] Rendered ${renderResult.pageCount} page images`
+    );
+  }
 };
 // src/core/pdf-parser.ts
@@ -2754,6 +2806,7 @@ var PDFParser = class {
     this.logger.info("[PDFParser] Initializing...");
     this.checkOperatingSystem();
     this.checkJqInstalled();
+    this.checkPopplerInstalled();
     this.checkMacOSVersion();
     if (this.enableImagePdfFallback && !this.baseUrl) {
       this.checkImageMagickInstalled();
@@ -2810,6 +2863,15 @@ var PDFParser = class {
       );
     }
   }
+  checkPopplerInstalled() {
+    try {
+      execSync("which pdftotext", { stdio: "ignore" });
+    } catch {
+      throw new Error(
+        "poppler is not installed. Please install poppler using: brew install poppler"
+      );
+    }
+  }
   checkMacOSVersion() {
     try {
       const versionOutput = execSync("sw_vers -productVersion", {