@heripo/pdf-parser 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -12,7 +12,7 @@ type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
12
12
  /**
13
13
  * Extended options for PDF conversion.
14
14
  */
15
- type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
15
+ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'generate_page_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
16
16
  num_threads?: number;
17
17
  /**
18
18
  * Force pre-conversion to image-based PDF before processing.
@@ -74,6 +74,8 @@ type Options = {
74
74
  * - Install specific version: `pyenv install 3.12.0 && pyenv global 3.12.0`
75
75
  * - `jq` - JSON processor
76
76
  * - Install: `brew install jq`
77
+ * - `poppler` - PDF text extraction tools (pdftotext, pdfinfo)
78
+ * - Install: `brew install poppler`
77
79
  * - `lsof` - List open files (usually pre-installed on macOS)
78
80
  *
79
81
  * ## Initialization Process
@@ -122,6 +124,7 @@ declare class PDFParser {
122
124
  init(): Promise<void>;
123
125
  private checkOperatingSystem;
124
126
  private checkJqInstalled;
127
+ private checkPopplerInstalled;
125
128
  private checkMacOSVersion;
126
129
  private checkImageMagickInstalled;
127
130
  private checkGhostscriptInstalled;
package/dist/index.d.ts CHANGED
@@ -12,7 +12,7 @@ type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
12
12
  /**
13
13
  * Extended options for PDF conversion.
14
14
  */
15
- type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
15
+ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'generate_page_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
16
16
  num_threads?: number;
17
17
  /**
18
18
  * Force pre-conversion to image-based PDF before processing.
@@ -74,6 +74,8 @@ type Options = {
74
74
  * - Install specific version: `pyenv install 3.12.0 && pyenv global 3.12.0`
75
75
  * - `jq` - JSON processor
76
76
  * - Install: `brew install jq`
77
+ * - `poppler` - PDF text extraction tools (pdftotext, pdfinfo)
78
+ * - Install: `brew install poppler`
77
79
  * - `lsof` - List open files (usually pre-installed on macOS)
78
80
  *
79
81
  * ## Initialization Process
@@ -122,6 +124,7 @@ declare class PDFParser {
122
124
  init(): Promise<void>;
123
125
  private checkOperatingSystem;
124
126
  private checkJqInstalled;
127
+ private checkPopplerInstalled;
125
128
  private checkMacOSVersion;
126
129
  private checkImageMagickInstalled;
127
130
  private checkGhostscriptInstalled;
package/dist/index.js CHANGED
@@ -1042,28 +1042,28 @@ var ImageExtractor = class _ImageExtractor {
1042
1042
  const baseName = filename.replace(extname(filename), "");
1043
1043
  const jsonPath = join2(outputDir, `${baseName}.json`);
1044
1044
  try {
1045
- const pagesDir = join2(outputDir, "pages");
1046
- if (!existsSync(pagesDir)) {
1047
- mkdirSync(pagesDir, { recursive: true });
1045
+ const imagesDir = join2(outputDir, "images");
1046
+ if (!existsSync(imagesDir)) {
1047
+ mkdirSync(imagesDir, { recursive: true });
1048
1048
  }
1049
1049
  const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
1050
1050
  base64Images.forEach((base64Data, index) => {
1051
1051
  _ImageExtractor.extractBase64ImageToFile(
1052
1052
  base64Data,
1053
- pagesDir,
1053
+ imagesDir,
1054
1054
  index,
1055
- "page",
1056
- "pages"
1055
+ "pic",
1056
+ "images"
1057
1057
  );
1058
1058
  });
1059
1059
  logger.info(
1060
- `[PDFConverter] Extracted ${base64Images.length} images from JSON to ${pagesDir}`
1060
+ `[PDFConverter] Extracted ${base64Images.length} picture images from JSON to ${imagesDir}`
1061
1061
  );
1062
1062
  const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
1063
1063
  jsonSourcePath,
1064
1064
  jsonPath,
1065
- "pages",
1066
- "page"
1065
+ "images",
1066
+ "pic"
1067
1067
  );
1068
1068
  logger.info(
1069
1069
  `[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
@@ -1799,6 +1799,7 @@ var VlmTextCorrector = class {
1799
1799
  };
1800
1800
 
1801
1801
  // src/samplers/ocr-strategy-sampler.ts
1802
+ import { normalizeToBcp47 } from "@heripo/model";
1802
1803
  import { readFileSync as readFileSync3 } from "fs";
1803
1804
  import { z as z2 } from "zod/v4";
1804
1805
  var SAMPLE_DPI = 150;
@@ -1871,7 +1872,7 @@ var OcrStrategySampler = class {
1871
1872
  `[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
1872
1873
  );
1873
1874
  let sampledCount = 0;
1874
- let detectedLanguages;
1875
+ const languageFrequency = /* @__PURE__ */ new Map();
1875
1876
  for (const idx of sampleIndices) {
1876
1877
  sampledCount++;
1877
1878
  const pageFile = renderResult.pageFiles[idx];
@@ -1881,14 +1882,17 @@ var OcrStrategySampler = class {
1881
1882
  model,
1882
1883
  options
1883
1884
  );
1884
- detectedLanguages = pageAnalysis.detectedLanguages;
1885
+ for (const lang of pageAnalysis.detectedLanguages) {
1886
+ languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
1887
+ }
1885
1888
  if (pageAnalysis.hasKoreanHanjaMix) {
1886
1889
  this.logger.info(
1887
1890
  `[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
1888
1891
  );
1892
+ const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
1889
1893
  return {
1890
1894
  method: "vlm",
1891
- detectedLanguages,
1895
+ detectedLanguages: detectedLanguages2,
1892
1896
  reason: `Korean-Hanja mix detected on page ${idx + 1}`,
1893
1897
  sampledPages: sampledCount,
1894
1898
  totalPages: renderResult.pageCount
@@ -1898,6 +1902,7 @@ var OcrStrategySampler = class {
1898
1902
  this.logger.info(
1899
1903
  "[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
1900
1904
  );
1905
+ const detectedLanguages = this.aggregateLanguages(languageFrequency);
1901
1906
  return {
1902
1907
  method: "ocrmac",
1903
1908
  detectedLanguages,
@@ -2002,8 +2007,9 @@ var OcrStrategySampler = class {
2002
2007
  }
2003
2008
  /**
2004
2009
  * Analyze a single sample page for Korean-Hanja mixed script and primary language.
2010
+ * Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
2005
2011
  *
2006
- * @returns Object with Korean-Hanja detection result and detected languages
2012
+ * @returns Object with Korean-Hanja detection result and normalized detected languages
2007
2013
  */
2008
2014
  async analyzeSamplePage(pageFile, pageNo, model, options) {
2009
2015
  this.logger.debug(
@@ -2037,14 +2043,23 @@ var OcrStrategySampler = class {
2037
2043
  options.aggregator.track(result.usage);
2038
2044
  }
2039
2045
  const output = result.output;
2046
+ const normalizedLanguages = output.detectedLanguages.map(normalizeToBcp47).filter((tag) => tag !== null);
2040
2047
  this.logger.debug(
2041
- `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${output.detectedLanguages.join(",")}`
2048
+ `[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
2042
2049
  );
2043
2050
  return {
2044
2051
  hasKoreanHanjaMix: output.hasKoreanHanjaMix,
2045
- detectedLanguages: output.detectedLanguages
2052
+ detectedLanguages: normalizedLanguages
2046
2053
  };
2047
2054
  }
2055
+ /**
2056
+ * Aggregate language frequency map into a sorted array.
2057
+ * Returns languages sorted by frequency (descending), or undefined if empty.
2058
+ */
2059
+ aggregateLanguages(frequencyMap) {
2060
+ if (frequencyMap.size === 0) return void 0;
2061
+ return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
2062
+ }
2048
2063
  };
2049
2064
 
2050
2065
  // src/utils/local-file-server.ts
@@ -2513,6 +2528,7 @@ var PDFConverter = class {
2513
2528
  const outputDir = join6(cwd, "output", reportId);
2514
2529
  try {
2515
2530
  await this.processConvertedFiles(zipPath, extractDir, outputDir);
2531
+ await this.renderPageImages(url, outputDir);
2516
2532
  if (abortSignal?.aborted) {
2517
2533
  this.logger.info("[PDFConverter] Conversion aborted before callback");
2518
2534
  const error = new Error("PDF conversion was aborted");
@@ -2568,6 +2584,8 @@ var PDFConverter = class {
2568
2584
  framework: "livetext"
2569
2585
  },
2570
2586
  generate_picture_images: true,
2587
+ generate_page_images: false,
2588
+ // Page images are rendered by PageRenderer (ImageMagick) after conversion
2571
2589
  images_scale: 2,
2572
2590
  /**
2573
2591
  * While disabling this option yields the most accurate text extraction for readable PDFs,
@@ -2716,6 +2734,40 @@ var PDFConverter = class {
2716
2734
  outputDir
2717
2735
  );
2718
2736
  }
2737
+ /**
2738
+ * Render page images from the source PDF using ImageMagick and update result.json.
2739
+ * Replaces Docling's generate_page_images which fails on large PDFs
2740
+ * due to memory limits when embedding all page images as base64.
2741
+ */
2742
+ async renderPageImages(url, outputDir) {
2743
+ if (!url.startsWith("file://")) {
2744
+ this.logger.warn(
2745
+ "[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
2746
+ );
2747
+ return;
2748
+ }
2749
+ const pdfPath = url.slice(7);
2750
+ this.logger.info(
2751
+ "[PDFConverter] Rendering page images with ImageMagick..."
2752
+ );
2753
+ const renderer = new PageRenderer(this.logger);
2754
+ const renderResult = await renderer.renderPages(pdfPath, outputDir);
2755
+ const resultPath = join6(outputDir, "result.json");
2756
+ const doc = JSON.parse(readFileSync4(resultPath, "utf-8"));
2757
+ for (const page of Object.values(doc.pages)) {
2758
+ const pageNo = page.page_no;
2759
+ const fileIndex = pageNo - 1;
2760
+ if (fileIndex >= 0 && fileIndex < renderResult.pageCount) {
2761
+ page.image.uri = `pages/page_${fileIndex}.png`;
2762
+ page.image.mimetype = "image/png";
2763
+ page.image.dpi = 300;
2764
+ }
2765
+ }
2766
+ await writeFile(resultPath, JSON.stringify(doc, null, 2));
2767
+ this.logger.info(
2768
+ `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2769
+ );
2770
+ }
2719
2771
  };
2720
2772
 
2721
2773
  // src/core/pdf-parser.ts
@@ -2754,6 +2806,7 @@ var PDFParser = class {
2754
2806
  this.logger.info("[PDFParser] Initializing...");
2755
2807
  this.checkOperatingSystem();
2756
2808
  this.checkJqInstalled();
2809
+ this.checkPopplerInstalled();
2757
2810
  this.checkMacOSVersion();
2758
2811
  if (this.enableImagePdfFallback && !this.baseUrl) {
2759
2812
  this.checkImageMagickInstalled();
@@ -2810,6 +2863,15 @@ var PDFParser = class {
2810
2863
  );
2811
2864
  }
2812
2865
  }
2866
+ checkPopplerInstalled() {
2867
+ try {
2868
+ execSync("which pdftotext", { stdio: "ignore" });
2869
+ } catch {
2870
+ throw new Error(
2871
+ "poppler is not installed. Please install poppler using: brew install poppler"
2872
+ );
2873
+ }
2874
+ }
2813
2875
  checkMacOSVersion() {
2814
2876
  try {
2815
2877
  const versionOutput = execSync("sw_vers -productVersion", {