@heripo/pdf-parser 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +19 -9
- package/README.md +19 -9
- package/dist/index.cjs +77 -15
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -1
- package/dist/index.d.ts +4 -1
- package/dist/index.js +77 -15
- package/dist/index.js.map +1 -1
- package/package.json +6 -6
package/dist/index.d.cts
CHANGED
|
@@ -12,7 +12,7 @@ type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
|
|
|
12
12
|
/**
|
|
13
13
|
* Extended options for PDF conversion.
|
|
14
14
|
*/
|
|
15
|
-
type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
|
|
15
|
+
type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'generate_page_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
|
|
16
16
|
num_threads?: number;
|
|
17
17
|
/**
|
|
18
18
|
* Force pre-conversion to image-based PDF before processing.
|
|
@@ -74,6 +74,8 @@ type Options = {
|
|
|
74
74
|
* - Install specific version: `pyenv install 3.12.0 && pyenv global 3.12.0`
|
|
75
75
|
* - `jq` - JSON processor
|
|
76
76
|
* - Install: `brew install jq`
|
|
77
|
+
* - `poppler` - PDF text extraction tools (pdftotext, pdfinfo)
|
|
78
|
+
* - Install: `brew install poppler`
|
|
77
79
|
* - `lsof` - List open files (usually pre-installed on macOS)
|
|
78
80
|
*
|
|
79
81
|
* ## Initialization Process
|
|
@@ -122,6 +124,7 @@ declare class PDFParser {
|
|
|
122
124
|
init(): Promise<void>;
|
|
123
125
|
private checkOperatingSystem;
|
|
124
126
|
private checkJqInstalled;
|
|
127
|
+
private checkPopplerInstalled;
|
|
125
128
|
private checkMacOSVersion;
|
|
126
129
|
private checkImageMagickInstalled;
|
|
127
130
|
private checkGhostscriptInstalled;
|
package/dist/index.d.ts
CHANGED
|
@@ -12,7 +12,7 @@ type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
|
|
|
12
12
|
/**
|
|
13
13
|
* Extended options for PDF conversion.
|
|
14
14
|
*/
|
|
15
|
-
type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
|
|
15
|
+
type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mode' | 'ocr_engine' | 'accelerator_options' | 'ocr_options' | 'generate_picture_images' | 'generate_page_images' | 'images_scale' | 'force_ocr' | 'pipeline' | 'vlm_pipeline_model_local' | 'vlm_pipeline_model_api'> & {
|
|
16
16
|
num_threads?: number;
|
|
17
17
|
/**
|
|
18
18
|
* Force pre-conversion to image-based PDF before processing.
|
|
@@ -74,6 +74,8 @@ type Options = {
|
|
|
74
74
|
* - Install specific version: `pyenv install 3.12.0 && pyenv global 3.12.0`
|
|
75
75
|
* - `jq` - JSON processor
|
|
76
76
|
* - Install: `brew install jq`
|
|
77
|
+
* - `poppler` - PDF text extraction tools (pdftotext, pdfinfo)
|
|
78
|
+
* - Install: `brew install poppler`
|
|
77
79
|
* - `lsof` - List open files (usually pre-installed on macOS)
|
|
78
80
|
*
|
|
79
81
|
* ## Initialization Process
|
|
@@ -122,6 +124,7 @@ declare class PDFParser {
|
|
|
122
124
|
init(): Promise<void>;
|
|
123
125
|
private checkOperatingSystem;
|
|
124
126
|
private checkJqInstalled;
|
|
127
|
+
private checkPopplerInstalled;
|
|
125
128
|
private checkMacOSVersion;
|
|
126
129
|
private checkImageMagickInstalled;
|
|
127
130
|
private checkGhostscriptInstalled;
|
package/dist/index.js
CHANGED
|
@@ -1042,28 +1042,28 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1042
1042
|
const baseName = filename.replace(extname(filename), "");
|
|
1043
1043
|
const jsonPath = join2(outputDir, `${baseName}.json`);
|
|
1044
1044
|
try {
|
|
1045
|
-
const
|
|
1046
|
-
if (!existsSync(
|
|
1047
|
-
mkdirSync(
|
|
1045
|
+
const imagesDir = join2(outputDir, "images");
|
|
1046
|
+
if (!existsSync(imagesDir)) {
|
|
1047
|
+
mkdirSync(imagesDir, { recursive: true });
|
|
1048
1048
|
}
|
|
1049
1049
|
const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
|
|
1050
1050
|
base64Images.forEach((base64Data, index) => {
|
|
1051
1051
|
_ImageExtractor.extractBase64ImageToFile(
|
|
1052
1052
|
base64Data,
|
|
1053
|
-
|
|
1053
|
+
imagesDir,
|
|
1054
1054
|
index,
|
|
1055
|
-
"
|
|
1056
|
-
"
|
|
1055
|
+
"pic",
|
|
1056
|
+
"images"
|
|
1057
1057
|
);
|
|
1058
1058
|
});
|
|
1059
1059
|
logger.info(
|
|
1060
|
-
`[PDFConverter] Extracted ${base64Images.length} images from JSON to ${
|
|
1060
|
+
`[PDFConverter] Extracted ${base64Images.length} picture images from JSON to ${imagesDir}`
|
|
1061
1061
|
);
|
|
1062
1062
|
const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
|
|
1063
1063
|
jsonSourcePath,
|
|
1064
1064
|
jsonPath,
|
|
1065
|
-
"
|
|
1066
|
-
"
|
|
1065
|
+
"images",
|
|
1066
|
+
"pic"
|
|
1067
1067
|
);
|
|
1068
1068
|
logger.info(
|
|
1069
1069
|
`[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
|
|
@@ -1799,6 +1799,7 @@ var VlmTextCorrector = class {
|
|
|
1799
1799
|
};
|
|
1800
1800
|
|
|
1801
1801
|
// src/samplers/ocr-strategy-sampler.ts
|
|
1802
|
+
import { normalizeToBcp47 } from "@heripo/model";
|
|
1802
1803
|
import { readFileSync as readFileSync3 } from "fs";
|
|
1803
1804
|
import { z as z2 } from "zod/v4";
|
|
1804
1805
|
var SAMPLE_DPI = 150;
|
|
@@ -1871,7 +1872,7 @@ var OcrStrategySampler = class {
|
|
|
1871
1872
|
`[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
|
|
1872
1873
|
);
|
|
1873
1874
|
let sampledCount = 0;
|
|
1874
|
-
|
|
1875
|
+
const languageFrequency = /* @__PURE__ */ new Map();
|
|
1875
1876
|
for (const idx of sampleIndices) {
|
|
1876
1877
|
sampledCount++;
|
|
1877
1878
|
const pageFile = renderResult.pageFiles[idx];
|
|
@@ -1881,14 +1882,17 @@ var OcrStrategySampler = class {
|
|
|
1881
1882
|
model,
|
|
1882
1883
|
options
|
|
1883
1884
|
);
|
|
1884
|
-
|
|
1885
|
+
for (const lang of pageAnalysis.detectedLanguages) {
|
|
1886
|
+
languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
|
|
1887
|
+
}
|
|
1885
1888
|
if (pageAnalysis.hasKoreanHanjaMix) {
|
|
1886
1889
|
this.logger.info(
|
|
1887
1890
|
`[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
|
|
1888
1891
|
);
|
|
1892
|
+
const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
|
|
1889
1893
|
return {
|
|
1890
1894
|
method: "vlm",
|
|
1891
|
-
detectedLanguages,
|
|
1895
|
+
detectedLanguages: detectedLanguages2,
|
|
1892
1896
|
reason: `Korean-Hanja mix detected on page ${idx + 1}`,
|
|
1893
1897
|
sampledPages: sampledCount,
|
|
1894
1898
|
totalPages: renderResult.pageCount
|
|
@@ -1898,6 +1902,7 @@ var OcrStrategySampler = class {
|
|
|
1898
1902
|
this.logger.info(
|
|
1899
1903
|
"[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
|
|
1900
1904
|
);
|
|
1905
|
+
const detectedLanguages = this.aggregateLanguages(languageFrequency);
|
|
1901
1906
|
return {
|
|
1902
1907
|
method: "ocrmac",
|
|
1903
1908
|
detectedLanguages,
|
|
@@ -2002,8 +2007,9 @@ var OcrStrategySampler = class {
|
|
|
2002
2007
|
}
|
|
2003
2008
|
/**
|
|
2004
2009
|
* Analyze a single sample page for Korean-Hanja mixed script and primary language.
|
|
2010
|
+
* Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
|
|
2005
2011
|
*
|
|
2006
|
-
* @returns Object with Korean-Hanja detection result and detected languages
|
|
2012
|
+
* @returns Object with Korean-Hanja detection result and normalized detected languages
|
|
2007
2013
|
*/
|
|
2008
2014
|
async analyzeSamplePage(pageFile, pageNo, model, options) {
|
|
2009
2015
|
this.logger.debug(
|
|
@@ -2037,14 +2043,23 @@ var OcrStrategySampler = class {
|
|
|
2037
2043
|
options.aggregator.track(result.usage);
|
|
2038
2044
|
}
|
|
2039
2045
|
const output = result.output;
|
|
2046
|
+
const normalizedLanguages = output.detectedLanguages.map(normalizeToBcp47).filter((tag) => tag !== null);
|
|
2040
2047
|
this.logger.debug(
|
|
2041
|
-
`[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${
|
|
2048
|
+
`[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
|
|
2042
2049
|
);
|
|
2043
2050
|
return {
|
|
2044
2051
|
hasKoreanHanjaMix: output.hasKoreanHanjaMix,
|
|
2045
|
-
detectedLanguages:
|
|
2052
|
+
detectedLanguages: normalizedLanguages
|
|
2046
2053
|
};
|
|
2047
2054
|
}
|
|
2055
|
+
/**
|
|
2056
|
+
* Aggregate language frequency map into a sorted array.
|
|
2057
|
+
* Returns languages sorted by frequency (descending), or undefined if empty.
|
|
2058
|
+
*/
|
|
2059
|
+
aggregateLanguages(frequencyMap) {
|
|
2060
|
+
if (frequencyMap.size === 0) return void 0;
|
|
2061
|
+
return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
|
|
2062
|
+
}
|
|
2048
2063
|
};
|
|
2049
2064
|
|
|
2050
2065
|
// src/utils/local-file-server.ts
|
|
@@ -2513,6 +2528,7 @@ var PDFConverter = class {
|
|
|
2513
2528
|
const outputDir = join6(cwd, "output", reportId);
|
|
2514
2529
|
try {
|
|
2515
2530
|
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
2531
|
+
await this.renderPageImages(url, outputDir);
|
|
2516
2532
|
if (abortSignal?.aborted) {
|
|
2517
2533
|
this.logger.info("[PDFConverter] Conversion aborted before callback");
|
|
2518
2534
|
const error = new Error("PDF conversion was aborted");
|
|
@@ -2568,6 +2584,8 @@ var PDFConverter = class {
|
|
|
2568
2584
|
framework: "livetext"
|
|
2569
2585
|
},
|
|
2570
2586
|
generate_picture_images: true,
|
|
2587
|
+
generate_page_images: false,
|
|
2588
|
+
// Page images are rendered by PageRenderer (ImageMagick) after conversion
|
|
2571
2589
|
images_scale: 2,
|
|
2572
2590
|
/**
|
|
2573
2591
|
* While disabling this option yields the most accurate text extraction for readable PDFs,
|
|
@@ -2716,6 +2734,40 @@ var PDFConverter = class {
|
|
|
2716
2734
|
outputDir
|
|
2717
2735
|
);
|
|
2718
2736
|
}
|
|
2737
|
+
/**
|
|
2738
|
+
* Render page images from the source PDF using ImageMagick and update result.json.
|
|
2739
|
+
* Replaces Docling's generate_page_images which fails on large PDFs
|
|
2740
|
+
* due to memory limits when embedding all page images as base64.
|
|
2741
|
+
*/
|
|
2742
|
+
async renderPageImages(url, outputDir) {
|
|
2743
|
+
if (!url.startsWith("file://")) {
|
|
2744
|
+
this.logger.warn(
|
|
2745
|
+
"[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
|
|
2746
|
+
);
|
|
2747
|
+
return;
|
|
2748
|
+
}
|
|
2749
|
+
const pdfPath = url.slice(7);
|
|
2750
|
+
this.logger.info(
|
|
2751
|
+
"[PDFConverter] Rendering page images with ImageMagick..."
|
|
2752
|
+
);
|
|
2753
|
+
const renderer = new PageRenderer(this.logger);
|
|
2754
|
+
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2755
|
+
const resultPath = join6(outputDir, "result.json");
|
|
2756
|
+
const doc = JSON.parse(readFileSync4(resultPath, "utf-8"));
|
|
2757
|
+
for (const page of Object.values(doc.pages)) {
|
|
2758
|
+
const pageNo = page.page_no;
|
|
2759
|
+
const fileIndex = pageNo - 1;
|
|
2760
|
+
if (fileIndex >= 0 && fileIndex < renderResult.pageCount) {
|
|
2761
|
+
page.image.uri = `pages/page_${fileIndex}.png`;
|
|
2762
|
+
page.image.mimetype = "image/png";
|
|
2763
|
+
page.image.dpi = 300;
|
|
2764
|
+
}
|
|
2765
|
+
}
|
|
2766
|
+
await writeFile(resultPath, JSON.stringify(doc, null, 2));
|
|
2767
|
+
this.logger.info(
|
|
2768
|
+
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2769
|
+
);
|
|
2770
|
+
}
|
|
2719
2771
|
};
|
|
2720
2772
|
|
|
2721
2773
|
// src/core/pdf-parser.ts
|
|
@@ -2754,6 +2806,7 @@ var PDFParser = class {
|
|
|
2754
2806
|
this.logger.info("[PDFParser] Initializing...");
|
|
2755
2807
|
this.checkOperatingSystem();
|
|
2756
2808
|
this.checkJqInstalled();
|
|
2809
|
+
this.checkPopplerInstalled();
|
|
2757
2810
|
this.checkMacOSVersion();
|
|
2758
2811
|
if (this.enableImagePdfFallback && !this.baseUrl) {
|
|
2759
2812
|
this.checkImageMagickInstalled();
|
|
@@ -2810,6 +2863,15 @@ var PDFParser = class {
|
|
|
2810
2863
|
);
|
|
2811
2864
|
}
|
|
2812
2865
|
}
|
|
2866
|
+
checkPopplerInstalled() {
|
|
2867
|
+
try {
|
|
2868
|
+
execSync("which pdftotext", { stdio: "ignore" });
|
|
2869
|
+
} catch {
|
|
2870
|
+
throw new Error(
|
|
2871
|
+
"poppler is not installed. Please install poppler using: brew install poppler"
|
|
2872
|
+
);
|
|
2873
|
+
}
|
|
2874
|
+
}
|
|
2813
2875
|
checkMacOSVersion() {
|
|
2814
2876
|
try {
|
|
2815
2877
|
const versionOutput = execSync("sw_vers -productVersion", {
|