@heripo/pdf-parser 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +19 -9
- package/README.md +19 -9
- package/dist/index.cjs +77 -15
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -1
- package/dist/index.d.ts +4 -1
- package/dist/index.js +77 -15
- package/dist/index.js.map +1 -1
- package/package.json +6 -6
package/README.ko.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
> PDF 파싱 라이브러리 - Docling SDK를 활용한 OCR 지원
|
|
4
4
|
|
|
5
5
|
[](https://www.npmjs.com/package/@heripo/pdf-parser)
|
|
6
|
-
[](https://nodejs.org/)
|
|
7
7
|
[](https://www.python.org/)
|
|
8
8
|

|
|
9
9
|
[](../../LICENSE)
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
|
|
47
47
|
### 필수 의존성
|
|
48
48
|
|
|
49
|
-
#### 1. Node.js >=
|
|
49
|
+
#### 1. Node.js >= 24.0.0
|
|
50
50
|
|
|
51
51
|
```bash
|
|
52
52
|
brew install node
|
|
@@ -72,7 +72,7 @@ python3.11 --version
|
|
|
72
72
|
|
|
73
73
|
#### 4. poppler (PDF 텍스트 추출)
|
|
74
74
|
|
|
75
|
-
|
|
75
|
+
PDF 페이지 수 확인(`pdfinfo`)과 텍스트 레이어 추출(`pdftotext`)에 필요하며, OCR 전략 시스템의 텍스트 레이어 사전 검사에 사용됩니다.
|
|
76
76
|
|
|
77
77
|
```bash
|
|
78
78
|
brew install poppler
|
|
@@ -281,12 +281,12 @@ const outputPath = await pdfParser.parse(
|
|
|
281
281
|
|
|
282
282
|
`@heripo/pdf-parser`는 다음 시스템 레벨 의존성이 필요합니다:
|
|
283
283
|
|
|
284
|
-
| 의존성 | 필수 버전 | 설치 방법 | 용도
|
|
285
|
-
| ------- | ---------- | -------------------------- |
|
|
286
|
-
| Python | 3.9 - 3.12 | `brew install python@3.11` | Docling SDK 실행 환경
|
|
287
|
-
| poppler | Any | `brew install poppler` |
|
|
288
|
-
| jq | Any | `brew install jq` | JSON 처리 (변환 결과 파싱)
|
|
289
|
-
| lsof | Any | macOS 기본 설치됨 | docling-serve 포트 관리
|
|
284
|
+
| 의존성 | 필수 버전 | 설치 방법 | 용도 |
|
|
285
|
+
| ------- | ---------- | -------------------------- | -------------------------------------------------------------- |
|
|
286
|
+
| Python | 3.9 - 3.12 | `brew install python@3.11` | Docling SDK 실행 환경 |
|
|
287
|
+
| poppler | Any | `brew install poppler` | PDF 페이지 수 확인 (pdfinfo) 및 텍스트 레이어 추출 (pdftotext) |
|
|
288
|
+
| jq | Any | `brew install jq` | JSON 처리 (변환 결과 파싱) |
|
|
289
|
+
| lsof | Any | macOS 기본 설치됨 | docling-serve 포트 관리 |
|
|
290
290
|
|
|
291
291
|
> ⚠️ **Python 3.13+는 지원하지 않습니다.** Docling SDK의 일부 의존성이 Python 3.13과 호환되지 않습니다.
|
|
292
292
|
|
|
@@ -411,6 +411,16 @@ const pdfParser = new PDFParser({
|
|
|
411
411
|
brew install jq
|
|
412
412
|
```
|
|
413
413
|
|
|
414
|
+
### poppler를 찾을 수 없음
|
|
415
|
+
|
|
416
|
+
**증상**: `poppler is not installed. Please install poppler using: brew install poppler`
|
|
417
|
+
|
|
418
|
+
**해결**:
|
|
419
|
+
|
|
420
|
+
```bash
|
|
421
|
+
brew install poppler
|
|
422
|
+
```
|
|
423
|
+
|
|
414
424
|
### 포트 충돌
|
|
415
425
|
|
|
416
426
|
**증상**: `Port 5001 is already in use`
|
package/README.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
> PDF parsing library - OCR support with Docling SDK
|
|
4
4
|
|
|
5
5
|
[](https://www.npmjs.com/package/@heripo/pdf-parser)
|
|
6
|
-
[](https://nodejs.org/)
|
|
7
7
|
[](https://www.python.org/)
|
|
8
8
|

|
|
9
9
|
[](../../LICENSE)
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
|
|
47
47
|
### Required Dependencies
|
|
48
48
|
|
|
49
|
-
#### 1. Node.js >=
|
|
49
|
+
#### 1. Node.js >= 24.0.0
|
|
50
50
|
|
|
51
51
|
```bash
|
|
52
52
|
brew install node
|
|
@@ -72,7 +72,7 @@ python3.11 --version
|
|
|
72
72
|
|
|
73
73
|
#### 4. poppler (PDF text extraction)
|
|
74
74
|
|
|
75
|
-
Required for the OCR strategy system's text layer pre-check
|
|
75
|
+
Required for PDF page counting (`pdfinfo`) and text layer extraction (`pdftotext`), used by the OCR strategy system's text layer pre-check.
|
|
76
76
|
|
|
77
77
|
```bash
|
|
78
78
|
brew install poppler
|
|
@@ -281,12 +281,12 @@ Archaeological excavation report PDFs have the following characteristics:
|
|
|
281
281
|
|
|
282
282
|
`@heripo/pdf-parser` requires the following system-level dependencies:
|
|
283
283
|
|
|
284
|
-
| Dependency | Required Version | Installation | Purpose
|
|
285
|
-
| ---------- | ---------------- | -------------------------- |
|
|
286
|
-
| Python | 3.9 - 3.12 | `brew install python@3.11` | Docling SDK runtime
|
|
287
|
-
| poppler | Any | `brew install poppler` |
|
|
288
|
-
| jq | Any | `brew install jq` | JSON processing (conversion result parsing)
|
|
289
|
-
| lsof | Any | Included with macOS | docling-serve port management
|
|
284
|
+
| Dependency | Required Version | Installation | Purpose |
|
|
285
|
+
| ---------- | ---------------- | -------------------------- | ----------------------------------------------------------------- |
|
|
286
|
+
| Python | 3.9 - 3.12 | `brew install python@3.11` | Docling SDK runtime |
|
|
287
|
+
| poppler | Any | `brew install poppler` | PDF page counting (pdfinfo) and text layer extraction (pdftotext) |
|
|
288
|
+
| jq | Any | `brew install jq` | JSON processing (conversion result parsing) |
|
|
289
|
+
| lsof | Any | Included with macOS | docling-serve port management |
|
|
290
290
|
|
|
291
291
|
> ⚠️ **Python 3.13+ is not supported.** Some Docling SDK dependencies are not compatible with Python 3.13.
|
|
292
292
|
|
|
@@ -411,6 +411,16 @@ const pdfParser = new PDFParser({
|
|
|
411
411
|
brew install jq
|
|
412
412
|
```
|
|
413
413
|
|
|
414
|
+
### poppler Not Found
|
|
415
|
+
|
|
416
|
+
**Symptom**: `poppler is not installed. Please install poppler using: brew install poppler`
|
|
417
|
+
|
|
418
|
+
**Solution**:
|
|
419
|
+
|
|
420
|
+
```bash
|
|
421
|
+
brew install poppler
|
|
422
|
+
```
|
|
423
|
+
|
|
414
424
|
### Port Conflict
|
|
415
425
|
|
|
416
426
|
**Symptom**: `Port 5001 is already in use`
|
package/dist/index.cjs
CHANGED
|
@@ -1060,28 +1060,28 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1060
1060
|
const baseName = filename.replace((0, import_node_path2.extname)(filename), "");
|
|
1061
1061
|
const jsonPath = (0, import_node_path2.join)(outputDir, `${baseName}.json`);
|
|
1062
1062
|
try {
|
|
1063
|
-
const
|
|
1064
|
-
if (!(0, import_node_fs.existsSync)(
|
|
1065
|
-
(0, import_node_fs.mkdirSync)(
|
|
1063
|
+
const imagesDir = (0, import_node_path2.join)(outputDir, "images");
|
|
1064
|
+
if (!(0, import_node_fs.existsSync)(imagesDir)) {
|
|
1065
|
+
(0, import_node_fs.mkdirSync)(imagesDir, { recursive: true });
|
|
1066
1066
|
}
|
|
1067
1067
|
const base64Images = await _ImageExtractor.extractBase64ImagesFromJsonWithJq(jsonSourcePath);
|
|
1068
1068
|
base64Images.forEach((base64Data, index) => {
|
|
1069
1069
|
_ImageExtractor.extractBase64ImageToFile(
|
|
1070
1070
|
base64Data,
|
|
1071
|
-
|
|
1071
|
+
imagesDir,
|
|
1072
1072
|
index,
|
|
1073
|
-
"
|
|
1074
|
-
"
|
|
1073
|
+
"pic",
|
|
1074
|
+
"images"
|
|
1075
1075
|
);
|
|
1076
1076
|
});
|
|
1077
1077
|
logger.info(
|
|
1078
|
-
`[PDFConverter] Extracted ${base64Images.length} images from JSON to ${
|
|
1078
|
+
`[PDFConverter] Extracted ${base64Images.length} picture images from JSON to ${imagesDir}`
|
|
1079
1079
|
);
|
|
1080
1080
|
const replacedCount = await _ImageExtractor.replaceBase64ImagesInJsonWithJq(
|
|
1081
1081
|
jsonSourcePath,
|
|
1082
1082
|
jsonPath,
|
|
1083
|
-
"
|
|
1084
|
-
"
|
|
1083
|
+
"images",
|
|
1084
|
+
"pic"
|
|
1085
1085
|
);
|
|
1086
1086
|
logger.info(
|
|
1087
1087
|
`[PDFConverter] Replaced ${replacedCount} base64 images with file paths`
|
|
@@ -1817,6 +1817,7 @@ var VlmTextCorrector = class {
|
|
|
1817
1817
|
};
|
|
1818
1818
|
|
|
1819
1819
|
// src/samplers/ocr-strategy-sampler.ts
|
|
1820
|
+
var import_model = require("@heripo/model");
|
|
1820
1821
|
var import_node_fs4 = require("fs");
|
|
1821
1822
|
var import_v42 = require("zod/v4");
|
|
1822
1823
|
var SAMPLE_DPI = 150;
|
|
@@ -1889,7 +1890,7 @@ var OcrStrategySampler = class {
|
|
|
1889
1890
|
`[OcrStrategySampler] Sampling ${sampleIndices.length} of ${renderResult.pageCount} pages: [${sampleIndices.map((i) => i + 1).join(", ")}]`
|
|
1890
1891
|
);
|
|
1891
1892
|
let sampledCount = 0;
|
|
1892
|
-
|
|
1893
|
+
const languageFrequency = /* @__PURE__ */ new Map();
|
|
1893
1894
|
for (const idx of sampleIndices) {
|
|
1894
1895
|
sampledCount++;
|
|
1895
1896
|
const pageFile = renderResult.pageFiles[idx];
|
|
@@ -1899,14 +1900,17 @@ var OcrStrategySampler = class {
|
|
|
1899
1900
|
model,
|
|
1900
1901
|
options
|
|
1901
1902
|
);
|
|
1902
|
-
|
|
1903
|
+
for (const lang of pageAnalysis.detectedLanguages) {
|
|
1904
|
+
languageFrequency.set(lang, (languageFrequency.get(lang) ?? 0) + 1);
|
|
1905
|
+
}
|
|
1903
1906
|
if (pageAnalysis.hasKoreanHanjaMix) {
|
|
1904
1907
|
this.logger.info(
|
|
1905
1908
|
`[OcrStrategySampler] Korean-Hanja mix detected on page ${idx + 1} \u2192 VLM strategy`
|
|
1906
1909
|
);
|
|
1910
|
+
const detectedLanguages2 = this.aggregateLanguages(languageFrequency);
|
|
1907
1911
|
return {
|
|
1908
1912
|
method: "vlm",
|
|
1909
|
-
detectedLanguages,
|
|
1913
|
+
detectedLanguages: detectedLanguages2,
|
|
1910
1914
|
reason: `Korean-Hanja mix detected on page ${idx + 1}`,
|
|
1911
1915
|
sampledPages: sampledCount,
|
|
1912
1916
|
totalPages: renderResult.pageCount
|
|
@@ -1916,6 +1920,7 @@ var OcrStrategySampler = class {
|
|
|
1916
1920
|
this.logger.info(
|
|
1917
1921
|
"[OcrStrategySampler] No Korean-Hanja mix detected \u2192 ocrmac strategy"
|
|
1918
1922
|
);
|
|
1923
|
+
const detectedLanguages = this.aggregateLanguages(languageFrequency);
|
|
1919
1924
|
return {
|
|
1920
1925
|
method: "ocrmac",
|
|
1921
1926
|
detectedLanguages,
|
|
@@ -2020,8 +2025,9 @@ var OcrStrategySampler = class {
|
|
|
2020
2025
|
}
|
|
2021
2026
|
/**
|
|
2022
2027
|
* Analyze a single sample page for Korean-Hanja mixed script and primary language.
|
|
2028
|
+
* Normalizes raw VLM language responses to valid BCP 47 tags, filtering out invalid ones.
|
|
2023
2029
|
*
|
|
2024
|
-
* @returns Object with Korean-Hanja detection result and detected languages
|
|
2030
|
+
* @returns Object with Korean-Hanja detection result and normalized detected languages
|
|
2025
2031
|
*/
|
|
2026
2032
|
async analyzeSamplePage(pageFile, pageNo, model, options) {
|
|
2027
2033
|
this.logger.debug(
|
|
@@ -2055,14 +2061,23 @@ var OcrStrategySampler = class {
|
|
|
2055
2061
|
options.aggregator.track(result.usage);
|
|
2056
2062
|
}
|
|
2057
2063
|
const output = result.output;
|
|
2064
|
+
const normalizedLanguages = output.detectedLanguages.map(import_model.normalizeToBcp47).filter((tag) => tag !== null);
|
|
2058
2065
|
this.logger.debug(
|
|
2059
|
-
`[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${
|
|
2066
|
+
`[OcrStrategySampler] Page ${pageNo}: hasKoreanHanjaMix=${output.hasKoreanHanjaMix}, detectedLanguages=${normalizedLanguages.join(",")}`
|
|
2060
2067
|
);
|
|
2061
2068
|
return {
|
|
2062
2069
|
hasKoreanHanjaMix: output.hasKoreanHanjaMix,
|
|
2063
|
-
detectedLanguages:
|
|
2070
|
+
detectedLanguages: normalizedLanguages
|
|
2064
2071
|
};
|
|
2065
2072
|
}
|
|
2073
|
+
/**
|
|
2074
|
+
* Aggregate language frequency map into a sorted array.
|
|
2075
|
+
* Returns languages sorted by frequency (descending), or undefined if empty.
|
|
2076
|
+
*/
|
|
2077
|
+
aggregateLanguages(frequencyMap) {
|
|
2078
|
+
if (frequencyMap.size === 0) return void 0;
|
|
2079
|
+
return [...frequencyMap.entries()].sort((a, b) => b[1] - a[1]).map(([lang]) => lang);
|
|
2080
|
+
}
|
|
2066
2081
|
};
|
|
2067
2082
|
|
|
2068
2083
|
// src/utils/local-file-server.ts
|
|
@@ -2531,6 +2546,7 @@ var PDFConverter = class {
|
|
|
2531
2546
|
const outputDir = (0, import_node_path7.join)(cwd, "output", reportId);
|
|
2532
2547
|
try {
|
|
2533
2548
|
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
2549
|
+
await this.renderPageImages(url, outputDir);
|
|
2534
2550
|
if (abortSignal?.aborted) {
|
|
2535
2551
|
this.logger.info("[PDFConverter] Conversion aborted before callback");
|
|
2536
2552
|
const error = new Error("PDF conversion was aborted");
|
|
@@ -2586,6 +2602,8 @@ var PDFConverter = class {
|
|
|
2586
2602
|
framework: "livetext"
|
|
2587
2603
|
},
|
|
2588
2604
|
generate_picture_images: true,
|
|
2605
|
+
generate_page_images: false,
|
|
2606
|
+
// Page images are rendered by PageRenderer (ImageMagick) after conversion
|
|
2589
2607
|
images_scale: 2,
|
|
2590
2608
|
/**
|
|
2591
2609
|
* While disabling this option yields the most accurate text extraction for readable PDFs,
|
|
@@ -2734,6 +2752,40 @@ var PDFConverter = class {
|
|
|
2734
2752
|
outputDir
|
|
2735
2753
|
);
|
|
2736
2754
|
}
|
|
2755
|
+
/**
|
|
2756
|
+
* Render page images from the source PDF using ImageMagick and update result.json.
|
|
2757
|
+
* Replaces Docling's generate_page_images which fails on large PDFs
|
|
2758
|
+
* due to memory limits when embedding all page images as base64.
|
|
2759
|
+
*/
|
|
2760
|
+
async renderPageImages(url, outputDir) {
|
|
2761
|
+
if (!url.startsWith("file://")) {
|
|
2762
|
+
this.logger.warn(
|
|
2763
|
+
"[PDFConverter] Page image rendering skipped: only supported for local files (file:// URLs)"
|
|
2764
|
+
);
|
|
2765
|
+
return;
|
|
2766
|
+
}
|
|
2767
|
+
const pdfPath = url.slice(7);
|
|
2768
|
+
this.logger.info(
|
|
2769
|
+
"[PDFConverter] Rendering page images with ImageMagick..."
|
|
2770
|
+
);
|
|
2771
|
+
const renderer = new PageRenderer(this.logger);
|
|
2772
|
+
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2773
|
+
const resultPath = (0, import_node_path7.join)(outputDir, "result.json");
|
|
2774
|
+
const doc = JSON.parse((0, import_node_fs7.readFileSync)(resultPath, "utf-8"));
|
|
2775
|
+
for (const page of Object.values(doc.pages)) {
|
|
2776
|
+
const pageNo = page.page_no;
|
|
2777
|
+
const fileIndex = pageNo - 1;
|
|
2778
|
+
if (fileIndex >= 0 && fileIndex < renderResult.pageCount) {
|
|
2779
|
+
page.image.uri = `pages/page_${fileIndex}.png`;
|
|
2780
|
+
page.image.mimetype = "image/png";
|
|
2781
|
+
page.image.dpi = 300;
|
|
2782
|
+
}
|
|
2783
|
+
}
|
|
2784
|
+
await (0, import_promises.writeFile)(resultPath, JSON.stringify(doc, null, 2));
|
|
2785
|
+
this.logger.info(
|
|
2786
|
+
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2787
|
+
);
|
|
2788
|
+
}
|
|
2737
2789
|
};
|
|
2738
2790
|
|
|
2739
2791
|
// src/core/pdf-parser.ts
|
|
@@ -2772,6 +2824,7 @@ var PDFParser = class {
|
|
|
2772
2824
|
this.logger.info("[PDFParser] Initializing...");
|
|
2773
2825
|
this.checkOperatingSystem();
|
|
2774
2826
|
this.checkJqInstalled();
|
|
2827
|
+
this.checkPopplerInstalled();
|
|
2775
2828
|
this.checkMacOSVersion();
|
|
2776
2829
|
if (this.enableImagePdfFallback && !this.baseUrl) {
|
|
2777
2830
|
this.checkImageMagickInstalled();
|
|
@@ -2828,6 +2881,15 @@ var PDFParser = class {
|
|
|
2828
2881
|
);
|
|
2829
2882
|
}
|
|
2830
2883
|
}
|
|
2884
|
+
checkPopplerInstalled() {
|
|
2885
|
+
try {
|
|
2886
|
+
(0, import_node_child_process3.execSync)("which pdftotext", { stdio: "ignore" });
|
|
2887
|
+
} catch {
|
|
2888
|
+
throw new Error(
|
|
2889
|
+
"poppler is not installed. Please install poppler using: brew install poppler"
|
|
2890
|
+
);
|
|
2891
|
+
}
|
|
2892
|
+
}
|
|
2831
2893
|
checkMacOSVersion() {
|
|
2832
2894
|
try {
|
|
2833
2895
|
const versionOutput = (0, import_node_child_process3.execSync)("sw_vers -productVersion", {
|