@d0paminedriven/pdfdown-ocr 0.8.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +268 -0
- package/index.d.ts +174 -0
- package/index.js +577 -0
- package/package.json +3 -2
package/README.md
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# `@d0paminedriven/pdfdown-ocr`
|
|
2
|
+
|
|
3
|
+
Rust-powered PDF extraction for Node.js with Tesseract OCR fallback for image-only pages. A superset of [`@d0paminedriven/pdfdown`](https://www.npmjs.com/package/@d0paminedriven/pdfdown) -- includes all base extraction APIs (text, images, annotations, structured text, metadata) plus OCR.
|
|
4
|
+
|
|
5
|
+
**System requirement:** [Tesseract](https://github.com/tesseract-ocr/tesseract) 5.x must be installed on the host.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm install @d0paminedriven/pdfdown-ocr
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
### Tesseract setup
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Ubuntu/Debian (Ubuntu 22.04 ships tesseract 4.x -- use the PPA for 5.x)
|
|
17
|
+
sudo add-apt-repository ppa:alex-p/tesseract-ocr5
|
|
18
|
+
sudo apt update
|
|
19
|
+
sudo apt install tesseract-ocr tesseract-ocr-eng -y
|
|
20
|
+
# Optional: all language packs
|
|
21
|
+
# sudo apt install tesseract-ocr-all
|
|
22
|
+
|
|
23
|
+
# macOS
|
|
24
|
+
brew install tesseract
|
|
25
|
+
|
|
26
|
+
# Arch
|
|
27
|
+
sudo pacman -S tesseract tesseract-data-eng
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Verify with `tesseract --version` -- you should see 5.x.
|
|
31
|
+
|
|
32
|
+
### Tessdata auto-detection
|
|
33
|
+
|
|
34
|
+
The package automatically detects the tessdata directory at runtime by parsing the output of `tesseract --list-langs`. The detected path is cached for the lifetime of the process using a `OnceLock<Option<String>>` -- no global environment mutation, fully thread-safe.
|
|
35
|
+
|
|
36
|
+
**Resolution order:**
|
|
37
|
+
|
|
38
|
+
1. `TESSDATA_PREFIX` environment variable (if set, used as-is -- no auto-detection runs)
|
|
39
|
+
2. Auto-detection via `tesseract --list-langs` (parses the path from `List of available languages in "/path/to/tessdata/"`)
|
|
40
|
+
3. Tesseract's compiled-in default (if neither of the above yields a path)
|
|
41
|
+
|
|
42
|
+
Most users will not need to set `TESSDATA_PREFIX` at all. The auto-detection handles standard installations on Ubuntu (`/usr/share/tesseract-ocr/5/tessdata/`), macOS Homebrew (`/opt/homebrew/share/tessdata/`), Arch, and any other layout where `tesseract` is on `PATH`.
|
|
43
|
+
|
|
44
|
+
Set `TESSDATA_PREFIX` explicitly only if:
|
|
45
|
+
|
|
46
|
+
- Tesseract is not on `PATH` but the tessdata directory exists elsewhere
|
|
47
|
+
- You want to override the detected path (e.g., pointing to a custom-trained data directory)
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# Override example (not usually needed)
|
|
51
|
+
export TESSDATA_PREFIX="/opt/custom/tessdata"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## API
|
|
55
|
+
|
|
56
|
+
This package exports everything from `@d0paminedriven/pdfdown` (text, images, annotations, structured text, metadata -- both sync and async), plus the OCR-specific APIs below. See the [base package docs](https://www.npmjs.com/package/@d0paminedriven/pdfdown) for the full base API.
|
|
57
|
+
|
|
58
|
+
### OCR standalone functions
|
|
59
|
+
|
|
60
|
+
```typescript
|
|
61
|
+
// Per-page OCR text extraction
|
|
62
|
+
export declare function extractTextWithOcrPerPage(
|
|
63
|
+
buffer: Buffer,
|
|
64
|
+
opts?: OcrOptions,
|
|
65
|
+
): Array<OcrPageText>
|
|
66
|
+
|
|
67
|
+
export declare function extractTextWithOcrPerPageAsync(
|
|
68
|
+
buffer: Buffer,
|
|
69
|
+
opts?: OcrOptions,
|
|
70
|
+
): Promise<Array<OcrPageText>>
|
|
71
|
+
|
|
72
|
+
// Full document extraction with OCR text fallback
|
|
73
|
+
export declare function pdfDocumentOcr(
|
|
74
|
+
buffer: Buffer,
|
|
75
|
+
opts?: OcrOptions,
|
|
76
|
+
): PdfDocumentOcr
|
|
77
|
+
|
|
78
|
+
export declare function pdfDocumentOcrAsync(
|
|
79
|
+
buffer: Buffer,
|
|
80
|
+
opts?: OcrOptions,
|
|
81
|
+
): Promise<PdfDocumentOcr>
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### `PdfDown` class (includes OCR methods)
|
|
85
|
+
|
|
86
|
+
```typescript
|
|
87
|
+
export declare class PdfDown {
|
|
88
|
+
constructor(buffer: Buffer)
|
|
89
|
+
|
|
90
|
+
// ── Base methods ──
|
|
91
|
+
textPerPage(): Array<PageText>
|
|
92
|
+
textPerPageAsync(): Promise<Array<PageText>>
|
|
93
|
+
imagesPerPage(): Array<PageImage>
|
|
94
|
+
imagesPerPageAsync(): Promise<Array<PageImage>>
|
|
95
|
+
annotationsPerPage(): Array<PageAnnotation>
|
|
96
|
+
annotationsPerPageAsync(): Promise<Array<PageAnnotation>>
|
|
97
|
+
structuredText(): Array<StructuredPageText>
|
|
98
|
+
structuredTextAsync(): Promise<Array<StructuredPageText>>
|
|
99
|
+
metadata(): PdfMeta
|
|
100
|
+
metadataAsync(): Promise<PdfMeta>
|
|
101
|
+
document(): PdfDocument
|
|
102
|
+
documentAsync(): Promise<PdfDocument>
|
|
103
|
+
|
|
104
|
+
// ── OCR methods ──
|
|
105
|
+
textWithOcrPerPage(opts?: OcrOptions): Array<OcrPageText>
|
|
106
|
+
textWithOcrPerPageAsync(opts?: OcrOptions): Promise<Array<OcrPageText>>
|
|
107
|
+
documentOcr(opts?: OcrOptions): PdfDocumentOcr
|
|
108
|
+
documentOcrAsync(opts?: OcrOptions): Promise<PdfDocumentOcr>
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Types
|
|
113
|
+
|
|
114
|
+
```typescript
|
|
115
|
+
export const enum TextSource {
|
|
116
|
+
Native = 'Native',
|
|
117
|
+
Ocr = 'Ocr',
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
export interface OcrPageText {
|
|
121
|
+
page: number
|
|
122
|
+
text: string
|
|
123
|
+
source: TextSource
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
export interface OcrStructuredPageText {
|
|
127
|
+
page: number
|
|
128
|
+
header: string
|
|
129
|
+
body: string
|
|
130
|
+
footer: string
|
|
131
|
+
source: TextSource
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
export interface OcrOptions {
|
|
135
|
+
lang?: string // Tesseract language code, default "eng"
|
|
136
|
+
minTextLength?: number // non-whitespace char threshold before OCR fallback, default 1
|
|
137
|
+
maxThreads?: number // cap on Rayon threads for OCR parallelism, default 4, clamped to [1, available CPUs]
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
export interface PdfDocumentOcr {
|
|
141
|
+
version: string
|
|
142
|
+
isLinearized: boolean
|
|
143
|
+
pageCount: number
|
|
144
|
+
creator?: string
|
|
145
|
+
producer?: string
|
|
146
|
+
creationDate?: string
|
|
147
|
+
modificationDate?: string
|
|
148
|
+
totalImages: number
|
|
149
|
+
totalAnnotations: number
|
|
150
|
+
imagePages: Array<number>
|
|
151
|
+
annotationPages: Array<number>
|
|
152
|
+
text: Array<OcrPageText>
|
|
153
|
+
structuredText: Array<OcrStructuredPageText>
|
|
154
|
+
images: Array<PageImage>
|
|
155
|
+
annotations: Array<PageAnnotation>
|
|
156
|
+
}
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Usage
|
|
160
|
+
|
|
161
|
+
> **Use the async API for OCR.** The sync variants block the Node.js event loop for the duration of OCR processing, which can be significant for multi-page scanned documents.
|
|
162
|
+
|
|
163
|
+
### Standalone
|
|
164
|
+
|
|
165
|
+
```typescript
|
|
166
|
+
import { readFile } from 'fs/promises'
|
|
167
|
+
import { extractTextWithOcrPerPageAsync } from '@d0paminedriven/pdfdown-ocr'
|
|
168
|
+
|
|
169
|
+
const pdf = await readFile('scanned-document.pdf')
|
|
170
|
+
const pages = await extractTextWithOcrPerPageAsync(pdf, { lang: 'eng', minTextLength: 10 })
|
|
171
|
+
|
|
172
|
+
for (const { page, text, source } of pages) {
|
|
173
|
+
console.log(`Page ${page} [${source}]: ${text.slice(0, 100)}...`)
|
|
174
|
+
}
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Class-based (parse once, extract many)
|
|
178
|
+
|
|
179
|
+
```typescript
|
|
180
|
+
import { readFile } from 'fs/promises'
|
|
181
|
+
import { PdfDown } from '@d0paminedriven/pdfdown-ocr'
|
|
182
|
+
|
|
183
|
+
const pdf = new PdfDown(await readFile('scanned-document.pdf'))
|
|
184
|
+
|
|
185
|
+
// OCR text extraction
|
|
186
|
+
const pages = await pdf.textWithOcrPerPageAsync({ lang: 'eng', minTextLength: 10 })
|
|
187
|
+
|
|
188
|
+
// All base methods work too
|
|
189
|
+
const images = await pdf.imagesPerPageAsync()
|
|
190
|
+
const meta = pdf.metadata()
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Extract everything with OCR in one call
|
|
194
|
+
|
|
195
|
+
```typescript
|
|
196
|
+
import { readFile } from 'fs/promises'
|
|
197
|
+
import { PdfDown } from '@d0paminedriven/pdfdown-ocr'
|
|
198
|
+
|
|
199
|
+
const pdf = new PdfDown(await readFile('scanned-document.pdf'))
|
|
200
|
+
const result = await pdf.documentOcrAsync({ minTextLength: 10 })
|
|
201
|
+
|
|
202
|
+
// result.text — OcrPageText[] (page, text, source per page)
|
|
203
|
+
// result.structuredText — OcrStructuredPageText[] (header/body/footer + source per page)
|
|
204
|
+
// result.images — PageImage[] (decoded PNGs with dimensions and color space)
|
|
205
|
+
// result.annotations — PageAnnotation[] (links, destinations, rects)
|
|
206
|
+
// result.pageCount, result.version, result.creator, ...
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Combined: OCR text + images for multimodal pipelines
|
|
210
|
+
|
|
211
|
+
```typescript
|
|
212
|
+
import { readFile } from 'fs/promises'
|
|
213
|
+
import { PdfDown } from '@d0paminedriven/pdfdown-ocr'
|
|
214
|
+
|
|
215
|
+
const pdf = new PdfDown(await readFile('scanned-document.pdf'))
|
|
216
|
+
|
|
217
|
+
const [ocrText, images] = await Promise.all([
|
|
218
|
+
pdf.textWithOcrPerPageAsync({ minTextLength: 10 }),
|
|
219
|
+
pdf.imagesPerPageAsync(),
|
|
220
|
+
])
|
|
221
|
+
|
|
222
|
+
const imagesByPage = Map.groupBy(images, (img) => img.page) // Map.groupBy requires Node.js 21+
|
|
223
|
+
|
|
224
|
+
for (const { page, text, source } of ocrText) {
|
|
225
|
+
const pageImages = (imagesByPage.get(page) ?? []).map((img) => ({
|
|
226
|
+
dataUrl: `data:image/png;base64,${img.data.toString('base64')}`,
|
|
227
|
+
width: img.width,
|
|
228
|
+
height: img.height,
|
|
229
|
+
}))
|
|
230
|
+
// Send { page, text, source, images: pageImages } to your embedding pipeline
|
|
231
|
+
}
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### `document()` vs `documentOcr()`
|
|
235
|
+
|
|
236
|
+
Both methods extract everything from a PDF in a single call. The difference is how text is extracted:
|
|
237
|
+
|
|
238
|
+
| Method | Text extraction | Return type | Use when |
|
|
239
|
+
|--------|----------------|-------------|----------|
|
|
240
|
+
| `document()` / `documentAsync()` | Native PDF text only | `PdfDocument` | PDF has selectable text |
|
|
241
|
+
| `documentOcr()` / `documentOcrAsync()` | Native with OCR fallback | `PdfDocumentOcr` | PDF may contain scanned/image-only pages |
|
|
242
|
+
|
|
243
|
+
`PdfDocumentOcr` uses `OcrPageText` (with `source: 'Native' | 'Ocr'`) and `OcrStructuredPageText` (with header/body/footer split plus source) instead of the base `PageText` and `StructuredPageText` types. Images, annotations, and metadata are identical in both.
|
|
244
|
+
|
|
245
|
+
## How it works
|
|
246
|
+
|
|
247
|
+
1. **Text extraction:** Each page is first attempted with native PDF text extraction. If a page yields fewer non-whitespace characters than `minTextLength`, its embedded images are decoded and fed to Tesseract for OCR. Each result is tagged with `source: 'Native'` or `source: 'Ocr'`.
|
|
248
|
+
|
|
249
|
+
2. **Structured text:** After text extraction, repeated header/footer lines are detected across pages using frequency analysis (requires 3+ pages). Each page's text is split into `header`, `body`, and `footer` sections. For OCR results, the `source` tag is preserved so you know whether each page's content came from native extraction or OCR.
|
|
250
|
+
|
|
251
|
+
3. **Parallelism:** OCR runs on a dedicated capped Rayon thread pool (default 4 threads, configurable via `maxThreads`) to prevent CPU oversubscription. Text extraction, image extraction, and annotation extraction run concurrently via `rayon::join` when using `documentOcr` / `documentOcrAsync`.
|
|
252
|
+
|
|
253
|
+
4. **Tessdata discovery:** On first OCR invocation, the tessdata path is resolved once and cached in a `OnceLock`. The `TESSDATA_PREFIX` environment variable is checked first; if unset, `tesseract --list-langs` is executed and its output is parsed to extract the path. No environment variables are mutated -- the path is passed directly to Tesseract's init function.
|
|
254
|
+
|
|
255
|
+
## Supported platforms
|
|
256
|
+
|
|
257
|
+
Prebuilt binaries are provided for:
|
|
258
|
+
|
|
259
|
+
- macOS (x64, ARM64)
|
|
260
|
+
- Linux glibc (x64, ARM64)
|
|
261
|
+
|
|
262
|
+
## Relationship to `@d0paminedriven/pdfdown`
|
|
263
|
+
|
|
264
|
+
Same Rust codebase, compiled with the `ocr` Cargo feature flag enabled. This package is a strict superset -- you can use it as a drop-in replacement for the base package if you need OCR capabilities.
|
|
265
|
+
|
|
266
|
+
## License
|
|
267
|
+
|
|
268
|
+
MIT
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/* auto-generated by NAPI-RS */
|
|
2
|
+
/* eslint-disable */
|
|
3
|
+
/**
 * A PDF that is parsed once from a buffer and then queried many times.
 *
 * Base methods return natively extracted text, images, annotations and metadata.
 * The `*Ocr*` methods additionally fall back to Tesseract OCR for pages whose
 * native text is too short (see `OcrOptions.minTextLength`).
 * The package README recommends the Async variants for OCR because the sync
 * variants block the Node.js event loop for the duration of OCR processing.
 */
export declare class PdfDown {
|
|
4
|
+
/** Create an instance from the raw bytes of a PDF file. */
constructor(buffer: Buffer)
|
|
5
|
+
/** Sync: extract text per page (reuses the already-parsed document) */
|
|
6
|
+
textPerPage(): Array<PageText>
|
|
7
|
+
/** Sync: extract images per page (reuses the already-parsed document) */
|
|
8
|
+
imagesPerPage(): Array<PageImage>
|
|
9
|
+
/** Sync: extract annotations per page (reuses the already-parsed document) */
|
|
10
|
+
annotationsPerPage(): Array<PageAnnotation>
|
|
11
|
+
/** Sync: get PDF metadata (reuses the already-parsed document) */
|
|
12
|
+
metadata(): PdfMeta
|
|
13
|
+
/** Async: extract text per page on the libuv thread pool (shares parsed document via Arc) */
|
|
14
|
+
textPerPageAsync(): Promise<Array<PageText>>
|
|
15
|
+
/** Async: extract images per page on the libuv thread pool (shares parsed document via Arc) */
|
|
16
|
+
imagesPerPageAsync(): Promise<Array<PageImage>>
|
|
17
|
+
/** Async: extract annotations per page on the libuv thread pool (shares parsed document via Arc) */
|
|
18
|
+
annotationsPerPageAsync(): Promise<Array<PageAnnotation>>
|
|
19
|
+
/** Async: get PDF metadata on the libuv thread pool (shares parsed document via Arc) */
|
|
20
|
+
metadataAsync(): Promise<PdfMeta>
|
|
21
|
+
/** Sync: extract everything from the PDF in one call (reuses the already-parsed document) */
|
|
22
|
+
document(): PdfDocument
|
|
23
|
+
/** Async: extract everything from the PDF on the libuv thread pool (shares parsed document via Arc) */
|
|
24
|
+
documentAsync(): Promise<PdfDocument>
|
|
25
|
+
/** Sync: extract structured text with header/footer detection */
|
|
26
|
+
structuredText(): Array<StructuredPageText>
|
|
27
|
+
/** Async: extract structured text with header/footer detection */
|
|
28
|
+
structuredTextAsync(): Promise<Array<StructuredPageText>>
|
|
29
|
+
/** Sync: extract text with OCR fallback for image-only pages; each result is tagged with its TextSource. Blocks the event loop while OCR runs. */
|
|
30
|
+
textWithOcrPerPage(opts?: OcrOptions | undefined | null): Array<OcrPageText>
|
|
31
|
+
/** Async: extract text with OCR fallback for image-only pages; each result is tagged with its TextSource. */
|
|
32
|
+
textWithOcrPerPageAsync(opts?: OcrOptions | undefined | null): Promise<Array<OcrPageText>>
|
|
33
|
+
/** Sync: extract everything from the PDF with OCR text fallback. Blocks the event loop while OCR runs. */
|
|
34
|
+
documentOcr(opts?: OcrOptions | undefined | null): PdfDocumentOcr
|
|
35
|
+
/** Async: extract everything from the PDF with OCR text fallback */
|
|
36
|
+
documentOcrAsync(opts?: OcrOptions | undefined | null): Promise<PdfDocumentOcr>
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/** Sync: extract annotations per page directly from the raw bytes of a PDF. */
export declare function extractAnnotationsPerPage(buffer: Buffer): Array<PageAnnotation>
|
|
40
|
+
|
|
41
|
+
/** Async: extract annotations per page directly from the raw bytes of a PDF. */
export declare function extractAnnotationsPerPageAsync(buffer: Buffer): Promise<Array<PageAnnotation>>
|
|
42
|
+
|
|
43
|
+
/** Sync: extract images per page directly from the raw bytes of a PDF. */
export declare function extractImagesPerPage(buffer: Buffer): Array<PageImage>
|
|
44
|
+
|
|
45
|
+
/** Async: extract images per page directly from the raw bytes of a PDF. */
export declare function extractImagesPerPageAsync(buffer: Buffer): Promise<Array<PageImage>>
|
|
46
|
+
|
|
47
|
+
/** Sync: extract per-page text split into header, body and footer, directly from the raw bytes of a PDF. */
export declare function extractStructuredTextPerPage(buffer: Buffer): Array<StructuredPageText>
|
|
48
|
+
|
|
49
|
+
/** Async: extract per-page text split into header, body and footer, directly from the raw bytes of a PDF. */
export declare function extractStructuredTextPerPageAsync(buffer: Buffer): Promise<Array<StructuredPageText>>
|
|
50
|
+
|
|
51
|
+
/** Sync: extract native (non-OCR) text per page directly from the raw bytes of a PDF. */
export declare function extractTextPerPage(buffer: Buffer): Array<PageText>
|
|
52
|
+
|
|
53
|
+
/** Async: extract native (non-OCR) text per page directly from the raw bytes of a PDF. */
export declare function extractTextPerPageAsync(buffer: Buffer): Promise<Array<PageText>>
|
|
54
|
+
|
|
55
|
+
/**
 * Sync: per-page text extraction with OCR fallback, directly from the raw bytes of a PDF.
 * Each result carries a `source` saying whether the text is native or OCR output.
 * The README advises the Async variant: this call blocks the event loop while OCR runs.
 */
export declare function extractTextWithOcrPerPage(buffer: Buffer, opts?: OcrOptions | undefined | null): Array<OcrPageText>
|
|
56
|
+
|
|
57
|
+
/**
 * Async: per-page text extraction with OCR fallback, directly from the raw bytes of a PDF.
 * Each result carries a `source` saying whether the text is native or OCR output.
 */
export declare function extractTextWithOcrPerPageAsync(buffer: Buffer, opts?: OcrOptions | undefined | null): Promise<Array<OcrPageText>>
|
|
58
|
+
|
|
59
|
+
/** Tuning knobs for the OCR-capable APIs. All fields are optional; defaults below are the ones stated in the package README. */
export interface OcrOptions {
|
|
60
|
+
/** Tesseract language code. README default: "eng". */
lang?: string
|
|
61
|
+
/** Non-whitespace character threshold; a page with fewer native characters falls back to OCR. README default: 1. */
minTextLength?: number
|
|
62
|
+
/** Cap on Rayon threads used for OCR parallelism. README default: 4, clamped to [1, available CPUs]. */
maxThreads?: number
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/** Text of one page as returned by the OCR-capable APIs. */
export interface OcrPageText {
|
|
66
|
+
/** Page this entry belongs to. */
page: number
|
|
67
|
+
/** Extracted text of the page. */
text: string
|
|
68
|
+
/** Whether `text` came from native PDF extraction or from OCR. */
source: TextSource
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
/** Same header/body/footer split as StructuredPageText, plus the origin of the page's text. */
export interface OcrStructuredPageText {
|
|
72
|
+
/** Page this entry belongs to. */
page: number
|
|
73
|
+
header: string
|
|
74
|
+
body: string
|
|
75
|
+
footer: string
|
|
76
|
+
/** Whether this page's content came from native PDF extraction or from OCR. */
source: TextSource
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/** One annotation found on a page (the README summarises these as links, destinations and rects). */
export interface PageAnnotation {
|
|
80
|
+
/** Page the annotation was found on. */
page: number
|
|
81
|
+
/** Annotation subtype as a string. NOTE(review): presumably the PDF /Subtype name, e.g. "Link" -- confirm. */
subtype: string
|
|
82
|
+
/** Annotation rectangle. NOTE(review): presumably four numbers in PDF user space -- confirm order and units. */
rect: Array<number>
|
|
83
|
+
/** Present only for some annotations. NOTE(review): presumably the target of an external link -- confirm. */
uri?: string
|
|
84
|
+
/** Present only for some annotations. NOTE(review): presumably an in-document destination -- confirm. */
dest?: string
|
|
85
|
+
/** Present only for some annotations. */
content?: string
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/** One image extracted from a page (the README describes these as decoded PNGs with dimensions and color space). */
export interface PageImage {
|
|
89
|
+
/** Page the image was found on. */
page: number
|
|
90
|
+
/** Index of the image. NOTE(review): presumably its position within the page -- confirm. */
imageIndex: number
|
|
91
|
+
width: number
|
|
92
|
+
height: number
|
|
93
|
+
/** Image bytes; the README example base64-encodes them into a `data:image/png` URL. */
data: Buffer
|
|
94
|
+
colorSpace: string
|
|
95
|
+
bitsPerComponent: number
|
|
96
|
+
/** NOTE(review): presumably the PDF stream filter of the source image -- confirm. */
filter: string
|
|
97
|
+
xobjectName: string
|
|
98
|
+
objectId: string
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/** Natively extracted text of one page (no OCR, hence no `source` field; compare OcrPageText). */
export interface PageText {
|
|
102
|
+
/** Page this entry belongs to. */
page: number
|
|
103
|
+
/** Extracted text of the page. */
text: string
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/** Sync: extract everything (metadata, text, structured text, images, annotations) from the raw bytes of a PDF, using native text only. */
export declare function pdfDocument(buffer: Buffer): PdfDocument
|
|
107
|
+
|
|
108
|
+
/**
 * Everything extracted from a PDF in a single call, with native PDF text only.
 * Carries the same metadata fields as PdfMeta plus counts and the extracted content.
 */
export interface PdfDocument {
|
|
109
|
+
version: string
|
|
110
|
+
isLinearized: boolean
|
|
111
|
+
pageCount: number
|
|
112
|
+
creator?: string
|
|
113
|
+
producer?: string
|
|
114
|
+
/** Optional. NOTE(review): string format is not specified in this file -- confirm before parsing as a date. */
creationDate?: string
|
|
115
|
+
/** Optional. NOTE(review): string format is not specified in this file -- confirm before parsing as a date. */
modificationDate?: string
|
|
116
|
+
totalImages: number
|
|
117
|
+
totalAnnotations: number
|
|
118
|
+
/** NOTE(review): presumably the pages that contain at least one image -- confirm. */
imagePages: Array<number>
|
|
119
|
+
/** NOTE(review): presumably the pages that contain at least one annotation -- confirm. */
annotationPages: Array<number>
|
|
120
|
+
/** Per-page native text. */
text: Array<PageText>
|
|
121
|
+
/** Per-page text split into header, body and footer. */
structuredText: Array<StructuredPageText>
|
|
122
|
+
images: Array<PageImage>
|
|
123
|
+
annotations: Array<PageAnnotation>
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
/** Async: extract everything (metadata, text, structured text, images, annotations) from the raw bytes of a PDF, using native text only. */
export declare function pdfDocumentAsync(buffer: Buffer): Promise<PdfDocument>
|
|
127
|
+
|
|
128
|
+
/**
 * Sync: extract everything from the raw bytes of a PDF, with OCR text fallback.
 * The README advises the Async variant: this call blocks the event loop while OCR runs.
 */
export declare function pdfDocumentOcr(buffer: Buffer, opts?: OcrOptions | undefined | null): PdfDocumentOcr
|
|
129
|
+
|
|
130
|
+
/**
 * Everything extracted from a PDF in a single call, with OCR text fallback.
 * Differs from PdfDocument only in `text` and `structuredText`, whose entries
 * carry a `source` tag; the README states images, annotations and metadata are identical.
 */
export interface PdfDocumentOcr {
|
|
131
|
+
version: string
|
|
132
|
+
isLinearized: boolean
|
|
133
|
+
pageCount: number
|
|
134
|
+
creator?: string
|
|
135
|
+
producer?: string
|
|
136
|
+
/** Optional. NOTE(review): string format is not specified in this file -- confirm before parsing as a date. */
creationDate?: string
|
|
137
|
+
/** Optional. NOTE(review): string format is not specified in this file -- confirm before parsing as a date. */
modificationDate?: string
|
|
138
|
+
totalImages: number
|
|
139
|
+
totalAnnotations: number
|
|
140
|
+
/** NOTE(review): presumably the pages that contain at least one image -- confirm. */
imagePages: Array<number>
|
|
141
|
+
/** NOTE(review): presumably the pages that contain at least one annotation -- confirm. */
annotationPages: Array<number>
|
|
142
|
+
/** Per-page text, each entry tagged as native or OCR. */
text: Array<OcrPageText>
|
|
143
|
+
/** Per-page header/body/footer split, each entry tagged as native or OCR. */
structuredText: Array<OcrStructuredPageText>
|
|
144
|
+
images: Array<PageImage>
|
|
145
|
+
annotations: Array<PageAnnotation>
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/** Async: extract everything from the raw bytes of a PDF, with OCR text fallback. */
export declare function pdfDocumentOcrAsync(buffer: Buffer, opts?: OcrOptions | undefined | null): Promise<PdfDocumentOcr>
|
|
149
|
+
|
|
150
|
+
/** Document-level metadata of a PDF. The optional fields may be absent from the result. */
export interface PdfMeta {
|
|
151
|
+
pageCount: number
|
|
152
|
+
/** PDF version as a string. */
version: string
|
|
153
|
+
isLinearized: boolean
|
|
154
|
+
creator?: string
|
|
155
|
+
producer?: string
|
|
156
|
+
/** NOTE(review): string format is not specified in this file -- confirm before parsing as a date. */
creationDate?: string
|
|
157
|
+
/** NOTE(review): string format is not specified in this file -- confirm before parsing as a date. */
modificationDate?: string
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/** Sync: get PDF metadata directly from the raw bytes of a PDF. */
export declare function pdfMetadata(buffer: Buffer): PdfMeta
|
|
161
|
+
|
|
162
|
+
/** Async: get PDF metadata directly from the raw bytes of a PDF. */
export declare function pdfMetadataAsync(buffer: Buffer): Promise<PdfMeta>
|
|
163
|
+
|
|
164
|
+
/**
 * Native text of one page split into header, body and footer.
 * Per the README, repeated header/footer lines are detected across pages by
 * frequency analysis, which requires 3+ pages.
 */
export interface StructuredPageText {
|
|
165
|
+
/** Page this entry belongs to. */
page: number
|
|
166
|
+
header: string
|
|
167
|
+
body: string
|
|
168
|
+
footer: string
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/**
 * Origin of a page's text in the OCR-capable results.
 * NOTE(review): this is a `const enum` in a declaration file; consumers compiling with
 * `isolatedModules` may need to compare against the string literals instead -- confirm.
 */
export declare const enum TextSource {
|
|
172
|
+
/** Text came from native PDF text extraction. */
Native = 'Native',
|
|
173
|
+
/** Text came from the OCR fallback. */
Ocr = 'Ocr'
|
|
174
|
+
}
|
package/index.js
ADDED
|
@@ -0,0 +1,577 @@
|
|
|
1
|
+
// prettier-ignore
|
|
2
|
+
/* eslint-disable */
|
|
3
|
+
// @ts-nocheck
|
|
4
|
+
/* auto-generated by NAPI-RS */
|
|
5
|
+
|
|
6
|
+
const { createRequire } = require('node:module')
|
|
7
|
+
require = createRequire(__filename)
|
|
8
|
+
|
|
9
|
+
const { readFileSync } = require('node:fs')
|
|
10
|
+
// Slot for the loaded native addon. It is only initialised here; the assignment is
// presumably further down the file, outside this chunk -- confirm.
let nativeBinding = null
|
|
11
|
+
// Collects every error raised while requireNative() tries candidate binaries.
const loadErrors = []
|
|
12
|
+
|
|
13
|
+
// Reports whether the current process runs against musl libc.
// Always false off Linux. On Linux three probes are tried in order, and a probe
// is only consulted when the previous one returned null (meaning "could not tell"):
// the ldd script on disk, the Node.js diagnostic report, then `ldd --version`.
const isMusl = () => {
|
|
14
|
+
let musl = false
|
|
15
|
+
if (process.platform === 'linux') {
|
|
16
|
+
musl = isMuslFromFilesystem()
|
|
17
|
+
// null means /usr/bin/ldd could not be read
if (musl === null) {
|
|
18
|
+
musl = isMuslFromReport()
|
|
19
|
+
}
|
|
20
|
+
// null means no diagnostic report was available
if (musl === null) {
|
|
21
|
+
musl = isMuslFromChildProcess()
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
return musl
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// True when the string `f` contains 'libc.musl-' or 'ld-musl-'; used below as the
// predicate over the diagnostic report's sharedObjects entries.
const isFileMusl = (f) => f.includes('libc.musl-') || f.includes('ld-musl-')
|
|
28
|
+
|
|
29
|
+
// Probe 1: read /usr/bin/ldd as text and look for the substring 'musl'.
// Returns true/false from that search, or null when the file cannot be read.
const isMuslFromFilesystem = () => {
|
|
30
|
+
try {
|
|
31
|
+
return readFileSync('/usr/bin/ldd', 'utf-8').includes('musl')
|
|
32
|
+
} catch {
|
|
33
|
+
// null (not false) so the caller moves on to the next probe
return null
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Probe 2: inspect the Node.js diagnostic report.
// Returns null when no report can be obtained, false when the report header has a
// glibcVersionRuntime, true when any sharedObjects entry matches isFileMusl, and
// false otherwise.
const isMuslFromReport = () => {
|
|
38
|
+
let report = null
|
|
39
|
+
if (typeof process.report?.getReport === 'function') {
|
|
40
|
+
// Side effect: this process-wide setting is changed and never restored here.
// NOTE(review): presumably set so report generation skips network details -- confirm.
process.report.excludeNetwork = true
|
|
41
|
+
report = process.report.getReport()
|
|
42
|
+
}
|
|
43
|
+
if (!report) {
|
|
44
|
+
return null
|
|
45
|
+
}
|
|
46
|
+
// a glibc runtime version in the header is treated as "not musl"
if (report.header && report.header.glibcVersionRuntime) {
|
|
47
|
+
return false
|
|
48
|
+
}
|
|
49
|
+
if (Array.isArray(report.sharedObjects)) {
|
|
50
|
+
if (report.sharedObjects.some(isFileMusl)) {
|
|
51
|
+
return true
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
return false
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// Probe 3 (last resort): run `ldd --version` synchronously and look for the
// substring 'musl' in its output. Never returns null: any failure yields false.
const isMuslFromChildProcess = () => {
|
|
58
|
+
try {
|
|
59
|
+
return require('child_process').execSync('ldd --version', { encoding: 'utf8' }).includes('musl')
|
|
60
|
+
} catch (e) {
|
|
61
|
+
// If we reach this case, we cannot tell whether the system is musl or not, so it is better to just fall back to false
|
|
62
|
+
return false
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
function requireNative() {
|
|
67
|
+
if (process.env.NAPI_RS_NATIVE_LIBRARY_PATH) {
|
|
68
|
+
try {
|
|
69
|
+
return require(process.env.NAPI_RS_NATIVE_LIBRARY_PATH);
|
|
70
|
+
} catch (err) {
|
|
71
|
+
loadErrors.push(err)
|
|
72
|
+
}
|
|
73
|
+
} else if (process.platform === 'android') {
|
|
74
|
+
if (process.arch === 'arm64') {
|
|
75
|
+
try {
|
|
76
|
+
return require('./pdfdown_ocr.android-arm64.node')
|
|
77
|
+
} catch (e) {
|
|
78
|
+
loadErrors.push(e)
|
|
79
|
+
}
|
|
80
|
+
try {
|
|
81
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-android-arm64')
|
|
82
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-android-arm64/package.json').version
|
|
83
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
84
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
85
|
+
}
|
|
86
|
+
return binding
|
|
87
|
+
} catch (e) {
|
|
88
|
+
loadErrors.push(e)
|
|
89
|
+
}
|
|
90
|
+
} else if (process.arch === 'arm') {
|
|
91
|
+
try {
|
|
92
|
+
return require('./pdfdown_ocr.android-arm-eabi.node')
|
|
93
|
+
} catch (e) {
|
|
94
|
+
loadErrors.push(e)
|
|
95
|
+
}
|
|
96
|
+
try {
|
|
97
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-android-arm-eabi')
|
|
98
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-android-arm-eabi/package.json').version
|
|
99
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
100
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
101
|
+
}
|
|
102
|
+
return binding
|
|
103
|
+
} catch (e) {
|
|
104
|
+
loadErrors.push(e)
|
|
105
|
+
}
|
|
106
|
+
} else {
|
|
107
|
+
loadErrors.push(new Error(`Unsupported architecture on Android ${process.arch}`))
|
|
108
|
+
}
|
|
109
|
+
} else if (process.platform === 'win32') {
|
|
110
|
+
if (process.arch === 'x64') {
|
|
111
|
+
try {
|
|
112
|
+
return require('./pdfdown_ocr.win32-x64-msvc.node')
|
|
113
|
+
} catch (e) {
|
|
114
|
+
loadErrors.push(e)
|
|
115
|
+
}
|
|
116
|
+
try {
|
|
117
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-win32-x64-msvc')
|
|
118
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-win32-x64-msvc/package.json').version
|
|
119
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
120
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
121
|
+
}
|
|
122
|
+
return binding
|
|
123
|
+
} catch (e) {
|
|
124
|
+
loadErrors.push(e)
|
|
125
|
+
}
|
|
126
|
+
} else if (process.arch === 'ia32') {
|
|
127
|
+
try {
|
|
128
|
+
return require('./pdfdown_ocr.win32-ia32-msvc.node')
|
|
129
|
+
} catch (e) {
|
|
130
|
+
loadErrors.push(e)
|
|
131
|
+
}
|
|
132
|
+
try {
|
|
133
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-win32-ia32-msvc')
|
|
134
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-win32-ia32-msvc/package.json').version
|
|
135
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
136
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
137
|
+
}
|
|
138
|
+
return binding
|
|
139
|
+
} catch (e) {
|
|
140
|
+
loadErrors.push(e)
|
|
141
|
+
}
|
|
142
|
+
} else if (process.arch === 'arm64') {
|
|
143
|
+
try {
|
|
144
|
+
return require('./pdfdown_ocr.win32-arm64-msvc.node')
|
|
145
|
+
} catch (e) {
|
|
146
|
+
loadErrors.push(e)
|
|
147
|
+
}
|
|
148
|
+
try {
|
|
149
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-win32-arm64-msvc')
|
|
150
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-win32-arm64-msvc/package.json').version
|
|
151
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
152
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
153
|
+
}
|
|
154
|
+
return binding
|
|
155
|
+
} catch (e) {
|
|
156
|
+
loadErrors.push(e)
|
|
157
|
+
}
|
|
158
|
+
} else {
|
|
159
|
+
loadErrors.push(new Error(`Unsupported architecture on Windows: ${process.arch}`))
|
|
160
|
+
}
|
|
161
|
+
} else if (process.platform === 'darwin') {
|
|
162
|
+
try {
|
|
163
|
+
return require('./pdfdown_ocr.darwin-universal.node')
|
|
164
|
+
} catch (e) {
|
|
165
|
+
loadErrors.push(e)
|
|
166
|
+
}
|
|
167
|
+
try {
|
|
168
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-darwin-universal')
|
|
169
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-darwin-universal/package.json').version
|
|
170
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
171
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
172
|
+
}
|
|
173
|
+
return binding
|
|
174
|
+
} catch (e) {
|
|
175
|
+
loadErrors.push(e)
|
|
176
|
+
}
|
|
177
|
+
if (process.arch === 'x64') {
|
|
178
|
+
try {
|
|
179
|
+
return require('./pdfdown_ocr.darwin-x64.node')
|
|
180
|
+
} catch (e) {
|
|
181
|
+
loadErrors.push(e)
|
|
182
|
+
}
|
|
183
|
+
try {
|
|
184
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-darwin-x64')
|
|
185
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-darwin-x64/package.json').version
|
|
186
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
187
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
188
|
+
}
|
|
189
|
+
return binding
|
|
190
|
+
} catch (e) {
|
|
191
|
+
loadErrors.push(e)
|
|
192
|
+
}
|
|
193
|
+
} else if (process.arch === 'arm64') {
|
|
194
|
+
try {
|
|
195
|
+
return require('./pdfdown_ocr.darwin-arm64.node')
|
|
196
|
+
} catch (e) {
|
|
197
|
+
loadErrors.push(e)
|
|
198
|
+
}
|
|
199
|
+
try {
|
|
200
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-darwin-arm64')
|
|
201
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-darwin-arm64/package.json').version
|
|
202
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
203
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
204
|
+
}
|
|
205
|
+
return binding
|
|
206
|
+
} catch (e) {
|
|
207
|
+
loadErrors.push(e)
|
|
208
|
+
}
|
|
209
|
+
} else {
|
|
210
|
+
loadErrors.push(new Error(`Unsupported architecture on macOS: ${process.arch}`))
|
|
211
|
+
}
|
|
212
|
+
} else if (process.platform === 'freebsd') {
|
|
213
|
+
if (process.arch === 'x64') {
|
|
214
|
+
try {
|
|
215
|
+
return require('./pdfdown_ocr.freebsd-x64.node')
|
|
216
|
+
} catch (e) {
|
|
217
|
+
loadErrors.push(e)
|
|
218
|
+
}
|
|
219
|
+
try {
|
|
220
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-freebsd-x64')
|
|
221
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-freebsd-x64/package.json').version
|
|
222
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
223
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
224
|
+
}
|
|
225
|
+
return binding
|
|
226
|
+
} catch (e) {
|
|
227
|
+
loadErrors.push(e)
|
|
228
|
+
}
|
|
229
|
+
} else if (process.arch === 'arm64') {
|
|
230
|
+
try {
|
|
231
|
+
return require('./pdfdown_ocr.freebsd-arm64.node')
|
|
232
|
+
} catch (e) {
|
|
233
|
+
loadErrors.push(e)
|
|
234
|
+
}
|
|
235
|
+
try {
|
|
236
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-freebsd-arm64')
|
|
237
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-freebsd-arm64/package.json').version
|
|
238
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
239
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
240
|
+
}
|
|
241
|
+
return binding
|
|
242
|
+
} catch (e) {
|
|
243
|
+
loadErrors.push(e)
|
|
244
|
+
}
|
|
245
|
+
} else {
|
|
246
|
+
loadErrors.push(new Error(`Unsupported architecture on FreeBSD: ${process.arch}`))
|
|
247
|
+
}
|
|
248
|
+
} else if (process.platform === 'linux') {
|
|
249
|
+
if (process.arch === 'x64') {
|
|
250
|
+
if (isMusl()) {
|
|
251
|
+
try {
|
|
252
|
+
return require('./pdfdown_ocr.linux-x64-musl.node')
|
|
253
|
+
} catch (e) {
|
|
254
|
+
loadErrors.push(e)
|
|
255
|
+
}
|
|
256
|
+
try {
|
|
257
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-x64-musl')
|
|
258
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-x64-musl/package.json').version
|
|
259
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
260
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
261
|
+
}
|
|
262
|
+
return binding
|
|
263
|
+
} catch (e) {
|
|
264
|
+
loadErrors.push(e)
|
|
265
|
+
}
|
|
266
|
+
} else {
|
|
267
|
+
try {
|
|
268
|
+
return require('./pdfdown_ocr.linux-x64-gnu.node')
|
|
269
|
+
} catch (e) {
|
|
270
|
+
loadErrors.push(e)
|
|
271
|
+
}
|
|
272
|
+
try {
|
|
273
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-x64-gnu')
|
|
274
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-x64-gnu/package.json').version
|
|
275
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
276
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
277
|
+
}
|
|
278
|
+
return binding
|
|
279
|
+
} catch (e) {
|
|
280
|
+
loadErrors.push(e)
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
} else if (process.arch === 'arm64') {
|
|
284
|
+
if (isMusl()) {
|
|
285
|
+
try {
|
|
286
|
+
return require('./pdfdown_ocr.linux-arm64-musl.node')
|
|
287
|
+
} catch (e) {
|
|
288
|
+
loadErrors.push(e)
|
|
289
|
+
}
|
|
290
|
+
try {
|
|
291
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm64-musl')
|
|
292
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm64-musl/package.json').version
|
|
293
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
294
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
295
|
+
}
|
|
296
|
+
return binding
|
|
297
|
+
} catch (e) {
|
|
298
|
+
loadErrors.push(e)
|
|
299
|
+
}
|
|
300
|
+
} else {
|
|
301
|
+
try {
|
|
302
|
+
return require('./pdfdown_ocr.linux-arm64-gnu.node')
|
|
303
|
+
} catch (e) {
|
|
304
|
+
loadErrors.push(e)
|
|
305
|
+
}
|
|
306
|
+
try {
|
|
307
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm64-gnu')
|
|
308
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm64-gnu/package.json').version
|
|
309
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
310
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
311
|
+
}
|
|
312
|
+
return binding
|
|
313
|
+
} catch (e) {
|
|
314
|
+
loadErrors.push(e)
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
} else if (process.arch === 'arm') {
|
|
318
|
+
if (isMusl()) {
|
|
319
|
+
try {
|
|
320
|
+
return require('./pdfdown_ocr.linux-arm-musleabihf.node')
|
|
321
|
+
} catch (e) {
|
|
322
|
+
loadErrors.push(e)
|
|
323
|
+
}
|
|
324
|
+
try {
|
|
325
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm-musleabihf')
|
|
326
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm-musleabihf/package.json').version
|
|
327
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
328
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
329
|
+
}
|
|
330
|
+
return binding
|
|
331
|
+
} catch (e) {
|
|
332
|
+
loadErrors.push(e)
|
|
333
|
+
}
|
|
334
|
+
} else {
|
|
335
|
+
try {
|
|
336
|
+
return require('./pdfdown_ocr.linux-arm-gnueabihf.node')
|
|
337
|
+
} catch (e) {
|
|
338
|
+
loadErrors.push(e)
|
|
339
|
+
}
|
|
340
|
+
try {
|
|
341
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-arm-gnueabihf')
|
|
342
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-arm-gnueabihf/package.json').version
|
|
343
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
344
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
345
|
+
}
|
|
346
|
+
return binding
|
|
347
|
+
} catch (e) {
|
|
348
|
+
loadErrors.push(e)
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
} else if (process.arch === 'loong64') {
|
|
352
|
+
if (isMusl()) {
|
|
353
|
+
try {
|
|
354
|
+
return require('./pdfdown_ocr.linux-loong64-musl.node')
|
|
355
|
+
} catch (e) {
|
|
356
|
+
loadErrors.push(e)
|
|
357
|
+
}
|
|
358
|
+
try {
|
|
359
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-loong64-musl')
|
|
360
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-loong64-musl/package.json').version
|
|
361
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
362
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
363
|
+
}
|
|
364
|
+
return binding
|
|
365
|
+
} catch (e) {
|
|
366
|
+
loadErrors.push(e)
|
|
367
|
+
}
|
|
368
|
+
} else {
|
|
369
|
+
try {
|
|
370
|
+
return require('./pdfdown_ocr.linux-loong64-gnu.node')
|
|
371
|
+
} catch (e) {
|
|
372
|
+
loadErrors.push(e)
|
|
373
|
+
}
|
|
374
|
+
try {
|
|
375
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-loong64-gnu')
|
|
376
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-loong64-gnu/package.json').version
|
|
377
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
378
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
379
|
+
}
|
|
380
|
+
return binding
|
|
381
|
+
} catch (e) {
|
|
382
|
+
loadErrors.push(e)
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
} else if (process.arch === 'riscv64') {
|
|
386
|
+
if (isMusl()) {
|
|
387
|
+
try {
|
|
388
|
+
return require('./pdfdown_ocr.linux-riscv64-musl.node')
|
|
389
|
+
} catch (e) {
|
|
390
|
+
loadErrors.push(e)
|
|
391
|
+
}
|
|
392
|
+
try {
|
|
393
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-musl')
|
|
394
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-musl/package.json').version
|
|
395
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
396
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
397
|
+
}
|
|
398
|
+
return binding
|
|
399
|
+
} catch (e) {
|
|
400
|
+
loadErrors.push(e)
|
|
401
|
+
}
|
|
402
|
+
} else {
|
|
403
|
+
try {
|
|
404
|
+
return require('./pdfdown_ocr.linux-riscv64-gnu.node')
|
|
405
|
+
} catch (e) {
|
|
406
|
+
loadErrors.push(e)
|
|
407
|
+
}
|
|
408
|
+
try {
|
|
409
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-gnu')
|
|
410
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-riscv64-gnu/package.json').version
|
|
411
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
412
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
413
|
+
}
|
|
414
|
+
return binding
|
|
415
|
+
} catch (e) {
|
|
416
|
+
loadErrors.push(e)
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
} else if (process.arch === 'ppc64') {
|
|
420
|
+
try {
|
|
421
|
+
return require('./pdfdown_ocr.linux-ppc64-gnu.node')
|
|
422
|
+
} catch (e) {
|
|
423
|
+
loadErrors.push(e)
|
|
424
|
+
}
|
|
425
|
+
try {
|
|
426
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-ppc64-gnu')
|
|
427
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-ppc64-gnu/package.json').version
|
|
428
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
429
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
430
|
+
}
|
|
431
|
+
return binding
|
|
432
|
+
} catch (e) {
|
|
433
|
+
loadErrors.push(e)
|
|
434
|
+
}
|
|
435
|
+
} else if (process.arch === 's390x') {
|
|
436
|
+
try {
|
|
437
|
+
return require('./pdfdown_ocr.linux-s390x-gnu.node')
|
|
438
|
+
} catch (e) {
|
|
439
|
+
loadErrors.push(e)
|
|
440
|
+
}
|
|
441
|
+
try {
|
|
442
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-linux-s390x-gnu')
|
|
443
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-linux-s390x-gnu/package.json').version
|
|
444
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
445
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
446
|
+
}
|
|
447
|
+
return binding
|
|
448
|
+
} catch (e) {
|
|
449
|
+
loadErrors.push(e)
|
|
450
|
+
}
|
|
451
|
+
} else {
|
|
452
|
+
loadErrors.push(new Error(`Unsupported architecture on Linux: ${process.arch}`))
|
|
453
|
+
}
|
|
454
|
+
} else if (process.platform === 'openharmony') {
|
|
455
|
+
if (process.arch === 'arm64') {
|
|
456
|
+
try {
|
|
457
|
+
return require('./pdfdown_ocr.openharmony-arm64.node')
|
|
458
|
+
} catch (e) {
|
|
459
|
+
loadErrors.push(e)
|
|
460
|
+
}
|
|
461
|
+
try {
|
|
462
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-openharmony-arm64')
|
|
463
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-openharmony-arm64/package.json').version
|
|
464
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
465
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
466
|
+
}
|
|
467
|
+
return binding
|
|
468
|
+
} catch (e) {
|
|
469
|
+
loadErrors.push(e)
|
|
470
|
+
}
|
|
471
|
+
} else if (process.arch === 'x64') {
|
|
472
|
+
try {
|
|
473
|
+
return require('./pdfdown_ocr.openharmony-x64.node')
|
|
474
|
+
} catch (e) {
|
|
475
|
+
loadErrors.push(e)
|
|
476
|
+
}
|
|
477
|
+
try {
|
|
478
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-openharmony-x64')
|
|
479
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-openharmony-x64/package.json').version
|
|
480
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
481
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
482
|
+
}
|
|
483
|
+
return binding
|
|
484
|
+
} catch (e) {
|
|
485
|
+
loadErrors.push(e)
|
|
486
|
+
}
|
|
487
|
+
} else if (process.arch === 'arm') {
|
|
488
|
+
try {
|
|
489
|
+
return require('./pdfdown_ocr.openharmony-arm.node')
|
|
490
|
+
} catch (e) {
|
|
491
|
+
loadErrors.push(e)
|
|
492
|
+
}
|
|
493
|
+
try {
|
|
494
|
+
const binding = require('@d0paminedriven/pdfdown-ocr-openharmony-arm')
|
|
495
|
+
const bindingPackageVersion = require('@d0paminedriven/pdfdown-ocr-openharmony-arm/package.json').version
|
|
496
|
+
if (bindingPackageVersion !== '0.9.1' && process.env.NAPI_RS_ENFORCE_VERSION_CHECK && process.env.NAPI_RS_ENFORCE_VERSION_CHECK !== '0') {
|
|
497
|
+
throw new Error(`Native binding package version mismatch, expected 0.9.1 but got ${bindingPackageVersion}. You can reinstall dependencies to fix this issue.`)
|
|
498
|
+
}
|
|
499
|
+
return binding
|
|
500
|
+
} catch (e) {
|
|
501
|
+
loadErrors.push(e)
|
|
502
|
+
}
|
|
503
|
+
} else {
|
|
504
|
+
loadErrors.push(new Error(`Unsupported architecture on OpenHarmony: ${process.arch}`))
|
|
505
|
+
}
|
|
506
|
+
} else {
|
|
507
|
+
loadErrors.push(new Error(`Unsupported OS: ${process.platform}, architecture: ${process.arch}`))
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
// Try the platform-specific native addon first. Every failed attempt inside
// requireNative() is appended to loadErrors; when no candidate loads, the
// function falls through without returning a binding.
nativeBinding = requireNative()

// WASI fallback. Entered when no native addon was loaded, or when the
// NAPI_RS_FORCE_WASI environment variable is set to any non-empty value.
if (!nativeBinding || process.env.NAPI_RS_FORCE_WASI) {
  let wasiBinding = null
  let wasiBindingError = null

  // First choice: the WASI build shipped next to this file.
  try {
    wasiBinding = require('./pdfdown_ocr.wasi.cjs')
    nativeBinding = wasiBinding
  } catch (err) {
    // The failure is only remembered when WASI was explicitly requested.
    if (process.env.NAPI_RS_FORCE_WASI) {
      wasiBindingError = err
    }
  }

  // Second choice: the separately installed WASI package.
  // This is gated on wasiBinding (not nativeBinding) so that a forced WASI
  // load still tries the package when a native addon is already present and
  // the local .wasi.cjs file is missing.
  if (!wasiBinding) {
    try {
      wasiBinding = require('@d0paminedriven/pdfdown-ocr-wasm32-wasi')
      nativeBinding = wasiBinding
    } catch (err) {
      if (process.env.NAPI_RS_FORCE_WASI) {
        // wasiBindingError is still null when the first attempt did not
        // record a failure; never dereference it blindly.
        if (wasiBindingError) {
          wasiBindingError.cause = err
        } else {
          wasiBindingError = err
        }
        loadErrors.push(err)
      }
    }
  }

  // NAPI_RS_FORCE_WASI=error turns a missing WASI binding into a hard failure
  // instead of silently keeping whatever native binding was loaded.
  if (process.env.NAPI_RS_FORCE_WASI === 'error' && !wasiBinding) {
    const error = new Error('WASI binding not found and NAPI_RS_FORCE_WASI is set to error')
    error.cause = wasiBindingError
    throw error
  }
}

// Nothing could be loaded: fail at require-time with as much context as possible.
if (!nativeBinding) {
  if (loadErrors.length > 0) {
    throw new Error(
      `Cannot find native binding. ` +
        `npm has a bug related to optional dependencies (https://github.com/npm/cli/issues/4828). ` +
        'Please try `npm i` again after removing both package-lock.json and node_modules directory.',
      {
        // Link the collected errors into a single `cause` chain: each error's
        // cause becomes the error recorded before it, and the most recent
        // error is attached to the thrown Error.
        cause: loadErrors.reduce((err, cur) => {
          cur.cause = err
          return cur
        }),
      },
    )
  }
  throw new Error(`Failed to load native binding`)
}

// Export the binding object itself, then re-state each export explicitly.
// NOTE(review): the per-name assignments are no-ops at runtime when the
// property already exists on the binding; they are presumably kept as literal
// `module.exports.X = ...` statements so tooling can statically detect the
// named exports (e.g. for ESM importers). Keep them literal; do not replace
// them with a loop.
module.exports = nativeBinding
module.exports.PdfDown = nativeBinding.PdfDown
module.exports.extractAnnotationsPerPage = nativeBinding.extractAnnotationsPerPage
module.exports.extractAnnotationsPerPageAsync = nativeBinding.extractAnnotationsPerPageAsync
module.exports.extractImagesPerPage = nativeBinding.extractImagesPerPage
module.exports.extractImagesPerPageAsync = nativeBinding.extractImagesPerPageAsync
module.exports.extractStructuredTextPerPage = nativeBinding.extractStructuredTextPerPage
module.exports.extractStructuredTextPerPageAsync = nativeBinding.extractStructuredTextPerPageAsync
module.exports.extractTextPerPage = nativeBinding.extractTextPerPage
module.exports.extractTextPerPageAsync = nativeBinding.extractTextPerPageAsync
module.exports.extractTextWithOcrPerPage = nativeBinding.extractTextWithOcrPerPage
module.exports.extractTextWithOcrPerPageAsync = nativeBinding.extractTextWithOcrPerPageAsync
module.exports.pdfDocument = nativeBinding.pdfDocument
module.exports.pdfDocumentAsync = nativeBinding.pdfDocumentAsync
module.exports.pdfDocumentOcr = nativeBinding.pdfDocumentOcr
module.exports.pdfDocumentOcrAsync = nativeBinding.pdfDocumentOcrAsync
module.exports.pdfMetadata = nativeBinding.pdfMetadata
module.exports.pdfMetadataAsync = nativeBinding.pdfMetadataAsync
module.exports.TextSource = nativeBinding.TextSource
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d0paminedriven/pdfdown-ocr",
|
|
3
|
-
"version": "0.8.0",
|
|
3
|
+
"version": "0.9.1",
|
|
4
4
|
"description": "Rust powered PDF extraction for Node with OCR fallback (requires system tesseract).",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"repository": {
|
|
@@ -10,7 +10,8 @@
|
|
|
10
10
|
"license": "MIT",
|
|
11
11
|
"files": [
|
|
12
12
|
"index.d.ts",
|
|
13
|
-
"index.js"
|
|
13
|
+
"index.js",
|
|
14
|
+
"README.md"
|
|
14
15
|
],
|
|
15
16
|
"os": [
|
|
16
17
|
"darwin",
|