@heripo/pdf-parser 0.1.15 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +292 -103
- package/README.md +293 -104
- package/dist/index.cjs +2679 -2671
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -12
- package/dist/index.d.ts +3 -12
- package/dist/index.js +2701 -2694
- package/dist/index.js.map +1 -1
- package/package.json +5 -5
package/README.md
CHANGED
|
@@ -21,6 +21,11 @@
|
|
|
21
21
|
- [Installation](#installation)
|
|
22
22
|
- [Usage](#usage)
|
|
23
23
|
- [OCR Strategy System](#ocr-strategy-system)
|
|
24
|
+
- [Document Type Validation](#document-type-validation)
|
|
25
|
+
- [Large PDF Chunked Conversion](#large-pdf-chunked-conversion)
|
|
26
|
+
- [Image PDF Fallback](#image-pdf-fallback)
|
|
27
|
+
- [AbortSignal Support](#abortsignal-support)
|
|
28
|
+
- [Server Crash Recovery](#server-crash-recovery)
|
|
24
29
|
- [Why macOS Only?](#why-macos-only)
|
|
25
30
|
- [System Dependencies Details](#system-dependencies-details)
|
|
26
31
|
- [API Documentation](#api-documentation)
|
|
@@ -34,7 +39,11 @@
|
|
|
34
39
|
- **Apple Silicon Optimized**: GPU acceleration on M1/M2/M3/M4/M5 chips
|
|
35
40
|
- **Automatic Environment Setup**: Automatic Python virtual environment and docling-serve installation
|
|
36
41
|
- **Image Extraction**: Automatic extraction and saving of images from PDFs
|
|
37
|
-
- **
|
|
42
|
+
- **Document Type Validation**: Optional LLM-based validation that a PDF is an archaeological report
|
|
43
|
+
- **Chunked Conversion**: Split large PDFs into chunks for reliable processing
|
|
44
|
+
- **Image PDF Fallback**: Automatic fallback to image-based PDF when conversion fails
|
|
45
|
+
- **AbortSignal Support**: Cancel ongoing parsing operations
|
|
46
|
+
- **Server Crash Recovery**: Automatic restart of docling-serve on ECONNREFUSED
|
|
38
47
|
|
|
39
48
|
## Prerequisites
|
|
40
49
|
|
|
@@ -60,7 +69,7 @@ npm install -g pnpm
|
|
|
60
69
|
|
|
61
70
|
#### 3. Python 3.9 - 3.12
|
|
62
71
|
|
|
63
|
-
|
|
72
|
+
> **Important**: Python 3.13+ is not supported. Some Docling SDK dependencies are not compatible with Python 3.13.
|
|
64
73
|
|
|
65
74
|
```bash
|
|
66
75
|
# Install Python 3.11 (recommended)
|
|
@@ -92,11 +101,19 @@ Installed by default on macOS. Verify:
|
|
|
92
101
|
which lsof
|
|
93
102
|
```
|
|
94
103
|
|
|
104
|
+
#### 7. ImageMagick + Ghostscript (optional)
|
|
105
|
+
|
|
106
|
+
Required only when using the image PDF fallback feature (`enableImagePdfFallback` or `forceImagePdf`).
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
brew install imagemagick ghostscript
|
|
110
|
+
```
|
|
111
|
+
|
|
95
112
|
### First Run Setup
|
|
96
113
|
|
|
97
114
|
When using `@heripo/pdf-parser` for the first time, it automatically:
|
|
98
115
|
|
|
99
|
-
1. Creates Python virtual environment at
|
|
116
|
+
1. Creates a Python virtual environment at `.venv` in the current working directory (configurable via `venvPath`)
|
|
100
117
|
2. Installs `docling-serve` and dependencies
|
|
101
118
|
3. Starts the docling-serve process on a local port
|
|
102
119
|
|
|
@@ -125,9 +142,9 @@ import { PDFParser } from '@heripo/pdf-parser';
|
|
|
125
142
|
|
|
126
143
|
const logger = Logger(...);
|
|
127
144
|
|
|
128
|
-
// Create PDFParser instance
|
|
145
|
+
// Create PDFParser instance (logger is required)
|
|
129
146
|
const pdfParser = new PDFParser({
|
|
130
|
-
|
|
147
|
+
port: 5001,
|
|
131
148
|
logger,
|
|
132
149
|
});
|
|
133
150
|
|
|
@@ -135,58 +152,68 @@ const pdfParser = new PDFParser({
|
|
|
135
152
|
await pdfParser.init();
|
|
136
153
|
|
|
137
154
|
// Parse PDF
|
|
138
|
-
const
|
|
139
|
-
'path/to/report.pdf', //
|
|
140
|
-
'
|
|
141
|
-
(
|
|
155
|
+
const tokenUsageReport = await pdfParser.parse(
|
|
156
|
+
'file:///path/to/report.pdf', // PDF URL (file:// or http://)
|
|
157
|
+
'report-001', // Report ID
|
|
158
|
+
async (outputPath) => {
|
|
142
159
|
// Conversion complete callback
|
|
143
|
-
console.log('PDF conversion complete:',
|
|
160
|
+
console.log('PDF conversion complete:', outputPath);
|
|
144
161
|
},
|
|
162
|
+
false, // cleanupAfterCallback
|
|
163
|
+
{}, // PDFConvertOptions
|
|
145
164
|
);
|
|
146
165
|
|
|
147
|
-
//
|
|
148
|
-
console.log('
|
|
166
|
+
// Token usage report (null when no LLM usage)
|
|
167
|
+
console.log('Token usage:', tokenUsageReport);
|
|
149
168
|
```
|
|
150
169
|
|
|
151
170
|
### Advanced Options
|
|
152
171
|
|
|
153
172
|
```typescript
|
|
173
|
+
// Option A: Use local server with port
|
|
154
174
|
const pdfParser = new PDFParser({
|
|
155
|
-
pythonPath: 'python3.11',
|
|
156
175
|
logger,
|
|
176
|
+
port: 5001, // Port to use (default: 5001)
|
|
177
|
+
timeout: 10000000, // Timeout (milliseconds)
|
|
178
|
+
venvPath: '/custom/path/.venv', // Custom venv path (default: CWD/.venv)
|
|
179
|
+
killExistingProcess: true, // Kill existing process on port (default: false)
|
|
180
|
+
enableImagePdfFallback: true, // Enable image PDF fallback (default: false)
|
|
181
|
+
});
|
|
157
182
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
// Use external docling-serve
|
|
163
|
-
externalDoclingUrl: 'http://localhost:5000', // When using external server
|
|
183
|
+
// Option B: Use external docling-serve
|
|
184
|
+
const pdfParser = new PDFParser({
|
|
185
|
+
logger,
|
|
186
|
+
baseUrl: 'http://localhost:5000', // External server URL
|
|
164
187
|
});
|
|
165
188
|
|
|
166
|
-
//
|
|
167
|
-
await pdfParser.parse(
|
|
168
|
-
|
|
169
|
-
|
|
189
|
+
// Parse with conversion options
|
|
190
|
+
const tokenUsageReport = await pdfParser.parse(
|
|
191
|
+
'file:///path/to/input.pdf',
|
|
192
|
+
'report-001',
|
|
193
|
+
async (outputPath) => console.log(outputPath),
|
|
194
|
+
false,
|
|
195
|
+
{
|
|
196
|
+
// OCR strategy options
|
|
197
|
+
strategySamplerModel: openai('gpt-5.1'),
|
|
198
|
+
vlmProcessorModel: openai('gpt-5.1'),
|
|
199
|
+
vlmConcurrency: 3,
|
|
170
200
|
|
|
171
|
-
|
|
172
|
-
|
|
201
|
+
// Document validation
|
|
202
|
+
documentValidationModel: openai('gpt-5.1'),
|
|
173
203
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
204
|
+
// Chunked conversion for large PDFs
|
|
205
|
+
chunkedConversion: true,
|
|
206
|
+
chunkSize: 50,
|
|
207
|
+
chunkMaxRetries: 3,
|
|
178
208
|
|
|
179
|
-
|
|
209
|
+
// Force image PDF pre-conversion
|
|
210
|
+
forceImagePdf: false,
|
|
180
211
|
|
|
181
|
-
|
|
212
|
+
// Document processing timeout (seconds)
|
|
213
|
+
document_timeout: 600,
|
|
182
214
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
'report.pdf',
|
|
186
|
-
'output',
|
|
187
|
-
(resultPath) => {
|
|
188
|
-
// Images are saved in output/images/ directory
|
|
189
|
-
console.log('Image extraction complete:', resultPath);
|
|
215
|
+
// Token usage tracking
|
|
216
|
+
onTokenUsage: (report) => console.log('Token usage:', report),
|
|
190
217
|
},
|
|
191
218
|
);
|
|
192
219
|
```
|
|
@@ -196,15 +223,15 @@ const outputPath = await pdfParser.parse(
|
|
|
196
223
|
Clean up resources after work is complete:
|
|
197
224
|
|
|
198
225
|
```typescript
|
|
199
|
-
// Terminate docling-serve process
|
|
200
|
-
await pdfParser.
|
|
226
|
+
// Terminate docling-serve process and release resources
|
|
227
|
+
await pdfParser.dispose();
|
|
201
228
|
```
|
|
202
229
|
|
|
203
230
|
## OCR Strategy System
|
|
204
231
|
|
|
205
232
|
### Why This Strategy?
|
|
206
233
|
|
|
207
|
-
**ocrmac (Apple Vision Framework) is an excellent OCR engine**
|
|
234
|
+
**ocrmac (Apple Vision Framework) is an excellent OCR engine** -- it's free, GPU-accelerated, and delivers high-quality results. For processing thousands to millions of archaeological reports, there's no better solution.
|
|
208
235
|
|
|
209
236
|
**However, ocrmac cannot handle mixed character systems.** Documents containing Korean-Hanja combinations (and potentially other mixed scripts) produce garbled text for the non-primary script. Rather than switching the entire pipeline to a costly VLM, the system **targets only the affected pages** for VLM correction, minimizing cost and processing time.
|
|
210
237
|
|
|
@@ -219,16 +246,17 @@ When mixed-script pages are detected, only those pages are sent to the VLM for c
|
|
|
219
246
|
|
|
220
247
|
- Extracts OCR text elements and table cells from each page
|
|
221
248
|
- Uses `pdftotext` reference text as a quality anchor
|
|
222
|
-
- VLM returns substitution-based corrections (find
|
|
249
|
+
- VLM returns substitution-based corrections (find -> replace)
|
|
223
250
|
- Failed page corrections are gracefully skipped, preserving original OCR text
|
|
224
251
|
|
|
225
252
|
### Strategy Options
|
|
226
253
|
|
|
227
254
|
```typescript
|
|
228
|
-
const
|
|
229
|
-
'input.pdf',
|
|
230
|
-
'
|
|
231
|
-
(
|
|
255
|
+
const tokenUsageReport = await pdfParser.parse(
|
|
256
|
+
'file:///path/to/input.pdf',
|
|
257
|
+
'report-001',
|
|
258
|
+
async (outputPath) => console.log(outputPath),
|
|
259
|
+
false,
|
|
232
260
|
{
|
|
233
261
|
// Enable OCR strategy sampling (provide a Vision LLM model)
|
|
234
262
|
strategySamplerModel: openai('gpt-5.1'),
|
|
@@ -245,6 +273,114 @@ const outputPath = await pdfParser.parse(
|
|
|
245
273
|
);
|
|
246
274
|
```
|
|
247
275
|
|
|
276
|
+
## Document Type Validation
|
|
277
|
+
|
|
278
|
+
The parser can optionally use an LLM to validate that a PDF is an archaeological investigation report. When `documentValidationModel` is provided, the parser extracts text from the PDF and uses the LLM to verify the document type before processing. If validation fails, an `InvalidDocumentTypeError` is thrown.
|
|
279
|
+
|
|
280
|
+
```typescript
|
|
281
|
+
import { InvalidDocumentTypeError } from '@heripo/pdf-parser';
|
|
282
|
+
|
|
283
|
+
try {
|
|
284
|
+
await pdfParser.parse(
|
|
285
|
+
'file:///path/to/input.pdf',
|
|
286
|
+
'report-001',
|
|
287
|
+
async (outputPath) => console.log(outputPath),
|
|
288
|
+
false,
|
|
289
|
+
{
|
|
290
|
+
documentValidationModel: openai('gpt-5.1'),
|
|
291
|
+
},
|
|
292
|
+
);
|
|
293
|
+
} catch (error) {
|
|
294
|
+
if (error instanceof InvalidDocumentTypeError) {
|
|
295
|
+
console.error('Not an archaeological report:', error.message);
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## Large PDF Chunked Conversion
|
|
301
|
+
|
|
302
|
+
For large PDFs that may cause timeouts or memory issues, enable chunked conversion to split the PDF into smaller chunks and process them individually. Chunked conversion only works with local files (`file://` URLs).
|
|
303
|
+
|
|
304
|
+
```typescript
|
|
305
|
+
const tokenUsageReport = await pdfParser.parse(
|
|
306
|
+
'file:///path/to/large-report.pdf',
|
|
307
|
+
'report-001',
|
|
308
|
+
async (outputPath) => console.log(outputPath),
|
|
309
|
+
false,
|
|
310
|
+
{
|
|
311
|
+
chunkedConversion: true,
|
|
312
|
+
chunkSize: 50, // Pages per chunk (default: configured in constants)
|
|
313
|
+
chunkMaxRetries: 3, // Max retry attempts per failed chunk (default: configured in constants)
|
|
314
|
+
},
|
|
315
|
+
);
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
## Image PDF Fallback
|
|
319
|
+
|
|
320
|
+
When conversion fails, the parser can automatically fall back to first converting the PDF into an image-based PDF and then retrying the conversion. This is useful for PDFs with complex or corrupted structures. This feature requires ImageMagick and Ghostscript.
|
|
321
|
+
|
|
322
|
+
### Automatic Fallback (on failure)
|
|
323
|
+
|
|
324
|
+
Enable via the `enableImagePdfFallback` constructor option. When a conversion fails, the parser automatically retries using an image-based PDF:
|
|
325
|
+
|
|
326
|
+
```typescript
|
|
327
|
+
const pdfParser = new PDFParser({
|
|
328
|
+
logger,
|
|
329
|
+
port: 5001,
|
|
330
|
+
enableImagePdfFallback: true, // Enable automatic fallback
|
|
331
|
+
});
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
### Forced Image PDF (always)
|
|
335
|
+
|
|
336
|
+
Force pre-conversion to an image-based PDF via the `forceImagePdf` parse option:
|
|
337
|
+
|
|
338
|
+
```typescript
|
|
339
|
+
const tokenUsageReport = await pdfParser.parse(
|
|
340
|
+
'file:///path/to/input.pdf',
|
|
341
|
+
'report-001',
|
|
342
|
+
async (outputPath) => console.log(outputPath),
|
|
343
|
+
false,
|
|
344
|
+
{
|
|
345
|
+
forceImagePdf: true, // Always convert to image PDF first
|
|
346
|
+
},
|
|
347
|
+
);
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
If both the original and fallback conversions fail, an `ImagePdfFallbackError` is thrown containing both errors.
|
|
351
|
+
|
|
352
|
+
## AbortSignal Support
|
|
353
|
+
|
|
354
|
+
Pass an `AbortSignal` as the last argument of `parse()` to cancel an ongoing parsing operation:
|
|
355
|
+
|
|
356
|
+
```typescript
|
|
357
|
+
const controller = new AbortController();
|
|
358
|
+
|
|
359
|
+
// Cancel after 5 minutes
|
|
360
|
+
setTimeout(() => controller.abort(), 5 * 60 * 1000);
|
|
361
|
+
|
|
362
|
+
try {
|
|
363
|
+
await pdfParser.parse(
|
|
364
|
+
'file:///path/to/input.pdf',
|
|
365
|
+
'report-001',
|
|
366
|
+
async (outputPath) => console.log(outputPath),
|
|
367
|
+
false,
|
|
368
|
+
{},
|
|
369
|
+
controller.signal, // AbortSignal
|
|
370
|
+
);
|
|
371
|
+
} catch (error) {
|
|
372
|
+
if (error.name === 'AbortError') {
|
|
373
|
+
console.log('Parsing was cancelled');
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
## Server Crash Recovery
|
|
379
|
+
|
|
380
|
+
When using a local docling-serve instance (port mode), the parser automatically detects server crashes (ECONNREFUSED errors) and restarts the server. This happens transparently during `parse()` calls -- the failed operation is retried after the server is restarted.
|
|
381
|
+
|
|
382
|
+
> **Note**: Server crash recovery is only available in local server mode (using `port` option). When using an external server (`baseUrl` option), recovery is not attempted.
|
|
383
|
+
|
|
248
384
|
## Why macOS Only?
|
|
249
385
|
|
|
250
386
|
`@heripo/pdf-parser` **intentionally relies heavily on macOS**. The key reason for this decision is **Docling SDK's local OCR performance**.
|
|
@@ -281,14 +417,16 @@ Archaeological excavation report PDFs have the following characteristics:
|
|
|
281
417
|
|
|
282
418
|
`@heripo/pdf-parser` requires the following system-level dependencies:
|
|
283
419
|
|
|
284
|
-
| Dependency
|
|
285
|
-
|
|
|
286
|
-
| Python
|
|
287
|
-
| poppler
|
|
288
|
-
| jq
|
|
289
|
-
| lsof
|
|
420
|
+
| Dependency | Required Version | Installation | Purpose |
|
|
421
|
+
| ----------- | ---------------- | -------------------------- | ----------------------------------------------------------------- |
|
|
422
|
+
| Python | 3.9 - 3.12 | `brew install python@3.11` | Docling SDK runtime |
|
|
423
|
+
| poppler | Any | `brew install poppler` | PDF page counting (pdfinfo) and text layer extraction (pdftotext) |
|
|
424
|
+
| jq | Any | `brew install jq` | JSON processing (conversion result parsing) |
|
|
425
|
+
| lsof | Any | Included with macOS | docling-serve port management |
|
|
426
|
+
| ImageMagick | Any (optional) | `brew install imagemagick` | Image PDF fallback and page rendering |
|
|
427
|
+
| Ghostscript | Any (optional) | `brew install ghostscript` | Image PDF fallback (PDF to image conversion) |
|
|
290
428
|
|
|
291
|
-
>
|
|
429
|
+
> **Python 3.13+ is not supported.** Some Docling SDK dependencies are not compatible with Python 3.13.
|
|
292
430
|
|
|
293
431
|
### Checking Python Version
|
|
294
432
|
|
|
@@ -318,13 +456,16 @@ which jq
|
|
|
318
456
|
#### Constructor Options
|
|
319
457
|
|
|
320
458
|
```typescript
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
}
|
|
459
|
+
type Options = {
|
|
460
|
+
logger: LoggerMethods; // Logger instance (REQUIRED)
|
|
461
|
+
timeout?: number; // Timeout in milliseconds (default: 10000000)
|
|
462
|
+
venvPath?: string; // Python venv path (default: CWD/.venv)
|
|
463
|
+
killExistingProcess?: boolean; // Kill existing process on port (default: false)
|
|
464
|
+
enableImagePdfFallback?: boolean; // Enable image PDF fallback (default: false, requires ImageMagick + Ghostscript)
|
|
465
|
+
} & (
|
|
466
|
+
| { port?: number } // Local server mode (default port: 5001)
|
|
467
|
+
| { baseUrl: string } // External server mode
|
|
468
|
+
);
|
|
328
469
|
```
|
|
329
470
|
|
|
330
471
|
#### Methods
|
|
@@ -337,70 +478,102 @@ Sets up Python environment and starts docling-serve.
|
|
|
337
478
|
await pdfParser.init();
|
|
338
479
|
```
|
|
339
480
|
|
|
340
|
-
##### `parse(
|
|
481
|
+
##### `parse(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal?): Promise<TokenUsageReport | null>`
|
|
341
482
|
|
|
342
483
|
Parses a PDF file.
|
|
343
484
|
|
|
344
485
|
**Parameters:**
|
|
345
486
|
|
|
346
|
-
- `
|
|
347
|
-
- `
|
|
348
|
-
- `
|
|
349
|
-
- `
|
|
487
|
+
- `url` (string): PDF URL (`file://` for local files or `http://` for remote)
|
|
488
|
+
- `reportId` (string): Unique report identifier (used for output directory naming)
|
|
489
|
+
- `onComplete` (ConversionCompleteCallback): Callback invoked with the output directory path when conversion completes
|
|
490
|
+
- `cleanupAfterCallback` (boolean): Whether to delete the output directory after the callback completes
|
|
491
|
+
- `options` (PDFConvertOptions): Conversion options
|
|
492
|
+
- `abortSignal` (AbortSignal, optional): Signal to cancel the operation
|
|
350
493
|
|
|
351
494
|
**Returns:**
|
|
352
495
|
|
|
353
|
-
- `Promise<
|
|
354
|
-
|
|
355
|
-
##### `shutdown(): Promise<void>`
|
|
356
|
-
|
|
357
|
-
Terminates the docling-serve process.
|
|
496
|
+
- `Promise<TokenUsageReport | null>`: Token usage report from LLM operations, or `null` when no LLM usage occurs
|
|
358
497
|
|
|
359
|
-
|
|
360
|
-
await pdfParser.shutdown();
|
|
361
|
-
```
|
|
498
|
+
##### `dispose(): Promise<void>`
|
|
362
499
|
|
|
363
|
-
|
|
500
|
+
Disposes the parser instance, kills the local docling-serve process (if started), and releases resources.
|
|
364
501
|
|
|
365
502
|
```typescript
|
|
366
|
-
|
|
367
|
-
doOcr?: boolean; // Enable OCR (default: true)
|
|
368
|
-
formats?: string[]; // Output formats (default: ['docling_json'])
|
|
369
|
-
pdfBackend?: string; // PDF backend (default: 'dlparse_v2')
|
|
370
|
-
}
|
|
503
|
+
await pdfParser.dispose();
|
|
371
504
|
```
|
|
372
505
|
|
|
373
|
-
### PDFConvertOptions
|
|
506
|
+
### PDFConvertOptions
|
|
374
507
|
|
|
375
508
|
```typescript
|
|
376
|
-
|
|
509
|
+
type PDFConvertOptions = {
|
|
510
|
+
// OCR strategy options
|
|
377
511
|
strategySamplerModel?: LanguageModel; // Vision LLM for OCR strategy sampling
|
|
378
512
|
vlmProcessorModel?: LanguageModel; // Vision LLM for text correction
|
|
379
513
|
vlmConcurrency?: number; // Parallel page processing (default: 1)
|
|
380
514
|
skipSampling?: boolean; // Skip strategy sampling
|
|
381
515
|
forcedMethod?: 'ocrmac' | 'vlm'; // Force specific OCR method
|
|
516
|
+
|
|
517
|
+
// Image PDF options
|
|
518
|
+
forceImagePdf?: boolean; // Force pre-conversion to image-based PDF
|
|
519
|
+
|
|
520
|
+
// Token usage tracking
|
|
521
|
+
aggregator?: LLMTokenUsageAggregator; // Token usage aggregator
|
|
522
|
+
onTokenUsage?: (report: TokenUsageReport) => void; // Callback for token usage updates
|
|
523
|
+
|
|
524
|
+
// Document processing
|
|
525
|
+
document_timeout?: number; // Document processing timeout in seconds
|
|
526
|
+
documentValidationModel?: LanguageModel; // LLM for document type validation
|
|
527
|
+
|
|
528
|
+
// Chunked conversion (large PDFs)
|
|
529
|
+
chunkedConversion?: boolean; // Enable chunked conversion
|
|
530
|
+
chunkSize?: number; // Pages per chunk
|
|
531
|
+
chunkMaxRetries?: number; // Max retry attempts per failed chunk
|
|
532
|
+
|
|
533
|
+
// Docling conversion options (inherited)
|
|
534
|
+
num_threads?: number; // Number of processing threads
|
|
535
|
+
ocr_lang?: string[]; // OCR languages
|
|
536
|
+
// ... other Docling ConversionOptions fields
|
|
537
|
+
};
|
|
538
|
+
```
|
|
539
|
+
|
|
540
|
+
### ConvertWithStrategyResult
|
|
541
|
+
|
|
542
|
+
```typescript
|
|
543
|
+
interface ConvertWithStrategyResult {
|
|
544
|
+
/** The OCR strategy that was determined */
|
|
545
|
+
strategy: OcrStrategy;
|
|
546
|
+
/** Token usage report from sampling and/or VLM processing (null when no LLM usage occurs) */
|
|
547
|
+
tokenUsageReport: TokenUsageReport | null;
|
|
382
548
|
}
|
|
383
549
|
```
|
|
384
550
|
|
|
385
|
-
|
|
551
|
+
### ConversionCompleteCallback
|
|
386
552
|
|
|
387
|
-
|
|
553
|
+
```typescript
|
|
554
|
+
type ConversionCompleteCallback = (outputPath: string) => Promise<void> | void;
|
|
555
|
+
```
|
|
388
556
|
|
|
389
|
-
|
|
557
|
+
### Error Types
|
|
390
558
|
|
|
391
|
-
|
|
559
|
+
#### `InvalidDocumentTypeError`
|
|
392
560
|
|
|
393
|
-
|
|
394
|
-
# Install Python 3.11
|
|
395
|
-
brew install python@3.11
|
|
561
|
+
Thrown when the PDF fails document type validation (i.e., it is not an archaeological investigation report).
|
|
396
562
|
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
pythonPath: 'python3.11',
|
|
400
|
-
logger,
|
|
401
|
-
});
|
|
563
|
+
```typescript
|
|
564
|
+
import { InvalidDocumentTypeError } from '@heripo/pdf-parser';
|
|
402
565
|
```
|
|
403
566
|
|
|
567
|
+
#### `ImagePdfFallbackError`
|
|
568
|
+
|
|
569
|
+
Thrown when both the original conversion and the image PDF fallback conversion fail. Contains references to both errors.
|
|
570
|
+
|
|
571
|
+
```typescript
|
|
572
|
+
import { ImagePdfFallbackError } from '@heripo/pdf-parser';
|
|
573
|
+
```
|
|
574
|
+
|
|
575
|
+
## Troubleshooting
|
|
576
|
+
|
|
404
577
|
### jq Not Found
|
|
405
578
|
|
|
406
579
|
**Symptom**: `Command not found: jq`
|
|
@@ -427,13 +600,19 @@ brew install poppler
|
|
|
427
600
|
|
|
428
601
|
**Solution**:
|
|
429
602
|
|
|
430
|
-
```
|
|
431
|
-
|
|
603
|
+
```typescript
|
|
604
|
+
// Use a different port
|
|
432
605
|
const pdfParser = new PDFParser({
|
|
433
|
-
pythonPath: 'python3.11',
|
|
434
606
|
port: 5002, // Specify different port
|
|
435
607
|
logger,
|
|
436
608
|
});
|
|
609
|
+
|
|
610
|
+
// Or kill the existing process
|
|
611
|
+
const pdfParser = new PDFParser({
|
|
612
|
+
port: 5001,
|
|
613
|
+
killExistingProcess: true,
|
|
614
|
+
logger,
|
|
615
|
+
});
|
|
437
616
|
```
|
|
438
617
|
|
|
439
618
|
### docling-serve Start Failure
|
|
@@ -443,21 +622,31 @@ const pdfParser = new PDFParser({
|
|
|
443
622
|
**Solution**:
|
|
444
623
|
|
|
445
624
|
```bash
|
|
446
|
-
# Recreate virtual environment
|
|
447
|
-
rm -rf
|
|
625
|
+
# Recreate virtual environment (default location)
|
|
626
|
+
rm -rf .venv
|
|
448
627
|
# Run init() again
|
|
449
628
|
```
|
|
450
629
|
|
|
630
|
+
### ImageMagick / Ghostscript Not Found
|
|
631
|
+
|
|
632
|
+
**Symptom**: `ImageMagick is not installed but enableImagePdfFallback is enabled`
|
|
633
|
+
|
|
634
|
+
**Solution**:
|
|
635
|
+
|
|
636
|
+
```bash
|
|
637
|
+
brew install imagemagick ghostscript
|
|
638
|
+
```
|
|
639
|
+
|
|
451
640
|
## Linux Support Status
|
|
452
641
|
|
|
453
642
|
Currently **macOS only**. Linux support is **not entirely ruled out**, but due to OCR performance and cost efficiency issues, **there are no specific plans at this time**.
|
|
454
643
|
|
|
455
|
-
| Platform | Status
|
|
456
|
-
| --------------------- |
|
|
457
|
-
| macOS + Apple Silicon |
|
|
458
|
-
| macOS + Intel |
|
|
459
|
-
| Linux |
|
|
460
|
-
| Windows |
|
|
644
|
+
| Platform | Status | Notes |
|
|
645
|
+
| --------------------- | --------- | ----------------------------------------------- |
|
|
646
|
+
| macOS + Apple Silicon | Supported | Optimal performance, GPU acceleration |
|
|
647
|
+
| macOS + Intel | Supported | No GPU acceleration |
|
|
648
|
+
| Linux | TBD | No current plans due to performance/cost issues |
|
|
649
|
+
| Windows | TBD | WSL2 Linux approach possible |
|
|
461
650
|
|
|
462
651
|
### Reason for No Linux Support
|
|
463
652
|
|