@heripo/pdf-parser 0.1.12 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +55 -24
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +55 -24
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
package/dist/index.d.cts
CHANGED
|
@@ -33,6 +33,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
|
|
|
33
33
|
aggregator?: LLMTokenUsageAggregator;
|
|
34
34
|
/** Callback fired after each batch of VLM pages completes, with cumulative token usage */
|
|
35
35
|
onTokenUsage?: (report: TokenUsageReport) => void;
|
|
36
|
+
/** Document processing timeout in seconds for the Docling server (default: server default) */
|
|
37
|
+
document_timeout?: number;
|
|
36
38
|
/** Enable chunked conversion for large PDFs (local files only) */
|
|
37
39
|
chunkedConversion?: boolean;
|
|
38
40
|
/** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
|
package/dist/index.d.ts
CHANGED
|
@@ -33,6 +33,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
|
|
|
33
33
|
aggregator?: LLMTokenUsageAggregator;
|
|
34
34
|
/** Callback fired after each batch of VLM pages completes, with cumulative token usage */
|
|
35
35
|
onTokenUsage?: (report: TokenUsageReport) => void;
|
|
36
|
+
/** Document processing timeout in seconds for the Docling server (default: server default) */
|
|
37
|
+
document_timeout?: number;
|
|
36
38
|
/** Enable chunked conversion for large PDFs (local files only) */
|
|
37
39
|
chunkedConversion?: boolean;
|
|
38
40
|
/** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
|
package/dist/index.js
CHANGED
|
@@ -1756,7 +1756,8 @@ var VlmTextCorrector = class {
|
|
|
1756
1756
|
},
|
|
1757
1757
|
{
|
|
1758
1758
|
type: "image",
|
|
1759
|
-
image:
|
|
1759
|
+
image: imageBase64,
|
|
1760
|
+
mediaType: "image/png"
|
|
1760
1761
|
}
|
|
1761
1762
|
]
|
|
1762
1763
|
}
|
|
@@ -1968,7 +1969,7 @@ var VlmTextCorrector = class {
|
|
|
1968
1969
|
*/
|
|
1969
1970
|
readPageImage(outputDir, pageNo) {
|
|
1970
1971
|
const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
|
|
1971
|
-
return readFileSync(imagePath)
|
|
1972
|
+
return new Uint8Array(readFileSync(imagePath));
|
|
1972
1973
|
}
|
|
1973
1974
|
/**
|
|
1974
1975
|
* Apply VLM corrections to the DoclingDocument.
|
|
@@ -2045,8 +2046,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
|
|
|
2045
2046
|
|
|
2046
2047
|
Answer whether any Hanja characters are present on this page.
|
|
2047
2048
|
|
|
2048
|
-
Also identify all languages present on this page. Return an array of
|
|
2049
|
-
|
|
2049
|
+
Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
|
|
2050
|
+
Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
|
|
2051
|
+
Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
|
|
2050
2052
|
var OcrStrategySampler = class {
|
|
2051
2053
|
logger;
|
|
2052
2054
|
pageRenderer;
|
|
@@ -2237,7 +2239,7 @@ var OcrStrategySampler = class {
|
|
|
2237
2239
|
this.logger.debug(
|
|
2238
2240
|
`[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
|
|
2239
2241
|
);
|
|
2240
|
-
const
|
|
2242
|
+
const imageData = new Uint8Array(readFileSync2(pageFile));
|
|
2241
2243
|
const messages = [
|
|
2242
2244
|
{
|
|
2243
2245
|
role: "user",
|
|
@@ -2245,7 +2247,8 @@ var OcrStrategySampler = class {
|
|
|
2245
2247
|
{ type: "text", text: KOREAN_HANJA_MIX_PROMPT },
|
|
2246
2248
|
{
|
|
2247
2249
|
type: "image",
|
|
2248
|
-
image:
|
|
2250
|
+
image: imageData,
|
|
2251
|
+
mediaType: "image/png"
|
|
2249
2252
|
}
|
|
2250
2253
|
]
|
|
2251
2254
|
}
|
|
@@ -2343,6 +2346,36 @@ var LocalFileServer = class {
|
|
|
2343
2346
|
}
|
|
2344
2347
|
};
|
|
2345
2348
|
|
|
2349
|
+
// src/utils/task-failure-details.ts
|
|
2350
|
+
var MAX_RESULT_RETRIES = 3;
|
|
2351
|
+
var RESULT_RETRY_DELAY_MS = 2e3;
|
|
2352
|
+
async function getTaskFailureDetails(task, logger, logPrefix) {
|
|
2353
|
+
for (let attempt = 0; attempt < MAX_RESULT_RETRIES; attempt++) {
|
|
2354
|
+
try {
|
|
2355
|
+
if (attempt > 0) {
|
|
2356
|
+
await new Promise((r) => setTimeout(r, RESULT_RETRY_DELAY_MS));
|
|
2357
|
+
}
|
|
2358
|
+
const result = await task.getResult();
|
|
2359
|
+
if (result.errors?.length) {
|
|
2360
|
+
return result.errors.map((e) => e.message).join("; ");
|
|
2361
|
+
}
|
|
2362
|
+
return `status: ${result.status ?? "unknown"}`;
|
|
2363
|
+
} catch (err) {
|
|
2364
|
+
if (attempt === MAX_RESULT_RETRIES - 1) {
|
|
2365
|
+
logger.error(
|
|
2366
|
+
`${logPrefix} Failed to retrieve task result after ${MAX_RESULT_RETRIES} attempts:`,
|
|
2367
|
+
err
|
|
2368
|
+
);
|
|
2369
|
+
return "unable to retrieve error details";
|
|
2370
|
+
}
|
|
2371
|
+
logger.warn(
|
|
2372
|
+
`${logPrefix} Result not available yet, retrying (${attempt + 1}/${MAX_RESULT_RETRIES})...`
|
|
2373
|
+
);
|
|
2374
|
+
}
|
|
2375
|
+
}
|
|
2376
|
+
return "unable to retrieve error details";
|
|
2377
|
+
}
|
|
2378
|
+
|
|
2346
2379
|
// src/core/chunked-pdf-converter.ts
|
|
2347
2380
|
import {
|
|
2348
2381
|
copyFileSync,
|
|
@@ -2697,14 +2730,15 @@ var ChunkedPDFConverter = class {
|
|
|
2697
2730
|
const status = await task.poll();
|
|
2698
2731
|
if (status.task_status === "success") return;
|
|
2699
2732
|
if (status.task_status === "failure") {
|
|
2700
|
-
|
|
2701
|
-
|
|
2702
|
-
|
|
2703
|
-
|
|
2704
|
-
|
|
2705
|
-
|
|
2706
|
-
|
|
2707
|
-
|
|
2733
|
+
const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
|
|
2734
|
+
this.logger.error(
|
|
2735
|
+
`[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
|
|
2736
|
+
);
|
|
2737
|
+
const details = await getTaskFailureDetails(
|
|
2738
|
+
task,
|
|
2739
|
+
this.logger,
|
|
2740
|
+
"[ChunkedPDFConverter]"
|
|
2741
|
+
);
|
|
2708
2742
|
throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
|
|
2709
2743
|
}
|
|
2710
2744
|
await new Promise(
|
|
@@ -3341,6 +3375,7 @@ var PDFConverter = class {
|
|
|
3341
3375
|
return {
|
|
3342
3376
|
...omit(options, [
|
|
3343
3377
|
"num_threads",
|
|
3378
|
+
"document_timeout",
|
|
3344
3379
|
"forceImagePdf",
|
|
3345
3380
|
"strategySamplerModel",
|
|
3346
3381
|
"vlmProcessorModel",
|
|
@@ -3362,6 +3397,8 @@ var PDFConverter = class {
|
|
|
3362
3397
|
framework: "livetext"
|
|
3363
3398
|
},
|
|
3364
3399
|
generate_picture_images: true,
|
|
3400
|
+
do_picture_classification: true,
|
|
3401
|
+
do_picture_description: true,
|
|
3365
3402
|
generate_page_images: false,
|
|
3366
3403
|
// Page images are rendered by PageRenderer (ImageMagick) after conversion
|
|
3367
3404
|
images_scale: 2,
|
|
@@ -3376,6 +3413,9 @@ var PDFConverter = class {
|
|
|
3376
3413
|
accelerator_options: {
|
|
3377
3414
|
device: "mps",
|
|
3378
3415
|
num_threads: options.num_threads
|
|
3416
|
+
},
|
|
3417
|
+
...options.document_timeout !== void 0 && {
|
|
3418
|
+
document_timeout: options.document_timeout
|
|
3379
3419
|
}
|
|
3380
3420
|
};
|
|
3381
3421
|
}
|
|
@@ -3462,16 +3502,7 @@ var PDFConverter = class {
|
|
|
3462
3502
|
* Fetch detailed error information from a failed task result.
|
|
3463
3503
|
*/
|
|
3464
3504
|
async getTaskFailureDetails(task) {
|
|
3465
|
-
|
|
3466
|
-
const result = await task.getResult();
|
|
3467
|
-
if (result.errors?.length) {
|
|
3468
|
-
return result.errors.map((e) => e.message).join("; ");
|
|
3469
|
-
}
|
|
3470
|
-
return `status: ${result.status ?? "unknown"}`;
|
|
3471
|
-
} catch (err) {
|
|
3472
|
-
this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
|
|
3473
|
-
return "unable to retrieve error details";
|
|
3474
|
-
}
|
|
3505
|
+
return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
|
|
3475
3506
|
}
|
|
3476
3507
|
async downloadResult(taskId) {
|
|
3477
3508
|
this.logger.info(
|