@heripo/pdf-parser 0.1.12 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +55 -24
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +55 -24
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
package/dist/index.cjs
CHANGED
|
@@ -1780,7 +1780,8 @@ var VlmTextCorrector = class {
|
|
|
1780
1780
|
},
|
|
1781
1781
|
{
|
|
1782
1782
|
type: "image",
|
|
1783
|
-
image:
|
|
1783
|
+
image: imageBase64,
|
|
1784
|
+
mediaType: "image/png"
|
|
1784
1785
|
}
|
|
1785
1786
|
]
|
|
1786
1787
|
}
|
|
@@ -1992,7 +1993,7 @@ var VlmTextCorrector = class {
|
|
|
1992
1993
|
*/
|
|
1993
1994
|
readPageImage(outputDir, pageNo) {
|
|
1994
1995
|
const imagePath = (0, import_node_path4.join)(outputDir, "pages", `page_${pageNo - 1}.png`);
|
|
1995
|
-
return (0, import_node_fs4.readFileSync)(imagePath)
|
|
1996
|
+
return new Uint8Array((0, import_node_fs4.readFileSync)(imagePath));
|
|
1996
1997
|
}
|
|
1997
1998
|
/**
|
|
1998
1999
|
* Apply VLM corrections to the DoclingDocument.
|
|
@@ -2069,8 +2070,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
|
|
|
2069
2070
|
|
|
2070
2071
|
Answer whether any Hanja characters are present on this page.
|
|
2071
2072
|
|
|
2072
|
-
Also identify all languages present on this page. Return an array of
|
|
2073
|
-
|
|
2073
|
+
Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
|
|
2074
|
+
Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
|
|
2075
|
+
Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
|
|
2074
2076
|
var OcrStrategySampler = class {
|
|
2075
2077
|
logger;
|
|
2076
2078
|
pageRenderer;
|
|
@@ -2261,7 +2263,7 @@ var OcrStrategySampler = class {
|
|
|
2261
2263
|
this.logger.debug(
|
|
2262
2264
|
`[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
|
|
2263
2265
|
);
|
|
2264
|
-
const
|
|
2266
|
+
const imageData = new Uint8Array((0, import_node_fs5.readFileSync)(pageFile));
|
|
2265
2267
|
const messages = [
|
|
2266
2268
|
{
|
|
2267
2269
|
role: "user",
|
|
@@ -2269,7 +2271,8 @@ var OcrStrategySampler = class {
|
|
|
2269
2271
|
{ type: "text", text: KOREAN_HANJA_MIX_PROMPT },
|
|
2270
2272
|
{
|
|
2271
2273
|
type: "image",
|
|
2272
|
-
image:
|
|
2274
|
+
image: imageData,
|
|
2275
|
+
mediaType: "image/png"
|
|
2273
2276
|
}
|
|
2274
2277
|
]
|
|
2275
2278
|
}
|
|
@@ -2367,6 +2370,36 @@ var LocalFileServer = class {
|
|
|
2367
2370
|
}
|
|
2368
2371
|
};
|
|
2369
2372
|
|
|
2373
|
+
// src/utils/task-failure-details.ts
|
|
2374
|
+
var MAX_RESULT_RETRIES = 3;
|
|
2375
|
+
var RESULT_RETRY_DELAY_MS = 2e3;
|
|
2376
|
+
async function getTaskFailureDetails(task, logger, logPrefix) {
|
|
2377
|
+
for (let attempt = 0; attempt < MAX_RESULT_RETRIES; attempt++) {
|
|
2378
|
+
try {
|
|
2379
|
+
if (attempt > 0) {
|
|
2380
|
+
await new Promise((r) => setTimeout(r, RESULT_RETRY_DELAY_MS));
|
|
2381
|
+
}
|
|
2382
|
+
const result = await task.getResult();
|
|
2383
|
+
if (result.errors?.length) {
|
|
2384
|
+
return result.errors.map((e) => e.message).join("; ");
|
|
2385
|
+
}
|
|
2386
|
+
return `status: ${result.status ?? "unknown"}`;
|
|
2387
|
+
} catch (err) {
|
|
2388
|
+
if (attempt === MAX_RESULT_RETRIES - 1) {
|
|
2389
|
+
logger.error(
|
|
2390
|
+
`${logPrefix} Failed to retrieve task result after ${MAX_RESULT_RETRIES} attempts:`,
|
|
2391
|
+
err
|
|
2392
|
+
);
|
|
2393
|
+
return "unable to retrieve error details";
|
|
2394
|
+
}
|
|
2395
|
+
logger.warn(
|
|
2396
|
+
`${logPrefix} Result not available yet, retrying (${attempt + 1}/${MAX_RESULT_RETRIES})...`
|
|
2397
|
+
);
|
|
2398
|
+
}
|
|
2399
|
+
}
|
|
2400
|
+
return "unable to retrieve error details";
|
|
2401
|
+
}
|
|
2402
|
+
|
|
2370
2403
|
// src/core/chunked-pdf-converter.ts
|
|
2371
2404
|
var import_node_fs7 = require("fs");
|
|
2372
2405
|
var import_promises4 = require("fs/promises");
|
|
@@ -2712,14 +2745,15 @@ var ChunkedPDFConverter = class {
|
|
|
2712
2745
|
const status = await task.poll();
|
|
2713
2746
|
if (status.task_status === "success") return;
|
|
2714
2747
|
if (status.task_status === "failure") {
|
|
2715
|
-
|
|
2716
|
-
|
|
2717
|
-
|
|
2718
|
-
|
|
2719
|
-
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
|
|
2748
|
+
const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
|
|
2749
|
+
this.logger.error(
|
|
2750
|
+
`[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
|
|
2751
|
+
);
|
|
2752
|
+
const details = await getTaskFailureDetails(
|
|
2753
|
+
task,
|
|
2754
|
+
this.logger,
|
|
2755
|
+
"[ChunkedPDFConverter]"
|
|
2756
|
+
);
|
|
2723
2757
|
throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
|
|
2724
2758
|
}
|
|
2725
2759
|
await new Promise(
|
|
@@ -3356,6 +3390,7 @@ var PDFConverter = class {
|
|
|
3356
3390
|
return {
|
|
3357
3391
|
...(0, import_es_toolkit.omit)(options, [
|
|
3358
3392
|
"num_threads",
|
|
3393
|
+
"document_timeout",
|
|
3359
3394
|
"forceImagePdf",
|
|
3360
3395
|
"strategySamplerModel",
|
|
3361
3396
|
"vlmProcessorModel",
|
|
@@ -3377,6 +3412,8 @@ var PDFConverter = class {
|
|
|
3377
3412
|
framework: "livetext"
|
|
3378
3413
|
},
|
|
3379
3414
|
generate_picture_images: true,
|
|
3415
|
+
do_picture_classification: true,
|
|
3416
|
+
do_picture_description: true,
|
|
3380
3417
|
generate_page_images: false,
|
|
3381
3418
|
// Page images are rendered by PageRenderer (ImageMagick) after conversion
|
|
3382
3419
|
images_scale: 2,
|
|
@@ -3391,6 +3428,9 @@ var PDFConverter = class {
|
|
|
3391
3428
|
accelerator_options: {
|
|
3392
3429
|
device: "mps",
|
|
3393
3430
|
num_threads: options.num_threads
|
|
3431
|
+
},
|
|
3432
|
+
...options.document_timeout !== void 0 && {
|
|
3433
|
+
document_timeout: options.document_timeout
|
|
3394
3434
|
}
|
|
3395
3435
|
};
|
|
3396
3436
|
}
|
|
@@ -3477,16 +3517,7 @@ var PDFConverter = class {
|
|
|
3477
3517
|
* Fetch detailed error information from a failed task result.
|
|
3478
3518
|
*/
|
|
3479
3519
|
async getTaskFailureDetails(task) {
|
|
3480
|
-
|
|
3481
|
-
const result = await task.getResult();
|
|
3482
|
-
if (result.errors?.length) {
|
|
3483
|
-
return result.errors.map((e) => e.message).join("; ");
|
|
3484
|
-
}
|
|
3485
|
-
return `status: ${result.status ?? "unknown"}`;
|
|
3486
|
-
} catch (err) {
|
|
3487
|
-
this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
|
|
3488
|
-
return "unable to retrieve error details";
|
|
3489
|
-
}
|
|
3520
|
+
return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
|
|
3490
3521
|
}
|
|
3491
3522
|
async downloadResult(taskId) {
|
|
3492
3523
|
this.logger.info(
|