@heripo/pdf-parser 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +49 -20
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +49 -20
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/index.cjs
CHANGED
|
@@ -2070,8 +2070,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
|
|
|
2070
2070
|
|
|
2071
2071
|
Answer whether any Hanja characters are present on this page.
|
|
2072
2072
|
|
|
2073
|
-
Also identify all languages present on this page. Return an array of
|
|
2074
|
-
|
|
2073
|
+
Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
|
|
2074
|
+
Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
|
|
2075
|
+
Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
|
|
2075
2076
|
var OcrStrategySampler = class {
|
|
2076
2077
|
logger;
|
|
2077
2078
|
pageRenderer;
|
|
@@ -2369,6 +2370,36 @@ var LocalFileServer = class {
|
|
|
2369
2370
|
}
|
|
2370
2371
|
};
|
|
2371
2372
|
|
|
2373
|
+
// src/utils/task-failure-details.ts
|
|
2374
|
+
// Default number of attempts made to fetch the result of a failed task.
var MAX_RESULT_RETRIES = 3;
// Default pause, in milliseconds, between consecutive attempts.
var RESULT_RETRY_DELAY_MS = 2e3;

/**
 * Turn one entry of `result.errors` into display text.
 *
 * Plain strings are used as-is, a non-nullish `message` property is
 * stringified, and anything else becomes "unknown error" so the joined
 * description never contains empty segments.
 *
 * @param {unknown} entry - One element of the `errors` array.
 * @returns {string} Text describing the entry.
 */
function describeTaskError(entry) {
  if (typeof entry === "string") {
    return entry;
  }
  const message = entry?.message;
  return message == null ? "unknown error" : String(message);
}

/**
 * Fetch a human-readable description of why a task failed.
 *
 * Calls `task.getResult()` and, when that call throws, retries after a pause.
 * Intermediate failures are reported through `logger.warn`; the final failure
 * is reported through `logger.error` together with the caught error. This
 * function never throws for a failing `getResult()`; it resolves to a
 * fallback string instead.
 *
 * @param {{ getResult: () => Promise<any> }} task - Task whose result is queried.
 * @param {{ warn: Function, error: Function }} logger - Receives retry and failure logs.
 * @param {string} logPrefix - Prefix prepended to every log message.
 * @param {{ maxRetries?: number, retryDelayMs?: number }} [options] - Optional
 *   overrides for the attempt count and the pause between attempts. Defaults
 *   to MAX_RESULT_RETRIES and RESULT_RETRY_DELAY_MS.
 * @returns {Promise<string>} The error messages joined with "; ", or
 *   "status: <status>" when the result lists no errors, or
 *   "unable to retrieve error details" when no attempt succeeded.
 */
async function getTaskFailureDetails(task, logger, logPrefix, options = {}) {
  const {
    maxRetries = MAX_RESULT_RETRIES,
    retryDelayMs = RESULT_RETRY_DELAY_MS
  } = options;
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      // Only wait before a retry, never before the first attempt.
      if (attempt > 0) {
        await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
      }
      const result = await task.getResult();
      if (result.errors?.length) {
        return result.errors.map(describeTaskError).join("; ");
      }
      return `status: ${result.status ?? "unknown"}`;
    } catch (err) {
      const isLastAttempt = attempt === maxRetries - 1;
      if (isLastAttempt) {
        logger.error(
          `${logPrefix} Failed to retrieve task result after ${maxRetries} attempts:`,
          err
        );
        return "unable to retrieve error details";
      }
      logger.warn(
        `${logPrefix} Result not available yet, retrying (${attempt + 1}/${maxRetries})...`
      );
    }
  }
  // Reached only when maxRetries is less than 1 and no attempt was made.
  return "unable to retrieve error details";
}
|
|
2402
|
+
|
|
2372
2403
|
// src/core/chunked-pdf-converter.ts
|
|
2373
2404
|
var import_node_fs7 = require("fs");
|
|
2374
2405
|
var import_promises4 = require("fs/promises");
|
|
@@ -2714,14 +2745,15 @@ var ChunkedPDFConverter = class {
|
|
|
2714
2745
|
const status = await task.poll();
|
|
2715
2746
|
if (status.task_status === "success") return;
|
|
2716
2747
|
if (status.task_status === "failure") {
|
|
2717
|
-
|
|
2718
|
-
|
|
2719
|
-
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
|
|
2723
|
-
|
|
2724
|
-
|
|
2748
|
+
const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
|
|
2749
|
+
this.logger.error(
|
|
2750
|
+
`[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
|
|
2751
|
+
);
|
|
2752
|
+
const details = await getTaskFailureDetails(
|
|
2753
|
+
task,
|
|
2754
|
+
this.logger,
|
|
2755
|
+
"[ChunkedPDFConverter]"
|
|
2756
|
+
);
|
|
2725
2757
|
throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
|
|
2726
2758
|
}
|
|
2727
2759
|
await new Promise(
|
|
@@ -3358,6 +3390,7 @@ var PDFConverter = class {
|
|
|
3358
3390
|
return {
|
|
3359
3391
|
...(0, import_es_toolkit.omit)(options, [
|
|
3360
3392
|
"num_threads",
|
|
3393
|
+
"document_timeout",
|
|
3361
3394
|
"forceImagePdf",
|
|
3362
3395
|
"strategySamplerModel",
|
|
3363
3396
|
"vlmProcessorModel",
|
|
@@ -3379,6 +3412,8 @@ var PDFConverter = class {
|
|
|
3379
3412
|
framework: "livetext"
|
|
3380
3413
|
},
|
|
3381
3414
|
generate_picture_images: true,
|
|
3415
|
+
do_picture_classification: true,
|
|
3416
|
+
do_picture_description: true,
|
|
3382
3417
|
generate_page_images: false,
|
|
3383
3418
|
// Page images are rendered by PageRenderer (ImageMagick) after conversion
|
|
3384
3419
|
images_scale: 2,
|
|
@@ -3393,6 +3428,9 @@ var PDFConverter = class {
|
|
|
3393
3428
|
accelerator_options: {
|
|
3394
3429
|
device: "mps",
|
|
3395
3430
|
num_threads: options.num_threads
|
|
3431
|
+
},
|
|
3432
|
+
...options.document_timeout !== void 0 && {
|
|
3433
|
+
document_timeout: options.document_timeout
|
|
3396
3434
|
}
|
|
3397
3435
|
};
|
|
3398
3436
|
}
|
|
@@ -3479,16 +3517,7 @@ var PDFConverter = class {
|
|
|
3479
3517
|
* Fetch detailed error information from a failed task result.
|
|
3480
3518
|
*/
|
|
3481
3519
|
async getTaskFailureDetails(task) {
|
|
3482
|
-
|
|
3483
|
-
const result = await task.getResult();
|
|
3484
|
-
if (result.errors?.length) {
|
|
3485
|
-
return result.errors.map((e) => e.message).join("; ");
|
|
3486
|
-
}
|
|
3487
|
-
return `status: ${result.status ?? "unknown"}`;
|
|
3488
|
-
} catch (err) {
|
|
3489
|
-
this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
|
|
3490
|
-
return "unable to retrieve error details";
|
|
3491
|
-
}
|
|
3520
|
+
return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
|
|
3492
3521
|
}
|
|
3493
3522
|
async downloadResult(taskId) {
|
|
3494
3523
|
this.logger.info(
|