@heripo/pdf-parser 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +49 -20
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +49 -20
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/index.d.cts
CHANGED
|
@@ -33,6 +33,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
|
|
|
33
33
|
aggregator?: LLMTokenUsageAggregator;
|
|
34
34
|
/** Callback fired after each batch of VLM pages completes, with cumulative token usage */
|
|
35
35
|
onTokenUsage?: (report: TokenUsageReport) => void;
|
|
36
|
+
/** Document processing timeout in seconds for the Docling server (default: server default) */
|
|
37
|
+
document_timeout?: number;
|
|
36
38
|
/** Enable chunked conversion for large PDFs (local files only) */
|
|
37
39
|
chunkedConversion?: boolean;
|
|
38
40
|
/** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
|
package/dist/index.d.ts
CHANGED
|
@@ -33,6 +33,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
|
|
|
33
33
|
aggregator?: LLMTokenUsageAggregator;
|
|
34
34
|
/** Callback fired after each batch of VLM pages completes, with cumulative token usage */
|
|
35
35
|
onTokenUsage?: (report: TokenUsageReport) => void;
|
|
36
|
+
/** Document processing timeout in seconds for the Docling server (default: server default) */
|
|
37
|
+
document_timeout?: number;
|
|
36
38
|
/** Enable chunked conversion for large PDFs (local files only) */
|
|
37
39
|
chunkedConversion?: boolean;
|
|
38
40
|
/** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
|
package/dist/index.js
CHANGED
|
@@ -2046,8 +2046,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
|
|
|
2046
2046
|
|
|
2047
2047
|
Answer whether any Hanja characters are present on this page.
|
|
2048
2048
|
|
|
2049
|
-
Also identify all languages present on this page. Return an array of
|
|
2050
|
-
|
|
2049
|
+
Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
|
|
2050
|
+
Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
|
|
2051
|
+
Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
|
|
2051
2052
|
var OcrStrategySampler = class {
|
|
2052
2053
|
logger;
|
|
2053
2054
|
pageRenderer;
|
|
@@ -2345,6 +2346,36 @@ var LocalFileServer = class {
|
|
|
2345
2346
|
}
|
|
2346
2347
|
};
|
|
2347
2348
|
|
|
2349
|
+
// src/utils/task-failure-details.ts
|
|
2350
|
+
var MAX_RESULT_RETRIES = 3;
|
|
2351
|
+
var RESULT_RETRY_DELAY_MS = 2e3;
|
|
2352
|
+
async function getTaskFailureDetails(task, logger, logPrefix) {
|
|
2353
|
+
for (let attempt = 0; attempt < MAX_RESULT_RETRIES; attempt++) {
|
|
2354
|
+
try {
|
|
2355
|
+
if (attempt > 0) {
|
|
2356
|
+
await new Promise((r) => setTimeout(r, RESULT_RETRY_DELAY_MS));
|
|
2357
|
+
}
|
|
2358
|
+
const result = await task.getResult();
|
|
2359
|
+
if (result.errors?.length) {
|
|
2360
|
+
return result.errors.map((e) => e.message).join("; ");
|
|
2361
|
+
}
|
|
2362
|
+
return `status: ${result.status ?? "unknown"}`;
|
|
2363
|
+
} catch (err) {
|
|
2364
|
+
if (attempt === MAX_RESULT_RETRIES - 1) {
|
|
2365
|
+
logger.error(
|
|
2366
|
+
`${logPrefix} Failed to retrieve task result after ${MAX_RESULT_RETRIES} attempts:`,
|
|
2367
|
+
err
|
|
2368
|
+
);
|
|
2369
|
+
return "unable to retrieve error details";
|
|
2370
|
+
}
|
|
2371
|
+
logger.warn(
|
|
2372
|
+
`${logPrefix} Result not available yet, retrying (${attempt + 1}/${MAX_RESULT_RETRIES})...`
|
|
2373
|
+
);
|
|
2374
|
+
}
|
|
2375
|
+
}
|
|
2376
|
+
return "unable to retrieve error details";
|
|
2377
|
+
}
|
|
2378
|
+
|
|
2348
2379
|
// src/core/chunked-pdf-converter.ts
|
|
2349
2380
|
import {
|
|
2350
2381
|
copyFileSync,
|
|
@@ -2699,14 +2730,15 @@ var ChunkedPDFConverter = class {
|
|
|
2699
2730
|
const status = await task.poll();
|
|
2700
2731
|
if (status.task_status === "success") return;
|
|
2701
2732
|
if (status.task_status === "failure") {
|
|
2702
|
-
|
|
2703
|
-
|
|
2704
|
-
|
|
2705
|
-
|
|
2706
|
-
|
|
2707
|
-
|
|
2708
|
-
|
|
2709
|
-
|
|
2733
|
+
const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
|
|
2734
|
+
this.logger.error(
|
|
2735
|
+
`[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
|
|
2736
|
+
);
|
|
2737
|
+
const details = await getTaskFailureDetails(
|
|
2738
|
+
task,
|
|
2739
|
+
this.logger,
|
|
2740
|
+
"[ChunkedPDFConverter]"
|
|
2741
|
+
);
|
|
2710
2742
|
throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
|
|
2711
2743
|
}
|
|
2712
2744
|
await new Promise(
|
|
@@ -3343,6 +3375,7 @@ var PDFConverter = class {
|
|
|
3343
3375
|
return {
|
|
3344
3376
|
...omit(options, [
|
|
3345
3377
|
"num_threads",
|
|
3378
|
+
"document_timeout",
|
|
3346
3379
|
"forceImagePdf",
|
|
3347
3380
|
"strategySamplerModel",
|
|
3348
3381
|
"vlmProcessorModel",
|
|
@@ -3364,6 +3397,8 @@ var PDFConverter = class {
|
|
|
3364
3397
|
framework: "livetext"
|
|
3365
3398
|
},
|
|
3366
3399
|
generate_picture_images: true,
|
|
3400
|
+
do_picture_classification: true,
|
|
3401
|
+
do_picture_description: true,
|
|
3367
3402
|
generate_page_images: false,
|
|
3368
3403
|
// Page images are rendered by PageRenderer (ImageMagick) after conversion
|
|
3369
3404
|
images_scale: 2,
|
|
@@ -3378,6 +3413,9 @@ var PDFConverter = class {
|
|
|
3378
3413
|
accelerator_options: {
|
|
3379
3414
|
device: "mps",
|
|
3380
3415
|
num_threads: options.num_threads
|
|
3416
|
+
},
|
|
3417
|
+
...options.document_timeout !== void 0 && {
|
|
3418
|
+
document_timeout: options.document_timeout
|
|
3381
3419
|
}
|
|
3382
3420
|
};
|
|
3383
3421
|
}
|
|
@@ -3464,16 +3502,7 @@ var PDFConverter = class {
|
|
|
3464
3502
|
* Fetch detailed error information from a failed task result.
|
|
3465
3503
|
*/
|
|
3466
3504
|
async getTaskFailureDetails(task) {
|
|
3467
|
-
|
|
3468
|
-
const result = await task.getResult();
|
|
3469
|
-
if (result.errors?.length) {
|
|
3470
|
-
return result.errors.map((e) => e.message).join("; ");
|
|
3471
|
-
}
|
|
3472
|
-
return `status: ${result.status ?? "unknown"}`;
|
|
3473
|
-
} catch (err) {
|
|
3474
|
-
this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
|
|
3475
|
-
return "unable to retrieve error details";
|
|
3476
|
-
}
|
|
3505
|
+
return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
|
|
3477
3506
|
}
|
|
3478
3507
|
async downloadResult(taskId) {
|
|
3479
3508
|
this.logger.info(
|