@heripo/pdf-parser 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -2070,8 +2070,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
2070
2070
 
2071
2071
  Answer whether any Hanja characters are present on this page.
2072
2072
 
2073
- Also identify all languages present on this page. Return an array of BCP 47 language tags ordered by prevalence (primary language first).
2074
- Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-TW", "en-US"]`;
2073
+ Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
2074
+ Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
2075
+ Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
2075
2076
  var OcrStrategySampler = class {
2076
2077
  logger;
2077
2078
  pageRenderer;
@@ -2369,6 +2370,36 @@ var LocalFileServer = class {
2369
2370
  }
2370
2371
  };
2371
2372
 
2373
// src/utils/task-failure-details.ts
var MAX_RESULT_RETRIES = 3;
var RESULT_RETRY_DELAY_MS = 2e3;

/**
 * Build a human-readable description of why a task failed.
 *
 * Calls `task.getResult()` and, if that call throws, retries it after a
 * delay. The first attempt is made immediately; a delay is only inserted
 * before the second and later attempts.
 *
 * @param {{ getResult: () => Promise<any> }} task - Task whose result is fetched.
 * @param {{ warn: Function, error: Function }} logger - Receives a warning for
 *   every failed attempt that will be retried and an error (with the thrown
 *   value) when the last attempt fails.
 * @param {string} logPrefix - Text prepended to every log message.
 * @param {{ maxRetries?: number, retryDelayMs?: number }} [options] - Optional
 *   overrides. `maxRetries` is the total number of attempts (default
 *   `MAX_RESULT_RETRIES`); `retryDelayMs` is the pause before each retry
 *   (default `RESULT_RETRY_DELAY_MS`).
 * @returns {Promise<string>} The error messages joined with "; " when the
 *   result carries errors, otherwise `status: <status>`, or the fixed text
 *   "unable to retrieve error details" when no attempt succeeded. This
 *   function does not rethrow failures of `task.getResult()`.
 */
async function getTaskFailureDetails(task, logger, logPrefix, options) {
  // `?? {}` keeps both an omitted and an explicit `null` options argument safe.
  const {
    maxRetries = MAX_RESULT_RETRIES,
    retryDelayMs = RESULT_RETRY_DELAY_MS
  } = options ?? {};
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      if (attempt > 0) {
        // Only wait between attempts, never before the first one.
        await new Promise((r) => setTimeout(r, retryDelayMs));
      }
      const result = await task.getResult();
      if (result.errors?.length) {
        return result.errors.map((e) => e.message).join("; ");
      }
      return `status: ${result.status ?? "unknown"}`;
    } catch (err) {
      if (attempt === maxRetries - 1) {
        // Last attempt: report the underlying error and give up.
        logger.error(
          `${logPrefix} Failed to retrieve task result after ${maxRetries} attempts:`,
          err
        );
        return "unable to retrieve error details";
      }
      logger.warn(
        `${logPrefix} Result not available yet, retrying (${attempt + 1}/${maxRetries})...`
      );
    }
  }
  // Reached only when maxRetries is zero or negative, i.e. no attempt was made.
  return "unable to retrieve error details";
}
2402
+
2372
2403
  // src/core/chunked-pdf-converter.ts
2373
2404
  var import_node_fs7 = require("fs");
2374
2405
  var import_promises4 = require("fs/promises");
@@ -2714,14 +2745,15 @@ var ChunkedPDFConverter = class {
2714
2745
  const status = await task.poll();
2715
2746
  if (status.task_status === "success") return;
2716
2747
  if (status.task_status === "failure") {
2717
- let details = "unknown";
2718
- try {
2719
- const result = await task.getResult();
2720
- if (result.errors?.length) {
2721
- details = result.errors.map((e) => e.message).join("; ");
2722
- }
2723
- } catch {
2724
- }
2748
+ const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
2749
+ this.logger.error(
2750
+ `[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
2751
+ );
2752
+ const details = await getTaskFailureDetails(
2753
+ task,
2754
+ this.logger,
2755
+ "[ChunkedPDFConverter]"
2756
+ );
2725
2757
  throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
2726
2758
  }
2727
2759
  await new Promise(
@@ -3358,6 +3390,7 @@ var PDFConverter = class {
3358
3390
  return {
3359
3391
  ...(0, import_es_toolkit.omit)(options, [
3360
3392
  "num_threads",
3393
+ "document_timeout",
3361
3394
  "forceImagePdf",
3362
3395
  "strategySamplerModel",
3363
3396
  "vlmProcessorModel",
@@ -3379,6 +3412,8 @@ var PDFConverter = class {
3379
3412
  framework: "livetext"
3380
3413
  },
3381
3414
  generate_picture_images: true,
3415
+ do_picture_classification: true,
3416
+ do_picture_description: true,
3382
3417
  generate_page_images: false,
3383
3418
  // Page images are rendered by PageRenderer (ImageMagick) after conversion
3384
3419
  images_scale: 2,
@@ -3393,6 +3428,9 @@ var PDFConverter = class {
3393
3428
  accelerator_options: {
3394
3429
  device: "mps",
3395
3430
  num_threads: options.num_threads
3431
+ },
3432
+ ...options.document_timeout !== void 0 && {
3433
+ document_timeout: options.document_timeout
3396
3434
  }
3397
3435
  };
3398
3436
  }
@@ -3479,16 +3517,7 @@ var PDFConverter = class {
3479
3517
  * Fetch detailed error information from a failed task result.
3480
3518
  */
3481
3519
  async getTaskFailureDetails(task) {
3482
- try {
3483
- const result = await task.getResult();
3484
- if (result.errors?.length) {
3485
- return result.errors.map((e) => e.message).join("; ");
3486
- }
3487
- return `status: ${result.status ?? "unknown"}`;
3488
- } catch (err) {
3489
- this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
3490
- return "unable to retrieve error details";
3491
- }
3520
+ return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
3492
3521
  }
3493
3522
  async downloadResult(taskId) {
3494
3523
  this.logger.info(