@heripo/pdf-parser 0.1.12 → 0.1.14

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1780,7 +1780,8 @@ var VlmTextCorrector = class {
1780
1780
  },
1781
1781
  {
1782
1782
  type: "image",
1783
- image: `data:image/png;base64,${imageBase64}`
1783
+ image: imageBase64,
1784
+ mediaType: "image/png"
1784
1785
  }
1785
1786
  ]
1786
1787
  }
@@ -1992,7 +1993,7 @@ var VlmTextCorrector = class {
1992
1993
  */
1993
1994
  readPageImage(outputDir, pageNo) {
1994
1995
  const imagePath = (0, import_node_path4.join)(outputDir, "pages", `page_${pageNo - 1}.png`);
1995
- return (0, import_node_fs4.readFileSync)(imagePath).toString("base64");
1996
+ return new Uint8Array((0, import_node_fs4.readFileSync)(imagePath));
1996
1997
  }
1997
1998
  /**
1998
1999
  * Apply VLM corrections to the DoclingDocument.
@@ -2069,8 +2070,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
2069
2070
 
2070
2071
  Answer whether any Hanja characters are present on this page.
2071
2072
 
2072
- Also identify all languages present on this page. Return an array of BCP 47 language tags ordered by prevalence (primary language first).
2073
- Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-TW", "en-US"]`;
2073
+ Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
2074
+ Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
2075
+ Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
2074
2076
  var OcrStrategySampler = class {
2075
2077
  logger;
2076
2078
  pageRenderer;
@@ -2261,7 +2263,7 @@ var OcrStrategySampler = class {
2261
2263
  this.logger.debug(
2262
2264
  `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
2263
2265
  );
2264
- const base64Image = (0, import_node_fs5.readFileSync)(pageFile).toString("base64");
2266
+ const imageData = new Uint8Array((0, import_node_fs5.readFileSync)(pageFile));
2265
2267
  const messages = [
2266
2268
  {
2267
2269
  role: "user",
@@ -2269,7 +2271,8 @@ var OcrStrategySampler = class {
2269
2271
  { type: "text", text: KOREAN_HANJA_MIX_PROMPT },
2270
2272
  {
2271
2273
  type: "image",
2272
- image: `data:image/png;base64,${base64Image}`
2274
+ image: imageData,
2275
+ mediaType: "image/png"
2273
2276
  }
2274
2277
  ]
2275
2278
  }
@@ -2367,6 +2370,36 @@ var LocalFileServer = class {
2367
2370
  }
2368
2371
  };
2369
2372
 
2373
+ // src/utils/task-failure-details.ts
2374
+ var MAX_RESULT_RETRIES = 3;
2375
+ var RESULT_RETRY_DELAY_MS = 2e3;
2376
+ async function getTaskFailureDetails(task, logger, logPrefix) {
2377
+ for (let attempt = 0; attempt < MAX_RESULT_RETRIES; attempt++) {
2378
+ try {
2379
+ if (attempt > 0) {
2380
+ await new Promise((r) => setTimeout(r, RESULT_RETRY_DELAY_MS));
2381
+ }
2382
+ const result = await task.getResult();
2383
+ if (result.errors?.length) {
2384
+ return result.errors.map((e) => e.message).join("; ");
2385
+ }
2386
+ return `status: ${result.status ?? "unknown"}`;
2387
+ } catch (err) {
2388
+ if (attempt === MAX_RESULT_RETRIES - 1) {
2389
+ logger.error(
2390
+ `${logPrefix} Failed to retrieve task result after ${MAX_RESULT_RETRIES} attempts:`,
2391
+ err
2392
+ );
2393
+ return "unable to retrieve error details";
2394
+ }
2395
+ logger.warn(
2396
+ `${logPrefix} Result not available yet, retrying (${attempt + 1}/${MAX_RESULT_RETRIES})...`
2397
+ );
2398
+ }
2399
+ }
2400
+ return "unable to retrieve error details";
2401
+ }
2402
+
2370
2403
  // src/core/chunked-pdf-converter.ts
2371
2404
  var import_node_fs7 = require("fs");
2372
2405
  var import_promises4 = require("fs/promises");
@@ -2712,14 +2745,15 @@ var ChunkedPDFConverter = class {
2712
2745
  const status = await task.poll();
2713
2746
  if (status.task_status === "success") return;
2714
2747
  if (status.task_status === "failure") {
2715
- let details = "unknown";
2716
- try {
2717
- const result = await task.getResult();
2718
- if (result.errors?.length) {
2719
- details = result.errors.map((e) => e.message).join("; ");
2720
- }
2721
- } catch {
2722
- }
2748
+ const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
2749
+ this.logger.error(
2750
+ `[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
2751
+ );
2752
+ const details = await getTaskFailureDetails(
2753
+ task,
2754
+ this.logger,
2755
+ "[ChunkedPDFConverter]"
2756
+ );
2723
2757
  throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
2724
2758
  }
2725
2759
  await new Promise(
@@ -3356,6 +3390,7 @@ var PDFConverter = class {
3356
3390
  return {
3357
3391
  ...(0, import_es_toolkit.omit)(options, [
3358
3392
  "num_threads",
3393
+ "document_timeout",
3359
3394
  "forceImagePdf",
3360
3395
  "strategySamplerModel",
3361
3396
  "vlmProcessorModel",
@@ -3377,6 +3412,8 @@ var PDFConverter = class {
3377
3412
  framework: "livetext"
3378
3413
  },
3379
3414
  generate_picture_images: true,
3415
+ do_picture_classification: true,
3416
+ do_picture_description: true,
3380
3417
  generate_page_images: false,
3381
3418
  // Page images are rendered by PageRenderer (ImageMagick) after conversion
3382
3419
  images_scale: 2,
@@ -3391,6 +3428,9 @@ var PDFConverter = class {
3391
3428
  accelerator_options: {
3392
3429
  device: "mps",
3393
3430
  num_threads: options.num_threads
3431
+ },
3432
+ ...options.document_timeout !== void 0 && {
3433
+ document_timeout: options.document_timeout
3394
3434
  }
3395
3435
  };
3396
3436
  }
@@ -3477,16 +3517,7 @@ var PDFConverter = class {
3477
3517
  * Fetch detailed error information from a failed task result.
3478
3518
  */
3479
3519
  async getTaskFailureDetails(task) {
3480
- try {
3481
- const result = await task.getResult();
3482
- if (result.errors?.length) {
3483
- return result.errors.map((e) => e.message).join("; ");
3484
- }
3485
- return `status: ${result.status ?? "unknown"}`;
3486
- } catch (err) {
3487
- this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
3488
- return "unable to retrieve error details";
3489
- }
3520
+ return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
3490
3521
  }
3491
3522
  async downloadResult(taskId) {
3492
3523
  this.logger.info(