@heripo/pdf-parser 0.1.12 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -33,6 +33,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
33
33
  aggregator?: LLMTokenUsageAggregator;
34
34
  /** Callback fired after each batch of VLM pages completes, with cumulative token usage */
35
35
  onTokenUsage?: (report: TokenUsageReport) => void;
36
+ /** Document processing timeout, in seconds, passed to the Docling server. When omitted, the option is not sent and the server's own default applies. */
37
+ document_timeout?: number;
36
38
  /** Enable chunked conversion for large PDFs (local files only) */
37
39
  chunkedConversion?: boolean;
38
40
  /** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
package/dist/index.d.ts CHANGED
@@ -33,6 +33,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
33
33
  aggregator?: LLMTokenUsageAggregator;
34
34
  /** Callback fired after each batch of VLM pages completes, with cumulative token usage */
35
35
  onTokenUsage?: (report: TokenUsageReport) => void;
36
+ /** Document processing timeout, in seconds, passed to the Docling server. When omitted, the option is not sent and the server's own default applies. */
37
+ document_timeout?: number;
36
38
  /** Enable chunked conversion for large PDFs (local files only) */
37
39
  chunkedConversion?: boolean;
38
40
  /** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
package/dist/index.js CHANGED
@@ -1756,7 +1756,8 @@ var VlmTextCorrector = class {
1756
1756
  },
1757
1757
  {
1758
1758
  type: "image",
1759
- image: `data:image/png;base64,${imageBase64}`
1759
+ image: imageBase64,
1760
+ mediaType: "image/png"
1760
1761
  }
1761
1762
  ]
1762
1763
  }
@@ -1968,7 +1969,7 @@ var VlmTextCorrector = class {
1968
1969
  */
1969
1970
  readPageImage(outputDir, pageNo) {
1970
1971
  const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
1971
- return readFileSync(imagePath).toString("base64");
1972
+ return new Uint8Array(readFileSync(imagePath));
1972
1973
  }
1973
1974
  /**
1974
1975
  * Apply VLM corrections to the DoclingDocument.
@@ -2045,8 +2046,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
2045
2046
 
2046
2047
  Answer whether any Hanja characters are present on this page.
2047
2048
 
2048
- Also identify all languages present on this page. Return an array of BCP 47 language tags ordered by prevalence (primary language first).
2049
- Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-TW", "en-US"]`;
2049
+ Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
2050
+ Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
2051
+ Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
2050
2052
  var OcrStrategySampler = class {
2051
2053
  logger;
2052
2054
  pageRenderer;
@@ -2237,7 +2239,7 @@ var OcrStrategySampler = class {
2237
2239
  this.logger.debug(
2238
2240
  `[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
2239
2241
  );
2240
- const base64Image = readFileSync2(pageFile).toString("base64");
2242
+ const imageData = new Uint8Array(readFileSync2(pageFile));
2241
2243
  const messages = [
2242
2244
  {
2243
2245
  role: "user",
@@ -2245,7 +2247,8 @@ var OcrStrategySampler = class {
2245
2247
  { type: "text", text: KOREAN_HANJA_MIX_PROMPT },
2246
2248
  {
2247
2249
  type: "image",
2248
- image: `data:image/png;base64,${base64Image}`
2250
+ image: imageData,
2251
+ mediaType: "image/png"
2249
2252
  }
2250
2253
  ]
2251
2254
  }
@@ -2343,6 +2346,36 @@ var LocalFileServer = class {
2343
2346
  }
2344
2347
  };
2345
2348
 
2349
+ // src/utils/task-failure-details.ts
2350
+ var MAX_RESULT_RETRIES = 3;
2351
+ var RESULT_RETRY_DELAY_MS = 2e3;
2352
+ async function getTaskFailureDetails(task, logger, logPrefix) {
2353
+ for (let attempt = 0; attempt < MAX_RESULT_RETRIES; attempt++) {
2354
+ try {
2355
+ if (attempt > 0) {
2356
+ await new Promise((r) => setTimeout(r, RESULT_RETRY_DELAY_MS));
2357
+ }
2358
+ const result = await task.getResult();
2359
+ if (result.errors?.length) {
2360
+ return result.errors.map((e) => e.message).join("; ");
2361
+ }
2362
+ return `status: ${result.status ?? "unknown"}`;
2363
+ } catch (err) {
2364
+ if (attempt === MAX_RESULT_RETRIES - 1) {
2365
+ logger.error(
2366
+ `${logPrefix} Failed to retrieve task result after ${MAX_RESULT_RETRIES} attempts:`,
2367
+ err
2368
+ );
2369
+ return "unable to retrieve error details";
2370
+ }
2371
+ logger.warn(
2372
+ `${logPrefix} Result not available yet, retrying (${attempt + 1}/${MAX_RESULT_RETRIES})...`
2373
+ );
2374
+ }
2375
+ }
2376
+ return "unable to retrieve error details";
2377
+ }
2378
+
2346
2379
  // src/core/chunked-pdf-converter.ts
2347
2380
  import {
2348
2381
  copyFileSync,
@@ -2697,14 +2730,15 @@ var ChunkedPDFConverter = class {
2697
2730
  const status = await task.poll();
2698
2731
  if (status.task_status === "success") return;
2699
2732
  if (status.task_status === "failure") {
2700
- let details = "unknown";
2701
- try {
2702
- const result = await task.getResult();
2703
- if (result.errors?.length) {
2704
- details = result.errors.map((e) => e.message).join("; ");
2705
- }
2706
- } catch {
2707
- }
2733
+ const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
2734
+ this.logger.error(
2735
+ `[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
2736
+ );
2737
+ const details = await getTaskFailureDetails(
2738
+ task,
2739
+ this.logger,
2740
+ "[ChunkedPDFConverter]"
2741
+ );
2708
2742
  throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
2709
2743
  }
2710
2744
  await new Promise(
@@ -3341,6 +3375,7 @@ var PDFConverter = class {
3341
3375
  return {
3342
3376
  ...omit(options, [
3343
3377
  "num_threads",
3378
+ "document_timeout",
3344
3379
  "forceImagePdf",
3345
3380
  "strategySamplerModel",
3346
3381
  "vlmProcessorModel",
@@ -3362,6 +3397,8 @@ var PDFConverter = class {
3362
3397
  framework: "livetext"
3363
3398
  },
3364
3399
  generate_picture_images: true,
3400
+ do_picture_classification: true,
3401
+ do_picture_description: true,
3365
3402
  generate_page_images: false,
3366
3403
  // Page images are rendered by PageRenderer (ImageMagick) after conversion
3367
3404
  images_scale: 2,
@@ -3376,6 +3413,9 @@ var PDFConverter = class {
3376
3413
  accelerator_options: {
3377
3414
  device: "mps",
3378
3415
  num_threads: options.num_threads
3416
+ },
3417
+ ...options.document_timeout !== void 0 && {
3418
+ document_timeout: options.document_timeout
3379
3419
  }
3380
3420
  };
3381
3421
  }
@@ -3462,16 +3502,7 @@ var PDFConverter = class {
3462
3502
  * Fetch detailed error information from a failed task result.
3463
3503
  */
3464
3504
  async getTaskFailureDetails(task) {
3465
- try {
3466
- const result = await task.getResult();
3467
- if (result.errors?.length) {
3468
- return result.errors.map((e) => e.message).join("; ");
3469
- }
3470
- return `status: ${result.status ?? "unknown"}`;
3471
- } catch (err) {
3472
- this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
3473
- return "unable to retrieve error details";
3474
- }
3505
+ return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
3475
3506
  }
3476
3507
  async downloadResult(taskId) {
3477
3508
  this.logger.info(