@heripo/pdf-parser 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -33,6 +33,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
33
33
  aggregator?: LLMTokenUsageAggregator;
34
34
  /** Callback fired after each batch of VLM pages completes, with cumulative token usage */
35
35
  onTokenUsage?: (report: TokenUsageReport) => void;
36
+ /** Document processing timeout in seconds for the Docling server (default: server default) */
37
+ document_timeout?: number;
36
38
  /** Enable chunked conversion for large PDFs (local files only) */
37
39
  chunkedConversion?: boolean;
38
40
  /** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
package/dist/index.d.ts CHANGED
@@ -33,6 +33,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
33
33
  aggregator?: LLMTokenUsageAggregator;
34
34
  /** Callback fired after each batch of VLM pages completes, with cumulative token usage */
35
35
  onTokenUsage?: (report: TokenUsageReport) => void;
36
+ /** Document processing timeout in seconds for the Docling server (default: server default) */
37
+ document_timeout?: number;
36
38
  /** Enable chunked conversion for large PDFs (local files only) */
37
39
  chunkedConversion?: boolean;
38
40
  /** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
package/dist/index.js CHANGED
@@ -2046,8 +2046,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
2046
2046
 
2047
2047
  Answer whether any Hanja characters are present on this page.
2048
2048
 
2049
- Also identify all languages present on this page. Return an array of BCP 47 language tags ordered by prevalence (primary language first).
2050
- Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-TW", "en-US"]`;
2049
+ Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
2050
+ Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
2051
+ Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
2051
2052
  var OcrStrategySampler = class {
2052
2053
  logger;
2053
2054
  pageRenderer;
@@ -2345,6 +2346,36 @@ var LocalFileServer = class {
2345
2346
  }
2346
2347
  };
2347
2348
 
2349
+ // src/utils/task-failure-details.ts
2350
+ var MAX_RESULT_RETRIES = 3;
2351
+ var RESULT_RETRY_DELAY_MS = 2e3;
2352
+ async function getTaskFailureDetails(task, logger, logPrefix) {
2353
+ for (let attempt = 0; attempt < MAX_RESULT_RETRIES; attempt++) {
2354
+ try {
2355
+ if (attempt > 0) {
2356
+ await new Promise((r) => setTimeout(r, RESULT_RETRY_DELAY_MS));
2357
+ }
2358
+ const result = await task.getResult();
2359
+ if (result.errors?.length) {
2360
+ return result.errors.map((e) => e.message).join("; ");
2361
+ }
2362
+ return `status: ${result.status ?? "unknown"}`;
2363
+ } catch (err) {
2364
+ if (attempt === MAX_RESULT_RETRIES - 1) {
2365
+ logger.error(
2366
+ `${logPrefix} Failed to retrieve task result after ${MAX_RESULT_RETRIES} attempts:`,
2367
+ err
2368
+ );
2369
+ return "unable to retrieve error details";
2370
+ }
2371
+ logger.warn(
2372
+ `${logPrefix} Result not available yet, retrying (${attempt + 1}/${MAX_RESULT_RETRIES})...`
2373
+ );
2374
+ }
2375
+ }
2376
+ return "unable to retrieve error details";
2377
+ }
2378
+
2348
2379
  // src/core/chunked-pdf-converter.ts
2349
2380
  import {
2350
2381
  copyFileSync,
@@ -2699,14 +2730,15 @@ var ChunkedPDFConverter = class {
2699
2730
  const status = await task.poll();
2700
2731
  if (status.task_status === "success") return;
2701
2732
  if (status.task_status === "failure") {
2702
- let details = "unknown";
2703
- try {
2704
- const result = await task.getResult();
2705
- if (result.errors?.length) {
2706
- details = result.errors.map((e) => e.message).join("; ");
2707
- }
2708
- } catch {
2709
- }
2733
+ const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
2734
+ this.logger.error(
2735
+ `[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
2736
+ );
2737
+ const details = await getTaskFailureDetails(
2738
+ task,
2739
+ this.logger,
2740
+ "[ChunkedPDFConverter]"
2741
+ );
2710
2742
  throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
2711
2743
  }
2712
2744
  await new Promise(
@@ -3343,6 +3375,7 @@ var PDFConverter = class {
3343
3375
  return {
3344
3376
  ...omit(options, [
3345
3377
  "num_threads",
3378
+ "document_timeout",
3346
3379
  "forceImagePdf",
3347
3380
  "strategySamplerModel",
3348
3381
  "vlmProcessorModel",
@@ -3364,6 +3397,8 @@ var PDFConverter = class {
3364
3397
  framework: "livetext"
3365
3398
  },
3366
3399
  generate_picture_images: true,
3400
+ do_picture_classification: true,
3401
+ do_picture_description: true,
3367
3402
  generate_page_images: false,
3368
3403
  // Page images are rendered by PageRenderer (ImageMagick) after conversion
3369
3404
  images_scale: 2,
@@ -3378,6 +3413,9 @@ var PDFConverter = class {
3378
3413
  accelerator_options: {
3379
3414
  device: "mps",
3380
3415
  num_threads: options.num_threads
3416
+ },
3417
+ ...options.document_timeout !== void 0 && {
3418
+ document_timeout: options.document_timeout
3381
3419
  }
3382
3420
  };
3383
3421
  }
@@ -3464,16 +3502,7 @@ var PDFConverter = class {
3464
3502
  * Fetch detailed error information from a failed task result.
3465
3503
  */
3466
3504
  async getTaskFailureDetails(task) {
3467
- try {
3468
- const result = await task.getResult();
3469
- if (result.errors?.length) {
3470
- return result.errors.map((e) => e.message).join("; ");
3471
- }
3472
- return `status: ${result.status ?? "unknown"}`;
3473
- } catch (err) {
3474
- this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
3475
- return "unable to retrieve error details";
3476
- }
3505
+ return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
3477
3506
  }
3478
3507
  async downloadResult(taskId) {
3479
3508
  this.logger.info(