npm - @heripo/pdf-parser - Versions diffs - 0.1.13 → 0.1.15 - Mend

@heripo/pdf-parser 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.d.cts CHANGED

@@ -33,12 +33,16 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
     aggregator?: LLMTokenUsageAggregator;
     /** Callback fired after each batch of VLM pages completes, with cumulative token usage */
     onTokenUsage?: (report: TokenUsageReport) => void;
+    /** Document processing timeout in seconds for the Docling server (default: server default) */
+    document_timeout?: number;
     /** Enable chunked conversion for large PDFs (local files only) */
     chunkedConversion?: boolean;
     /** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
     chunkSize?: number;
     /** Max retry attempts per failed chunk (default: CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES) */
     chunkMaxRetries?: number;
+    /** LLM model for document type validation (opt-in: skipped when not set) */
+    documentValidationModel?: LanguageModel;
 };
 /** Result of strategy-based conversion */
 interface ConvertWithStrategyResult {
@@ -177,6 +181,17 @@ declare class ImagePdfFallbackError extends Error {
     constructor(originalError: Error, fallbackError: Error);
 }
+/**
+ * Error thrown when the uploaded PDF does not appear to be
+ * a Korean archaeological investigation report.
+ */
+declare class InvalidDocumentTypeError extends Error {
+    readonly reason: string;
+    readonly name = "InvalidDocumentTypeError";
+    readonly code = "INVALID_DOCUMENT_TYPE";
+    constructor(reason: string);
+}
 /**
  * Intermediate format produced by VLM page-by-page processing.
  * Intentionally kept simple so VLM prompts stay short and accurate.
@@ -285,4 +300,4 @@ declare class VlmResponseValidator {
     private static detectRepetitivePattern;
 }
-export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };
+export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, InvalidDocumentTypeError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };

package/dist/index.d.ts CHANGED

@@ -33,12 +33,16 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
     aggregator?: LLMTokenUsageAggregator;
     /** Callback fired after each batch of VLM pages completes, with cumulative token usage */
     onTokenUsage?: (report: TokenUsageReport) => void;
+    /** Document processing timeout in seconds for the Docling server (default: server default) */
+    document_timeout?: number;
     /** Enable chunked conversion for large PDFs (local files only) */
     chunkedConversion?: boolean;
     /** Pages per chunk (default: CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE) */
     chunkSize?: number;
     /** Max retry attempts per failed chunk (default: CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES) */
     chunkMaxRetries?: number;
+    /** LLM model for document type validation (opt-in: skipped when not set) */
+    documentValidationModel?: LanguageModel;
 };
 /** Result of strategy-based conversion */
 interface ConvertWithStrategyResult {
@@ -177,6 +181,17 @@ declare class ImagePdfFallbackError extends Error {
     constructor(originalError: Error, fallbackError: Error);
 }
+/**
+ * Error thrown when the uploaded PDF does not appear to be
+ * a Korean archaeological investigation report.
+ */
+declare class InvalidDocumentTypeError extends Error {
+    readonly reason: string;
+    readonly name = "InvalidDocumentTypeError";
+    readonly code = "INVALID_DOCUMENT_TYPE";
+    constructor(reason: string);
+}
 /**
  * Intermediate format produced by VLM page-by-page processing.
  * Intentionally kept simple so VLM prompts stay short and accurate.
@@ -285,4 +300,4 @@ declare class VlmResponseValidator {
     private static detectRepetitivePattern;
 }
-export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };
+export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, InvalidDocumentTypeError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };

package/dist/index.js CHANGED

@@ -1484,6 +1484,32 @@ var PdfTextExtractor = class {
     }
     return result.stdout;
   }
+  /**
+   * Extract text from a range of PDF pages using a single pdftotext invocation.
+   * Returns empty string on failure (logged as warning).
+   *
+   * @param pdfPath - Absolute path to the source PDF file
+   * @param firstPage - First page number (1-based)
+   * @param lastPage - Last page number (1-based, inclusive)
+   */
+  async extractPageRange(pdfPath, firstPage, lastPage) {
+    const result = await spawnAsync("pdftotext", [
+      "-f",
+      firstPage.toString(),
+      "-l",
+      lastPage.toString(),
+      "-layout",
+      pdfPath,
+      "-"
+    ]);
+    if (result.code !== 0) {
+      this.logger.warn(
+        `[PdfTextExtractor] pdftotext failed for pages ${firstPage}-${lastPage}: ${result.stderr || "Unknown error"}`
+      );
+      return "";
+    }
+    return result.stdout;
+  }
   /**
    * Extract text from a single PDF page using pdftotext.
    * Returns empty string on failure (logged as warning).
@@ -2046,8 +2072,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
 Answer whether any Hanja characters are present on this page.
-Also identify all languages present on this page. Return an array of BCP 47 language tags ordered by prevalence (primary language first).
-Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-TW", "en-US"]`;
+Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
+Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
+Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
 var OcrStrategySampler = class {
   logger;
   pageRenderer;
@@ -2345,6 +2372,119 @@ var LocalFileServer = class {
   }
 };
+// src/utils/task-failure-details.ts
+var MAX_RESULT_RETRIES = 3;
+var RESULT_RETRY_DELAY_MS = 2e3;
+async function getTaskFailureDetails(task, logger, logPrefix) {
+  for (let attempt = 0; attempt < MAX_RESULT_RETRIES; attempt++) {
+    try {
+      if (attempt > 0) {
+        await new Promise((r) => setTimeout(r, RESULT_RETRY_DELAY_MS));
+      }
+      const result = await task.getResult();
+      if (result.errors?.length) {
+        return result.errors.map((e) => e.message).join("; ");
+      }
+      return `status: ${result.status ?? "unknown"}`;
+    } catch (err) {
+      if (attempt === MAX_RESULT_RETRIES - 1) {
+        logger.error(
+          `${logPrefix} Failed to retrieve task result after ${MAX_RESULT_RETRIES} attempts:`,
+          err
+        );
+        return "unable to retrieve error details";
+      }
+      logger.warn(
+        `${logPrefix} Result not available yet, retrying (${attempt + 1}/${MAX_RESULT_RETRIES})...`
+      );
+    }
+  }
+  return "unable to retrieve error details";
+}
+// src/validators/document-type-validator.ts
+import { z as z3 } from "zod";
+// src/errors/invalid-document-type-error.ts
+var InvalidDocumentTypeError = class extends Error {
+  constructor(reason) {
+    super(
+      `The uploaded PDF does not appear to be a Korean archaeological investigation report. Reason: ${reason}`
+    );
+    this.reason = reason;
+  }
+  name = "InvalidDocumentTypeError";
+  code = "INVALID_DOCUMENT_TYPE";
+};
+// src/validators/document-type-validator.ts
+var SYSTEM_PROMPT = `You are given text extracted from the first and last pages of a PDF document.
+Determine if this document is a Korean archaeological investigation report (\uACE0\uACE0\uD559 \uC870\uC0AC \uBCF4\uACE0\uC11C).
+Valid types include:
+- \uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (excavation investigation report)
+- \uC2DC\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (trial excavation report)
+- \uC9C0\uD45C\uC870\uC0AC\uBCF4\uACE0\uC11C (surface survey report)
+- \uC815\uBC00\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (detailed excavation report)
+- \uC218\uC911\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (underwater excavation report)
+NOT valid (these are NOT archaeological investigation reports):
+- \uC218\uB9AC\uBCF4\uACE0\uC11C (repair/restoration report)
+- \uB2E8\uC21C \uC2E4\uCE21 \uBCF4\uACE0\uC11C (simple measurement report)
+- \uAC74\uCD95\uC870\uC0AC\uBCF4\uACE0\uC11C (architectural investigation report)
+- \uD559\uC220\uC870\uC0AC\uBCF4\uACE0\uC11C (academic research report)
+- \uD658\uACBD\uC601\uD5A5\uD3C9\uAC00 (environmental impact assessment)
+- General academic papers or textbooks about archaeology`;
+var documentTypeSchema = z3.object({
+  isValid: z3.boolean().describe("Whether this is a Korean archaeological investigation report"),
+  reason: z3.string().describe("Brief reason for the decision")
+});
+var DocumentTypeValidator = class {
+  textExtractor;
+  constructor(textExtractor) {
+    this.textExtractor = textExtractor;
+  }
+  /**
+   * Validate that the PDF at the given path is an archaeological investigation report.
+   *
+   * @throws {InvalidDocumentTypeError} if the document is not a valid report type
+   */
+  async validate(pdfPath, model, options) {
+    const totalPages = await this.textExtractor.getPageCount(pdfPath);
+    if (totalPages === 0) return;
+    const frontText = await this.textExtractor.extractPageRange(
+      pdfPath,
+      1,
+      Math.min(10, totalPages)
+    );
+    let backText = "";
+    if (totalPages > 20) {
+      backText = await this.textExtractor.extractPageRange(
+        pdfPath,
+        Math.max(1, totalPages - 9),
+        totalPages
+      );
+    }
+    const combinedText = (frontText + "\n" + backText).trim();
+    if (combinedText.length === 0) return;
+    const result = await LLMCaller.call({
+      schema: documentTypeSchema,
+      systemPrompt: SYSTEM_PROMPT,
+      userPrompt: `--- Document text (first and last pages) ---
+${combinedText}`,
+      primaryModel: model,
+      maxRetries: 2,
+      temperature: 0,
+      abortSignal: options?.abortSignal,
+      component: "DocumentTypeValidator",
+      phase: "validation"
+    });
+    if (!result.output.isValid) {
+      throw new InvalidDocumentTypeError(result.output.reason);
+    }
+  }
+};
 // src/core/chunked-pdf-converter.ts
 import {
   copyFileSync,
@@ -2699,14 +2839,15 @@ var ChunkedPDFConverter = class {
       const status = await task.poll();
       if (status.task_status === "success") return;
       if (status.task_status === "failure") {
-        let details = "unknown";
-        try {
-          const result = await task.getResult();
-          if (result.errors?.length) {
-            details = result.errors.map((e) => e.message).join("; ");
-          }
-        } catch {
-        }
+        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
+        this.logger.error(
+          `[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
+        );
+        const details = await getTaskFailureDetails(
+          task,
+          this.logger,
+          "[ChunkedPDFConverter]"
+        );
         throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
       }
       await new Promise(
@@ -2971,8 +3112,27 @@ var PDFConverter = class {
     this.enableImagePdfFallback = enableImagePdfFallback;
     this.timeout = timeout;
   }
+  documentTypeValidated = false;
+  /**
+   * Validate that the PDF is a Korean archaeological investigation report.
+   * Skipped when no documentValidationModel is configured or for non-local URLs.
+   * Only runs once per converter instance (flag prevents duplicate checks on recursive calls).
+   */
+  async validateDocumentType(url, options, abortSignal) {
+    if (this.documentTypeValidated) return;
+    this.documentTypeValidated = true;
+    if (!options.documentValidationModel) return;
+    const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
+    if (!pdfPath) return;
+    const textExtractor = new PdfTextExtractor(this.logger);
+    const validator = new DocumentTypeValidator(textExtractor);
+    await validator.validate(pdfPath, options.documentValidationModel, {
+      abortSignal
+    });
+  }
   async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
     this.logger.info("[PDFConverter] Converting:", url);
+    await this.validateDocumentType(url, options, abortSignal);
     if (options.chunkedConversion && url.startsWith("file://")) {
       const chunked = new ChunkedPDFConverter(
         this.logger,
@@ -3027,6 +3187,7 @@ var PDFConverter = class {
     const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
     const trackedOptions = { ...options, aggregator };
     const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
+    await this.validateDocumentType(url, trackedOptions, abortSignal);
     const strategy = await this.determineStrategy(
       pdfPath,
       reportId,
@@ -3343,6 +3504,7 @@ var PDFConverter = class {
     return {
       ...omit(options, [
         "num_threads",
+        "document_timeout",
         "forceImagePdf",
         "strategySamplerModel",
         "vlmProcessorModel",
@@ -3352,7 +3514,8 @@ var PDFConverter = class {
         "onTokenUsage",
         "chunkedConversion",
         "chunkSize",
-        "chunkMaxRetries"
+        "chunkMaxRetries",
+        "documentValidationModel"
       ]),
       to_formats: ["json", "html"],
       image_export_mode: "embedded",
@@ -3364,6 +3527,8 @@ var PDFConverter = class {
         framework: "livetext"
       },
       generate_picture_images: true,
+      do_picture_classification: true,
+      do_picture_description: true,
       generate_page_images: false,
       // Page images are rendered by PageRenderer (ImageMagick) after conversion
       images_scale: 2,
@@ -3378,6 +3543,9 @@ var PDFConverter = class {
       accelerator_options: {
         device: "mps",
         num_threads: options.num_threads
+      },
+      ...options.document_timeout !== void 0 && {
+        document_timeout: options.document_timeout
       }
     };
   }
@@ -3464,16 +3632,7 @@ var PDFConverter = class {
    * Fetch detailed error information from a failed task result.
    */
   async getTaskFailureDetails(task) {
-    try {
-      const result = await task.getResult();
-      if (result.errors?.length) {
-        return result.errors.map((e) => e.message).join("; ");
-      }
-      return `status: ${result.status ?? "unknown"}`;
-    } catch (err) {
-      this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
-      return "unable to retrieve error details";
-    }
+    return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
   }
   async downloadResult(taskId) {
     this.logger.info(
@@ -4057,6 +4216,7 @@ var VlmResponseValidator = class {
 };
 export {
   ImagePdfFallbackError,
+  InvalidDocumentTypeError,
   PDFParser,
   VlmResponseValidator
 };