npm - @heripo/pdf-parser - Versions diffs - 0.1.14 → 0.1.16 - Mend

@heripo/pdf-parser 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
 var src_exports = {}; // export object filled in by the __export call on the next line
 __export(src_exports, { // each entry is an arrow function returning the named binding; __export is defined outside this view (presumably the bundler's export helper — confirm)
   ImagePdfFallbackError: () => ImagePdfFallbackError,
+  InvalidDocumentTypeError: () => InvalidDocumentTypeError, // exposes the InvalidDocumentTypeError class defined further down in this file
   PDFParser: () => PDFParser,
   VlmResponseValidator: () => VlmResponseValidator
 });
@@ -1508,6 +1509,32 @@ var PdfTextExtractor = class {
     }
     return result.stdout;
   }
+  /**
+   * Extract text from a range of PDF pages using a single pdftotext invocation.
+   * Returns empty string on failure (logged as warning).
+   *
+   * @param pdfPath - Absolute path to the source PDF file
+   * @param firstPage - First page number (1-based)
+   * @param lastPage - Last page number (1-based, inclusive)
+   * @returns The captured stdout of pdftotext, or "" when it exits with a non-zero code
+   */
+  async extractPageRange(pdfPath, firstPage, lastPage) {
+    const result = await spawnAsync("pdftotext", [
+      "-f", // followed by the first page to convert
+      firstPage.toString(),
+      "-l", // followed by the last page to convert
+      lastPage.toString(),
+      "-layout", // layout mode flag — see the pdftotext manual for exact semantics
+      pdfPath,
+      "-" // output target "-": the text is read back from result.stdout below
+    ]);
+    if (result.code !== 0) { // best-effort: warn and return "" instead of throwing
+      this.logger.warn(
+        `[PdfTextExtractor] pdftotext failed for pages ${firstPage}-${lastPage}: ${result.stderr || "Unknown error"}`
+      );
+      return "";
+    }
+    return result.stdout;
+  }
   /**
    * Extract text from a single PDF page using pdftotext.
    * Returns empty string on failure (logged as warning).
@@ -2400,6 +2427,94 @@ async function getTaskFailureDetails(task, logger, logPrefix) {
   return "unable to retrieve error details";
 }
+// src/validators/document-type-validator.ts
+var import_zod = require("zod");
+// src/errors/invalid-document-type-error.ts
+var InvalidDocumentTypeError = class extends Error { // thrown by DocumentTypeValidator.validate when the model's isValid verdict is false
+  constructor(reason) { // reason: explanation text, embedded in the message and kept on the instance
+    super( // NOTE(review): the message says "Korean" while SYSTEM_PROMPT accepts reports "from any country" — confirm intended wording
+      `The uploaded PDF does not appear to be a Korean archaeological investigation report. Reason: ${reason}`
+    );
+    this.reason = reason;
+  }
+  name = "InvalidDocumentTypeError"; // instance field; takes precedence over the name inherited from Error
+  code = "INVALID_DOCUMENT_TYPE"; // string identifier — presumably for callers to branch on; confirm with consumers
+};
+// src/validators/document-type-validator.ts
+var SYSTEM_PROMPT = `You are given text extracted from the first and last pages of a PDF document.
+Determine if this document is an archaeological investigation report from any country.
+Valid types include (in any language):
+- Excavation report (\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)
+- Trial excavation report (\uC2DC\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)
+- Surface survey report (\uC9C0\uD45C\uC870\uC0AC\uBCF4\uACE0\uC11C)
+- Detailed excavation report (\uC815\uBC00\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)
+- Underwater excavation report (\uC218\uC911\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)
+- Salvage excavation report
+- Archaeological assessment report
+- Any other archaeological fieldwork investigation report
+NOT valid (these are NOT archaeological investigation reports):
+- Repair/restoration reports (\uC218\uB9AC\uBCF4\uACE0\uC11C)
+- Simple measurement reports (\uB2E8\uC21C \uC2E4\uCE21 \uBCF4\uACE0\uC11C)
+- Architectural investigation reports (\uAC74\uCD95\uC870\uC0AC\uBCF4\uACE0\uC11C)
+- Academic research reports (\uD559\uC220\uC870\uC0AC\uBCF4\uACE0\uC11C)
+- Environmental impact assessments (\uD658\uACBD\uC601\uD5A5\uD3C9\uAC00)
+- General academic papers or textbooks about archaeology
+- Conservation/preservation reports
+- Museum catalogs or exhibition guides`; // system prompt passed to LLMCaller.call by DocumentTypeValidator.validate; the \u escapes are Hangul (Korean) names of the report types
+var documentTypeSchema = import_zod.z.object({ // schema handed to LLMCaller.call; validate reads result.output.isValid and result.output.reason
+  isValid: import_zod.z.boolean().describe("Whether this is an archaeological investigation report"), // false makes validate throw InvalidDocumentTypeError
+  reason: import_zod.z.string().describe("Brief reason for the decision") // becomes the reason of the thrown error
+});
+var DocumentTypeValidator = class { // asks a model whether text sampled from a PDF looks like an archaeological investigation report
+  textExtractor; // must provide getPageCount(pdfPath) and extractPageRange(pdfPath, firstPage, lastPage), the two methods called below
+  constructor(textExtractor) {
+    this.textExtractor = textExtractor;
+  }
+  /**
+   * Validate that the PDF at the given path is an archaeological investigation report.
+   *
+   * Text is sampled from the first min(10, totalPages) pages and, only when the
+   * document has more than 20 pages, from the last 10 pages as well; a document
+   * with 11 to 20 pages therefore contributes only its first 10 pages.
+   * The sampled text is sent through LLMCaller.call with SYSTEM_PROMPT and
+   * documentTypeSchema, and result.output.isValid decides the outcome.
+   *
+   * Returns without calling the model (i.e. the document is accepted) when the
+   * page count is 0 or when the extracted text is empty after trimming.
+   *
+   * @param pdfPath - Path of the PDF, handed to the text extractor
+   * @param model - Passed to LLMCaller.call as primaryModel
+   * @param options - Optional; only options.abortSignal is read, and it is forwarded to LLMCaller.call
+   * @throws {InvalidDocumentTypeError} if the document is not a valid report type
+   */
+  async validate(pdfPath, model, options) {
+    const totalPages = await this.textExtractor.getPageCount(pdfPath);
+    if (totalPages === 0) return; // nothing to inspect: skip validation
+    const frontText = await this.textExtractor.extractPageRange(
+      pdfPath,
+      1,
+      Math.min(10, totalPages) // at most the first 10 pages
+    );
+    let backText = "";
+    if (totalPages > 20) { // above 20 pages the last-10 window cannot overlap the first-10 window
+      backText = await this.textExtractor.extractPageRange(
+        pdfPath,
+        Math.max(1, totalPages - 9), // first page of the last 10 (range is inclusive)
+        totalPages
+      );
+    }
+    const combinedText = (frontText + "\n" + backText).trim();
+    if (combinedText.length === 0) return; // no extractable text: skip the model call, document is accepted
+    const result = await LLMCaller.call({
+      schema: documentTypeSchema,
+      systemPrompt: SYSTEM_PROMPT,
+      userPrompt: `--- Document text (first and last pages) ---
+${combinedText}`,
+      primaryModel: model,
+      maxRetries: 2, // retry semantics are defined by LLMCaller, which is outside this view
+      temperature: 0,
+      abortSignal: options?.abortSignal,
+      component: "DocumentTypeValidator",
+      phase: "validation"
+    });
+    if (!result.output.isValid) {
+      throw new InvalidDocumentTypeError(result.output.reason);
+    }
+  }
+};
 // src/core/chunked-pdf-converter.ts
 var import_node_fs7 = require("fs");
 var import_promises4 = require("fs/promises");
@@ -3018,8 +3133,27 @@ var PDFConverter = class {
     this.enableImagePdfFallback = enableImagePdfFallback;
     this.timeout = timeout;
   }
+  documentTypeValidated = false; // set to true by the first validateDocumentType call; never reset in the code visible here
+  /**
+   * Validate that the PDF is an archaeological investigation report by delegating
+   * to DocumentTypeValidator. (The validator's prompt accepts reports from any
+   * country, although the InvalidDocumentTypeError message still says "Korean".)
+   * Skipped when no documentValidationModel is configured or for non-local URLs.
+   * Only runs once per converter instance (flag prevents duplicate checks on recursive calls).
+   *
+   * The flag is set before the check runs, so later calls on the same instance
+   * return immediately even when the first call skipped validation or threw.
+   * NOTE(review): if one converter instance is reused for several PDFs, only the
+   * first one would be checked — confirm that instances are single-use.
+   *
+   * @param url - Source URL; only local file URLs are validated
+   * @param options - Conversion options; documentValidationModel is the model given to the validator
+   * @param abortSignal - Forwarded to DocumentTypeValidator.validate
+   * @throws {InvalidDocumentTypeError} propagated from DocumentTypeValidator.validate
+   */
+  async validateDocumentType(url, options, abortSignal) {
+    if (this.documentTypeValidated) return;
+    this.documentTypeValidated = true; // marked up front, before any of the skip conditions or the check itself
+    if (!options.documentValidationModel) return;
+    const pdfPath = url.startsWith("file://") ? url.slice(7) : null; // 7 is the length of the scheme prefix; the rest is used as the path without percent-decoding
+    if (!pdfPath) return;
+    const textExtractor = new PdfTextExtractor(this.logger);
+    const validator = new DocumentTypeValidator(textExtractor);
+    await validator.validate(pdfPath, options.documentValidationModel, {
+      abortSignal
+    });
+  }
   async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
     this.logger.info("[PDFConverter] Converting:", url);
+    await this.validateDocumentType(url, options, abortSignal);
     if (options.chunkedConversion && url.startsWith("file://")) {
       const chunked = new ChunkedPDFConverter(
         this.logger,
@@ -3074,6 +3208,7 @@ var PDFConverter = class {
     const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
     const trackedOptions = { ...options, aggregator };
     const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
+    await this.validateDocumentType(url, trackedOptions, abortSignal);
     const strategy = await this.determineStrategy(
       pdfPath,
       reportId,
@@ -3400,7 +3535,8 @@ var PDFConverter = class {
         "onTokenUsage",
         "chunkedConversion",
         "chunkSize",
-        "chunkMaxRetries"
+        "chunkMaxRetries",
+        "documentValidationModel"
       ]),
       to_formats: ["json", "html"],
       image_export_mode: "embedded",
@@ -4102,6 +4238,7 @@ var VlmResponseValidator = class {
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = { // "0 &&" short-circuits, so this assignment never executes; the object only lists the export names
   ImagePdfFallbackError,
+  InvalidDocumentTypeError,
   PDFParser,
   VlmResponseValidator
 });