@heripo/pdf-parser 0.1.14 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
31
31
  var src_exports = {};
32
32
  __export(src_exports, {
33
33
  ImagePdfFallbackError: () => ImagePdfFallbackError,
34
+ InvalidDocumentTypeError: () => InvalidDocumentTypeError,
34
35
  PDFParser: () => PDFParser,
35
36
  VlmResponseValidator: () => VlmResponseValidator
36
37
  });
@@ -1508,6 +1509,32 @@ var PdfTextExtractor = class {
1508
1509
  }
1509
1510
  return result.stdout;
1510
1511
  }
1512
+ /**
1513
+ * Extract text from a range of PDF pages using a single pdftotext invocation.
1514
+ * Returns empty string on failure (logged as warning).
1515
+ *
1516
+ * @param pdfPath - Absolute path to the source PDF file
1517
+ * @param firstPage - First page number (1-based)
1518
+ * @param lastPage - Last page number (1-based, inclusive)
1519
+ */
1520
+ async extractPageRange(pdfPath, firstPage, lastPage) {
1521
+ const result = await spawnAsync("pdftotext", [
1522
+ "-f",
1523
+ firstPage.toString(),
1524
+ "-l",
1525
+ lastPage.toString(),
1526
+ "-layout",
1527
+ pdfPath,
1528
+ "-"
1529
+ ]);
1530
+ if (result.code !== 0) {
1531
+ this.logger.warn(
1532
+ `[PdfTextExtractor] pdftotext failed for pages ${firstPage}-${lastPage}: ${result.stderr || "Unknown error"}`
1533
+ );
1534
+ return "";
1535
+ }
1536
+ return result.stdout;
1537
+ }
1511
1538
  /**
1512
1539
  * Extract text from a single PDF page using pdftotext.
1513
1540
  * Returns empty string on failure (logged as warning).
@@ -2400,6 +2427,89 @@ async function getTaskFailureDetails(task, logger, logPrefix) {
2400
2427
  return "unable to retrieve error details";
2401
2428
  }
2402
2429
 
2430
+ // src/validators/document-type-validator.ts
2431
+ var import_zod = require("zod");
2432
+
2433
+ // src/errors/invalid-document-type-error.ts
2434
+ var InvalidDocumentTypeError = class extends Error {
2435
+ constructor(reason) {
2436
+ super(
2437
+ `The uploaded PDF does not appear to be a Korean archaeological investigation report. Reason: ${reason}`
2438
+ );
2439
+ this.reason = reason;
2440
+ }
2441
+ name = "InvalidDocumentTypeError";
2442
+ code = "INVALID_DOCUMENT_TYPE";
2443
+ };
2444
+
2445
+ // src/validators/document-type-validator.ts
2446
+ var SYSTEM_PROMPT = `You are given text extracted from the first and last pages of a PDF document.
2447
+ Determine if this document is a Korean archaeological investigation report (\uACE0\uACE0\uD559 \uC870\uC0AC \uBCF4\uACE0\uC11C).
2448
+
2449
+ Valid types include:
2450
+ - \uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (excavation investigation report)
2451
+ - \uC2DC\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (trial excavation report)
2452
+ - \uC9C0\uD45C\uC870\uC0AC\uBCF4\uACE0\uC11C (surface survey report)
2453
+ - \uC815\uBC00\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (detailed excavation report)
2454
+ - \uC218\uC911\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (underwater excavation report)
2455
+
2456
+ NOT valid (these are NOT archaeological investigation reports):
2457
+ - \uC218\uB9AC\uBCF4\uACE0\uC11C (repair/restoration report)
2458
+ - \uB2E8\uC21C \uC2E4\uCE21 \uBCF4\uACE0\uC11C (simple measurement report)
2459
+ - \uAC74\uCD95\uC870\uC0AC\uBCF4\uACE0\uC11C (architectural investigation report)
2460
+ - \uD559\uC220\uC870\uC0AC\uBCF4\uACE0\uC11C (academic research report)
2461
+ - \uD658\uACBD\uC601\uD5A5\uD3C9\uAC00 (environmental impact assessment)
2462
+ - General academic papers or textbooks about archaeology`;
2463
+ var documentTypeSchema = import_zod.z.object({
2464
+ isValid: import_zod.z.boolean().describe("Whether this is a Korean archaeological investigation report"),
2465
+ reason: import_zod.z.string().describe("Brief reason for the decision")
2466
+ });
2467
+ var DocumentTypeValidator = class {
2468
+ textExtractor;
2469
+ constructor(textExtractor) {
2470
+ this.textExtractor = textExtractor;
2471
+ }
2472
+ /**
2473
+ * Validate that the PDF at the given path is an archaeological investigation report.
2474
+ *
2475
+ * @throws {InvalidDocumentTypeError} if the document is not a valid report type
2476
+ */
2477
+ async validate(pdfPath, model, options) {
2478
+ const totalPages = await this.textExtractor.getPageCount(pdfPath);
2479
+ if (totalPages === 0) return;
2480
+ const frontText = await this.textExtractor.extractPageRange(
2481
+ pdfPath,
2482
+ 1,
2483
+ Math.min(10, totalPages)
2484
+ );
2485
+ let backText = "";
2486
+ if (totalPages > 20) {
2487
+ backText = await this.textExtractor.extractPageRange(
2488
+ pdfPath,
2489
+ Math.max(1, totalPages - 9),
2490
+ totalPages
2491
+ );
2492
+ }
2493
+ const combinedText = (frontText + "\n" + backText).trim();
2494
+ if (combinedText.length === 0) return;
2495
+ const result = await LLMCaller.call({
2496
+ schema: documentTypeSchema,
2497
+ systemPrompt: SYSTEM_PROMPT,
2498
+ userPrompt: `--- Document text (first and last pages) ---
2499
+ ${combinedText}`,
2500
+ primaryModel: model,
2501
+ maxRetries: 2,
2502
+ temperature: 0,
2503
+ abortSignal: options?.abortSignal,
2504
+ component: "DocumentTypeValidator",
2505
+ phase: "validation"
2506
+ });
2507
+ if (!result.output.isValid) {
2508
+ throw new InvalidDocumentTypeError(result.output.reason);
2509
+ }
2510
+ }
2511
+ };
2512
+
2403
2513
  // src/core/chunked-pdf-converter.ts
2404
2514
  var import_node_fs7 = require("fs");
2405
2515
  var import_promises4 = require("fs/promises");
@@ -3018,8 +3128,27 @@ var PDFConverter = class {
3018
3128
  this.enableImagePdfFallback = enableImagePdfFallback;
3019
3129
  this.timeout = timeout;
3020
3130
  }
3131
+ documentTypeValidated = false;
3132
+ /**
3133
+ * Validate that the PDF is a Korean archaeological investigation report.
3134
+ * Skipped when no documentValidationModel is configured or for non-local URLs.
3135
+ * Only runs once per converter instance (flag prevents duplicate checks on recursive calls).
3136
+ */
3137
+ async validateDocumentType(url, options, abortSignal) {
3138
+ if (this.documentTypeValidated) return;
3139
+ this.documentTypeValidated = true;
3140
+ if (!options.documentValidationModel) return;
3141
+ const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
3142
+ if (!pdfPath) return;
3143
+ const textExtractor = new PdfTextExtractor(this.logger);
3144
+ const validator = new DocumentTypeValidator(textExtractor);
3145
+ await validator.validate(pdfPath, options.documentValidationModel, {
3146
+ abortSignal
3147
+ });
3148
+ }
3021
3149
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
3022
3150
  this.logger.info("[PDFConverter] Converting:", url);
3151
+ await this.validateDocumentType(url, options, abortSignal);
3023
3152
  if (options.chunkedConversion && url.startsWith("file://")) {
3024
3153
  const chunked = new ChunkedPDFConverter(
3025
3154
  this.logger,
@@ -3074,6 +3203,7 @@ var PDFConverter = class {
3074
3203
  const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
3075
3204
  const trackedOptions = { ...options, aggregator };
3076
3205
  const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
3206
+ await this.validateDocumentType(url, trackedOptions, abortSignal);
3077
3207
  const strategy = await this.determineStrategy(
3078
3208
  pdfPath,
3079
3209
  reportId,
@@ -3400,7 +3530,8 @@ var PDFConverter = class {
3400
3530
  "onTokenUsage",
3401
3531
  "chunkedConversion",
3402
3532
  "chunkSize",
3403
- "chunkMaxRetries"
3533
+ "chunkMaxRetries",
3534
+ "documentValidationModel"
3404
3535
  ]),
3405
3536
  to_formats: ["json", "html"],
3406
3537
  image_export_mode: "embedded",
@@ -4102,6 +4233,7 @@ var VlmResponseValidator = class {
4102
4233
  // Annotate the CommonJS export names for ESM import in node:
4103
4234
  0 && (module.exports = {
4104
4235
  ImagePdfFallbackError,
4236
+ InvalidDocumentTypeError,
4105
4237
  PDFParser,
4106
4238
  VlmResponseValidator
4107
4239
  });