@heripo/pdf-parser 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
31
31
  var src_exports = {};
32
32
  __export(src_exports, {
33
33
  ImagePdfFallbackError: () => ImagePdfFallbackError,
34
+ InvalidDocumentTypeError: () => InvalidDocumentTypeError,
34
35
  PDFParser: () => PDFParser,
35
36
  VlmResponseValidator: () => VlmResponseValidator
36
37
  });
@@ -1508,6 +1509,32 @@ var PdfTextExtractor = class {
1508
1509
  }
1509
1510
  return result.stdout;
1510
1511
  }
1512
+ /**
1513
+ * Extract text from a range of PDF pages using a single pdftotext invocation.
1514
+ * Returns empty string on failure (logged as warning).
1515
+ *
1516
+ * @param pdfPath - Absolute path to the source PDF file
1517
+ * @param firstPage - First page number (1-based)
1518
+ * @param lastPage - Last page number (1-based, inclusive)
1519
+ */
1520
+ async extractPageRange(pdfPath, firstPage, lastPage) {
1521
+ const result = await spawnAsync("pdftotext", [
1522
+ "-f",
1523
+ firstPage.toString(),
1524
+ "-l",
1525
+ lastPage.toString(),
1526
+ "-layout",
1527
+ pdfPath,
1528
+ "-"
1529
+ ]);
1530
+ if (result.code !== 0) {
1531
+ this.logger.warn(
1532
+ `[PdfTextExtractor] pdftotext failed for pages ${firstPage}-${lastPage}: ${result.stderr || "Unknown error"}`
1533
+ );
1534
+ return "";
1535
+ }
1536
+ return result.stdout;
1537
+ }
1511
1538
  /**
1512
1539
  * Extract text from a single PDF page using pdftotext.
1513
1540
  * Returns empty string on failure (logged as warning).
@@ -2400,6 +2427,94 @@ async function getTaskFailureDetails(task, logger, logPrefix) {
2400
2427
  return "unable to retrieve error details";
2401
2428
  }
2402
2429
 
2430
+ // src/validators/document-type-validator.ts
2431
+ var import_zod = require("zod");
2432
+
2433
+ // src/errors/invalid-document-type-error.ts
2434
/**
 * Thrown when document-type validation decides that a PDF is not an
 * archaeological investigation report.
 *
 * The message is deliberately country-neutral: the validator prompt accepts
 * investigation reports from any country and in any language, so the error
 * must not claim the document failed a "Korean report" check.
 *
 * Fields:
 * - name:   "InvalidDocumentTypeError"
 * - code:   "INVALID_DOCUMENT_TYPE" (stable identifier for programmatic checks)
 * - reason: the explanation passed to the constructor
 */
var InvalidDocumentTypeError = class extends Error {
  name = "InvalidDocumentTypeError";
  code = "INVALID_DOCUMENT_TYPE";
  /**
   * @param reason - Brief explanation of why the document was rejected
   */
  constructor(reason) {
    super(
      `The uploaded PDF does not appear to be an archaeological investigation report. Reason: ${reason}`
    );
    this.reason = reason;
  }
};
2444
+
2445
+ // src/validators/document-type-validator.ts
2446
// System prompt for the document-type check. It is kept as one entry per
// prompt line and joined with "\n" so each bullet is easy to diff and edit.
var SYSTEM_PROMPT = [
  "You are given text extracted from the first and last pages of a PDF document.",
  "Determine if this document is an archaeological investigation report from any country.",
  "",
  "Valid types include (in any language):",
  "- Excavation report (\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Trial excavation report (\uC2DC\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Surface survey report (\uC9C0\uD45C\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Detailed excavation report (\uC815\uBC00\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Underwater excavation report (\uC218\uC911\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Salvage excavation report",
  "- Archaeological assessment report",
  "- Any other archaeological fieldwork investigation report",
  "",
  "NOT valid (these are NOT archaeological investigation reports):",
  "- Repair/restoration reports (\uC218\uB9AC\uBCF4\uACE0\uC11C)",
  "- Simple measurement reports (\uB2E8\uC21C \uC2E4\uCE21 \uBCF4\uACE0\uC11C)",
  "- Architectural investigation reports (\uAC74\uCD95\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Academic research reports (\uD559\uC220\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Environmental impact assessments (\uD658\uACBD\uC601\uD5A5\uD3C9\uAC00)",
  "- General academic papers or textbooks about archaeology",
  "- Conservation/preservation reports",
  "- Museum catalogs or exhibition guides"
].join("\n");

// Structured-output schema for the check: a boolean verdict plus a short
// free-text justification.
var documentTypeSchema = import_zod.z.object({
  isValid: import_zod.z
    .boolean()
    .describe("Whether this is an archaeological investigation report"),
  reason: import_zod.z
    .string()
    .describe("Brief reason for the decision")
});
2472
/**
 * Checks whether a PDF is an archaeological investigation report by sending
 * text sampled from its first and last pages to an LLM.
 */
var DocumentTypeValidator = class {
  textExtractor;
  /**
   * @param textExtractor - Object providing getPageCount(pdfPath) and
   *   extractPageRange(pdfPath, firstPage, lastPage)
   */
  constructor(textExtractor) {
    this.textExtractor = textExtractor;
  }
  /**
   * Validate that the PDF at the given path is an archaeological investigation report.
   *
   * Up to 10 pages are sampled from the front and up to 10 from the back.
   * The back sample always starts after the front sample, so documents with
   * 11-20 pages still have their closing pages inspected and no page is sent
   * twice. Returns without calling the LLM when the page count is 0 or no
   * text could be extracted.
   *
   * @param pdfPath - Path to the PDF file
   * @param model - Model forwarded to LLMCaller as primaryModel
   * @param options - Optional settings; only options.abortSignal is read
   * @throws {InvalidDocumentTypeError} if the document is not a valid report type
   */
  async validate(pdfPath, model, options) {
    const samplePages = 10;
    const totalPages = await this.textExtractor.getPageCount(pdfPath);
    if (totalPages === 0) return;
    const frontLastPage = Math.min(samplePages, totalPages);
    const frontText = await this.textExtractor.extractPageRange(
      pdfPath,
      1,
      frontLastPage
    );
    // Start the tail sample right after the front sample at the earliest, so
    // the two ranges never overlap regardless of document length.
    const backFirstPage = Math.max(
      frontLastPage + 1,
      totalPages - samplePages + 1
    );
    let backText = "";
    if (backFirstPage <= totalPages) {
      backText = await this.textExtractor.extractPageRange(
        pdfPath,
        backFirstPage,
        totalPages
      );
    }
    const combinedText = `${frontText}\n${backText}`.trim();
    // Nothing to judge (e.g. extraction failed or the pages carry no text).
    if (combinedText.length === 0) return;
    const result = await LLMCaller.call({
      schema: documentTypeSchema,
      systemPrompt: SYSTEM_PROMPT,
      userPrompt: `--- Document text (first and last pages) ---\n${combinedText}`,
      primaryModel: model,
      maxRetries: 2,
      temperature: 0,
      abortSignal: options?.abortSignal,
      component: "DocumentTypeValidator",
      phase: "validation"
    });
    if (!result.output.isValid) {
      throw new InvalidDocumentTypeError(result.output.reason);
    }
  }
};
2517
+
2403
2518
  // src/core/chunked-pdf-converter.ts
2404
2519
  var import_node_fs7 = require("fs");
2405
2520
  var import_promises4 = require("fs/promises");
@@ -3018,8 +3133,27 @@ var PDFConverter = class {
3018
3133
  this.enableImagePdfFallback = enableImagePdfFallback;
3019
3134
  this.timeout = timeout;
3020
3135
  }
3136
+ documentTypeValidated = false;
3137
+ /**
3138
+ * Validate that the PDF is a Korean archaeological investigation report.
3139
+ * Skipped when no documentValidationModel is configured or for non-local URLs.
3140
+ * Only runs once per converter instance (flag prevents duplicate checks on recursive calls).
3141
+ */
3142
+ async validateDocumentType(url, options, abortSignal) {
3143
+ if (this.documentTypeValidated) return;
3144
+ this.documentTypeValidated = true;
3145
+ if (!options.documentValidationModel) return;
3146
+ const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
3147
+ if (!pdfPath) return;
3148
+ const textExtractor = new PdfTextExtractor(this.logger);
3149
+ const validator = new DocumentTypeValidator(textExtractor);
3150
+ await validator.validate(pdfPath, options.documentValidationModel, {
3151
+ abortSignal
3152
+ });
3153
+ }
3021
3154
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
3022
3155
  this.logger.info("[PDFConverter] Converting:", url);
3156
+ await this.validateDocumentType(url, options, abortSignal);
3023
3157
  if (options.chunkedConversion && url.startsWith("file://")) {
3024
3158
  const chunked = new ChunkedPDFConverter(
3025
3159
  this.logger,
@@ -3074,6 +3208,7 @@ var PDFConverter = class {
3074
3208
  const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
3075
3209
  const trackedOptions = { ...options, aggregator };
3076
3210
  const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
3211
+ await this.validateDocumentType(url, trackedOptions, abortSignal);
3077
3212
  const strategy = await this.determineStrategy(
3078
3213
  pdfPath,
3079
3214
  reportId,
@@ -3400,7 +3535,8 @@ var PDFConverter = class {
3400
3535
  "onTokenUsage",
3401
3536
  "chunkedConversion",
3402
3537
  "chunkSize",
3403
- "chunkMaxRetries"
3538
+ "chunkMaxRetries",
3539
+ "documentValidationModel"
3404
3540
  ]),
3405
3541
  to_formats: ["json", "html"],
3406
3542
  image_export_mode: "embedded",
@@ -4102,6 +4238,7 @@ var VlmResponseValidator = class {
4102
4238
  // Annotate the CommonJS export names for ESM import in node:
4103
4239
  0 && (module.exports = {
4104
4240
  ImagePdfFallbackError,
4241
+ InvalidDocumentTypeError,
4105
4242
  PDFParser,
4106
4243
  VlmResponseValidator
4107
4244
  });