@heripo/pdf-parser 0.1.14 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +133 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +14 -1
- package/dist/index.d.ts +14 -1
- package/dist/index.js +132 -1
- package/dist/index.js.map +1 -1
- package/package.json +7 -7
package/dist/index.cjs
CHANGED
|
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
31
31
|
var src_exports = {};
|
|
32
32
|
__export(src_exports, {
|
|
33
33
|
ImagePdfFallbackError: () => ImagePdfFallbackError,
|
|
34
|
+
InvalidDocumentTypeError: () => InvalidDocumentTypeError,
|
|
34
35
|
PDFParser: () => PDFParser,
|
|
35
36
|
VlmResponseValidator: () => VlmResponseValidator
|
|
36
37
|
});
|
|
@@ -1508,6 +1509,32 @@ var PdfTextExtractor = class {
|
|
|
1508
1509
|
}
|
|
1509
1510
|
return result.stdout;
|
|
1510
1511
|
}
|
|
1512
|
+
/**
|
|
1513
|
+
* Extract text from a range of PDF pages using a single pdftotext invocation.
|
|
1514
|
+
* Returns empty string on failure (logged as warning).
|
|
1515
|
+
*
|
|
1516
|
+
* @param pdfPath - Absolute path to the source PDF file
|
|
1517
|
+
* @param firstPage - First page number (1-based)
|
|
1518
|
+
* @param lastPage - Last page number (1-based, inclusive)
|
|
1519
|
+
*/
|
|
1520
|
+
async extractPageRange(pdfPath, firstPage, lastPage) {
|
|
1521
|
+
const result = await spawnAsync("pdftotext", [
|
|
1522
|
+
"-f",
|
|
1523
|
+
firstPage.toString(),
|
|
1524
|
+
"-l",
|
|
1525
|
+
lastPage.toString(),
|
|
1526
|
+
"-layout",
|
|
1527
|
+
pdfPath,
|
|
1528
|
+
"-"
|
|
1529
|
+
]);
|
|
1530
|
+
if (result.code !== 0) {
|
|
1531
|
+
this.logger.warn(
|
|
1532
|
+
`[PdfTextExtractor] pdftotext failed for pages ${firstPage}-${lastPage}: ${result.stderr || "Unknown error"}`
|
|
1533
|
+
);
|
|
1534
|
+
return "";
|
|
1535
|
+
}
|
|
1536
|
+
return result.stdout;
|
|
1537
|
+
}
|
|
1511
1538
|
/**
|
|
1512
1539
|
* Extract text from a single PDF page using pdftotext.
|
|
1513
1540
|
* Returns empty string on failure (logged as warning).
|
|
@@ -2400,6 +2427,89 @@ async function getTaskFailureDetails(task, logger, logPrefix) {
|
|
|
2400
2427
|
return "unable to retrieve error details";
|
|
2401
2428
|
}
|
|
2402
2429
|
|
|
2430
|
+
// src/validators/document-type-validator.ts
|
|
2431
|
+
var import_zod = require("zod");
|
|
2432
|
+
|
|
2433
|
+
// src/errors/invalid-document-type-error.ts
|
|
2434
|
+
/**
 * Error thrown when a PDF is judged not to be a Korean archaeological
 * investigation report.
 *
 * Instances expose three own properties in addition to the standard Error ones:
 * `name` ("InvalidDocumentTypeError"), `code` ("INVALID_DOCUMENT_TYPE") and
 * `reason` (the text passed to the constructor).
 */
var InvalidDocumentTypeError = class extends Error {
  /**
   * @param reason - Explanation of why the document was rejected; appended to the message
   */
  constructor(reason) {
    const message = `The uploaded PDF does not appear to be a Korean archaeological investigation report. Reason: ${reason}`;
    super(message);
    this.name = "InvalidDocumentTypeError";
    this.code = "INVALID_DOCUMENT_TYPE";
    this.reason = reason;
  }
};
|
|
2444
|
+
|
|
2445
|
+
// src/validators/document-type-validator.ts
|
|
2446
|
+
// System prompt for the document-type check. It lists the report types that
// count as Korean archaeological investigation reports and the look-alikes
// that must be rejected. Lines are joined with "\n"; empty entries are the
// blank separator lines of the prompt.
var SYSTEM_PROMPT = [
  "You are given text extracted from the first and last pages of a PDF document.",
  "Determine if this document is a Korean archaeological investigation report (\uACE0\uACE0\uD559 \uC870\uC0AC \uBCF4\uACE0\uC11C).",
  "",
  "Valid types include:",
  "- \uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (excavation investigation report)",
  "- \uC2DC\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (trial excavation report)",
  "- \uC9C0\uD45C\uC870\uC0AC\uBCF4\uACE0\uC11C (surface survey report)",
  "- \uC815\uBC00\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (detailed excavation report)",
  "- \uC218\uC911\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (underwater excavation report)",
  "",
  "NOT valid (these are NOT archaeological investigation reports):",
  "- \uC218\uB9AC\uBCF4\uACE0\uC11C (repair/restoration report)",
  "- \uB2E8\uC21C \uC2E4\uCE21 \uBCF4\uACE0\uC11C (simple measurement report)",
  "- \uAC74\uCD95\uC870\uC0AC\uBCF4\uACE0\uC11C (architectural investigation report)",
  "- \uD559\uC220\uC870\uC0AC\uBCF4\uACE0\uC11C (academic research report)",
  "- \uD658\uACBD\uC601\uD5A5\uD3C9\uAC00 (environmental impact assessment)",
  "- General academic papers or textbooks about archaeology"
].join("\n");
|
|
2463
|
+
// Structured-output schema passed to the LLM call: a boolean verdict plus a
// short free-text justification.
var documentTypeSchema = (() => {
  const { z } = import_zod;
  const isValid = z.boolean().describe("Whether this is a Korean archaeological investigation report");
  const reason = z.string().describe("Brief reason for the decision");
  return z.object({ isValid, reason });
})();
|
|
2467
|
+
/**
 * Checks whether a PDF is a Korean archaeological investigation report by
 * sending text sampled from its opening and closing pages to an LLM.
 */
var DocumentTypeValidator = class {
  textExtractor;
  /**
   * @param textExtractor - Object providing `getPageCount(pdfPath)` and
   *   `extractPageRange(pdfPath, firstPage, lastPage)`
   */
  constructor(textExtractor) {
    this.textExtractor = textExtractor;
  }
  /**
   * Validate that the PDF at the given path is an archaeological investigation report.
   *
   * Up to 10 pages are sampled from each end of the document. The tail sample
   * always starts after the front sample, so no page is sent twice; documents
   * of 10 pages or fewer are covered by the front sample alone.
   *
   * Returns without calling the LLM when the page count is 0 or when no text
   * could be extracted.
   *
   * @param pdfPath - Path of the PDF, passed through to the text extractor
   * @param model - Model forwarded to the LLM call as `primaryModel`
   * @param options - Optional settings; `options.abortSignal` is forwarded to the LLM call
   * @throws {InvalidDocumentTypeError} if the document is not a valid report type
   */
  async validate(pdfPath, model, options) {
    const totalPages = await this.textExtractor.getPageCount(pdfPath);
    if (totalPages === 0) return;
    const sampleSize = 10;
    const frontLast = Math.min(sampleSize, totalPages);
    // Begin the tail right after the front window. Previously the tail was only
    // read for documents longer than 20 pages, so 11-20 page documents never
    // contributed their closing pages.
    const backFirst = Math.max(frontLast + 1, totalPages - sampleSize + 1);
    // The two extractions are independent pdftotext runs, so await them together.
    const [frontText, backText] = await Promise.all([
      this.textExtractor.extractPageRange(pdfPath, 1, frontLast),
      backFirst <= totalPages ? this.textExtractor.extractPageRange(pdfPath, backFirst, totalPages) : ""
    ]);
    const combinedText = `${frontText}\n${backText}`.trim();
    // No extractable text: nothing to judge, so skip the LLM call.
    if (combinedText.length === 0) return;
    const result = await LLMCaller.call({
      schema: documentTypeSchema,
      systemPrompt: SYSTEM_PROMPT,
      userPrompt: `--- Document text (first and last pages) ---\n${combinedText}`,
      primaryModel: model,
      maxRetries: 2,
      temperature: 0,
      abortSignal: options?.abortSignal,
      component: "DocumentTypeValidator",
      phase: "validation"
    });
    if (!result.output.isValid) {
      throw new InvalidDocumentTypeError(result.output.reason);
    }
  }
};
|
|
2512
|
+
|
|
2403
2513
|
// src/core/chunked-pdf-converter.ts
|
|
2404
2514
|
var import_node_fs7 = require("fs");
|
|
2405
2515
|
var import_promises4 = require("fs/promises");
|
|
@@ -3018,8 +3128,27 @@ var PDFConverter = class {
|
|
|
3018
3128
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
3019
3129
|
this.timeout = timeout;
|
|
3020
3130
|
}
|
|
3131
|
+
documentTypeValidated = false;
|
|
3132
|
+
/**
|
|
3133
|
+
* Validate that the PDF is a Korean archaeological investigation report.
|
|
3134
|
+
* Skipped when no documentValidationModel is configured or for non-local URLs.
|
|
3135
|
+
* Only runs once per converter instance (flag prevents duplicate checks on recursive calls).
|
|
3136
|
+
*/
|
|
3137
|
+
async validateDocumentType(url, options, abortSignal) {
|
|
3138
|
+
if (this.documentTypeValidated) return;
|
|
3139
|
+
this.documentTypeValidated = true;
|
|
3140
|
+
if (!options.documentValidationModel) return;
|
|
3141
|
+
const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
|
|
3142
|
+
if (!pdfPath) return;
|
|
3143
|
+
const textExtractor = new PdfTextExtractor(this.logger);
|
|
3144
|
+
const validator = new DocumentTypeValidator(textExtractor);
|
|
3145
|
+
await validator.validate(pdfPath, options.documentValidationModel, {
|
|
3146
|
+
abortSignal
|
|
3147
|
+
});
|
|
3148
|
+
}
|
|
3021
3149
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
3022
3150
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
3151
|
+
await this.validateDocumentType(url, options, abortSignal);
|
|
3023
3152
|
if (options.chunkedConversion && url.startsWith("file://")) {
|
|
3024
3153
|
const chunked = new ChunkedPDFConverter(
|
|
3025
3154
|
this.logger,
|
|
@@ -3074,6 +3203,7 @@ var PDFConverter = class {
|
|
|
3074
3203
|
const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
|
|
3075
3204
|
const trackedOptions = { ...options, aggregator };
|
|
3076
3205
|
const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
|
|
3206
|
+
await this.validateDocumentType(url, trackedOptions, abortSignal);
|
|
3077
3207
|
const strategy = await this.determineStrategy(
|
|
3078
3208
|
pdfPath,
|
|
3079
3209
|
reportId,
|
|
@@ -3400,7 +3530,8 @@ var PDFConverter = class {
|
|
|
3400
3530
|
"onTokenUsage",
|
|
3401
3531
|
"chunkedConversion",
|
|
3402
3532
|
"chunkSize",
|
|
3403
|
-
"chunkMaxRetries"
|
|
3533
|
+
"chunkMaxRetries",
|
|
3534
|
+
"documentValidationModel"
|
|
3404
3535
|
]),
|
|
3405
3536
|
to_formats: ["json", "html"],
|
|
3406
3537
|
image_export_mode: "embedded",
|
|
@@ -4102,6 +4233,7 @@ var VlmResponseValidator = class {
|
|
|
4102
4233
|
// Annotate the CommonJS export names for ESM import in node:
|
|
4103
4234
|
0 && (module.exports = {
|
|
4104
4235
|
ImagePdfFallbackError,
|
|
4236
|
+
InvalidDocumentTypeError,
|
|
4105
4237
|
PDFParser,
|
|
4106
4238
|
VlmResponseValidator
|
|
4107
4239
|
});
|