@heripo/pdf-parser 0.1.14 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +138 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +14 -1
- package/dist/index.d.ts +14 -1
- package/dist/index.js +137 -1
- package/dist/index.js.map +1 -1
- package/package.json +8 -8
package/dist/index.cjs
CHANGED
|
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
31
31
|
var src_exports = {};
|
|
32
32
|
__export(src_exports, {
|
|
33
33
|
ImagePdfFallbackError: () => ImagePdfFallbackError,
|
|
34
|
+
InvalidDocumentTypeError: () => InvalidDocumentTypeError,
|
|
34
35
|
PDFParser: () => PDFParser,
|
|
35
36
|
VlmResponseValidator: () => VlmResponseValidator
|
|
36
37
|
});
|
|
@@ -1508,6 +1509,32 @@ var PdfTextExtractor = class {
|
|
|
1508
1509
|
}
|
|
1509
1510
|
return result.stdout;
|
|
1510
1511
|
}
|
|
1512
|
+
/**
|
|
1513
|
+
* Extract text from a range of PDF pages using a single pdftotext invocation.
|
|
1514
|
+
* Returns empty string on failure (logged as warning).
|
|
1515
|
+
*
|
|
1516
|
+
* @param pdfPath - Absolute path to the source PDF file
|
|
1517
|
+
* @param firstPage - First page number (1-based)
|
|
1518
|
+
* @param lastPage - Last page number (1-based, inclusive)
|
|
1519
|
+
*/
|
|
1520
|
+
async extractPageRange(pdfPath, firstPage, lastPage) {
|
|
1521
|
+
const result = await spawnAsync("pdftotext", [
|
|
1522
|
+
"-f",
|
|
1523
|
+
firstPage.toString(),
|
|
1524
|
+
"-l",
|
|
1525
|
+
lastPage.toString(),
|
|
1526
|
+
"-layout",
|
|
1527
|
+
pdfPath,
|
|
1528
|
+
"-"
|
|
1529
|
+
]);
|
|
1530
|
+
if (result.code !== 0) {
|
|
1531
|
+
this.logger.warn(
|
|
1532
|
+
`[PdfTextExtractor] pdftotext failed for pages ${firstPage}-${lastPage}: ${result.stderr || "Unknown error"}`
|
|
1533
|
+
);
|
|
1534
|
+
return "";
|
|
1535
|
+
}
|
|
1536
|
+
return result.stdout;
|
|
1537
|
+
}
|
|
1511
1538
|
/**
|
|
1512
1539
|
* Extract text from a single PDF page using pdftotext.
|
|
1513
1540
|
* Returns empty string on failure (logged as warning).
|
|
@@ -2400,6 +2427,94 @@ async function getTaskFailureDetails(task, logger, logPrefix) {
|
|
|
2400
2427
|
return "unable to retrieve error details";
|
|
2401
2428
|
}
|
|
2402
2429
|
|
|
2430
|
+
// src/validators/document-type-validator.ts
|
|
2431
|
+
var import_zod = require("zod");
|
|
2432
|
+
|
|
2433
|
+
// src/errors/invalid-document-type-error.ts
|
|
2434
|
+
/**
 * Error raised when document-type validation rejects a PDF.
 *
 * The validator's prompt accepts archaeological investigation reports from any
 * country and in any language, so the message deliberately does not claim the
 * document had to be a Korean report.
 *
 * Instance properties:
 * - `name`   - always "InvalidDocumentTypeError"
 * - `code`   - always "INVALID_DOCUMENT_TYPE"; stable value for programmatic checks
 * - `reason` - the reason string passed to the constructor
 */
var InvalidDocumentTypeError = class extends Error {
  name = "InvalidDocumentTypeError";
  code = "INVALID_DOCUMENT_TYPE";
  /**
   * @param reason - Short explanation of why the document was rejected
   */
  constructor(reason) {
    super(
      `The uploaded PDF does not appear to be an archaeological investigation report. Reason: ${reason}`
    );
    this.reason = reason;
  }
};
|
|
2444
|
+
|
|
2445
|
+
// src/validators/document-type-validator.ts
|
|
2446
|
+
// System prompt for the document-type check: tells the model which report
// kinds count as archaeological investigation reports and which do not.
// Each array entry is one line of the prompt; empty entries are blank lines.
var SYSTEM_PROMPT = [
  "You are given text extracted from the first and last pages of a PDF document.",
  "Determine if this document is an archaeological investigation report from any country.",
  "",
  "Valid types include (in any language):",
  "- Excavation report (\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Trial excavation report (\uC2DC\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Surface survey report (\uC9C0\uD45C\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Detailed excavation report (\uC815\uBC00\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Underwater excavation report (\uC218\uC911\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Salvage excavation report",
  "- Archaeological assessment report",
  "- Any other archaeological fieldwork investigation report",
  "",
  "NOT valid (these are NOT archaeological investigation reports):",
  "- Repair/restoration reports (\uC218\uB9AC\uBCF4\uACE0\uC11C)",
  "- Simple measurement reports (\uB2E8\uC21C \uC2E4\uCE21 \uBCF4\uACE0\uC11C)",
  "- Architectural investigation reports (\uAC74\uCD95\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Academic research reports (\uD559\uC220\uC870\uC0AC\uBCF4\uACE0\uC11C)",
  "- Environmental impact assessments (\uD658\uACBD\uC601\uD5A5\uD3C9\uAC00)",
  "- General academic papers or textbooks about archaeology",
  "- Conservation/preservation reports",
  "- Museum catalogs or exhibition guides"
].join("\n");
|
|
2468
|
+
// Shape of the structured verdict requested from the LLM: a boolean decision
// plus a short free-text justification.
var documentTypeSchema = (() => {
  const { z } = import_zod;
  return z.object({
    isValid: z.boolean().describe("Whether this is an archaeological investigation report"),
    reason: z.string().describe("Brief reason for the decision")
  });
})();
|
|
2472
|
+
/**
 * Asks an LLM whether a PDF is an archaeological investigation report, using
 * text sampled from the start and, for longer files, the end of the document.
 */
var DocumentTypeValidator = class {
  textExtractor;
  /**
   * @param textExtractor - Object exposing `getPageCount(pdfPath)` and
   *   `extractPageRange(pdfPath, firstPage, lastPage)`
   */
  constructor(textExtractor) {
    this.textExtractor = textExtractor;
  }
  /**
   * Validate that the PDF at the given path is an archaeological investigation report.
   *
   * Resolves without calling the LLM when the page count is 0 or when the
   * sampled text is empty after trimming, so such files are not rejected here.
   *
   * @param pdfPath - Path of the PDF handed to the text extractor
   * @param model - Passed to `LLMCaller.call` as `primaryModel`
   * @param options - Optional; only `abortSignal` is read and forwarded
   * @throws {InvalidDocumentTypeError} if the document is not a valid report type
   */
  async validate(pdfPath, model, options) {
    // Number of pages sampled from each end of the document.
    const SAMPLE_PAGES = 10;
    const totalPages = await this.textExtractor.getPageCount(pdfPath);
    if (totalPages === 0) return;
    // The tail is sampled only when the document has more than two windows'
    // worth of pages (i.e. more than 20), so the two windows never overlap.
    const samplesTail = totalPages > 2 * SAMPLE_PAGES;
    // The two extractions do not depend on each other, so run them concurrently.
    const [frontText, backText] = await Promise.all([
      this.textExtractor.extractPageRange(
        pdfPath,
        1,
        Math.min(SAMPLE_PAGES, totalPages)
      ),
      samplesTail
        ? this.textExtractor.extractPageRange(
            pdfPath,
            totalPages - SAMPLE_PAGES + 1,
            totalPages
          )
        : ""
    ]);
    const combinedText = `${frontText}\n${backText}`.trim();
    if (combinedText.length === 0) return;
    const result = await LLMCaller.call({
      schema: documentTypeSchema,
      systemPrompt: SYSTEM_PROMPT,
      userPrompt: `--- Document text (first and last pages) ---
${combinedText}`,
      primaryModel: model,
      maxRetries: 2,
      temperature: 0,
      abortSignal: options?.abortSignal,
      component: "DocumentTypeValidator",
      phase: "validation"
    });
    if (!result.output.isValid) {
      throw new InvalidDocumentTypeError(result.output.reason);
    }
  }
};
|
|
2517
|
+
|
|
2403
2518
|
// src/core/chunked-pdf-converter.ts
|
|
2404
2519
|
var import_node_fs7 = require("fs");
|
|
2405
2520
|
var import_promises4 = require("fs/promises");
|
|
@@ -3018,8 +3133,27 @@ var PDFConverter = class {
|
|
|
3018
3133
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
3019
3134
|
this.timeout = timeout;
|
|
3020
3135
|
}
|
|
3136
|
+
documentTypeValidated = false;
|
|
3137
|
+
/**
|
|
3138
|
+
* Validate that the PDF is a Korean archaeological investigation report.
|
|
3139
|
+
* Skipped when no documentValidationModel is configured or for non-local URLs.
|
|
3140
|
+
* Only runs once per converter instance (flag prevents duplicate checks on recursive calls).
|
|
3141
|
+
*/
|
|
3142
|
+
async validateDocumentType(url, options, abortSignal) {
|
|
3143
|
+
if (this.documentTypeValidated) return;
|
|
3144
|
+
this.documentTypeValidated = true;
|
|
3145
|
+
if (!options.documentValidationModel) return;
|
|
3146
|
+
const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
|
|
3147
|
+
if (!pdfPath) return;
|
|
3148
|
+
const textExtractor = new PdfTextExtractor(this.logger);
|
|
3149
|
+
const validator = new DocumentTypeValidator(textExtractor);
|
|
3150
|
+
await validator.validate(pdfPath, options.documentValidationModel, {
|
|
3151
|
+
abortSignal
|
|
3152
|
+
});
|
|
3153
|
+
}
|
|
3021
3154
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
3022
3155
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
3156
|
+
await this.validateDocumentType(url, options, abortSignal);
|
|
3023
3157
|
if (options.chunkedConversion && url.startsWith("file://")) {
|
|
3024
3158
|
const chunked = new ChunkedPDFConverter(
|
|
3025
3159
|
this.logger,
|
|
@@ -3074,6 +3208,7 @@ var PDFConverter = class {
|
|
|
3074
3208
|
const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
|
|
3075
3209
|
const trackedOptions = { ...options, aggregator };
|
|
3076
3210
|
const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
|
|
3211
|
+
await this.validateDocumentType(url, trackedOptions, abortSignal);
|
|
3077
3212
|
const strategy = await this.determineStrategy(
|
|
3078
3213
|
pdfPath,
|
|
3079
3214
|
reportId,
|
|
@@ -3400,7 +3535,8 @@ var PDFConverter = class {
|
|
|
3400
3535
|
"onTokenUsage",
|
|
3401
3536
|
"chunkedConversion",
|
|
3402
3537
|
"chunkSize",
|
|
3403
|
-
"chunkMaxRetries"
|
|
3538
|
+
"chunkMaxRetries",
|
|
3539
|
+
"documentValidationModel"
|
|
3404
3540
|
]),
|
|
3405
3541
|
to_formats: ["json", "html"],
|
|
3406
3542
|
image_export_mode: "embedded",
|
|
@@ -4102,6 +4238,7 @@ var VlmResponseValidator = class {
|
|
|
4102
4238
|
// Annotate the CommonJS export names for ESM import in node:
|
|
4103
4239
|
0 && (module.exports = {
|
|
4104
4240
|
ImagePdfFallbackError,
|
|
4241
|
+
InvalidDocumentTypeError,
|
|
4105
4242
|
PDFParser,
|
|
4106
4243
|
VlmResponseValidator
|
|
4107
4244
|
});
|