@heripo/pdf-parser 0.1.14 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +133 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +14 -1
- package/dist/index.d.ts +14 -1
- package/dist/index.js +132 -1
- package/dist/index.js.map +1 -1
- package/package.json +7 -7
package/dist/index.d.cts
CHANGED
|
@@ -41,6 +41,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
|
|
|
41
41
|
chunkSize?: number;
|
|
42
42
|
/** Max retry attempts per failed chunk (default: CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES) */
|
|
43
43
|
chunkMaxRetries?: number;
|
|
44
|
+
/** LLM model for document type validation (opt-in: skipped when not set) */
|
|
45
|
+
documentValidationModel?: LanguageModel;
|
|
44
46
|
};
|
|
45
47
|
/** Result of strategy-based conversion */
|
|
46
48
|
interface ConvertWithStrategyResult {
|
|
@@ -179,6 +181,17 @@ declare class ImagePdfFallbackError extends Error {
|
|
|
179
181
|
constructor(originalError: Error, fallbackError: Error);
|
|
180
182
|
}
|
|
181
183
|
|
|
184
|
+
/**
 * Raised when a PDF submitted for conversion is judged not to be
 * a Korean archaeological investigation report.
 */
declare class InvalidDocumentTypeError extends Error {
    /** Always the literal `"InvalidDocumentTypeError"`. */
    readonly name: "InvalidDocumentTypeError";
    /** Always the literal `"INVALID_DOCUMENT_TYPE"`; lets callers match on a code instead of the class. */
    readonly code: "INVALID_DOCUMENT_TYPE";
    /** Explanation of why the document was rejected, as passed to the constructor. */
    readonly reason: string;
    /**
     * @param reason - Explanation of why the document was rejected.
     */
    constructor(reason: string);
}
|
|
194
|
+
|
|
182
195
|
/**
|
|
183
196
|
* Intermediate format produced by VLM page-by-page processing.
|
|
184
197
|
* Intentionally kept simple so VLM prompts stay short and accurate.
|
|
@@ -287,4 +300,4 @@ declare class VlmResponseValidator {
|
|
|
287
300
|
private static detectRepetitivePattern;
|
|
288
301
|
}
|
|
289
302
|
|
|
290
|
-
export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };
|
|
303
|
+
export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, InvalidDocumentTypeError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -41,6 +41,8 @@ type PDFConvertOptions = Omit<ConversionOptions, 'to_formats' | 'image_export_mo
|
|
|
41
41
|
chunkSize?: number;
|
|
42
42
|
/** Max retry attempts per failed chunk (default: CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES) */
|
|
43
43
|
chunkMaxRetries?: number;
|
|
44
|
+
/** LLM model for document type validation (opt-in: skipped when not set) */
|
|
45
|
+
documentValidationModel?: LanguageModel;
|
|
44
46
|
};
|
|
45
47
|
/** Result of strategy-based conversion */
|
|
46
48
|
interface ConvertWithStrategyResult {
|
|
@@ -179,6 +181,17 @@ declare class ImagePdfFallbackError extends Error {
|
|
|
179
181
|
constructor(originalError: Error, fallbackError: Error);
|
|
180
182
|
}
|
|
181
183
|
|
|
184
|
+
/**
 * Raised when a PDF submitted for conversion is judged not to be
 * a Korean archaeological investigation report.
 */
declare class InvalidDocumentTypeError extends Error {
    /** Always the literal `"InvalidDocumentTypeError"`. */
    readonly name: "InvalidDocumentTypeError";
    /** Always the literal `"INVALID_DOCUMENT_TYPE"`; lets callers match on a code instead of the class. */
    readonly code: "INVALID_DOCUMENT_TYPE";
    /** Explanation of why the document was rejected, as passed to the constructor. */
    readonly reason: string;
    /**
     * @param reason - Explanation of why the document was rejected.
     */
    constructor(reason: string);
}
|
|
194
|
+
|
|
182
195
|
/**
|
|
183
196
|
* Intermediate format produced by VLM page-by-page processing.
|
|
184
197
|
* Intentionally kept simple so VLM prompts stay short and accurate.
|
|
@@ -287,4 +300,4 @@ declare class VlmResponseValidator {
|
|
|
287
300
|
private static detectRepetitivePattern;
|
|
288
301
|
}
|
|
289
302
|
|
|
290
|
-
export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };
|
|
303
|
+
export { type ConversionCompleteCallback, type ConvertWithStrategyResult, ImagePdfFallbackError, InvalidDocumentTypeError, type PDFConvertOptions, PDFParser, type VlmPageQuality, type VlmQualityIssue, type VlmQualityIssueType, VlmResponseValidator, type VlmValidationResult };
|
package/dist/index.js
CHANGED
|
@@ -1484,6 +1484,32 @@ var PdfTextExtractor = class {
|
|
|
1484
1484
|
}
|
|
1485
1485
|
return result.stdout;
|
|
1486
1486
|
}
|
|
1487
|
+
/**
|
|
1488
|
+
* Extract text from a range of PDF pages using a single pdftotext invocation.
|
|
1489
|
+
* Returns empty string on failure (logged as warning).
|
|
1490
|
+
*
|
|
1491
|
+
* @param pdfPath - Absolute path to the source PDF file
|
|
1492
|
+
* @param firstPage - First page number (1-based)
|
|
1493
|
+
* @param lastPage - Last page number (1-based, inclusive)
|
|
1494
|
+
*/
|
|
1495
|
+
async extractPageRange(pdfPath, firstPage, lastPage) {
|
|
1496
|
+
const result = await spawnAsync("pdftotext", [
|
|
1497
|
+
"-f",
|
|
1498
|
+
firstPage.toString(),
|
|
1499
|
+
"-l",
|
|
1500
|
+
lastPage.toString(),
|
|
1501
|
+
"-layout",
|
|
1502
|
+
pdfPath,
|
|
1503
|
+
"-"
|
|
1504
|
+
]);
|
|
1505
|
+
if (result.code !== 0) {
|
|
1506
|
+
this.logger.warn(
|
|
1507
|
+
`[PdfTextExtractor] pdftotext failed for pages ${firstPage}-${lastPage}: ${result.stderr || "Unknown error"}`
|
|
1508
|
+
);
|
|
1509
|
+
return "";
|
|
1510
|
+
}
|
|
1511
|
+
return result.stdout;
|
|
1512
|
+
}
|
|
1487
1513
|
/**
|
|
1488
1514
|
* Extract text from a single PDF page using pdftotext.
|
|
1489
1515
|
* Returns empty string on failure (logged as warning).
|
|
@@ -2376,6 +2402,89 @@ async function getTaskFailureDetails(task, logger, logPrefix) {
|
|
|
2376
2402
|
return "unable to retrieve error details";
|
|
2377
2403
|
}
|
|
2378
2404
|
|
|
2405
|
+
// src/validators/document-type-validator.ts
|
|
2406
|
+
import { z as z3 } from "zod";
|
|
2407
|
+
|
|
2408
|
+
// src/errors/invalid-document-type-error.ts
|
|
2409
|
+
/**
 * Error raised when a PDF is judged not to be a Korean archaeological
 * investigation report. The human-readable message embeds the rejection reason;
 * the raw reason is also kept on the instance.
 */
var InvalidDocumentTypeError = class extends Error {
  /** Overrides the default `Error` name so logs and checks can identify this error. */
  name = "InvalidDocumentTypeError";
  /** Machine-readable code callers can match on. */
  code = "INVALID_DOCUMENT_TYPE";
  /** Explanation of why the document was rejected (assigned in the constructor). */
  reason;
  /**
   * @param reason - Explanation of why the document was rejected
   */
  constructor(reason) {
    super(`The uploaded PDF does not appear to be a Korean archaeological investigation report. Reason: ${reason}`);
    this.reason = reason;
  }
};
|
|
2419
|
+
|
|
2420
|
+
// src/validators/document-type-validator.ts
|
|
2421
|
+
/**
 * System prompt for the document-type check.
 *
 * Tells the model it receives text from the first and last pages of a PDF and
 * must decide whether the document is a Korean archaeological investigation
 * report, listing the accepted report types and the explicitly rejected ones.
 * Lines are joined with "\n"; the empty entries are the blank separator lines.
 */
var SYSTEM_PROMPT = [
  "You are given text extracted from the first and last pages of a PDF document.",
  "Determine if this document is a Korean archaeological investigation report (\uACE0\uACE0\uD559 \uC870\uC0AC \uBCF4\uACE0\uC11C).",
  "",
  "Valid types include:",
  "- \uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (excavation investigation report)",
  "- \uC2DC\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (trial excavation report)",
  "- \uC9C0\uD45C\uC870\uC0AC\uBCF4\uACE0\uC11C (surface survey report)",
  "- \uC815\uBC00\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (detailed excavation report)",
  "- \uC218\uC911\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (underwater excavation report)",
  "",
  "NOT valid (these are NOT archaeological investigation reports):",
  "- \uC218\uB9AC\uBCF4\uACE0\uC11C (repair/restoration report)",
  "- \uB2E8\uC21C \uC2E4\uCE21 \uBCF4\uACE0\uC11C (simple measurement report)",
  "- \uAC74\uCD95\uC870\uC0AC\uBCF4\uACE0\uC11C (architectural investigation report)",
  "- \uD559\uC220\uC870\uC0AC\uBCF4\uACE0\uC11C (academic research report)",
  "- \uD658\uACBD\uC601\uD5A5\uD3C9\uAC00 (environmental impact assessment)",
  "- General academic papers or textbooks about archaeology"
].join("\n");
|
|
2438
|
+
/**
 * Schema for the structured answer requested from the model:
 * a boolean verdict plus a short textual justification.
 */
var documentTypeSchema = (() => {
  const isValid = z3.boolean().describe("Whether this is a Korean archaeological investigation report");
  const reason = z3.string().describe("Brief reason for the decision");
  return z3.object({ isValid, reason });
})();
|
|
2442
|
+
/**
 * Uses an LLM to decide whether a PDF looks like a Korean archaeological
 * investigation report, based on text sampled from the document.
 */
var DocumentTypeValidator = class {
  /** Text extractor used for the page count and page-range text. */
  textExtractor;
  /**
   * @param textExtractor - Object providing `getPageCount` and `extractPageRange`
   */
  constructor(textExtractor) {
    this.textExtractor = textExtractor;
  }
  /**
   * Check that the PDF at `pdfPath` is an archaeological investigation report.
   *
   * Sampling: pages 1 through min(10, pageCount) are always read; the last ten
   * pages are read as well only when the document has more than 20 pages.
   * The check returns without error when the PDF reports zero pages or when
   * no text could be extracted, so such documents are not rejected.
   *
   * @param pdfPath - Path of the PDF to inspect
   * @param model - Model passed to the LLM call as `primaryModel`
   * @param options - Optional settings; only `abortSignal` is read
   * @throws {InvalidDocumentTypeError} if the model answers that the document is not a valid report type
   */
  async validate(pdfPath, model, options) {
    const extractor = this.textExtractor;
    const pageCount = await extractor.getPageCount(pdfPath);
    if (pageCount === 0) {
      return;
    }
    const head = await extractor.extractPageRange(
      pdfPath,
      1,
      Math.min(10, pageCount)
    );
    // The tail is sampled only for documents longer than 20 pages.
    const tail = pageCount > 20 ? await extractor.extractPageRange(
      pdfPath,
      Math.max(1, pageCount - 9),
      pageCount
    ) : "";
    const sample = `${head}\n${tail}`.trim();
    if (sample.length === 0) {
      return;
    }
    const { output } = await LLMCaller.call({
      schema: documentTypeSchema,
      systemPrompt: SYSTEM_PROMPT,
      userPrompt: `--- Document text (first and last pages) ---\n${sample}`,
      primaryModel: model,
      maxRetries: 2,
      temperature: 0,
      abortSignal: options?.abortSignal,
      component: "DocumentTypeValidator",
      phase: "validation"
    });
    if (output.isValid) {
      return;
    }
    throw new InvalidDocumentTypeError(output.reason);
  }
};
|
|
2487
|
+
|
|
2379
2488
|
// src/core/chunked-pdf-converter.ts
|
|
2380
2489
|
import {
|
|
2381
2490
|
copyFileSync,
|
|
@@ -3003,8 +3112,27 @@ var PDFConverter = class {
|
|
|
3003
3112
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
3004
3113
|
this.timeout = timeout;
|
|
3005
3114
|
}
|
|
3115
|
+
documentTypeValidated = false;
|
|
3116
|
+
/**
|
|
3117
|
+
* Validate that the PDF is a Korean archaeological investigation report.
|
|
3118
|
+
* Skipped when no documentValidationModel is configured or for non-local URLs.
|
|
3119
|
+
* Only runs once per converter instance (flag prevents duplicate checks on recursive calls).
|
|
3120
|
+
*/
|
|
3121
|
+
async validateDocumentType(url, options, abortSignal) {
|
|
3122
|
+
if (this.documentTypeValidated) return;
|
|
3123
|
+
this.documentTypeValidated = true;
|
|
3124
|
+
if (!options.documentValidationModel) return;
|
|
3125
|
+
const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
|
|
3126
|
+
if (!pdfPath) return;
|
|
3127
|
+
const textExtractor = new PdfTextExtractor(this.logger);
|
|
3128
|
+
const validator = new DocumentTypeValidator(textExtractor);
|
|
3129
|
+
await validator.validate(pdfPath, options.documentValidationModel, {
|
|
3130
|
+
abortSignal
|
|
3131
|
+
});
|
|
3132
|
+
}
|
|
3006
3133
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
3007
3134
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
3135
|
+
await this.validateDocumentType(url, options, abortSignal);
|
|
3008
3136
|
if (options.chunkedConversion && url.startsWith("file://")) {
|
|
3009
3137
|
const chunked = new ChunkedPDFConverter(
|
|
3010
3138
|
this.logger,
|
|
@@ -3059,6 +3187,7 @@ var PDFConverter = class {
|
|
|
3059
3187
|
const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
|
|
3060
3188
|
const trackedOptions = { ...options, aggregator };
|
|
3061
3189
|
const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
|
|
3190
|
+
await this.validateDocumentType(url, trackedOptions, abortSignal);
|
|
3062
3191
|
const strategy = await this.determineStrategy(
|
|
3063
3192
|
pdfPath,
|
|
3064
3193
|
reportId,
|
|
@@ -3385,7 +3514,8 @@ var PDFConverter = class {
|
|
|
3385
3514
|
"onTokenUsage",
|
|
3386
3515
|
"chunkedConversion",
|
|
3387
3516
|
"chunkSize",
|
|
3388
|
-
"chunkMaxRetries"
|
|
3517
|
+
"chunkMaxRetries",
|
|
3518
|
+
"documentValidationModel"
|
|
3389
3519
|
]),
|
|
3390
3520
|
to_formats: ["json", "html"],
|
|
3391
3521
|
image_export_mode: "embedded",
|
|
@@ -4086,6 +4216,7 @@ var VlmResponseValidator = class {
|
|
|
4086
4216
|
};
|
|
4087
4217
|
export {
|
|
4088
4218
|
ImagePdfFallbackError,
|
|
4219
|
+
InvalidDocumentTypeError,
|
|
4089
4220
|
PDFParser,
|
|
4090
4221
|
VlmResponseValidator
|
|
4091
4222
|
};
|