@heripo/pdf-parser 0.1.13 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +182 -21
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +16 -1
- package/dist/index.d.ts +16 -1
- package/dist/index.js +181 -21
- package/dist/index.js.map +1 -1
- package/package.json +7 -7
package/dist/index.cjs
CHANGED
|
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
31
31
|
var src_exports = {};
|
|
32
32
|
__export(src_exports, {
|
|
33
33
|
ImagePdfFallbackError: () => ImagePdfFallbackError,
|
|
34
|
+
InvalidDocumentTypeError: () => InvalidDocumentTypeError,
|
|
34
35
|
PDFParser: () => PDFParser,
|
|
35
36
|
VlmResponseValidator: () => VlmResponseValidator
|
|
36
37
|
});
|
|
@@ -1508,6 +1509,32 @@ var PdfTextExtractor = class {
|
|
|
1508
1509
|
}
|
|
1509
1510
|
return result.stdout;
|
|
1510
1511
|
}
|
|
1512
|
+
/**
|
|
1513
|
+
* Extract text from a range of PDF pages using a single pdftotext invocation.
|
|
1514
|
+
* Returns empty string on failure (logged as warning).
|
|
1515
|
+
*
|
|
1516
|
+
* @param pdfPath - Absolute path to the source PDF file
|
|
1517
|
+
* @param firstPage - First page number (1-based)
|
|
1518
|
+
* @param lastPage - Last page number (1-based, inclusive)
|
|
1519
|
+
*/
|
|
1520
|
+
async extractPageRange(pdfPath, firstPage, lastPage) {
|
|
1521
|
+
const result = await spawnAsync("pdftotext", [
|
|
1522
|
+
"-f",
|
|
1523
|
+
firstPage.toString(),
|
|
1524
|
+
"-l",
|
|
1525
|
+
lastPage.toString(),
|
|
1526
|
+
"-layout",
|
|
1527
|
+
pdfPath,
|
|
1528
|
+
"-"
|
|
1529
|
+
]);
|
|
1530
|
+
if (result.code !== 0) {
|
|
1531
|
+
this.logger.warn(
|
|
1532
|
+
`[PdfTextExtractor] pdftotext failed for pages ${firstPage}-${lastPage}: ${result.stderr || "Unknown error"}`
|
|
1533
|
+
);
|
|
1534
|
+
return "";
|
|
1535
|
+
}
|
|
1536
|
+
return result.stdout;
|
|
1537
|
+
}
|
|
1511
1538
|
/**
|
|
1512
1539
|
* Extract text from a single PDF page using pdftotext.
|
|
1513
1540
|
* Returns empty string on failure (logged as warning).
|
|
@@ -2070,8 +2097,9 @@ Note: Hanja are Chinese characters used in Korean documents, different from mode
|
|
|
2070
2097
|
|
|
2071
2098
|
Answer whether any Hanja characters are present on this page.
|
|
2072
2099
|
|
|
2073
|
-
Also identify all languages present on this page. Return an array of
|
|
2074
|
-
|
|
2100
|
+
Also identify all languages present on this page. Return an array of ocrmac-compatible language tags ordered by prevalence (primary language first).
|
|
2101
|
+
Supported tags: ar-SA, ars-SA, cs-CZ, da-DK, de-DE, en-US, es-ES, fr-FR, id-ID, it-IT, ja-JP, ko-KR, ms-MY, nb-NO, nl-NL, nn-NO, no-NO, pl-PL, pt-BR, ro-RO, ru-RU, sv-SE, th-TH, tr-TR, uk-UA, vi-VT, yue-Hans, yue-Hant, zh-Hans, zh-Hant.
|
|
2102
|
+
Examples: ["ko-KR", "en-US"], ["ja-JP"], ["zh-Hant", "en-US"]`;
|
|
2075
2103
|
var OcrStrategySampler = class {
|
|
2076
2104
|
logger;
|
|
2077
2105
|
pageRenderer;
|
|
@@ -2369,6 +2397,119 @@ var LocalFileServer = class {
|
|
|
2369
2397
|
}
|
|
2370
2398
|
};
|
|
2371
2399
|
|
|
2400
|
+
// src/utils/task-failure-details.ts
|
|
2401
|
+
// Defaults used when the caller does not override the retry behaviour.
var MAX_RESULT_RETRIES = 3;
var RESULT_RETRY_DELAY_MS = 2e3;
/**
 * Build a short, human-readable description of why a task failed.
 *
 * Calls `task.getResult()` and, when it throws, waits and tries again up to
 * `maxRetries` times in total. Intermediate failures are logged with
 * `logger.warn`; the final failure is logged with `logger.error`.
 *
 * @param task - Object exposing an async `getResult()` method
 * @param logger - Object exposing `warn` and `error` methods
 * @param logPrefix - Text prepended to every log message (e.g. "[PDFConverter]")
 * @param retryOptions - Optional overrides; omit to keep the default behaviour
 * @param retryOptions.maxRetries - Total number of `getResult()` attempts (default 3)
 * @param retryOptions.retryDelayMs - Pause before each retry in milliseconds (default 2000)
 * @returns The joined error messages of the result, `status: <status>` when the
 *   result carries no errors, or "unable to retrieve error details" when every
 *   attempt threw (or no attempt was allowed)
 */
async function getTaskFailureDetails(task, logger, logPrefix, retryOptions) {
  const {
    maxRetries = MAX_RESULT_RETRIES,
    retryDelayMs = RESULT_RETRY_DELAY_MS
  } = retryOptions ?? {};
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      // Only pause before retries, never before the first attempt.
      if (attempt > 0) {
        await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
      }
      const result = await task.getResult();
      if (result.errors?.length) {
        return result.errors.map((e) => e.message).join("; ");
      }
      return `status: ${result.status ?? "unknown"}`;
    } catch (err) {
      if (attempt === maxRetries - 1) {
        logger.error(
          `${logPrefix} Failed to retrieve task result after ${maxRetries} attempts:`,
          err
        );
        return "unable to retrieve error details";
      }
      logger.warn(
        `${logPrefix} Result not available yet, retrying (${attempt + 1}/${maxRetries})...`
      );
    }
  }
  // Reached only when maxRetries is zero or negative, i.e. no attempt was made.
  return "unable to retrieve error details";
}
|
|
2429
|
+
|
|
2430
|
+
// src/validators/document-type-validator.ts
|
|
2431
|
+
var import_zod = require("zod");
|
|
2432
|
+
|
|
2433
|
+
// src/errors/invalid-document-type-error.ts
|
|
2434
|
+
/**
 * Error raised when a PDF is judged not to be a Korean archaeological
 * investigation report.
 *
 * Instances expose `name`, a stable `code` string, and the `reason` that was
 * passed to the constructor (also embedded in `message`).
 */
var InvalidDocumentTypeError = class extends Error {
  name = "InvalidDocumentTypeError";
  code = "INVALID_DOCUMENT_TYPE";
  /**
   * @param reason - Explanation appended to the error message and stored on `reason`
   */
  constructor(reason) {
    const message = `The uploaded PDF does not appear to be a Korean archaeological investigation report. Reason: ${reason}`;
    super(message);
    this.reason = reason;
  }
};
|
|
2444
|
+
|
|
2445
|
+
// src/validators/document-type-validator.ts
|
|
2446
|
+
var SYSTEM_PROMPT = `You are given text extracted from the first and last pages of a PDF document.
|
|
2447
|
+
Determine if this document is a Korean archaeological investigation report (\uACE0\uACE0\uD559 \uC870\uC0AC \uBCF4\uACE0\uC11C).
|
|
2448
|
+
|
|
2449
|
+
Valid types include:
|
|
2450
|
+
- \uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (excavation investigation report)
|
|
2451
|
+
- \uC2DC\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (trial excavation report)
|
|
2452
|
+
- \uC9C0\uD45C\uC870\uC0AC\uBCF4\uACE0\uC11C (surface survey report)
|
|
2453
|
+
- \uC815\uBC00\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (detailed excavation report)
|
|
2454
|
+
- \uC218\uC911\uBC1C\uAD74\uC870\uC0AC\uBCF4\uACE0\uC11C (underwater excavation report)
|
|
2455
|
+
|
|
2456
|
+
NOT valid (these are NOT archaeological investigation reports):
|
|
2457
|
+
- \uC218\uB9AC\uBCF4\uACE0\uC11C (repair/restoration report)
|
|
2458
|
+
- \uB2E8\uC21C \uC2E4\uCE21 \uBCF4\uACE0\uC11C (simple measurement report)
|
|
2459
|
+
- \uAC74\uCD95\uC870\uC0AC\uBCF4\uACE0\uC11C (architectural investigation report)
|
|
2460
|
+
- \uD559\uC220\uC870\uC0AC\uBCF4\uACE0\uC11C (academic research report)
|
|
2461
|
+
- \uD658\uACBD\uC601\uD5A5\uD3C9\uAC00 (environmental impact assessment)
|
|
2462
|
+
- General academic papers or textbooks about archaeology`;
|
|
2463
|
+
var documentTypeSchema = import_zod.z.object({
|
|
2464
|
+
isValid: import_zod.z.boolean().describe("Whether this is a Korean archaeological investigation report"),
|
|
2465
|
+
reason: import_zod.z.string().describe("Brief reason for the decision")
|
|
2466
|
+
});
|
|
2467
|
+
/**
 * Checks, via an LLM call, whether a PDF is a Korean archaeological
 * investigation report, based on text sampled from its first and last pages.
 */
var DocumentTypeValidator = class {
  textExtractor;
  /**
   * @param textExtractor - Object providing async `getPageCount(pdfPath)` and
   *   `extractPageRange(pdfPath, firstPage, lastPage)`
   */
  constructor(textExtractor) {
    this.textExtractor = textExtractor;
  }
  /**
   * Validate that the PDF at the given path is an archaeological investigation report.
   *
   * Text is sampled from the first pages and from the last pages of the
   * document (up to 10 each, never overlapping). Validation is silently
   * skipped when the PDF has zero pages or when no text could be extracted.
   *
   * @param pdfPath - Path of the PDF handed to the text extractor
   * @param model - Model passed to LLMCaller as `primaryModel`
   * @param options - Optional settings; `options.abortSignal` is forwarded to the LLM call
   * @throws {InvalidDocumentTypeError} if the document is not a valid report type
   */
  async validate(pdfPath, model, options) {
    // Maximum number of pages sampled from each end of the document.
    const SAMPLE_PAGE_COUNT = 10;
    const totalPages = await this.textExtractor.getPageCount(pdfPath);
    if (totalPages === 0) return;
    const frontLastPage = Math.min(SAMPLE_PAGE_COUNT, totalPages);
    const frontText = await this.textExtractor.extractPageRange(
      pdfPath,
      1,
      frontLastPage
    );
    let backText = "";
    if (totalPages > frontLastPage) {
      // Start right after the front sample when the two windows would
      // otherwise overlap, so mid-sized documents still contribute their tail.
      const backFirstPage = Math.max(
        frontLastPage + 1,
        totalPages - SAMPLE_PAGE_COUNT + 1
      );
      backText = await this.textExtractor.extractPageRange(
        pdfPath,
        backFirstPage,
        totalPages
      );
    }
    const combinedText = (frontText + "\n" + backText).trim();
    // Nothing to judge (e.g. no extractable text): skip rather than reject.
    if (combinedText.length === 0) return;
    const result = await LLMCaller.call({
      schema: documentTypeSchema,
      systemPrompt: SYSTEM_PROMPT,
      userPrompt: `--- Document text (first and last pages) ---
${combinedText}`,
      primaryModel: model,
      maxRetries: 2,
      temperature: 0,
      abortSignal: options?.abortSignal,
      component: "DocumentTypeValidator",
      phase: "validation"
    });
    if (!result.output.isValid) {
      throw new InvalidDocumentTypeError(result.output.reason);
    }
  }
};
|
|
2512
|
+
|
|
2372
2513
|
// src/core/chunked-pdf-converter.ts
|
|
2373
2514
|
var import_node_fs7 = require("fs");
|
|
2374
2515
|
var import_promises4 = require("fs/promises");
|
|
@@ -2714,14 +2855,15 @@ var ChunkedPDFConverter = class {
|
|
|
2714
2855
|
const status = await task.poll();
|
|
2715
2856
|
if (status.task_status === "success") return;
|
|
2716
2857
|
if (status.task_status === "failure") {
|
|
2717
|
-
|
|
2718
|
-
|
|
2719
|
-
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
|
|
2723
|
-
|
|
2724
|
-
|
|
2858
|
+
const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
|
|
2859
|
+
this.logger.error(
|
|
2860
|
+
`[ChunkedPDFConverter] Task ${task.taskId} failed after ${elapsed}s`
|
|
2861
|
+
);
|
|
2862
|
+
const details = await getTaskFailureDetails(
|
|
2863
|
+
task,
|
|
2864
|
+
this.logger,
|
|
2865
|
+
"[ChunkedPDFConverter]"
|
|
2866
|
+
);
|
|
2725
2867
|
throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
|
|
2726
2868
|
}
|
|
2727
2869
|
await new Promise(
|
|
@@ -2986,8 +3128,27 @@ var PDFConverter = class {
|
|
|
2986
3128
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
2987
3129
|
this.timeout = timeout;
|
|
2988
3130
|
}
|
|
3131
|
+
documentTypeValidated = false;
|
|
3132
|
+
/**
|
|
3133
|
+
* Validate that the PDF is a Korean archaeological investigation report.
|
|
3134
|
+
* Skipped when no documentValidationModel is configured or for non-local URLs.
|
|
3135
|
+
* Only runs once per converter instance (flag prevents duplicate checks on recursive calls).
|
|
3136
|
+
*/
|
|
3137
|
+
async validateDocumentType(url, options, abortSignal) {
|
|
3138
|
+
if (this.documentTypeValidated) return;
|
|
3139
|
+
this.documentTypeValidated = true;
|
|
3140
|
+
if (!options.documentValidationModel) return;
|
|
3141
|
+
const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
|
|
3142
|
+
if (!pdfPath) return;
|
|
3143
|
+
const textExtractor = new PdfTextExtractor(this.logger);
|
|
3144
|
+
const validator = new DocumentTypeValidator(textExtractor);
|
|
3145
|
+
await validator.validate(pdfPath, options.documentValidationModel, {
|
|
3146
|
+
abortSignal
|
|
3147
|
+
});
|
|
3148
|
+
}
|
|
2989
3149
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
2990
3150
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
3151
|
+
await this.validateDocumentType(url, options, abortSignal);
|
|
2991
3152
|
if (options.chunkedConversion && url.startsWith("file://")) {
|
|
2992
3153
|
const chunked = new ChunkedPDFConverter(
|
|
2993
3154
|
this.logger,
|
|
@@ -3042,6 +3203,7 @@ var PDFConverter = class {
|
|
|
3042
3203
|
const aggregator = options.aggregator ?? new LLMTokenUsageAggregator();
|
|
3043
3204
|
const trackedOptions = { ...options, aggregator };
|
|
3044
3205
|
const pdfPath = url.startsWith("file://") ? url.slice(7) : null;
|
|
3206
|
+
await this.validateDocumentType(url, trackedOptions, abortSignal);
|
|
3045
3207
|
const strategy = await this.determineStrategy(
|
|
3046
3208
|
pdfPath,
|
|
3047
3209
|
reportId,
|
|
@@ -3358,6 +3520,7 @@ var PDFConverter = class {
|
|
|
3358
3520
|
return {
|
|
3359
3521
|
...(0, import_es_toolkit.omit)(options, [
|
|
3360
3522
|
"num_threads",
|
|
3523
|
+
"document_timeout",
|
|
3361
3524
|
"forceImagePdf",
|
|
3362
3525
|
"strategySamplerModel",
|
|
3363
3526
|
"vlmProcessorModel",
|
|
@@ -3367,7 +3530,8 @@ var PDFConverter = class {
|
|
|
3367
3530
|
"onTokenUsage",
|
|
3368
3531
|
"chunkedConversion",
|
|
3369
3532
|
"chunkSize",
|
|
3370
|
-
"chunkMaxRetries"
|
|
3533
|
+
"chunkMaxRetries",
|
|
3534
|
+
"documentValidationModel"
|
|
3371
3535
|
]),
|
|
3372
3536
|
to_formats: ["json", "html"],
|
|
3373
3537
|
image_export_mode: "embedded",
|
|
@@ -3379,6 +3543,8 @@ var PDFConverter = class {
|
|
|
3379
3543
|
framework: "livetext"
|
|
3380
3544
|
},
|
|
3381
3545
|
generate_picture_images: true,
|
|
3546
|
+
do_picture_classification: true,
|
|
3547
|
+
do_picture_description: true,
|
|
3382
3548
|
generate_page_images: false,
|
|
3383
3549
|
// Page images are rendered by PageRenderer (ImageMagick) after conversion
|
|
3384
3550
|
images_scale: 2,
|
|
@@ -3393,6 +3559,9 @@ var PDFConverter = class {
|
|
|
3393
3559
|
accelerator_options: {
|
|
3394
3560
|
device: "mps",
|
|
3395
3561
|
num_threads: options.num_threads
|
|
3562
|
+
},
|
|
3563
|
+
...options.document_timeout !== void 0 && {
|
|
3564
|
+
document_timeout: options.document_timeout
|
|
3396
3565
|
}
|
|
3397
3566
|
};
|
|
3398
3567
|
}
|
|
@@ -3479,16 +3648,7 @@ var PDFConverter = class {
|
|
|
3479
3648
|
* Fetch detailed error information from a failed task result.
|
|
3480
3649
|
*/
|
|
3481
3650
|
async getTaskFailureDetails(task) {
|
|
3482
|
-
|
|
3483
|
-
const result = await task.getResult();
|
|
3484
|
-
if (result.errors?.length) {
|
|
3485
|
-
return result.errors.map((e) => e.message).join("; ");
|
|
3486
|
-
}
|
|
3487
|
-
return `status: ${result.status ?? "unknown"}`;
|
|
3488
|
-
} catch (err) {
|
|
3489
|
-
this.logger.error("[PDFConverter] Failed to retrieve task result:", err);
|
|
3490
|
-
return "unable to retrieve error details";
|
|
3491
|
-
}
|
|
3651
|
+
return getTaskFailureDetails(task, this.logger, "[PDFConverter]");
|
|
3492
3652
|
}
|
|
3493
3653
|
async downloadResult(taskId) {
|
|
3494
3654
|
this.logger.info(
|
|
@@ -4073,6 +4233,7 @@ var VlmResponseValidator = class {
|
|
|
4073
4233
|
// Annotate the CommonJS export names for ESM import in node:
|
|
4074
4234
|
0 && (module.exports = {
|
|
4075
4235
|
ImagePdfFallbackError,
|
|
4236
|
+
InvalidDocumentTypeError,
|
|
4076
4237
|
PDFParser,
|
|
4077
4238
|
VlmResponseValidator
|
|
4078
4239
|
});
|