@heripo/pdf-parser 0.1.11 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +595 -42
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +604 -42
- package/dist/index.js.map +1 -1
- package/package.json +5 -5
package/dist/index.cjs
CHANGED
|
@@ -40,7 +40,7 @@ module.exports = __toCommonJS(src_exports);
|
|
|
40
40
|
var import_docling_sdk = require("docling-sdk");
|
|
41
41
|
var import_node_child_process3 = require("child_process");
|
|
42
42
|
var import_node_os2 = require("os");
|
|
43
|
-
var
|
|
43
|
+
var import_node_path9 = require("path");
|
|
44
44
|
|
|
45
45
|
// src/config/constants.ts
|
|
46
46
|
var PDF_PARSER = {
|
|
@@ -87,6 +87,12 @@ var PAGE_RENDERING = {
|
|
|
87
87
|
/** Low-resolution DPI for OCR strategy sampling */
|
|
88
88
|
SAMPLE_DPI: 150
|
|
89
89
|
};
|
|
90
|
+
// Defaults for chunked PDF conversion; consumed by PDFConverter when
// options.chunkedConversion is set (chunkSize / chunkMaxRetries fall back
// to these values when not provided by the caller).
var CHUNKED_CONVERSION = {
  /** Number of pages per chunk */
  DEFAULT_CHUNK_SIZE: 10,
  /** Maximum retry attempts per failed chunk */
  DEFAULT_MAX_RETRIES: 2
};
|
|
90
96
|
var IMAGE_PDF_CONVERTER = {
|
|
91
97
|
/**
|
|
92
98
|
* ImageMagick density option (DPI) for PDF to image conversion
|
|
@@ -875,10 +881,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
875
881
|
|
|
876
882
|
// src/core/pdf-converter.ts
|
|
877
883
|
var import_es_toolkit = require("es-toolkit");
|
|
878
|
-
var
|
|
879
|
-
var
|
|
880
|
-
var
|
|
881
|
-
var
|
|
884
|
+
var import_node_fs9 = require("fs");
|
|
885
|
+
var import_promises6 = require("fs/promises");
|
|
886
|
+
var import_node_path8 = require("path");
|
|
887
|
+
var import_promises7 = require("stream/promises");
|
|
882
888
|
|
|
883
889
|
// src/errors/image-pdf-fallback-error.ts
|
|
884
890
|
var ImagePdfFallbackError = class extends Error {
|
|
@@ -1774,7 +1780,8 @@ var VlmTextCorrector = class {
|
|
|
1774
1780
|
},
|
|
1775
1781
|
{
|
|
1776
1782
|
type: "image",
|
|
1777
|
-
image:
|
|
1783
|
+
image: imageBase64,
|
|
1784
|
+
mediaType: "image/png"
|
|
1778
1785
|
}
|
|
1779
1786
|
]
|
|
1780
1787
|
}
|
|
@@ -1986,7 +1993,7 @@ var VlmTextCorrector = class {
|
|
|
1986
1993
|
*/
|
|
1987
1994
|
readPageImage(outputDir, pageNo) {
|
|
1988
1995
|
const imagePath = (0, import_node_path4.join)(outputDir, "pages", `page_${pageNo - 1}.png`);
|
|
1989
|
-
return (0, import_node_fs4.readFileSync)(imagePath)
|
|
1996
|
+
return new Uint8Array((0, import_node_fs4.readFileSync)(imagePath));
|
|
1990
1997
|
}
|
|
1991
1998
|
/**
|
|
1992
1999
|
* Apply VLM corrections to the DoclingDocument.
|
|
@@ -2255,7 +2262,7 @@ var OcrStrategySampler = class {
|
|
|
2255
2262
|
this.logger.debug(
|
|
2256
2263
|
`[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
|
|
2257
2264
|
);
|
|
2258
|
-
const
|
|
2265
|
+
const imageData = new Uint8Array((0, import_node_fs5.readFileSync)(pageFile));
|
|
2259
2266
|
const messages = [
|
|
2260
2267
|
{
|
|
2261
2268
|
role: "user",
|
|
@@ -2263,7 +2270,8 @@ var OcrStrategySampler = class {
|
|
|
2263
2270
|
{ type: "text", text: KOREAN_HANJA_MIX_PROMPT },
|
|
2264
2271
|
{
|
|
2265
2272
|
type: "image",
|
|
2266
|
-
image:
|
|
2273
|
+
image: imageData,
|
|
2274
|
+
mediaType: "image/png"
|
|
2267
2275
|
}
|
|
2268
2276
|
]
|
|
2269
2277
|
}
|
|
@@ -2361,10 +2369,532 @@ var LocalFileServer = class {
|
|
|
2361
2369
|
}
|
|
2362
2370
|
};
|
|
2363
2371
|
|
|
2364
|
-
// src/core/
|
|
2372
|
+
// src/core/chunked-pdf-converter.ts
|
|
2365
2373
|
var import_node_fs7 = require("fs");
|
|
2366
|
-
var
|
|
2374
|
+
var import_promises4 = require("fs/promises");
|
|
2367
2375
|
var import_node_path6 = require("path");
|
|
2376
|
+
var import_promises5 = require("stream/promises");
|
|
2377
|
+
|
|
2378
|
+
// src/processors/docling-document-merger.ts
// Matches intra-document JSON refs like "#/texts/12"; captures: kind, index.
var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/;
// Matches picture image URIs like "images/pic_3.png"; captures: file index.
var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/;
var DoclingDocumentMerger = class {
  /**
   * Merge an array of DoclingDocuments into one.
   * The first chunk's metadata (schema_name, version, name, origin) is used as the base.
   *
   * @param chunks - Array of DoclingDocument objects to merge (must have at least 1)
   * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
   * When provided, picFileOffsets[i] is used for pic_ URI remapping instead of
   * the pictures array length, aligning URIs with relocated file indices.
   * @returns Merged DoclingDocument
   * @throws {Error} When chunks is empty.
   */
  merge(chunks, picFileOffsets) {
    if (chunks.length === 0) {
      throw new Error("Cannot merge zero chunks");
    }
    if (chunks.length === 1) {
      // Single chunk: nothing to remap. Returned as the same reference
      // (not a clone), preserving the established behavior.
      return chunks[0];
    }
    const base = structuredClone(chunks[0]);
    for (let i = 1; i < chunks.length; i++) {
      const chunk = chunks[i];
      // Offsets are recomputed from the current base sizes so each chunk's
      // local indices shift past everything merged so far.
      const offsets = {
        texts: base.texts.length,
        pictures: base.pictures.length,
        tables: base.tables.length,
        groups: base.groups.length
      };
      const picFileOffset = picFileOffsets ? picFileOffsets[i] : offsets.pictures;
      for (const text of chunk.texts) {
        base.texts.push(this.remapItem(text, offsets));
      }
      for (const picture of chunk.pictures) {
        const remapped = this.remapItem(picture, offsets);
        remapped.captions = this.remapRefList(remapped.captions, offsets);
        this.remapPictureImageUri(remapped, picFileOffset);
        base.pictures.push(remapped);
      }
      for (const table of chunk.tables) {
        const remapped = this.remapItem(table, offsets);
        remapped.captions = this.remapRefList(remapped.captions, offsets);
        remapped.footnotes = this.remapRefList(remapped.footnotes, offsets);
        base.tables.push(remapped);
      }
      for (const group of chunk.groups) {
        base.groups.push(this.remapItem(group, offsets));
      }
      for (const child of chunk.body.children) {
        base.body.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      for (const child of chunk.furniture.children) {
        base.furniture.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      // Page keys are page numbers unique across chunks, so a shallow merge
      // suffices; later chunks never overwrite earlier pages in practice.
      Object.assign(base.pages, chunk.pages);
    }
    return base;
  }
  /**
   * Deep-clone an item and remap its self_ref, parent ref, and children refs.
   * Shared by texts, pictures, tables, and groups; kind-specific ref lists
   * (captions, footnotes) are handled by the caller.
   */
  remapItem(item, offsets) {
    const remapped = structuredClone(item);
    remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
    if (remapped.parent) {
      remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
    }
    remapped.children = this.remapRefList(remapped.children, offsets);
    return remapped;
  }
  /** Remap every $ref in a list of `{ $ref }` objects, returning a new list. */
  remapRefList(refs, offsets) {
    return refs.map((r) => ({
      $ref: this.remapRef(r.$ref, offsets)
    }));
  }
  /**
   * Remap a $ref string by applying offsets.
   * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
   * Refs like "#/body" or "#/furniture" pass through unchanged.
   */
  remapRef(ref, offsets) {
    const match = REF_PATTERN.exec(ref);
    if (!match) {
      return ref;
    }
    const kind = match[1];
    const index = parseInt(match[2], 10);
    return `#/${kind}/${index + offsets[kind]}`;
  }
  /**
   * Remap image URI in a picture item (in place) by applying the pic file offset.
   * Transforms "images/pic_N.png" -> "images/pic_{N+offset}.png".
   * Pictures without an image URI are left untouched.
   */
  remapPictureImageUri(picture, picFileOffset) {
    const rec = picture;
    const image = rec.image;
    if (!image?.uri) return;
    const match = IMAGE_URI_PATTERN.exec(image.uri);
    if (match) {
      const index = parseInt(match[1], 10);
      image.uri = `images/pic_${index + picFileOffset}.png`;
    }
  }
};
|
|
2506
|
+
|
|
2507
|
+
// src/core/chunked-pdf-converter.ts
// Converts a large local PDF by splitting it into page-range chunks, running
// each chunk through the Docling client, then merging results with
// DoclingDocumentMerger. All intermediate state lives under
// output/{reportId}/_chunks and is removed when done.
var ChunkedPDFConverter = class {
  // logger:  structured logger (info/warn/error/debug used here)
  // client:  Docling client (convertSourceAsync / getTaskResultFile / getConfig)
  // config:  { chunkSize, maxRetries }
  // timeout: per-chunk task timeout in ms; defaults to PDF_CONVERTER.DEFAULT_TIMEOUT_MS
  constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
    this.logger = logger;
    this.client = client;
    this.config = config;
    this.timeout = timeout;
  }
  /**
   * Convert a local PDF in chunks.
   *
   * @param url - file:// URL to the source PDF
   * @param reportId - Unique report identifier for output directory naming
   * @param onComplete - Callback invoked with the final output directory
   * @param cleanupAfterCallback - Whether to clean up the output directory after callback
   * @param options - PDF conversion options (chunked-specific fields are stripped internally)
   * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
   * @param abortSignal - Optional abort signal for cancellation
   * @returns null — results are delivered via the onComplete callback, not the return value
   */
  async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
    // Strip the 7-char "file://" prefix; the caller only routes file:// URLs here.
    const pdfPath = url.slice(7);
    const cwd = process.cwd();
    const outputDir = (0, import_node_path6.join)(cwd, "output", reportId);
    const chunksBaseDir = (0, import_node_path6.join)(cwd, "output", reportId, "_chunks");
    const totalPages = await this.getPageCount(pdfPath);
    if (totalPages === 0) {
      throw new Error(
        "[ChunkedPDFConverter] Failed to detect page count from PDF"
      );
    }
    const chunks = this.calculateChunks(totalPages);
    this.logger.info(
      `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
    );
    // Serve the PDF over HTTP so the (possibly remote) Docling service can fetch it.
    const server = new LocalFileServer();
    const httpUrl = await server.start(pdfPath);
    this.logger.info(
      "[ChunkedPDFConverter] Started local file server:",
      httpUrl
    );
    const chunkDocuments = [];
    try {
      // Chunks are converted sequentially; abort is checked before each one.
      for (let i = 0; i < chunks.length; i++) {
        this.checkAbort(abortSignal);
        const [start, end] = chunks[i];
        const chunkDir = (0, import_node_path6.join)(chunksBaseDir, `_chunk_${i}`);
        (0, import_node_fs7.mkdirSync)(chunkDir, { recursive: true });
        const doc = await this.convertChunk(
          i,
          chunks.length,
          start,
          end,
          httpUrl,
          chunkDir,
          options,
          buildConversionOptions
        );
        chunkDocuments.push(doc);
      }
    } finally {
      // Always stop the file server, even on failure/abort mid-loop.
      this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
      await server.stop();
    }
    this.checkAbort(abortSignal);
    this.logger.info(
      `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
    );
    const merger = new DoclingDocumentMerger();
    // File-based offsets keep pic_ URIs aligned with relocateImages below.
    const picFileOffsets = this.buildPicFileOffsets(
      chunksBaseDir,
      chunks.length
    );
    const merged = merger.merge(chunkDocuments, picFileOffsets);
    this.logger.info(
      `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
    );
    (0, import_node_fs7.mkdirSync)(outputDir, { recursive: true });
    const imagesDir = (0, import_node_path6.join)(outputDir, "images");
    (0, import_node_fs7.mkdirSync)(imagesDir, { recursive: true });
    this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
    const resultPath = (0, import_node_path6.join)(outputDir, "result.json");
    (0, import_node_fs7.writeFileSync)(resultPath, JSON.stringify(merged));
    try {
      await this.renderPageImages(pdfPath, outputDir);
      // Must run after renderPageImages: page URIs now point to pages/, so
      // previously-extracted pic_ page images become orphaned.
      this.cleanupOrphanedPicFiles(resultPath, imagesDir);
      this.checkAbort(abortSignal);
      this.logger.info(
        "[ChunkedPDFConverter] Executing completion callback..."
      );
      await onComplete(outputDir);
    } finally {
      // Chunk scratch space is always removed; the final output directory is
      // removed only when the caller asked for post-callback cleanup.
      if ((0, import_node_fs7.existsSync)(chunksBaseDir)) {
        (0, import_node_fs7.rmSync)(chunksBaseDir, { recursive: true, force: true });
      }
      if (cleanupAfterCallback) {
        this.logger.info(
          "[ChunkedPDFConverter] Cleaning up output directory:",
          outputDir
        );
        if ((0, import_node_fs7.existsSync)(outputDir)) {
          (0, import_node_fs7.rmSync)(outputDir, { recursive: true, force: true });
        }
      } else {
        this.logger.info(
          "[ChunkedPDFConverter] Output preserved at:",
          outputDir
        );
      }
    }
    return null;
  }
  /**
   * Convert a single chunk with retry logic.
   * Makes up to config.maxRetries + 1 attempts; rethrows the last error when
   * all attempts fail. Returns the chunk's parsed DoclingDocument.
   */
  async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions) {
    const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
    for (let attempt = 0; attempt <= this.config.maxRetries; attempt++) {
      try {
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${this.config.maxRetries})...`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: converting...`
          );
        }
        const startTime = Date.now();
        // Restrict this conversion to the chunk's page range.
        const conversionOptions = buildConversionOptions({
          ...options,
          page_range: [startPage, endPage]
        });
        const task = await this.client.convertSourceAsync({
          sources: [{ kind: "http", url: httpUrl }],
          options: conversionOptions,
          target: { kind: "zip" }
        });
        await this.trackTaskProgress(task);
        const zipPath = (0, import_node_path6.join)(chunkDir, "result.zip");
        await this.downloadResult(task.taskId, zipPath);
        const extractDir = (0, import_node_path6.join)(chunkDir, "extracted");
        const chunkOutputDir = (0, import_node_path6.join)(chunkDir, "output");
        await ImageExtractor.extractAndSaveDocumentsFromZip(
          this.logger,
          zipPath,
          extractDir,
          chunkOutputDir
        );
        const resultJsonPath = (0, import_node_path6.join)(chunkOutputDir, "result.json");
        // "." = identity jq filter: parses result.json without loading it in-process.
        const doc = await runJqFileJson(".", resultJsonPath);
        // Free disk space early; only chunkOutputDir is needed for merging.
        if ((0, import_node_fs7.existsSync)(zipPath)) (0, import_node_fs7.rmSync)(zipPath, { force: true });
        if ((0, import_node_fs7.existsSync)(extractDir)) {
          (0, import_node_fs7.rmSync)(extractDir, { recursive: true, force: true });
        }
        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
          );
        }
        return doc;
      } catch (error) {
        if (attempt >= this.config.maxRetries) {
          this.logger.error(
            `[ChunkedPDFConverter] ${chunkLabel}: failed after ${this.config.maxRetries} retries`
          );
          throw error;
        }
        this.logger.warn(
          `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${this.config.maxRetries})...`
        );
      }
    }
    // Loop either returns or throws; this guards against future edits.
    throw new Error("Unreachable");
  }
  /** Calculate page ranges for chunks — inclusive 1-based [start, end] pairs. */
  calculateChunks(totalPages) {
    if (this.config.chunkSize <= 0) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
    }
    const ranges = [];
    for (let start = 1; start <= totalPages; start += this.config.chunkSize) {
      const end = Math.min(start + this.config.chunkSize - 1, totalPages);
      ranges.push([start, end]);
    }
    return ranges;
  }
  /** Get total page count using pdfinfo; returns 0 on any failure. */
  async getPageCount(pdfPath) {
    const result = await spawnAsync("pdfinfo", [pdfPath]);
    if (result.code !== 0) {
      return 0;
    }
    const match = result.stdout.match(/^Pages:\s+(\d+)/m);
    return match ? parseInt(match[1], 10) : 0;
  }
  /**
   * Poll task progress until completion.
   * Returns on success; throws on failure (with error details from the task
   * result when available) or when this.timeout elapses.
   */
  async trackTaskProgress(task) {
    const startTime = Date.now();
    while (true) {
      if (Date.now() - startTime > this.timeout) {
        throw new Error("[ChunkedPDFConverter] Chunk task timeout");
      }
      const status = await task.poll();
      if (status.task_status === "success") return;
      if (status.task_status === "failure") {
        let details = "unknown";
        try {
          const result = await task.getResult();
          if (result.errors?.length) {
            details = result.errors.map((e) => e.message).join("; ");
          }
        } catch {
          // Best effort only: fall back to "unknown" if the result fetch fails.
        }
        throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
      }
      await new Promise(
        (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
      );
    }
  }
  /**
   * Download ZIP result for a task.
   * Tries, in order: client-provided stream, client-provided buffer, then a
   * direct HTTP fetch of /v1/result/{taskId} as a last resort.
   */
  async downloadResult(taskId, zipPath) {
    const zipResult = await this.client.getTaskResultFile(taskId);
    if (zipResult.fileStream) {
      const writeStream = (0, import_node_fs7.createWriteStream)(zipPath);
      await (0, import_promises5.pipeline)(zipResult.fileStream, writeStream);
      return;
    }
    if (zipResult.data) {
      await (0, import_promises4.writeFile)(zipPath, zipResult.data);
      return;
    }
    const baseUrl = this.client.getConfig().baseUrl;
    const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
      headers: { Accept: "application/zip" }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
      );
    }
    const buffer = new Uint8Array(await response.arrayBuffer());
    await (0, import_promises4.writeFile)(zipPath, buffer);
  }
  /**
   * Relocate images from chunk output directories to the final images directory
   * with global indexing. pic_ and image_ files are renumbered independently,
   * in chunk order, with numeric (not lexicographic) per-chunk ordering.
   */
  relocateImages(chunksBaseDir, totalChunks, imagesDir) {
    let picGlobalIndex = 0;
    for (let i = 0; i < totalChunks; i++) {
      const chunkImagesDir = (0, import_node_path6.join)(
        chunksBaseDir,
        `_chunk_${i}`,
        "output",
        "images"
      );
      if (!(0, import_node_fs7.existsSync)(chunkImagesDir)) continue;
      // Numeric sort: "pic_10.png" must come after "pic_2.png".
      const picFiles = (0, import_node_fs7.readdirSync)(chunkImagesDir).filter((f) => f.startsWith("pic_") && f.endsWith(".png")).sort((a, b) => {
        const numA = parseInt(a.replace("pic_", "").replace(".png", ""), 10);
        const numB = parseInt(b.replace("pic_", "").replace(".png", ""), 10);
        return numA - numB;
      });
      for (const file of picFiles) {
        const src = (0, import_node_path6.join)(chunkImagesDir, file);
        const dest = (0, import_node_path6.join)(imagesDir, `pic_${picGlobalIndex}.png`);
        (0, import_node_fs7.copyFileSync)(src, dest);
        picGlobalIndex++;
      }
    }
    let imageGlobalIndex = 0;
    for (let i = 0; i < totalChunks; i++) {
      const chunkImagesDir = (0, import_node_path6.join)(
        chunksBaseDir,
        `_chunk_${i}`,
        "output",
        "images"
      );
      if (!(0, import_node_fs7.existsSync)(chunkImagesDir)) continue;
      const imageFiles = (0, import_node_fs7.readdirSync)(chunkImagesDir).filter((f) => f.startsWith("image_") && f.endsWith(".png")).sort((a, b) => {
        const numA = parseInt(
          a.replace("image_", "").replace(".png", ""),
          10
        );
        const numB = parseInt(
          b.replace("image_", "").replace(".png", ""),
          10
        );
        return numA - numB;
      });
      for (const file of imageFiles) {
        const src = (0, import_node_path6.join)(chunkImagesDir, file);
        const dest = (0, import_node_path6.join)(imagesDir, `image_${imageGlobalIndex}.png`);
        (0, import_node_fs7.copyFileSync)(src, dest);
        imageGlobalIndex++;
      }
    }
    this.logger.info(
      `[ChunkedPDFConverter] Relocated ${picGlobalIndex} pic + ${imageGlobalIndex} image files to ${imagesDir}`
    );
  }
  /**
   * Render page images from PDF using ImageMagick and update result.json.
   * Rewrites each page entry's image.uri to "pages/page_{page_no-1}.png"
   * via jq, writing through a .tmp file and renaming for atomic replacement.
   */
  async renderPageImages(pdfPath, outputDir) {
    this.logger.info(
      "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
    );
    const renderer = new PageRenderer(this.logger);
    const renderResult = await renderer.renderPages(pdfPath, outputDir);
    const resultPath = (0, import_node_path6.join)(outputDir, "result.json");
    const tmpPath = resultPath + ".tmp";
    // Only pages whose 0-based index falls within the rendered page count
    // get their image metadata rewritten; others pass through untouched.
    const jqProgram = `
      .pages |= with_entries(
        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
          .value.image.mimetype = "image/png" |
          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
        else . end
      )
    `;
    await runJqFileToFile(jqProgram, resultPath, tmpPath);
    await (0, import_promises4.rename)(tmpPath, resultPath);
    this.logger.info(
      `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
    );
  }
  /**
   * Remove pic_ files from images directory that are not referenced in result.json.
   * Chunked Docling conversion embeds page images as base64 in JSON, which get
   * extracted as pic_ files. After renderPageImages replaces page URIs with
   * pages/page_N.png, these pic_ files become orphaned.
   */
  cleanupOrphanedPicFiles(resultPath, imagesDir) {
    const content = (0, import_node_fs7.readFileSync)(resultPath, "utf-8");
    const referencedPics = /* @__PURE__ */ new Set();
    const picPattern = /images\/pic_\d+\.png/g;
    let match;
    // Collect every "images/pic_N.png" occurrence as its bare filename.
    while ((match = picPattern.exec(content)) !== null) {
      referencedPics.add(match[0].replace("images/", ""));
    }
    const picFiles = (0, import_node_fs7.readdirSync)(imagesDir).filter(
      (f) => f.startsWith("pic_") && f.endsWith(".png")
    );
    let removedCount = 0;
    for (const file of picFiles) {
      if (!referencedPics.has(file)) {
        (0, import_node_fs7.rmSync)((0, import_node_path6.join)(imagesDir, file), { force: true });
        removedCount++;
      }
    }
    if (removedCount > 0) {
      this.logger.info(
        `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
      );
    }
  }
  /**
   * Build cumulative pic_ file offsets per chunk for correct URI remapping.
   * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
   */
  buildPicFileOffsets(chunksBaseDir, totalChunks) {
    const offsets = [];
    let cumulative = 0;
    for (let i = 0; i < totalChunks; i++) {
      offsets.push(cumulative);
      const dir = (0, import_node_path6.join)(chunksBaseDir, `_chunk_${i}`, "output", "images");
      const count = (0, import_node_fs7.existsSync)(dir) ? (0, import_node_fs7.readdirSync)(dir).filter(
        (f) => f.startsWith("pic_") && f.endsWith(".png")
      ).length : 0;
      cumulative += count;
    }
    return offsets;
  }
  /** Check if abort has been signalled and throw (name "AbortError") if so. */
  checkAbort(signal) {
    if (signal?.aborted) {
      const error = new Error("Chunked PDF conversion was aborted");
      error.name = "AbortError";
      throw error;
    }
  }
};
|
|
2893
|
+
|
|
2894
|
+
// src/core/image-pdf-converter.ts
|
|
2895
|
+
var import_node_fs8 = require("fs");
|
|
2896
|
+
var import_node_os = require("os");
|
|
2897
|
+
var import_node_path7 = require("path");
|
|
2368
2898
|
var ImagePdfConverter = class {
|
|
2369
2899
|
constructor(logger) {
|
|
2370
2900
|
this.logger = logger;
|
|
@@ -2380,8 +2910,8 @@ var ImagePdfConverter = class {
|
|
|
2380
2910
|
async convert(pdfUrl, reportId) {
|
|
2381
2911
|
const timestamp = Date.now();
|
|
2382
2912
|
const tempDir = (0, import_node_os.tmpdir)();
|
|
2383
|
-
const inputPath = (0,
|
|
2384
|
-
const outputPath = (0,
|
|
2913
|
+
const inputPath = (0, import_node_path7.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
|
|
2914
|
+
const outputPath = (0, import_node_path7.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
|
|
2385
2915
|
try {
|
|
2386
2916
|
this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
|
|
2387
2917
|
await this.downloadPdf(pdfUrl, inputPath);
|
|
@@ -2390,8 +2920,8 @@ var ImagePdfConverter = class {
|
|
|
2390
2920
|
this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
|
|
2391
2921
|
return outputPath;
|
|
2392
2922
|
} finally {
|
|
2393
|
-
if ((0,
|
|
2394
|
-
(0,
|
|
2923
|
+
if ((0, import_node_fs8.existsSync)(inputPath)) {
|
|
2924
|
+
(0, import_node_fs8.rmSync)(inputPath, { force: true });
|
|
2395
2925
|
}
|
|
2396
2926
|
}
|
|
2397
2927
|
}
|
|
@@ -2438,12 +2968,12 @@ var ImagePdfConverter = class {
|
|
|
2438
2968
|
* Cleanup the temporary image PDF file
|
|
2439
2969
|
*/
|
|
2440
2970
|
cleanup(imagePdfPath) {
|
|
2441
|
-
if ((0,
|
|
2971
|
+
if ((0, import_node_fs8.existsSync)(imagePdfPath)) {
|
|
2442
2972
|
this.logger.info(
|
|
2443
2973
|
"[ImagePdfConverter] Cleaning up temp file:",
|
|
2444
2974
|
imagePdfPath
|
|
2445
2975
|
);
|
|
2446
|
-
(0,
|
|
2976
|
+
(0, import_node_fs8.rmSync)(imagePdfPath, { force: true });
|
|
2447
2977
|
}
|
|
2448
2978
|
}
|
|
2449
2979
|
};
|
|
@@ -2458,6 +2988,26 @@ var PDFConverter = class {
|
|
|
2458
2988
|
}
|
|
2459
2989
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
2460
2990
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
2991
|
+
if (options.chunkedConversion && url.startsWith("file://")) {
|
|
2992
|
+
const chunked = new ChunkedPDFConverter(
|
|
2993
|
+
this.logger,
|
|
2994
|
+
this.client,
|
|
2995
|
+
{
|
|
2996
|
+
chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
|
|
2997
|
+
maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
|
|
2998
|
+
},
|
|
2999
|
+
this.timeout
|
|
3000
|
+
);
|
|
3001
|
+
return chunked.convertChunked(
|
|
3002
|
+
url,
|
|
3003
|
+
reportId,
|
|
3004
|
+
onComplete,
|
|
3005
|
+
cleanupAfterCallback,
|
|
3006
|
+
options,
|
|
3007
|
+
(opts) => this.buildConversionOptions(opts),
|
|
3008
|
+
abortSignal
|
|
3009
|
+
);
|
|
3010
|
+
}
|
|
2461
3011
|
if (options.forceImagePdf) {
|
|
2462
3012
|
return this.convertViaImagePdf(
|
|
2463
3013
|
url,
|
|
@@ -2562,7 +3112,7 @@ var PDFConverter = class {
|
|
|
2562
3112
|
const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
|
|
2563
3113
|
return { method, reason, sampledPages: 0, totalPages: 0 };
|
|
2564
3114
|
}
|
|
2565
|
-
const samplingDir = (0,
|
|
3115
|
+
const samplingDir = (0, import_node_path8.join)(process.cwd(), "output", reportId, "_sampling");
|
|
2566
3116
|
const sampler = new OcrStrategySampler(
|
|
2567
3117
|
this.logger,
|
|
2568
3118
|
new PageRenderer(this.logger),
|
|
@@ -2587,8 +3137,8 @@ var PDFConverter = class {
|
|
|
2587
3137
|
}
|
|
2588
3138
|
return strategy;
|
|
2589
3139
|
} finally {
|
|
2590
|
-
if ((0,
|
|
2591
|
-
(0,
|
|
3140
|
+
if ((0, import_node_fs9.existsSync)(samplingDir)) {
|
|
3141
|
+
(0, import_node_fs9.rmSync)(samplingDir, { recursive: true, force: true });
|
|
2592
3142
|
}
|
|
2593
3143
|
}
|
|
2594
3144
|
}
|
|
@@ -2609,7 +3159,7 @@ var PDFConverter = class {
|
|
|
2609
3159
|
const wrappedCallback = async (outputDir) => {
|
|
2610
3160
|
let pageTexts;
|
|
2611
3161
|
try {
|
|
2612
|
-
const resultPath2 = (0,
|
|
3162
|
+
const resultPath2 = (0, import_node_path8.join)(outputDir, "result.json");
|
|
2613
3163
|
const totalPages = await runJqFileJson(
|
|
2614
3164
|
".pages | length",
|
|
2615
3165
|
resultPath2
|
|
@@ -2621,9 +3171,9 @@ var PDFConverter = class {
|
|
|
2621
3171
|
"[PDFConverter] pdftotext extraction failed, proceeding without text reference"
|
|
2622
3172
|
);
|
|
2623
3173
|
}
|
|
2624
|
-
const resultPath = (0,
|
|
2625
|
-
const ocrOriginPath = (0,
|
|
2626
|
-
(0,
|
|
3174
|
+
const resultPath = (0, import_node_path8.join)(outputDir, "result.json");
|
|
3175
|
+
const ocrOriginPath = (0, import_node_path8.join)(outputDir, "result_ocr_origin.json");
|
|
3176
|
+
(0, import_node_fs9.copyFileSync)(resultPath, ocrOriginPath);
|
|
2627
3177
|
const corrector = new VlmTextCorrector(this.logger);
|
|
2628
3178
|
await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
|
|
2629
3179
|
concurrency: options.vlmConcurrency,
|
|
@@ -2765,9 +3315,9 @@ var PDFConverter = class {
|
|
|
2765
3315
|
}
|
|
2766
3316
|
}
|
|
2767
3317
|
const cwd = process.cwd();
|
|
2768
|
-
const zipPath = (0,
|
|
2769
|
-
const extractDir = (0,
|
|
2770
|
-
const outputDir = (0,
|
|
3318
|
+
const zipPath = (0, import_node_path8.join)(cwd, "result.zip");
|
|
3319
|
+
const extractDir = (0, import_node_path8.join)(cwd, "result_extracted");
|
|
3320
|
+
const outputDir = (0, import_node_path8.join)(cwd, "output", reportId);
|
|
2771
3321
|
try {
|
|
2772
3322
|
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
2773
3323
|
await this.renderPageImages(url, outputDir);
|
|
@@ -2784,19 +3334,19 @@ var PDFConverter = class {
|
|
|
2784
3334
|
this.logger.info("[PDFConverter] Total time:", duration, "ms");
|
|
2785
3335
|
} finally {
|
|
2786
3336
|
this.logger.info("[PDFConverter] Cleaning up temporary files...");
|
|
2787
|
-
if ((0,
|
|
2788
|
-
(0,
|
|
3337
|
+
if ((0, import_node_fs9.existsSync)(zipPath)) {
|
|
3338
|
+
(0, import_node_fs9.rmSync)(zipPath, { force: true });
|
|
2789
3339
|
}
|
|
2790
|
-
if ((0,
|
|
2791
|
-
(0,
|
|
3340
|
+
if ((0, import_node_fs9.existsSync)(extractDir)) {
|
|
3341
|
+
(0, import_node_fs9.rmSync)(extractDir, { recursive: true, force: true });
|
|
2792
3342
|
}
|
|
2793
3343
|
if (cleanupAfterCallback) {
|
|
2794
3344
|
this.logger.info(
|
|
2795
3345
|
"[PDFConverter] Cleaning up output directory:",
|
|
2796
3346
|
outputDir
|
|
2797
3347
|
);
|
|
2798
|
-
if ((0,
|
|
2799
|
-
(0,
|
|
3348
|
+
if ((0, import_node_fs9.existsSync)(outputDir)) {
|
|
3349
|
+
(0, import_node_fs9.rmSync)(outputDir, { recursive: true, force: true });
|
|
2800
3350
|
}
|
|
2801
3351
|
} else {
|
|
2802
3352
|
this.logger.info("[PDFConverter] Output preserved at:", outputDir);
|
|
@@ -2814,7 +3364,10 @@ var PDFConverter = class {
|
|
|
2814
3364
|
"skipSampling",
|
|
2815
3365
|
"forcedMethod",
|
|
2816
3366
|
"aggregator",
|
|
2817
|
-
"onTokenUsage"
|
|
3367
|
+
"onTokenUsage",
|
|
3368
|
+
"chunkedConversion",
|
|
3369
|
+
"chunkSize",
|
|
3370
|
+
"chunkMaxRetries"
|
|
2818
3371
|
]),
|
|
2819
3372
|
to_formats: ["json", "html"],
|
|
2820
3373
|
image_export_mode: "embedded",
|
|
@@ -2942,15 +3495,15 @@ var PDFConverter = class {
|
|
|
2942
3495
|
"\n[PDFConverter] Task completed, downloading ZIP file..."
|
|
2943
3496
|
);
|
|
2944
3497
|
const zipResult = await this.client.getTaskResultFile(taskId);
|
|
2945
|
-
const zipPath = (0,
|
|
3498
|
+
const zipPath = (0, import_node_path8.join)(process.cwd(), "result.zip");
|
|
2946
3499
|
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
2947
3500
|
if (zipResult.fileStream) {
|
|
2948
|
-
const writeStream = (0,
|
|
2949
|
-
await (0,
|
|
3501
|
+
const writeStream = (0, import_node_fs9.createWriteStream)(zipPath);
|
|
3502
|
+
await (0, import_promises7.pipeline)(zipResult.fileStream, writeStream);
|
|
2950
3503
|
return;
|
|
2951
3504
|
}
|
|
2952
3505
|
if (zipResult.data) {
|
|
2953
|
-
await (0,
|
|
3506
|
+
await (0, import_promises6.writeFile)(zipPath, zipResult.data);
|
|
2954
3507
|
return;
|
|
2955
3508
|
}
|
|
2956
3509
|
this.logger.warn(
|
|
@@ -2966,7 +3519,7 @@ var PDFConverter = class {
|
|
|
2966
3519
|
);
|
|
2967
3520
|
}
|
|
2968
3521
|
const buffer = new Uint8Array(await response.arrayBuffer());
|
|
2969
|
-
await (0,
|
|
3522
|
+
await (0, import_promises6.writeFile)(zipPath, buffer);
|
|
2970
3523
|
}
|
|
2971
3524
|
async processConvertedFiles(zipPath, extractDir, outputDir) {
|
|
2972
3525
|
await ImageExtractor.extractAndSaveDocumentsFromZip(
|
|
@@ -2995,7 +3548,7 @@ var PDFConverter = class {
|
|
|
2995
3548
|
);
|
|
2996
3549
|
const renderer = new PageRenderer(this.logger);
|
|
2997
3550
|
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2998
|
-
const resultPath = (0,
|
|
3551
|
+
const resultPath = (0, import_node_path8.join)(outputDir, "result.json");
|
|
2999
3552
|
const tmpPath = resultPath + ".tmp";
|
|
3000
3553
|
const jqProgram = `
|
|
3001
3554
|
.pages |= with_entries(
|
|
@@ -3007,7 +3560,7 @@ var PDFConverter = class {
|
|
|
3007
3560
|
)
|
|
3008
3561
|
`;
|
|
3009
3562
|
await runJqFileToFile(jqProgram, resultPath, tmpPath);
|
|
3010
|
-
await (0,
|
|
3563
|
+
await (0, import_promises6.rename)(tmpPath, resultPath);
|
|
3011
3564
|
this.logger.info(
|
|
3012
3565
|
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
3013
3566
|
);
|
|
@@ -3042,7 +3595,7 @@ var PDFParser = class {
|
|
|
3042
3595
|
this.baseUrl = void 0;
|
|
3043
3596
|
}
|
|
3044
3597
|
this.timeout = timeout;
|
|
3045
|
-
this.venvPath = venvPath || (0,
|
|
3598
|
+
this.venvPath = venvPath || (0, import_node_path9.join)(process.cwd(), ".venv");
|
|
3046
3599
|
this.killExistingProcess = killExistingProcess;
|
|
3047
3600
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
3048
3601
|
}
|