@heripo/pdf-parser 0.1.11 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +595 -42
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +604 -42
- package/dist/index.js.map +1 -1
- package/package.json +5 -5
package/dist/index.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import { Docling } from "docling-sdk";
|
|
3
3
|
import { execSync } from "child_process";
|
|
4
4
|
import { platform } from "os";
|
|
5
|
-
import { join as
|
|
5
|
+
import { join as join8 } from "path";
|
|
6
6
|
|
|
7
7
|
// src/config/constants.ts
|
|
8
8
|
var PDF_PARSER = {
|
|
@@ -49,6 +49,12 @@ var PAGE_RENDERING = {
|
|
|
49
49
|
/** Low-resolution DPI for OCR strategy sampling */
|
|
50
50
|
SAMPLE_DPI: 150
|
|
51
51
|
};
|
|
52
|
+
// Fallback settings for chunked PDF conversion; PDFConverter.convert uses them
// when options.chunkSize / options.chunkMaxRetries are null or undefined.
var CHUNKED_CONVERSION = {
|
|
53
|
+
/** Number of pages per chunk (the last chunk of a document may be shorter) */
|
|
54
|
+
DEFAULT_CHUNK_SIZE: 10,
|
|
55
|
+
/** Maximum retry attempts per failed chunk, in addition to the initial attempt */
|
|
56
|
+
DEFAULT_MAX_RETRIES: 2
|
|
57
|
+
};
|
|
52
58
|
var IMAGE_PDF_CONVERTER = {
|
|
53
59
|
/**
|
|
54
60
|
* ImageMagick density option (DPI) for PDF to image conversion
|
|
@@ -843,10 +849,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
843
849
|
|
|
844
850
|
// src/core/pdf-converter.ts
|
|
845
851
|
import { omit } from "es-toolkit";
|
|
846
|
-
import { copyFileSync, createWriteStream as
|
|
847
|
-
import { rename as
|
|
848
|
-
import { join as
|
|
849
|
-
import { pipeline as
|
|
852
|
+
import { copyFileSync as copyFileSync2, createWriteStream as createWriteStream4, existsSync as existsSync5, rmSync as rmSync4 } from "fs";
|
|
853
|
+
import { rename as rename3, writeFile as writeFile2 } from "fs/promises";
|
|
854
|
+
import { join as join7 } from "path";
|
|
855
|
+
import { pipeline as pipeline4 } from "stream/promises";
|
|
850
856
|
|
|
851
857
|
// src/errors/image-pdf-fallback-error.ts
|
|
852
858
|
var ImagePdfFallbackError = class extends Error {
|
|
@@ -1750,7 +1756,8 @@ var VlmTextCorrector = class {
|
|
|
1750
1756
|
},
|
|
1751
1757
|
{
|
|
1752
1758
|
type: "image",
|
|
1753
|
-
image:
|
|
1759
|
+
image: imageBase64,
|
|
1760
|
+
mediaType: "image/png"
|
|
1754
1761
|
}
|
|
1755
1762
|
]
|
|
1756
1763
|
}
|
|
@@ -1962,7 +1969,7 @@ var VlmTextCorrector = class {
|
|
|
1962
1969
|
*/
|
|
1963
1970
|
readPageImage(outputDir, pageNo) {
|
|
1964
1971
|
const imagePath = join4(outputDir, "pages", `page_${pageNo - 1}.png`);
|
|
1965
|
-
return readFileSync(imagePath)
|
|
1972
|
+
return new Uint8Array(readFileSync(imagePath));
|
|
1966
1973
|
}
|
|
1967
1974
|
/**
|
|
1968
1975
|
* Apply VLM corrections to the DoclingDocument.
|
|
@@ -2231,7 +2238,7 @@ var OcrStrategySampler = class {
|
|
|
2231
2238
|
this.logger.debug(
|
|
2232
2239
|
`[OcrStrategySampler] Analyzing page ${pageNo} for Korean-Hanja mix and language...`
|
|
2233
2240
|
);
|
|
2234
|
-
const
|
|
2241
|
+
const imageData = new Uint8Array(readFileSync2(pageFile));
|
|
2235
2242
|
const messages = [
|
|
2236
2243
|
{
|
|
2237
2244
|
role: "user",
|
|
@@ -2239,7 +2246,8 @@ var OcrStrategySampler = class {
|
|
|
2239
2246
|
{ type: "text", text: KOREAN_HANJA_MIX_PROMPT },
|
|
2240
2247
|
{
|
|
2241
2248
|
type: "image",
|
|
2242
|
-
image:
|
|
2249
|
+
image: imageData,
|
|
2250
|
+
mediaType: "image/png"
|
|
2243
2251
|
}
|
|
2244
2252
|
]
|
|
2245
2253
|
}
|
|
@@ -2337,10 +2345,541 @@ var LocalFileServer = class {
|
|
|
2337
2345
|
}
|
|
2338
2346
|
};
|
|
2339
2347
|
|
|
2348
|
+
// src/core/chunked-pdf-converter.ts
|
|
2349
|
+
import {
|
|
2350
|
+
copyFileSync,
|
|
2351
|
+
createWriteStream as createWriteStream3,
|
|
2352
|
+
existsSync as existsSync3,
|
|
2353
|
+
mkdirSync as mkdirSync3,
|
|
2354
|
+
readFileSync as readFileSync3,
|
|
2355
|
+
readdirSync as readdirSync3,
|
|
2356
|
+
rmSync as rmSync2,
|
|
2357
|
+
writeFileSync as writeFileSync3
|
|
2358
|
+
} from "fs";
|
|
2359
|
+
import { rename as rename2, writeFile } from "fs/promises";
|
|
2360
|
+
import { join as join5 } from "path";
|
|
2361
|
+
import { pipeline as pipeline3 } from "stream/promises";
|
|
2362
|
+
|
|
2363
|
+
// src/processors/docling-document-merger.ts
|
|
2364
|
+
// Matches JSON-pointer style refs into the four indexed item collections,
// e.g. "#/texts/12". Refs such as "#/body" or "#/furniture" do not match.
var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/;
// Matches the URI of an extracted picture file, e.g. "images/pic_3.png".
var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/;
var DoclingDocumentMerger = class {
  /**
   * Merge an array of DoclingDocuments into one.
   * The first chunk's metadata (schema_name, version, name, origin) is used as the base.
   *
   * Items of every later chunk are deep-cloned, their refs are shifted by the
   * number of items already present in the merged document, and they are
   * appended. Input chunks are never mutated. When exactly one chunk is given
   * it is returned as-is (same object, no copy).
   *
   * @param chunks - Array of DoclingDocument objects to merge (must have at least 1)
   * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
   *   When provided, picFileOffsets[i] is used for pic_ URI remapping instead of
   *   the pictures array length, aligning URIs with relocated file indices.
   *   A missing entry falls back to the pictures array length.
   * @returns Merged DoclingDocument
   * @throws Error when `chunks` is empty
   */
  merge(chunks, picFileOffsets) {
    if (chunks.length === 0) {
      throw new Error("Cannot merge zero chunks");
    }
    if (chunks.length === 1) {
      return chunks[0];
    }
    // Ref-list fields per item kind. Pictures and tables are floating items and
    // can carry captions, footnotes and references in addition to children.
    const plainRefLists = ["children"];
    const floatingRefLists = ["children", "captions", "footnotes", "references"];
    const base = structuredClone(chunks[0]);
    for (let i = 1; i < chunks.length; i++) {
      const chunk = chunks[i];
      // Snapshot the collection sizes BEFORE appending anything from this
      // chunk: every ref inside the chunk is relative to the chunk's own
      // zero-based indices.
      const offsets = {
        texts: base.texts.length,
        pictures: base.pictures.length,
        tables: base.tables.length,
        groups: base.groups.length
      };
      const picFileOffset = picFileOffsets?.[i] ?? offsets.pictures;
      for (const text of chunk.texts) {
        base.texts.push(this.remapItem(text, offsets, plainRefLists));
      }
      for (const picture of chunk.pictures) {
        const remapped = this.remapItem(picture, offsets, floatingRefLists);
        this.remapPictureImageUri(remapped, picFileOffset);
        base.pictures.push(remapped);
      }
      for (const table of chunk.tables) {
        base.tables.push(this.remapItem(table, offsets, floatingRefLists));
      }
      for (const group of chunk.groups) {
        base.groups.push(this.remapItem(group, offsets, plainRefLists));
      }
      for (const child of chunk.body.children) {
        base.body.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      for (const child of chunk.furniture.children) {
        base.furniture.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      // Deep-copy pages so the merged document never shares objects with its
      // inputs (all other items are cloned as well). Later chunks win on a
      // duplicate page key.
      for (const [pageKey, page] of Object.entries(chunk.pages)) {
        base.pages[pageKey] = structuredClone(page);
      }
    }
    return base;
  }
  /**
   * Deep-clone one item and shift every ref it holds by `offsets`.
   *
   * `self_ref` and `parent.$ref` are always remapped. Each field named in
   * `refListFields` is remapped only when it is present as an array; absent
   * fields are left absent. Remapped list entries contain only `$ref`.
   *
   * @param item - Item from a chunk's texts/pictures/tables/groups array
   * @param offsets - Per-collection index offsets ({ texts, pictures, tables, groups })
   * @param refListFields - Names of the item's ref-array fields to remap
   * @returns The remapped clone
   */
  remapItem(item, offsets, refListFields) {
    const remapped = structuredClone(item);
    remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
    if (remapped.parent) {
      remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
    }
    for (const field of refListFields) {
      if (Array.isArray(remapped[field])) {
        remapped[field] = remapped[field].map((entry) => ({
          $ref: this.remapRef(entry.$ref, offsets)
        }));
      }
    }
    return remapped;
  }
  /**
   * Remap a $ref string by applying offsets.
   * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
   * Refs like "#/body" or "#/furniture" pass through unchanged.
   */
  remapRef(ref, offsets) {
    const match = REF_PATTERN.exec(ref);
    if (!match) {
      return ref;
    }
    const kind = match[1];
    const index = parseInt(match[2], 10);
    return `#/${kind}/${index + offsets[kind]}`;
  }
  /**
   * Remap image URI in a picture item by applying the pic file offset.
   * Transforms "images/pic_N.png" → "images/pic_{N+offset}.png".
   * The picture is modified in place; URIs of any other shape are left alone.
   */
  remapPictureImageUri(picture, picFileOffset) {
    const image = picture.image;
    if (!image?.uri) return;
    const match = IMAGE_URI_PATTERN.exec(image.uri);
    if (match) {
      const index = parseInt(match[1], 10);
      image.uri = `images/pic_${index + picFileOffset}.png`;
    }
  }
};
|
|
2491
|
+
|
|
2492
|
+
// src/core/chunked-pdf-converter.ts
|
|
2493
|
+
/**
 * Converts a local PDF by splitting it into page-range chunks, converting each
 * chunk through the Docling client, and merging the per-chunk results into a
 * single output directory (`<cwd>/output/<reportId>`).
 */
var ChunkedPDFConverter = class {
  /**
   * @param logger - Logger exposing info/warn/error
   * @param client - Docling client (convertSourceAsync, getTaskResultFile, getConfig)
   * @param config - `{ chunkSize, maxRetries }`
   * @param timeout - Per-chunk polling timeout in milliseconds
   */
  constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
    this.logger = logger;
    this.client = client;
    this.config = config;
    this.timeout = timeout;
  }
  /**
   * Convert a local PDF in chunks.
   *
   * Working files live under `<outputDir>/_chunks` and are always removed when
   * this method finishes, whether it succeeds, fails or is aborted. The output
   * directory itself is removed only when `cleanupAfterCallback` is true.
   *
   * @param url - file:// URL to the source PDF
   * @param reportId - Unique report identifier for output directory naming
   * @param onComplete - Callback invoked with the final output directory
   * @param cleanupAfterCallback - Whether to clean up the output directory after callback
   * @param options - PDF conversion options (chunked-specific fields are stripped internally)
   * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
   * @param abortSignal - Optional abort signal for cancellation
   * @returns null
   * @throws Error when the page count cannot be detected, a chunk fails after
   *   all retries, or the signal is aborted (error.name === "AbortError")
   */
  async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
    // Strips the "file://" scheme prefix.
    // NOTE(review): percent-encoded characters in the URL are not decoded here — confirm callers pass plain paths.
    const pdfPath = url.slice(7);
    const cwd = process.cwd();
    const outputDir = join5(cwd, "output", reportId);
    const chunksBaseDir = join5(cwd, "output", reportId, "_chunks");
    const totalPages = await this.getPageCount(pdfPath);
    if (totalPages === 0) {
      throw new Error(
        "[ChunkedPDFConverter] Failed to detect page count from PDF"
      );
    }
    const chunks = this.calculateChunks(totalPages);
    this.logger.info(
      `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
    );
    // One try/finally spans every phase that touches the file system, so a
    // failure or abort during conversion/merge cannot leave _chunks behind.
    try {
      const server = new LocalFileServer();
      const httpUrl = await server.start(pdfPath);
      this.logger.info(
        "[ChunkedPDFConverter] Started local file server:",
        httpUrl
      );
      const chunkDocuments = [];
      try {
        for (let i = 0; i < chunks.length; i++) {
          this.checkAbort(abortSignal);
          const [start, end] = chunks[i];
          const chunkDir = join5(chunksBaseDir, `_chunk_${i}`);
          mkdirSync3(chunkDir, { recursive: true });
          const doc = await this.convertChunk(
            i,
            chunks.length,
            start,
            end,
            httpUrl,
            chunkDir,
            options,
            buildConversionOptions,
            abortSignal
          );
          chunkDocuments.push(doc);
        }
      } finally {
        this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
        await server.stop();
      }
      this.checkAbort(abortSignal);
      this.logger.info(
        `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
      );
      const merger = new DoclingDocumentMerger();
      const picFileOffsets = this.buildPicFileOffsets(
        chunksBaseDir,
        chunks.length
      );
      const merged = merger.merge(chunkDocuments, picFileOffsets);
      this.logger.info(
        `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
      );
      mkdirSync3(outputDir, { recursive: true });
      const imagesDir = join5(outputDir, "images");
      mkdirSync3(imagesDir, { recursive: true });
      this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
      const resultPath = join5(outputDir, "result.json");
      writeFileSync3(resultPath, JSON.stringify(merged));
      await this.renderPageImages(pdfPath, outputDir);
      this.cleanupOrphanedPicFiles(resultPath, imagesDir);
      this.checkAbort(abortSignal);
      this.logger.info(
        "[ChunkedPDFConverter] Executing completion callback..."
      );
      await onComplete(outputDir);
    } finally {
      if (existsSync3(chunksBaseDir)) {
        rmSync2(chunksBaseDir, { recursive: true, force: true });
      }
      if (cleanupAfterCallback) {
        this.logger.info(
          "[ChunkedPDFConverter] Cleaning up output directory:",
          outputDir
        );
        if (existsSync3(outputDir)) {
          rmSync2(outputDir, { recursive: true, force: true });
        }
      } else {
        this.logger.info(
          "[ChunkedPDFConverter] Output preserved at:",
          outputDir
        );
      }
    }
    return null;
  }
  /**
   * Convert a single chunk with retry logic.
   *
   * The chunk is attempted once plus up to `config.maxRetries` more times
   * (a missing, negative or non-integer maxRetries is treated as 0). Files
   * left by a failed attempt are removed before the next attempt so stale
   * images cannot leak into the merged output.
   *
   * @param chunkIndex - Zero-based chunk index (used for logging)
   * @param totalChunks - Total number of chunks (used for logging)
   * @param startPage - First page of the chunk (1-based, inclusive)
   * @param endPage - Last page of the chunk (inclusive)
   * @param httpUrl - URL the Docling service downloads the PDF from
   * @param chunkDir - Existing working directory for this chunk
   * @param options - PDF conversion options
   * @param buildConversionOptions - Builds Docling ConversionOptions from the options
   * @param abortSignal - Optional abort signal; checked before every attempt and while polling
   * @returns The chunk's document as read from its result.json
   * @throws The last attempt's error, or an AbortError when aborted
   */
  async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions, abortSignal) {
    const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
    const configuredRetries = this.config.maxRetries;
    const maxRetries = Number.isInteger(configuredRetries) && configuredRetries > 0 ? configuredRetries : 0;
    const zipPath = join5(chunkDir, "result.zip");
    const extractDir = join5(chunkDir, "extracted");
    const chunkOutputDir = join5(chunkDir, "output");
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      // Outside the try block on purpose: an abort must not be retried.
      this.checkAbort(abortSignal);
      try {
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${maxRetries})...`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: converting...`
          );
        }
        const startTime = Date.now();
        const conversionOptions = buildConversionOptions({
          ...options,
          page_range: [startPage, endPage]
        });
        const task = await this.client.convertSourceAsync({
          sources: [{ kind: "http", url: httpUrl }],
          options: conversionOptions,
          target: { kind: "zip" }
        });
        await this.trackTaskProgress(task, abortSignal);
        await this.downloadResult(task.taskId, zipPath);
        await ImageExtractor.extractAndSaveDocumentsFromZip(
          this.logger,
          zipPath,
          extractDir,
          chunkOutputDir
        );
        const resultJsonPath = join5(chunkOutputDir, "result.json");
        const doc = await runJqFileJson(".", resultJsonPath);
        // Only chunkOutputDir is needed later (its images are relocated).
        if (existsSync3(zipPath)) rmSync2(zipPath, { force: true });
        if (existsSync3(extractDir)) {
          rmSync2(extractDir, { recursive: true, force: true });
        }
        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
          );
        }
        return doc;
      } catch (error) {
        this.removeChunkArtifacts([zipPath, extractDir, chunkOutputDir]);
        if (abortSignal?.aborted) {
          throw error;
        }
        if (attempt >= maxRetries) {
          this.logger.error(
            `[ChunkedPDFConverter] ${chunkLabel}: failed after ${maxRetries} retries`
          );
          throw error;
        }
        this.logger.warn(
          `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${maxRetries})...`,
          error
        );
      }
    }
    throw new Error("Unreachable");
  }
  /**
   * Best-effort removal of files/directories left behind by a failed attempt.
   * A removal failure is logged and never replaces the error being handled.
   */
  removeChunkArtifacts(paths) {
    for (const target of paths) {
      try {
        rmSync2(target, { recursive: true, force: true });
      } catch (cleanupError) {
        this.logger.warn(
          "[ChunkedPDFConverter] Failed to remove chunk artifact:",
          target,
          cleanupError
        );
      }
    }
  }
  /**
   * Calculate page ranges for chunks.
   *
   * @param totalPages - Total number of pages in the PDF
   * @returns Array of [start, end] pairs, 1-based and inclusive
   * @throws Error when config.chunkSize is not a positive integer
   */
  calculateChunks(totalPages) {
    const { chunkSize } = this.config;
    // NaN or a fractional size would otherwise yield ranges like [1, NaN].
    if (!Number.isInteger(chunkSize)) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be an integer");
    }
    if (chunkSize <= 0) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
    }
    const ranges = [];
    for (let start = 1; start <= totalPages; start += chunkSize) {
      const end = Math.min(start + chunkSize - 1, totalPages);
      ranges.push([start, end]);
    }
    return ranges;
  }
  /**
   * Get total page count using pdfinfo.
   *
   * @returns The page count, or 0 when pdfinfo exits non-zero or prints no "Pages:" line
   */
  async getPageCount(pdfPath) {
    const result = await spawnAsync("pdfinfo", [pdfPath]);
    if (result.code !== 0) {
      return 0;
    }
    const match = result.stdout.match(/^Pages:\s+(\d+)/m);
    return match ? parseInt(match[1], 10) : 0;
  }
  /**
   * Poll task progress until completion.
   *
   * @param task - Task handle exposing poll() and getResult()
   * @param abortSignal - Optional abort signal, checked before every poll
   * @throws Error on timeout or task failure; AbortError when aborted
   */
  async trackTaskProgress(task, abortSignal) {
    const startTime = Date.now();
    while (true) {
      this.checkAbort(abortSignal);
      if (Date.now() - startTime > this.timeout) {
        throw new Error("[ChunkedPDFConverter] Chunk task timeout");
      }
      const status = await task.poll();
      if (status.task_status === "success") return;
      if (status.task_status === "failure") {
        let details = "unknown";
        try {
          const result = await task.getResult();
          if (result.errors?.length) {
            details = result.errors.map((e) => e.message).join("; ");
          }
        } catch {
          // Deliberately ignored: fetching details is best-effort and the
          // failure itself is reported just below.
        }
        throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
      }
      await new Promise(
        (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
      );
    }
  }
  /**
   * Download ZIP result for a task.
   * Uses the client's file stream, then its in-memory data, and finally a
   * direct HTTP request to `<baseUrl>/v1/result/<taskId>`.
   *
   * @throws Error when the HTTP fallback returns a non-OK response
   */
  async downloadResult(taskId, zipPath) {
    const zipResult = await this.client.getTaskResultFile(taskId);
    if (zipResult.fileStream) {
      const writeStream = createWriteStream3(zipPath);
      await pipeline3(zipResult.fileStream, writeStream);
      return;
    }
    if (zipResult.data) {
      await writeFile(zipPath, zipResult.data);
      return;
    }
    const baseUrl = this.client.getConfig().baseUrl;
    const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
      headers: { Accept: "application/zip" }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
      );
    }
    const buffer = new Uint8Array(await response.arrayBuffer());
    await writeFile(zipPath, buffer);
  }
  /**
   * Relocate images from chunk output directories to the final images directory
   * with global indexing. `pic_*.png` and `image_*.png` files are numbered
   * independently of each other.
   */
  relocateImages(chunksBaseDir, totalChunks, imagesDir) {
    const picCount = this.relocatePrefixedImages(chunksBaseDir, totalChunks, imagesDir, "pic_");
    const imageCount = this.relocatePrefixedImages(chunksBaseDir, totalChunks, imagesDir, "image_");
    this.logger.info(
      `[ChunkedPDFConverter] Relocated ${picCount} pic + ${imageCount} image files to ${imagesDir}`
    );
  }
  /**
   * Copy every `<prefix>N.png` of every chunk into `imagesDir`, renumbering
   * them sequentially in chunk order and, within a chunk, in numeric order of N.
   *
   * NOTE(review): files are renumbered sequentially while the merger shifts
   * URIs by `N + offset`; the two agree only when each chunk's N values are
   * contiguous from 0 — confirm against ImageExtractor output.
   *
   * @returns Number of files copied
   */
  relocatePrefixedImages(chunksBaseDir, totalChunks, imagesDir, prefix) {
    const localIndex = (name) => parseInt(name.replace(prefix, "").replace(".png", ""), 10);
    let globalIndex = 0;
    for (let i = 0; i < totalChunks; i++) {
      const chunkImagesDir = join5(chunksBaseDir, `_chunk_${i}`, "output", "images");
      if (!existsSync3(chunkImagesDir)) continue;
      const files = readdirSync3(chunkImagesDir).filter((f) => f.startsWith(prefix) && f.endsWith(".png")).sort((a, b) => localIndex(a) - localIndex(b));
      for (const file of files) {
        copyFileSync(
          join5(chunkImagesDir, file),
          join5(imagesDir, `${prefix}${globalIndex}.png`)
        );
        globalIndex++;
      }
    }
    return globalIndex;
  }
  /** Render page images from PDF using ImageMagick and update result.json */
  async renderPageImages(pdfPath, outputDir) {
    this.logger.info(
      "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
    );
    const renderer = new PageRenderer(this.logger);
    const renderResult = await renderer.renderPages(pdfPath, outputDir);
    const resultPath = join5(outputDir, "result.json");
    const tmpPath = resultPath + ".tmp";
    // Point every page whose zero-based index was rendered at its PNG file.
    const jqProgram = `
      .pages |= with_entries(
        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
          .value.image.mimetype = "image/png" |
          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
        else . end
      )
    `;
    // Write to a temp file and rename it so result.json is replaced in one step.
    await runJqFileToFile(jqProgram, resultPath, tmpPath);
    await rename2(tmpPath, resultPath);
    this.logger.info(
      `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
    );
  }
  /**
   * Remove pic_ files from images directory that are not referenced in result.json.
   * Chunked Docling conversion embeds page images as base64 in JSON, which get
   * extracted as pic_ files. After renderPageImages replaces page URIs with
   * pages/page_N.png, these pic_ files become orphaned.
   */
  cleanupOrphanedPicFiles(resultPath, imagesDir) {
    const content = readFileSync3(resultPath, "utf-8");
    const referencedPics = /* @__PURE__ */ new Set();
    for (const match of content.matchAll(/images\/pic_\d+\.png/g)) {
      referencedPics.add(match[0].replace("images/", ""));
    }
    const picFiles = readdirSync3(imagesDir).filter(
      (f) => f.startsWith("pic_") && f.endsWith(".png")
    );
    let removedCount = 0;
    for (const file of picFiles) {
      if (!referencedPics.has(file)) {
        rmSync2(join5(imagesDir, file), { force: true });
        removedCount++;
      }
    }
    if (removedCount > 0) {
      this.logger.info(
        `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
      );
    }
  }
  /**
   * Build cumulative pic_ file offsets per chunk for correct URI remapping.
   * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
   * A chunk without an images directory contributes 0.
   */
  buildPicFileOffsets(chunksBaseDir, totalChunks) {
    const offsets = [];
    let cumulative = 0;
    for (let i = 0; i < totalChunks; i++) {
      offsets.push(cumulative);
      const dir = join5(chunksBaseDir, `_chunk_${i}`, "output", "images");
      const count = existsSync3(dir) ? readdirSync3(dir).filter(
        (f) => f.startsWith("pic_") && f.endsWith(".png")
      ).length : 0;
      cumulative += count;
    }
    return offsets;
  }
  /** Check if abort has been signalled and throw an Error named "AbortError" if so */
  checkAbort(signal) {
    if (signal?.aborted) {
      const error = new Error("Chunked PDF conversion was aborted");
      error.name = "AbortError";
      throw error;
    }
  }
};
|
|
2878
|
+
|
|
2340
2879
|
// src/core/image-pdf-converter.ts
|
|
2341
|
-
import { existsSync as
|
|
2880
|
+
import { existsSync as existsSync4, rmSync as rmSync3 } from "fs";
|
|
2342
2881
|
import { tmpdir } from "os";
|
|
2343
|
-
import { join as
|
|
2882
|
+
import { join as join6 } from "path";
|
|
2344
2883
|
var ImagePdfConverter = class {
|
|
2345
2884
|
constructor(logger) {
|
|
2346
2885
|
this.logger = logger;
|
|
@@ -2356,8 +2895,8 @@ var ImagePdfConverter = class {
|
|
|
2356
2895
|
async convert(pdfUrl, reportId) {
|
|
2357
2896
|
const timestamp = Date.now();
|
|
2358
2897
|
const tempDir = tmpdir();
|
|
2359
|
-
const inputPath =
|
|
2360
|
-
const outputPath =
|
|
2898
|
+
const inputPath = join6(tempDir, `${reportId}-${timestamp}-input.pdf`);
|
|
2899
|
+
const outputPath = join6(tempDir, `${reportId}-${timestamp}-image.pdf`);
|
|
2361
2900
|
try {
|
|
2362
2901
|
this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
|
|
2363
2902
|
await this.downloadPdf(pdfUrl, inputPath);
|
|
@@ -2366,8 +2905,8 @@ var ImagePdfConverter = class {
|
|
|
2366
2905
|
this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
|
|
2367
2906
|
return outputPath;
|
|
2368
2907
|
} finally {
|
|
2369
|
-
if (
|
|
2370
|
-
|
|
2908
|
+
if (existsSync4(inputPath)) {
|
|
2909
|
+
rmSync3(inputPath, { force: true });
|
|
2371
2910
|
}
|
|
2372
2911
|
}
|
|
2373
2912
|
}
|
|
@@ -2414,12 +2953,12 @@ var ImagePdfConverter = class {
|
|
|
2414
2953
|
* Cleanup the temporary image PDF file
|
|
2415
2954
|
*/
|
|
2416
2955
|
cleanup(imagePdfPath) {
|
|
2417
|
-
if (
|
|
2956
|
+
if (existsSync4(imagePdfPath)) {
|
|
2418
2957
|
this.logger.info(
|
|
2419
2958
|
"[ImagePdfConverter] Cleaning up temp file:",
|
|
2420
2959
|
imagePdfPath
|
|
2421
2960
|
);
|
|
2422
|
-
|
|
2961
|
+
rmSync3(imagePdfPath, { force: true });
|
|
2423
2962
|
}
|
|
2424
2963
|
}
|
|
2425
2964
|
};
|
|
@@ -2434,6 +2973,26 @@ var PDFConverter = class {
|
|
|
2434
2973
|
}
|
|
2435
2974
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
2436
2975
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
2976
|
+
if (options.chunkedConversion && url.startsWith("file://")) {
|
|
2977
|
+
const chunked = new ChunkedPDFConverter(
|
|
2978
|
+
this.logger,
|
|
2979
|
+
this.client,
|
|
2980
|
+
{
|
|
2981
|
+
chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
|
|
2982
|
+
maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
|
|
2983
|
+
},
|
|
2984
|
+
this.timeout
|
|
2985
|
+
);
|
|
2986
|
+
return chunked.convertChunked(
|
|
2987
|
+
url,
|
|
2988
|
+
reportId,
|
|
2989
|
+
onComplete,
|
|
2990
|
+
cleanupAfterCallback,
|
|
2991
|
+
options,
|
|
2992
|
+
(opts) => this.buildConversionOptions(opts),
|
|
2993
|
+
abortSignal
|
|
2994
|
+
);
|
|
2995
|
+
}
|
|
2437
2996
|
if (options.forceImagePdf) {
|
|
2438
2997
|
return this.convertViaImagePdf(
|
|
2439
2998
|
url,
|
|
@@ -2538,7 +3097,7 @@ var PDFConverter = class {
|
|
|
2538
3097
|
const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
|
|
2539
3098
|
return { method, reason, sampledPages: 0, totalPages: 0 };
|
|
2540
3099
|
}
|
|
2541
|
-
const samplingDir =
|
|
3100
|
+
const samplingDir = join7(process.cwd(), "output", reportId, "_sampling");
|
|
2542
3101
|
const sampler = new OcrStrategySampler(
|
|
2543
3102
|
this.logger,
|
|
2544
3103
|
new PageRenderer(this.logger),
|
|
@@ -2563,8 +3122,8 @@ var PDFConverter = class {
|
|
|
2563
3122
|
}
|
|
2564
3123
|
return strategy;
|
|
2565
3124
|
} finally {
|
|
2566
|
-
if (
|
|
2567
|
-
|
|
3125
|
+
if (existsSync5(samplingDir)) {
|
|
3126
|
+
rmSync4(samplingDir, { recursive: true, force: true });
|
|
2568
3127
|
}
|
|
2569
3128
|
}
|
|
2570
3129
|
}
|
|
@@ -2585,7 +3144,7 @@ var PDFConverter = class {
|
|
|
2585
3144
|
const wrappedCallback = async (outputDir) => {
|
|
2586
3145
|
let pageTexts;
|
|
2587
3146
|
try {
|
|
2588
|
-
const resultPath2 =
|
|
3147
|
+
const resultPath2 = join7(outputDir, "result.json");
|
|
2589
3148
|
const totalPages = await runJqFileJson(
|
|
2590
3149
|
".pages | length",
|
|
2591
3150
|
resultPath2
|
|
@@ -2597,9 +3156,9 @@ var PDFConverter = class {
|
|
|
2597
3156
|
"[PDFConverter] pdftotext extraction failed, proceeding without text reference"
|
|
2598
3157
|
);
|
|
2599
3158
|
}
|
|
2600
|
-
const resultPath =
|
|
2601
|
-
const ocrOriginPath =
|
|
2602
|
-
|
|
3159
|
+
const resultPath = join7(outputDir, "result.json");
|
|
3160
|
+
const ocrOriginPath = join7(outputDir, "result_ocr_origin.json");
|
|
3161
|
+
copyFileSync2(resultPath, ocrOriginPath);
|
|
2603
3162
|
const corrector = new VlmTextCorrector(this.logger);
|
|
2604
3163
|
await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
|
|
2605
3164
|
concurrency: options.vlmConcurrency,
|
|
@@ -2741,9 +3300,9 @@ var PDFConverter = class {
|
|
|
2741
3300
|
}
|
|
2742
3301
|
}
|
|
2743
3302
|
const cwd = process.cwd();
|
|
2744
|
-
const zipPath =
|
|
2745
|
-
const extractDir =
|
|
2746
|
-
const outputDir =
|
|
3303
|
+
const zipPath = join7(cwd, "result.zip");
|
|
3304
|
+
const extractDir = join7(cwd, "result_extracted");
|
|
3305
|
+
const outputDir = join7(cwd, "output", reportId);
|
|
2747
3306
|
try {
|
|
2748
3307
|
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
2749
3308
|
await this.renderPageImages(url, outputDir);
|
|
@@ -2760,19 +3319,19 @@ var PDFConverter = class {
|
|
|
2760
3319
|
this.logger.info("[PDFConverter] Total time:", duration, "ms");
|
|
2761
3320
|
} finally {
|
|
2762
3321
|
this.logger.info("[PDFConverter] Cleaning up temporary files...");
|
|
2763
|
-
if (
|
|
2764
|
-
|
|
3322
|
+
if (existsSync5(zipPath)) {
|
|
3323
|
+
rmSync4(zipPath, { force: true });
|
|
2765
3324
|
}
|
|
2766
|
-
if (
|
|
2767
|
-
|
|
3325
|
+
if (existsSync5(extractDir)) {
|
|
3326
|
+
rmSync4(extractDir, { recursive: true, force: true });
|
|
2768
3327
|
}
|
|
2769
3328
|
if (cleanupAfterCallback) {
|
|
2770
3329
|
this.logger.info(
|
|
2771
3330
|
"[PDFConverter] Cleaning up output directory:",
|
|
2772
3331
|
outputDir
|
|
2773
3332
|
);
|
|
2774
|
-
if (
|
|
2775
|
-
|
|
3333
|
+
if (existsSync5(outputDir)) {
|
|
3334
|
+
rmSync4(outputDir, { recursive: true, force: true });
|
|
2776
3335
|
}
|
|
2777
3336
|
} else {
|
|
2778
3337
|
this.logger.info("[PDFConverter] Output preserved at:", outputDir);
|
|
@@ -2790,7 +3349,10 @@ var PDFConverter = class {
|
|
|
2790
3349
|
"skipSampling",
|
|
2791
3350
|
"forcedMethod",
|
|
2792
3351
|
"aggregator",
|
|
2793
|
-
"onTokenUsage"
|
|
3352
|
+
"onTokenUsage",
|
|
3353
|
+
"chunkedConversion",
|
|
3354
|
+
"chunkSize",
|
|
3355
|
+
"chunkMaxRetries"
|
|
2794
3356
|
]),
|
|
2795
3357
|
to_formats: ["json", "html"],
|
|
2796
3358
|
image_export_mode: "embedded",
|
|
@@ -2918,15 +3480,15 @@ var PDFConverter = class {
|
|
|
2918
3480
|
"\n[PDFConverter] Task completed, downloading ZIP file..."
|
|
2919
3481
|
);
|
|
2920
3482
|
const zipResult = await this.client.getTaskResultFile(taskId);
|
|
2921
|
-
const zipPath =
|
|
3483
|
+
const zipPath = join7(process.cwd(), "result.zip");
|
|
2922
3484
|
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
2923
3485
|
if (zipResult.fileStream) {
|
|
2924
|
-
const writeStream =
|
|
2925
|
-
await
|
|
3486
|
+
const writeStream = createWriteStream4(zipPath);
|
|
3487
|
+
await pipeline4(zipResult.fileStream, writeStream);
|
|
2926
3488
|
return;
|
|
2927
3489
|
}
|
|
2928
3490
|
if (zipResult.data) {
|
|
2929
|
-
await
|
|
3491
|
+
await writeFile2(zipPath, zipResult.data);
|
|
2930
3492
|
return;
|
|
2931
3493
|
}
|
|
2932
3494
|
this.logger.warn(
|
|
@@ -2942,7 +3504,7 @@ var PDFConverter = class {
|
|
|
2942
3504
|
);
|
|
2943
3505
|
}
|
|
2944
3506
|
const buffer = new Uint8Array(await response.arrayBuffer());
|
|
2945
|
-
await
|
|
3507
|
+
await writeFile2(zipPath, buffer);
|
|
2946
3508
|
}
|
|
2947
3509
|
async processConvertedFiles(zipPath, extractDir, outputDir) {
|
|
2948
3510
|
await ImageExtractor.extractAndSaveDocumentsFromZip(
|
|
@@ -2971,7 +3533,7 @@ var PDFConverter = class {
|
|
|
2971
3533
|
);
|
|
2972
3534
|
const renderer = new PageRenderer(this.logger);
|
|
2973
3535
|
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2974
|
-
const resultPath =
|
|
3536
|
+
const resultPath = join7(outputDir, "result.json");
|
|
2975
3537
|
const tmpPath = resultPath + ".tmp";
|
|
2976
3538
|
const jqProgram = `
|
|
2977
3539
|
.pages |= with_entries(
|
|
@@ -2983,7 +3545,7 @@ var PDFConverter = class {
|
|
|
2983
3545
|
)
|
|
2984
3546
|
`;
|
|
2985
3547
|
await runJqFileToFile(jqProgram, resultPath, tmpPath);
|
|
2986
|
-
await
|
|
3548
|
+
await rename3(tmpPath, resultPath);
|
|
2987
3549
|
this.logger.info(
|
|
2988
3550
|
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2989
3551
|
);
|
|
@@ -3018,7 +3580,7 @@ var PDFParser = class {
|
|
|
3018
3580
|
this.baseUrl = void 0;
|
|
3019
3581
|
}
|
|
3020
3582
|
this.timeout = timeout;
|
|
3021
|
-
this.venvPath = venvPath ||
|
|
3583
|
+
this.venvPath = venvPath || join8(process.cwd(), ".venv");
|
|
3022
3584
|
this.killExistingProcess = killExistingProcess;
|
|
3023
3585
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
3024
3586
|
}
|