@heripo/pdf-parser 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +647 -76
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +656 -76
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import { Docling } from "docling-sdk";
|
|
3
3
|
import { execSync } from "child_process";
|
|
4
4
|
import { platform } from "os";
|
|
5
|
-
import { join as
|
|
5
|
+
import { join as join8 } from "path";
|
|
6
6
|
|
|
7
7
|
// src/config/constants.ts
|
|
8
8
|
var PDF_PARSER = {
|
|
@@ -49,6 +49,12 @@ var PAGE_RENDERING = {
|
|
|
49
49
|
/** Low-resolution DPI for OCR strategy sampling */
|
|
50
50
|
SAMPLE_DPI: 150
|
|
51
51
|
};
|
|
52
|
+
// Fallback settings for chunked PDF conversion, used when the caller's
// options do not provide `chunkSize` / `chunkMaxRetries`.
var CHUNKED_CONVERSION = {
  // How many pages go into one chunk.
  DEFAULT_CHUNK_SIZE: 10,
  // How many additional attempts a failed chunk gets before giving up.
  DEFAULT_MAX_RETRIES: 2
};
|
|
52
58
|
var IMAGE_PDF_CONVERTER = {
|
|
53
59
|
/**
|
|
54
60
|
* ImageMagick density option (DPI) for PDF to image conversion
|
|
@@ -843,10 +849,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
843
849
|
|
|
844
850
|
// src/core/pdf-converter.ts
|
|
845
851
|
import { omit } from "es-toolkit";
|
|
846
|
-
import { copyFileSync, createWriteStream as
|
|
847
|
-
import { rename as
|
|
848
|
-
import { join as
|
|
849
|
-
import { pipeline as
|
|
852
|
+
import { copyFileSync as copyFileSync2, createWriteStream as createWriteStream4, existsSync as existsSync5, rmSync as rmSync4 } from "fs";
|
|
853
|
+
import { rename as rename3, writeFile as writeFile2 } from "fs/promises";
|
|
854
|
+
import { join as join7 } from "path";
|
|
855
|
+
import { pipeline as pipeline4 } from "stream/promises";
|
|
850
856
|
|
|
851
857
|
// src/errors/image-pdf-fallback-error.ts
|
|
852
858
|
var ImagePdfFallbackError = class extends Error {
|
|
@@ -1301,14 +1307,18 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1301
1307
|
// src/processors/page-renderer.ts
|
|
1302
1308
|
import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
|
|
1303
1309
|
import { join as join3 } from "path";
|
|
1304
|
-
var
|
|
1310
|
+
var PROGRESS_LOG_PERCENT_STEP = 10;
|
|
1305
1311
|
var PageRenderer = class {
|
|
1306
1312
|
constructor(logger) {
|
|
1307
1313
|
this.logger = logger;
|
|
1308
1314
|
}
|
|
1315
|
+
lastLoggedPercent = 0;
|
|
1309
1316
|
/**
|
|
1310
1317
|
* Render all pages of a PDF to individual PNG files.
|
|
1311
1318
|
*
|
|
1319
|
+
* Uses per-page rendering (`magick 'input.pdf[N]'`) when page count is known,
|
|
1320
|
+
* limiting peak memory to ~15MB/page instead of loading all pages at once.
|
|
1321
|
+
*
|
|
1312
1322
|
* @param pdfPath - Absolute path to the source PDF file
|
|
1313
1323
|
* @param outputDir - Directory where pages/ subdirectory will be created
|
|
1314
1324
|
* @param options - Rendering options
|
|
@@ -1325,50 +1335,54 @@ var PageRenderer = class {
|
|
|
1325
1335
|
this.logger.info(
|
|
1326
1336
|
`[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
|
|
1327
1337
|
);
|
|
1338
|
+
this.lastLoggedPercent = 0;
|
|
1339
|
+
for (let i = 0; i < totalPages; i++) {
|
|
1340
|
+
const result = await spawnAsync(
|
|
1341
|
+
"magick",
|
|
1342
|
+
[
|
|
1343
|
+
"-density",
|
|
1344
|
+
dpi.toString(),
|
|
1345
|
+
`${pdfPath}[${i}]`,
|
|
1346
|
+
"-background",
|
|
1347
|
+
"white",
|
|
1348
|
+
"-alpha",
|
|
1349
|
+
"remove",
|
|
1350
|
+
"-alpha",
|
|
1351
|
+
"off",
|
|
1352
|
+
join3(pagesDir, `page_${i}.png`)
|
|
1353
|
+
],
|
|
1354
|
+
{ captureStdout: false }
|
|
1355
|
+
);
|
|
1356
|
+
if (result.code !== 0) {
|
|
1357
|
+
throw new Error(
|
|
1358
|
+
`[PageRenderer] Failed to render page ${i + 1}/${totalPages}: ${result.stderr || "Unknown error"}`
|
|
1359
|
+
);
|
|
1360
|
+
}
|
|
1361
|
+
this.logProgress(i + 1, totalPages);
|
|
1362
|
+
}
|
|
1328
1363
|
} else {
|
|
1329
1364
|
this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
} catch {
|
|
1347
|
-
}
|
|
1348
|
-
}, PROGRESS_POLL_INTERVAL_MS);
|
|
1349
|
-
}
|
|
1350
|
-
try {
|
|
1351
|
-
const result = await spawnAsync("magick", [
|
|
1352
|
-
"-density",
|
|
1353
|
-
dpi.toString(),
|
|
1354
|
-
pdfPath,
|
|
1355
|
-
"-background",
|
|
1356
|
-
"white",
|
|
1357
|
-
"-alpha",
|
|
1358
|
-
"remove",
|
|
1359
|
-
"-alpha",
|
|
1360
|
-
"off",
|
|
1361
|
-
outputPattern
|
|
1362
|
-
]);
|
|
1365
|
+
const result = await spawnAsync(
|
|
1366
|
+
"magick",
|
|
1367
|
+
[
|
|
1368
|
+
"-density",
|
|
1369
|
+
dpi.toString(),
|
|
1370
|
+
pdfPath,
|
|
1371
|
+
"-background",
|
|
1372
|
+
"white",
|
|
1373
|
+
"-alpha",
|
|
1374
|
+
"remove",
|
|
1375
|
+
"-alpha",
|
|
1376
|
+
"off",
|
|
1377
|
+
join3(pagesDir, "page_%d.png")
|
|
1378
|
+
],
|
|
1379
|
+
{ captureStdout: false }
|
|
1380
|
+
);
|
|
1363
1381
|
if (result.code !== 0) {
|
|
1364
1382
|
throw new Error(
|
|
1365
1383
|
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1366
1384
|
);
|
|
1367
1385
|
}
|
|
1368
|
-
} finally {
|
|
1369
|
-
if (progressInterval) {
|
|
1370
|
-
clearInterval(progressInterval);
|
|
1371
|
-
}
|
|
1372
1386
|
}
|
|
1373
1387
|
const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
|
|
1374
1388
|
const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
|
|
@@ -1384,6 +1398,18 @@ var PageRenderer = class {
|
|
|
1384
1398
|
pageFiles
|
|
1385
1399
|
};
|
|
1386
1400
|
}
|
|
1401
|
+
/**
|
|
1402
|
+
* Log rendering progress at appropriate intervals (every 10%).
|
|
1403
|
+
*/
|
|
1404
|
+
logProgress(current, total) {
|
|
1405
|
+
const percent = Math.floor(current / total * 100);
|
|
1406
|
+
if (percent >= this.lastLoggedPercent + PROGRESS_LOG_PERCENT_STEP || current === total) {
|
|
1407
|
+
this.lastLoggedPercent = percent;
|
|
1408
|
+
this.logger.info(
|
|
1409
|
+
`[PageRenderer] Rendering pages: ${current}/${total} (${percent}%)`
|
|
1410
|
+
);
|
|
1411
|
+
}
|
|
1412
|
+
}
|
|
1387
1413
|
/**
|
|
1388
1414
|
* Get total page count using pdfinfo.
|
|
1389
1415
|
* Returns 0 on failure (progress logging will be skipped).
|
|
@@ -2317,10 +2343,541 @@ var LocalFileServer = class {
|
|
|
2317
2343
|
}
|
|
2318
2344
|
};
|
|
2319
2345
|
|
|
2346
|
+
// src/core/chunked-pdf-converter.ts
|
|
2347
|
+
import {
|
|
2348
|
+
copyFileSync,
|
|
2349
|
+
createWriteStream as createWriteStream3,
|
|
2350
|
+
existsSync as existsSync3,
|
|
2351
|
+
mkdirSync as mkdirSync3,
|
|
2352
|
+
readFileSync as readFileSync3,
|
|
2353
|
+
readdirSync as readdirSync3,
|
|
2354
|
+
rmSync as rmSync2,
|
|
2355
|
+
writeFileSync as writeFileSync3
|
|
2356
|
+
} from "fs";
|
|
2357
|
+
import { rename as rename2, writeFile } from "fs/promises";
|
|
2358
|
+
import { join as join5 } from "path";
|
|
2359
|
+
import { pipeline as pipeline3 } from "stream/promises";
|
|
2360
|
+
|
|
2361
|
+
// src/processors/docling-document-merger.ts
|
|
2362
|
+
// Matches JSON-pointer style refs into the four item collections that are
// concatenated during a merge, e.g. "#/texts/12".
var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/;
// Matches the relative URI of an extracted picture file, e.g. "images/pic_3.png".
var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/;
/**
 * Merges the DoclingDocuments produced for consecutive page ranges of one PDF
 * into a single document, shifting every index-based reference so it still
 * points at the same item after the collections are concatenated.
 */
var DoclingDocumentMerger = class {
  /**
   * Merge an array of DoclingDocuments into one.
   * The first chunk's metadata (schema_name, version, name, origin) is used as the base.
   * Input chunks are never mutated, except that a single-chunk input is
   * returned as-is (same object, no copy).
   *
   * @param chunks - Array of DoclingDocument objects to merge (must have at least 1)
   * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
   *   When provided, picFileOffsets[i] is used for pic_ URI remapping instead of
   *   the pictures array length, aligning URIs with relocated file indices.
   *   A missing entry falls back to the pictures array length.
   * @returns Merged DoclingDocument
   * @throws Error when `chunks` is empty
   */
  merge(chunks, picFileOffsets) {
    if (chunks.length === 0) {
      throw new Error("Cannot merge zero chunks");
    }
    if (chunks.length === 1) {
      return chunks[0];
    }
    const base = structuredClone(chunks[0]);
    const kinds = ["texts", "pictures", "tables", "groups"];
    for (let i = 1; i < chunks.length; i++) {
      const chunk = chunks[i];
      // Offsets are captured BEFORE anything from this chunk is appended so
      // that every ref of the chunk is shifted by the same amount.
      const offsets = {
        texts: base.texts.length,
        pictures: base.pictures.length,
        tables: base.tables.length,
        groups: base.groups.length
      };
      const picFileOffset = picFileOffsets?.[i] ?? offsets.pictures;
      for (const kind of kinds) {
        for (const item of chunk[kind]) {
          const remapped = this.remapItem(item, offsets);
          if (kind === "pictures") {
            this.remapPictureImageUri(remapped, picFileOffset);
          }
          base[kind].push(remapped);
        }
      }
      for (const child of chunk.body.children) {
        base.body.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      for (const child of chunk.furniture.children) {
        base.furniture.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      // Page entries are keyed by page number; chunks cover disjoint page
      // ranges, so a plain assign concatenates them.
      Object.assign(base.pages, chunk.pages);
    }
    return base;
  }
  /**
   * Deep-copy one item (text, picture, table or group) and shift all of its
   * references: self_ref, parent and every ref list it carries.
   *
   * Ref lists are optional: `children` exists on every item kind, while
   * `captions`, `references` and `footnotes` are only present on floating
   * items (pictures and tables). Lists that are absent are left absent.
   *
   * @param item - Item from a chunk's texts/pictures/tables/groups array
   * @param offsets - Per-collection index offsets for the chunk
   * @returns A remapped copy; `item` itself is not modified
   */
  remapItem(item, offsets) {
    const remapped = structuredClone(item);
    remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
    if (remapped.parent) {
      remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
    }
    for (const listName of ["children", "captions", "references", "footnotes"]) {
      if (Array.isArray(remapped[listName])) {
        remapped[listName] = this.remapRefList(remapped[listName], offsets);
      }
    }
    return remapped;
  }
  /**
   * Remap every `{ $ref }` entry of a list. Entries without a string `$ref`
   * are passed through untouched rather than being rewritten.
   */
  remapRefList(refs, offsets) {
    return refs.map(
      (entry) => typeof entry?.$ref === "string" ? { $ref: this.remapRef(entry.$ref, offsets) } : entry
    );
  }
  /**
   * Remap a $ref string by applying offsets.
   * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
   * Refs like "#/body" or "#/furniture" pass through unchanged.
   */
  remapRef(ref, offsets) {
    const match = REF_PATTERN.exec(ref);
    if (!match) {
      return ref;
    }
    const [, kind, rawIndex] = match;
    return `#/${kind}/${Number.parseInt(rawIndex, 10) + offsets[kind]}`;
  }
  /**
   * Remap image URI in a picture item by applying the pic file offset.
   * Transforms "images/pic_N.png" → "images/pic_{N+offset}.png".
   * Pictures without an image URI, or with a URI of another shape, are left as-is.
   * Mutates `picture` in place.
   */
  remapPictureImageUri(picture, picFileOffset) {
    const image = picture.image;
    if (!image?.uri) return;
    const match = IMAGE_URI_PATTERN.exec(image.uri);
    if (match) {
      const index = Number.parseInt(match[1], 10);
      image.uri = `images/pic_${index + picFileOffset}.png`;
    }
  }
};
|
|
2489
|
+
|
|
2490
|
+
// src/core/chunked-pdf-converter.ts
|
|
2491
|
+
/**
 * Converts a local PDF by splitting it into page-range chunks, converting each
 * chunk through the Docling client, and merging the per-chunk results into a
 * single output directory (result.json, images/, pages/).
 */
var ChunkedPDFConverter = class {
  /**
   * @param logger - Logger exposing info/warn/error
   * @param client - Docling client used to submit and download conversions
   * @param config - `{ chunkSize, maxRetries }`
   * @param timeout - Per-chunk polling timeout in milliseconds
   */
  constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
    this.logger = logger;
    this.client = client;
    this.config = config;
    this.timeout = timeout;
  }
  /**
   * Convert a local PDF in chunks.
   *
   * The temporary `_chunks` directory is always removed, and
   * `cleanupAfterCallback` is honoured on every exit path, including failures
   * that happen before the callback runs.
   *
   * @param url - file:// URL to the source PDF
   * @param reportId - Unique report identifier for output directory naming
   * @param onComplete - Callback invoked with the final output directory
   * @param cleanupAfterCallback - Whether to clean up the output directory after callback
   * @param options - PDF conversion options (chunked-specific fields are stripped internally)
   * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
   * @param abortSignal - Optional abort signal for cancellation
   * @returns Always `null`; results are delivered through `onComplete`
   * @throws Error when the page count cannot be detected, a chunk fails after
   *   all retries, or the signal is aborted (error name "AbortError")
   */
  async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
    // Strips the literal "file://" prefix.
    // NOTE(review): percent-encoded URLs (e.g. "%20") are not decoded here — confirm callers pass raw paths.
    const pdfPath = url.slice(7);
    const cwd = process.cwd();
    const outputDir = join5(cwd, "output", reportId);
    const chunksBaseDir = join5(cwd, "output", reportId, "_chunks");
    const totalPages = await this.getPageCount(pdfPath);
    if (totalPages === 0) {
      throw new Error(
        "[ChunkedPDFConverter] Failed to detect page count from PDF"
      );
    }
    const chunks = this.calculateChunks(totalPages);
    this.logger.info(
      `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
    );
    try {
      const chunkDocuments = await this.convertAllChunks(
        chunks,
        pdfPath,
        chunksBaseDir,
        options,
        buildConversionOptions,
        abortSignal
      );
      this.checkAbort(abortSignal);
      this.logger.info(
        `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
      );
      const merger = new DoclingDocumentMerger();
      const picFileOffsets = this.buildPicFileOffsets(
        chunksBaseDir,
        chunks.length
      );
      const merged = merger.merge(chunkDocuments, picFileOffsets);
      this.logger.info(
        `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
      );
      mkdirSync3(outputDir, { recursive: true });
      const imagesDir = join5(outputDir, "images");
      mkdirSync3(imagesDir, { recursive: true });
      this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
      const resultPath = join5(outputDir, "result.json");
      writeFileSync3(resultPath, JSON.stringify(merged));
      await this.renderPageImages(pdfPath, outputDir);
      this.cleanupOrphanedPicFiles(resultPath, imagesDir);
      this.checkAbort(abortSignal);
      this.logger.info(
        "[ChunkedPDFConverter] Executing completion callback..."
      );
      await onComplete(outputDir);
    } finally {
      // Runs on success AND on any failure above, so intermediate chunk
      // artifacts never outlive the conversion.
      if (existsSync3(chunksBaseDir)) {
        rmSync2(chunksBaseDir, { recursive: true, force: true });
      }
      if (cleanupAfterCallback) {
        this.logger.info(
          "[ChunkedPDFConverter] Cleaning up output directory:",
          outputDir
        );
        if (existsSync3(outputDir)) {
          rmSync2(outputDir, { recursive: true, force: true });
        }
      } else if (existsSync3(outputDir)) {
        this.logger.info(
          "[ChunkedPDFConverter] Output preserved at:",
          outputDir
        );
      }
    }
    return null;
  }
  /**
   * Serve the PDF over a local HTTP server and convert every chunk in order.
   * The server is stopped whether or not the conversions succeed.
   *
   * @returns The per-chunk documents, in chunk order
   */
  async convertAllChunks(chunks, pdfPath, chunksBaseDir, options, buildConversionOptions, abortSignal) {
    const server = new LocalFileServer();
    const httpUrl = await server.start(pdfPath);
    this.logger.info(
      "[ChunkedPDFConverter] Started local file server:",
      httpUrl
    );
    const chunkDocuments = [];
    try {
      for (let i = 0; i < chunks.length; i++) {
        this.checkAbort(abortSignal);
        const [start, end] = chunks[i];
        const chunkDir = join5(chunksBaseDir, `_chunk_${i}`);
        mkdirSync3(chunkDir, { recursive: true });
        const doc = await this.convertChunk(
          i,
          chunks.length,
          start,
          end,
          httpUrl,
          chunkDir,
          options,
          buildConversionOptions
        );
        chunkDocuments.push(doc);
      }
    } finally {
      this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
      await server.stop();
    }
    return chunkDocuments;
  }
  /**
   * Convert a single chunk with retry logic.
   *
   * Makes one attempt plus up to `config.maxRetries` retries. Before each
   * retry, whatever the failed attempt left in `chunkDir` is deleted so stale
   * files cannot leak into the merged output or the pic_ file counts.
   *
   * @returns The chunk's parsed result.json document
   * @throws The last attempt's error once all retries are exhausted
   */
  async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions) {
    const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
    // A negative or NaN retry count still gets one attempt.
    const maxRetries = this.config.maxRetries > 0 ? this.config.maxRetries : 0;
    const zipPath = join5(chunkDir, "result.zip");
    const extractDir = join5(chunkDir, "extracted");
    const chunkOutputDir = join5(chunkDir, "output");
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${maxRetries})...`
          );
          for (const stale of [zipPath, extractDir, chunkOutputDir]) {
            rmSync2(stale, { recursive: true, force: true });
          }
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: converting...`
          );
        }
        const startTime = Date.now();
        const conversionOptions = buildConversionOptions({
          ...options,
          page_range: [startPage, endPage]
        });
        const task = await this.client.convertSourceAsync({
          sources: [{ kind: "http", url: httpUrl }],
          options: conversionOptions,
          target: { kind: "zip" }
        });
        await this.trackTaskProgress(task);
        await this.downloadResult(task.taskId, zipPath);
        await ImageExtractor.extractAndSaveDocumentsFromZip(
          this.logger,
          zipPath,
          extractDir,
          chunkOutputDir
        );
        const resultJsonPath = join5(chunkOutputDir, "result.json");
        const doc = await runJqFileJson(".", resultJsonPath);
        // Only chunkOutputDir is needed from here on (images are relocated later).
        rmSync2(zipPath, { force: true });
        rmSync2(extractDir, { recursive: true, force: true });
        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
          );
        }
        return doc;
      } catch (error) {
        if (attempt >= maxRetries) {
          this.logger.error(
            `[ChunkedPDFConverter] ${chunkLabel}: failed after ${maxRetries} retries`,
            error
          );
          throw error;
        }
        this.logger.warn(
          `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${maxRetries})...`,
          error
        );
      }
    }
    throw new Error("Unreachable");
  }
  /**
   * Calculate page ranges for chunks.
   *
   * @param totalPages - Number of pages in the PDF
   * @returns Inclusive, 1-based `[start, end]` pairs covering pages 1..totalPages
   * @throws Error when `config.chunkSize` is not a positive whole number
   */
  calculateChunks(totalPages) {
    const { chunkSize } = this.config;
    // `!(x > 0)` also rejects NaN and undefined, which `x <= 0` lets through.
    if (!(chunkSize > 0)) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
    }
    // Infinity stays allowed: it yields a single chunk spanning every page.
    if (chunkSize !== Infinity && !Number.isInteger(chunkSize)) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be an integer");
    }
    const ranges = [];
    for (let start = 1; start <= totalPages; start += chunkSize) {
      const end = Math.min(start + chunkSize - 1, totalPages);
      ranges.push([start, end]);
    }
    return ranges;
  }
  /**
   * Get total page count using pdfinfo.
   *
   * @returns The page count, or 0 when pdfinfo fails or prints no "Pages:" line
   */
  async getPageCount(pdfPath) {
    const result = await spawnAsync("pdfinfo", [pdfPath]);
    if (result.code !== 0) {
      return 0;
    }
    const match = result.stdout.match(/^Pages:\s+(\d+)/m);
    return match ? Number.parseInt(match[1], 10) : 0;
  }
  /**
   * Poll task progress until completion.
   *
   * @throws Error when the task reports failure or `this.timeout` ms elapse
   */
  async trackTaskProgress(task) {
    const startTime = Date.now();
    while (true) {
      if (Date.now() - startTime > this.timeout) {
        throw new Error("[ChunkedPDFConverter] Chunk task timeout");
      }
      const status = await task.poll();
      if (status.task_status === "success") return;
      if (status.task_status === "failure") {
        let details = "unknown";
        try {
          const result = await task.getResult();
          if (result.errors?.length) {
            details = result.errors.map((e) => e.message).join("; ");
          }
        } catch {
          // Best effort only: the failure is reported below even when the
          // error details cannot be fetched.
        }
        throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
      }
      await new Promise(
        (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
      );
    }
  }
  /**
   * Download ZIP result for a task.
   *
   * Tries, in order: the client's file stream, the client's in-memory data,
   * and finally a direct HTTP request to `/v1/result/{taskId}`.
   *
   * @throws Error when the HTTP fallback responds with a non-2xx status
   */
  async downloadResult(taskId, zipPath) {
    const zipResult = await this.client.getTaskResultFile(taskId);
    if (zipResult.fileStream) {
      const writeStream = createWriteStream3(zipPath);
      await pipeline3(zipResult.fileStream, writeStream);
      return;
    }
    if (zipResult.data) {
      await writeFile(zipPath, zipResult.data);
      return;
    }
    const baseUrl = this.client.getConfig().baseUrl;
    const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
      headers: { Accept: "application/zip" }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
      );
    }
    const buffer = new Uint8Array(await response.arrayBuffer());
    await writeFile(zipPath, buffer);
  }
  /**
   * Relocate images from chunk output directories to the final images directory
   * with global indexing.
   *
   * `pic_N.png` and `image_N.png` files are numbered independently; within
   * each family, files are copied in chunk order, then numeric order.
   */
  relocateImages(chunksBaseDir, totalChunks, imagesDir) {
    const picCount = this.relocateByPrefix(chunksBaseDir, totalChunks, imagesDir, "pic_");
    const imageCount = this.relocateByPrefix(chunksBaseDir, totalChunks, imagesDir, "image_");
    this.logger.info(
      `[ChunkedPDFConverter] Relocated ${picCount} pic + ${imageCount} image files to ${imagesDir}`
    );
  }
  /**
   * Copy every `{prefix}N.png` file of every chunk into `imagesDir`,
   * renumbering them sequentially from 0 across all chunks.
   *
   * @returns Number of files copied
   */
  relocateByPrefix(chunksBaseDir, totalChunks, imagesDir, prefix) {
    const numberOf = (file) => Number.parseInt(file.replace(prefix, "").replace(".png", ""), 10);
    let globalIndex = 0;
    for (let i = 0; i < totalChunks; i++) {
      const chunkImagesDir = join5(
        chunksBaseDir,
        `_chunk_${i}`,
        "output",
        "images"
      );
      if (!existsSync3(chunkImagesDir)) continue;
      const files = readdirSync3(chunkImagesDir).filter((f) => f.startsWith(prefix) && f.endsWith(".png")).sort((a, b) => numberOf(a) - numberOf(b));
      for (const file of files) {
        copyFileSync(
          join5(chunkImagesDir, file),
          join5(imagesDir, `${prefix}${globalIndex}.png`)
        );
        globalIndex++;
      }
    }
    return globalIndex;
  }
  /** Render page images from PDF using ImageMagick and update result.json */
  async renderPageImages(pdfPath, outputDir) {
    this.logger.info(
      "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
    );
    const renderer = new PageRenderer(this.logger);
    const renderResult = await renderer.renderPages(pdfPath, outputDir);
    const resultPath = join5(outputDir, "result.json");
    const tmpPath = resultPath + ".tmp";
    // Points every page whose zero-based index has a rendered file at
    // pages/page_{index}.png; other page entries are left untouched.
    const jqProgram = `
      .pages |= with_entries(
        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
          .value.image.mimetype = "image/png" |
          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
        else . end
      )
    `;
    // Write to a temp file first, then rename over result.json.
    await runJqFileToFile(jqProgram, resultPath, tmpPath);
    await rename2(tmpPath, resultPath);
    this.logger.info(
      `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
    );
  }
  /**
   * Remove pic_ files from images directory that are not referenced in result.json.
   * Chunked Docling conversion embeds page images as base64 in JSON, which get
   * extracted as pic_ files. After renderPageImages replaces page URIs with
   * pages/page_N.png, these pic_ files become orphaned.
   */
  cleanupOrphanedPicFiles(resultPath, imagesDir) {
    const content = readFileSync3(resultPath, "utf-8");
    const referencedPics = /* @__PURE__ */ new Set();
    for (const [uri] of content.matchAll(/images\/pic_\d+\.png/g)) {
      referencedPics.add(uri.replace("images/", ""));
    }
    const picFiles = readdirSync3(imagesDir).filter(
      (f) => f.startsWith("pic_") && f.endsWith(".png")
    );
    let removedCount = 0;
    for (const file of picFiles) {
      if (!referencedPics.has(file)) {
        rmSync2(join5(imagesDir, file), { force: true });
        removedCount++;
      }
    }
    if (removedCount > 0) {
      this.logger.info(
        `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
      );
    }
  }
  /**
   * Build cumulative pic_ file offsets per chunk for correct URI remapping.
   * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
   * A chunk without an images directory contributes 0.
   */
  buildPicFileOffsets(chunksBaseDir, totalChunks) {
    const offsets = [];
    let cumulative = 0;
    for (let i = 0; i < totalChunks; i++) {
      offsets.push(cumulative);
      const dir = join5(chunksBaseDir, `_chunk_${i}`, "output", "images");
      const count = existsSync3(dir) ? readdirSync3(dir).filter(
        (f) => f.startsWith("pic_") && f.endsWith(".png")
      ).length : 0;
      cumulative += count;
    }
    return offsets;
  }
  /**
   * Check if abort has been signalled and throw if so.
   *
   * @throws Error named "AbortError" when `signal.aborted` is truthy
   */
  checkAbort(signal) {
    if (signal?.aborted) {
      const error = new Error("Chunked PDF conversion was aborted");
      error.name = "AbortError";
      throw error;
    }
  }
};
|
|
2876
|
+
|
|
2320
2877
|
// src/core/image-pdf-converter.ts
|
|
2321
|
-
import { existsSync as
|
|
2878
|
+
import { existsSync as existsSync4, rmSync as rmSync3 } from "fs";
|
|
2322
2879
|
import { tmpdir } from "os";
|
|
2323
|
-
import { join as
|
|
2880
|
+
import { join as join6 } from "path";
|
|
2324
2881
|
var ImagePdfConverter = class {
|
|
2325
2882
|
constructor(logger) {
|
|
2326
2883
|
this.logger = logger;
|
|
@@ -2336,8 +2893,8 @@ var ImagePdfConverter = class {
|
|
|
2336
2893
|
async convert(pdfUrl, reportId) {
|
|
2337
2894
|
const timestamp = Date.now();
|
|
2338
2895
|
const tempDir = tmpdir();
|
|
2339
|
-
const inputPath =
|
|
2340
|
-
const outputPath =
|
|
2896
|
+
const inputPath = join6(tempDir, `${reportId}-${timestamp}-input.pdf`);
|
|
2897
|
+
const outputPath = join6(tempDir, `${reportId}-${timestamp}-image.pdf`);
|
|
2341
2898
|
try {
|
|
2342
2899
|
this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
|
|
2343
2900
|
await this.downloadPdf(pdfUrl, inputPath);
|
|
@@ -2346,8 +2903,8 @@ var ImagePdfConverter = class {
|
|
|
2346
2903
|
this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
|
|
2347
2904
|
return outputPath;
|
|
2348
2905
|
} finally {
|
|
2349
|
-
if (
|
|
2350
|
-
|
|
2906
|
+
if (existsSync4(inputPath)) {
|
|
2907
|
+
rmSync3(inputPath, { force: true });
|
|
2351
2908
|
}
|
|
2352
2909
|
}
|
|
2353
2910
|
}
|
|
@@ -2394,12 +2951,12 @@ var ImagePdfConverter = class {
|
|
|
2394
2951
|
* Cleanup the temporary image PDF file
|
|
2395
2952
|
*/
|
|
2396
2953
|
cleanup(imagePdfPath) {
|
|
2397
|
-
if (
|
|
2954
|
+
if (existsSync4(imagePdfPath)) {
|
|
2398
2955
|
this.logger.info(
|
|
2399
2956
|
"[ImagePdfConverter] Cleaning up temp file:",
|
|
2400
2957
|
imagePdfPath
|
|
2401
2958
|
);
|
|
2402
|
-
|
|
2959
|
+
rmSync3(imagePdfPath, { force: true });
|
|
2403
2960
|
}
|
|
2404
2961
|
}
|
|
2405
2962
|
};
|
|
@@ -2414,6 +2971,26 @@ var PDFConverter = class {
|
|
|
2414
2971
|
}
|
|
2415
2972
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
2416
2973
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
2974
|
+
if (options.chunkedConversion && url.startsWith("file://")) {
|
|
2975
|
+
const chunked = new ChunkedPDFConverter(
|
|
2976
|
+
this.logger,
|
|
2977
|
+
this.client,
|
|
2978
|
+
{
|
|
2979
|
+
chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
|
|
2980
|
+
maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
|
|
2981
|
+
},
|
|
2982
|
+
this.timeout
|
|
2983
|
+
);
|
|
2984
|
+
return chunked.convertChunked(
|
|
2985
|
+
url,
|
|
2986
|
+
reportId,
|
|
2987
|
+
onComplete,
|
|
2988
|
+
cleanupAfterCallback,
|
|
2989
|
+
options,
|
|
2990
|
+
(opts) => this.buildConversionOptions(opts),
|
|
2991
|
+
abortSignal
|
|
2992
|
+
);
|
|
2993
|
+
}
|
|
2417
2994
|
if (options.forceImagePdf) {
|
|
2418
2995
|
return this.convertViaImagePdf(
|
|
2419
2996
|
url,
|
|
@@ -2518,7 +3095,7 @@ var PDFConverter = class {
|
|
|
2518
3095
|
const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
|
|
2519
3096
|
return { method, reason, sampledPages: 0, totalPages: 0 };
|
|
2520
3097
|
}
|
|
2521
|
-
const samplingDir =
|
|
3098
|
+
const samplingDir = join7(process.cwd(), "output", reportId, "_sampling");
|
|
2522
3099
|
const sampler = new OcrStrategySampler(
|
|
2523
3100
|
this.logger,
|
|
2524
3101
|
new PageRenderer(this.logger),
|
|
@@ -2543,8 +3120,8 @@ var PDFConverter = class {
|
|
|
2543
3120
|
}
|
|
2544
3121
|
return strategy;
|
|
2545
3122
|
} finally {
|
|
2546
|
-
if (
|
|
2547
|
-
|
|
3123
|
+
if (existsSync5(samplingDir)) {
|
|
3124
|
+
rmSync4(samplingDir, { recursive: true, force: true });
|
|
2548
3125
|
}
|
|
2549
3126
|
}
|
|
2550
3127
|
}
|
|
@@ -2565,7 +3142,7 @@ var PDFConverter = class {
|
|
|
2565
3142
|
const wrappedCallback = async (outputDir) => {
|
|
2566
3143
|
let pageTexts;
|
|
2567
3144
|
try {
|
|
2568
|
-
const resultPath2 =
|
|
3145
|
+
const resultPath2 = join7(outputDir, "result.json");
|
|
2569
3146
|
const totalPages = await runJqFileJson(
|
|
2570
3147
|
".pages | length",
|
|
2571
3148
|
resultPath2
|
|
@@ -2577,9 +3154,9 @@ var PDFConverter = class {
|
|
|
2577
3154
|
"[PDFConverter] pdftotext extraction failed, proceeding without text reference"
|
|
2578
3155
|
);
|
|
2579
3156
|
}
|
|
2580
|
-
const resultPath =
|
|
2581
|
-
const ocrOriginPath =
|
|
2582
|
-
|
|
3157
|
+
const resultPath = join7(outputDir, "result.json");
|
|
3158
|
+
const ocrOriginPath = join7(outputDir, "result_ocr_origin.json");
|
|
3159
|
+
copyFileSync2(resultPath, ocrOriginPath);
|
|
2583
3160
|
const corrector = new VlmTextCorrector(this.logger);
|
|
2584
3161
|
await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
|
|
2585
3162
|
concurrency: options.vlmConcurrency,
|
|
@@ -2721,9 +3298,9 @@ var PDFConverter = class {
|
|
|
2721
3298
|
}
|
|
2722
3299
|
}
|
|
2723
3300
|
const cwd = process.cwd();
|
|
2724
|
-
const zipPath =
|
|
2725
|
-
const extractDir =
|
|
2726
|
-
const outputDir =
|
|
3301
|
+
const zipPath = join7(cwd, "result.zip");
|
|
3302
|
+
const extractDir = join7(cwd, "result_extracted");
|
|
3303
|
+
const outputDir = join7(cwd, "output", reportId);
|
|
2727
3304
|
try {
|
|
2728
3305
|
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
2729
3306
|
await this.renderPageImages(url, outputDir);
|
|
@@ -2740,19 +3317,19 @@ var PDFConverter = class {
|
|
|
2740
3317
|
this.logger.info("[PDFConverter] Total time:", duration, "ms");
|
|
2741
3318
|
} finally {
|
|
2742
3319
|
this.logger.info("[PDFConverter] Cleaning up temporary files...");
|
|
2743
|
-
if (
|
|
2744
|
-
|
|
3320
|
+
if (existsSync5(zipPath)) {
|
|
3321
|
+
rmSync4(zipPath, { force: true });
|
|
2745
3322
|
}
|
|
2746
|
-
if (
|
|
2747
|
-
|
|
3323
|
+
if (existsSync5(extractDir)) {
|
|
3324
|
+
rmSync4(extractDir, { recursive: true, force: true });
|
|
2748
3325
|
}
|
|
2749
3326
|
if (cleanupAfterCallback) {
|
|
2750
3327
|
this.logger.info(
|
|
2751
3328
|
"[PDFConverter] Cleaning up output directory:",
|
|
2752
3329
|
outputDir
|
|
2753
3330
|
);
|
|
2754
|
-
if (
|
|
2755
|
-
|
|
3331
|
+
if (existsSync5(outputDir)) {
|
|
3332
|
+
rmSync4(outputDir, { recursive: true, force: true });
|
|
2756
3333
|
}
|
|
2757
3334
|
} else {
|
|
2758
3335
|
this.logger.info("[PDFConverter] Output preserved at:", outputDir);
|
|
@@ -2770,7 +3347,10 @@ var PDFConverter = class {
|
|
|
2770
3347
|
"skipSampling",
|
|
2771
3348
|
"forcedMethod",
|
|
2772
3349
|
"aggregator",
|
|
2773
|
-
"onTokenUsage"
|
|
3350
|
+
"onTokenUsage",
|
|
3351
|
+
"chunkedConversion",
|
|
3352
|
+
"chunkSize",
|
|
3353
|
+
"chunkMaxRetries"
|
|
2774
3354
|
]),
|
|
2775
3355
|
to_formats: ["json", "html"],
|
|
2776
3356
|
image_export_mode: "embedded",
|
|
@@ -2898,15 +3478,15 @@ var PDFConverter = class {
|
|
|
2898
3478
|
"\n[PDFConverter] Task completed, downloading ZIP file..."
|
|
2899
3479
|
);
|
|
2900
3480
|
const zipResult = await this.client.getTaskResultFile(taskId);
|
|
2901
|
-
const zipPath =
|
|
3481
|
+
const zipPath = join7(process.cwd(), "result.zip");
|
|
2902
3482
|
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
2903
3483
|
if (zipResult.fileStream) {
|
|
2904
|
-
const writeStream =
|
|
2905
|
-
await
|
|
3484
|
+
const writeStream = createWriteStream4(zipPath);
|
|
3485
|
+
await pipeline4(zipResult.fileStream, writeStream);
|
|
2906
3486
|
return;
|
|
2907
3487
|
}
|
|
2908
3488
|
if (zipResult.data) {
|
|
2909
|
-
await
|
|
3489
|
+
await writeFile2(zipPath, zipResult.data);
|
|
2910
3490
|
return;
|
|
2911
3491
|
}
|
|
2912
3492
|
this.logger.warn(
|
|
@@ -2922,7 +3502,7 @@ var PDFConverter = class {
|
|
|
2922
3502
|
);
|
|
2923
3503
|
}
|
|
2924
3504
|
const buffer = new Uint8Array(await response.arrayBuffer());
|
|
2925
|
-
await
|
|
3505
|
+
await writeFile2(zipPath, buffer);
|
|
2926
3506
|
}
|
|
2927
3507
|
async processConvertedFiles(zipPath, extractDir, outputDir) {
|
|
2928
3508
|
await ImageExtractor.extractAndSaveDocumentsFromZip(
|
|
@@ -2951,7 +3531,7 @@ var PDFConverter = class {
|
|
|
2951
3531
|
);
|
|
2952
3532
|
const renderer = new PageRenderer(this.logger);
|
|
2953
3533
|
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2954
|
-
const resultPath =
|
|
3534
|
+
const resultPath = join7(outputDir, "result.json");
|
|
2955
3535
|
const tmpPath = resultPath + ".tmp";
|
|
2956
3536
|
const jqProgram = `
|
|
2957
3537
|
.pages |= with_entries(
|
|
@@ -2963,7 +3543,7 @@ var PDFConverter = class {
|
|
|
2963
3543
|
)
|
|
2964
3544
|
`;
|
|
2965
3545
|
await runJqFileToFile(jqProgram, resultPath, tmpPath);
|
|
2966
|
-
await
|
|
3546
|
+
await rename3(tmpPath, resultPath);
|
|
2967
3547
|
this.logger.info(
|
|
2968
3548
|
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2969
3549
|
);
|
|
@@ -2998,7 +3578,7 @@ var PDFParser = class {
|
|
|
2998
3578
|
this.baseUrl = void 0;
|
|
2999
3579
|
}
|
|
3000
3580
|
this.timeout = timeout;
|
|
3001
|
-
this.venvPath = venvPath ||
|
|
3581
|
+
this.venvPath = venvPath || join8(process.cwd(), ".venv");
|
|
3002
3582
|
this.killExistingProcess = killExistingProcess;
|
|
3003
3583
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
3004
3584
|
}
|