@heripo/pdf-parser 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +647 -76
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +656 -76
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
package/dist/index.cjs
CHANGED
|
@@ -40,7 +40,7 @@ module.exports = __toCommonJS(src_exports);
|
|
|
40
40
|
var import_docling_sdk = require("docling-sdk");
|
|
41
41
|
var import_node_child_process3 = require("child_process");
|
|
42
42
|
var import_node_os2 = require("os");
|
|
43
|
-
var
|
|
43
|
+
var import_node_path9 = require("path");
|
|
44
44
|
|
|
45
45
|
// src/config/constants.ts
|
|
46
46
|
var PDF_PARSER = {
|
|
@@ -87,6 +87,12 @@ var PAGE_RENDERING = {
|
|
|
87
87
|
/** Low-resolution DPI for OCR strategy sampling */
|
|
88
88
|
SAMPLE_DPI: 150
|
|
89
89
|
};
|
|
90
|
+
/**
 * Defaults for chunked PDF conversion.
 *
 * Frozen so that an accidental write (e.g. `CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE = n`)
 * cannot silently change the defaults used by every later conversion.
 */
var CHUNKED_CONVERSION = Object.freeze({
  /** Number of pages per chunk */
  DEFAULT_CHUNK_SIZE: 10,
  /** Maximum retry attempts per failed chunk */
  DEFAULT_MAX_RETRIES: 2
});
|
|
90
96
|
var IMAGE_PDF_CONVERTER = {
|
|
91
97
|
/**
|
|
92
98
|
* ImageMagick density option (DPI) for PDF to image conversion
|
|
@@ -875,10 +881,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
|
|
|
875
881
|
|
|
876
882
|
// src/core/pdf-converter.ts
|
|
877
883
|
var import_es_toolkit = require("es-toolkit");
|
|
878
|
-
var
|
|
879
|
-
var
|
|
880
|
-
var
|
|
881
|
-
var
|
|
884
|
+
var import_node_fs9 = require("fs");
|
|
885
|
+
var import_promises6 = require("fs/promises");
|
|
886
|
+
var import_node_path8 = require("path");
|
|
887
|
+
var import_promises7 = require("stream/promises");
|
|
882
888
|
|
|
883
889
|
// src/errors/image-pdf-fallback-error.ts
|
|
884
890
|
var ImagePdfFallbackError = class extends Error {
|
|
@@ -1325,14 +1331,18 @@ var ImageExtractor = class _ImageExtractor {
|
|
|
1325
1331
|
// src/processors/page-renderer.ts
|
|
1326
1332
|
var import_node_fs3 = require("fs");
|
|
1327
1333
|
var import_node_path3 = require("path");
|
|
1328
|
-
var
|
|
1334
|
+
var PROGRESS_LOG_PERCENT_STEP = 10;
|
|
1329
1335
|
var PageRenderer = class {
|
|
1330
1336
|
constructor(logger) {
|
|
1331
1337
|
this.logger = logger;
|
|
1332
1338
|
}
|
|
1339
|
+
lastLoggedPercent = 0;
|
|
1333
1340
|
/**
|
|
1334
1341
|
* Render all pages of a PDF to individual PNG files.
|
|
1335
1342
|
*
|
|
1343
|
+
* Uses per-page rendering (`magick 'input.pdf[N]'`) when page count is known,
|
|
1344
|
+
* limiting peak memory to ~15MB/page instead of loading all pages at once.
|
|
1345
|
+
*
|
|
1336
1346
|
* @param pdfPath - Absolute path to the source PDF file
|
|
1337
1347
|
* @param outputDir - Directory where pages/ subdirectory will be created
|
|
1338
1348
|
* @param options - Rendering options
|
|
@@ -1349,50 +1359,54 @@ var PageRenderer = class {
|
|
|
1349
1359
|
this.logger.info(
|
|
1350
1360
|
`[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
|
|
1351
1361
|
);
|
|
1362
|
+
this.lastLoggedPercent = 0;
|
|
1363
|
+
for (let i = 0; i < totalPages; i++) {
|
|
1364
|
+
const result = await spawnAsync(
|
|
1365
|
+
"magick",
|
|
1366
|
+
[
|
|
1367
|
+
"-density",
|
|
1368
|
+
dpi.toString(),
|
|
1369
|
+
`${pdfPath}[${i}]`,
|
|
1370
|
+
"-background",
|
|
1371
|
+
"white",
|
|
1372
|
+
"-alpha",
|
|
1373
|
+
"remove",
|
|
1374
|
+
"-alpha",
|
|
1375
|
+
"off",
|
|
1376
|
+
(0, import_node_path3.join)(pagesDir, `page_${i}.png`)
|
|
1377
|
+
],
|
|
1378
|
+
{ captureStdout: false }
|
|
1379
|
+
);
|
|
1380
|
+
if (result.code !== 0) {
|
|
1381
|
+
throw new Error(
|
|
1382
|
+
`[PageRenderer] Failed to render page ${i + 1}/${totalPages}: ${result.stderr || "Unknown error"}`
|
|
1383
|
+
);
|
|
1384
|
+
}
|
|
1385
|
+
this.logProgress(i + 1, totalPages);
|
|
1386
|
+
}
|
|
1352
1387
|
} else {
|
|
1353
1388
|
this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
} catch {
|
|
1371
|
-
}
|
|
1372
|
-
}, PROGRESS_POLL_INTERVAL_MS);
|
|
1373
|
-
}
|
|
1374
|
-
try {
|
|
1375
|
-
const result = await spawnAsync("magick", [
|
|
1376
|
-
"-density",
|
|
1377
|
-
dpi.toString(),
|
|
1378
|
-
pdfPath,
|
|
1379
|
-
"-background",
|
|
1380
|
-
"white",
|
|
1381
|
-
"-alpha",
|
|
1382
|
-
"remove",
|
|
1383
|
-
"-alpha",
|
|
1384
|
-
"off",
|
|
1385
|
-
outputPattern
|
|
1386
|
-
]);
|
|
1389
|
+
const result = await spawnAsync(
|
|
1390
|
+
"magick",
|
|
1391
|
+
[
|
|
1392
|
+
"-density",
|
|
1393
|
+
dpi.toString(),
|
|
1394
|
+
pdfPath,
|
|
1395
|
+
"-background",
|
|
1396
|
+
"white",
|
|
1397
|
+
"-alpha",
|
|
1398
|
+
"remove",
|
|
1399
|
+
"-alpha",
|
|
1400
|
+
"off",
|
|
1401
|
+
(0, import_node_path3.join)(pagesDir, "page_%d.png")
|
|
1402
|
+
],
|
|
1403
|
+
{ captureStdout: false }
|
|
1404
|
+
);
|
|
1387
1405
|
if (result.code !== 0) {
|
|
1388
1406
|
throw new Error(
|
|
1389
1407
|
`[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
|
|
1390
1408
|
);
|
|
1391
1409
|
}
|
|
1392
|
-
} finally {
|
|
1393
|
-
if (progressInterval) {
|
|
1394
|
-
clearInterval(progressInterval);
|
|
1395
|
-
}
|
|
1396
1410
|
}
|
|
1397
1411
|
const pageFiles = (0, import_node_fs3.readdirSync)(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
|
|
1398
1412
|
const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
|
|
@@ -1408,6 +1422,18 @@ var PageRenderer = class {
|
|
|
1408
1422
|
pageFiles
|
|
1409
1423
|
};
|
|
1410
1424
|
}
|
|
1425
|
+
/**
|
|
1426
|
+
* Log rendering progress at appropriate intervals (every 10%).
|
|
1427
|
+
*/
|
|
1428
|
+
logProgress(current, total) {
|
|
1429
|
+
const percent = Math.floor(current / total * 100);
|
|
1430
|
+
if (percent >= this.lastLoggedPercent + PROGRESS_LOG_PERCENT_STEP || current === total) {
|
|
1431
|
+
this.lastLoggedPercent = percent;
|
|
1432
|
+
this.logger.info(
|
|
1433
|
+
`[PageRenderer] Rendering pages: ${current}/${total} (${percent}%)`
|
|
1434
|
+
);
|
|
1435
|
+
}
|
|
1436
|
+
}
|
|
1411
1437
|
/**
|
|
1412
1438
|
* Get total page count using pdfinfo.
|
|
1413
1439
|
* Returns 0 on failure (progress logging will be skipped).
|
|
@@ -2341,10 +2367,532 @@ var LocalFileServer = class {
|
|
|
2341
2367
|
}
|
|
2342
2368
|
};
|
|
2343
2369
|
|
|
2344
|
-
// src/core/
|
|
2370
|
+
// src/core/chunked-pdf-converter.ts
|
|
2345
2371
|
var import_node_fs7 = require("fs");
|
|
2346
|
-
var
|
|
2372
|
+
var import_promises4 = require("fs/promises");
|
|
2347
2373
|
var import_node_path6 = require("path");
|
|
2374
|
+
var import_promises5 = require("stream/promises");
|
|
2375
|
+
|
|
2376
|
+
// src/processors/docling-document-merger.ts
|
|
2377
|
+
/** Matches item refs of the form "#/{texts|pictures|tables|groups}/{N}". */
var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/;
/** Matches extracted picture URIs of the form "images/pic_{N}.png". */
var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/;
/**
 * Merges DoclingDocuments produced from consecutive page ranges of one PDF
 * into a single document, renumbering item refs so they stay valid.
 */
var DoclingDocumentMerger = class {
  /** Item collections that are concatenated, in the order they are processed. */
  static ITEM_KINDS = ["texts", "pictures", "tables", "groups"];
  /**
   * Item fields holding arrays of `{ $ref }` links.
   * `references` (and `footnotes` on pictures) are included because docling-core's
   * floating items (pictures and tables) carry all three of captions, references
   * and footnotes; leaving any of them unremapped makes the link point at the
   * wrong item after merging.
   */
  static REF_LIST_FIELDS = ["children", "captions", "references", "footnotes"];
  /**
   * Merge an array of DoclingDocuments into one.
   * The first chunk's metadata (schema_name, version, name, origin) is used as the base.
   * Input chunks are not mutated, except that a single-chunk input is returned as-is.
   *
   * @param chunks - Array of DoclingDocument objects to merge (must have at least 1)
   * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
   *   When provided, picFileOffsets[i] is used for pic_ URI remapping instead of
   *   the pictures array length, aligning URIs with relocated file indices.
   *   A missing entry falls back to the pictures array length.
   * @returns Merged DoclingDocument
   * @throws Error when `chunks` is empty
   */
  merge(chunks, picFileOffsets) {
    if (chunks.length === 0) {
      throw new Error("Cannot merge zero chunks");
    }
    if (chunks.length === 1) {
      return chunks[0];
    }
    const base = structuredClone(chunks[0]);
    for (let i = 1; i < chunks.length; i++) {
      const chunk = chunks[i];
      // Snapshot the sizes before appending: every ref inside this chunk is
      // shifted by the number of items already present in the merged document.
      const offsets = {
        texts: base.texts.length,
        pictures: base.pictures.length,
        tables: base.tables.length,
        groups: base.groups.length
      };
      // `??` rather than a truthiness test: 0 is a valid offset, while a
      // missing entry would otherwise yield "images/pic_NaN.png".
      const picFileOffset = picFileOffsets?.[i] ?? offsets.pictures;
      for (const kind of DoclingDocumentMerger.ITEM_KINDS) {
        for (const item of chunk[kind]) {
          const remapped = this.remapItem(item, offsets);
          if (kind === "pictures") {
            this.remapPictureImageUri(remapped, picFileOffset);
          }
          base[kind].push(remapped);
        }
      }
      for (const child of chunk.body.children) {
        base.body.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      for (const child of chunk.furniture.children) {
        base.furniture.children.push({
          $ref: this.remapRef(child.$ref, offsets)
        });
      }
      Object.assign(base.pages, chunk.pages);
    }
    return base;
  }
  /**
   * Deep-copy one item and shift its self_ref, parent ref and every ref list
   * it carries. Ref lists that are absent on the item are left absent.
   *
   * @param item - Text, picture, table or group item from a chunk
   * @param offsets - Per-kind index offsets to apply
   * @returns Remapped copy of the item
   */
  remapItem(item, offsets) {
    const remapped = structuredClone(item);
    remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
    if (remapped.parent) {
      remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
    }
    for (const field of DoclingDocumentMerger.REF_LIST_FIELDS) {
      if (Array.isArray(remapped[field])) {
        remapped[field] = remapped[field].map((link) => ({
          $ref: this.remapRef(link.$ref, offsets)
        }));
      }
    }
    return remapped;
  }
  /**
   * Remap a $ref string by applying offsets.
   * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
   * Refs like "#/body" or "#/furniture" pass through unchanged.
   */
  remapRef(ref, offsets) {
    const match = REF_PATTERN.exec(ref);
    if (!match) {
      return ref;
    }
    const kind = match[1];
    const index = parseInt(match[2], 10);
    return `#/${kind}/${index + offsets[kind]}`;
  }
  /**
   * Remap image URI in a picture item by applying the pic file offset.
   * Transforms "images/pic_N.png" → "images/pic_{N+offset}.png".
   * Pictures without an image URI, or with a URI of another shape, are untouched.
   */
  remapPictureImageUri(picture, picFileOffset) {
    const image = picture.image;
    if (!image?.uri) return;
    const match = IMAGE_URI_PATTERN.exec(image.uri);
    if (match) {
      const index = parseInt(match[1], 10);
      image.uri = `images/pic_${index + picFileOffset}.png`;
    }
  }
};
|
|
2504
|
+
|
|
2505
|
+
// src/core/chunked-pdf-converter.ts
|
|
2506
|
+
/**
 * Converts a local PDF by splitting it into page ranges ("chunks"), converting
 * each range through the Docling client, and merging the per-chunk results
 * into a single output directory (result.json, images/, pages/).
 */
var ChunkedPDFConverter = class {
  /**
   * @param logger - Logger exposing info/warn/error
   * @param client - Docling client used to submit and download conversion tasks
   * @param config - `{ chunkSize, maxRetries }`
   * @param timeout - Maximum time in ms to wait for a single chunk task
   */
  constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
    this.logger = logger;
    this.client = client;
    this.config = config;
    this.timeout = timeout;
  }
  /**
   * Convert a local PDF in chunks.
   *
   * The temporary `_chunks` working directory is removed on every exit path,
   * including chunk failures and aborts.
   *
   * @param url - file:// URL to the source PDF
   * @param reportId - Unique report identifier for output directory naming
   * @param onComplete - Callback invoked with the final output directory
   * @param cleanupAfterCallback - Whether to clean up the output directory after callback
   * @param options - PDF conversion options (chunked-specific fields are stripped internally)
   * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
   * @param abortSignal - Optional abort signal for cancellation
   * @returns null
   * @throws Error when the page count cannot be detected, a chunk fails after
   *   all retries, or the signal is aborted (error name "AbortError")
   */
  async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
    // Strip the "file://" prefix. NOTE(review): this does not decode
    // percent-escapes; confirm callers build the URL from a raw path.
    const pdfPath = url.slice(7);
    const cwd = process.cwd();
    const outputDir = (0, import_node_path6.join)(cwd, "output", reportId);
    const chunksBaseDir = (0, import_node_path6.join)(cwd, "output", reportId, "_chunks");
    const totalPages = await this.getPageCount(pdfPath);
    if (totalPages === 0) {
      throw new Error(
        "[ChunkedPDFConverter] Failed to detect page count from PDF"
      );
    }
    const chunks = this.calculateChunks(totalPages);
    this.logger.info(
      `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
    );
    try {
      const server = new LocalFileServer();
      const httpUrl = await server.start(pdfPath);
      this.logger.info(
        "[ChunkedPDFConverter] Started local file server:",
        httpUrl
      );
      const chunkDocuments = [];
      try {
        for (let i = 0; i < chunks.length; i++) {
          this.checkAbort(abortSignal);
          const [start, end] = chunks[i];
          const chunkDir = (0, import_node_path6.join)(chunksBaseDir, `_chunk_${i}`);
          (0, import_node_fs7.mkdirSync)(chunkDir, { recursive: true });
          const doc = await this.convertChunk(
            i,
            chunks.length,
            start,
            end,
            httpUrl,
            chunkDir,
            options,
            buildConversionOptions
          );
          chunkDocuments.push(doc);
        }
      } finally {
        this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
        await server.stop();
      }
      this.checkAbort(abortSignal);
      this.logger.info(
        `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
      );
      const merger = new DoclingDocumentMerger();
      const picFileOffsets = this.buildPicFileOffsets(
        chunksBaseDir,
        chunks.length
      );
      const merged = merger.merge(chunkDocuments, picFileOffsets);
      this.logger.info(
        `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
      );
      (0, import_node_fs7.mkdirSync)(outputDir, { recursive: true });
      const imagesDir = (0, import_node_path6.join)(outputDir, "images");
      (0, import_node_fs7.mkdirSync)(imagesDir, { recursive: true });
      this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
      const resultPath = (0, import_node_path6.join)(outputDir, "result.json");
      (0, import_node_fs7.writeFileSync)(resultPath, JSON.stringify(merged));
      try {
        await this.renderPageImages(pdfPath, outputDir);
        this.cleanupOrphanedPicFiles(resultPath, imagesDir);
        this.checkAbort(abortSignal);
        this.logger.info(
          "[ChunkedPDFConverter] Executing completion callback..."
        );
        await onComplete(outputDir);
      } finally {
        if (cleanupAfterCallback) {
          this.logger.info(
            "[ChunkedPDFConverter] Cleaning up output directory:",
            outputDir
          );
          if ((0, import_node_fs7.existsSync)(outputDir)) {
            (0, import_node_fs7.rmSync)(outputDir, { recursive: true, force: true });
          }
        } else {
          this.logger.info(
            "[ChunkedPDFConverter] Output preserved at:",
            outputDir
          );
        }
      }
    } finally {
      // Runs for every outcome. Previously a failed chunk, an abort, or an
      // error while merging left the per-chunk working files on disk.
      if ((0, import_node_fs7.existsSync)(chunksBaseDir)) {
        (0, import_node_fs7.rmSync)(chunksBaseDir, { recursive: true, force: true });
      }
    }
    return null;
  }
  /**
   * Convert a single chunk with retry logic.
   *
   * Makes up to `config.maxRetries + 1` attempts. Each attempt starts from an
   * empty working area so files left by a failed attempt cannot leak into the
   * result (the pic_ files under `output/images` are later counted per chunk).
   *
   * @param chunkIndex - Zero-based index of the chunk
   * @param totalChunks - Total number of chunks (for log labels)
   * @param startPage - First page of the range (1-based, inclusive)
   * @param endPage - Last page of the range (1-based, inclusive)
   * @param httpUrl - HTTP URL under which the source PDF is served
   * @param chunkDir - Working directory dedicated to this chunk
   * @param options - PDF conversion options
   * @param buildConversionOptions - Builds Docling options from `options`
   * @returns Parsed DoclingDocument for the chunk
   * @throws The last attempt's error once all retries are exhausted
   */
  async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions) {
    const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
    const zipPath = (0, import_node_path6.join)(chunkDir, "result.zip");
    const extractDir = (0, import_node_path6.join)(chunkDir, "extracted");
    const chunkOutputDir = (0, import_node_path6.join)(chunkDir, "output");
    for (let attempt = 0; attempt <= this.config.maxRetries; attempt++) {
      try {
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${this.config.maxRetries})...`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: converting...`
          );
        }
        // `force: true` makes these no-ops when the paths do not exist.
        (0, import_node_fs7.rmSync)(zipPath, { force: true });
        (0, import_node_fs7.rmSync)(extractDir, { recursive: true, force: true });
        (0, import_node_fs7.rmSync)(chunkOutputDir, { recursive: true, force: true });
        const startTime = Date.now();
        const conversionOptions = buildConversionOptions({
          ...options,
          page_range: [startPage, endPage]
        });
        const task = await this.client.convertSourceAsync({
          sources: [{ kind: "http", url: httpUrl }],
          options: conversionOptions,
          target: { kind: "zip" }
        });
        await this.trackTaskProgress(task);
        await this.downloadResult(task.taskId, zipPath);
        await ImageExtractor.extractAndSaveDocumentsFromZip(
          this.logger,
          zipPath,
          extractDir,
          chunkOutputDir
        );
        const resultJsonPath = (0, import_node_path6.join)(chunkOutputDir, "result.json");
        const doc = await runJqFileJson(".", resultJsonPath);
        // Only chunkOutputDir is needed from here on; drop the intermediates.
        (0, import_node_fs7.rmSync)(zipPath, { force: true });
        (0, import_node_fs7.rmSync)(extractDir, { recursive: true, force: true });
        const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
        if (attempt > 0) {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
          );
        } else {
          this.logger.info(
            `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
          );
        }
        return doc;
      } catch (error) {
        if (attempt >= this.config.maxRetries) {
          this.logger.error(
            `[ChunkedPDFConverter] ${chunkLabel}: failed after ${this.config.maxRetries} retries`
          );
          throw error;
        }
        this.logger.warn(
          `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${this.config.maxRetries})...`
        );
      }
    }
    throw new Error("Unreachable");
  }
  /**
   * Calculate page ranges for chunks.
   *
   * @param totalPages - Number of pages in the PDF
   * @returns Array of `[start, end]` pairs, 1-based and inclusive
   * @throws Error when `config.chunkSize` is not a positive integer
   */
  calculateChunks(totalPages) {
    const { chunkSize } = this.config;
    if (chunkSize <= 0) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
    }
    // NaN and fractional sizes pass the check above but would yield ranges
    // such as [1, NaN] or [3.5, 5], which are not valid page ranges.
    if (!Number.isInteger(chunkSize)) {
      throw new Error("[ChunkedPDFConverter] chunkSize must be an integer");
    }
    const ranges = [];
    for (let start = 1; start <= totalPages; start += chunkSize) {
      const end = Math.min(start + chunkSize - 1, totalPages);
      ranges.push([start, end]);
    }
    return ranges;
  }
  /**
   * Get total page count using pdfinfo.
   *
   * @returns The page count, or 0 when pdfinfo fails or reports no "Pages:" line
   */
  async getPageCount(pdfPath) {
    const result = await spawnAsync("pdfinfo", [pdfPath]);
    if (result.code !== 0) {
      return 0;
    }
    const match = result.stdout.match(/^Pages:\s+(\d+)/m);
    return match ? parseInt(match[1], 10) : 0;
  }
  /**
   * Poll task progress until completion.
   *
   * @throws Error when the task reports failure or `this.timeout` ms elapse
   */
  async trackTaskProgress(task) {
    const startTime = Date.now();
    while (true) {
      if (Date.now() - startTime > this.timeout) {
        throw new Error("[ChunkedPDFConverter] Chunk task timeout");
      }
      const status = await task.poll();
      if (status.task_status === "success") return;
      if (status.task_status === "failure") {
        let details = "unknown";
        try {
          const result = await task.getResult();
          if (result.errors?.length) {
            details = result.errors.map((e) => e.message).join("; ");
          }
        } catch {
          // Best effort: the failure is reported below either way, just
          // without the extra detail.
        }
        throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
      }
      await new Promise(
        (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
      );
    }
  }
  /**
   * Download ZIP result for a task.
   *
   * Uses the client's stream or in-memory payload when available, otherwise
   * fetches the result endpoint directly.
   *
   * @throws Error when the direct download responds with a non-OK status
   */
  async downloadResult(taskId, zipPath) {
    const zipResult = await this.client.getTaskResultFile(taskId);
    if (zipResult.fileStream) {
      const writeStream = (0, import_node_fs7.createWriteStream)(zipPath);
      await (0, import_promises5.pipeline)(zipResult.fileStream, writeStream);
      return;
    }
    if (zipResult.data) {
      await (0, import_promises4.writeFile)(zipPath, zipResult.data);
      return;
    }
    const baseUrl = this.client.getConfig().baseUrl;
    const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
      headers: { Accept: "application/zip" }
    });
    if (!response.ok) {
      throw new Error(
        `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
      );
    }
    const buffer = new Uint8Array(await response.arrayBuffer());
    await (0, import_promises4.writeFile)(zipPath, buffer);
  }
  /**
   * Relocate images from chunk output directories to the final images directory
   * with global indexing. pic_ files and image_ files are numbered independently.
   */
  relocateImages(chunksBaseDir, totalChunks, imagesDir) {
    const picCount = this.relocateFilesByPrefix(chunksBaseDir, totalChunks, imagesDir, "pic_");
    const imageCount = this.relocateFilesByPrefix(chunksBaseDir, totalChunks, imagesDir, "image_");
    this.logger.info(
      `[ChunkedPDFConverter] Relocated ${picCount} pic + ${imageCount} image files to ${imagesDir}`
    );
  }
  /**
   * Copy every `{prefix}{N}.png` from each chunk's images directory into
   * `imagesDir`, renumbering consecutively across chunks in chunk order and,
   * within a chunk, in ascending N.
   *
   * @returns Number of files copied
   */
  relocateFilesByPrefix(chunksBaseDir, totalChunks, imagesDir, prefix) {
    const fileIndex = (file) => parseInt(file.replace(prefix, "").replace(".png", ""), 10);
    let globalIndex = 0;
    for (let i = 0; i < totalChunks; i++) {
      const chunkImagesDir = (0, import_node_path6.join)(
        chunksBaseDir,
        `_chunk_${i}`,
        "output",
        "images"
      );
      if (!(0, import_node_fs7.existsSync)(chunkImagesDir)) continue;
      const files = (0, import_node_fs7.readdirSync)(chunkImagesDir).filter((f) => f.startsWith(prefix) && f.endsWith(".png")).sort((a, b) => fileIndex(a) - fileIndex(b));
      for (const file of files) {
        const src = (0, import_node_path6.join)(chunkImagesDir, file);
        const dest = (0, import_node_path6.join)(imagesDir, `${prefix}${globalIndex}.png`);
        (0, import_node_fs7.copyFileSync)(src, dest);
        globalIndex++;
      }
    }
    return globalIndex;
  }
  /** Render page images from PDF using ImageMagick and update result.json */
  async renderPageImages(pdfPath, outputDir) {
    this.logger.info(
      "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
    );
    const renderer = new PageRenderer(this.logger);
    const renderResult = await renderer.renderPages(pdfPath, outputDir);
    const resultPath = (0, import_node_path6.join)(outputDir, "result.json");
    const tmpPath = resultPath + ".tmp";
    // Point every page entry whose zero-based index has a rendered file at
    // that file; page_no is 1-based while the files are page_0.png, page_1.png, ...
    const jqProgram = `
      .pages |= with_entries(
        if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
          .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
          .value.image.mimetype = "image/png" |
          .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
        else . end
      )
    `;
    // Write to a sibling file first, then rename over result.json.
    await runJqFileToFile(jqProgram, resultPath, tmpPath);
    await (0, import_promises4.rename)(tmpPath, resultPath);
    this.logger.info(
      `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
    );
  }
  /**
   * Remove pic_ files from images directory that are not referenced in result.json.
   * Chunked Docling conversion embeds page images as base64 in JSON, which get
   * extracted as pic_ files. After renderPageImages replaces page URIs with
   * pages/page_N.png, these pic_ files become orphaned.
   */
  cleanupOrphanedPicFiles(resultPath, imagesDir) {
    const content = (0, import_node_fs7.readFileSync)(resultPath, "utf-8");
    const referencedPics = /* @__PURE__ */ new Set();
    for (const match of content.matchAll(/images\/pic_\d+\.png/g)) {
      referencedPics.add(match[0].replace("images/", ""));
    }
    const picFiles = (0, import_node_fs7.readdirSync)(imagesDir).filter(
      (f) => f.startsWith("pic_") && f.endsWith(".png")
    );
    let removedCount = 0;
    for (const file of picFiles) {
      if (!referencedPics.has(file)) {
        (0, import_node_fs7.rmSync)((0, import_node_path6.join)(imagesDir, file), { force: true });
        removedCount++;
      }
    }
    if (removedCount > 0) {
      this.logger.info(
        `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
      );
    }
  }
  /**
   * Build cumulative pic_ file offsets per chunk for correct URI remapping.
   * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
   */
  buildPicFileOffsets(chunksBaseDir, totalChunks) {
    const offsets = [];
    let cumulative = 0;
    for (let i = 0; i < totalChunks; i++) {
      offsets.push(cumulative);
      const dir = (0, import_node_path6.join)(chunksBaseDir, `_chunk_${i}`, "output", "images");
      const count = (0, import_node_fs7.existsSync)(dir) ? (0, import_node_fs7.readdirSync)(dir).filter(
        (f) => f.startsWith("pic_") && f.endsWith(".png")
      ).length : 0;
      cumulative += count;
    }
    return offsets;
  }
  /** Check if abort has been signalled and throw an "AbortError" if so */
  checkAbort(signal) {
    if (signal?.aborted) {
      const error = new Error("Chunked PDF conversion was aborted");
      error.name = "AbortError";
      throw error;
    }
  }
};
|
|
2891
|
+
|
|
2892
|
+
// src/core/image-pdf-converter.ts
|
|
2893
|
+
var import_node_fs8 = require("fs");
|
|
2894
|
+
var import_node_os = require("os");
|
|
2895
|
+
var import_node_path7 = require("path");
|
|
2348
2896
|
var ImagePdfConverter = class {
|
|
2349
2897
|
constructor(logger) {
|
|
2350
2898
|
this.logger = logger;
|
|
@@ -2360,8 +2908,8 @@ var ImagePdfConverter = class {
|
|
|
2360
2908
|
async convert(pdfUrl, reportId) {
|
|
2361
2909
|
const timestamp = Date.now();
|
|
2362
2910
|
const tempDir = (0, import_node_os.tmpdir)();
|
|
2363
|
-
const inputPath = (0,
|
|
2364
|
-
const outputPath = (0,
|
|
2911
|
+
const inputPath = (0, import_node_path7.join)(tempDir, `${reportId}-${timestamp}-input.pdf`);
|
|
2912
|
+
const outputPath = (0, import_node_path7.join)(tempDir, `${reportId}-${timestamp}-image.pdf`);
|
|
2365
2913
|
try {
|
|
2366
2914
|
this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
|
|
2367
2915
|
await this.downloadPdf(pdfUrl, inputPath);
|
|
@@ -2370,8 +2918,8 @@ var ImagePdfConverter = class {
|
|
|
2370
2918
|
this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
|
|
2371
2919
|
return outputPath;
|
|
2372
2920
|
} finally {
|
|
2373
|
-
if ((0,
|
|
2374
|
-
(0,
|
|
2921
|
+
if ((0, import_node_fs8.existsSync)(inputPath)) {
|
|
2922
|
+
(0, import_node_fs8.rmSync)(inputPath, { force: true });
|
|
2375
2923
|
}
|
|
2376
2924
|
}
|
|
2377
2925
|
}
|
|
@@ -2418,12 +2966,12 @@ var ImagePdfConverter = class {
|
|
|
2418
2966
|
* Cleanup the temporary image PDF file
|
|
2419
2967
|
*/
|
|
2420
2968
|
cleanup(imagePdfPath) {
|
|
2421
|
-
if ((0,
|
|
2969
|
+
if ((0, import_node_fs8.existsSync)(imagePdfPath)) {
|
|
2422
2970
|
this.logger.info(
|
|
2423
2971
|
"[ImagePdfConverter] Cleaning up temp file:",
|
|
2424
2972
|
imagePdfPath
|
|
2425
2973
|
);
|
|
2426
|
-
(0,
|
|
2974
|
+
(0, import_node_fs8.rmSync)(imagePdfPath, { force: true });
|
|
2427
2975
|
}
|
|
2428
2976
|
}
|
|
2429
2977
|
};
|
|
@@ -2438,6 +2986,26 @@ var PDFConverter = class {
|
|
|
2438
2986
|
}
|
|
2439
2987
|
async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
|
|
2440
2988
|
this.logger.info("[PDFConverter] Converting:", url);
|
|
2989
|
+
if (options.chunkedConversion && url.startsWith("file://")) {
|
|
2990
|
+
const chunked = new ChunkedPDFConverter(
|
|
2991
|
+
this.logger,
|
|
2992
|
+
this.client,
|
|
2993
|
+
{
|
|
2994
|
+
chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
|
|
2995
|
+
maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
|
|
2996
|
+
},
|
|
2997
|
+
this.timeout
|
|
2998
|
+
);
|
|
2999
|
+
return chunked.convertChunked(
|
|
3000
|
+
url,
|
|
3001
|
+
reportId,
|
|
3002
|
+
onComplete,
|
|
3003
|
+
cleanupAfterCallback,
|
|
3004
|
+
options,
|
|
3005
|
+
(opts) => this.buildConversionOptions(opts),
|
|
3006
|
+
abortSignal
|
|
3007
|
+
);
|
|
3008
|
+
}
|
|
2441
3009
|
if (options.forceImagePdf) {
|
|
2442
3010
|
return this.convertViaImagePdf(
|
|
2443
3011
|
url,
|
|
@@ -2542,7 +3110,7 @@ var PDFConverter = class {
|
|
|
2542
3110
|
const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
|
|
2543
3111
|
return { method, reason, sampledPages: 0, totalPages: 0 };
|
|
2544
3112
|
}
|
|
2545
|
-
const samplingDir = (0,
|
|
3113
|
+
const samplingDir = (0, import_node_path8.join)(process.cwd(), "output", reportId, "_sampling");
|
|
2546
3114
|
const sampler = new OcrStrategySampler(
|
|
2547
3115
|
this.logger,
|
|
2548
3116
|
new PageRenderer(this.logger),
|
|
@@ -2567,8 +3135,8 @@ var PDFConverter = class {
|
|
|
2567
3135
|
}
|
|
2568
3136
|
return strategy;
|
|
2569
3137
|
} finally {
|
|
2570
|
-
if ((0,
|
|
2571
|
-
(0,
|
|
3138
|
+
if ((0, import_node_fs9.existsSync)(samplingDir)) {
|
|
3139
|
+
(0, import_node_fs9.rmSync)(samplingDir, { recursive: true, force: true });
|
|
2572
3140
|
}
|
|
2573
3141
|
}
|
|
2574
3142
|
}
|
|
@@ -2589,7 +3157,7 @@ var PDFConverter = class {
|
|
|
2589
3157
|
const wrappedCallback = async (outputDir) => {
|
|
2590
3158
|
let pageTexts;
|
|
2591
3159
|
try {
|
|
2592
|
-
const resultPath2 = (0,
|
|
3160
|
+
const resultPath2 = (0, import_node_path8.join)(outputDir, "result.json");
|
|
2593
3161
|
const totalPages = await runJqFileJson(
|
|
2594
3162
|
".pages | length",
|
|
2595
3163
|
resultPath2
|
|
@@ -2601,9 +3169,9 @@ var PDFConverter = class {
|
|
|
2601
3169
|
"[PDFConverter] pdftotext extraction failed, proceeding without text reference"
|
|
2602
3170
|
);
|
|
2603
3171
|
}
|
|
2604
|
-
const resultPath = (0,
|
|
2605
|
-
const ocrOriginPath = (0,
|
|
2606
|
-
(0,
|
|
3172
|
+
const resultPath = (0, import_node_path8.join)(outputDir, "result.json");
|
|
3173
|
+
const ocrOriginPath = (0, import_node_path8.join)(outputDir, "result_ocr_origin.json");
|
|
3174
|
+
(0, import_node_fs9.copyFileSync)(resultPath, ocrOriginPath);
|
|
2607
3175
|
const corrector = new VlmTextCorrector(this.logger);
|
|
2608
3176
|
await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
|
|
2609
3177
|
concurrency: options.vlmConcurrency,
|
|
@@ -2745,9 +3313,9 @@ var PDFConverter = class {
|
|
|
2745
3313
|
}
|
|
2746
3314
|
}
|
|
2747
3315
|
const cwd = process.cwd();
|
|
2748
|
-
const zipPath = (0,
|
|
2749
|
-
const extractDir = (0,
|
|
2750
|
-
const outputDir = (0,
|
|
3316
|
+
const zipPath = (0, import_node_path8.join)(cwd, "result.zip");
|
|
3317
|
+
const extractDir = (0, import_node_path8.join)(cwd, "result_extracted");
|
|
3318
|
+
const outputDir = (0, import_node_path8.join)(cwd, "output", reportId);
|
|
2751
3319
|
try {
|
|
2752
3320
|
await this.processConvertedFiles(zipPath, extractDir, outputDir);
|
|
2753
3321
|
await this.renderPageImages(url, outputDir);
|
|
@@ -2764,19 +3332,19 @@ var PDFConverter = class {
|
|
|
2764
3332
|
this.logger.info("[PDFConverter] Total time:", duration, "ms");
|
|
2765
3333
|
} finally {
|
|
2766
3334
|
this.logger.info("[PDFConverter] Cleaning up temporary files...");
|
|
2767
|
-
if ((0,
|
|
2768
|
-
(0,
|
|
3335
|
+
if ((0, import_node_fs9.existsSync)(zipPath)) {
|
|
3336
|
+
(0, import_node_fs9.rmSync)(zipPath, { force: true });
|
|
2769
3337
|
}
|
|
2770
|
-
if ((0,
|
|
2771
|
-
(0,
|
|
3338
|
+
if ((0, import_node_fs9.existsSync)(extractDir)) {
|
|
3339
|
+
(0, import_node_fs9.rmSync)(extractDir, { recursive: true, force: true });
|
|
2772
3340
|
}
|
|
2773
3341
|
if (cleanupAfterCallback) {
|
|
2774
3342
|
this.logger.info(
|
|
2775
3343
|
"[PDFConverter] Cleaning up output directory:",
|
|
2776
3344
|
outputDir
|
|
2777
3345
|
);
|
|
2778
|
-
if ((0,
|
|
2779
|
-
(0,
|
|
3346
|
+
if ((0, import_node_fs9.existsSync)(outputDir)) {
|
|
3347
|
+
(0, import_node_fs9.rmSync)(outputDir, { recursive: true, force: true });
|
|
2780
3348
|
}
|
|
2781
3349
|
} else {
|
|
2782
3350
|
this.logger.info("[PDFConverter] Output preserved at:", outputDir);
|
|
@@ -2794,7 +3362,10 @@ var PDFConverter = class {
|
|
|
2794
3362
|
"skipSampling",
|
|
2795
3363
|
"forcedMethod",
|
|
2796
3364
|
"aggregator",
|
|
2797
|
-
"onTokenUsage"
|
|
3365
|
+
"onTokenUsage",
|
|
3366
|
+
"chunkedConversion",
|
|
3367
|
+
"chunkSize",
|
|
3368
|
+
"chunkMaxRetries"
|
|
2798
3369
|
]),
|
|
2799
3370
|
to_formats: ["json", "html"],
|
|
2800
3371
|
image_export_mode: "embedded",
|
|
@@ -2922,15 +3493,15 @@ var PDFConverter = class {
|
|
|
2922
3493
|
"\n[PDFConverter] Task completed, downloading ZIP file..."
|
|
2923
3494
|
);
|
|
2924
3495
|
const zipResult = await this.client.getTaskResultFile(taskId);
|
|
2925
|
-
const zipPath = (0,
|
|
3496
|
+
const zipPath = (0, import_node_path8.join)(process.cwd(), "result.zip");
|
|
2926
3497
|
this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
|
|
2927
3498
|
if (zipResult.fileStream) {
|
|
2928
|
-
const writeStream = (0,
|
|
2929
|
-
await (0,
|
|
3499
|
+
const writeStream = (0, import_node_fs9.createWriteStream)(zipPath);
|
|
3500
|
+
await (0, import_promises7.pipeline)(zipResult.fileStream, writeStream);
|
|
2930
3501
|
return;
|
|
2931
3502
|
}
|
|
2932
3503
|
if (zipResult.data) {
|
|
2933
|
-
await (0,
|
|
3504
|
+
await (0, import_promises6.writeFile)(zipPath, zipResult.data);
|
|
2934
3505
|
return;
|
|
2935
3506
|
}
|
|
2936
3507
|
this.logger.warn(
|
|
@@ -2946,7 +3517,7 @@ var PDFConverter = class {
|
|
|
2946
3517
|
);
|
|
2947
3518
|
}
|
|
2948
3519
|
const buffer = new Uint8Array(await response.arrayBuffer());
|
|
2949
|
-
await (0,
|
|
3520
|
+
await (0, import_promises6.writeFile)(zipPath, buffer);
|
|
2950
3521
|
}
|
|
2951
3522
|
async processConvertedFiles(zipPath, extractDir, outputDir) {
|
|
2952
3523
|
await ImageExtractor.extractAndSaveDocumentsFromZip(
|
|
@@ -2975,7 +3546,7 @@ var PDFConverter = class {
|
|
|
2975
3546
|
);
|
|
2976
3547
|
const renderer = new PageRenderer(this.logger);
|
|
2977
3548
|
const renderResult = await renderer.renderPages(pdfPath, outputDir);
|
|
2978
|
-
const resultPath = (0,
|
|
3549
|
+
const resultPath = (0, import_node_path8.join)(outputDir, "result.json");
|
|
2979
3550
|
const tmpPath = resultPath + ".tmp";
|
|
2980
3551
|
const jqProgram = `
|
|
2981
3552
|
.pages |= with_entries(
|
|
@@ -2987,7 +3558,7 @@ var PDFConverter = class {
|
|
|
2987
3558
|
)
|
|
2988
3559
|
`;
|
|
2989
3560
|
await runJqFileToFile(jqProgram, resultPath, tmpPath);
|
|
2990
|
-
await (0,
|
|
3561
|
+
await (0, import_promises6.rename)(tmpPath, resultPath);
|
|
2991
3562
|
this.logger.info(
|
|
2992
3563
|
`[PDFConverter] Rendered ${renderResult.pageCount} page images`
|
|
2993
3564
|
);
|
|
@@ -3022,7 +3593,7 @@ var PDFParser = class {
|
|
|
3022
3593
|
this.baseUrl = void 0;
|
|
3023
3594
|
}
|
|
3024
3595
|
this.timeout = timeout;
|
|
3025
|
-
this.venvPath = venvPath || (0,
|
|
3596
|
+
this.venvPath = venvPath || (0, import_node_path9.join)(process.cwd(), ".venv");
|
|
3026
3597
|
this.killExistingProcess = killExistingProcess;
|
|
3027
3598
|
this.enableImagePdfFallback = enableImagePdfFallback;
|
|
3028
3599
|
}
|