@heripo/pdf-parser 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2,7 +2,7 @@
2
2
  import { Docling } from "docling-sdk";
3
3
  import { execSync } from "child_process";
4
4
  import { platform } from "os";
5
- import { join as join7 } from "path";
5
+ import { join as join8 } from "path";
6
6
 
7
7
  // src/config/constants.ts
8
8
  var PDF_PARSER = {
@@ -49,6 +49,12 @@ var PAGE_RENDERING = {
49
49
  /** Low-resolution DPI for OCR strategy sampling */
50
50
  SAMPLE_DPI: 150
51
51
  };
52
+ var CHUNKED_CONVERSION = {
53
+ /** Number of pages per chunk */
54
+ DEFAULT_CHUNK_SIZE: 10,
55
+ /** Maximum retry attempts per failed chunk */
56
+ DEFAULT_MAX_RETRIES: 2
57
+ };
52
58
  var IMAGE_PDF_CONVERTER = {
53
59
  /**
54
60
  * ImageMagick density option (DPI) for PDF to image conversion
@@ -843,10 +849,10 @@ var DoclingEnvironment = class _DoclingEnvironment {
843
849
 
844
850
  // src/core/pdf-converter.ts
845
851
  import { omit } from "es-toolkit";
846
- import { copyFileSync, createWriteStream as createWriteStream3, existsSync as existsSync4, rmSync as rmSync3 } from "fs";
847
- import { rename as rename2, writeFile } from "fs/promises";
848
- import { join as join6 } from "path";
849
- import { pipeline as pipeline3 } from "stream/promises";
852
+ import { copyFileSync as copyFileSync2, createWriteStream as createWriteStream4, existsSync as existsSync5, rmSync as rmSync4 } from "fs";
853
+ import { rename as rename3, writeFile as writeFile2 } from "fs/promises";
854
+ import { join as join7 } from "path";
855
+ import { pipeline as pipeline4 } from "stream/promises";
850
856
 
851
857
  // src/errors/image-pdf-fallback-error.ts
852
858
  var ImagePdfFallbackError = class extends Error {
@@ -1301,14 +1307,18 @@ var ImageExtractor = class _ImageExtractor {
1301
1307
  // src/processors/page-renderer.ts
1302
1308
  import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
1303
1309
  import { join as join3 } from "path";
1304
- var PROGRESS_POLL_INTERVAL_MS = 2e3;
1310
+ var PROGRESS_LOG_PERCENT_STEP = 10;
1305
1311
  var PageRenderer = class {
1306
1312
  constructor(logger) {
1307
1313
  this.logger = logger;
1308
1314
  }
1315
+ lastLoggedPercent = 0;
1309
1316
  /**
1310
1317
  * Render all pages of a PDF to individual PNG files.
1311
1318
  *
1319
+ * Uses per-page rendering (`magick 'input.pdf[N]'`) when page count is known,
1320
+ * limiting peak memory to ~15MB/page instead of loading all pages at once.
1321
+ *
1312
1322
  * @param pdfPath - Absolute path to the source PDF file
1313
1323
  * @param outputDir - Directory where pages/ subdirectory will be created
1314
1324
  * @param options - Rendering options
@@ -1325,50 +1335,54 @@ var PageRenderer = class {
1325
1335
  this.logger.info(
1326
1336
  `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
1327
1337
  );
1338
+ this.lastLoggedPercent = 0;
1339
+ for (let i = 0; i < totalPages; i++) {
1340
+ const result = await spawnAsync(
1341
+ "magick",
1342
+ [
1343
+ "-density",
1344
+ dpi.toString(),
1345
+ `${pdfPath}[${i}]`,
1346
+ "-background",
1347
+ "white",
1348
+ "-alpha",
1349
+ "remove",
1350
+ "-alpha",
1351
+ "off",
1352
+ join3(pagesDir, `page_${i}.png`)
1353
+ ],
1354
+ { captureStdout: false }
1355
+ );
1356
+ if (result.code !== 0) {
1357
+ throw new Error(
1358
+ `[PageRenderer] Failed to render page ${i + 1}/${totalPages}: ${result.stderr || "Unknown error"}`
1359
+ );
1360
+ }
1361
+ this.logProgress(i + 1, totalPages);
1362
+ }
1328
1363
  } else {
1329
1364
  this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1330
- }
1331
- const outputPattern = join3(pagesDir, "page_%d.png");
1332
- let progressInterval = null;
1333
- if (totalPages > 0) {
1334
- let lastLoggedCount = 0;
1335
- progressInterval = setInterval(() => {
1336
- try {
1337
- const rendered = readdirSync2(pagesDir).filter(
1338
- (f) => f.startsWith("page_") && f.endsWith(".png")
1339
- ).length;
1340
- if (rendered > 0 && rendered !== lastLoggedCount) {
1341
- lastLoggedCount = rendered;
1342
- this.logger.info(
1343
- `[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
1344
- );
1345
- }
1346
- } catch {
1347
- }
1348
- }, PROGRESS_POLL_INTERVAL_MS);
1349
- }
1350
- try {
1351
- const result = await spawnAsync("magick", [
1352
- "-density",
1353
- dpi.toString(),
1354
- pdfPath,
1355
- "-background",
1356
- "white",
1357
- "-alpha",
1358
- "remove",
1359
- "-alpha",
1360
- "off",
1361
- outputPattern
1362
- ]);
1365
+ const result = await spawnAsync(
1366
+ "magick",
1367
+ [
1368
+ "-density",
1369
+ dpi.toString(),
1370
+ pdfPath,
1371
+ "-background",
1372
+ "white",
1373
+ "-alpha",
1374
+ "remove",
1375
+ "-alpha",
1376
+ "off",
1377
+ join3(pagesDir, "page_%d.png")
1378
+ ],
1379
+ { captureStdout: false }
1380
+ );
1363
1381
  if (result.code !== 0) {
1364
1382
  throw new Error(
1365
1383
  `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1366
1384
  );
1367
1385
  }
1368
- } finally {
1369
- if (progressInterval) {
1370
- clearInterval(progressInterval);
1371
- }
1372
1386
  }
1373
1387
  const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
1374
1388
  const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
@@ -1384,6 +1398,18 @@ var PageRenderer = class {
1384
1398
  pageFiles
1385
1399
  };
1386
1400
  }
1401
+ /**
1402
+ * Log rendering progress at appropriate intervals (every 10%).
1403
+ */
1404
+ logProgress(current, total) {
1405
+ const percent = Math.floor(current / total * 100);
1406
+ if (percent >= this.lastLoggedPercent + PROGRESS_LOG_PERCENT_STEP || current === total) {
1407
+ this.lastLoggedPercent = percent;
1408
+ this.logger.info(
1409
+ `[PageRenderer] Rendering pages: ${current}/${total} (${percent}%)`
1410
+ );
1411
+ }
1412
+ }
1387
1413
  /**
1388
1414
  * Get total page count using pdfinfo.
1389
1415
  * Returns 0 on failure (progress logging will be skipped).
@@ -2317,10 +2343,541 @@ var LocalFileServer = class {
2317
2343
  }
2318
2344
  };
2319
2345
 
2346
+ // src/core/chunked-pdf-converter.ts
2347
+ import {
2348
+ copyFileSync,
2349
+ createWriteStream as createWriteStream3,
2350
+ existsSync as existsSync3,
2351
+ mkdirSync as mkdirSync3,
2352
+ readFileSync as readFileSync3,
2353
+ readdirSync as readdirSync3,
2354
+ rmSync as rmSync2,
2355
+ writeFileSync as writeFileSync3
2356
+ } from "fs";
2357
+ import { rename as rename2, writeFile } from "fs/promises";
2358
+ import { join as join5 } from "path";
2359
+ import { pipeline as pipeline3 } from "stream/promises";
2360
+
2361
+ // src/processors/docling-document-merger.ts
2362
+ var REF_PATTERN = /^#\/(texts|pictures|tables|groups)\/(\d+)$/;
2363
+ var IMAGE_URI_PATTERN = /^images\/pic_(\d+)\.png$/;
2364
+ var DoclingDocumentMerger = class {
2365
+ /**
2366
+ * Merge an array of DoclingDocuments into one.
2367
+ * The first chunk's metadata (schema_name, version, name, origin) is used as the base.
2368
+ *
2369
+ * @param chunks - Array of DoclingDocument objects to merge (must have at least 1)
2370
+ * @param picFileOffsets - Optional cumulative pic_ file counts per chunk.
2371
+ * When provided, picFileOffsets[i] is used for pic_ URI remapping instead of
2372
+ * the pictures array length, aligning URIs with relocated file indices.
2373
+ * @returns Merged DoclingDocument
2374
+ */
2375
+ merge(chunks, picFileOffsets) {
2376
+ if (chunks.length === 0) {
2377
+ throw new Error("Cannot merge zero chunks");
2378
+ }
2379
+ if (chunks.length === 1) {
2380
+ return chunks[0];
2381
+ }
2382
+ const base = structuredClone(chunks[0]);
2383
+ for (let i = 1; i < chunks.length; i++) {
2384
+ const chunk = chunks[i];
2385
+ const offsets = {
2386
+ texts: base.texts.length,
2387
+ pictures: base.pictures.length,
2388
+ tables: base.tables.length,
2389
+ groups: base.groups.length
2390
+ };
2391
+ const picFileOffset = picFileOffsets ? picFileOffsets[i] : offsets.pictures;
2392
+ for (const text of chunk.texts) {
2393
+ const remapped = structuredClone(text);
2394
+ remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
2395
+ if (remapped.parent) {
2396
+ remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
2397
+ }
2398
+ remapped.children = remapped.children.map((c) => ({
2399
+ $ref: this.remapRef(c.$ref, offsets)
2400
+ }));
2401
+ base.texts.push(remapped);
2402
+ }
2403
+ for (const picture of chunk.pictures) {
2404
+ const remapped = structuredClone(picture);
2405
+ remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
2406
+ if (remapped.parent) {
2407
+ remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
2408
+ }
2409
+ remapped.children = remapped.children.map((c) => ({
2410
+ $ref: this.remapRef(c.$ref, offsets)
2411
+ }));
2412
+ remapped.captions = remapped.captions.map((c) => ({
2413
+ $ref: this.remapRef(c.$ref, offsets)
2414
+ }));
2415
+ this.remapPictureImageUri(remapped, picFileOffset);
2416
+ base.pictures.push(remapped);
2417
+ }
2418
+ for (const table of chunk.tables) {
2419
+ const remapped = structuredClone(table);
2420
+ remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
2421
+ if (remapped.parent) {
2422
+ remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
2423
+ }
2424
+ remapped.children = remapped.children.map((c) => ({
2425
+ $ref: this.remapRef(c.$ref, offsets)
2426
+ }));
2427
+ remapped.captions = remapped.captions.map((c) => ({
2428
+ $ref: this.remapRef(c.$ref, offsets)
2429
+ }));
2430
+ remapped.footnotes = remapped.footnotes.map((f) => ({
2431
+ $ref: this.remapRef(f.$ref, offsets)
2432
+ }));
2433
+ base.tables.push(remapped);
2434
+ }
2435
+ for (const group of chunk.groups) {
2436
+ const remapped = structuredClone(group);
2437
+ remapped.self_ref = this.remapRef(remapped.self_ref, offsets);
2438
+ if (remapped.parent) {
2439
+ remapped.parent.$ref = this.remapRef(remapped.parent.$ref, offsets);
2440
+ }
2441
+ remapped.children = remapped.children.map((c) => ({
2442
+ $ref: this.remapRef(c.$ref, offsets)
2443
+ }));
2444
+ base.groups.push(remapped);
2445
+ }
2446
+ for (const child of chunk.body.children) {
2447
+ base.body.children.push({
2448
+ $ref: this.remapRef(child.$ref, offsets)
2449
+ });
2450
+ }
2451
+ for (const child of chunk.furniture.children) {
2452
+ base.furniture.children.push({
2453
+ $ref: this.remapRef(child.$ref, offsets)
2454
+ });
2455
+ }
2456
+ Object.assign(base.pages, chunk.pages);
2457
+ }
2458
+ return base;
2459
+ }
2460
+ /**
2461
+ * Remap a $ref string by applying offsets.
2462
+ * Only refs matching "#/{texts|pictures|tables|groups}/{N}" are remapped.
2463
+ * Refs like "#/body" or "#/furniture" pass through unchanged.
2464
+ */
2465
+ remapRef(ref, offsets) {
2466
+ const match = REF_PATTERN.exec(ref);
2467
+ if (!match) {
2468
+ return ref;
2469
+ }
2470
+ const kind = match[1];
2471
+ const index = parseInt(match[2], 10);
2472
+ return `#/${kind}/${index + offsets[kind]}`;
2473
+ }
2474
+ /**
2475
+ * Remap image URI in a picture item by applying the pic file offset.
2476
+ * Transforms "images/pic_N.png" → "images/pic_{N+offset}.png"
2477
+ */
2478
+ remapPictureImageUri(picture, picFileOffset) {
2479
+ const rec = picture;
2480
+ const image = rec.image;
2481
+ if (!image?.uri) return;
2482
+ const match = IMAGE_URI_PATTERN.exec(image.uri);
2483
+ if (match) {
2484
+ const index = parseInt(match[1], 10);
2485
+ image.uri = `images/pic_${index + picFileOffset}.png`;
2486
+ }
2487
+ }
2488
+ };
2489
+
2490
+ // src/core/chunked-pdf-converter.ts
2491
+ var ChunkedPDFConverter = class {
2492
+ constructor(logger, client, config, timeout = PDF_CONVERTER.DEFAULT_TIMEOUT_MS) {
2493
+ this.logger = logger;
2494
+ this.client = client;
2495
+ this.config = config;
2496
+ this.timeout = timeout;
2497
+ }
2498
+ /**
2499
+ * Convert a local PDF in chunks.
2500
+ *
2501
+ * @param url - file:// URL to the source PDF
2502
+ * @param reportId - Unique report identifier for output directory naming
2503
+ * @param onComplete - Callback invoked with the final output directory
2504
+ * @param cleanupAfterCallback - Whether to clean up the output directory after callback
2505
+ * @param options - PDF conversion options (chunked-specific fields are stripped internally)
2506
+ * @param buildConversionOptions - Function to build Docling ConversionOptions from PDFConvertOptions
2507
+ * @param abortSignal - Optional abort signal for cancellation
2508
+ */
2509
+ async convertChunked(url, reportId, onComplete, cleanupAfterCallback, options, buildConversionOptions, abortSignal) {
2510
+ const pdfPath = url.slice(7);
2511
+ const cwd = process.cwd();
2512
+ const outputDir = join5(cwd, "output", reportId);
2513
+ const chunksBaseDir = join5(cwd, "output", reportId, "_chunks");
2514
+ const totalPages = await this.getPageCount(pdfPath);
2515
+ if (totalPages === 0) {
2516
+ throw new Error(
2517
+ "[ChunkedPDFConverter] Failed to detect page count from PDF"
2518
+ );
2519
+ }
2520
+ const chunks = this.calculateChunks(totalPages);
2521
+ this.logger.info(
2522
+ `[ChunkedPDFConverter] Starting: ${totalPages} pages \u2192 ${chunks.length} chunks of ${this.config.chunkSize}`
2523
+ );
2524
+ const server = new LocalFileServer();
2525
+ const httpUrl = await server.start(pdfPath);
2526
+ this.logger.info(
2527
+ "[ChunkedPDFConverter] Started local file server:",
2528
+ httpUrl
2529
+ );
2530
+ const chunkDocuments = [];
2531
+ try {
2532
+ for (let i = 0; i < chunks.length; i++) {
2533
+ this.checkAbort(abortSignal);
2534
+ const [start, end] = chunks[i];
2535
+ const chunkDir = join5(chunksBaseDir, `_chunk_${i}`);
2536
+ mkdirSync3(chunkDir, { recursive: true });
2537
+ const doc = await this.convertChunk(
2538
+ i,
2539
+ chunks.length,
2540
+ start,
2541
+ end,
2542
+ httpUrl,
2543
+ chunkDir,
2544
+ options,
2545
+ buildConversionOptions
2546
+ );
2547
+ chunkDocuments.push(doc);
2548
+ }
2549
+ } finally {
2550
+ this.logger.info("[ChunkedPDFConverter] Stopping local file server...");
2551
+ await server.stop();
2552
+ }
2553
+ this.checkAbort(abortSignal);
2554
+ this.logger.info(
2555
+ `[ChunkedPDFConverter] All ${chunks.length} chunks completed, merging...`
2556
+ );
2557
+ const merger = new DoclingDocumentMerger();
2558
+ const picFileOffsets = this.buildPicFileOffsets(
2559
+ chunksBaseDir,
2560
+ chunks.length
2561
+ );
2562
+ const merged = merger.merge(chunkDocuments, picFileOffsets);
2563
+ this.logger.info(
2564
+ `[ChunkedPDFConverter] Merged: ${merged.texts.length} texts, ${merged.pictures.length} pictures, ${merged.tables.length} tables, ${Object.keys(merged.pages).length} pages`
2565
+ );
2566
+ mkdirSync3(outputDir, { recursive: true });
2567
+ const imagesDir = join5(outputDir, "images");
2568
+ mkdirSync3(imagesDir, { recursive: true });
2569
+ this.relocateImages(chunksBaseDir, chunks.length, imagesDir);
2570
+ const resultPath = join5(outputDir, "result.json");
2571
+ writeFileSync3(resultPath, JSON.stringify(merged));
2572
+ try {
2573
+ await this.renderPageImages(pdfPath, outputDir);
2574
+ this.cleanupOrphanedPicFiles(resultPath, imagesDir);
2575
+ this.checkAbort(abortSignal);
2576
+ this.logger.info(
2577
+ "[ChunkedPDFConverter] Executing completion callback..."
2578
+ );
2579
+ await onComplete(outputDir);
2580
+ } finally {
2581
+ if (existsSync3(chunksBaseDir)) {
2582
+ rmSync2(chunksBaseDir, { recursive: true, force: true });
2583
+ }
2584
+ if (cleanupAfterCallback) {
2585
+ this.logger.info(
2586
+ "[ChunkedPDFConverter] Cleaning up output directory:",
2587
+ outputDir
2588
+ );
2589
+ if (existsSync3(outputDir)) {
2590
+ rmSync2(outputDir, { recursive: true, force: true });
2591
+ }
2592
+ } else {
2593
+ this.logger.info(
2594
+ "[ChunkedPDFConverter] Output preserved at:",
2595
+ outputDir
2596
+ );
2597
+ }
2598
+ }
2599
+ return null;
2600
+ }
2601
+ /**
2602
+ * Convert a single chunk with retry logic.
2603
+ */
2604
+ async convertChunk(chunkIndex, totalChunks, startPage, endPage, httpUrl, chunkDir, options, buildConversionOptions) {
2605
+ const chunkLabel = `Chunk ${chunkIndex + 1}/${totalChunks} (pages ${startPage}-${endPage})`;
2606
+ for (let attempt = 0; attempt <= this.config.maxRetries; attempt++) {
2607
+ try {
2608
+ if (attempt > 0) {
2609
+ this.logger.info(
2610
+ `[ChunkedPDFConverter] ${chunkLabel}: retrying (${attempt}/${this.config.maxRetries})...`
2611
+ );
2612
+ } else {
2613
+ this.logger.info(
2614
+ `[ChunkedPDFConverter] ${chunkLabel}: converting...`
2615
+ );
2616
+ }
2617
+ const startTime = Date.now();
2618
+ const conversionOptions = buildConversionOptions({
2619
+ ...options,
2620
+ page_range: [startPage, endPage]
2621
+ });
2622
+ const task = await this.client.convertSourceAsync({
2623
+ sources: [{ kind: "http", url: httpUrl }],
2624
+ options: conversionOptions,
2625
+ target: { kind: "zip" }
2626
+ });
2627
+ await this.trackTaskProgress(task);
2628
+ const zipPath = join5(chunkDir, "result.zip");
2629
+ await this.downloadResult(task.taskId, zipPath);
2630
+ const extractDir = join5(chunkDir, "extracted");
2631
+ const chunkOutputDir = join5(chunkDir, "output");
2632
+ await ImageExtractor.extractAndSaveDocumentsFromZip(
2633
+ this.logger,
2634
+ zipPath,
2635
+ extractDir,
2636
+ chunkOutputDir
2637
+ );
2638
+ const resultJsonPath = join5(chunkOutputDir, "result.json");
2639
+ const doc = await runJqFileJson(".", resultJsonPath);
2640
+ if (existsSync3(zipPath)) rmSync2(zipPath, { force: true });
2641
+ if (existsSync3(extractDir)) {
2642
+ rmSync2(extractDir, { recursive: true, force: true });
2643
+ }
2644
+ const elapsed = ((Date.now() - startTime) / 1e3).toFixed(1);
2645
+ if (attempt > 0) {
2646
+ this.logger.info(
2647
+ `[ChunkedPDFConverter] ${chunkLabel}: completed on retry ${attempt} (${elapsed}s)`
2648
+ );
2649
+ } else {
2650
+ this.logger.info(
2651
+ `[ChunkedPDFConverter] ${chunkLabel}: completed (${elapsed}s)`
2652
+ );
2653
+ }
2654
+ return doc;
2655
+ } catch (error) {
2656
+ if (attempt >= this.config.maxRetries) {
2657
+ this.logger.error(
2658
+ `[ChunkedPDFConverter] ${chunkLabel}: failed after ${this.config.maxRetries} retries`
2659
+ );
2660
+ throw error;
2661
+ }
2662
+ this.logger.warn(
2663
+ `[ChunkedPDFConverter] ${chunkLabel}: failed, retrying (${attempt + 1}/${this.config.maxRetries})...`
2664
+ );
2665
+ }
2666
+ }
2667
+ throw new Error("Unreachable");
2668
+ }
2669
+ /** Calculate page ranges for chunks */
2670
+ calculateChunks(totalPages) {
2671
+ if (this.config.chunkSize <= 0) {
2672
+ throw new Error("[ChunkedPDFConverter] chunkSize must be positive");
2673
+ }
2674
+ const ranges = [];
2675
+ for (let start = 1; start <= totalPages; start += this.config.chunkSize) {
2676
+ const end = Math.min(start + this.config.chunkSize - 1, totalPages);
2677
+ ranges.push([start, end]);
2678
+ }
2679
+ return ranges;
2680
+ }
2681
+ /** Get total page count using pdfinfo */
2682
+ async getPageCount(pdfPath) {
2683
+ const result = await spawnAsync("pdfinfo", [pdfPath]);
2684
+ if (result.code !== 0) {
2685
+ return 0;
2686
+ }
2687
+ const match = result.stdout.match(/^Pages:\s+(\d+)/m);
2688
+ return match ? parseInt(match[1], 10) : 0;
2689
+ }
2690
+ /** Poll task progress until completion */
2691
+ async trackTaskProgress(task) {
2692
+ const startTime = Date.now();
2693
+ while (true) {
2694
+ if (Date.now() - startTime > this.timeout) {
2695
+ throw new Error("[ChunkedPDFConverter] Chunk task timeout");
2696
+ }
2697
+ const status = await task.poll();
2698
+ if (status.task_status === "success") return;
2699
+ if (status.task_status === "failure") {
2700
+ let details = "unknown";
2701
+ try {
2702
+ const result = await task.getResult();
2703
+ if (result.errors?.length) {
2704
+ details = result.errors.map((e) => e.message).join("; ");
2705
+ }
2706
+ } catch {
2707
+ }
2708
+ throw new Error(`[ChunkedPDFConverter] Chunk task failed: ${details}`);
2709
+ }
2710
+ await new Promise(
2711
+ (resolve) => setTimeout(resolve, PDF_CONVERTER.POLL_INTERVAL_MS)
2712
+ );
2713
+ }
2714
+ }
2715
+ /** Download ZIP result for a task */
2716
+ async downloadResult(taskId, zipPath) {
2717
+ const zipResult = await this.client.getTaskResultFile(taskId);
2718
+ if (zipResult.fileStream) {
2719
+ const writeStream = createWriteStream3(zipPath);
2720
+ await pipeline3(zipResult.fileStream, writeStream);
2721
+ return;
2722
+ }
2723
+ if (zipResult.data) {
2724
+ await writeFile(zipPath, zipResult.data);
2725
+ return;
2726
+ }
2727
+ const baseUrl = this.client.getConfig().baseUrl;
2728
+ const response = await fetch(`${baseUrl}/v1/result/${taskId}`, {
2729
+ headers: { Accept: "application/zip" }
2730
+ });
2731
+ if (!response.ok) {
2732
+ throw new Error(
2733
+ `Failed to download chunk ZIP: ${response.status} ${response.statusText}`
2734
+ );
2735
+ }
2736
+ const buffer = new Uint8Array(await response.arrayBuffer());
2737
+ await writeFile(zipPath, buffer);
2738
+ }
2739
+ /**
2740
+ * Relocate images from chunk output directories to the final images directory
2741
+ * with global indexing.
2742
+ */
2743
+ relocateImages(chunksBaseDir, totalChunks, imagesDir) {
2744
+ let picGlobalIndex = 0;
2745
+ for (let i = 0; i < totalChunks; i++) {
2746
+ const chunkImagesDir = join5(
2747
+ chunksBaseDir,
2748
+ `_chunk_${i}`,
2749
+ "output",
2750
+ "images"
2751
+ );
2752
+ if (!existsSync3(chunkImagesDir)) continue;
2753
+ const picFiles = readdirSync3(chunkImagesDir).filter((f) => f.startsWith("pic_") && f.endsWith(".png")).sort((a, b) => {
2754
+ const numA = parseInt(a.replace("pic_", "").replace(".png", ""), 10);
2755
+ const numB = parseInt(b.replace("pic_", "").replace(".png", ""), 10);
2756
+ return numA - numB;
2757
+ });
2758
+ for (const file of picFiles) {
2759
+ const src = join5(chunkImagesDir, file);
2760
+ const dest = join5(imagesDir, `pic_${picGlobalIndex}.png`);
2761
+ copyFileSync(src, dest);
2762
+ picGlobalIndex++;
2763
+ }
2764
+ }
2765
+ let imageGlobalIndex = 0;
2766
+ for (let i = 0; i < totalChunks; i++) {
2767
+ const chunkImagesDir = join5(
2768
+ chunksBaseDir,
2769
+ `_chunk_${i}`,
2770
+ "output",
2771
+ "images"
2772
+ );
2773
+ if (!existsSync3(chunkImagesDir)) continue;
2774
+ const imageFiles = readdirSync3(chunkImagesDir).filter((f) => f.startsWith("image_") && f.endsWith(".png")).sort((a, b) => {
2775
+ const numA = parseInt(
2776
+ a.replace("image_", "").replace(".png", ""),
2777
+ 10
2778
+ );
2779
+ const numB = parseInt(
2780
+ b.replace("image_", "").replace(".png", ""),
2781
+ 10
2782
+ );
2783
+ return numA - numB;
2784
+ });
2785
+ for (const file of imageFiles) {
2786
+ const src = join5(chunkImagesDir, file);
2787
+ const dest = join5(imagesDir, `image_${imageGlobalIndex}.png`);
2788
+ copyFileSync(src, dest);
2789
+ imageGlobalIndex++;
2790
+ }
2791
+ }
2792
+ this.logger.info(
2793
+ `[ChunkedPDFConverter] Relocated ${picGlobalIndex} pic + ${imageGlobalIndex} image files to ${imagesDir}`
2794
+ );
2795
+ }
2796
+ /** Render page images from PDF using ImageMagick and update result.json */
2797
+ async renderPageImages(pdfPath, outputDir) {
2798
+ this.logger.info(
2799
+ "[ChunkedPDFConverter] Rendering page images with ImageMagick..."
2800
+ );
2801
+ const renderer = new PageRenderer(this.logger);
2802
+ const renderResult = await renderer.renderPages(pdfPath, outputDir);
2803
+ const resultPath = join5(outputDir, "result.json");
2804
+ const tmpPath = resultPath + ".tmp";
2805
+ const jqProgram = `
2806
+ .pages |= with_entries(
2807
+ if (.value.page_no - 1) >= 0 and (.value.page_no - 1) < ${renderResult.pageCount} then
2808
+ .value.image.uri = "pages/page_\\(.value.page_no - 1).png" |
2809
+ .value.image.mimetype = "image/png" |
2810
+ .value.image.dpi = ${PAGE_RENDERING.DEFAULT_DPI}
2811
+ else . end
2812
+ )
2813
+ `;
2814
+ await runJqFileToFile(jqProgram, resultPath, tmpPath);
2815
+ await rename2(tmpPath, resultPath);
2816
+ this.logger.info(
2817
+ `[ChunkedPDFConverter] Rendered ${renderResult.pageCount} page images`
2818
+ );
2819
+ }
2820
+ /**
2821
+ * Remove pic_ files from images directory that are not referenced in result.json.
2822
+ * Chunked Docling conversion embeds page images as base64 in JSON, which get
2823
+ * extracted as pic_ files. After renderPageImages replaces page URIs with
2824
+ * pages/page_N.png, these pic_ files become orphaned.
2825
+ */
2826
+ cleanupOrphanedPicFiles(resultPath, imagesDir) {
2827
+ const content = readFileSync3(resultPath, "utf-8");
2828
+ const referencedPics = /* @__PURE__ */ new Set();
2829
+ const picPattern = /images\/pic_\d+\.png/g;
2830
+ let match;
2831
+ while ((match = picPattern.exec(content)) !== null) {
2832
+ referencedPics.add(match[0].replace("images/", ""));
2833
+ }
2834
+ const picFiles = readdirSync3(imagesDir).filter(
2835
+ (f) => f.startsWith("pic_") && f.endsWith(".png")
2836
+ );
2837
+ let removedCount = 0;
2838
+ for (const file of picFiles) {
2839
+ if (!referencedPics.has(file)) {
2840
+ rmSync2(join5(imagesDir, file), { force: true });
2841
+ removedCount++;
2842
+ }
2843
+ }
2844
+ if (removedCount > 0) {
2845
+ this.logger.info(
2846
+ `[ChunkedPDFConverter] Cleaned up ${removedCount} orphaned pic_ files (${referencedPics.size} referenced, kept)`
2847
+ );
2848
+ }
2849
+ }
2850
+ /**
2851
+ * Build cumulative pic_ file offsets per chunk for correct URI remapping.
2852
+ * Each offset[i] is the total number of pic_ files in chunks 0..i-1.
2853
+ */
2854
+ buildPicFileOffsets(chunksBaseDir, totalChunks) {
2855
+ const offsets = [];
2856
+ let cumulative = 0;
2857
+ for (let i = 0; i < totalChunks; i++) {
2858
+ offsets.push(cumulative);
2859
+ const dir = join5(chunksBaseDir, `_chunk_${i}`, "output", "images");
2860
+ const count = existsSync3(dir) ? readdirSync3(dir).filter(
2861
+ (f) => f.startsWith("pic_") && f.endsWith(".png")
2862
+ ).length : 0;
2863
+ cumulative += count;
2864
+ }
2865
+ return offsets;
2866
+ }
2867
+ /** Check if abort has been signalled and throw if so */
2868
+ checkAbort(signal) {
2869
+ if (signal?.aborted) {
2870
+ const error = new Error("Chunked PDF conversion was aborted");
2871
+ error.name = "AbortError";
2872
+ throw error;
2873
+ }
2874
+ }
2875
+ };
2876
+
2320
2877
  // src/core/image-pdf-converter.ts
2321
- import { existsSync as existsSync3, rmSync as rmSync2 } from "fs";
2878
+ import { existsSync as existsSync4, rmSync as rmSync3 } from "fs";
2322
2879
  import { tmpdir } from "os";
2323
- import { join as join5 } from "path";
2880
+ import { join as join6 } from "path";
2324
2881
  var ImagePdfConverter = class {
2325
2882
  constructor(logger) {
2326
2883
  this.logger = logger;
@@ -2336,8 +2893,8 @@ var ImagePdfConverter = class {
2336
2893
  async convert(pdfUrl, reportId) {
2337
2894
  const timestamp = Date.now();
2338
2895
  const tempDir = tmpdir();
2339
- const inputPath = join5(tempDir, `${reportId}-${timestamp}-input.pdf`);
2340
- const outputPath = join5(tempDir, `${reportId}-${timestamp}-image.pdf`);
2896
+ const inputPath = join6(tempDir, `${reportId}-${timestamp}-input.pdf`);
2897
+ const outputPath = join6(tempDir, `${reportId}-${timestamp}-image.pdf`);
2341
2898
  try {
2342
2899
  this.logger.info("[ImagePdfConverter] Downloading PDF from URL...");
2343
2900
  await this.downloadPdf(pdfUrl, inputPath);
@@ -2346,8 +2903,8 @@ var ImagePdfConverter = class {
2346
2903
  this.logger.info("[ImagePdfConverter] Image PDF created:", outputPath);
2347
2904
  return outputPath;
2348
2905
  } finally {
2349
- if (existsSync3(inputPath)) {
2350
- rmSync2(inputPath, { force: true });
2906
+ if (existsSync4(inputPath)) {
2907
+ rmSync3(inputPath, { force: true });
2351
2908
  }
2352
2909
  }
2353
2910
  }
@@ -2394,12 +2951,12 @@ var ImagePdfConverter = class {
2394
2951
  * Cleanup the temporary image PDF file
2395
2952
  */
2396
2953
  cleanup(imagePdfPath) {
2397
- if (existsSync3(imagePdfPath)) {
2954
+ if (existsSync4(imagePdfPath)) {
2398
2955
  this.logger.info(
2399
2956
  "[ImagePdfConverter] Cleaning up temp file:",
2400
2957
  imagePdfPath
2401
2958
  );
2402
- rmSync2(imagePdfPath, { force: true });
2959
+ rmSync3(imagePdfPath, { force: true });
2403
2960
  }
2404
2961
  }
2405
2962
  };
@@ -2414,6 +2971,26 @@ var PDFConverter = class {
2414
2971
  }
2415
2972
  async convert(url, reportId, onComplete, cleanupAfterCallback, options, abortSignal) {
2416
2973
  this.logger.info("[PDFConverter] Converting:", url);
2974
+ if (options.chunkedConversion && url.startsWith("file://")) {
2975
+ const chunked = new ChunkedPDFConverter(
2976
+ this.logger,
2977
+ this.client,
2978
+ {
2979
+ chunkSize: options.chunkSize ?? CHUNKED_CONVERSION.DEFAULT_CHUNK_SIZE,
2980
+ maxRetries: options.chunkMaxRetries ?? CHUNKED_CONVERSION.DEFAULT_MAX_RETRIES
2981
+ },
2982
+ this.timeout
2983
+ );
2984
+ return chunked.convertChunked(
2985
+ url,
2986
+ reportId,
2987
+ onComplete,
2988
+ cleanupAfterCallback,
2989
+ options,
2990
+ (opts) => this.buildConversionOptions(opts),
2991
+ abortSignal
2992
+ );
2993
+ }
2417
2994
  if (options.forceImagePdf) {
2418
2995
  return this.convertViaImagePdf(
2419
2996
  url,
@@ -2518,7 +3095,7 @@ var PDFConverter = class {
2518
3095
  const reason = options.forcedMethod ? `Forced: ${options.forcedMethod}` : !pdfPath ? "Non-local URL, sampling skipped" : "Sampling skipped";
2519
3096
  return { method, reason, sampledPages: 0, totalPages: 0 };
2520
3097
  }
2521
- const samplingDir = join6(process.cwd(), "output", reportId, "_sampling");
3098
+ const samplingDir = join7(process.cwd(), "output", reportId, "_sampling");
2522
3099
  const sampler = new OcrStrategySampler(
2523
3100
  this.logger,
2524
3101
  new PageRenderer(this.logger),
@@ -2543,8 +3120,8 @@ var PDFConverter = class {
2543
3120
  }
2544
3121
  return strategy;
2545
3122
  } finally {
2546
- if (existsSync4(samplingDir)) {
2547
- rmSync3(samplingDir, { recursive: true, force: true });
3123
+ if (existsSync5(samplingDir)) {
3124
+ rmSync4(samplingDir, { recursive: true, force: true });
2548
3125
  }
2549
3126
  }
2550
3127
  }
@@ -2565,7 +3142,7 @@ var PDFConverter = class {
2565
3142
  const wrappedCallback = async (outputDir) => {
2566
3143
  let pageTexts;
2567
3144
  try {
2568
- const resultPath2 = join6(outputDir, "result.json");
3145
+ const resultPath2 = join7(outputDir, "result.json");
2569
3146
  const totalPages = await runJqFileJson(
2570
3147
  ".pages | length",
2571
3148
  resultPath2
@@ -2577,9 +3154,9 @@ var PDFConverter = class {
2577
3154
  "[PDFConverter] pdftotext extraction failed, proceeding without text reference"
2578
3155
  );
2579
3156
  }
2580
- const resultPath = join6(outputDir, "result.json");
2581
- const ocrOriginPath = join6(outputDir, "result_ocr_origin.json");
2582
- copyFileSync(resultPath, ocrOriginPath);
3157
+ const resultPath = join7(outputDir, "result.json");
3158
+ const ocrOriginPath = join7(outputDir, "result_ocr_origin.json");
3159
+ copyFileSync2(resultPath, ocrOriginPath);
2583
3160
  const corrector = new VlmTextCorrector(this.logger);
2584
3161
  await corrector.correctAndSave(outputDir, options.vlmProcessorModel, {
2585
3162
  concurrency: options.vlmConcurrency,
@@ -2721,9 +3298,9 @@ var PDFConverter = class {
2721
3298
  }
2722
3299
  }
2723
3300
  const cwd = process.cwd();
2724
- const zipPath = join6(cwd, "result.zip");
2725
- const extractDir = join6(cwd, "result_extracted");
2726
- const outputDir = join6(cwd, "output", reportId);
3301
+ const zipPath = join7(cwd, "result.zip");
3302
+ const extractDir = join7(cwd, "result_extracted");
3303
+ const outputDir = join7(cwd, "output", reportId);
2727
3304
  try {
2728
3305
  await this.processConvertedFiles(zipPath, extractDir, outputDir);
2729
3306
  await this.renderPageImages(url, outputDir);
@@ -2740,19 +3317,19 @@ var PDFConverter = class {
2740
3317
  this.logger.info("[PDFConverter] Total time:", duration, "ms");
2741
3318
  } finally {
2742
3319
  this.logger.info("[PDFConverter] Cleaning up temporary files...");
2743
- if (existsSync4(zipPath)) {
2744
- rmSync3(zipPath, { force: true });
3320
+ if (existsSync5(zipPath)) {
3321
+ rmSync4(zipPath, { force: true });
2745
3322
  }
2746
- if (existsSync4(extractDir)) {
2747
- rmSync3(extractDir, { recursive: true, force: true });
3323
+ if (existsSync5(extractDir)) {
3324
+ rmSync4(extractDir, { recursive: true, force: true });
2748
3325
  }
2749
3326
  if (cleanupAfterCallback) {
2750
3327
  this.logger.info(
2751
3328
  "[PDFConverter] Cleaning up output directory:",
2752
3329
  outputDir
2753
3330
  );
2754
- if (existsSync4(outputDir)) {
2755
- rmSync3(outputDir, { recursive: true, force: true });
3331
+ if (existsSync5(outputDir)) {
3332
+ rmSync4(outputDir, { recursive: true, force: true });
2756
3333
  }
2757
3334
  } else {
2758
3335
  this.logger.info("[PDFConverter] Output preserved at:", outputDir);
@@ -2770,7 +3347,10 @@ var PDFConverter = class {
2770
3347
  "skipSampling",
2771
3348
  "forcedMethod",
2772
3349
  "aggregator",
2773
- "onTokenUsage"
3350
+ "onTokenUsage",
3351
+ "chunkedConversion",
3352
+ "chunkSize",
3353
+ "chunkMaxRetries"
2774
3354
  ]),
2775
3355
  to_formats: ["json", "html"],
2776
3356
  image_export_mode: "embedded",
@@ -2898,15 +3478,15 @@ var PDFConverter = class {
2898
3478
  "\n[PDFConverter] Task completed, downloading ZIP file..."
2899
3479
  );
2900
3480
  const zipResult = await this.client.getTaskResultFile(taskId);
2901
- const zipPath = join6(process.cwd(), "result.zip");
3481
+ const zipPath = join7(process.cwd(), "result.zip");
2902
3482
  this.logger.info("[PDFConverter] Saving ZIP file to:", zipPath);
2903
3483
  if (zipResult.fileStream) {
2904
- const writeStream = createWriteStream3(zipPath);
2905
- await pipeline3(zipResult.fileStream, writeStream);
3484
+ const writeStream = createWriteStream4(zipPath);
3485
+ await pipeline4(zipResult.fileStream, writeStream);
2906
3486
  return;
2907
3487
  }
2908
3488
  if (zipResult.data) {
2909
- await writeFile(zipPath, zipResult.data);
3489
+ await writeFile2(zipPath, zipResult.data);
2910
3490
  return;
2911
3491
  }
2912
3492
  this.logger.warn(
@@ -2922,7 +3502,7 @@ var PDFConverter = class {
2922
3502
  );
2923
3503
  }
2924
3504
  const buffer = new Uint8Array(await response.arrayBuffer());
2925
- await writeFile(zipPath, buffer);
3505
+ await writeFile2(zipPath, buffer);
2926
3506
  }
2927
3507
  async processConvertedFiles(zipPath, extractDir, outputDir) {
2928
3508
  await ImageExtractor.extractAndSaveDocumentsFromZip(
@@ -2951,7 +3531,7 @@ var PDFConverter = class {
2951
3531
  );
2952
3532
  const renderer = new PageRenderer(this.logger);
2953
3533
  const renderResult = await renderer.renderPages(pdfPath, outputDir);
2954
- const resultPath = join6(outputDir, "result.json");
3534
+ const resultPath = join7(outputDir, "result.json");
2955
3535
  const tmpPath = resultPath + ".tmp";
2956
3536
  const jqProgram = `
2957
3537
  .pages |= with_entries(
@@ -2963,7 +3543,7 @@ var PDFConverter = class {
2963
3543
  )
2964
3544
  `;
2965
3545
  await runJqFileToFile(jqProgram, resultPath, tmpPath);
2966
- await rename2(tmpPath, resultPath);
3546
+ await rename3(tmpPath, resultPath);
2967
3547
  this.logger.info(
2968
3548
  `[PDFConverter] Rendered ${renderResult.pageCount} page images`
2969
3549
  );
@@ -2998,7 +3578,7 @@ var PDFParser = class {
2998
3578
  this.baseUrl = void 0;
2999
3579
  }
3000
3580
  this.timeout = timeout;
3001
- this.venvPath = venvPath || join7(process.cwd(), ".venv");
3581
+ this.venvPath = venvPath || join8(process.cwd(), ".venv");
3002
3582
  this.killExistingProcess = killExistingProcess;
3003
3583
  this.enableImagePdfFallback = enableImagePdfFallback;
3004
3584
  }