@heripo/pdf-parser 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1325,14 +1325,18 @@ var ImageExtractor = class _ImageExtractor {
1325
1325
  // src/processors/page-renderer.ts
1326
1326
  var import_node_fs3 = require("fs");
1327
1327
  var import_node_path3 = require("path");
1328
- var PROGRESS_POLL_INTERVAL_MS = 2e3;
1328
+ var PROGRESS_LOG_PERCENT_STEP = 10;
1329
1329
  var PageRenderer = class {
1330
1330
  constructor(logger) {
1331
1331
  this.logger = logger;
1332
1332
  }
1333
+ lastLoggedPercent = 0;
1333
1334
  /**
1334
1335
  * Render all pages of a PDF to individual PNG files.
1335
1336
  *
1337
+ * Uses per-page rendering (`magick 'input.pdf[N]'`) when page count is known,
1338
+ * limiting peak memory to ~15MB/page instead of loading all pages at once.
1339
+ *
1336
1340
  * @param pdfPath - Absolute path to the source PDF file
1337
1341
  * @param outputDir - Directory where pages/ subdirectory will be created
1338
1342
  * @param options - Rendering options
@@ -1349,50 +1353,54 @@ var PageRenderer = class {
1349
1353
  this.logger.info(
1350
1354
  `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
1351
1355
  );
1356
+ this.lastLoggedPercent = 0;
1357
+ for (let i = 0; i < totalPages; i++) {
1358
+ const result = await spawnAsync(
1359
+ "magick",
1360
+ [
1361
+ "-density",
1362
+ dpi.toString(),
1363
+ `${pdfPath}[${i}]`,
1364
+ "-background",
1365
+ "white",
1366
+ "-alpha",
1367
+ "remove",
1368
+ "-alpha",
1369
+ "off",
1370
+ (0, import_node_path3.join)(pagesDir, `page_${i}.png`)
1371
+ ],
1372
+ { captureStdout: false }
1373
+ );
1374
+ if (result.code !== 0) {
1375
+ throw new Error(
1376
+ `[PageRenderer] Failed to render page ${i + 1}/${totalPages}: ${result.stderr || "Unknown error"}`
1377
+ );
1378
+ }
1379
+ this.logProgress(i + 1, totalPages);
1380
+ }
1352
1381
  } else {
1353
1382
  this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1354
- }
1355
- const outputPattern = (0, import_node_path3.join)(pagesDir, "page_%d.png");
1356
- let progressInterval = null;
1357
- if (totalPages > 0) {
1358
- let lastLoggedCount = 0;
1359
- progressInterval = setInterval(() => {
1360
- try {
1361
- const rendered = (0, import_node_fs3.readdirSync)(pagesDir).filter(
1362
- (f) => f.startsWith("page_") && f.endsWith(".png")
1363
- ).length;
1364
- if (rendered > 0 && rendered !== lastLoggedCount) {
1365
- lastLoggedCount = rendered;
1366
- this.logger.info(
1367
- `[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
1368
- );
1369
- }
1370
- } catch {
1371
- }
1372
- }, PROGRESS_POLL_INTERVAL_MS);
1373
- }
1374
- try {
1375
- const result = await spawnAsync("magick", [
1376
- "-density",
1377
- dpi.toString(),
1378
- pdfPath,
1379
- "-background",
1380
- "white",
1381
- "-alpha",
1382
- "remove",
1383
- "-alpha",
1384
- "off",
1385
- outputPattern
1386
- ]);
1383
+ const result = await spawnAsync(
1384
+ "magick",
1385
+ [
1386
+ "-density",
1387
+ dpi.toString(),
1388
+ pdfPath,
1389
+ "-background",
1390
+ "white",
1391
+ "-alpha",
1392
+ "remove",
1393
+ "-alpha",
1394
+ "off",
1395
+ (0, import_node_path3.join)(pagesDir, "page_%d.png")
1396
+ ],
1397
+ { captureStdout: false }
1398
+ );
1387
1399
  if (result.code !== 0) {
1388
1400
  throw new Error(
1389
1401
  `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1390
1402
  );
1391
1403
  }
1392
- } finally {
1393
- if (progressInterval) {
1394
- clearInterval(progressInterval);
1395
- }
1396
1404
  }
1397
1405
  const pageFiles = (0, import_node_fs3.readdirSync)(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
1398
1406
  const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
@@ -1408,6 +1416,18 @@ var PageRenderer = class {
1408
1416
  pageFiles
1409
1417
  };
1410
1418
  }
1419
+ /**
1420
+ * Log rendering progress each time it advances by at least 10 percentage points, and always when the final page completes.
1421
+ */
1422
+ logProgress(current, total) {
1423
+ const percent = Math.floor(current / total * 100);
1424
+ if (percent >= this.lastLoggedPercent + PROGRESS_LOG_PERCENT_STEP || current === total) {
1425
+ this.lastLoggedPercent = percent;
1426
+ this.logger.info(
1427
+ `[PageRenderer] Rendering pages: ${current}/${total} (${percent}%)`
1428
+ );
1429
+ }
1430
+ }
1411
1431
  /**
1412
1432
  * Get total page count using pdfinfo.
1413
1433
  * Returns 0 on failure (progress logging will be skipped).