@heripo/pdf-parser 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1301,14 +1301,18 @@ var ImageExtractor = class _ImageExtractor {
1301
1301
  // src/processors/page-renderer.ts
1302
1302
  import { existsSync as existsSync2, mkdirSync as mkdirSync2, readdirSync as readdirSync2 } from "fs";
1303
1303
  import { join as join3 } from "path";
1304
- var PROGRESS_POLL_INTERVAL_MS = 2e3;
1304
+ var PROGRESS_LOG_PERCENT_STEP = 10;
1305
1305
  var PageRenderer = class {
1306
1306
  constructor(logger) {
1307
1307
  this.logger = logger;
1308
1308
  }
1309
+ lastLoggedPercent = 0;
1309
1310
  /**
1310
1311
  * Render all pages of a PDF to individual PNG files.
1311
1312
  *
1313
+ * Uses per-page rendering (`magick 'input.pdf[N]'`) when page count is known,
1314
+ * limiting peak memory to ~15MB/page instead of loading all pages at once.
1315
+ *
1312
1316
  * @param pdfPath - Absolute path to the source PDF file
1313
1317
  * @param outputDir - Directory where pages/ subdirectory will be created
1314
1318
  * @param options - Rendering options
@@ -1325,50 +1329,54 @@ var PageRenderer = class {
1325
1329
  this.logger.info(
1326
1330
  `[PageRenderer] Rendering ${totalPages} pages at ${dpi} DPI...`
1327
1331
  );
1332
+ this.lastLoggedPercent = 0;
1333
+ for (let i = 0; i < totalPages; i++) {
1334
+ const result = await spawnAsync(
1335
+ "magick",
1336
+ [
1337
+ "-density",
1338
+ dpi.toString(),
1339
+ `${pdfPath}[${i}]`,
1340
+ "-background",
1341
+ "white",
1342
+ "-alpha",
1343
+ "remove",
1344
+ "-alpha",
1345
+ "off",
1346
+ join3(pagesDir, `page_${i}.png`)
1347
+ ],
1348
+ { captureStdout: false }
1349
+ );
1350
+ if (result.code !== 0) {
1351
+ throw new Error(
1352
+ `[PageRenderer] Failed to render page ${i + 1}/${totalPages}: ${result.stderr || "Unknown error"}`
1353
+ );
1354
+ }
1355
+ this.logProgress(i + 1, totalPages);
1356
+ }
1328
1357
  } else {
1329
1358
  this.logger.info(`[PageRenderer] Rendering PDF at ${dpi} DPI...`);
1330
- }
1331
- const outputPattern = join3(pagesDir, "page_%d.png");
1332
- let progressInterval = null;
1333
- if (totalPages > 0) {
1334
- let lastLoggedCount = 0;
1335
- progressInterval = setInterval(() => {
1336
- try {
1337
- const rendered = readdirSync2(pagesDir).filter(
1338
- (f) => f.startsWith("page_") && f.endsWith(".png")
1339
- ).length;
1340
- if (rendered > 0 && rendered !== lastLoggedCount) {
1341
- lastLoggedCount = rendered;
1342
- this.logger.info(
1343
- `[PageRenderer] Rendering pages: ${rendered}/${totalPages}`
1344
- );
1345
- }
1346
- } catch {
1347
- }
1348
- }, PROGRESS_POLL_INTERVAL_MS);
1349
- }
1350
- try {
1351
- const result = await spawnAsync("magick", [
1352
- "-density",
1353
- dpi.toString(),
1354
- pdfPath,
1355
- "-background",
1356
- "white",
1357
- "-alpha",
1358
- "remove",
1359
- "-alpha",
1360
- "off",
1361
- outputPattern
1362
- ]);
1359
+ const result = await spawnAsync(
1360
+ "magick",
1361
+ [
1362
+ "-density",
1363
+ dpi.toString(),
1364
+ pdfPath,
1365
+ "-background",
1366
+ "white",
1367
+ "-alpha",
1368
+ "remove",
1369
+ "-alpha",
1370
+ "off",
1371
+ join3(pagesDir, "page_%d.png")
1372
+ ],
1373
+ { captureStdout: false }
1374
+ );
1363
1375
  if (result.code !== 0) {
1364
1376
  throw new Error(
1365
1377
  `[PageRenderer] Failed to render PDF pages: ${result.stderr || "Unknown error"}`
1366
1378
  );
1367
1379
  }
1368
- } finally {
1369
- if (progressInterval) {
1370
- clearInterval(progressInterval);
1371
- }
1372
1380
  }
1373
1381
  const pageFiles = readdirSync2(pagesDir).filter((f) => f.startsWith("page_") && f.endsWith(".png")).sort((a, b) => {
1374
1382
  const numA = parseInt(a.replace("page_", "").replace(".png", ""), 10);
@@ -1384,6 +1392,18 @@ var PageRenderer = class {
1384
1392
  pageFiles
1385
1393
  };
1386
1394
  }
1395
+ /**
1396
+ * Log rendering progress at appropriate intervals (every 10%).
1397
+ */
1398
+ logProgress(current, total) {
1399
+ const percent = Math.floor(current / total * 100);
1400
+ if (percent >= this.lastLoggedPercent + PROGRESS_LOG_PERCENT_STEP || current === total) {
1401
+ this.lastLoggedPercent = percent;
1402
+ this.logger.info(
1403
+ `[PageRenderer] Rendering pages: ${current}/${total} (${percent}%)`
1404
+ );
1405
+ }
1406
+ }
1387
1407
  /**
1388
1408
  * Get total page count using pdfinfo.
1389
1409
  * Returns 0 on failure (progress logging will be skipped).