node-pdf2img 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,613 @@
1
+ /**
2
+ * PDF2IMG 核心转换器
3
+ *
4
+ * 提供统一的 API 用于 PDF 转图片
5
+ *
6
+ * 架构:主线程协调 + 工作线程池处理
7
+ * - 主线程:负责 I/O、任务分发、结果收集
8
+ * - 工作线程池:负责 CPU 密集型任务(PDFium 渲染 + Sharp 编码)
9
+ *
10
+ * 性能优化:
11
+ * - 使用 piscina 线程池,充分利用多核 CPU
12
+ * - 异步文件 I/O,不阻塞事件循环
13
+ * - 原生模块直接读取文件路径,避免 Node.js 堆内存占用
14
+ * - 流式下载,减少内存峰值
15
+ */
16
+
17
+ import fs from 'fs';
18
+ import path from 'path';
19
+ import os from 'os';
20
+ import { pipeline } from 'stream/promises';
21
+ import { fileURLToPath } from 'url';
22
+ import pLimit from 'p-limit';
23
+ import Piscina from 'piscina';
24
+ import { createLogger } from '../utils/logger.js';
25
+ import { RENDER_CONFIG, TIMEOUT_CONFIG, SUPPORTED_FORMATS, getExtension, getMimeType } from './config.js';
26
+ import * as nativeRenderer from '../renderers/native.js';
27
+
28
// Module-wide logger tagged with this component's name.
const logger = createLogger('Converter');

// ==================== Thread pool initialization ====================

// ES modules have no built-in __filename/__dirname, so derive them from
// import.meta.url and resolve worker.js relative to this module.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const workerPath = path.resolve(__dirname, '../worker.js');

// Worker count: PDF2IMG_THREAD_COUNT when it parses to a positive integer,
// otherwise the number of logical CPUs. Zero, negative and non-numeric values
// fall back to the CPU count, and the result is clamped to at least 1 because
// os.cpus() may report an empty list on some systems.
const requestedThreadCount = Number.parseInt(process.env.PDF2IMG_THREAD_COUNT ?? '', 10);
const threadCount = requestedThreadCount > 0
  ? requestedThreadCount
  : Math.max(1, os.cpus().length);

// Shared Piscina instance; stays null until getThreadPool() creates it.
let piscina = null;
42
+
43
/**
 * Return the shared worker pool, creating it on first use.
 *
 * @returns {Piscina} The module-wide pool instance.
 */
function getThreadPool() {
  if (piscina) {
    return piscina;
  }

  const poolOptions = {
    filename: workerPath,
    maxThreads: threadCount,
    idleTimeout: 30000, // idle workers are torn down after 30 seconds
  };
  piscina = new Piscina(poolOptions);
  logger.info(`Thread pool initialized with ${threadCount} workers`);

  return piscina;
}
57
+
58
/**
 * Default concurrency limits for the output stages.
 *
 * Frozen so that no code path can change the module-wide defaults at runtime.
 */
const DEFAULT_CONCURRENCY = Object.freeze({
  FILE_IO: 10, // concurrent file writes
  COS_UPLOAD: 8, // concurrent COS uploads
});
65
+
66
/**
 * Input type enumeration.
 *
 * Frozen so importers cannot alter the shared enum values.
 */
export const InputType = Object.freeze({
  FILE: 'file',
  URL: 'url',
  BUFFER: 'buffer',
});
74
+
75
/**
 * Output type enumeration.
 *
 * Frozen so importers cannot alter the shared enum values.
 */
export const OutputType = Object.freeze({
  FILE: 'file', // save to local files
  BUFFER: 'buffer', // return an array of Buffers
  COS: 'cos', // upload to Tencent Cloud COS
});
83
+
84
/**
 * Classify a conversion input as a Buffer, a URL, or a file path.
 *
 * URI schemes are case-insensitive, so `HTTP://` and `Https://` are treated
 * as URLs as well; every other string is assumed to be a file path.
 *
 * @param {string|Buffer} input - Value passed to the converter.
 * @returns {string} One of the InputType values.
 * @throws {Error} When the input is neither a string nor a Buffer.
 */
function detectInputType(input) {
  if (Buffer.isBuffer(input)) {
    return InputType.BUFFER;
  }
  if (typeof input === 'string') {
    return /^https?:\/\//i.test(input) ? InputType.URL : InputType.FILE;
  }
  throw new Error('Invalid input: must be a file path, URL, or Buffer');
}
99
+
100
/**
 * Ask a remote server for a file's size with a HEAD request.
 *
 * @param {string} url - Remote file URL.
 * @returns {Promise<number>} Size in bytes as reported by Content-Length.
 * @throws {Error} When the request fails, or when the Content-Length header
 *   is missing or is not a plain non-negative integer.
 */
async function getRemoteFileSize(url) {
  const response = await fetch(url, {
    method: 'HEAD',
    signal: AbortSignal.timeout(TIMEOUT_CONFIG.DOWNLOAD_TIMEOUT),
  });

  if (!response.ok) {
    throw new Error(`Failed to get file size: ${response.status} ${response.statusText}`);
  }

  const contentLength = response.headers.get('content-length');
  if (!contentLength) {
    throw new Error('Server did not return Content-Length header');
  }

  // parseInt alone would turn "abc" into NaN and "12abc" into 12; accept
  // digits only so callers never receive a bogus size.
  const trimmed = contentLength.trim();
  if (!/^\d+$/.test(trimmed)) {
    throw new Error(`Server returned an invalid Content-Length header: ${contentLength}`);
  }

  return Number.parseInt(trimmed, 10);
}
120
+
121
/**
 * Stream a remote file into a uniquely named file in the OS temp directory.
 *
 * The caller owns the returned file and is responsible for deleting it.
 *
 * @param {string} url - Remote file URL.
 * @returns {Promise<string>} Path of the downloaded temp file.
 * @throws {Error} When the response status is not OK, the response carries no
 *   body, or streaming to disk fails.
 */
async function downloadToTempFile(url) {
  const response = await fetch(url, {
    signal: AbortSignal.timeout(TIMEOUT_CONFIG.DOWNLOAD_TIMEOUT),
  });

  if (!response.ok) {
    throw new Error(`Failed to download file: ${response.status} ${response.statusText}`);
  }

  // Reject bodiless responses before any file is opened, so nothing has to be
  // cleaned up and the error names the real problem.
  if (!response.body) {
    throw new Error('Failed to download file: response has no body');
  }

  const uniqueSuffix = `${Date.now()}_${Math.random().toString(36).slice(2)}`;
  const tempFile = path.join(os.tmpdir(), `pdf2img_${uniqueSuffix}.pdf`);

  try {
    // The write stream is created inside the try so that pipeline() owns it
    // and destroys it on failure.
    await pipeline(response.body, fs.createWriteStream(tempFile));
    return tempFile;
  } catch (err) {
    // Best-effort removal of the partial download; the original error is what
    // the caller needs to see.
    await fs.promises.unlink(tempFile).catch(() => {});
    throw err;
  }
}
148
+
149
/**
 * Write one rendered page to `<outputDir>/<prefix>_<pageNum>.<ext>`.
 *
 * Pages that failed to render (or carry no buffer) are passed through with
 * `outputPath: null`. A write error is reported in the returned record rather
 * than thrown.
 *
 * @param {Object} page - Page result produced by the render stage.
 * @param {string} outputDir - Directory to write into.
 * @param {string} prefix - File name prefix.
 * @param {string} ext - File extension without the dot.
 * @returns {Promise<Object>} Per-page result record.
 */
async function savePageToFile(page, outputDir, prefix, ext) {
  const hasImage = page.success && page.buffer;
  if (!hasImage) {
    return { ...page, outputPath: null };
  }

  const { pageNum, width, height } = page;

  try {
    const outputPath = path.join(outputDir, `${prefix}_${pageNum}.${ext}`);
    await fs.promises.writeFile(outputPath, page.buffer);

    return { pageNum, width, height, success: true, outputPath, size: page.buffer.length };
  } catch (err) {
    return {
      pageNum,
      width,
      height,
      success: false,
      error: `File save failed: ${err.message}`,
      outputPath: null,
    };
  }
}
181
+
182
/**
 * Persist every rendered page into `outputDir`, limiting concurrent writes.
 *
 * The directory is created (recursively) if it does not exist yet.
 *
 * @param {Object[]} pages - Page results from the render stage.
 * @param {string} outputDir - Target directory.
 * @param {string} [prefix='page'] - File name prefix.
 * @param {string} [format='webp'] - Image format used to pick the extension.
 * @param {number} [concurrency=DEFAULT_CONCURRENCY.FILE_IO] - Max parallel writes.
 * @returns {Promise<Object[]>} Per-page records sorted by page number.
 */
async function saveToFiles(pages, outputDir, prefix = 'page', format = 'webp', concurrency = DEFAULT_CONCURRENCY.FILE_IO) {
  await fs.promises.mkdir(outputDir, { recursive: true });

  const ext = getExtension(format);
  const limit = pLimit(concurrency);

  const pendingWrites = pages.map((page) => {
    const writePage = () => savePageToFile(page, outputDir, prefix, ext);
    return limit(writePage);
  });

  const saved = await Promise.all(pendingWrites);
  saved.sort((left, right) => left.pageNum - right.pageNum);
  return saved;
}
197
+
198
/**
 * Upload one rendered page to COS under `<keyPrefix>/page_<pageNum>.<ext>`.
 *
 * Pages that failed to render (or carry no buffer) are passed through with
 * `cosKey: null`. An upload error is reported in the returned record rather
 * than thrown.
 *
 * @param {Object} page - Page result produced by the render stage.
 * @param {Object} cos - COS client exposing a callback-style `putObject`.
 * @param {Object} cosConfig - Supplies `bucket` and `region`.
 * @param {string} keyPrefix - Object key prefix.
 * @param {string} ext - File extension without the dot.
 * @param {string} mimeType - Content type stored with the object.
 * @returns {Promise<Object>} Per-page result record.
 */
async function uploadPageToCos(page, cos, cosConfig, keyPrefix, ext, mimeType) {
  const hasImage = page.success && page.buffer;
  if (!hasImage) {
    return { ...page, cosKey: null };
  }

  const { pageNum, width, height } = page;

  try {
    const key = `${keyPrefix}/page_${pageNum}.${ext}`;
    const request = {
      Bucket: cosConfig.bucket,
      Region: cosConfig.region,
      Key: key,
      Body: page.buffer,
      ContentType: mimeType,
    };

    // Adapt the SDK's callback API to a promise.
    await new Promise((resolve, reject) => {
      cos.putObject(request, (err) => {
        if (err) {
          reject(err);
          return;
        }
        resolve();
      });
    });

    return { pageNum, width, height, success: true, cosKey: key, size: page.buffer.length };
  } catch (err) {
    return {
      pageNum,
      width,
      height,
      success: false,
      error: `Upload failed: ${err.message}`,
      cosKey: null,
    };
  }
}
241
+
242
/**
 * Upload every rendered page to COS, limiting concurrent uploads.
 *
 * The COS SDK is imported on demand, so it is only loaded when this output
 * mode is actually used.
 *
 * @param {Object[]} pages - Page results from the render stage.
 * @param {Object} cosConfig - Supplies `secretId`, `secretKey`, `bucket`, `region`.
 * @param {string} keyPrefix - Object key prefix.
 * @param {string} [format='webp'] - Image format used for extension and MIME type.
 * @param {number} [concurrency=DEFAULT_CONCURRENCY.COS_UPLOAD] - Max parallel uploads.
 * @returns {Promise<Object[]>} Per-page records sorted by page number.
 */
async function uploadToCos(pages, cosConfig, keyPrefix, format = 'webp', concurrency = DEFAULT_CONCURRENCY.COS_UPLOAD) {
  const { default: COS } = await import('cos-nodejs-sdk-v5');

  const credentials = {
    SecretId: cosConfig.secretId,
    SecretKey: cosConfig.secretKey,
  };
  const cos = new COS(credentials);

  const ext = getExtension(format);
  const mimeType = getMimeType(format);
  const limit = pLimit(concurrency);

  const pendingUploads = pages.map((page) => {
    const uploadPage = () => uploadPageToCos(page, cos, cosConfig, keyPrefix, ext, mimeType);
    return limit(uploadPage);
  });

  const uploaded = await Promise.all(pendingUploads);
  uploaded.sort((left, right) => left.pageNum - right.pageNum);
  return uploaded;
}
263
+
264
/**
 * Render PDF pages through the worker thread pool.
 *
 * The main thread prepares the input and dispatches one task per page; the
 * workers do the rendering and encoding.
 *
 * For URL inputs the file is downloaded to a temp file, which is always
 * removed before this function settles, including when page counting or
 * rendering fails. The remote size probe only feeds a debug log line, so a
 * failing probe does not abort the conversion.
 *
 * @param {string|Buffer} input - File path, URL, or PDF bytes.
 * @param {string} inputType - One of the InputType values.
 * @param {number[]} pages - 1-based page numbers; an empty array means all pages.
 *   Numbers outside 1..numPages are dropped.
 * @param {Object} options - Encode/render options forwarded to each worker task.
 * @returns {Promise<Object>} `{ success, numPages, pages, totalTime, renderTime, encodeTime }`,
 *   with `pages` sorted by page number.
 * @throws {Error} When the input is unreadable or of an unsupported type, the
 *   download fails, or any worker task rejects.
 */
async function renderPages(input, inputType, pages, options) {
  const startTime = Date.now();
  let filePath = null;
  let pdfBuffer = null;
  let tempFile = null;

  try {
    let numPages;

    // Prepare the input. This happens inside the try so that a temp file
    // created for a URL input is cleaned up even if page counting throws.
    if (inputType === InputType.FILE) {
      try {
        await fs.promises.access(input, fs.constants.R_OK);
      } catch {
        throw new Error(`File not found or not readable: ${input}`);
      }
      filePath = input;
      numPages = nativeRenderer.getPageCountFromFile(filePath);
    } else if (inputType === InputType.BUFFER) {
      pdfBuffer = Buffer.isBuffer(input) ? input : Buffer.from(input);
      numPages = nativeRenderer.getPageCount(pdfBuffer);
    } else if (inputType === InputType.URL) {
      // The size is informational only; servers that reject HEAD or omit
      // Content-Length must not block the actual download.
      try {
        const fileSize = await getRemoteFileSize(input);
        logger.debug(`Remote file size: ${(fileSize / 1024 / 1024).toFixed(2)}MB, downloading...`);
      } catch (err) {
        logger.debug(`Remote file size unavailable (${err.message}), downloading...`);
      }
      tempFile = await downloadToTempFile(input);
      filePath = tempFile;
      numPages = nativeRenderer.getPageCountFromFile(filePath);
    } else {
      throw new Error(`Unsupported input type: ${inputType}`);
    }

    // Resolve the target pages: everything when none were requested,
    // otherwise only the requested numbers that exist in the document.
    const targetPages = pages.length === 0
      ? Array.from({ length: numPages }, (_, i) => i + 1)
      : pages.filter((p) => p >= 1 && p <= numPages);

    logger.debug(`Rendering ${targetPages.length} pages using thread pool (${threadCount} workers)`);

    const pool = getThreadPool();

    // Submit one task per page to the pool.
    const tasks = targetPages.map((pageNum) => {
      const task = { pageNum, options };

      if (filePath) {
        task.filePath = filePath;
      } else if (pdfBuffer) {
        // The Buffer is serialized when handed to a worker; for large PDFs
        // passing a file path is the cheaper route.
        task.pdfBuffer = pdfBuffer;
      }

      return pool.run(task);
    });

    // Wait for all pages to finish processing in parallel.
    const results = await Promise.all(tasks);
    results.sort((a, b) => a.pageNum - b.pageNum);

    return {
      success: true,
      numPages,
      pages: results,
      totalTime: Date.now() - startTime,
      renderTime: results.reduce((sum, p) => sum + (p.renderTime || 0), 0),
      encodeTime: results.reduce((sum, p) => sum + (p.encodeTime || 0), 0),
    };
  } finally {
    // Best-effort removal of the downloaded temp file; a cleanup failure must
    // not mask the render result or the original error.
    if (tempFile) {
      await fs.promises.unlink(tempFile).catch(() => {});
    }
  }
}
357
+
358
/**
 * Convert a PDF to images.
 *
 * Option validation (format, output directory, COS config) happens before any
 * rendering, so a misconfigured call fails without doing the expensive work.
 *
 * @param {string|Buffer} input - PDF input (file path, URL, or Buffer).
 * @param {Object} options - Conversion options.
 * @param {number[]} [options.pages] - 1-based page numbers; empty array means all pages.
 * @param {string} [options.outputType='buffer'] - Output type: 'file', 'buffer', or 'cos'.
 * @param {string} [options.outputDir] - Output directory (required when outputType='file').
 * @param {string} [options.prefix='page'] - Output file name prefix.
 * @param {string} [options.format] - Output format; defaults to RENDER_CONFIG.OUTPUT_FORMAT
 *   and must be one of SUPPORTED_FORMATS (case-insensitive).
 * @param {number} [options.quality] - Image quality, forwarded to the workers.
 * @param {Object} [options.webp] - WebP encoder settings.
 * @param {number} [options.webp.quality] - WebP quality.
 * @param {number} [options.webp.method] - WebP encoding method.
 * @param {Object} [options.jpeg] - JPEG encoder settings.
 * @param {number} [options.jpeg.quality] - JPEG quality.
 * @param {Object} [options.png] - PNG encoder settings.
 * @param {number} [options.png.compressionLevel] - PNG compression level.
 * @param {Object} [options.cos] - COS config (required when outputType='cos').
 * @param {string} [options.cosKeyPrefix] - COS key prefix; defaults to `pdf2img/<timestamp>`.
 * @param {number} [options.targetWidth] - Target render width, forwarded to the workers.
 * @param {number} [options.concurrency] - File-write / upload concurrency.
 * @returns {Promise<Object>} `{ success, numPages, renderedPages, format, pages, timing, threadPool }`.
 * @throws {Error} On an unsupported format, missing output configuration, an
 *   unavailable native renderer, an invalid input, or a render failure.
 */
export async function convert(input, options = {}) {
  const startTime = Date.now();

  const {
    pages = [],
    outputType = OutputType.BUFFER,
    outputDir,
    prefix = 'page',
    format = RENDER_CONFIG.OUTPUT_FORMAT,
    cos: cosConfig,
    cosKeyPrefix = `pdf2img/${Date.now()}`,
    concurrency,
    ...renderOptions
  } = options;

  // Validate the format. String() keeps a non-string value (e.g. null) from
  // raising a TypeError before the descriptive error below can be thrown.
  const normalizedFormat = String(format).toLowerCase();
  if (!SUPPORTED_FORMATS.includes(normalizedFormat)) {
    throw new Error(`Unsupported format: ${format}. Supported formats: ${SUPPORTED_FORMATS.join(', ')}`);
  }

  // Validate output configuration up front, before any page is rendered.
  if (outputType === OutputType.FILE && !outputDir) {
    throw new Error('outputDir is required when outputType is "file"');
  }
  if (outputType === OutputType.COS && !cosConfig) {
    throw new Error('cos config is required when outputType is "cos"');
  }

  // Check renderer availability.
  if (!nativeRenderer.isNativeAvailable()) {
    throw new Error('Native renderer is not available. Please ensure PDFium library is installed.');
  }

  // Detect the input type.
  const inputType = detectInputType(input);
  logger.debug(`Input type: ${inputType}`);

  // Flatten the user-facing options into the shape passed to worker tasks.
  const encodeOptions = {
    format: normalizedFormat,
    quality: renderOptions.quality,
    webpQuality: renderOptions.webp?.quality,
    webpMethod: renderOptions.webp?.method,
    jpegQuality: renderOptions.jpeg?.quality,
    pngCompression: renderOptions.png?.compressionLevel,
    targetWidth: renderOptions.targetWidth,
    detectScan: renderOptions.detectScan,
  };

  // Render the pages on the thread pool.
  const result = await renderPages(input, inputType, pages, encodeOptions);

  // Produce the requested output. Any outputType other than 'file' or 'cos'
  // returns in-memory buffers.
  let outputResult;
  if (outputType === OutputType.FILE) {
    outputResult = await saveToFiles(result.pages, outputDir, prefix, normalizedFormat, concurrency);
  } else if (outputType === OutputType.COS) {
    outputResult = await uploadToCos(result.pages, cosConfig, cosKeyPrefix, normalizedFormat, concurrency);
  } else {
    outputResult = result.pages
      .map(toBufferPageResult)
      .sort((a, b) => a.pageNum - b.pageNum);
  }

  return {
    success: true,
    numPages: result.numPages,
    renderedPages: outputResult.filter((p) => p.success).length,
    format: normalizedFormat,
    pages: outputResult,
    timing: {
      total: Date.now() - startTime,
      render: result.renderTime,
      encode: result.encodeTime,
    },
    threadPool: {
      workers: threadCount,
    },
  };
}

/**
 * Turn binary data received from a worker into a Node.js Buffer.
 *
 * Data crossing the thread boundary may no longer be a Buffer instance, so
 * the `{ type: 'Buffer', data: [...] }` shape, an object wrapping a typed
 * array in `data`, and anything else `Buffer.from` accepts are handled too.
 *
 * @param {*} value - Truthy value taken from a worker's page result.
 * @returns {Buffer} The data as a Buffer.
 * @throws {Error} When the value cannot be converted.
 */
function coerceWorkerBuffer(value) {
  if (Buffer.isBuffer(value)) {
    return value;
  }
  if (!value || typeof value !== 'object') {
    throw new Error(`Cannot convert ${typeof value} to Buffer`);
  }
  if (value.type === 'Buffer' && Array.isArray(value.data)) {
    return Buffer.from(value.data);
  }
  if (value.data && ArrayBuffer.isView(value.data)) {
    return Buffer.from(value.data);
  }
  // Typed arrays and any remaining shape: let Buffer.from decide (it throws
  // for inputs it cannot handle).
  return Buffer.from(value);
}

/**
 * Build the 'buffer' output record for one rendered page.
 *
 * @param {Object} page - Page result from the render stage.
 * @returns {Object} Success record carrying `buffer` and `size`, or a failure
 *   record carrying `error` and `buffer: null`.
 */
function toBufferPageResult(page) {
  const { pageNum, width, height } = page;

  if (!page.success || !page.buffer) {
    return {
      pageNum,
      width,
      height,
      success: false,
      buffer: null,
      error: page.error || 'Render failed',
    };
  }

  let buffer;
  try {
    buffer = coerceWorkerBuffer(page.buffer);
  } catch (e) {
    logger.error(`Buffer type mismatch: ${typeof page.buffer}, conversion failed: ${e.message}`);
    return {
      pageNum,
      width,
      height,
      success: false,
      buffer: null,
      error: `Invalid buffer type returned from worker: ${e.message}`,
    };
  }

  return { pageNum, width, height, success: true, buffer, size: buffer.length };
}
518
+
519
/**
 * Get the number of pages in a PDF (asynchronous).
 *
 * @param {string|Buffer} input - PDF file path or PDF bytes.
 * @returns {Promise<number>} Page count reported by the native renderer.
 * @throws {Error} When the native renderer is unavailable, the input is
 *   neither a string nor a Buffer, or the file cannot be read.
 */
export async function getPageCount(input) {
  if (!nativeRenderer.isNativeAvailable()) {
    throw new Error('Native renderer is not available');
  }

  if (Buffer.isBuffer(input)) {
    return nativeRenderer.getPageCount(input);
  }

  if (typeof input !== 'string') {
    throw new Error('Invalid input: must be a file path or Buffer');
  }

  // Probe readability first so a bad path yields a descriptive error.
  const isReadable = await fs.promises
    .access(input, fs.constants.R_OK)
    .then(() => true, () => false);
  if (!isReadable) {
    throw new Error(`File not found or not readable: ${input}`);
  }

  return nativeRenderer.getPageCountFromFile(input);
}
545
+
546
/**
 * Get the number of pages in a PDF (synchronous, kept for backward compatibility).
 *
 * File paths are handed to the renderer's path-based page counter, the same
 * call the asynchronous getPageCount makes, instead of reading the whole file
 * into memory first. A missing or unreadable file is reported as such rather
 * than as an invalid input.
 *
 * @deprecated Prefer the asynchronous getPageCount.
 * @param {string|Buffer} input - PDF file path or PDF bytes.
 * @returns {number} Page count reported by the native renderer.
 * @throws {Error} When the native renderer is unavailable, the input is
 *   neither a string nor a Buffer, or the file cannot be read.
 */
export function getPageCountSync(input) {
  if (!nativeRenderer.isNativeAvailable()) {
    throw new Error('Native renderer is not available');
  }

  if (Buffer.isBuffer(input)) {
    return nativeRenderer.getPageCount(input);
  }

  if (typeof input !== 'string') {
    throw new Error('Invalid input: must be a file path or Buffer');
  }

  try {
    fs.accessSync(input, fs.constants.R_OK);
  } catch {
    throw new Error(`File not found or not readable: ${input}`);
  }

  return nativeRenderer.getPageCountFromFile(input);
}
567
+
568
/**
 * Report whether the native renderer can be used.
 *
 * @returns {boolean} Whatever the native renderer's availability check returns.
 */
export function isAvailable() {
  const nativeReady = nativeRenderer.isNativeAvailable();
  return nativeReady;
}
574
+
575
/**
 * Get version information from the native renderer.
 *
 * @returns {*} The value returned by the native renderer's getVersion().
 */
export function getVersion() {
  const versionInfo = nativeRenderer.getVersion();
  return versionInfo;
}
581
+
582
/**
 * Get thread pool statistics.
 *
 * @returns {Object} `{ initialized, workers }` before the pool exists; once it
 *   does, the pool's `completed`, `waitTime`, `runTime` and `utilization` are
 *   included as well.
 */
export function getThreadPoolStats() {
  const summary = {
    initialized: Boolean(piscina),
    workers: threadCount,
  };

  if (!summary.initialized) {
    return summary;
  }

  const { completed, waitTime, runTime, utilization } = piscina;
  return { ...summary, completed, waitTime, runTime, utilization };
}
601
+
602
/**
 * Destroy the thread pool.
 *
 * Call on application shutdown to release the worker threads. The module
 * reference is cleared before the pool's destroy() is awaited, so a
 * getThreadPool() call made meanwhile builds a fresh pool instead of receiving
 * the one being torn down, and a failed destroy() does not leave a stale
 * reference behind.
 *
 * @returns {Promise<void>} Resolves once the pool (if any) has been destroyed.
 */
export async function destroyThreadPool() {
  if (!piscina) {
    return;
  }

  const pool = piscina;
  piscina = null;

  await pool.destroy();
  logger.info('Thread pool destroyed');
}