node-pdf2img 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,179 @@
1
+ /**
2
+ * 输出处理模块
3
+ *
4
+ * 负责将渲染结果保存到文件或上传到 COS
5
+ */
6
+
7
+ import fs from 'fs';
8
+ import path from 'path';
9
+ import pLimit from 'p-limit';
10
+ import { createLogger } from '../utils/logger.js';
11
+ import { getExtension, getMimeType } from './config.js';
12
+
13
+ const logger = createLogger('OutputHandler');
14
+
15
+ /**
16
+ * Default concurrency limits used as parameter defaults by saveToFiles and uploadToCos
17
+ */
18
+ export const DEFAULT_CONCURRENCY = {
19
+ FILE_IO: 10, // concurrent file writes
20
+ COS_UPLOAD: 4, // concurrent COS uploads (kept low to reduce EPIPE errors)
21
+ };
22
+
/**
 * Pause for a fixed duration.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
29
+
/**
 * Write one rendered page to disk.
 *
 * A page that did not render successfully, or that carries no buffer, is
 * passed through unchanged with `outputPath: null`. Write failures are caught
 * and reported in the returned record instead of being thrown.
 *
 * @param {Object} page - Render result (`pageNum`, `width`, `height`, `success`, `buffer`).
 * @param {string} outputDir - Directory the file is written into.
 * @param {string} prefix - File name prefix; the file is `<prefix>_<pageNum>.<ext>`.
 * @param {string} ext - File extension without the leading dot.
 * @returns {Promise<Object>} Per-page save result.
 */
async function savePageToFile(page, outputDir, prefix, ext) {
  const hasImage = page.success && page.buffer;
  if (!hasImage) {
    return { ...page, outputPath: null };
  }

  const { pageNum, width, height, buffer } = page;
  const summary = { pageNum, width, height };

  try {
    const target = path.join(outputDir, `${prefix}_${pageNum}.${ext}`);
    await fs.promises.writeFile(target, buffer);
    return { ...summary, success: true, outputPath: target, size: buffer.length };
  } catch (err) {
    return {
      ...summary,
      success: false,
      error: `File save failed: ${err.message}`,
      outputPath: null,
    };
  }
}
62
+
/**
 * Persist a batch of rendered pages as image files.
 *
 * The output directory is created (recursively) first; pages are then written
 * through a concurrency limiter and the per-page results come back ordered by
 * page number.
 *
 * @param {Array} pages - Render results to save.
 * @param {string} outputDir - Destination directory.
 * @param {string} prefix - File name prefix.
 * @param {string} format - Output format, mapped to an extension by getExtension.
 * @param {number} concurrency - Maximum simultaneous writes.
 * @returns {Promise<Array>} Save results sorted by ascending `pageNum`.
 */
export async function saveToFiles(pages, outputDir, prefix = 'page', format = 'webp', concurrency = DEFAULT_CONCURRENCY.FILE_IO) {
  await fs.promises.mkdir(outputDir, { recursive: true });

  const extension = getExtension(format);
  const throttle = pLimit(concurrency);
  const writeOne = (page) => throttle(() => savePageToFile(page, outputDir, prefix, extension));

  const saved = await Promise.all(pages.map(writeOne));
  saved.sort((left, right) => left.pageNum - right.pageNum);
  return saved;
}
85
+
/**
 * Upload one rendered page to COS, retrying transient failures.
 *
 * A page that did not render successfully, or that carries no buffer, is
 * passed through unchanged with `cosKey: null`. Errors with code EPIPE,
 * ECONNRESET, ETIMEDOUT or ECONNREFUSED, or with a `statusCode` of 500 or
 * above, are retried with exponential backoff; any other error stops
 * immediately. Failures are reported in the returned record, never thrown.
 *
 * @param {Object} page - Render result (`pageNum`, `width`, `height`, `success`, `buffer`).
 * @param {Object} cos - COS client exposing a callback-style `putObject`.
 * @param {Object} cosConfig - Supplies `bucket` and `region`.
 * @param {string} keyPrefix - Object key prefix; the key is `<keyPrefix>/page_<pageNum>.<ext>`.
 * @param {string} ext - File extension without the leading dot.
 * @param {string} mimeType - Content type sent with the object.
 * @param {number} [maxRetries=3] - Maximum number of upload attempts.
 * @returns {Promise<Object>} Per-page upload result.
 */
async function uploadPageToCos(page, cos, cosConfig, keyPrefix, ext, mimeType, maxRetries = 3) {
  if (!page.success || !page.buffer) {
    return { ...page, cosKey: null };
  }

  const key = `${keyPrefix}/page_${page.pageNum}.${ext}`;
  const retryableCodes = ['EPIPE', 'ECONNRESET', 'ETIMEDOUT', 'ECONNREFUSED'];
  let lastError;
  // Tracks how many uploads were really tried so the failure message is accurate.
  let attemptsMade = 0;

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    attemptsMade = attempt;
    try {
      // putObject is callback-based; adapt it to a promise.
      await new Promise((resolve, reject) => {
        cos.putObject({
          Bucket: cosConfig.bucket,
          Region: cosConfig.region,
          Key: key,
          Body: page.buffer,
          ContentType: mimeType,
        }, (err) => {
          if (err) reject(err);
          else resolve();
        });
      });

      return {
        pageNum: page.pageNum,
        width: page.width,
        height: page.height,
        success: true,
        cosKey: key,
        size: page.buffer.length,
      };
    } catch (err) {
      lastError = err;
      const isRetryable = retryableCodes.includes(err.code) ||
        (err.statusCode && err.statusCode >= 500);

      if (!isRetryable || attempt >= maxRetries) {
        break;
      }

      // Exponential backoff: 1s, 2s, 4s...
      const delay = Math.pow(2, attempt - 1) * 1000;
      logger.debug(`Page ${page.pageNum} upload failed (${err.code || err.message}), retrying in ${delay}ms (attempt ${attempt}/${maxRetries})`);
      await sleep(delay);
    }
  }

  // lastError is undefined only when maxRetries allowed no attempt at all.
  const reason = lastError === undefined
    ? 'no upload attempt was made'
    : (lastError.message || lastError.code || String(lastError));
  const attemptsLabel = attemptsMade === 1 ? '1 attempt' : `${attemptsMade} attempts`;

  return {
    pageNum: page.pageNum,
    width: page.width,
    height: page.height,
    success: false,
    error: `Upload failed after ${attemptsLabel}: ${reason}`,
    cosKey: null,
  };
}
148
+
/**
 * Upload a batch of rendered pages to COS.
 *
 * The COS SDK is loaded on demand, a client is built from `cosConfig`, and
 * pages are uploaded through a concurrency limiter. Results come back ordered
 * by page number.
 *
 * @param {Array} pages - Render results to upload.
 * @param {Object} cosConfig - COS settings (`secretId`, `secretKey`, `protocol`,
 *   `serviceDomain`, `domain`, plus `bucket`/`region` used per upload).
 * @param {string} keyPrefix - Object key prefix.
 * @param {string} format - Output format, mapped to extension and MIME type.
 * @param {number} concurrency - Maximum simultaneous uploads.
 * @returns {Promise<Array>} Upload results sorted by ascending `pageNum`.
 */
export async function uploadToCos(pages, cosConfig, keyPrefix, format = 'webp', concurrency = DEFAULT_CONCURRENCY.COS_UPLOAD) {
  // Loaded lazily so the SDK is only required when uploading.
  const { default: COS } = await import('cos-nodejs-sdk-v5');

  const clientOptions = {
    SecretId: cosConfig.secretId,
    SecretKey: cosConfig.secretKey,
    Protocol: cosConfig.protocol || 'https:',
    ServiceDomain: cosConfig.serviceDomain,
    Domain: cosConfig.domain,
  };
  const client = new COS(clientOptions);

  const extension = getExtension(format);
  const contentType = getMimeType(format);
  const throttle = pLimit(concurrency);
  const uploadOne = (page) => throttle(
    () => uploadPageToCos(page, client, cosConfig, keyPrefix, extension, contentType),
  );

  const uploaded = await Promise.all(pages.map(uploadOne));
  uploaded.sort((left, right) => left.pageNum - right.pageNum);
  return uploaded;
}
@@ -0,0 +1,289 @@
1
+ /**
2
+ * PDF 渲染模块
3
+ *
4
+ * 负责 PDF 页面的渲染逻辑,支持本地文件、Buffer 和 URL 输入
5
+ * 支持两种渲染器:
6
+ * - pdfium: PDFium 原生渲染器(默认,高性能)
7
+ * - pdfjs: PDF.js 渲染器(纯 JavaScript,无需原生依赖)
8
+ */
9
+
10
+ import fs from 'fs';
11
+ import { createLogger } from '../utils/logger.js';
12
+ import { RendererType, DEFAULT_RENDERER } from './config.js';
13
+ import * as nativeRenderer from '../renderers/native.js';
14
+ import * as pdfjsRenderer from '../renderers/pdfjs.js';
15
+ import { getThreadPool, getThreadCount } from './thread-pool.js';
16
+ import { getRemoteFileSize, downloadToTempFile } from './downloader.js';
17
+
18
+ const logger = createLogger('Renderer');
19
+
20
+ /**
21
+ * Input type enumeration
22
+ */
23
+ export const InputType = {
24
+ FILE: 'file',
25
+ URL: 'url',
26
+ BUFFER: 'buffer',
27
+ };
28
+
29
+ /**
30
+ * Streaming threshold in bytes (remote files smaller than this use download mode)
31
+ */
32
+ const STREAM_THRESHOLD = 2 * 1024 * 1024; // 2MB
33
+
/**
 * Classify a render input.
 *
 * Buffers map to `InputType.BUFFER`. Strings starting with an `http://` or
 * `https://` scheme map to `InputType.URL`; the scheme is matched
 * case-insensitively because URI schemes are case-insensitive. Every other
 * string is treated as a file path.
 *
 * @param {string|Buffer} input - File path, URL, or PDF data.
 * @returns {string} One of the `InputType` values.
 * @throws {Error} If `input` is neither a string nor a Buffer.
 */
export function detectInputType(input) {
  if (Buffer.isBuffer(input)) {
    return InputType.BUFFER;
  }
  if (typeof input === 'string') {
    return /^https?:\/\//i.test(input) ? InputType.URL : InputType.FILE;
  }
  throw new Error('Invalid input: must be a file path, URL, or Buffer');
}
52
+
/**
 * Decide which renderer will be used.
 *
 * PDF.js is chosen when explicitly requested (or when it is the configured
 * default). Any other value selects PDFium, falling back to PDF.js with a
 * warning when the native renderer reports itself unavailable.
 *
 * @param {Object} options - Render options; only `renderer` is read.
 * @returns {string} A `RendererType` value.
 */
export function getRendererType(options = {}) {
  const requested = options.renderer || DEFAULT_RENDERER;
  const wantsPdfjs = requested === RendererType.PDFJS;

  if (wantsPdfjs) {
    return RendererType.PDFJS;
  }

  if (nativeRenderer.isNativeAvailable()) {
    return RendererType.PDFIUM;
  }

  // Native renderer is missing: degrade to the pure JavaScript renderer.
  logger.warn('PDFium 渲染器不可用,回退到 PDF.js');
  return RendererType.PDFJS;
}
75
+
/**
 * Render PDF pages with the renderer selected by `options`.
 *
 * Dispatches to the PDF.js path, or for PDFium to the URL path or the local
 * (file / Buffer) path depending on `inputType`.
 *
 * @param {string|Buffer} input - File path, URL, or PDF data.
 * @param {string} inputType - An `InputType` value describing `input`.
 * @param {number[]} pages - Page numbers to render.
 * @param {Object} options - Render options.
 * @returns {Promise<Object>} Render result produced by the chosen path.
 */
export async function renderPages(input, inputType, pages, options) {
  const began = Date.now();
  const chosen = getRendererType(options);

  logger.debug(`Using renderer: ${chosen}`);

  if (chosen !== RendererType.PDFJS) {
    // PDFium: remote inputs get their own path, everything else is local.
    return inputType === InputType.URL
      ? renderPagesFromUrl(input, pages, options, began)
      : renderPagesFromLocal(input, inputType, pages, options, began);
  }

  return renderPagesWithPdfjs(input, inputType, pages, options, began);
}
102
+
/**
 * Render pages through the PDF.js renderer.
 *
 * Picks the renderer entry point matching `inputType` (URL, Buffer, otherwise
 * file) and normalizes its result into the common result shape.
 *
 * @param {string|Buffer} input - File path, URL, or PDF data.
 * @param {string} inputType - An `InputType` value.
 * @param {number[]} pages - Page numbers to render.
 * @param {Object} options - Render options.
 * @param {number} startTime - Epoch milliseconds when rendering began.
 * @returns {Promise<Object>} Normalized render result.
 * @throws {Error} If the renderer reports `success: false`.
 */
async function renderPagesWithPdfjs(input, inputType, pages, options, startTime) {
  let outcome;

  switch (inputType) {
    case InputType.URL:
      outcome = await pdfjsRenderer.renderFromUrl(input, pages, options);
      break;
    case InputType.BUFFER:
      outcome = await pdfjsRenderer.renderFromBuffer(input, pages, options);
      break;
    default:
      outcome = await pdfjsRenderer.renderFromFile(input, pages, options);
  }

  if (!outcome.success) {
    throw new Error(outcome.error || 'PDF.js 渲染失败');
  }

  // Sum a per-page timing field, counting missing values as zero.
  const sumOf = (field) => outcome.pages.reduce((acc, p) => acc + (p[field] || 0), 0);

  return {
    success: true,
    numPages: outcome.numPages,
    pages: outcome.pages,
    totalTime: Date.now() - startTime,
    renderTime: outcome.renderTime || sumOf('renderTime'),
    encodeTime: sumOf('encodeTime'),
    renderer: RendererType.PDFJS,
    streamStats: outcome.streamStats,
  };
}
132
+
/**
 * Render pages of a remote PDF with the PDFium renderer.
 *
 * The remote size is looked up first. Files below STREAM_THRESHOLD are
 * downloaded in full and rendered, avoiding the overhead of many Range
 * requests. Larger files are rendered via the native stream renderer; if that
 * throws, rendering falls back to the download path.
 *
 * @param {string} url - Remote PDF location.
 * @param {number[]} pages - Page numbers to render.
 * @param {Object} options - Render options.
 * @param {number} startTime - Epoch milliseconds when rendering began.
 * @returns {Promise<Object>} Render result.
 */
async function renderPagesFromUrl(url, pages, options, startTime) {
  const fileSize = await getRemoteFileSize(url);
  const sizeInMb = (fileSize / 1024 / 1024).toFixed(2);

  if (fileSize < STREAM_THRESHOLD) {
    logger.debug(`Remote file size: ${sizeInMb}MB (< 2MB), using download mode`);
    return renderPagesWithDownload(url, pages, options, startTime);
  }

  logger.debug(`Remote file size: ${sizeInMb}MB, using stream rendering`);

  try {
    const streamed = await nativeRenderer.renderFromStream(url, fileSize, pages, options);
    const sumOf = (field) => streamed.pages.reduce((acc, p) => acc + (p[field] || 0), 0);

    return {
      success: true,
      numPages: streamed.numPages,
      pages: streamed.pages,
      totalTime: Date.now() - startTime,
      renderTime: sumOf('renderTime'),
      encodeTime: sumOf('encodeTime'),
      streamStats: streamed.streamStats,
      renderer: RendererType.PDFIUM,
    };
  } catch (err) {
    // Streaming failed for any reason: retry via a full download.
    logger.warn(`Stream rendering failed: ${err.message}, falling back to download`);
    return renderPagesWithDownload(url, pages, options, startTime);
  }
}
170
+
/**
 * Download a remote PDF to a temporary file and render it with the thread pool.
 *
 * Used for small files and as the fallback when stream rendering fails. The
 * temporary file is removed only after every submitted page task has settled,
 * so no worker is still reading it when it is deleted.
 *
 * @param {string} url - Remote PDF location.
 * @param {number[]} pages - Page numbers to render; an empty array means all pages.
 *   Numbers outside 1..numPages are dropped.
 * @param {Object} options - Render options forwarded to each worker task.
 * @param {number} startTime - Epoch milliseconds when rendering began.
 * @returns {Promise<Object>} Render result with pages sorted by `pageNum`.
 * @throws Rethrows the first (in page order) task failure after all tasks settle.
 */
async function renderPagesWithDownload(url, pages, options, startTime) {
  const tempFile = await downloadToTempFile(url);

  try {
    const threadCount = getThreadCount();
    const numPages = nativeRenderer.getPageCountFromFile(tempFile);

    // Resolve the target page list.
    const targetPages = pages.length === 0
      ? Array.from({ length: numPages }, (_, i) => i + 1)
      : pages.filter(p => p >= 1 && p <= numPages);

    logger.debug(`Rendering ${targetPages.length} pages using thread pool (${threadCount} workers)`);

    const pool = getThreadPool();

    // async wrapper turns a synchronous throw from pool.run into a rejection,
    // so already-submitted tasks are still awaited below.
    const tasks = targetPages.map(async pageNum => pool.run({
      pageNum,
      options,
      filePath: tempFile,
    }));

    // Wait for every task (not just the first failure) before the finally
    // block deletes the file the workers are reading.
    const settled = await Promise.allSettled(tasks);
    const failure = settled.find(outcome => outcome.status === 'rejected');
    if (failure) {
      throw failure.reason;
    }

    const results = settled.map(outcome => outcome.value);
    results.sort((a, b) => a.pageNum - b.pageNum);

    return {
      success: true,
      numPages,
      pages: results,
      totalTime: Date.now() - startTime,
      renderTime: results.reduce((sum, p) => sum + (p.renderTime || 0), 0),
      encodeTime: results.reduce((sum, p) => sum + (p.encodeTime || 0), 0),
      renderer: RendererType.PDFIUM,
    };
  } finally {
    try {
      await fs.promises.unlink(tempFile);
    } catch (err) {
      // Cleanup is best-effort, but leave a trace so leaked temp files can be found.
      logger.debug(`Failed to remove temp file ${tempFile}: ${err.message}`);
    }
  }
}
219
+
/**
 * Render pages of a local PDF (file path or Buffer) with the thread pool.
 *
 * @param {string|Buffer} input - File path or PDF data.
 * @param {string} inputType - `InputType.FILE` or `InputType.BUFFER`.
 * @param {number[]} pages - Page numbers to render; an empty array means all pages.
 *   Numbers outside 1..numPages are dropped.
 * @param {Object} options - Render options forwarded to each worker task.
 * @param {number} startTime - Epoch milliseconds when rendering began.
 * @returns {Promise<Object>} Render result with pages sorted by `pageNum`.
 * @throws {Error} If the file is not readable, or `inputType` is not a local type.
 */
async function renderPagesFromLocal(input, inputType, pages, options, startTime) {
  const threadCount = getThreadCount();
  // Exactly one of { filePath } / { pdfBuffer }, merged into every task.
  let source;
  let numPages;

  // Prepare the input.
  if (inputType === InputType.FILE) {
    try {
      await fs.promises.access(input, fs.constants.R_OK);
    } catch (err) {
      // Keep the underlying errno (ENOENT, EACCES, ...) available as the cause.
      throw new Error(`File not found or not readable: ${input}`, { cause: err });
    }
    numPages = nativeRenderer.getPageCountFromFile(input);
    source = { filePath: input };
  } else if (inputType === InputType.BUFFER) {
    const pdfBuffer = Buffer.isBuffer(input) ? input : Buffer.from(input);
    numPages = nativeRenderer.getPageCount(pdfBuffer);
    // Note: the Buffer is passed to the worker thread with each task.
    // For large files, consider saving to a temp file and passing the path.
    source = { pdfBuffer };
  } else {
    // Previously fell through with numPages undefined and reported success
    // with zero pages; fail loudly instead.
    throw new Error(`Unsupported input type for local rendering: ${inputType}`);
  }

  // Resolve the target page list.
  const targetPages = pages.length === 0
    ? Array.from({ length: numPages }, (_, i) => i + 1)
    : pages.filter(p => p >= 1 && p <= numPages);

  logger.debug(`Rendering ${targetPages.length} pages using thread pool (${threadCount} workers)`);

  const pool = getThreadPool();

  // One pool task per page; all pages are processed in parallel.
  const tasks = targetPages.map(pageNum => pool.run({ pageNum, options, ...source }));
  const results = await Promise.all(tasks);

  results.sort((a, b) => a.pageNum - b.pageNum);

  return {
    success: true,
    numPages,
    pages: results,
    totalTime: Date.now() - startTime,
    renderTime: results.reduce((sum, p) => sum + (p.renderTime || 0), 0),
    encodeTime: results.reduce((sum, p) => sum + (p.encodeTime || 0), 0),
    renderer: RendererType.PDFIUM,
  };
}
@@ -0,0 +1,78 @@
1
+ /**
2
+ * 线程池管理模块
3
+ *
4
+ * 使用 Piscina 管理工作线程池,用于 CPU 密集型任务
5
+ */
6
+
7
+ import os from 'os';
8
+ import path from 'path';
9
+ import { fileURLToPath } from 'url';
10
+ import Piscina from 'piscina';
11
+ import { createLogger } from '../utils/logger.js';
12
+
13
+ const logger = createLogger('ThreadPool'); // logger for pool initialization/destruction messages in this module
14
+
// Resolve the path to worker.js relative to this module.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const workerPath = path.resolve(__dirname, '../worker.js');

/**
 * Resolve the worker thread count from an optional override.
 *
 * @param {string|undefined} rawValue - Raw override, e.g. the value of
 *   the PDF2IMG_THREAD_COUNT environment variable.
 * @returns {number} The override when it parses to a positive integer;
 *   otherwise the CPU core count, never less than 1.
 */
function resolveThreadCount(rawValue) {
  const requested = Number.parseInt(rawValue, 10);
  if (Number.isInteger(requested) && requested > 0) {
    return requested;
  }
  // os.cpus() can be empty when CPU info is unavailable; a pool needs at least one thread.
  return Math.max(1, os.cpus().length);
}

// Thread count defaults to the CPU core count; override with PDF2IMG_THREAD_COUNT.
const threadCount = resolveThreadCount(process.env.PDF2IMG_THREAD_COUNT);

// Lazily created pool singleton.
let piscina = null;
24
+
/**
 * Return the shared worker pool, creating it on first use.
 *
 * @returns {Piscina} The module-wide pool instance.
 */
export function getThreadPool() {
  if (piscina) {
    return piscina;
  }

  const poolOptions = {
    filename: workerPath,
    maxThreads: threadCount,
    idleTimeout: 30000, // value handed to Piscina's idleTimeout option
  };
  piscina = new Piscina(poolOptions);
  logger.info(`Thread pool initialized with ${threadCount} workers`);

  return piscina;
}
39
+
/**
 * Report the configured number of worker threads.
 *
 * @returns {number} The module-level thread count used as the pool's `maxThreads`.
 */
export function getThreadCount() {
  return threadCount;
}
46
+
/**
 * Snapshot of the thread pool state.
 *
 * @returns {Object} `{ initialized, workers }` before the pool exists; once it
 *   does, also the pool's `completed`, `waitTime`, `runTime` and `utilization`.
 */
export function getThreadPoolStats() {
  const summary = {
    initialized: Boolean(piscina),
    workers: threadCount,
  };

  if (!summary.initialized) {
    return summary;
  }

  const { completed, waitTime, runTime, utilization } = piscina;
  return { ...summary, completed, waitTime, runTime, utilization };
}
66
+
/**
 * Destroy the thread pool.
 *
 * Call on application shutdown to release worker threads. The module-level
 * reference is cleared before awaiting the teardown, so a concurrent
 * getThreadPool() call creates a fresh pool rather than receiving one that is
 * being destroyed, and a failed teardown does not leave a dead pool cached.
 *
 * @returns {Promise<void>} Resolves once the pool (if any) has been destroyed.
 */
export async function destroyThreadPool() {
  if (!piscina) {
    return;
  }

  // Detach first: nothing may observe the pool while destroy() is pending.
  const pool = piscina;
  piscina = null;

  await pool.destroy();
  logger.info('Thread pool destroyed');
}