@leolionart/n8n-nodes-pdf-extractor 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,46 +1,40 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
3
|
exports.PdfExtractor = void 0;
|
|
37
4
|
const n8n_workflow_1 = require("n8n-workflow");
|
|
38
|
-
|
|
39
|
-
const
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
5
|
+
// Use legacy build for Node.js compatibility
|
|
6
|
+
const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
7
|
+
// Disable worker for Node.js environment
|
|
8
|
+
pdf_mjs_1.GlobalWorkerOptions.workerSrc = '';
|
|
9
|
+
/**
 * Parse a page-range string into a sorted array of unique, 1-based page numbers.
 *
 * Supports: "1-5", "1,3,5", "1-3,7,9-11", or empty for all pages.
 * Pages outside 1..totalPages are dropped and tokens that do not parse as
 * integers are ignored. A reversed range such as "5-3" is treated as "3-5"
 * instead of silently selecting nothing.
 *
 * @param {string|number|null|undefined} pageRange - Range expression. Non-string
 *   values are converted with String(); null/undefined mean "all pages".
 * @param {number} totalPages - Number of pages in the document.
 * @returns {number[]} Page numbers to extract, in ascending order.
 */
function parsePageRange(pageRange, totalPages) {
    // The value may not be a string (e.g. a numeric expression result), so
    // normalise it before calling string methods on it.
    const rangeText = pageRange == null ? '' : String(pageRange);
    if (!rangeText.trim()) {
        // Return all pages
        return Array.from({ length: totalPages }, (_, i) => i + 1);
    }
    const pages = new Set();
    const parts = rangeText.split(',').map(p => p.trim());
    for (const part of parts) {
        if (part.includes('-')) {
            const [start, end] = part.split('-').map(n => Number.parseInt(n.trim(), 10));
            if (Number.isNaN(start) || Number.isNaN(end)) {
                continue;
            }
            // Order the bounds so "5-3" selects the same pages as "3-5",
            // then clamp them to the pages that actually exist.
            const first = Math.max(1, Math.min(start, end));
            const last = Math.min(totalPages, Math.max(start, end));
            for (let i = first; i <= last; i++) {
                pages.add(i);
            }
        }
        else {
            const pageNum = Number.parseInt(part, 10);
            if (!Number.isNaN(pageNum) && pageNum >= 1 && pageNum <= totalPages) {
                pages.add(pageNum);
            }
        }
    }
    return Array.from(pages).sort((a, b) => a - b);
}
|
|
44
38
|
class PdfExtractor {
|
|
45
39
|
constructor() {
|
|
46
40
|
this.description = {
|
|
@@ -50,7 +44,7 @@ class PdfExtractor {
|
|
|
50
44
|
group: ['transform'],
|
|
51
45
|
version: 1,
|
|
52
46
|
subtitle: '={{$parameter["operation"]}}',
|
|
53
|
-
description: 'Extract text from password-protected PDFs
|
|
47
|
+
description: 'Extract text from password-protected PDFs. No external dependencies required - works out of the box in n8n Docker.',
|
|
54
48
|
defaults: {
|
|
55
49
|
name: 'PDF Extractor',
|
|
56
50
|
},
|
|
@@ -69,12 +63,6 @@ class PdfExtractor {
|
|
|
69
63
|
description: 'Extract text content from PDF',
|
|
70
64
|
action: 'Extract text from PDF',
|
|
71
65
|
},
|
|
72
|
-
{
|
|
73
|
-
name: 'Decrypt Only',
|
|
74
|
-
value: 'decrypt',
|
|
75
|
-
description: 'Decrypt PDF and return as binary',
|
|
76
|
-
action: 'Decrypt PDF file',
|
|
77
|
-
},
|
|
78
66
|
],
|
|
79
67
|
default: 'extractText',
|
|
80
68
|
},
|
|
@@ -105,18 +93,30 @@ class PdfExtractor {
|
|
|
105
93
|
default: {},
|
|
106
94
|
options: [
|
|
107
95
|
{
|
|
108
|
-
displayName: '
|
|
109
|
-
name: '
|
|
96
|
+
displayName: 'Join Pages',
|
|
97
|
+
name: 'joinPages',
|
|
110
98
|
type: 'boolean',
|
|
111
99
|
default: true,
|
|
112
|
-
description: 'Whether to
|
|
100
|
+
description: 'Whether to join all pages into a single text output',
|
|
101
|
+
},
|
|
102
|
+
{
|
|
103
|
+
displayName: 'Page Separator',
|
|
104
|
+
name: 'pageSeparator',
|
|
105
|
+
type: 'string',
|
|
106
|
+
default: '\n\n--- Page {page} ---\n\n',
|
|
107
|
+
description: 'Separator between pages when joining. Use {page} for page number.',
|
|
108
|
+
displayOptions: {
|
|
109
|
+
show: {
|
|
110
|
+
joinPages: [true],
|
|
111
|
+
},
|
|
112
|
+
},
|
|
113
113
|
},
|
|
114
114
|
{
|
|
115
115
|
displayName: 'Page Range',
|
|
116
116
|
name: 'pageRange',
|
|
117
117
|
type: 'string',
|
|
118
118
|
default: '',
|
|
119
|
-
placeholder: '1-5',
|
|
119
|
+
placeholder: '1-5 or 1,3,5',
|
|
120
120
|
description: 'Extract specific pages only (e.g., "1-5" or "1,3,5"). Leave empty for all pages.',
|
|
121
121
|
},
|
|
122
122
|
{
|
|
@@ -127,16 +127,16 @@ class PdfExtractor {
|
|
|
127
127
|
description: 'Name of the JSON property to store extracted text',
|
|
128
128
|
},
|
|
129
129
|
{
|
|
130
|
-
displayName: '
|
|
131
|
-
name: '
|
|
132
|
-
type: '
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
130
|
+
displayName: 'Include Page Numbers',
|
|
131
|
+
name: 'includePageNumbers',
|
|
132
|
+
type: 'boolean',
|
|
133
|
+
default: false,
|
|
134
|
+
description: 'Whether to include page numbers in the output when not joining pages',
|
|
135
|
+
displayOptions: {
|
|
136
|
+
show: {
|
|
137
|
+
joinPages: [false],
|
|
138
|
+
},
|
|
139
|
+
},
|
|
140
140
|
},
|
|
141
141
|
],
|
|
142
142
|
},
|
|
@@ -146,112 +146,103 @@ class PdfExtractor {
|
|
|
146
146
|
async execute() {
|
|
147
147
|
const items = this.getInputData();
|
|
148
148
|
const returnData = [];
|
|
149
|
-
// Check if required tools are installed
|
|
150
|
-
try {
|
|
151
|
-
await execAsync('which qpdf');
|
|
152
|
-
await execAsync('which pdftotext');
|
|
153
|
-
}
|
|
154
|
-
catch {
|
|
155
|
-
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Required tools not found. Please install qpdf and poppler-utils in your n8n container:\n' +
|
|
156
|
-
'docker exec -u root n8n apk add --no-cache qpdf poppler-utils');
|
|
157
|
-
}
|
|
158
149
|
for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
|
|
159
150
|
try {
|
|
160
|
-
const operation = this.getNodeParameter('operation', itemIndex);
|
|
161
151
|
const binaryPropertyName = this.getNodeParameter('binaryPropertyName', itemIndex);
|
|
162
152
|
const password = this.getNodeParameter('password', itemIndex);
|
|
163
153
|
const options = this.getNodeParameter('options', itemIndex, {});
|
|
164
154
|
// Validate binary data exists
|
|
165
155
|
const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
|
|
166
156
|
const buffer = await this.helpers.getBinaryDataBuffer(itemIndex, binaryPropertyName);
|
|
167
|
-
//
|
|
168
|
-
const
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
const inputPath = path.join(tempDir, `n8n_pdf_input_${timestamp}_${randomId}.pdf`);
|
|
172
|
-
const decryptedPath = path.join(tempDir, `n8n_pdf_decrypted_${timestamp}_${randomId}.pdf`);
|
|
173
|
-
// Write PDF to temp file
|
|
174
|
-
fs.writeFileSync(inputPath, buffer);
|
|
175
|
-
let pdfPath = inputPath;
|
|
157
|
+
// Convert buffer to Uint8Array for pdfjs
|
|
158
|
+
const pdfData = new Uint8Array(buffer);
|
|
159
|
+
// Load PDF document
|
|
160
|
+
let pdfDocument;
|
|
176
161
|
try {
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
}
|
|
189
|
-
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Failed to decrypt PDF: ${errorMessage}`, { itemIndex });
|
|
190
|
-
}
|
|
191
|
-
}
|
|
192
|
-
if (operation === 'extractText') {
|
|
193
|
-
// Build pdftotext command
|
|
194
|
-
const pdftotextArgs = [];
|
|
195
|
-
if (options.layout !== false) {
|
|
196
|
-
pdftotextArgs.push('-layout');
|
|
197
|
-
}
|
|
198
|
-
if (options.encoding) {
|
|
199
|
-
pdftotextArgs.push(`-enc ${options.encoding}`);
|
|
200
|
-
}
|
|
201
|
-
if (options.pageRange) {
|
|
202
|
-
const pageMatch = options.pageRange.match(/^(\d+)(?:-(\d+))?$/);
|
|
203
|
-
if (pageMatch) {
|
|
204
|
-
pdftotextArgs.push(`-f ${pageMatch[1]}`);
|
|
205
|
-
if (pageMatch[2]) {
|
|
206
|
-
pdftotextArgs.push(`-l ${pageMatch[2]}`);
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
const pdftotextCmd = `pdftotext ${pdftotextArgs.join(' ')} "${pdfPath}" -`;
|
|
211
|
-
const { stdout, stderr } = await execAsync(pdftotextCmd, { maxBuffer: 50 * 1024 * 1024 });
|
|
212
|
-
if (stderr && !stderr.includes('Syntax Warning')) {
|
|
213
|
-
console.warn(`pdftotext warning: ${stderr}`);
|
|
214
|
-
}
|
|
215
|
-
const outputProperty = options.outputProperty || 'text';
|
|
216
|
-
returnData.push({
|
|
217
|
-
json: {
|
|
218
|
-
[outputProperty]: stdout,
|
|
219
|
-
fileName: binaryData.fileName,
|
|
220
|
-
mimeType: binaryData.mimeType,
|
|
221
|
-
fileSize: buffer.length,
|
|
222
|
-
encrypted: !!password,
|
|
223
|
-
},
|
|
224
|
-
pairedItem: { item: itemIndex },
|
|
225
|
-
});
|
|
162
|
+
const loadingTask = (0, pdf_mjs_1.getDocument)({
|
|
163
|
+
data: pdfData,
|
|
164
|
+
password: password || undefined,
|
|
165
|
+
useSystemFonts: true,
|
|
166
|
+
});
|
|
167
|
+
pdfDocument = await loadingTask.promise;
|
|
168
|
+
}
|
|
169
|
+
catch (error) {
|
|
170
|
+
const errorMessage = error.message || String(error);
|
|
171
|
+
if (errorMessage.includes('Invalid password') || errorMessage.includes('Incorrect Password')) {
|
|
172
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Invalid password for PDF file', { itemIndex });
|
|
226
173
|
}
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
const decryptedBuffer = fs.readFileSync(pdfPath);
|
|
230
|
-
const newBinaryData = await this.helpers.prepareBinaryData(decryptedBuffer, binaryData.fileName?.replace('.pdf', '_decrypted.pdf') || 'decrypted.pdf', 'application/pdf');
|
|
231
|
-
returnData.push({
|
|
232
|
-
json: {
|
|
233
|
-
fileName: binaryData.fileName,
|
|
234
|
-
decrypted: true,
|
|
235
|
-
},
|
|
236
|
-
binary: {
|
|
237
|
-
[binaryPropertyName]: newBinaryData,
|
|
238
|
-
},
|
|
239
|
-
pairedItem: { item: itemIndex },
|
|
240
|
-
});
|
|
174
|
+
if (errorMessage.includes('password')) {
|
|
175
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'This PDF is password-protected. Please provide the correct password.', { itemIndex });
|
|
241
176
|
}
|
|
177
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Failed to load PDF: ${errorMessage}`, { itemIndex });
|
|
242
178
|
}
|
|
243
|
-
|
|
244
|
-
|
|
179
|
+
const numPages = pdfDocument.numPages;
|
|
180
|
+
// Parse page range
|
|
181
|
+
const pagesToExtract = parsePageRange(options.pageRange || '', numPages);
|
|
182
|
+
// Extract text from each page
|
|
183
|
+
const pageTexts = [];
|
|
184
|
+
for (const pageNum of pagesToExtract) {
|
|
245
185
|
try {
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
186
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
187
|
+
const textContent = await page.getTextContent();
|
|
188
|
+
// Extract text items and join them
|
|
189
|
+
const pageText = textContent.items
|
|
190
|
+
.filter((item) => 'str' in item)
|
|
191
|
+
.map((item) => item.str)
|
|
192
|
+
.join(' ')
|
|
193
|
+
.replace(/\s+/g, ' ')
|
|
194
|
+
.trim();
|
|
195
|
+
pageTexts.push({ page: pageNum, text: pageText });
|
|
250
196
|
}
|
|
251
|
-
catch {
|
|
252
|
-
//
|
|
197
|
+
catch (pageError) {
|
|
198
|
+
// Continue with other pages if one fails
|
|
199
|
+
console.warn(`Failed to extract text from page ${pageNum}: ${pageError}`);
|
|
200
|
+
pageTexts.push({ page: pageNum, text: '' });
|
|
253
201
|
}
|
|
254
202
|
}
|
|
203
|
+
const outputProperty = options.outputProperty || 'text';
|
|
204
|
+
const joinPages = options.joinPages !== false; // Default to true
|
|
205
|
+
let outputData;
|
|
206
|
+
if (joinPages) {
|
|
207
|
+
// Join all pages with separator
|
|
208
|
+
const separator = options.pageSeparator || '\n\n--- Page {page} ---\n\n';
|
|
209
|
+
const fullText = pageTexts
|
|
210
|
+
.map((p, index) => {
|
|
211
|
+
if (index === 0) {
|
|
212
|
+
return p.text;
|
|
213
|
+
}
|
|
214
|
+
return separator.replace('{page}', String(p.page)) + p.text;
|
|
215
|
+
})
|
|
216
|
+
.join('');
|
|
217
|
+
outputData = {
|
|
218
|
+
[outputProperty]: fullText,
|
|
219
|
+
fileName: binaryData.fileName,
|
|
220
|
+
mimeType: binaryData.mimeType,
|
|
221
|
+
fileSize: buffer.length,
|
|
222
|
+
totalPages: numPages,
|
|
223
|
+
extractedPages: pagesToExtract.length,
|
|
224
|
+
encrypted: !!password,
|
|
225
|
+
};
|
|
226
|
+
}
|
|
227
|
+
else {
|
|
228
|
+
// Return array of pages
|
|
229
|
+
const pagesOutput = options.includePageNumbers
|
|
230
|
+
? pageTexts
|
|
231
|
+
: pageTexts.map(p => p.text);
|
|
232
|
+
outputData = {
|
|
233
|
+
[outputProperty]: pagesOutput,
|
|
234
|
+
fileName: binaryData.fileName,
|
|
235
|
+
mimeType: binaryData.mimeType,
|
|
236
|
+
fileSize: buffer.length,
|
|
237
|
+
totalPages: numPages,
|
|
238
|
+
extractedPages: pagesToExtract.length,
|
|
239
|
+
encrypted: !!password,
|
|
240
|
+
};
|
|
241
|
+
}
|
|
242
|
+
returnData.push({
|
|
243
|
+
json: outputData,
|
|
244
|
+
pairedItem: { item: itemIndex },
|
|
245
|
+
});
|
|
255
246
|
}
|
|
256
247
|
catch (error) {
|
|
257
248
|
if (this.continueOnFail()) {
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@leolionart/n8n-nodes-pdf-extractor",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "n8n community node to extract text from password-protected PDFs
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "n8n community node to extract text from password-protected PDFs - no external dependencies required",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"n8n-community-node-package",
|
|
7
7
|
"n8n",
|
|
@@ -9,8 +9,7 @@
|
|
|
9
9
|
"extract",
|
|
10
10
|
"password",
|
|
11
11
|
"decrypt",
|
|
12
|
-
"
|
|
13
|
-
"qpdf"
|
|
12
|
+
"text-extraction"
|
|
14
13
|
],
|
|
15
14
|
"license": "MIT",
|
|
16
15
|
"homepage": "https://github.com/pntai/n8n-nodes-pdf-extractor",
|
|
@@ -58,5 +57,8 @@
|
|
|
58
57
|
},
|
|
59
58
|
"peerDependencies": {
|
|
60
59
|
"n8n-workflow": "*"
|
|
60
|
+
},
|
|
61
|
+
"dependencies": {
|
|
62
|
+
"pdfjs-dist": "^4.9.155"
|
|
61
63
|
}
|
|
62
64
|
}
|