@leolionart/n8n-nodes-pdf-extractor 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,46 +1,38 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
-
var ownKeys = function(o) {
|
|
20
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
-
var ar = [];
|
|
22
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
-
return ar;
|
|
24
|
-
};
|
|
25
|
-
return ownKeys(o);
|
|
26
|
-
};
|
|
27
|
-
return function (mod) {
|
|
28
|
-
if (mod && mod.__esModule) return mod;
|
|
29
|
-
var result = {};
|
|
30
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
-
__setModuleDefault(result, mod);
|
|
32
|
-
return result;
|
|
33
|
-
};
|
|
34
|
-
})();
|
|
35
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
3
|
exports.PdfExtractor = void 0;
|
|
37
4
|
const n8n_workflow_1 = require("n8n-workflow");
|
|
38
|
-
|
|
39
|
-
const
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
5
|
+
// unpdf provides a simple API for PDF text extraction with password support
|
|
6
|
+
const unpdf_1 = require("unpdf");
|
|
7
|
+
/**
 * Parse a page-range expression into a sorted list of unique, 1-based page numbers.
 *
 * Supported syntax (comma-separated, whitespace-tolerant):
 *   - single pages:  "1,3,5"
 *   - ranges:        "1-5" (a reversed range such as "5-1" is treated as "1-5")
 *   - any mix:       "1-3,7,9-11"
 *
 * A blank, null or undefined expression selects every page. Parts that cannot
 * be parsed, and page numbers outside 1..totalPages, are ignored; ranges are
 * clamped to 1..totalPages.
 *
 * @param {string|number|null|undefined} pageRange - Page selection expression.
 *   Non-string values are coerced with String() so that a numeric value
 *   (e.g. 3) selects that page instead of throwing.
 * @param {number} totalPages - Number of pages in the document.
 * @returns {number[]} Ascending list of distinct page numbers to extract.
 */
function parsePageRange(pageRange, totalPages) {
    // The caller forwards the raw option value, which is not guaranteed to be
    // a string; calling .trim() on it directly would throw a TypeError.
    const spec = pageRange == null ? '' : String(pageRange).trim();
    if (spec === '') {
        // No restriction given: select all pages.
        return Array.from({ length: totalPages }, (_, i) => i + 1);
    }
    const pages = new Set();
    for (const rawPart of spec.split(',')) {
        const part = rawPart.trim();
        if (part.includes('-')) {
            const [start, end] = part.split('-').map((n) => Number.parseInt(n.trim(), 10));
            if (Number.isNaN(start) || Number.isNaN(end)) {
                continue;
            }
            // Accept "5-3" as well as "3-5" rather than silently selecting nothing.
            const first = Math.max(1, Math.min(start, end));
            const last = Math.min(totalPages, Math.max(start, end));
            for (let page = first; page <= last; page++) {
                pages.add(page);
            }
        }
        else {
            const pageNum = Number.parseInt(part, 10);
            if (!Number.isNaN(pageNum) && pageNum >= 1 && pageNum <= totalPages) {
                pages.add(pageNum);
            }
        }
    }
    return Array.from(pages).sort((a, b) => a - b);
}
|
|
44
36
|
class PdfExtractor {
|
|
45
37
|
constructor() {
|
|
46
38
|
this.description = {
|
|
@@ -50,7 +42,7 @@ class PdfExtractor {
|
|
|
50
42
|
group: ['transform'],
|
|
51
43
|
version: 1,
|
|
52
44
|
subtitle: '={{$parameter["operation"]}}',
|
|
53
|
-
description: 'Extract text from password-protected PDFs
|
|
45
|
+
description: 'Extract text from password-protected PDFs. No external dependencies required - works out of the box in n8n Docker.',
|
|
54
46
|
defaults: {
|
|
55
47
|
name: 'PDF Extractor',
|
|
56
48
|
},
|
|
@@ -69,12 +61,6 @@ class PdfExtractor {
|
|
|
69
61
|
description: 'Extract text content from PDF',
|
|
70
62
|
action: 'Extract text from PDF',
|
|
71
63
|
},
|
|
72
|
-
{
|
|
73
|
-
name: 'Decrypt Only',
|
|
74
|
-
value: 'decrypt',
|
|
75
|
-
description: 'Decrypt PDF and return as binary',
|
|
76
|
-
action: 'Decrypt PDF file',
|
|
77
|
-
},
|
|
78
64
|
],
|
|
79
65
|
default: 'extractText',
|
|
80
66
|
},
|
|
@@ -105,18 +91,30 @@ class PdfExtractor {
|
|
|
105
91
|
default: {},
|
|
106
92
|
options: [
|
|
107
93
|
{
|
|
108
|
-
displayName: '
|
|
109
|
-
name: '
|
|
94
|
+
displayName: 'Join Pages',
|
|
95
|
+
name: 'joinPages',
|
|
110
96
|
type: 'boolean',
|
|
111
97
|
default: true,
|
|
112
|
-
description: 'Whether to
|
|
98
|
+
description: 'Whether to join all pages into a single text output',
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
displayName: 'Page Separator',
|
|
102
|
+
name: 'pageSeparator',
|
|
103
|
+
type: 'string',
|
|
104
|
+
default: '\n\n--- Page {page} ---\n\n',
|
|
105
|
+
description: 'Separator between pages when joining. Use {page} for page number.',
|
|
106
|
+
displayOptions: {
|
|
107
|
+
show: {
|
|
108
|
+
joinPages: [true],
|
|
109
|
+
},
|
|
110
|
+
},
|
|
113
111
|
},
|
|
114
112
|
{
|
|
115
113
|
displayName: 'Page Range',
|
|
116
114
|
name: 'pageRange',
|
|
117
115
|
type: 'string',
|
|
118
116
|
default: '',
|
|
119
|
-
placeholder: '1-5',
|
|
117
|
+
placeholder: '1-5 or 1,3,5',
|
|
120
118
|
description: 'Extract specific pages only (e.g., "1-5" or "1,3,5"). Leave empty for all pages.',
|
|
121
119
|
},
|
|
122
120
|
{
|
|
@@ -127,16 +125,16 @@ class PdfExtractor {
|
|
|
127
125
|
description: 'Name of the JSON property to store extracted text',
|
|
128
126
|
},
|
|
129
127
|
{
|
|
130
|
-
displayName: '
|
|
131
|
-
name: '
|
|
132
|
-
type: '
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
128
|
+
displayName: 'Include Page Numbers',
|
|
129
|
+
name: 'includePageNumbers',
|
|
130
|
+
type: 'boolean',
|
|
131
|
+
default: false,
|
|
132
|
+
description: 'Whether to include page numbers in the output when not joining pages',
|
|
133
|
+
displayOptions: {
|
|
134
|
+
show: {
|
|
135
|
+
joinPages: [false],
|
|
136
|
+
},
|
|
137
|
+
},
|
|
140
138
|
},
|
|
141
139
|
],
|
|
142
140
|
},
|
|
@@ -146,112 +144,98 @@ class PdfExtractor {
|
|
|
146
144
|
async execute() {
|
|
147
145
|
const items = this.getInputData();
|
|
148
146
|
const returnData = [];
|
|
149
|
-
// Check if required tools are installed
|
|
150
|
-
try {
|
|
151
|
-
await execAsync('which qpdf');
|
|
152
|
-
await execAsync('which pdftotext');
|
|
153
|
-
}
|
|
154
|
-
catch {
|
|
155
|
-
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Required tools not found. Please install qpdf and poppler-utils in your n8n container:\n' +
|
|
156
|
-
'docker exec -u root n8n apk add --no-cache qpdf poppler-utils');
|
|
157
|
-
}
|
|
158
147
|
for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
|
|
159
148
|
try {
|
|
160
|
-
const operation = this.getNodeParameter('operation', itemIndex);
|
|
161
149
|
const binaryPropertyName = this.getNodeParameter('binaryPropertyName', itemIndex);
|
|
162
150
|
const password = this.getNodeParameter('password', itemIndex);
|
|
163
151
|
const options = this.getNodeParameter('options', itemIndex, {});
|
|
164
152
|
// Validate binary data exists
|
|
165
153
|
const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
|
|
166
154
|
const buffer = await this.helpers.getBinaryDataBuffer(itemIndex, binaryPropertyName);
|
|
167
|
-
//
|
|
168
|
-
const
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
const inputPath = path.join(tempDir, `n8n_pdf_input_${timestamp}_${randomId}.pdf`);
|
|
172
|
-
const decryptedPath = path.join(tempDir, `n8n_pdf_decrypted_${timestamp}_${randomId}.pdf`);
|
|
173
|
-
// Write PDF to temp file
|
|
174
|
-
fs.writeFileSync(inputPath, buffer);
|
|
175
|
-
let pdfPath = inputPath;
|
|
155
|
+
// Convert buffer to Uint8Array
|
|
156
|
+
const pdfData = new Uint8Array(buffer);
|
|
157
|
+
// Get document info first to know total pages
|
|
158
|
+
let pdf;
|
|
176
159
|
try {
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
if (errorMessage.includes('invalid password')) {
|
|
187
|
-
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Invalid password for PDF file', { itemIndex });
|
|
188
|
-
}
|
|
189
|
-
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Failed to decrypt PDF: ${errorMessage}`, { itemIndex });
|
|
190
|
-
}
|
|
191
|
-
}
|
|
192
|
-
if (operation === 'extractText') {
|
|
193
|
-
// Build pdftotext command
|
|
194
|
-
const pdftotextArgs = [];
|
|
195
|
-
if (options.layout !== false) {
|
|
196
|
-
pdftotextArgs.push('-layout');
|
|
197
|
-
}
|
|
198
|
-
if (options.encoding) {
|
|
199
|
-
pdftotextArgs.push(`-enc ${options.encoding}`);
|
|
200
|
-
}
|
|
201
|
-
if (options.pageRange) {
|
|
202
|
-
const pageMatch = options.pageRange.match(/^(\d+)(?:-(\d+))?$/);
|
|
203
|
-
if (pageMatch) {
|
|
204
|
-
pdftotextArgs.push(`-f ${pageMatch[1]}`);
|
|
205
|
-
if (pageMatch[2]) {
|
|
206
|
-
pdftotextArgs.push(`-l ${pageMatch[2]}`);
|
|
207
|
-
}
|
|
208
|
-
}
|
|
160
|
+
pdf = await (0, unpdf_1.getDocumentProxy)(pdfData, {
|
|
161
|
+
password: password || undefined,
|
|
162
|
+
});
|
|
163
|
+
}
|
|
164
|
+
catch (error) {
|
|
165
|
+
const errorMessage = error.message || String(error);
|
|
166
|
+
if (errorMessage.toLowerCase().includes('password')) {
|
|
167
|
+
if (password) {
|
|
168
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Invalid password for PDF file', { itemIndex });
|
|
209
169
|
}
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
if (stderr && !stderr.includes('Syntax Warning')) {
|
|
213
|
-
console.warn(`pdftotext warning: ${stderr}`);
|
|
170
|
+
else {
|
|
171
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'This PDF is password-protected. Please provide the password.', { itemIndex });
|
|
214
172
|
}
|
|
215
|
-
const outputProperty = options.outputProperty || 'text';
|
|
216
|
-
returnData.push({
|
|
217
|
-
json: {
|
|
218
|
-
[outputProperty]: stdout,
|
|
219
|
-
fileName: binaryData.fileName,
|
|
220
|
-
mimeType: binaryData.mimeType,
|
|
221
|
-
fileSize: buffer.length,
|
|
222
|
-
encrypted: !!password,
|
|
223
|
-
},
|
|
224
|
-
pairedItem: { item: itemIndex },
|
|
225
|
-
});
|
|
226
|
-
}
|
|
227
|
-
else if (operation === 'decrypt') {
|
|
228
|
-
// Read decrypted PDF and return as binary
|
|
229
|
-
const decryptedBuffer = fs.readFileSync(pdfPath);
|
|
230
|
-
const newBinaryData = await this.helpers.prepareBinaryData(decryptedBuffer, binaryData.fileName?.replace('.pdf', '_decrypted.pdf') || 'decrypted.pdf', 'application/pdf');
|
|
231
|
-
returnData.push({
|
|
232
|
-
json: {
|
|
233
|
-
fileName: binaryData.fileName,
|
|
234
|
-
decrypted: true,
|
|
235
|
-
},
|
|
236
|
-
binary: {
|
|
237
|
-
[binaryPropertyName]: newBinaryData,
|
|
238
|
-
},
|
|
239
|
-
pairedItem: { item: itemIndex },
|
|
240
|
-
});
|
|
241
173
|
}
|
|
174
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Failed to load PDF: ${errorMessage}`, { itemIndex });
|
|
242
175
|
}
|
|
243
|
-
|
|
244
|
-
|
|
176
|
+
const numPages = pdf.numPages;
|
|
177
|
+
const pagesToExtract = parsePageRange(options.pageRange || '', numPages);
|
|
178
|
+
// Extract text from selected pages
|
|
179
|
+
const pageTexts = [];
|
|
180
|
+
for (const pageNum of pagesToExtract) {
|
|
245
181
|
try {
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
182
|
+
const page = await pdf.getPage(pageNum);
|
|
183
|
+
const textContent = await page.getTextContent();
|
|
184
|
+
// Join text items
|
|
185
|
+
const pageText = textContent.items
|
|
186
|
+
.filter((item) => typeof item === 'object' && item !== null && 'str' in item)
|
|
187
|
+
.map((item) => item.str)
|
|
188
|
+
.join(' ')
|
|
189
|
+
.replace(/\s+/g, ' ')
|
|
190
|
+
.trim();
|
|
191
|
+
pageTexts.push({ page: pageNum, text: pageText });
|
|
250
192
|
}
|
|
251
|
-
catch {
|
|
252
|
-
|
|
193
|
+
catch (pageError) {
|
|
194
|
+
console.warn(`Failed to extract page ${pageNum}: ${pageError}`);
|
|
195
|
+
pageTexts.push({ page: pageNum, text: '' });
|
|
253
196
|
}
|
|
254
197
|
}
|
|
198
|
+
const outputProperty = options.outputProperty || 'text';
|
|
199
|
+
const joinPages = options.joinPages !== false;
|
|
200
|
+
let outputData;
|
|
201
|
+
if (joinPages) {
|
|
202
|
+
const separator = options.pageSeparator || '\n\n--- Page {page} ---\n\n';
|
|
203
|
+
const fullText = pageTexts
|
|
204
|
+
.map((p, index) => {
|
|
205
|
+
if (index === 0) {
|
|
206
|
+
return p.text;
|
|
207
|
+
}
|
|
208
|
+
return separator.replace('{page}', String(p.page)) + p.text;
|
|
209
|
+
})
|
|
210
|
+
.join('');
|
|
211
|
+
outputData = {
|
|
212
|
+
[outputProperty]: fullText,
|
|
213
|
+
fileName: binaryData.fileName,
|
|
214
|
+
mimeType: binaryData.mimeType,
|
|
215
|
+
fileSize: buffer.length,
|
|
216
|
+
totalPages: numPages,
|
|
217
|
+
extractedPages: pagesToExtract.length,
|
|
218
|
+
encrypted: !!password,
|
|
219
|
+
};
|
|
220
|
+
}
|
|
221
|
+
else {
|
|
222
|
+
const pagesOutput = options.includePageNumbers
|
|
223
|
+
? pageTexts
|
|
224
|
+
: pageTexts.map(p => p.text);
|
|
225
|
+
outputData = {
|
|
226
|
+
[outputProperty]: pagesOutput,
|
|
227
|
+
fileName: binaryData.fileName,
|
|
228
|
+
mimeType: binaryData.mimeType,
|
|
229
|
+
fileSize: buffer.length,
|
|
230
|
+
totalPages: numPages,
|
|
231
|
+
extractedPages: pagesToExtract.length,
|
|
232
|
+
encrypted: !!password,
|
|
233
|
+
};
|
|
234
|
+
}
|
|
235
|
+
returnData.push({
|
|
236
|
+
json: outputData,
|
|
237
|
+
pairedItem: { item: itemIndex },
|
|
238
|
+
});
|
|
255
239
|
}
|
|
256
240
|
catch (error) {
|
|
257
241
|
if (this.continueOnFail()) {
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@leolionart/n8n-nodes-pdf-extractor",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "n8n community node to extract text from password-protected PDFs
|
|
3
|
+
"version": "1.2.0",
|
|
4
|
+
"description": "n8n community node to extract text from password-protected PDFs - no external dependencies required",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"n8n-community-node-package",
|
|
7
7
|
"n8n",
|
|
@@ -9,8 +9,7 @@
|
|
|
9
9
|
"extract",
|
|
10
10
|
"password",
|
|
11
11
|
"decrypt",
|
|
12
|
-
"
|
|
13
|
-
"qpdf"
|
|
12
|
+
"text-extraction"
|
|
14
13
|
],
|
|
15
14
|
"license": "MIT",
|
|
16
15
|
"homepage": "https://github.com/pntai/n8n-nodes-pdf-extractor",
|
|
@@ -58,5 +57,8 @@
|
|
|
58
57
|
},
|
|
59
58
|
"peerDependencies": {
|
|
60
59
|
"n8n-workflow": "*"
|
|
60
|
+
},
|
|
61
|
+
"dependencies": {
|
|
62
|
+
"unpdf": "^0.12.1"
|
|
61
63
|
}
|
|
62
64
|
}
|