@leolionart/n8n-nodes-pdf-extractor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 NAAI Studio
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,127 @@
1
+ # n8n-nodes-pdf-extractor
2
+
3
+ This is an n8n community node that extracts text from **password-protected PDFs** reliably using `qpdf` and `pdftotext` command-line tools.
4
+
5
+ This node was created to solve the [known crashing issue](https://github.com/n8n-io/n8n/issues/23754) with the built-in "Extract from File" PDF node.
6
+
7
+ [n8n](https://n8n.io/) is a [fair-code licensed](https://docs.n8n.io/reference/license/) workflow automation platform.
8
+
9
+ ## Features
10
+
11
+ - ✅ **Extract text** from password-protected PDFs
12
+ - ✅ **Decrypt PDFs** and return as binary for further processing
13
+ - ✅ **No crashes** - uses battle-tested command-line tools instead of buggy JavaScript libraries
14
+ - ✅ **Layout preservation** - maintains original text positioning
15
+ - ✅ **Page range selection** - extract specific pages only
16
+ - ✅ **Multiple encodings** - UTF-8, Latin1, ASCII7
17
+
18
+ ## Prerequisites
19
+
20
+ Before using this node, you **must install** the required tools in your n8n container:
21
+
22
+ ```bash
23
+ docker exec -u root n8n apk add --no-cache qpdf poppler-utils
24
+ ```
25
+
26
+ For **persistent installation**, add this to your Docker Compose file:
27
+
28
+ ```yaml
29
+ services:
30
+ n8n:
31
+ image: n8nio/n8n:latest
32
+ # ... other config
33
+ entrypoint: /bin/sh
34
+ command:
35
+ - -c
36
+ - |
37
+ apk add --no-cache qpdf poppler-utils
38
+ exec tini -- /docker-entrypoint.sh
39
+ ```
40
+
41
+ ## Installation
42
+
43
+ ### Via n8n UI (Recommended)
44
+
45
+ 1. Go to **Settings** → **Community Nodes**
46
+ 2. Click **Install**
47
+ 3. Enter: `@leolionart/n8n-nodes-pdf-extractor`
48
+ 4. Click **Install**
49
+
50
+ ### Via npm
51
+
52
+ ```bash
53
+ cd ~/.n8n/nodes
54
+ npm install @leolionart/n8n-nodes-pdf-extractor
55
+ ```
56
+
57
+ ## Operations
58
+
59
+ ### Extract Text
60
+
61
+ Extracts text content from a PDF file.
62
+
63
+ **Parameters:**
64
+ - **Binary Property**: Name of the binary property containing the PDF (default: `data`)
65
+ - **Password**: Password to decrypt the PDF (leave empty if not encrypted)
66
+
67
+ **Options:**
68
+ - **Layout Mode**: Maintain original text layout (default: true)
69
+ - **Page Range**: Extract specific pages (e.g., "1-5" or "1,3,5")
70
+ - **Output Property**: JSON property name for extracted text (default: `text`)
71
+ - **Encoding**: Text encoding (UTF-8, Latin1, ASCII7)
72
+
73
+ ### Decrypt Only
74
+
75
+ Decrypts a password-protected PDF and returns it as a binary file for further processing.
76
+
77
+ ## Example Usage
78
+
79
+ ### Extract text from bank statement
80
+
81
+ ```
82
+ [Gmail Trigger] → [PDF Extractor] → [AI/LLM] → [Google Sheets]
83
+ ```
84
+
85
+ 1. Gmail Trigger receives email with PDF attachment
86
+ 2. PDF Extractor extracts text with password
87
+ 3. AI extracts structured data
88
+ 4. Save to Google Sheets
89
+
90
+ ## Why This Node?
91
+
92
+ The built-in n8n "Extract from File" node uses the `pdf-parse` JavaScript library, which:
93
+
94
+ - ❌ Crashes n8n container with certain PDF encryption types
95
+ - ❌ Causes "SIGILL" errors on Alpine Linux
96
+ - ❌ Has memory issues with large PDFs
97
+
98
+ This node uses:
99
+
100
+ - ✅ **qpdf** - Industry-standard PDF manipulation tool
101
+ - ✅ **pdftotext** (poppler-utils) - Robust text extraction from PDFs
102
+
103
+ ## Troubleshooting
104
+
105
+ ### "Required tools not found"
106
+
107
+ Install the required tools:
108
+ ```bash
109
+ docker exec -u root n8n apk add --no-cache qpdf poppler-utils
110
+ ```
111
+
112
+ ### "Invalid password for PDF file"
113
+
114
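+ Check that the password is correct. Note that a PDF can have two different passwords (an owner password and a user password), so make sure you are supplying the one you were given for opening the file.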
115
+
116
+ ### Empty text output
117
+
118
+ The PDF might be scanned/image-based. This node extracts text layers only. For scanned PDFs, use OCR tools.
119
+
120
+ ## Resources
121
+
122
+ - [n8n community nodes documentation](https://docs.n8n.io/integrations/community-nodes/)
123
+ - [GitHub Issue #23754 - PDF crash bug](https://github.com/n8n-io/n8n/issues/23754)
124
+
125
+ ## License
126
+
127
+ [MIT](LICENSE)
@@ -0,0 +1,5 @@
1
+ import { IExecuteFunctions, INodeExecutionData, INodeType, INodeTypeDescription } from 'n8n-workflow';
2
/**
 * n8n node that extracts text from, or decrypts, PDF files by delegating to
 * the `qpdf` and `pdftotext` command-line tools.
 */
export declare class PdfExtractor implements INodeType {
    /**
     * Processes every input item and resolves to a single output branch
     * containing the result items.
     */
    execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]>;
    /** Node metadata and parameter definitions shown in the n8n editor. */
    description: INodeTypeDescription;
}
@@ -0,0 +1,273 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.PdfExtractor = void 0;
37
+ const n8n_workflow_1 = require("n8n-workflow");
38
+ const child_process_1 = require("child_process");
39
+ const util_1 = require("util");
40
+ const fs = __importStar(require("fs"));
41
+ const path = __importStar(require("path"));
42
+ const os = __importStar(require("os"));
43
+ const execAsync = (0, util_1.promisify)(child_process_1.exec);
44
+ class PdfExtractor {
45
+ constructor() {
46
+ this.description = {
47
+ displayName: 'PDF Extractor',
48
+ name: 'pdfExtractor',
49
+ icon: 'file:pdf-extractor.svg',
50
+ group: ['transform'],
51
+ version: 1,
52
+ subtitle: '={{$parameter["operation"]}}',
53
+ description: 'Extract text from password-protected PDFs using qpdf and pdftotext. Requires qpdf and poppler-utils installed in the n8n container.',
54
+ defaults: {
55
+ name: 'PDF Extractor',
56
+ },
57
+ inputs: ['main'],
58
+ outputs: ['main'],
59
+ properties: [
60
+ {
61
+ displayName: 'Operation',
62
+ name: 'operation',
63
+ type: 'options',
64
+ noDataExpression: true,
65
+ options: [
66
+ {
67
+ name: 'Extract Text',
68
+ value: 'extractText',
69
+ description: 'Extract text content from PDF',
70
+ action: 'Extract text from PDF',
71
+ },
72
+ {
73
+ name: 'Decrypt Only',
74
+ value: 'decrypt',
75
+ description: 'Decrypt PDF and return as binary',
76
+ action: 'Decrypt PDF file',
77
+ },
78
+ ],
79
+ default: 'extractText',
80
+ },
81
+ {
82
+ displayName: 'Binary Property',
83
+ name: 'binaryPropertyName',
84
+ type: 'string',
85
+ default: 'data',
86
+ required: true,
87
+ description: 'Name of the binary property containing the PDF file',
88
+ placeholder: 'data',
89
+ },
90
+ {
91
+ displayName: 'Password',
92
+ name: 'password',
93
+ type: 'string',
94
+ typeOptions: {
95
+ password: true,
96
+ },
97
+ default: '',
98
+ description: 'Password to decrypt the PDF. Leave empty if the PDF is not encrypted.',
99
+ },
100
+ {
101
+ displayName: 'Options',
102
+ name: 'options',
103
+ type: 'collection',
104
+ placeholder: 'Add Option',
105
+ default: {},
106
+ options: [
107
+ {
108
+ displayName: 'Layout Mode',
109
+ name: 'layout',
110
+ type: 'boolean',
111
+ default: true,
112
+ description: 'Whether to maintain the original physical layout of the text',
113
+ },
114
+ {
115
+ displayName: 'Page Range',
116
+ name: 'pageRange',
117
+ type: 'string',
118
+ default: '',
119
+ placeholder: '1-5',
120
+ description: 'Extract specific pages only (e.g., "1-5" or "1,3,5"). Leave empty for all pages.',
121
+ },
122
+ {
123
+ displayName: 'Output Property',
124
+ name: 'outputProperty',
125
+ type: 'string',
126
+ default: 'text',
127
+ description: 'Name of the JSON property to store extracted text',
128
+ },
129
+ {
130
+ displayName: 'Encoding',
131
+ name: 'encoding',
132
+ type: 'options',
133
+ options: [
134
+ { name: 'UTF-8', value: 'UTF-8' },
135
+ { name: 'Latin1', value: 'Latin1' },
136
+ { name: 'ASCII7', value: 'ASCII7' },
137
+ ],
138
+ default: 'UTF-8',
139
+ description: 'Text encoding for output',
140
+ },
141
+ ],
142
+ },
143
+ ],
144
+ };
145
+ }
146
+ async execute() {
147
+ const items = this.getInputData();
148
+ const returnData = [];
149
+ // Check if required tools are installed
150
+ try {
151
+ await execAsync('which qpdf');
152
+ await execAsync('which pdftotext');
153
+ }
154
+ catch {
155
+ throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Required tools not found. Please install qpdf and poppler-utils in your n8n container:\n' +
156
+ 'docker exec -u root n8n apk add --no-cache qpdf poppler-utils');
157
+ }
158
+ for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
159
+ try {
160
+ const operation = this.getNodeParameter('operation', itemIndex);
161
+ const binaryPropertyName = this.getNodeParameter('binaryPropertyName', itemIndex);
162
+ const password = this.getNodeParameter('password', itemIndex);
163
+ const options = this.getNodeParameter('options', itemIndex, {});
164
+ // Validate binary data exists
165
+ const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
166
+ const buffer = await this.helpers.getBinaryDataBuffer(itemIndex, binaryPropertyName);
167
+ // Create temp files with unique names
168
+ const tempDir = os.tmpdir();
169
+ const timestamp = Date.now();
170
+ const randomId = Math.random().toString(36).substring(7);
171
+ const inputPath = path.join(tempDir, `n8n_pdf_input_${timestamp}_${randomId}.pdf`);
172
+ const decryptedPath = path.join(tempDir, `n8n_pdf_decrypted_${timestamp}_${randomId}.pdf`);
173
+ // Write PDF to temp file
174
+ fs.writeFileSync(inputPath, buffer);
175
+ let pdfPath = inputPath;
176
+ try {
177
+ // Decrypt if password provided
178
+ if (password) {
179
+ const qpdfCmd = `qpdf --decrypt --password="${password.replace(/"/g, '\\"')}" "${inputPath}" "${decryptedPath}"`;
180
+ try {
181
+ await execAsync(qpdfCmd);
182
+ pdfPath = decryptedPath;
183
+ }
184
+ catch (error) {
185
+ const errorMessage = error.message || String(error);
186
+ if (errorMessage.includes('invalid password')) {
187
+ throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Invalid password for PDF file', { itemIndex });
188
+ }
189
+ throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Failed to decrypt PDF: ${errorMessage}`, { itemIndex });
190
+ }
191
+ }
192
+ if (operation === 'extractText') {
193
+ // Build pdftotext command
194
+ const pdftotextArgs = [];
195
+ if (options.layout !== false) {
196
+ pdftotextArgs.push('-layout');
197
+ }
198
+ if (options.encoding) {
199
+ pdftotextArgs.push(`-enc ${options.encoding}`);
200
+ }
201
+ if (options.pageRange) {
202
+ const pageMatch = options.pageRange.match(/^(\d+)(?:-(\d+))?$/);
203
+ if (pageMatch) {
204
+ pdftotextArgs.push(`-f ${pageMatch[1]}`);
205
+ if (pageMatch[2]) {
206
+ pdftotextArgs.push(`-l ${pageMatch[2]}`);
207
+ }
208
+ }
209
+ }
210
+ const pdftotextCmd = `pdftotext ${pdftotextArgs.join(' ')} "${pdfPath}" -`;
211
+ const { stdout, stderr } = await execAsync(pdftotextCmd, { maxBuffer: 50 * 1024 * 1024 });
212
+ if (stderr && !stderr.includes('Syntax Warning')) {
213
+ console.warn(`pdftotext warning: ${stderr}`);
214
+ }
215
+ const outputProperty = options.outputProperty || 'text';
216
+ returnData.push({
217
+ json: {
218
+ [outputProperty]: stdout,
219
+ fileName: binaryData.fileName,
220
+ mimeType: binaryData.mimeType,
221
+ fileSize: buffer.length,
222
+ encrypted: !!password,
223
+ },
224
+ pairedItem: { item: itemIndex },
225
+ });
226
+ }
227
+ else if (operation === 'decrypt') {
228
+ // Read decrypted PDF and return as binary
229
+ const decryptedBuffer = fs.readFileSync(pdfPath);
230
+ const newBinaryData = await this.helpers.prepareBinaryData(decryptedBuffer, binaryData.fileName?.replace('.pdf', '_decrypted.pdf') || 'decrypted.pdf', 'application/pdf');
231
+ returnData.push({
232
+ json: {
233
+ fileName: binaryData.fileName,
234
+ decrypted: true,
235
+ },
236
+ binary: {
237
+ [binaryPropertyName]: newBinaryData,
238
+ },
239
+ pairedItem: { item: itemIndex },
240
+ });
241
+ }
242
+ }
243
+ finally {
244
+ // Cleanup temp files
245
+ try {
246
+ if (fs.existsSync(inputPath))
247
+ fs.unlinkSync(inputPath);
248
+ if (fs.existsSync(decryptedPath))
249
+ fs.unlinkSync(decryptedPath);
250
+ }
251
+ catch {
252
+ // Ignore cleanup errors
253
+ }
254
+ }
255
+ }
256
+ catch (error) {
257
+ if (this.continueOnFail()) {
258
+ returnData.push({
259
+ json: {
260
+ error: error.message,
261
+ success: false,
262
+ },
263
+ pairedItem: { item: itemIndex },
264
+ });
265
+ continue;
266
+ }
267
+ throw error;
268
+ }
269
+ }
270
+ return [returnData];
271
+ }
272
+ }
273
+ exports.PdfExtractor = PdfExtractor;
@@ -0,0 +1,11 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64" fill="none">
2
+ <rect width="64" height="64" rx="8" fill="#E53935"/>
3
+ <path d="M20 12h16l12 12v28a4 4 0 01-4 4H20a4 4 0 01-4-4V16a4 4 0 014-4z" fill="white"/>
4
+ <path d="M36 12v12h12" fill="none" stroke="#E53935" stroke-width="2"/>
5
+ <path d="M24 32h16M24 38h12M24 44h8" stroke="#E53935" stroke-width="2" stroke-linecap="round"/>
6
+ <circle cx="48" cy="48" r="12" fill="#4CAF50"/>
7
+ <path d="M44 48h8M48 44v8" stroke="white" stroke-width="2" stroke-linecap="round"/>
8
+ <rect x="42" y="26" width="8" height="10" rx="1" fill="#FFC107"/>
9
+ <circle cx="46" cy="36" r="2" fill="#795548"/>
10
+ <path d="M46 38v4" stroke="#795548" stroke-width="1.5"/>
11
+ </svg>
package/package.json ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "name": "@leolionart/n8n-nodes-pdf-extractor",
3
+ "version": "1.0.0",
4
+ "description": "n8n community node to extract text from password-protected PDFs using qpdf and pdftotext",
5
+ "keywords": [
6
+ "n8n-community-node-package",
7
+ "n8n",
8
+ "pdf",
9
+ "extract",
10
+ "password",
11
+ "decrypt",
12
+ "pdftotext",
13
+ "qpdf"
14
+ ],
15
+ "license": "MIT",
16
+ "homepage": "https://github.com/pntai/n8n-nodes-pdf-extractor",
17
+ "author": {
18
+ "name": "NAAI Studio",
19
+ "email": "art.leolion@gmail.com"
20
+ },
21
+ "repository": {
22
+ "type": "git",
23
+ "url": "https://github.com/pntai/n8n-nodes-pdf-extractor.git"
24
+ },
25
+ "engines": {
26
+ "node": ">=18.0.0"
27
+ },
28
+ "main": "dist/index.js",
29
+ "types": "dist/index.d.ts",
30
+ "files": [
31
+ "dist"
32
+ ],
33
+ "scripts": {
34
+ "build": "tsc && gulp build:icons",
35
+ "dev": "tsc --watch",
36
+ "format": "prettier --write .",
37
+ "lint": "eslint .",
38
+ "lintfix": "eslint . --fix",
39
+ "prepublishOnly": "npm run build"
40
+ },
41
+ "n8n": {
42
+ "n8nNodesApiVersion": 1,
43
+ "nodes": [
44
+ "dist/nodes/PdfExtractor/PdfExtractor.node.js"
45
+ ],
46
+ "credentials": []
47
+ },
48
+ "devDependencies": {
49
+ "@types/node": "^20.10.0",
50
+ "@typescript-eslint/eslint-plugin": "^6.0.0",
51
+ "@typescript-eslint/parser": "^6.0.0",
52
+ "eslint": "^8.56.0",
53
+ "eslint-plugin-n8n-nodes-base": "^1.16.0",
54
+ "gulp": "^4.0.2",
55
+ "n8n-workflow": "*",
56
+ "prettier": "^3.1.0",
57
+ "typescript": "^5.3.0"
58
+ },
59
+ "peerDependencies": {
60
+ "n8n-workflow": "*"
61
+ }
62
+ }