n8n-nodes-ocrbro 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,158 @@
1
+ # n8n-nodes-ocrbro
2
+
3
+ Extract text from images and PDFs in your n8n workflows. This community node provides OCR (Optical Character Recognition) for images using Tesseract.js and text extraction from PDF documents.
4
+
5
+ ![OCR Bro Node](https://img.shields.io/npm/v/n8n-nodes-ocrbro.svg)
6
+
7
+ ## Features
8
+
9
+ - **OCR from Images** - Extract text from PNG, JPG, TIFF, BMP, and other image formats using Tesseract.js
10
+ - **Extract Text from PDFs** - Pull text content from PDF documents
11
+ - **Multi-language Support** - OCR supports 100+ languages via Tesseract language packs
12
+ - **No External APIs** - All processing happens locally, no data leaves your server
13
+
14
+ ## Installation
15
+
16
+ ### Via n8n Community Nodes (Recommended)
17
+
18
+ 1. Open your n8n instance
19
+ 2. Go to **Settings** → **Community Nodes**
20
+ 3. Click **Install a community node**
21
+ 4. Enter: `n8n-nodes-ocrbro`
22
+ 5. Click **Install**
23
+ 6. Restart n8n when prompted
24
+
25
+ ## Video Tutorials
26
+
27
+ ### 1. How to install ocrbro n8n node for free?
28
+ [![How to install ocrbro n8n node for free?](https://img.youtube.com/vi/v-SByxejyQ8/0.jpg)](https://youtu.be/v-SByxejyQ8)
29
+
30
+ ### 2. Example PDF Text Extraction
31
+ [![Example PDF Text Extraction](https://img.youtube.com/vi/JpXKcSkO61o/0.jpg)](https://youtu.be/JpXKcSkO61o)
32
+
33
+ ### 3. Example Image Text Extraction (OCR)
33
+ [![Example Image Text Extraction (OCR)](https://img.youtube.com/vi/sZb9GHWbtbo/0.jpg)](https://youtu.be/sZb9GHWbtbo)
35
+
36
+ ### 4. How to create Private n8n PDF text extraction API endpoint on n8n
37
+ [![How to create Private n8n PDF text extraction API endpoint on n8n](https://img.youtube.com/vi/sZb9GHWbtbo/0.jpg)](https://youtu.be/sZb9GHWbtbo)
38
+
39
+ ### 5. How to create Private n8n Image text extraction API endpoint on n8n
40
+ [![How to create Private n8n Image text extraction API endpoint on n8n](https://img.youtube.com/vi/crR1N8z0IHw/0.jpg)](https://youtu.be/crR1N8z0IHw)
41
+
42
+ ### Via npm (Self-hosted)
43
+
44
+ ```bash
45
+ cd ~/.n8n/nodes
46
+ npm install n8n-nodes-ocrbro
47
+ # Restart n8n
48
+ ```
49
+
50
+ ### Docker
51
+
52
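+ Install the package into `/home/node/.n8n/custom/n8n-nodes-ocrbro` inside the `n8n_data` volume, then start the n8n container with `N8N_CUSTOM_EXTENSIONS` pointing at that folder: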
53
+
54
+ ```bash
55
+ docker run -it --rm \
56
+ --name n8n \
57
+ -p 5678:5678 \
58
+ -e N8N_CUSTOM_EXTENSIONS="/home/node/.n8n/custom/n8n-nodes-ocrbro" \
59
+ -v n8n_data:/home/node/.n8n \
60
+ docker.n8n.io/n8nio/n8n
61
+ ```
62
+
63
+ ## Usage
64
+
65
+ ### OCR from Image
66
+
67
+ Extract text from images using Tesseract OCR.
68
+
69
+ 1. Add **OCR Bro** node to your workflow
70
+ 2. Set **Operation** to `OCR from Image`
71
+ 3. Configure:
72
+ - **Input Binary Field**: Name of the binary property containing the image (default: `data`)
73
+ - **Language**: Tesseract language code (default: `eng`)
74
+
75
+ **Example workflow:**
76
+ ```
77
+ [Read Binary File] → [OCR Bro] → [Set Node]
78
+ ```
79
+
80
+ **Supported image formats:** PNG, JPG/JPEG, TIFF, BMP, GIF, WebP
81
+
82
+ **Language codes:**
83
+ - `eng` - English
84
+ - `deu` - German
85
+ - `fra` - French
86
+ - `spa` - Spanish
87
+ - `chi_sim` - Chinese (Simplified)
88
+ - `jpn` - Japanese
89
+ - Multiple languages: `eng+deu+fra`
90
+
91
+ ### Extract Text from PDF
92
+
93
+ Extract text content from PDF documents.
94
+
95
+ 1. Add **OCR Bro** node to your workflow
96
+ 2. Set **Operation** to `Extract Text from PDF`
97
+ 3. Configure:
98
+ - **Input Binary Field**: Name of the binary property containing the PDF (default: `data`)
99
+
100
+ **Example workflow:**
101
+ ```
102
+ [HTTP Request (PDF URL)] → [OCR Bro] → [Code Node]
103
+ ```
104
+
105
+ **Output:**
106
+ ```json
107
+ {
108
+ "text": "Extracted text content...",
109
+ "pages": 5
110
+ }
111
+ ```
112
+
113
+ ## Examples
114
+
115
+ ### Basic Image OCR
116
+
117
+ 1. Use **Read Binary File** to load an image
118
+ 2. Connect to **OCR Bro** with operation `OCR from Image`
119
+ 3. The output contains `text`, `confidence`, and `words` (the number of recognized words)
120
+
121
+ ### Batch Process Images
122
+
123
+ 1. Use **Read Binary Files** to load multiple images
124
+ 2. Connect to **OCR Bro**
125
+ 3. Each item will be processed and return extracted text
126
+
127
+ ### Process PDF and Send via Email
128
+
129
+ 1. **HTTP Request** - Download PDF from URL
130
+ 2. **OCR Bro** - Extract text (operation: `Extract Text from PDF`)
131
+ 3. **Send Email** - Include extracted text in email body
132
+
133
+
134
+
135
+ ## Troubleshooting
136
+
137
+ ### Node not appearing after installation
138
+ - Restart your n8n instance
139
+ - Check the n8n logs for any errors
140
+
141
+ ### Low OCR accuracy
142
+ - Use higher resolution images (300 DPI recommended)
143
+ - Ensure good contrast between text and background
144
+ - Specify the correct language code
145
+ - Pre-process images to remove noise if needed
146
+
147
+ ### PDF extraction returns empty text
148
+ - The PDF may contain scanned images instead of text
149
+ - For scanned PDFs, convert the pages to images first, then run them through the `OCR from Image` operation
150
+
151
+ ## License
152
+
153
+ MIT
154
+
155
+ ## Links
156
+
157
+ - [npm Package](https://www.npmjs.com/package/n8n-nodes-ocrbro)
158
+ - [n8n Community Nodes](https://docs.n8n.io/integrations/community-nodes/)
@@ -1,4 +1,37 @@
1
1
  "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
2
35
  Object.defineProperty(exports, "__esModule", { value: true });
3
36
  exports.OcrBro = void 0;
4
37
  const tesseract_js_1 = require("tesseract.js");
@@ -10,27 +43,79 @@ class OcrBro {
10
43
  icon: 'file:ocrbro.png',
11
44
  group: ['transform'],
12
45
  version: 1,
13
- description: 'OCR Images using Tesseract.js - extracts text from images',
46
+ description: 'Extract text from images (OCR) or PDFs',
14
47
  defaults: {
15
48
  name: 'OCR Bro',
16
49
  },
17
50
  inputs: ['main'],
18
51
  outputs: ['main'],
19
52
  properties: [
53
+ {
54
+ displayName: 'Operation',
55
+ name: 'operation',
56
+ type: 'options',
57
+ noDataExpression: true,
58
+ options: [
59
+ {
60
+ name: 'OCR from Image',
61
+ value: 'ocrImage',
62
+ description: 'Extract text from images using Tesseract OCR',
63
+ },
64
+ {
65
+ name: 'Extract Text from PDF',
66
+ value: 'extractPdf',
67
+ description: 'Extract text from PDF documents',
68
+ },
69
+ ],
70
+ default: 'ocrImage',
71
+ },
20
72
  {
21
73
  displayName: 'Input Binary Field',
22
74
  name: 'binaryPropertyName',
23
75
  type: 'string',
24
76
  default: 'data',
25
77
  required: true,
26
- description: 'The name of the binary property containing the image file to OCR',
78
+ description: 'The name of the binary property containing the file',
27
79
  },
28
80
  {
29
81
  displayName: 'Language',
30
82
  name: 'language',
31
83
  type: 'string',
32
84
  default: 'eng',
33
- description: 'Tesseract language code (e.g., eng, deu, spa). Multiple languages can be specified separated by "+".',
85
+ description: 'Tesseract language code (e.g., eng, deu, spa). Multiple languages can be separated by "+".',
86
+ displayOptions: {
87
+ show: {
88
+ operation: ['ocrImage'],
89
+ },
90
+ },
91
+ },
92
+ {
93
+ displayName: 'Password Protected?',
94
+ name: 'isPasswordProtected',
95
+ type: 'boolean',
96
+ default: false,
97
+ description: 'Whether the PDF file is encrypted/password protected',
98
+ displayOptions: {
99
+ show: {
100
+ operation: ['extractPdf'],
101
+ },
102
+ },
103
+ },
104
+ {
105
+ displayName: 'Password',
106
+ name: 'password',
107
+ type: 'string',
108
+ typeOptions: {
109
+ password: true,
110
+ },
111
+ default: '',
112
+ description: 'The password required to unlock the PDF document',
113
+ displayOptions: {
114
+ show: {
115
+ operation: ['extractPdf'],
116
+ isPasswordProtected: [true],
117
+ },
118
+ },
34
119
  },
35
120
  ],
36
121
  };
@@ -39,51 +124,96 @@ class OcrBro {
39
124
  var _a;
40
125
  const items = this.getInputData();
41
126
  const returnData = [];
127
+ const operation = this.getNodeParameter('operation', 0);
42
128
  const binaryPropertyName = this.getNodeParameter('binaryPropertyName', 0);
43
- const language = this.getNodeParameter('language', 0);
44
- // @ts-ignore
45
- const worker = await (0, tesseract_js_1.createWorker)();
46
- // @ts-ignore
47
- await worker.loadLanguage(language);
48
- // @ts-ignore
49
- await worker.initialize(language);
50
- for (let i = 0; i < items.length; i++) {
51
- try {
52
- const item = items[i];
53
- // @ts-ignore - n8n types are sometimes weird with binary helpers in dry run
54
- const binaryData = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);
55
- const binaryMetadata = item.binary ? item.binary[binaryPropertyName] : undefined;
56
- if (!binaryMetadata) {
57
- throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
129
+ if (operation === 'ocrImage') {
130
+ const language = this.getNodeParameter('language', 0);
131
+ // @ts-ignore
132
+ const worker = await (0, tesseract_js_1.createWorker)();
133
+ // @ts-ignore
134
+ await worker.loadLanguage(language);
135
+ // @ts-ignore
136
+ await worker.initialize(language);
137
+ for (let i = 0; i < items.length; i++) {
138
+ try {
139
+ const item = items[i];
140
+ // @ts-ignore
141
+ const binaryData = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);
142
+ const binaryMetadata = item.binary ? item.binary[binaryPropertyName] : undefined;
143
+ if (!binaryMetadata) {
144
+ throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
145
+ }
146
+ const mimeType = binaryMetadata.mimeType;
147
+ if (!mimeType.startsWith('image/')) {
148
+ throw new Error(`Unsupported file type: ${mimeType}. Use "Extract Text from PDF" for PDF files.`);
149
+ }
150
+ // @ts-ignore
151
+ const { data } = await worker.recognize(binaryData);
152
+ returnData.push({
153
+ json: {
154
+ text: data.text,
155
+ confidence: data.confidence,
156
+ words: ((_a = data.words) === null || _a === void 0 ? void 0 : _a.length) || 0,
157
+ },
158
+ binary: item.binary,
159
+ });
58
160
  }
59
- const mimeType = binaryMetadata.mimeType;
60
- if (!mimeType.startsWith('image/')) {
61
- throw new Error(`Unsupported file type: ${mimeType}. OCR Bro only supports image files (PNG, JPG, TIFF, BMP, etc.)`);
161
+ catch (error) {
162
+ if (this.continueOnFail()) {
163
+ returnData.push({ json: { error: error.message } });
164
+ continue;
165
+ }
166
+ throw error;
62
167
  }
63
- // @ts-ignore
64
- const { data } = await worker.recognize(binaryData);
65
- returnData.push({
66
- json: {
67
- text: data.text,
68
- confidence: data.confidence,
69
- words: ((_a = data.words) === null || _a === void 0 ? void 0 : _a.length) || 0,
70
- },
71
- binary: item.binary,
72
- });
73
168
  }
74
- catch (error) {
75
- if (this.continueOnFail()) {
169
+ await worker.terminate();
170
+ }
171
+ else if (operation === 'extractPdf') {
172
+ // Dynamic import for ESM module
173
+ const { extractText, getDocumentProxy } = await Promise.resolve().then(() => __importStar(require('unpdf')));
174
+ for (let i = 0; i < items.length; i++) {
175
+ try {
176
+ const item = items[i];
177
+ // @ts-ignore
178
+ const binaryData = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);
179
+ const binaryMetadata = item.binary ? item.binary[binaryPropertyName] : undefined;
180
+ if (!binaryMetadata) {
181
+ throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
182
+ }
183
+ const mimeType = binaryMetadata.mimeType;
184
+ if (mimeType !== 'application/pdf') {
185
+ throw new Error(`Expected PDF file but got: ${mimeType}. Use "OCR from Image" for image files.`);
186
+ }
187
+ let pdfBufferToProcess = new Uint8Array(binaryData);
188
+ const isPasswordProtected = this.getNodeParameter('isPasswordProtected', i, false);
189
+ if (isPasswordProtected) {
190
+ const password = this.getNodeParameter('password', i);
191
+ const { PDFDocument } = await Promise.resolve().then(() => __importStar(require('@yongseok_choi/pdf-lib')));
192
+ const pdfDoc = await PDFDocument.load(binaryData, {
193
+ password: password,
194
+ ignoreEncryption: true,
195
+ });
196
+ pdfBufferToProcess = (await pdfDoc.save());
197
+ }
198
+ const pdf = await getDocumentProxy(pdfBufferToProcess);
199
+ const { text, totalPages } = await extractText(pdf, { mergePages: true });
76
200
  returnData.push({
77
201
  json: {
78
- error: error.message,
202
+ text: text,
203
+ pages: totalPages,
79
204
  },
205
+ binary: item.binary,
80
206
  });
81
- continue;
82
207
  }
83
- throw error;
208
+ catch (error) {
209
+ if (this.continueOnFail()) {
210
+ returnData.push({ json: { error: error.message } });
211
+ continue;
212
+ }
213
+ throw error;
214
+ }
84
215
  }
85
216
  }
86
- await worker.terminate();
87
217
  return [returnData];
88
218
  }
89
219
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "n8n-nodes-ocrbro",
3
- "version": "0.1.0",
3
+ "version": "0.1.2",
4
4
  "description": "Native n8n node for OCR using Tesseract.js",
5
5
  "keywords": [
6
6
  "n8n-community-node-package"
@@ -42,6 +42,9 @@
42
42
  "typescript": "^5.0.0"
43
43
  },
44
44
  "dependencies": {
45
- "tesseract.js": "^4.0.0"
45
+ "@yongseok_choi/pdf-lib": "^2.2.6",
46
+ "pdf2json": "^4.0.0",
47
+ "tesseract.js": "^4.0.0",
48
+ "unpdf": "^1.4.0"
46
49
  }
47
- }
50
+ }
Binary file
package/eng.traineddata DELETED
Binary file