n8n-nodes-ocrbro 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,37 @@
1
1
  "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
2
35
  Object.defineProperty(exports, "__esModule", { value: true });
3
36
  exports.OcrBro = void 0;
4
37
  const tesseract_js_1 = require("tesseract.js");
@@ -10,27 +43,51 @@ class OcrBro {
10
43
  icon: 'file:ocrbro.png',
11
44
  group: ['transform'],
12
45
  version: 1,
13
- description: 'OCR Images using Tesseract.js - extracts text from images',
46
+ description: 'Extract text from images (OCR) or PDFs',
14
47
  defaults: {
15
48
  name: 'OCR Bro',
16
49
  },
17
50
  inputs: ['main'],
18
51
  outputs: ['main'],
19
52
  properties: [
53
+ {
54
+ displayName: 'Operation',
55
+ name: 'operation',
56
+ type: 'options',
57
+ noDataExpression: true,
58
+ options: [
59
+ {
60
+ name: 'OCR from Image',
61
+ value: 'ocrImage',
62
+ description: 'Extract text from images using Tesseract OCR',
63
+ },
64
+ {
65
+ name: 'Extract Text from PDF',
66
+ value: 'extractPdf',
67
+ description: 'Extract text from PDF documents',
68
+ },
69
+ ],
70
+ default: 'ocrImage',
71
+ },
20
72
  {
21
73
  displayName: 'Input Binary Field',
22
74
  name: 'binaryPropertyName',
23
75
  type: 'string',
24
76
  default: 'data',
25
77
  required: true,
26
- description: 'The name of the binary property containing the image file to OCR',
78
+ description: 'The name of the binary property containing the file',
27
79
  },
28
80
  {
29
81
  displayName: 'Language',
30
82
  name: 'language',
31
83
  type: 'string',
32
84
  default: 'eng',
33
- description: 'Tesseract language code (e.g., eng, deu, spa). Multiple languages can be specified separated by "+".',
85
+ description: 'Tesseract language code (e.g., eng, deu, spa). Multiple languages can be separated by "+".',
86
+ displayOptions: {
87
+ show: {
88
+ operation: ['ocrImage'],
89
+ },
90
+ },
34
91
  },
35
92
  ],
36
93
  };
@@ -39,51 +96,85 @@ class OcrBro {
39
96
  var _a;
40
97
  const items = this.getInputData();
41
98
  const returnData = [];
99
+ const operation = this.getNodeParameter('operation', 0);
42
100
  const binaryPropertyName = this.getNodeParameter('binaryPropertyName', 0);
43
- const language = this.getNodeParameter('language', 0);
44
- // @ts-ignore
45
- const worker = await (0, tesseract_js_1.createWorker)();
46
- // @ts-ignore
47
- await worker.loadLanguage(language);
48
- // @ts-ignore
49
- await worker.initialize(language);
50
- for (let i = 0; i < items.length; i++) {
51
- try {
52
- const item = items[i];
53
- // @ts-ignore - n8n types are sometimes weird with binary helpers in dry run
54
- const binaryData = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);
55
- const binaryMetadata = item.binary ? item.binary[binaryPropertyName] : undefined;
56
- if (!binaryMetadata) {
57
- throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
101
+ if (operation === 'ocrImage') {
102
+ const language = this.getNodeParameter('language', 0);
103
+ // @ts-ignore
104
+ const worker = await (0, tesseract_js_1.createWorker)();
105
+ // @ts-ignore
106
+ await worker.loadLanguage(language);
107
+ // @ts-ignore
108
+ await worker.initialize(language);
109
+ for (let i = 0; i < items.length; i++) {
110
+ try {
111
+ const item = items[i];
112
+ // @ts-ignore
113
+ const binaryData = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);
114
+ const binaryMetadata = item.binary ? item.binary[binaryPropertyName] : undefined;
115
+ if (!binaryMetadata) {
116
+ throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
117
+ }
118
+ const mimeType = binaryMetadata.mimeType;
119
+ if (!mimeType.startsWith('image/')) {
120
+ throw new Error(`Unsupported file type: ${mimeType}. Use "Extract Text from PDF" for PDF files.`);
121
+ }
122
+ // @ts-ignore
123
+ const { data } = await worker.recognize(binaryData);
124
+ returnData.push({
125
+ json: {
126
+ text: data.text,
127
+ confidence: data.confidence,
128
+ words: ((_a = data.words) === null || _a === void 0 ? void 0 : _a.length) || 0,
129
+ },
130
+ binary: item.binary,
131
+ });
58
132
  }
59
- const mimeType = binaryMetadata.mimeType;
60
- if (!mimeType.startsWith('image/')) {
61
- throw new Error(`Unsupported file type: ${mimeType}. OCR Bro only supports image files (PNG, JPG, TIFF, BMP, etc.)`);
133
+ catch (error) {
134
+ if (this.continueOnFail()) {
135
+ returnData.push({ json: { error: error.message } });
136
+ continue;
137
+ }
138
+ throw error;
62
139
  }
63
- // @ts-ignore
64
- const { data } = await worker.recognize(binaryData);
65
- returnData.push({
66
- json: {
67
- text: data.text,
68
- confidence: data.confidence,
69
- words: ((_a = data.words) === null || _a === void 0 ? void 0 : _a.length) || 0,
70
- },
71
- binary: item.binary,
72
- });
73
140
  }
74
- catch (error) {
75
- if (this.continueOnFail()) {
141
+ await worker.terminate();
142
+ }
143
+ else if (operation === 'extractPdf') {
144
+ // Dynamic import for ESM module
145
+ const { extractText, getDocumentProxy } = await Promise.resolve().then(() => __importStar(require('unpdf')));
146
+ for (let i = 0; i < items.length; i++) {
147
+ try {
148
+ const item = items[i];
149
+ // @ts-ignore
150
+ const binaryData = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);
151
+ const binaryMetadata = item.binary ? item.binary[binaryPropertyName] : undefined;
152
+ if (!binaryMetadata) {
153
+ throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
154
+ }
155
+ const mimeType = binaryMetadata.mimeType;
156
+ if (mimeType !== 'application/pdf') {
157
+ throw new Error(`Expected PDF file but got: ${mimeType}. Use "OCR from Image" for image files.`);
158
+ }
159
+ const pdf = await getDocumentProxy(new Uint8Array(binaryData));
160
+ const { text, totalPages } = await extractText(pdf, { mergePages: true });
76
161
  returnData.push({
77
162
  json: {
78
- error: error.message,
163
+ text: text,
164
+ pages: totalPages,
79
165
  },
166
+ binary: item.binary,
80
167
  });
81
- continue;
82
168
  }
83
- throw error;
169
+ catch (error) {
170
+ if (this.continueOnFail()) {
171
+ returnData.push({ json: { error: error.message } });
172
+ continue;
173
+ }
174
+ throw error;
175
+ }
84
176
  }
85
177
  }
86
- await worker.terminate();
87
178
  return [returnData];
88
179
  }
89
180
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "n8n-nodes-ocrbro",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "Native n8n node for OCR using Tesseract.js",
5
5
  "keywords": [
6
6
  "n8n-community-node-package"
@@ -42,6 +42,8 @@
42
42
  "typescript": "^5.0.0"
43
43
  },
44
44
  "dependencies": {
45
- "tesseract.js": "^4.0.0"
45
+ "pdf2json": "^4.0.0",
46
+ "tesseract.js": "^4.0.0",
47
+ "unpdf": "^1.4.0"
46
48
  }
47
- }
49
+ }